In [1]:
from __future__ import print_function

import numpy as np
from numpy import array
from numpy import asarray
from numpy import zeros
import pandas as pd
import tensorflow as tf
from tensorflow.compat.v1.keras.models import save_model

import keras
import h5py
from keras import initializers
from keras.models import Sequential
from keras.models import Model
# from keras.models import load_model
from tensorflow.keras.models import load_model
from keras.layers import Flatten, LSTM
from keras.layers import GlobalMaxPooling1D
from keras.layers import Input, InputLayer
from keras.layers import Embedding, Activation, Dropout, Dense, Conv1D, MaxPooling1D, GlobalMaxPooling1D, BatchNormalization
from keras.layers.merge import Concatenate
# from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import one_hot
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import multi_gpu_model

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, binarize
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict

import re
import os
from os import listdir
# !pip install nltk
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

import itertools
import operator

Using TensorFlow backend.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
# Remote bucket holding the yearly USPTO-2M dumps, and the local data directory.
AWS_Path = 'https://kstonedev.s3-us-west-2.amazonaws.com/W266/USPTO-2M/'
local_path = 'data/'

# One JSON file per year, 2006 through 2015 inclusive.
all_files = ['{}_USPTO.json'.format(year) for year in range(2006, 2016)]

# for file in all_files:
#     wget.download(AWS_Path + file, out='.')
#     print(file)

In [19]:
num_files_to_read = "all"  # set to "all" or a number such as 2

# Pick either every yearly file or only the most recent `num_files_to_read` ones.
files_by_year = sorted(all_files)
if num_files_to_read == "all":
    files_to_ingest = files_by_year
else:
    files_to_ingest = files_by_year[-num_files_to_read:]
print('Ingesting the following files:\n', files_to_ingest)

# Read each selected JSON file into a frame and stack them into one table.
yearly_frames = [pd.read_json(local_path + 'USPTO-2M/' + name) for name in files_to_ingest]
patents = pd.concat(yearly_frames)

Ingesting the following files:
 ['2006_USPTO.json', '2007_USPTO.json', '2008_USPTO.json', '2009_USPTO.json', '2010_USPTO.json', '2011_USPTO.json', '2012_USPTO.json', '2013_USPTO.json', '2014_USPTO.json', '2015_USPTO.json']


In [20]:
# Raw abstract texts and a space-split word inventory over all of them.
X = list(patents["Abstract"])
abstract_words = list(itertools.chain.from_iterable(sent.split(' ') for sent in X))
unique_words = len(set(abstract_words))

# Binarize the per-patent label lists into a multi-hot matrix
CRC_labels = patents.Subclass_labels
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(CRC_labels)
num_CRC_labels = len(y[0])

print("Total number of CRC labels", num_CRC_labels)

# Persist the label ordering so inference can map output columns back to labels
label_frame = pd.DataFrame(mlb.classes_)
label_frame.to_csv('crc_labels.csv')

Total number of CRC labels 632


In [21]:
# Split the whole patent table (not just the abstracts) so the held-out rows
# stay available for analysis at the end; 80/20 split with a fixed seed.
P_train3, P_test3, y_train3, y_test3 = train_test_split(
    patents, y, test_size=0.20, random_state=42
)
X_train3 = P_train3['Abstract'].tolist()
X_test3 = P_test3['Abstract'].tolist()

In [22]:
# Select which pre-computed train/test split feeds the rest of the notebook.
# NOTE(review): only the "3" split is created in the cells visible here; the
# 1, 2 and 4 variants are presumably defined elsewhere -- confirm before use.
experiment_number = 3
if experiment_number == 1:
  (P_train, P_test, X_train, X_test, y_train, y_test) = (P_train1, P_test1, X_train1, X_test1, y_train1, y_test1)
elif experiment_number == 2:
  (P_train, P_test, X_train, X_test, y_train, y_test) = (P_train2, P_test2, X_train2, X_test2, y_train2, y_test2)
elif experiment_number == 3:
  (P_train, P_test, X_train, X_test, y_train, y_test) = (P_train3, P_test3, X_train3, X_test3, y_train3, y_test3)
elif experiment_number == 4:
  (P_train, P_test, X_train, X_test, y_train, y_test) = (P_train4, P_test4, X_train4, X_test4, y_train4, y_test4)
else:
  # Fail fast: merely printing would leave P_train/X_train/... unbound (or
  # stale from an earlier run) and let the following cells train on the wrong data.
  raise ValueError("Unknown experiment number: {!r}".format(experiment_number))

In [23]:
# Turn raw abstracts into fixed-length integer sequences for the embedding layer.

# https://keras.io/preprocessing/text/
# The vocabulary is learned from the training texts only.
tokenizer = Tokenizer(num_words=unique_words)
tokenizer.fit_on_texts(X_train)

# Ten most frequent words as (word, count) pairs
top10words = sorted(tokenizer.word_counts.items(), key=operator.itemgetter(1), reverse=True)[:10]
# print("Top 10 words:\n", top10words)

# +1 because the embedding matrix built from `word_index` also needs a row for index 0
vocab_size = len(tokenizer.word_index) + 1

maxlen = 200

# Map each text to a list of word indices, then pad at the end to `maxlen` entries.
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), padding='post', maxlen=maxlen)
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), padding='post', maxlen=maxlen)

In [24]:
# Use GloVe word embeddings to convert text inputs to their numeric counterparts

# Maps each GloVe token to its float32 vector.
embeddings_dictionary = dict()

# The `with` block closes the file even if a line fails to parse, so the
# handle is never leaked (the original relied on reaching an explicit close()).
with open(local_path + 'glove.6B.100d.txt', encoding='utf8') as glove_file:
    for line in glove_file:
        # Each line is "<token> <v1> <v2> ...", whitespace separated.
        records = line.split()
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

# Row i holds the GloVe vector of the word with tokenizer index i; words that
# GloVe does not contain keep an all-zero row. The width 100 matches the
# 100-dimensional file loaded above.
embedding_matrix = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [28]:
# metric f1 definition
from keras import backend as K
def recall_m(y_true, y_pred):
    """Recall over the whole batch: rounded true positives / rounded actual positives.

    Both tensors are clipped to [0, 1] and rounded before summing, so
    predictions are effectively thresholded at 0.5. `K.epsilon()` in the
    denominator avoids division by zero when there are no positives.
    """
    hit_count = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_count = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hit_count / (actual_count + K.epsilon())
def precision_m(y_true, y_pred):
    """Precision over the whole batch: rounded true positives / rounded predicted positives.

    Both tensors are clipped to [0, 1] and rounded before summing, so
    predictions are effectively thresholded at 0.5. `K.epsilon()` in the
    denominator avoids division by zero when nothing is predicted positive.
    """
    hit_count = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_count = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hit_count / (predicted_count + K.epsilon())
def f1(y_true, y_pred):
    """Harmonic mean of `precision_m` and `recall_m` for the given tensors.

    `K.epsilon()` keeps the ratio defined when precision and recall are both zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_half = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_half
def weighted_bce(y_true, y_pred, pos_weight=2.):
    """Binary cross-entropy in which positive targets count `pos_weight` times.

    Args:
        y_true: target tensor; assumed to hold 0/1 values -- TODO confirm
            against the label matrix passed to `fit`.
        y_pred: predicted probabilities, same shape as `y_true`.
        pos_weight: multiplier applied to the loss of entries whose target
            is 1. Defaults to 2., the value that used to be hard-coded, so
            passing this function directly as a Keras loss behaves as before.

    Returns:
        Scalar tensor: the mean of the element-wise weighted cross-entropy.
    """
    # weights become `pos_weight` if y_true is 1, and 1 if y_true is 0
    weights = (y_true * pos_weight) + (1. - y_true)
    bce = K.binary_crossentropy(y_true, y_pred)
    # Return directly instead of binding a local that shadows the function name.
    return K.mean(bce * weights)

In [29]:
filters = 512
hidden_dims = 512
print('Build model...')


def _conv_pool_branch(kernel_size):
    """Apply one Conv1D of the given width to `embedding`, then global max pooling.

    Returns the (conv_output, pooled_output) pair so both stay inspectable.
    """
    conv_out = keras.layers.Conv1D(filters, kernel_size, activation='relu')(embedding)
    pooled_out = keras.layers.GlobalMaxPooling1D()(conv_out)
    return conv_out, pooled_out


# NOTE(review): `input` shadows the Python builtin of the same name; the name
# is kept because cells outside this view may refer to it.
input = keras.Input(shape=(maxlen,), name='input_embeddings')

# Frozen embedding layer initialised from the pre-built GloVe matrix.
embedding = keras.layers.Embedding(
    vocab_size,
    100,
    input_length=maxlen,
    weights=[embedding_matrix],
    trainable=False,
)(input)

# Four parallel convolution branches with window sizes 2, 3, 4 and 5.
conv0, maxpool0 = _conv_pool_branch(2)
conv1, maxpool1 = _conv_pool_branch(3)
conv2, maxpool2 = _conv_pool_branch(4)
conv3, maxpool3 = _conv_pool_branch(5)

# Join the pooled branch outputs, then classify through one hidden layer.
concat1 = keras.layers.concatenate([maxpool0, maxpool1, maxpool2, maxpool3], axis=1)
dropout1 = keras.layers.Dropout(rate=0.2)(concat1)
dense = keras.layers.Dense(hidden_dims, activation='relu', name='dense')(dropout1)
batchnorm = keras.layers.BatchNormalization()(dense)
# `dense` is rebound here to the dropout output that feeds the final layer.
dense = keras.layers.Dropout(rate=0.5)(batchnorm)

# One sigmoid unit per label: independent per-label probabilities (multi-label).
pred = keras.layers.Dense(num_CRC_labels, activation='sigmoid', name='crc')(dense)
model = keras.models.Model(inputs=input, outputs=pred)

Build model...


In [30]:
model.summary()

# Metrics reported during training: the custom batch-wise F1 plus the built-in
# precision/recall, both plain and restricted to the top-k scoring labels.
training_metrics = [
    f1,
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Precision(name='precision_1', top_k=1),
    tf.keras.metrics.Recall(name='recall'),
    tf.keras.metrics.Recall(name='recall_5', top_k=5),
]
model.compile(optimizer='adam', loss=weighted_bce, metrics=training_metrics)


Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_embeddings (InputLayer)   (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 200, 100)     21382700    input_embeddings[0][0]           
__________________________________________________________________________________________________
conv1d_9 (Conv1D)               (None, 199, 512)     102912      embedding_3[0][0]                
__________________________________________________________________________________________________
conv1d_10 (Conv1D)              (None, 198, 512)     154112      embedding_3[0][0]                
____________________________________________________________________________________________

In [33]:
# Train for 10 epochs; the held-out split is scored after every epoch.
history3 = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=128,
)

Train on 1600117 samples, validate on 400030 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [37]:
# Persist the three artifacts inference needs: model, label list, tokenizer.
# !pip install pickle
import pickle
from keras.models import save_model

# Trained network (architecture + weights) in HDF5 format
save_model(model, local_path + 'model_allfiles.h5')
print('Saved model to disk')

# Label ordering, so output columns can be mapped back to CRC labels
path = local_path
crc_label_frame = pd.DataFrame(mlb.classes_)
crc_label_frame.to_csv(local_path + 'crc_labels_allfiles.csv')
print('Saved CRC labels to disk')

# Fitted tokenizer, so inference applies the same word-to-index mapping
with open(local_path + 'tokenizer_allfiles.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)
print('Saved tokenizer data to disk')

Saved model to disk
Saved CRC labels to disk
Saved tokenizer data to disk


In [2]:
# Record the version of the imported `keras` package for reproducibility.
keras.__version__

'2.3.1'

In [3]:
# Record the TensorFlow version for reproducibility.
tf.__version__

'2.0.0'

In [4]:
# Record the version of the `python` executable found on the shell PATH.
!python --version

Python 3.6.8


In [5]:
# Check whether TensorFlow can see a GPU in this environment.
tf.test.is_gpu_available()

True

In [6]:
# List every compute device TensorFlow has registered, with the details it reports.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 1163125727079847662
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 1567043388891744622
physical_device_desc: "device: XLA_CPU device"
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 5823642204537443658
physical_device_desc: "device: XLA_GPU device"
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 15956161332
locality {
  bus_id: 1
  links {
  }
}
incarnation: 6800377534166770804
physical_device_desc: "device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:07.0, compute capability: 6.0"
]
