In [1]:
from gensim.models import KeyedVectors
import pandas as pd
import numpy as np

## Data Processing

### Use pre-trained word embeddings based on patent data

In [2]:
# Load the pre-trained word vectors; the file is in binary word2vec format.
embedding_path = 'uspto_2m_abstract_word2vec.bin'
word2vec_model = KeyedVectors.load_word2vec_format(embedding_path, binary=True)

In [3]:
# Read the tab-separated file of abstracts and their labels.
# read_csv already returns a DataFrame, so no extra wrapping is needed.
label_words_df = pd.read_csv('uspto_2m_abstr_label_valid_label.tsv', sep='\t')

In [4]:
# Preview the first five rows of the loaded frame.
label_words_df.head(n=5)

Unnamed: 0,Abstract,label
0,an apparatus for generating a saddle shaped tr...,H01L
1,a container or tray having various features th...,B65D
2,screening methods for identifying compounds an...,A61K
3,methods of forming conductive pattern structur...,H01L
4,a method of logging information about events f...,G07C


In [5]:
#set up balanced dataset: draw the same number of abstracts for every label
size = 500      # rows drawn per label
replace = True  # must sample with replacement: per the original note, some classes
                # have too few rows to draw `size` distinct ones
sampling_rng = np.random.RandomState(0)  # fixed seed so a rerun draws the same rows


def fn(obj):
    """Return `size` rows picked at random from the single-label group `obj`."""
    picked = sampling_rng.choice(obj.index, size=size, replace=replace)
    return obj.loc[picked, :]


sub_df = label_words_df.groupby('label', as_index=False).apply(fn)
y = sub_df['label']
len(sub_df)

315000

In [6]:
# Show the column index of the resampled frame.
sub_df.columns

Index(['Abstract', 'label'], dtype='object')

In [7]:
#extract words
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# Fit the vocabulary on the sampled abstracts, then map every abstract to a
# list of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sub_df['Abstract'].values)
sequences = tokenizer.texts_to_sequences(sub_df['Abstract'].values)

#max length for pad sequences
# Measure the tokenized sequences themselves rather than str.split() counts:
# the Tokenizer also breaks words on punctuation, so a whitespace count can
# under-estimate the length and make pad_sequences truncate long abstracts.
seq_max_len = max(len(seq) for seq in sequences)

word_index = tokenizer.word_index
print ('Number of unique tokens: ', len(word_index))

# Pad every sequence to the common length and encode the string labels as integers.
x_pad = pad_sequences(sequences, maxlen=seq_max_len)
encoder = LabelEncoder()
y_set = encoder.fit_transform(y)

Number of unique tokens:  63929


### Prepare the embedding layer

In [8]:
# Build the embedding lookup table: row i holds the pre-trained vector of the
# word whose tokenizer id is i. Rows stay all-zero for words that are missing
# from the word2vec model; row 0 is never assigned by this loop.
embedding_dim = word2vec_model.vector_size
vocab_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocab_rows, embedding_dim))
for token, row in word_index.items():
    if token not in word2vec_model:
        continue
    embedding_matrix[row] = word2vec_model[token]

## Train LSTM, GRU, CNN with the pre-trained embedding layer

The CNN trains faster than the LSTM and GRU models.

Future work:

1. Try other CNN variations, such as seqCNN.
2. Train the CNN on a larger dataset and compare it with Naive Bayes and SVM.

In [None]:
from tensorflow.python.keras.layers import Embedding, Dense, LSTM, GRU
from tensorflow.python.keras.layers import Conv1D, MaxPooling1D, Flatten
from tensorflow.python.keras.layers import Dropout, SeparableConv1D, GlobalAveragePooling1D
from tensorflow.python.keras import models, callbacks
from tensorflow.python.keras.optimizers import Adam

In [29]:
# Settings shared by the model cells in this notebook.

# -- architecture --
numOfclasses = len(set(y_set))          # number of distinct encoded labels
units = 64                              # width of the recurrent layer
n_layers = 1                            # not referenced by the visible model cells
hidden_layer_activation = 'relu'
last_layer_activation = 'softmax'
dropout_rate = 0.2

# -- optimisation --
loss = 'sparse_categorical_crossentropy'
learning_rate = 0.001                   # not referenced: the visible cells pass optimizer='adam'
batch_size = 128
epochs = 2                              # some later cells rebind this value

In [83]:
#LSTM
# x_pad / y_set are ordered by label (they come from a groupby), and Keras'
# validation_split holds out the *last* fraction of the rows before any
# shuffling. Shuffle once up front so the 10% validation slice covers every
# class instead of only the last labels.
shuffle_idx = np.random.RandomState(0).permutation(len(x_pad))

model = models.Sequential()
#use pre-trained embedding matrix, kept frozen during training
model.add(Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix],
                    input_length=seq_max_len, trainable=False))

model.add(LSTM(units=units, dropout=dropout_rate, recurrent_dropout=dropout_rate))
#output layer: one unit per class
model.add(Dense(units=numOfclasses, activation=last_layer_activation))
model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])
history = model.fit(x_pad[shuffle_idx], y_set[shuffle_idx], epochs=epochs,
                    batch_size=batch_size, validation_split=0.1, verbose=1)

Train on 283500 samples, validate on 31500 samples
Epoch 1/2
Epoch 2/2


In [61]:
#GRU
epochs = 1  # rebinds the shared setting; later cells that read `epochs` see this value

# x_pad / y_set are ordered by label (they come from a groupby), and Keras'
# validation_split holds out the *last* fraction of the rows before any
# shuffling. Shuffle once up front so the 10% validation slice covers every
# class instead of only the last labels.
shuffle_idx = np.random.RandomState(0).permutation(len(x_pad))

model = models.Sequential()
#use pre-trained embedding matrix, kept frozen during training
model.add(Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix],
                    input_length=seq_max_len, trainable=False))

model.add(GRU(units=units, dropout=dropout_rate, recurrent_dropout=dropout_rate))
#output layer: one unit per class
model.add(Dense(units=numOfclasses, activation=last_layer_activation))
model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])
history = model.fit(x_pad[shuffle_idx], y_set[shuffle_idx], epochs=epochs,
                    batch_size=batch_size, validation_split=0.1, verbose=1)

Train on 283500 samples, validate on 31500 samples


In [27]:
#CNN
# Set the epoch count here instead of inheriting it: the recorded run of this
# cell used 2 epochs, but on a top-to-bottom run the GRU cell has already
# rebound `epochs` to 1.
epochs = 2

# x_pad / y_set are ordered by label (they come from a groupby), and Keras'
# validation_split holds out the *last* fraction of the rows before any
# shuffling. Shuffle once up front so the 10% validation slice covers every
# class instead of only the last labels.
shuffle_idx = np.random.RandomState(0).permutation(len(x_pad))

model = models.Sequential()
#use pre-trained embedding matrix, kept frozen during training
model.add(Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix],
                    input_length=seq_max_len, trainable=False))
model.add(Conv1D(250, 5, activation=hidden_layer_activation))
model.add(MaxPooling1D(pool_size=4))
model.add(Flatten())
#output layer: one unit per class
model.add(Dense(units=numOfclasses, activation=last_layer_activation))
model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])
history = model.fit(x_pad[shuffle_idx], y_set[shuffle_idx], epochs=epochs,
                    batch_size=batch_size, validation_split=0.1, verbose=1)

Train on 283500 samples, validate on 31500 samples
Epoch 1/2
Epoch 2/2


In [46]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. Stratifying on the labels keeps the
# class proportions the same on both sides of the split.
# NOTE(review): sub_df was sampled with replacement, so the same abstract can
# end up in both the training and the test part; validation accuracy may be
# optimistic because of that - confirm before relying on the numbers.
X_train, X_test, y_train, y_test = train_test_split(
    x_pad, y_set, test_size=0.2, random_state=0, stratify=y_set)

In [59]:
#adding more layers get worse results
epochs = 20  # rebinds the shared setting; later cells that read `epochs` see 20

# Single-convolution CNN on top of the frozen pre-trained embedding,
# validated on the explicit hold-out split.
model = models.Sequential([
    Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix],
              input_length=seq_max_len, trainable=False),
    Conv1D(128, 3, activation=hidden_layer_activation),
    MaxPooling1D(pool_size=2),  # pool sizes to vary: 2, 3, 4, 5
    Flatten(),
    Dense(units=numOfclasses, activation=last_layer_activation),
])
model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])
history = model.fit(
    X_train, y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_test, y_test),
    verbose=1,
)

Train on 252000 samples, validate on 63000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


# Train CNN and allow it to learn the embedding layer

In [60]:
#CNN without weights at the embedding layer
# No pre-trained matrix is passed here, so the embedding is learned during
# training. `epochs` and the train/test arrays are reused from earlier cells.
model = models.Sequential([
    Embedding(len(word_index) + 1, embedding_dim, input_length=seq_max_len),
    Conv1D(128, 3, activation=hidden_layer_activation),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(units=numOfclasses, activation=last_layer_activation),
])
model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])
history = model.fit(
    X_train, y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_test, y_test),
    verbose=1,
)

Train on 252000 samples, validate on 63000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


For the CNN, there is no big difference between using the pre-trained embedding layer and learning the embedding layer from scratch.

With the pre-trained embedding layer, neither the LSTM nor the GRU performs well. Because these two models take a long time to train, we did not try them with a learned embedding layer. Later, we can add more layers to optimize these neural networks.