In [1]:
from gensim.models import KeyedVectors
import pandas as pd
import numpy as np

## Data Processing

### Use pre-trained word embeddings based on patent data

In [2]:
# Load the pre-trained patent word vectors (stored in the binary word2vec format).
word2vec_path = '../input/patent-textclassification/uspto_2m_abstract_word2vec.bin'
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [3]:
# Read the abstracts and their labels from the tab-separated file.
# pd.read_csv already returns a DataFrame, so it is not wrapped in pd.DataFrame(...) again.
label_words_df = pd.read_csv('../input/patent-textclassification/uspto_2m_abstr_label_valid_label.tsv', sep='\t')

In [4]:
# Preview the first five rows of the labelled abstracts.
label_words_df.head(n=5)

Unnamed: 0,Abstract,label
0,an apparatus for generating a saddle shaped tr...,H01L
1,a container or tray having various features th...,B65D
2,screening methods for identifying compounds an...,A61K
3,methods of forming conductive pattern structur...,H01L
4,a method of logging information about events f...,G07C


In [5]:
# Number of labelled abstracts (rows) in the full dataset.
label_words_df.shape[0]

678873

In [6]:
# Set up a balanced dataset: draw the same number of rows from every label.
size = 1000     # rows sampled per label
replace = True  # must sample with replacement: some labels have fewer than `size` rows
# Fixed seed so that Restart & Run All draws the same sample every time.
sampling_rng = np.random.RandomState(0)


def fn(obj):
    """Return `size` rows drawn at random (with replacement) from one label group."""
    return obj.loc[sampling_rng.choice(obj.index, size, replace), :]


sub_df = label_words_df.groupby('label', as_index=False).apply(fn)
y = sub_df['label']
len(sub_df)

630000

In [7]:
# Show which columns the balanced sample carries.
sub_df.columns

Index(['Abstract', 'label'], dtype='object')

In [8]:
#extract words
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# Build the vocabulary from the sampled abstracts and map each abstract
# to a sequence of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sub_df['Abstract'].values)
sequences = tokenizer.texts_to_sequences(sub_df['Abstract'].values)

# Max length for pad_sequences, measured on the tokenized sequences themselves.
# The tokenizer does not split text the same way str.split() does (its filter
# characters also act as separators), so a whitespace word count can differ from
# the real sequence length and make pad_sequences silently truncate or over-pad.
seq_max_len = max(len(seq) for seq in sequences)

word_index = tokenizer.word_index
print('Number of unique tokens: ', len(word_index))

# Pad every sequence to the common length and integer-encode the labels.
x_pad = pad_sequences(sequences, maxlen=seq_max_len)
encoder = LabelEncoder()
y_set = encoder.fit_transform(y)

Number of unique tokens:  76519


### Prepare the embedding layer

In [9]:
# One row per tokenizer index (row 0 is never assigned by the loop below and stays zero).
embedding_dim = word2vec_model.vector_size
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    # Words missing from the pre-trained vectors keep their all-zero row.
    if word not in word2vec_model:
        continue
    embedding_matrix[i] = word2vec_model[word]

## Train LSTM, GRU, CNN with the pre-trained embedding layer

In [10]:
from tensorflow.python.keras.layers import Embedding, Dense, LSTM, GRU
from tensorflow.python.keras.layers import Conv1D, MaxPooling1D, Flatten
from tensorflow.python.keras.layers import Dropout, SeparableConv1D, GlobalAveragePooling1D
from tensorflow.python.keras import models, callbacks
from tensorflow.python.keras.optimizers import Adam

In [11]:
# Settings shared by the models trained below.
batch_size = 128
loss = 'sparse_categorical_crossentropy'
hidden_layer_activation = 'relu'
last_layer_activation = 'softmax'
# One output unit per distinct encoded label.
numOfclasses = len(np.unique(y_set))

In [12]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x_pad,
    y_set,
    test_size=0.2,
    random_state=0,
)

In [None]:
#use grid search to choose best options
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier


def create_model(filters, kernel_size, pool_size):
    """Build and compile the single-convolution CNN evaluated by the grid search.

    The embedding layer is initialised from `embedding_matrix` and kept frozen
    (trainable=False); only the convolution and dense layers are trained.

    Args:
        filters: number of Conv1D filters.
        kernel_size: width of the Conv1D kernel.
        pool_size: window size of the MaxPooling1D layer.

    Returns:
        The compiled Sequential model.
    """
    layer_stack = [
        Embedding(len(word_index) + 1, embedding_dim,
                  weights=[embedding_matrix],
                  input_length=seq_max_len, trainable=False),
        Conv1D(filters, kernel_size, activation='relu'),
        MaxPooling1D(pool_size=pool_size),
        Flatten(),
        Dense(units=numOfclasses, activation='softmax'),
    ]
    cnn = models.Sequential(layer_stack)
    cnn.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return cnn

# Wrap the model factory so scikit-learn's GridSearchCV can drive Keras training.
# A dedicated name is used so the notebook-level `model` only ever refers to a Keras model.
cnn_classifier = KerasClassifier(build_fn=create_model, verbose=1)

numOfclasses = len(set(y_set))

# The candidate values live inside the grid itself, not in notebook-level variables.
# Rebinding `batch_size` to a list here would break the later
# model.fit(..., batch_size=batch_size) calls, which expect the scalar set earlier.
param_grid = dict(batch_size=[32, 64, 128],
                  epochs=[1],
                  pool_size=[2, 3, 4],
                  filters=[64, 128, 256],
                  kernel_size=[3, 5, 7])

# Exhaustive search over every combination, scored with 3-fold cross-validation.
grid = GridSearchCV(estimator=cnn_classifier, param_grid=param_grid, cv=3)
grid_result = grid.fit(X_train, y_train)

In [None]:
# Report the best cross-validated score and the parameter combination that achieved it.
best_summary = (grid_result.best_score_, grid_result.best_params_)
print("Best: %f using %s" % best_summary)
# Result recorded from an earlier run:
#{'batch_size': 128, 'epochs': 1, 'filters': 256, 'kernel_size': 3, 'pool_size': 2}

In [13]:
# CNN on top of the frozen pre-trained embedding.
# Adding more layers gave worse results, so a single convolution block is used.
epochs = 5
model = models.Sequential([
    Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix],
              input_length=seq_max_len, trainable=False),
    Conv1D(256, 3, activation=hidden_layer_activation),
    MaxPooling1D(pool_size=2),  # other pool sizes to try: 2, 3, 4, 5
    Flatten(),
    Dense(units=numOfclasses, activation=last_layer_activation),
])
model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])
history = model.fit(
    X_train, y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_test, y_test),
    verbose=1,
)

Train on 504000 samples, validate on 126000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Accuracy increased by 5%.

# Train CNN and allow it to learn the embedding layer

In [14]:
# Same CNN, but the embedding layer gets no pre-trained weights and is not frozen,
# so the embedding is learned during training.
epochs = 2
model = models.Sequential([
    Embedding(len(word_index) + 1, embedding_dim, input_length=seq_max_len),
    Conv1D(256, 3, activation=hidden_layer_activation),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(units=numOfclasses, activation=last_layer_activation),
])
model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])
history = model.fit(
    X_train, y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_test, y_test),
    verbose=1,
)

Train on 504000 samples, validate on 126000 samples
Epoch 1/2
Epoch 2/2
