In [3]:
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In this section, we use the pretrained word2vec model that was trained on the 6M patent dataset.

In [4]:
# Load the pretrained embeddings from a binary word2vec-format file.
word2vec_path = '../../../NLP_Patent_Project/model_ver2.bin'
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

Use the average vector to represent each abstract.

In [5]:
def get_mean_vector(word2vec_model, words):
    """Average the embedding vectors of the in-vocabulary words.

    Parameters
    ----------
    word2vec_model : mapping-like embedding model
        Must support ``word in word2vec_model`` and indexing with a list of
        words (``word2vec_model[[w1, w2, ...]]``) that yields one row per word.
    words : iterable of str
        Candidate words; those unknown to the model are ignored.

    Returns
    -------
    numpy.ndarray or list
        The element-wise mean of the known words' vectors, or an empty list
        ``[]`` when none of the words is known to the model. Callers must
        handle the empty case before stacking results into a matrix.
    """
    # Membership is tested on the model itself rather than on its `.vocab`
    # attribute: `.vocab` no longer exists in gensim >= 4, while `in` works on
    # KeyedVectors in both old and new releases.
    known_words = [word for word in words if word in word2vec_model]
    if len(known_words) >= 1:
        return np.mean(word2vec_model[known_words], axis=0)
    else:
        return []

Load the prepared keywords from the tab-separated (TSV) file.

In [6]:
# Load the prepared keyword table (tab-separated).
# Original author's note: "for the notebook crash" -- presumably this cell is
# kept separate so the data can be reloaded after a kernel crash; confirm.
keywords_df = pd.read_csv('uspto_2m_keywords.tsv', sep='\t')

In [7]:
# Preview the first rows of the loaded keyword table.
keywords_df.head()

Unnamed: 0,label,Abstract,Title,pid,noun,verb,phrases
0,G05B,an apparatus for generating a saddle shaped tr...,saddle shaped trajectory generator for two int...,8536817,"apparatus, saddle, trajectory, intersection, c...","generate, shape, motorize, connect, intersect,...","two cylindrical conduit, motorize axial module..."
1,H01L,an apparatus for generating a saddle shaped tr...,saddle shaped trajectory generator for two int...,8536817,"apparatus, saddle, trajectory, intersection, c...","generate, shape, motorize, connect, intersect,...","two cylindrical conduit, motorize axial module..."
2,A61M,the present invention provides apparatus and m...,balloon insertion apparatus and method of seal...,8382794,"apparatus, tissue, introducer, sheath, side, p...","provide, close, puncture, enable, seal","internal tissue puncture site, introducer shea..."
3,A01K,a restraint system for an animal comprising a ...,retractable leash and restraint assembly,8474414,"restraint, system, animal, collar, assembly, l...","comprise, adapt, secure, define, connect","extended configuration, restraint assembly, st..."
4,B29C,a container or tray having various features th...,container having a rim or other feature encaps...,8540111,"container, tray, feature, comprise, sidewall, ...","correspond, connect, mold, encapsulate, extend...","bottom surface, top surface, second region con..."


Remove invalid classes

In [8]:
# Read the list of valid classes from the IPC description table.
class_df = pd.read_csv('ipc4_descriptions.csv', sep=',')
class_list = class_df['Class'].tolist()

# NOTE(review): the mask is negated, so the rows that are KEPT are those whose
# label is NOT in `class_list`. If `class_list` really holds the valid classes,
# this removes the valid rows instead of the invalid ones -- confirm the intent
# (and the contents of the 'Class' column) before relying on the result.
label_is_listed = keywords_df['label'].isin(class_list)
vaild_df = keywords_df[~label_is_listed]
len(vaild_df)

678887

In this experiment, we only keep noun and verb columns for creating vectors for each patent

In [9]:
# Discard the columns that are not used to build the feature vectors.
unused_columns = ['Title', 'Abstract', 'pid', 'phrases']
label_words_df = vaild_df.drop(columns=unused_columns)
label_words_df.head()

Unnamed: 0,label,noun,verb
1,H01L,"apparatus, saddle, trajectory, intersection, c...","generate, shape, motorize, connect, intersect,..."
5,B65D,"container, tray, feature, comprise, sidewall, ...","correspond, connect, mold, encapsulate, extend..."
8,A61K,"compound, pharmaceutical, composition, cancer,...","screen, identify, treat, prevent, disclose, af..."
10,H01L,"pattern, structure, insulating, interlayer, su...","form, etch, extend, block, electroplate, polis..."
17,G07C,"information, event, combustion, engine, igniti...","log, arrange, implement, comprise, incremente,..."


Combine noun and verb columns and generate words list

In [10]:
# Join the noun and verb keyword strings into one comma-separated string per row.
# A row with a missing noun or verb value gets a missing `words` value as well.
label_words_df['words'] = label_words_df['noun'] + ', ' + label_words_df['verb']

In [11]:
# Drop every row that has a missing value in any of the remaining columns.
label_words_df = label_words_df.dropna()

Randomly choose a fixed number of patents for each class (sampling with replacement).

In [12]:
size = 1000     # number of patents drawn for each class
replace = True  # must sample with replacement: some classes have fewer than `size` patents

# Seeded generator so that the drawn sample -- and therefore every score
# computed further down -- is reproducible across kernel restarts.
rng = np.random.RandomState(0)

# For one group, pick `size` row labels at random and return those rows.
fn = lambda obj: obj.loc[rng.choice(obj.index, size, replace), :]

# NOTE(review): because rows are drawn with replacement BEFORE the train/test
# split, identical rows can land on both sides of the split, which inflates the
# test accuracy of models that memorise samples. Consider splitting first and
# resampling only the training part.
sub_df = label_words_df.groupby('label', as_index=False).apply(fn)
y = sub_df['label']
len(sub_df)

630000

In [13]:
# Build one feature vector per sampled patent: the mean embedding of its
# distinct keywords.
vector_size = word2vec_model.vector_size
abstr_vectors = []
# Read the `words` column by name instead of by tuple position, so the loop
# does not silently pick another column if the frame layout changes.
for words_str in sub_df['words']:
    words = set(x.strip() for x in words_str.split(','))
    vec = get_mean_vector(word2vec_model, words)
    if len(vec) == 0:
        # No keyword is in the vocabulary: get_mean_vector returns an empty
        # result, which would make the stacked array ragged. A zero vector
        # keeps the row aligned with `y` and keeps the matrix rectangular.
        vec = np.zeros(vector_size, dtype=np.float32)
    abstr_vectors.append(vec)
input_vectors = np.array(abstr_vectors)

In [14]:
# Sanity check: one feature vector was appended per row of sub_df.
len(input_vectors)

630000

### Classifiers

In this experiment, we cannot use Naive Bayes classifier, since the vectors contain negative values.

Our word2vec model uses only 100 dimensions, which might lead to poor results.

The logistic regression method is better than the others. We should also try it with a TF-IDF matrix.

Future work:

1) Use Doc2vec to represent features, with the same classifiers, and compare the results.

2) Train better word2vec models.

In [15]:
# Hold out 20% of the sampled patents for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    input_vectors,
    y,
    test_size=0.2,
    random_state=0,
)

In [16]:
# Map class labels to integer ids.
# The encoder is fitted exactly once, on the full label column, and then only
# *applied* to each split. Refitting on the test labels (fit_transform) would
# build a second, independent mapping that can disagree with the training one
# whenever the two splits do not contain exactly the same set of classes.
encoder = LabelEncoder()
encoder.fit(y)
y_train = encoder.transform(y_train)
y_test = encoder.transform(y_test)

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Train a class-balanced random forest and time the fit.
start_time = time.time()
rfc = RandomForestClassifier(
    n_estimators=20,
    class_weight='balanced',
    random_state=9,
    n_jobs=8,
)
model = rfc.fit(X_train, y_train)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

# Accuracy on the held-out split.
y_pred = model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)
print('Random Forest accuracy %s' % rf_accuracy)

--- 288.45141196250916 seconds ---
Random Forest accuracy 0.719563492063492


In [17]:
from sklearn.linear_model import SGDClassifier

# Train a linear model with stochastic gradient descent and time the fit.
start_time = time.time()
sgd = SGDClassifier(random_state=99, n_jobs=4)
model = sgd.fit(X_train, y_train)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

# Accuracy on the held-out split.
y_pred = model.predict(X_test)
sgd_accuracy = accuracy_score(y_test, y_pred)
print('SVM_SGD accuracy %s' % sgd_accuracy)



--- 139.00281810760498 seconds ---
SVM_SGD accuracy 0.267484126984127


In [None]:
from sklearn.svm import LinearSVC

# Train a linear support vector classifier with default settings and time the fit.
start_time = time.time()
svc = LinearSVC()
model = svc.fit(X_train, y_train)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

# Accuracy on the held-out split.
y_pred = model.predict(X_test)
svc_accuracy = accuracy_score(y_test, y_pred)
print('SVM accuracy %s' % svc_accuracy)

In [18]:
from sklearn.linear_model import LogisticRegression

# Train a multinomial logistic regression with the SAG solver and time the fit.
start_time = time.time()
lrc = LogisticRegression(
    C=0.5,
    solver='sag',
    multi_class='multinomial',
    random_state=9,
    n_jobs=8,
)
model = lrc.fit(X_train, y_train)
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

# Accuracy on the held-out split.
y_pred = model.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred)
print('Logistic regression accuracy %s' % lr_accuracy)



--- 12250.07357120514 seconds ---
Logistic regression accuracy 0.11524758610006712


In [23]:
from tensorflow.python.keras import models, layers, callbacks
from tensorflow.python.keras.optimizers import Adam

def build_mlp_model(units, n_layers, last_layer_activation, hidden_layer_activation, input_shape, dropout_rate, numOfclasses):
    """Assemble a feed-forward classifier as a Keras ``Sequential`` model.

    ``n_layers`` counts the output layer: ``n_layers - 1`` hidden blocks
    (a Dense layer followed by Dropout) are stacked before the final Dense
    layer, so ``n_layers == 1`` yields a model made of the output layer only.

    Parameters:
        units: width of every hidden Dense layer.
        n_layers: total number of Dense layers, output layer included.
        last_layer_activation: activation of the output layer.
        hidden_layer_activation: activation of the hidden layers.
        input_shape: passed as ``input_dim`` to each hidden Dense layer.
        dropout_rate: rate of the Dropout layer after each hidden Dense layer.
        numOfclasses: number of units in the output layer.

    Returns:
        The (uncompiled) ``Sequential`` model.
    """
    network = models.Sequential()

    # hidden blocks: Dense + Dropout, repeated n_layers - 1 times
    hidden_count = n_layers - 1
    for _block_idx in range(hidden_count):
        dense = layers.Dense(
            units=units,
            activation=hidden_layer_activation,
            kernel_initializer='glorot_uniform',
            input_dim=input_shape,
        )
        network.add(dense)
        network.add(layers.Dropout(rate=dropout_rate))

    # output layer
    output_layer = layers.Dense(units=numOfclasses, activation=last_layer_activation)
    network.add(output_layer)
    return network

# --- Hyper-parameters of the MLP experiment ---
numOfclasses = sub_df['label'].nunique()   # one output unit per distinct class
last_layer_activation = 'softmax'
hidden_layer_activation = 'relu'
input_shape = X_train.shape[1]             # dimensionality of the feature vectors
dropout_rate = 0.5
loss = 'sparse_categorical_crossentropy'   # labels are integer class ids
learning_rate = 0.01
epochs = 10
n_layers = 1                               # counts the output layer: 1 means no hidden layer
units = 32
batch_size = 128

model = build_mlp_model(units, n_layers, last_layer_activation, hidden_layer_activation, input_shape, dropout_rate, numOfclasses)

# Use the configured learning rate. Previously the optimizer was given as the
# string 'adam', so `learning_rate` (and the imported Adam class) was silently
# ignored and the optimizer's default rate was used instead.
model.compile(loss=loss, optimizer=Adam(learning_rate), metrics=['accuracy'])

#use early-stopping
# The callback is now actually passed to fit(); before, it was created but
# never used.
# NOTE(review): with patience (10) >= epochs (10) early stopping cannot trigger;
# lower the patience or raise the number of epochs for it to have an effect.
# NOTE(review): the test split doubles as validation data here -- confirm that
# this is intended.
callback_early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='auto')
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test),
                    callbacks=[callback_early_stopping], verbose=1)

Train on 504000 samples, validate on 126000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
