In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the tab-separated keyword table; read_csv already returns a DataFrame.
label_words_df = pd.read_csv('uspto_2m_keywords.tsv', sep='\t')

In [3]:
# Drop the 'Title' column before further processing.
label_words_df = label_words_df.drop(columns=['Title'])

In [4]:
# Load the reference list of valid classes (comma-separated, the read_csv default).
class_df = pd.read_csv('ipc4_descriptions.csv')
class_list = class_df['Class'].tolist()
print ('Number of classes: ', len(class_list))

Number of classes:  633


In [5]:
# Filter rows by whether their label appears in class_list.
# NOTE(review): the mask is negated, so rows whose label IS in class_list are
# dropped and only rows whose label is NOT in it are kept. That looks inverted
# relative to the stated goal of removing invalid labels -- confirm the intent.
label_in_class_list = label_words_df['label'].isin(class_list)
label_words_df = label_words_df[~label_in_class_list]
len(label_words_df)

678887

In [6]:
def combineRows_column(col):
    """Join every string of a pandas Series into one ', '-separated string.

    Missing entries are skipped by `Series.str.cat` when no `na_rep` is given.
    """
    separator = ', '
    joined = col.str.cat(sep=separator)
    return joined
def removeDuplicateWords(words):
    """Split a comma-separated string into a set of unique, trimmed words.

    Args:
        words: String of words separated by commas.

    Returns:
        Set of the distinct words with surrounding whitespace stripped.
        Blank entries (from empty input, doubled or trailing commas) are
        left out, so the empty string never ends up in the result.
    """
    stripped = (piece.strip() for piece in words.split(','))
    # An empty token is not a word; keeping it would add a useless '' entry.
    return {word for word in stripped if word}

In [7]:
# Concatenate the 'noun' entries of all rows into one ', '-separated string.
nouns = combineRows_column(label_words_df['noun'])

In [8]:
# Split the combined string on commas and keep each distinct noun once.
# Note that `nouns` is rebound here from the joined string to a set.
nouns = removeDuplicateWords(nouns)
print ('nouns size: ', len(nouns))

nouns size:  84033


In [9]:
def getVocabulary(words_set):
    """Map each distinct word to a consecutive integer index starting at 0.

    Indices follow the iteration order of `words_set`; a word seen more than
    once keeps the index from its first occurrence.
    """
    vocabulary = {}
    for token in words_set:
        # setdefault leaves an existing entry untouched, so repeats are skipped
        # and len(vocabulary) is always the next unused index.
        vocabulary.setdefault(token, len(vocabulary))
    return vocabulary

In [10]:
# Build the term -> index mapping from the de-duplicated nouns; it is passed
# to TfidfVectorizer as a fixed vocabulary further down.
customize_vocabulary = getVocabulary(nouns)
len(customize_vocabulary)

84033

In [11]:
# Drop every row that has a missing value in any remaining column.
label_words_df = label_words_df.dropna()

In [12]:
# Draw a fixed-size random sample of patents from every class.
size = 500        # number of rows drawn per class
replace = True    # sample with replacement: per the original note, some classes have too few rows otherwise
rng = np.random.RandomState(0)  # created in this cell so re-running it reproduces the same sample


def sample_class_rows(obj):
    """Return `size` rows drawn at random from the rows of one label group."""
    return obj.loc[rng.choice(obj.index, size, replace), :]


fn = sample_class_rows  # previous name of the sampler, kept as an alias

# NOTE(review): sampling with replacement can repeat a row, and the later
# train/test split may place copies of the same row on both sides.
test_df = label_words_df.groupby('label', as_index=False).apply(sample_class_rows)
y = test_df['label']
len(test_df)

315000

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
import time

# Time the TF-IDF vectorization of the sampled abstracts.
start_time = time.time()

dataset = test_df['Abstract']

# The vocabulary is fixed to the noun list built earlier; word n-grams of
# length 1 to 5 are considered when matching against it.
tfidf_vectorizer = TfidfVectorizer(
    vocabulary=customize_vocabulary,
    ngram_range=(1, 5),
)
tfidf_vectors = tfidf_vectorizer.fit_transform(dataset)

elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

--- 104.99544501304626 seconds ---


In [14]:
# (number of documents, number of vocabulary terms) of the TF-IDF matrix
tfidf_vectors.shape

(315000, 84033)

### Feature selection

In [14]:
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_classif, chi2

# Number of features to keep; 10000 and 20000 were also tried.
TOP_K = 15000

# Never ask for more features than the matrix actually has.
n_selected = min(TOP_K, tfidf_vectors.shape[1])

# Rank features by their ANOVA F-score against the labels and keep the best ones.
selector = SelectKBest(f_classif, k=n_selected).fit(tfidf_vectors, y)

# `tfidf_vectors` is overwritten with a dense array of the selected columns.
tfidf_vectors = selector.transform(tfidf_vectors).toarray()

  f = msb / msw
  f = msb / msw


In [19]:
# Shape after feature selection: (number of documents, number of kept features)
tfidf_vectors.shape

(315000, 15000)

### Classifiers

In [15]:
# Hold out 20% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_vectors,
    y,
    test_size=0.2,
    random_state=0,
)

In [16]:
# Encode the string labels as integers.
# The encoder is fitted on the training labels only and then reused for the
# test labels, so both splits share one label -> integer mapping. Re-fitting on
# the test labels would silently shift the codes whenever the test split is
# missing a class. `transform` raises ValueError if the test split contains a
# label that never occurs in the training split.
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

#### Naive Bayes

In [19]:
# Fit a multinomial Naive Bayes classifier and report how long the fit took.
start_time = time.time()
nb = MultinomialNB(alpha=0.001, fit_prior=False)
model = nb.fit(X_train, y_train)
fit_seconds = time.time() - start_time
print("--- %s seconds ---" % fit_seconds)

# Score the held-out split (accuracy does not depend on the argument order).
y_pred = model.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred)
print('Naive Bayes accuracy %s' % nb_accuracy)

--- 186.60967016220093 seconds ---
Naive Bayes accuracy 0.6232539682539683


#### Simple MLP

In [17]:
from tensorflow.python.keras import models, layers, callbacks
from tensorflow.python.keras.optimizers import Adam

def build_mlp_model(units, n_layers, last_layer_activation, hidden_layer_activation, input_shape, dropout_rate, numOfclasses):
    """Assemble an uncompiled Sequential multi-layer perceptron classifier.

    `n_layers` counts the output layer, so the network gets `n_layers - 1`
    hidden blocks (a Dense layer followed by Dropout). With `n_layers == 1` no
    hidden block is created and `units`, `hidden_layer_activation`,
    `input_shape` and `dropout_rate` go unused.

    Args:
        units: Width of every hidden Dense layer.
        n_layers: Total number of Dense layers, output layer included.
        last_layer_activation: Activation of the output layer.
        hidden_layer_activation: Activation of the hidden layers.
        input_shape: Number of input features (passed as `input_dim`).
        dropout_rate: Dropout rate applied after each hidden layer.
        numOfclasses: Number of units in the output layer.

    Returns:
        The assembled Sequential model.
    """
    hidden_blocks = n_layers - 1
    stack = []
    for _ in range(hidden_blocks):
        stack.append(layers.Dense(units=units, activation=hidden_layer_activation,
                                  kernel_initializer='glorot_uniform', input_dim=input_shape))
        stack.append(layers.Dropout(rate=dropout_rate))

    # classification head: one unit per class
    stack.append(layers.Dense(units=numOfclasses, activation=last_layer_activation))

    model = models.Sequential()
    for layer in stack:
        model.add(layer)
    return model

In [39]:
# Baseline MLP run: hyperparameters, model construction and training.
from tensorflow.keras import optimizers as keras_optimizers

numOfclasses = len(test_df.groupby('label'))   # one output unit per sampled class
last_layer_activation = 'softmax'
hidden_layer_activation = 'relu'
input_shape = tfidf_vectors.shape[1]           # number of selected features
dropout_rate = 0.5
loss = 'sparse_categorical_crossentropy'       # labels are integer-encoded
learning_rate = 0.001
epochs = 10
n_layers = 1
units = 30
batch_size = 128

model = build_mlp_model(units, n_layers, last_layer_activation, hidden_layer_activation, input_shape, dropout_rate, numOfclasses)

# Build the optimizer explicitly so `learning_rate` is actually applied; the
# bare string 'adam' always uses the optimizer's default learning rate.
# The value is passed positionally because it is the first parameter of Adam
# whether a given release names it `lr` or `learning_rate`.
optimizer = keras_optimizers.Adam(learning_rate)
model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)

Train on 252000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Tune hyperparameters

1) Adding more layers (e.g., 2 or 3) made the results worse.

2) Increasing the number of units had no effect. (Note: with `n_layers = 1` the model has no hidden layer, so `units` is not used.)

3) Increasing the learning rate did not help much.

4) Varied the number of selected features among 5k, 10k, 15k, and 20k; 15k seems the most effective.

In [20]:
#15k features
from tensorflow.keras import optimizers as keras_optimizers

numOfclasses = len(test_df.groupby('label'))   # one output unit per sampled class
last_layer_activation = 'softmax'
hidden_layer_activation = 'relu'
input_shape = tfidf_vectors.shape[1]           # number of selected features
dropout_rate = 0.5
loss = 'sparse_categorical_crossentropy'       # labels are integer-encoded
learning_rate = 0.1    # applied via the optimizer below; large for Adam, revisit if training diverges
epochs = 100
n_layers = 1
units = 32
batch_size = 128

model = build_mlp_model(units, n_layers, last_layer_activation, hidden_layer_activation, input_shape, dropout_rate, numOfclasses)

# Build the optimizer explicitly so `learning_rate` is actually applied; the
# bare string 'adam' always uses the optimizer's default learning rate.
# The value is passed positionally because it is the first parameter of Adam
# whether a given release names it `lr` or `learning_rate`.
optimizer = keras_optimizers.Adam(learning_rate)
model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])

#use early-stopping: stop once val_loss has not improved for 10 consecutive epochs
# NOTE(review): the test split doubles as validation data here, so the stopping
# point is tuned on the same rows later used to judge the model -- consider a
# separate validation split.
callback_early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='auto')
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), 
                    verbose=1, callbacks=[callback_early_stopping])

Train on 252000 samples, validate on 63000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100


### Notes

This experiment reduced the feature space through feature selection, keeping only 5k–20k features. The classification algorithms performed worse.

We applied a simple multi-layer perceptron to the filtered TF-IDF matrix. The result does not look too bad, considering that we trained for only a few epochs and used early stopping.

We also tried varying the number of layers, the number of units, the batch_size, and the learning rate. Only changing the batch_size affected the accuracy substantially.