In [13]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, matthews_corrcoef
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier


!pip install nltk
import nltk
nltk.download('stopwords')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd 

patent = pd.read_csv('/content/drive/MyDrive/patent_text_test_set_nlp_oct31_2022.csv', sep=',')
patent

Unnamed: 0,patent_id,patent_text
0,3930271,Golf glove A golf glove disclosed extra finger...
1,3930272,Crib leg lock A lock height-adjustable crib pl...
2,3930273,Bed safety side rail arrangement A bed safety ...
3,3930274,Assembly use recreational activities The assem...
4,3930276,Wheel spinning vehicle conveying apparatus aut...
...,...,...
99114,4035655,Method device implantation particles substrate...
99115,4035656,Method apparatus use approaching thermonuclear...
99116,4035657,Ozone generator An ozone generator air pump di...
99117,4035658,High power wind turbine kinetic accumulator A ...


In [4]:
patent2 = pd.read_csv('/content/drive/MyDrive/patents_USPC_map_nlp_oct31_2022.csv', sep=',')
patent2

Unnamed: 0,patent_id,mainclass_id
0,3930271,2
1,3930271,473
2,3930272,5
3,3930272,248
4,3930272,403
...,...,...
186893,4035657,422
186894,4035658,290
186895,4035658,416
186896,4035659,307


In [5]:
len(patent2['mainclass_id'].unique())

330

In [6]:
datasets = []

classes = patent2['mainclass_id'].value_counts().keys()

for c in classes:
    data = patent2.loc[patent2['mainclass_id'] == c].reset_index(drop=True)
    txt, cls = [],[]
    for idx, row in data.iterrows():
        txt.append(patent.loc[patent['patent_id'] == row['patent_id']]['patent_text'].reset_index(drop=True))
        cls.append(row['mainclass_id'])
    temp = pd.DataFrame()
    temp['text'] = txt
    temp['label'] = cls
    datasets.append(temp)
datasets[0]

Unnamed: 0,text,label
0,0 Casing shafts cables A conductor casing s...,428
1,0 Method making saw blade In manufacture sa...,428
2,0 Impact resistant panels An impact resista...,428
3,0 Method fabricating conformable sandwich s...,428
4,0 Color former pressure-sensitive copying p...,428
...,...,...
3322,0 Bonding element separate heating agitatin...,428
3323,"0 Laminated articles prepared transparent, ...",428
3324,0 Interlayer laminated safety glass Disclos...,428
3325,0 Fiber reinforced composite high fracture ...,428


In [7]:
dfs = []
for data in datasets:
    temp = pd.DataFrame()
    sample = patent2.loc[patent2['mainclass_id'] != data['label'][0]].reset_index(drop=True)
    sample = sample.sample(n=len(data))
    txt = []
    for idx, row in sample.iterrows():
        txt.append(patent.loc[patent['patent_id'] == row['patent_id']]['patent_text'].reset_index(drop=True))
    temp['text'] = txt 
    temp['label'] = 0
    data = pd.concat([data, temp])
    dfs.append(data.reset_index(drop=True))
dfs[0]

Unnamed: 0,text,label
0,0 Casing shafts cables A conductor casing s...,428
1,0 Method making saw blade In manufacture sa...,428
2,0 Impact resistant panels An impact resista...,428
3,0 Method fabricating conformable sandwich s...,428
4,0 Color former pressure-sensitive copying p...,428
...,...,...
6649,0 Tong type recovery tool A recovery tool d...,0
6650,0 Speed limiting governor internal combusti...,0
6651,0 Holding temperature metal melts specified...,0
6652,0 Analog A-C storage circuit employing high...,0


In [8]:
from nltk.corpus import stopwords
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_0]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = str(text)
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = re.sub('0', '', text)
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text



In [9]:
cleaned = []
for data in dfs:
    data['text'] = data['text'].apply(clean_text)    
    cleaned.append(data)


In [10]:
cleaned[0]

Unnamed: 0,text,label
0,casing shafts cables conductor casing shaftsna...,428
1,method making saw blade manufacture saw blanam...,428
2,impact resistant panels impact resistant panam...,428
3,method fabricating conformable sandwich struct...,428
4,color former pressuresensitive copying paper n...,428
...,...,...
6649,tong type recovery tool recovery tool designna...,0
6650,speed limiting governor internal combustion en...,0
6651,holding temperature metal melts specified comp...,0
6652,analog ac storage circuit employing high gainn...,0


In [14]:
multinb = {}
i=0
for data in cleaned:
    mnb = MultinomialNB()
    vectorizer = TfidfVectorizer(
        stop_words='english', 
        min_df = 1,
        ngram_range = (1,2),
        use_idf = True,
        max_df = 0.5, 
        smooth_idf=True
    )
    train, test = train_test_split(data, test_size=0.1, random_state=42,shuffle=True)
    x_train = vectorizer.fit_transform(train['text']).toarray()
    y_train = train['label'].values
    x_test = vectorizer.transform(test['text']).toarray()
    y_test = test['label'].values
    multinb[classes[i]] = {
        'x_test': x_test,
        'y_test': y_test,
        'classifier': mnb.fit(x_train, y_train)
    }
    i+=1


In [16]:
supportv = {}
i=0
for data in cleaned:
    svm=LinearSVC(C=0.0001)
    vectorizer = TfidfVectorizer(
        stop_words='english', 
        min_df = 1,
        ngram_range = (1,2),
        use_idf = True,
        max_df = 0.5, 
        smooth_idf=True
    )
    train, test = train_test_split(data, test_size=0.1, random_state=42,shuffle=True)
    x_train = vectorizer.fit_transform(train['text']).toarray()
    y_train = train['label'].values
    x_test = vectorizer.transform(test['text']).toarray()
    y_test = test['label'].values
    supportv[classes[i]] = {
        'x_test': x_test,
        'y_test': y_test,
        'classifier': svm.fit(x_train, y_train)
    }
    i+=1

In [17]:
randomf = {}
i=0
for data in cleaned:
    rf = RandomForestClassifier(n_estimators=30, max_depth=9)
    vectorizer = TfidfVectorizer(
        stop_words='english', 
        min_df = 1,
        ngram_range = (1,2),
        use_idf = True,
        max_df = 0.5, 
        smooth_idf=True
    )
    train, test = train_test_split(data, test_size=0.1, random_state=42,shuffle=True)
    x_train = vectorizer.fit_transform(train['text']).toarray()
    y_train = train['label'].values
    x_test = vectorizer.transform(test['text']).toarray()
    y_test = test['label'].values
    randomf[classes[i]] = {
        'x_test': x_test,
        'y_test': y_test,
        'classifier': rf.fit(x_train, y_train)
    }
    i+=1

In [18]:
'''
Classification reports for 330 Multinomial Naive Bayes 
'''
for cls in classes:
    print('******************************')
    print(f'Report for class {cls}')
    y_pred = multinb[cls]['classifier'].predict(multinb[cls]['x_test'])
    y_true = multinb[cls]['y_test']
    print(classification_report(y_true,y_pred))
    # ROC AUC
    auc = roc_auc_score(y_true,y_pred)
    print('ROC AUC: %f' % auc)
    #mcc
    mcc = matthews_corrcoef(y_true, y_pred)
    print('Matthews Corrcoef: %f' % mcc)
    # confusion matrix
    matrix = confusion_matrix(y_true,y_pred)
    print('Confusion Matrix:')
    print(matrix)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
   macro avg       0.90      0.88      0.89       322
weighted avg       0.90      0.89      0.89       322

ROC AUC: 0.879230
Matthews Corrcoef: 0.778912
Confusion Matrix:
[[109  27]
 [  8 178]]
******************************
Report for class 528
              precision    recall  f1-score   support

           0       0.90      0.82      0.86       141
         528       0.86      0.93      0.89       178

    accuracy                           0.88       319
   macro avg       0.88      0.87      0.87       319
weighted avg       0.88      0.88      0.88       319

ROC AUC: 0.871285
Matthews Corrcoef: 0.752376
Confusion Matrix:
[[115  26]
 [ 13 165]]
******************************
Report for class 544
              precision    recall  f1-score   support

           0       0.77      0.87      0.82       141
         544       0.89      0.79      0.84       175

    accuracy                           0.83       316
   

In [20]:
'''
Classification reports for 330 Support Vector Machines
'''
for cls in classes:
    print('******************************')
    print(f'Report for class {cls}')
    y_pred = supportv[cls]['classifier'].predict(supportv[cls]['x_test'])
    y_true = supportv[cls]['y_test']
    print(classification_report(y_true,y_pred, zero_division=1))
    # ROC AUC
    auc = roc_auc_score(y_true,y_pred)
    print('ROC AUC: %f' % auc)
    #mcc
    mcc = matthews_corrcoef(y_true, y_pred)
    print('Matthews Corrcoef: %f' % mcc)
    # confusion matrix
    matrix = confusion_matrix(y_true,y_pred)
    print('Confusion Matrix:')
    print(matrix)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
   macro avg       0.71      0.51      0.32       322
weighted avg       0.76      0.43      0.28       322

ROC AUC: 0.510753
Matthews Corrcoef: 0.095902
Confusion Matrix:
[[136   0]
 [182   4]]
******************************
Report for class 528
              precision    recall  f1-score   support

           0       0.44      1.00      0.62       141
         528       1.00      0.01      0.02       178

    accuracy                           0.45       319
   macro avg       0.72      0.51      0.32       319
weighted avg       0.75      0.45      0.28       319

ROC AUC: 0.505618
Matthews Corrcoef: 0.070694
Confusion Matrix:
[[141   0]
 [176   2]]
******************************
Report for class 544
              precision    recall  f1-score   support

           0       0.45      1.00      0.62       141
         544       1.00      0.01      0.01       175

    accuracy                           0.45       316
   

In [21]:
'''
Classification reports for 330 Random Forest 
'''
for cls in classes:
    print('******************************')
    print(f'Report for class {cls}')
    y_pred = randomf[cls]['classifier'].predict(randomf[cls]['x_test'])
    y_true = randomf[cls]['y_test']
    print(classification_report(y_true,y_pred))
    # ROC AUC
    auc = roc_auc_score(y_true,y_pred)
    print('ROC AUC: %f' % auc)
    #mcc
    mcc = matthews_corrcoef(y_true, y_pred)
    print('Matthews Corrcoef: %f' % mcc)
    # confusion matrix
    matrix = confusion_matrix(y_true,y_pred)
    print('Confusion Matrix:')
    print(matrix)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
   macro avg       0.78      0.79      0.77       322
weighted avg       0.80      0.77      0.77       322

ROC AUC: 0.785539
Matthews Corrcoef: 0.570526
Confusion Matrix:
[[123  13]
 [ 62 124]]
******************************
Report for class 528
              precision    recall  f1-score   support

           0       0.64      0.91      0.75       141
         528       0.90      0.60      0.72       178

    accuracy                           0.74       319
   macro avg       0.77      0.76      0.74       319
weighted avg       0.78      0.74      0.73       319

ROC AUC: 0.755200
Matthews Corrcoef: 0.525038
Confusion Matrix:
[[129  12]
 [ 72 106]]
******************************
Report for class 544
              precision    recall  f1-score   support

           0       0.63      0.87      0.73       141
         544       0.84      0.59      0.69       175

    accuracy                           0.71       316
   