In [28]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, matthews_corrcoef
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score


!pip install nltk
import nltk
nltk.download('stopwords')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
import pandas as pd 

patent = pd.read_csv('/content/drive/MyDrive/patent_text_test_set_nlp_oct31_2022.csv', sep=',')
patent

Unnamed: 0,patent_id,patent_text
0,3930271,Golf glove A golf glove disclosed extra finger...
1,3930272,Crib leg lock A lock height-adjustable crib pl...
2,3930273,Bed safety side rail arrangement A bed safety ...
3,3930274,Assembly use recreational activities The assem...
4,3930276,Wheel spinning vehicle conveying apparatus aut...
...,...,...
99114,4035655,Method device implantation particles substrate...
99115,4035656,Method apparatus use approaching thermonuclear...
99116,4035657,Ozone generator An ozone generator air pump di...
99117,4035658,High power wind turbine kinetic accumulator A ...


In [16]:
patent2 = pd.read_csv('/content/drive/MyDrive/patents_USPC_map_nlp_oct31_2022.csv', sep=',')
patent2

Unnamed: 0,patent_id,mainclass_id
0,3930271,2
1,3930271,473
2,3930272,5
3,3930272,248
4,3930272,403
...,...,...
186893,4035657,422
186894,4035658,290
186895,4035658,416
186896,4035659,307


In [17]:
len(patent2['mainclass_id'].unique())

330

In [18]:
datasets = []

classes = patent2['mainclass_id'].value_counts().keys()

for c in classes:
    data = patent2.loc[patent2['mainclass_id'] == c].reset_index(drop=True)
    txt, cls = [],[]
    for idx, row in data.iterrows():
        txt.append(patent.loc[patent['patent_id'] == row['patent_id']]['patent_text'].reset_index(drop=True))
        cls.append(row['mainclass_id'])
    temp = pd.DataFrame()
    temp['text'] = txt
    temp['label'] = cls
    datasets.append(temp)
datasets[0]

Unnamed: 0,text,label
0,0 Casing shafts cables A conductor casing s...,428
1,0 Method making saw blade In manufacture sa...,428
2,0 Impact resistant panels An impact resista...,428
3,0 Method fabricating conformable sandwich s...,428
4,0 Color former pressure-sensitive copying p...,428
...,...,...
3322,0 Bonding element separate heating agitatin...,428
3323,"0 Laminated articles prepared transparent, ...",428
3324,0 Interlayer laminated safety glass Disclos...,428
3325,0 Fiber reinforced composite high fracture ...,428


In [19]:
dfs = []
for data in datasets:
    temp = pd.DataFrame()
    sample = patent2.loc[patent2['mainclass_id'] != data['label'][0]].reset_index(drop=True)
    sample = sample.sample(n=len(data))
    txt = []
    for idx, row in sample.iterrows():
        txt.append(patent.loc[patent['patent_id'] == row['patent_id']]['patent_text'].reset_index(drop=True))
    temp['text'] = txt 
    temp['label'] = 0
    data = pd.concat([data, temp])
    dfs.append(data.reset_index(drop=True))
dfs[0]

Unnamed: 0,text,label
0,0 Casing shafts cables A conductor casing s...,428
1,0 Method making saw blade In manufacture sa...,428
2,0 Impact resistant panels An impact resista...,428
3,0 Method fabricating conformable sandwich s...,428
4,0 Color former pressure-sensitive copying p...,428
...,...,...
6649,0 Ring type recovery tool A recovery tool l...,0
6650,0 Insulation system A thermal insulation co...,0
6651,0 Notch filter color transparency copying m...,0
6652,0 Hair curl clip This disclosure relates cl...,0


In [20]:
from nltk.corpus import stopwords
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_0]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = str(text)
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = re.sub('0', '', text)
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text



In [21]:
cleaned = []
for data in dfs:
    data['text'] = data['text'].apply(clean_text)    
    cleaned.append(data)


In [22]:
cleaned[0]

Unnamed: 0,text,label
0,casing shafts cables conductor casing shaftsna...,428
1,method making saw blade manufacture saw blanam...,428
2,impact resistant panels impact resistant panam...,428
3,method fabricating conformable sandwich struct...,428
4,color former pressuresensitive copying paper n...,428
...,...,...
6649,ring type recovery tool recovery tool locatena...,0
6650,insulation system thermal insulation composina...,0
6651,notch filter color transparency copying machin...,0
6652,hair curl clip disclosure relates clip sename ...,0


In [23]:
multinb = {}
i=0
for data in cleaned:
    mnb = MultinomialNB()
    vectorizer = TfidfVectorizer(
        stop_words='english', 
        min_df = 1,
        ngram_range = (1,2),
        use_idf = True,
        max_df = 0.5, 
        smooth_idf=True
    )
    train, test = train_test_split(data, test_size=0.1, random_state=42,shuffle=True)
    x_train = vectorizer.fit_transform(train['text']).toarray()
    y_train = train['label'].values
    x_test = vectorizer.transform(test['text']).toarray()
    y_test = test['label'].values
    multinb[classes[i]] = {
        'x_test': x_test,
        'y_test': y_test,
        'classifier': mnb.fit(x_train, y_train)
    }
    i+=1


In [24]:
supportv = {}
i=0
for data in cleaned:
    svm=LinearSVC(C=0.0001)
    vectorizer = TfidfVectorizer(
        stop_words='english', 
        min_df = 1,
        ngram_range = (1,2),
        use_idf = True,
        max_df = 0.5, 
        smooth_idf=True
    )
    train, test = train_test_split(data, test_size=0.1, random_state=42,shuffle=True)
    x_train = vectorizer.fit_transform(train['text']).toarray()
    y_train = train['label'].values
    x_test = vectorizer.transform(test['text']).toarray()
    y_test = test['label'].values
    supportv[classes[i]] = {
        'x_test': x_test,
        'y_test': y_test,
        'classifier': svm.fit(x_train, y_train)
    }
    i+=1

In [25]:
randomf = {}
i=0
for data in cleaned:
    rf = RandomForestClassifier(n_estimators=30, max_depth=9)
    vectorizer = TfidfVectorizer(
        stop_words='english', 
        min_df = 1,
        ngram_range = (1,2),
        use_idf = True,
        max_df = 0.5, 
        smooth_idf=True
    )
    train, test = train_test_split(data, test_size=0.1, random_state=42,shuffle=True)
    x_train = vectorizer.fit_transform(train['text']).toarray()
    y_train = train['label'].values
    x_test = vectorizer.transform(test['text']).toarray()
    y_test = test['label'].values
    randomf[classes[i]] = {
        'x_test': x_test,
        'y_test': y_test,
        'classifier': rf.fit(x_train, y_train)
    }
    i+=1

In [30]:
'''
Classification reports for 330 Multinomial Naive Bayes 
'''
p_macro, p_micro, p_weighted = [],[],[]
r_macro, r_micro, r_weighted = [],[],[]
f1_macro, f1_micro, f1_weighted = [],[],[]
matt, roc_auc = [],[]

for cls in classes:
    y_pred = multinb[cls]['classifier'].predict(multinb[cls]['x_test'])
    y_true = multinb[cls]['y_test']

    p_macro.append(precision_score(y_true,y_pred, average='macro'))
    p_micro.append(precision_score(y_true,y_pred, average='micro'))
    p_weighted.append(precision_score(y_true,y_pred, average='weighted'))

    r_macro.append(recall_score(y_true,y_pred, average='macro'))
    r_micro.append(recall_score(y_true,y_pred, average='micro'))
    r_weighted.append(recall_score(y_true,y_pred, average='weighted'))

    f1_macro.append(f1_score(y_true,y_pred, average='macro'))
    f1_micro.append(f1_score(y_true,y_pred, average='micro'))
    f1_weighted.append(f1_score(y_true,y_pred, average='weighted'))

    matt.append(matthews_corrcoef(y_true, y_pred))
    roc_auc.append(roc_auc_score(y_true,y_pred))

df = pd.DataFrame(columns=[
    'precision_macro', 
    'precision_micro', 
    'precision_weighted', 
    'recall_macro', 
    'recall_micro', 
    'recall_weighted', 
    'f1_macro', 
    'f1_micro', 
    'f1_weighted', 
    'matthews',
    'roc_auc'
    ])

df['precision_macro']=p_macro
df['precision_micro']=p_micro
df['precision_weighted']=p_weighted
df['recall_macro']=r_macro
df['recall_micro']=r_micro
df['recall_weighted']=r_weighted
df['f1_macro']=f1_macro
df['f1_micro']=f1_micro
df['f1_weighted']=f1_weighted
df['matthews']=matt
df['roc_auc']=roc_auc
df.to_csv('MultiNB_classifier_results.csv', index=False)


In [None]:
'''
Classification reports for 330 Support Vector Machines
'''
p_macro, p_micro, p_weighted = [],[],[]
r_macro, r_micro, r_weighted = [],[],[]
f1_macro, f1_micro, f1_weighted = [],[],[]
matt, roc_auc = [],[]
for cls in classes:
    y_pred = supportv[cls]['classifier'].predict(supportv[cls]['x_test'])
    y_true = supportv[cls]['y_test']
    p_macro.append(precision_score(y_true,y_pred, average='macro'))
    p_micro.append(precision_score(y_true,y_pred, average='micro'))
    p_weighted.append(precision_score(y_true,y_pred, average='weighted'))

    r_macro.append(recall_score(y_true,y_pred, average='macro'))
    r_micro.append(recall_score(y_true,y_pred, average='micro'))
    r_weighted.append(recall_score(y_true,y_pred, average='weighted'))

    f1_macro.append(f1_score(y_true,y_pred, average='macro'))
    f1_micro.append(f1_score(y_true,y_pred, average='micro'))
    f1_weighted.append(f1_score(y_true,y_pred, average='weighted'))

    matt.append(matthews_corrcoef(y_true, y_pred))
    roc_auc.append(roc_auc_score(y_true,y_pred))

df = pd.DataFrame(columns=[
    'precision_macro', 
    'precision_micro', 
    'precision_weighted', 
    'recall_macro', 
    'recall_micro', 
    'recall_weighted', 
    'f1_macro', 
    'f1_micro', 
    'f1_weighted', 
    'matthews',
    'roc_auc'
    ])

df['precision_macro']=p_macro
df['precision_micro']=p_micro
df['precision_weighted']=p_weighted
df['recall_macro']=r_macro
df['recall_micro']=r_micro
df['recall_weighted']=r_weighted
df['f1_macro']=f1_macro
df['f1_micro']=f1_micro
df['f1_weighted']=f1_weighted
df['matthews']=matt
df['roc_auc']=roc_auc
df.to_csv('SVM_classifier_results.csv', index=False)

In [32]:
'''
Classification reports for 330 Random Forest 
'''
p_macro, p_micro, p_weighted = [],[],[]
r_macro, r_micro, r_weighted = [],[],[]
f1_macro, f1_micro, f1_weighted = [],[],[]
matt, roc_auc = [],[]
for cls in classes:
    y_pred = randomf[cls]['classifier'].predict(randomf[cls]['x_test'])
    y_true = randomf[cls]['y_test']
    p_macro.append(precision_score(y_true,y_pred, average='macro'))
    p_micro.append(precision_score(y_true,y_pred, average='micro'))
    p_weighted.append(precision_score(y_true,y_pred, average='weighted'))

    r_macro.append(recall_score(y_true,y_pred, average='macro'))
    r_micro.append(recall_score(y_true,y_pred, average='micro'))
    r_weighted.append(recall_score(y_true,y_pred, average='weighted'))

    f1_macro.append(f1_score(y_true,y_pred, average='macro'))
    f1_micro.append(f1_score(y_true,y_pred, average='micro'))
    f1_weighted.append(f1_score(y_true,y_pred, average='weighted'))

    matt.append(matthews_corrcoef(y_true, y_pred))
    roc_auc.append(roc_auc_score(y_true,y_pred))

df = pd.DataFrame(columns=[
    'precision_macro', 
    'precision_micro', 
    'precision_weighted', 
    'recall_macro', 
    'recall_micro', 
    'recall_weighted', 
    'f1_macro', 
    'f1_micro', 
    'f1_weighted', 
    'matthews',
    'roc_auc'
    ])

df['precision_macro']=p_macro
df['precision_micro']=p_micro
df['precision_weighted']=p_weighted
df['recall_macro']=r_macro
df['recall_micro']=r_micro
df['recall_weighted']=r_weighted
df['f1_macro']=f1_macro
df['f1_micro']=f1_micro
df['f1_weighted']=f1_weighted
df['matthews']=matt
df['roc_auc']=roc_auc
df.to_csv('RF_classifier_results.csv', index=False)