## Patent Analysis

* Airody Mohandas Pai
* Chaitali Suhas Bagwe
* Rakshit Bhat
* Pranav Jayant Kulkarni

***

#### Setting up external libraries 

1. <a href="https://pypi.org/project/yake/" target="_blank">YAKE</a> : Unsupervised Approach for Automatic Keyword Extraction using Text Features.
2. <a href="https://pypi.org/project/german-nouns/" target="_blank">german-nouns</a> : A comma-separated list of ~100 thousand German nouns and their grammatical properties.

In [1]:
# Install the external libraries into the environment of the running kernel.
# Comment this cell out once the packages are installed.
# Reference for yake - https://github.com/LIAAD/yake

import os
import subprocess
import sys

# Every installer is started through the interpreter that backs this kernel
# (sys.executable), so the packages land in the environment the notebook
# actually imports from. check_call raises CalledProcessError when a step
# fails instead of silently returning a non-zero exit code.
for install_args in (
    ['-m', 'pip', 'install', 'git+https://github.com/LIAAD/yake'],
    ['-m', 'pip', 'install', 'german-nouns'],
    ['-m', 'spacy', 'download', 'de_core_news_sm'],
):
    subprocess.check_call([sys.executable] + install_args)

0

***
#### Import Library

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import re
import spacy
import yake

from german_nouns.lookup import Nouns
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Beware of tensorflow error!

***
#### Loading the dataset

In [3]:
# Load the training workbook into a DataFrame.
# NOTE(review): the recorded output of this cell is an ImportError asking for
# `xlrd`; an Excel reader package has to be installed before this cell can run.
dataset = pd.read_excel('Phoenix_Contact_Makeathon_2022_Trainingsdatensatz.xlsx')
# Show the first rows as the cell output.
dataset.head()

ImportError: Missing optional dependency 'xlrd'. Install xlrd >= 1.0.0 for Excel support Use pip or conda to install xlrd.

In [None]:
# Load the separate test workbook into its own DataFrame.
testdataset = pd.read_excel('Phoenix_Contact_Makeathon_2022_Testdatensatz.xlsx')
# Show the first rows as the cell output.
testdataset.head()

***
### Text Preprocessing

#### 1) Setting up stop-words and stemmer
Stop Words : Removing commonly used german stop words like *und*, *viel*, etc.

Stemmer : Reducing the word to its word stem; *verbindung* -> *verbind*

Part of Speech : Identify compound nouns; *batterieladegerat* -> *batterie + lade + gerat*

In [None]:
# Snowball stemmer configured for German; clean_text calls stemmer.stem on every kept token.
stemmer = SnowballStemmer("german")

In [None]:
# Start from NLTK's German stop-word list.
stop_words = stopwords.words("german")
# Add custom stop words (frequently occurring but adding no value).
# 'zumindest' and 'wenigstens' replace the former misspellings 'zumindestest'
# and 'wenigtens', which could not match the real words in the texts.
stop_words += ['non', 'referencing', 'claim', 'wobei', 'mindestens', 'erste', 'ersten', 'erstem', 'zweite', 'zumindest', 'jeweils', 'zwei', 'wenigstens']

In [None]:
# Noun lookup from the german-nouns package; clean_text calls nouns.parse_compound on tokens.
nouns = Nouns()
# German spaCy pipeline; clean_text reads each token's part-of-speech tag (pos_) from it.
nlp = spacy.load('de_core_news_sm')

***
#### Initializing YAKE

In [None]:
# YAKE settings: keywords of at most one word, deduplication limit 0.9,
# keep the 30 best keywords, window size 1.
max_ngram_size = 1
deduplication_threshold = 0.9
num_of_keywords = 30
window_size = 1

# Shared extractor used by extract_keywords.
# NOTE(review): no language argument is passed, so YAKE presumably uses its
# default language setting — confirm that this is intended for German text.
kw_extractor = yake.KeywordExtractor(n=max_ngram_size, dedupLim=deduplication_threshold, top=num_of_keywords, windowsSize=window_size)

#### 2) Cleaning the text
Removing white spaces, html tags, numbers, special characters, punctuations and stop words and then finding stem of each word.

In [None]:
def clean_text(text):
    """Normalise a raw patent text into a single-space separated string of stems.

    Processing steps, in order:
      1. Replace HTML tags, every character outside A-Z / a-z / À-ž / space,
         and stand-alone single letters with a space; collapse whitespace runs.
      2. Tokenise with NLTK's ``word_tokenize`` and lower-case the tokens.
      3. Tag the tokens with the spaCy pipeline ``nlp`` and try to split
         nouns, verbs and adjectives into parts via ``nouns.parse_compound``.
      4. Drop tokens found in ``stop_words``, then stem the rest with ``stemmer``.

    Relies on the notebook-level objects ``nlp``, ``nouns``, ``stemmer`` and
    ``stop_words``.

    Parameters
    ----------
    text : str
        Raw text, possibly containing HTML markup.

    Returns
    -------
    str
        The stems joined by single spaces; an empty string if nothing remains.
    """
    # remove white spaces, html tags, numbers, special characters, punctuations
    tags_pattern = re.compile(r"<[^>]+>")
    non_letter_pattern = re.compile(r"[^A-Za-zÀ-ž ]", re.IGNORECASE)
    single_char_pattern = re.compile(r"\b[A-Za-zÀ-ž]\b", re.IGNORECASE)
    whitespace_pattern = re.compile(r"\s+", re.IGNORECASE)

    # Order matters: tags first (they contain letters), whitespace collapse last.
    text = tags_pattern.sub(" ", text)
    text = non_letter_pattern.sub(" ", text)
    text = single_char_pattern.sub(" ", text)
    text = whitespace_pattern.sub(" ", text)

    lowered_tokens = [token.lower() for token in word_tokenize(text)]

    # split compound nouns
    doc = nlp(' '.join(lowered_tokens))
    pieces = []
    for token in doc:
        if token.pos_ in ('NOUN', 'VERB', 'ADJ'):
            parts = nouns.parse_compound(token.orth_)
            # Only accept the lookup result when it really produced a split.
            if len(parts) > 1:
                pieces.extend(parts)
                continue
        pieces.append(token.orth_)

    # Re-split on whitespace so that no empty token survives; the previous
    # implementation stemmed and emitted empty strings, which left stray
    # double spaces in the returned text.
    candidate_words = " ".join(pieces).split()

    # Set lookup instead of scanning the stop-word list for every token.
    stop_word_set = set(stop_words)

    # perform stemming on each word
    stems = [stemmer.stem(word) for word in candidate_words if word not in stop_word_set]

    return " ".join(stems)

***
#### Dataset Split
Performing an 89:11 (3200:400) split of the labelled data into training and validation sets, respectively.


In [None]:
# Splitting the dataset into 3200:400 balanced entries.
# `stratify` keeps the IPC sub-class proportions the same in both parts;
# random_state=42 makes the split repeatable.

X_train, X_test, y_train, y_test = train_test_split(dataset[['Zusammenfassung', 'Unabhängige Patentansprüche', 'IPC-Hauptklasse']], dataset['IPC-Unterklasse'], test_size=0.1111, random_state=42, stratify=dataset['IPC-Unterklasse'])

# Re-attach the sub-class labels so each frame carries the texts and both label columns.
train_df = X_train.join(y_train)
valdf = X_test.join(y_test)
# NOTE: this is an alias, not a copy — ext_testset and testdataset are the same object.
ext_testset = testdataset

In [None]:
def generate_input_text(pd_df):
    """Build the model input text from the abstract and the independent claims.

    Parameters
    ----------
    pd_df : pandas.DataFrame
        Must contain the columns 'Zusammenfassung',
        'Unabhängige Patentansprüche', 'IPC-Hauptklasse' and 'IPC-Unterklasse'.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame with the columns 'inputText',
        'IPC-Hauptklasse' and 'IPC-Unterklasse' (in that order). The frame
        passed in is left unmodified.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # A single space between the two fields keeps the last word of the
    # abstract from fusing with the first word of the claims.
    input_text = pd_df['Zusammenfassung'] + ' ' + pd_df['Unabhängige Patentansprüche']

    # Work on a copy so that neither the caller's frame is mutated nor later
    # column assignments on the result hit a view of the original data.
    result = pd_df[["IPC-Hauptklasse", "IPC-Unterklasse"]].copy()
    result.insert(0, 'inputText', input_text)
    return result

In [None]:
def apply_method_to_column(pd_df, col, meth):
    """Apply `meth` to every value of column `col` of `pd_df`.

    Returns the resulting Series; `pd_df` itself is not modified.
    """
    selected_column = pd_df[col]
    transformed = selected_column.apply(meth)
    return transformed

In [None]:
def extract_keywords(string):
    """Return the keywords extracted from `string`, joined by single spaces.

    Uses the notebook-level `kw_extractor`. Each result is a 2-tuple whose
    first element is the keyword; the second element (presumably its score)
    is dropped.
    """
    ranked = kw_extractor.extract_keywords(string)
    return " ".join(term for (term, _) in ranked)


In [None]:
def run_XGB_model(train_input, train_output, valinput):
    """Fit a gradient boosting classifier and predict labels for `valinput`.

    Despite the name, the estimator is scikit-learn's
    HistGradientBoostingClassifier with default settings.

    Parameters
    ----------
    train_input : feature rows used for fitting.
    train_output : labels matching `train_input`.
    valinput : feature rows to predict.

    Returns
    -------
    The predictions returned by the fitted classifier's `predict`.
    """
    # Imported locally because the notebook only imports this class in a much
    # later cell; without it a fresh "Restart & Run All" fails with NameError.
    from sklearn.ensemble import HistGradientBoostingClassifier

    clf = HistGradientBoostingClassifier()
    clf.fit(train_input, train_output)
    return clf.predict(valinput)

In [None]:
def print_statistics(actual, predicted):
    """Print accuracy, precision, recall and F1 score for the predictions.

    Precision, recall and F1 are computed with 'weighted' averaging;
    nothing is returned.
    """
    averaging = 'weighted'
    print("Accuracy:", metrics.accuracy_score(actual, predicted))
    averaged_scorers = (
        ("Precision:", metrics.precision_score),
        ("Recall:", metrics.recall_score),
        ("F1 score:", metrics.f1_score),
    )
    for label, scorer in averaged_scorers:
        print(label, scorer(actual, predicted, average=averaging))

In [None]:
def generate_sub_class_input(input_list, copy_list):
    """Prefix every feature row with the last digit of its main-class label.

    For position i the result row is
    ``[int(input_list[i][-1])] + elements of copy_list[i]``.
    The rows of `copy_list` are copied, not modified.
    """
    # Convert all labels first so an invalid label fails before any row is built.
    class_digits = [int(label[-1]) for label in input_list]

    combined_rows = []
    for position, class_digit in enumerate(class_digits):
        row = [class_digit]
        row.extend(copy_list[position].copy())
        combined_rows.append(row)

    return combined_rows

In [None]:
def find_mis_match_classes(main_preds, subclass_preds):
    """Print the predictions whose sub class does not contain the main class.

    For every position the main-class string is searched inside the
    sub-class string; positions where it is absent are collected as
    (sub class, main class) tuples. The list is printed only when it is
    non-empty. Nothing is returned.
    """
    mismatches = [
        (subclass_preds[position], main_preds[position])
        for position in range(len(subclass_preds))
        if subclass_preds[position].find(main_preds[position]) == -1
    ]

    if mismatches:
        print(mismatches)

In [None]:
# Reduce each frame to the columns returned by generate_input_text:
# the combined input text plus the two IPC label columns.
train_df = generate_input_text(train_df)
valdf = generate_input_text(valdf)
test_df = generate_input_text(ext_testset)

In [None]:
# Run clean_text over every combined text and store the result in a new
# 'cleaned_text' column of each frame.
train_df['cleaned_text'] = apply_method_to_column(train_df, 'inputText', clean_text)
valdf['cleaned_text'] = apply_method_to_column(valdf, 'inputText', clean_text)
test_df['cleaned_text'] = apply_method_to_column(test_df, 'inputText', clean_text)

In [None]:
# Optional YAKE keyword extraction on the cleaned text (currently disabled):
# train_df['yake_keywords'] = apply_method_to_column(train_df, 'cleaned_text', extract_keywords)
# valdf['yake_keywords'] = apply_method_to_column(valdf, 'cleaned_text', extract_keywords)

# Show the first rows of the prepared external test frame.
test_df.head()

***
#### Feature extraction using TF-IDF
TF-IDF measures how relevant a word is to a single document relative to the entire collection of documents.

In [None]:
# TF-IDF vectoriser created without arguments, i.e. with the library defaults.
tfidf_vectorizor = TfidfVectorizer()

In [None]:
# Fit the vectoriser on the training texts only, then reuse the fitted
# vectoriser for the validation and external test texts.
# Each result is expanded to a dense list of lists (one feature row per text).
train_input = tfidf_vectorizor.fit_transform(train_df['cleaned_text']).toarray().tolist()
valinput = tfidf_vectorizor.transform(valdf['cleaned_text']).toarray().tolist()
test_input = tfidf_vectorizor.transform(test_df['cleaned_text']).toarray().tolist()

In [None]:
# Main-class (IPC-Hauptklasse) labels for the training and validation rows.
train_output = train_df['IPC-Hauptklasse'].tolist()
valoutput =  valdf['IPC-Hauptklasse'].tolist()
# test_output = train_df['IPC-Hauptklasse'].tolist()

***
#### Use Gradient Boosting Classifier to Identify Main Class

In [None]:
# Main-class predictions for the validation rows.
_predictions = run_XGB_model(train_input, train_output, valinput)

In [None]:
# Validation metrics for the main-class predictions computed in the previous
# cell. They are stored in `_predictions`; the formerly referenced name
# `knn_predictions` is not assigned anywhere and raised a NameError.
print_statistics(valoutput, _predictions)

In [None]:
# Sub-class model input: each TF-IDF row prefixed with the last digit of a main class.
# Training rows use the true main class; validation rows use the predicted one.
train_sub_input = generate_sub_class_input(train_df['IPC-Hauptklasse'].tolist(), train_input)
valsub_input = generate_sub_class_input(_predictions, valinput)

In [None]:
# Sub-class (IPC-Unterklasse) labels for the training and validation rows.
train_sub_output = train_df['IPC-Unterklasse'].tolist()
valsub_output = valdf['IPC-Unterklasse'].tolist()
# test_sub_output = test_df['IPC-Unterklasse'].tolist()

***
#### Identify Sub Class

In [None]:
# Sub-class predictions for the validation rows, based on the predicted main classes.
_subclass_predictions = run_XGB_model(train_sub_input, train_sub_output, valsub_input)

In [None]:
# Validation metrics for the sub-class predictions, followed by every
# (sub class, main class) pair whose predicted sub class does not contain the
# predicted main class. The results of the preceding cells live in
# `_predictions` / `_subclass_predictions`; the formerly referenced `knn_*`
# names are not assigned anywhere and raised a NameError.
print_statistics(valsub_output, _subclass_predictions)
find_mis_match_classes(_predictions, _subclass_predictions)

***
#### Run on test dataset

In [None]:
# Stage 1: main-class predictions for the external test set.
test_main_predictions = run_XGB_model(train_input, train_output, test_input)
# Stage 2: prefix the test features with the predicted main class and predict
# the sub class with a model trained on the sub-class inputs and labels.
# (Previously the second call repeated the main-class training data and
# inputs, so it produced main-class predictions again and never used
# test_sub_input.)
test_sub_input = generate_sub_class_input(test_main_predictions, test_input)
test_subclass_predictions = run_XGB_model(train_sub_input, train_sub_output, test_sub_input)

***


1 SVM



In [None]:
from sklearn import svm

def run_model_svm(train_input, train_output, valinput):
    """Fit a support vector classifier and predict labels for `valinput`.

    The classifier uses a polynomial kernel of degree 4 with C=1.0.
    Returns the predictions of the fitted model.
    """
    classifier = svm.SVC(kernel='poly', degree=4, C=1.0)
    fitted = classifier.fit(train_input, train_output)
    return fitted.predict(valinput)

In [None]:
#Validation data
# Stage 1: main-class predictions for the validation rows and their metrics.
_predictions = run_model_svm(train_input, train_output, valinput)
print_statistics(valoutput, _predictions)
# train_sub_input / train_sub_output are reused from the earlier cells.
#train_sub_input = generate_sub_class_input(train_df['IPC-Hauptklasse'].tolist(), train_input)
# Stage 2: sub-class predictions based on the predicted main classes,
# their metrics, and the sub/main pairs that do not match.
valsub_input = generate_sub_class_input(_predictions, valinput)
_subclass_predictions = run_model_svm(train_sub_input, train_sub_output, valsub_input)
print_statistics(valsub_output, _subclass_predictions)
find_mis_match_classes(_predictions, _subclass_predictions)

In [None]:
#Test data
# Same two-stage prediction on the external test set; no metrics are printed here.
# Note: this overwrites the validation results held in _predictions / _subclass_predictions.
_predictions = run_model_svm(train_input, train_output, test_input)
test_sub_input = generate_sub_class_input(_predictions, test_input)
_subclass_predictions = run_model_svm(train_sub_input, train_sub_output, test_sub_input)

***


2 Multinomial Naive Bayes



In [None]:
from sklearn.naive_bayes import MultinomialNB

def run_MNB_model(train_input, train_output, valinput):
    """Fit a multinomial naive Bayes classifier and predict labels for `valinput`.

    The classifier is created with its default settings.
    Returns the predictions of the fitted model.
    """
    naive_bayes = MultinomialNB()
    naive_bayes.fit(train_input, train_output)
    return naive_bayes.predict(valinput)

In [None]:
# Stage 1: main-class predictions for the validation rows and their metrics.
_predictions = run_MNB_model(train_input, train_output, valinput)
print_statistics(valoutput, _predictions)
# train_sub_input / train_sub_output are reused from the earlier cells.
#train_sub_input = generate_sub_class_input(train_df['IPC-Hauptklasse'].tolist(), train_input)
# Stage 2: sub-class predictions based on the predicted main classes,
# their metrics, and the sub/main pairs that do not match.
valsub_input = generate_sub_class_input(_predictions, valinput)
_subclass_predictions = run_MNB_model(train_sub_input, train_sub_output, valsub_input)
print_statistics(valsub_output, _subclass_predictions)
find_mis_match_classes(_predictions, _subclass_predictions)


In [None]:
#Test data
# Same two-stage prediction on the external test set; no metrics are printed here.
# Note: this overwrites the validation results held in _predictions / _subclass_predictions.
_predictions = run_MNB_model(train_input, train_output, test_input)
test_sub_input = generate_sub_class_input(_predictions, test_input)
_subclass_predictions = run_MNB_model(train_sub_input, train_sub_output, test_sub_input)

***


3 Random Forest Classifier



In [None]:
from sklearn.ensemble import RandomForestClassifier 

def run_RF_model(train_input, train_output, valinput, random_state=None):
    """Fit a random forest classifier and predict labels for `valinput`.

    Parameters
    ----------
    train_input : feature rows used for fitting.
    train_output : labels matching `train_input`.
    valinput : feature rows to predict.
    random_state : optional seed forwarded to RandomForestClassifier.
        The default (None) keeps the previous, unseeded behaviour; pass a
        fixed integer to make repeated runs return the same predictions.

    Returns
    -------
    The predictions returned by the fitted classifier's `predict`.
    """
    _classifier = RandomForestClassifier(random_state=random_state)
    _classifier.fit(train_input, train_output)
    return _classifier.predict(valinput)

In [None]:
# Stage 1: main-class predictions for the validation rows and their metrics.
_predictions = run_RF_model(train_input, train_output, valinput)
print_statistics(valoutput, _predictions)
# train_sub_input / train_sub_output are reused from the earlier cells.
#train_sub_input = generate_sub_class_input(train_df['IPC-Hauptklasse'].tolist(), train_input)
# Stage 2: sub-class predictions based on the predicted main classes,
# their metrics, and the sub/main pairs that do not match.
valsub_input = generate_sub_class_input(_predictions, valinput)
_subclass_predictions = run_RF_model(train_sub_input, train_sub_output, valsub_input)
print_statistics(valsub_output, _subclass_predictions)
find_mis_match_classes(_predictions, _subclass_predictions)


In [None]:
#Test data
# Stage 1: main-class predictions for the external test set.
rf_predictions = run_RF_model(train_input, train_output, test_input)
# Stage 2: the sub-class features must be built from the test-set predictions
# (rf_predictions). `_predictions` holds predictions for the validation rows
# and does not line up with test_input.
test_sub_input = generate_sub_class_input(rf_predictions, test_input)
rf_subclass_predictions = run_RF_model(train_sub_input, train_sub_output, test_sub_input)

***


4 Gradient Boost Classifier



In [None]:
# from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier, AdaBoostClassifier

def run_XGB_model(train_input, train_output, valinput):
    """Fit a gradient boosting classifier and predict labels for `valinput`.

    Despite the name, the active estimator is HistGradientBoostingClassifier
    with default settings. This definition replaces the run_XGB_model defined
    earlier in the notebook.
    """
    # Disabled alternatives kept for reference:
    #   XGBClassifier(max_depth=5, objective='multi:softmax', n_estimators=1000, num_classes=4)
    #   GradientBoostingClassifier()
    #   AdaBoostClassifier()
    booster = HistGradientBoostingClassifier()
    booster.fit(train_input, train_output)
    return booster.predict(valinput)

In [None]:
# Stage 1: main-class predictions for the validation rows and their metrics.
_predictions = run_XGB_model(train_input, train_output, valinput)
print_statistics(valoutput, _predictions)
# train_sub_input / train_sub_output are reused from the earlier cells.
#train_sub_input = generate_sub_class_input(train_df['IPC-Hauptklasse'].tolist(), train_input)
# Stage 2: sub-class predictions based on the predicted main classes,
# their metrics, and the sub/main pairs that do not match.
valsub_input = generate_sub_class_input(_predictions, valinput)
_subclass_predictions = run_XGB_model(train_sub_input, train_sub_output, valsub_input)
print_statistics(valsub_output, _subclass_predictions)
find_mis_match_classes(_predictions, _subclass_predictions)

In [None]:
#Test data
# Same two-stage prediction on the external test set; no metrics are printed here.
# Note: this overwrites the validation results held in _predictions / _subclass_predictions.
_predictions = run_XGB_model(train_input, train_output, test_input)
test_sub_input = generate_sub_class_input(_predictions, test_input)
_subclass_predictions = run_XGB_model(train_sub_input, train_sub_output, test_sub_input)

***


5 Combination of two different classifiers for Main and Sub Class Prediction



In [None]:
# rf mnb
# Main class: random forest. Sub class: multinomial naive Bayes.
_predictions = run_RF_model(train_input, train_output, valinput)
print_statistics(valoutput, _predictions)
# train_sub_input / train_sub_output are reused from the earlier cells.
#train_sub_input = generate_sub_class_input(train_df['IPC-Hauptklasse'].tolist(), train_input)
valsub_input = generate_sub_class_input(_predictions, valinput)

# Sub-class predictions, their metrics, and the sub/main pairs that do not match.
_subclass_predictions = run_MNB_model(train_sub_input, train_sub_output, valsub_input)
print_statistics(valsub_output, _subclass_predictions)
find_mis_match_classes(_predictions, _subclass_predictions)

In [None]:
#Test data
# Random forest (main class) followed by multinomial naive Bayes (sub class)
# on the external test set; no metrics are printed here.
_predictions = run_RF_model(train_input, train_output, test_input)
test_sub_input = generate_sub_class_input(_predictions, test_input)
_subclass_predictions = run_MNB_model(train_sub_input, train_sub_output, test_sub_input)

In [None]:
# knn rf
def run_knn_clf(train_input, train_output, valinput):
    """Fit a k-nearest-neighbours classifier and predict labels for `valinput`.

    NOTE(review): the notebook calls this helper, but no definition of it is
    present in any cell, so the call failed with a NameError. The classifier
    is created with its default hyper-parameters, mirroring the other
    run_*_model helpers — adjust them if a tuned configuration was intended.
    """
    classifier = KNeighborsClassifier()
    classifier.fit(train_input, train_output)
    return classifier.predict(valinput)


# Main class: k-nearest neighbours. Sub class: random forest.
_predictions = run_knn_clf(train_input, train_output, valinput)
print_statistics(valoutput, _predictions)
# train_sub_input / train_sub_output are reused from the earlier cells.
#train_sub_input = generate_sub_class_input(train_df['IPC-Hauptklasse'].tolist(), train_input)
valsub_input = generate_sub_class_input(_predictions, valinput)

# Sub-class predictions, their metrics, and the sub/main pairs that do not match.
_subclass_predictions = run_RF_model(train_sub_input, train_sub_output, valsub_input)
print_statistics(valsub_output, _subclass_predictions)
find_mis_match_classes(_predictions, _subclass_predictions)

In [None]:
# knn mnb
# Main class: run_knn_clf. Sub class: multinomial naive Bayes.
# Requires run_knn_clf to have been defined by an earlier cell.
_predictions = run_knn_clf(train_input, train_output, valinput)
print_statistics(valoutput, _predictions)
# train_sub_input / train_sub_output are reused from the earlier cells.
#train_sub_input = generate_sub_class_input(train_df['IPC-Hauptklasse'].tolist(), train_input)
valsub_input = generate_sub_class_input(_predictions, valinput)

# Sub-class predictions, their metrics, and the sub/main pairs that do not match.
_subclass_predictions = run_MNB_model(train_sub_input, train_sub_output, valsub_input)
print_statistics(valsub_output, _subclass_predictions)
find_mis_match_classes(_predictions, _subclass_predictions)

In [None]:
# rf knn
# Main class: random forest. Sub class: run_knn_clf.
# Requires run_knn_clf to have been defined by an earlier cell.
_predictions = run_RF_model(train_input, train_output, valinput)
print_statistics(valoutput, _predictions)
# train_sub_input / train_sub_output are reused from the earlier cells.
#train_sub_input = generate_sub_class_input(train_df['IPC-Hauptklasse'].tolist(), train_input)
valsub_input = generate_sub_class_input(_predictions, valinput)

# Sub-class predictions, their metrics, and the sub/main pairs that do not match.
_subclass_predictions = run_knn_clf(train_sub_input, train_sub_output, valsub_input)
print_statistics(valsub_output, _subclass_predictions)
find_mis_match_classes(_predictions, _subclass_predictions)

***


Data Visualization



In [None]:
# Actual class frequencies in the validation frame as {class label: count} dicts.
valmain_act = dict(valdf['IPC-Hauptklasse'].value_counts())
valsub_act = dict(valdf['IPC-Unterklasse'].value_counts())

In [None]:
# Count how often each main class was predicted: {class label: count}.
# NOTE(review): `knn_predictions` is not assigned in any cell of this
# notebook — confirm which predictions are meant to be counted here.
main_pred = {}
for x in knn_predictions:
    if x in main_pred:
        main_pred[x] += 1
    else:
        main_pred[x] = 1
        


In [None]:
# Count how often each sub class was predicted: {class label: count}.
# NOTE(review): `knn_subclass_predictions` is not assigned in any cell of this
# notebook — confirm which predictions are meant to be counted here.
sub_pred = {}
for x in knn_subclass_predictions:
    if x in sub_pred:
        sub_pred[x] += 1
    else:
        sub_pred[x] = 1
        


In [None]:
# Count dictionaries in plotting order:
# actual main classes, predicted main classes, actual sub classes, predicted sub classes.
list_of_dict = [valmain_act, main_pred, valsub_act, sub_pred]



In [None]:
# Draw one pie chart per count dictionary: slice sizes are the counts,
# slice labels are the class names, percentages are shown with one decimal.
for l in list_of_dict:
    # A new figure per dictionary, so every distribution gets its own chart.
    fig1, ax1 = plt.subplots()
    ax1.pie(list(l.values()), labels=list(l.keys()), autopct='%1.1f%%',
        shadow=True, startangle=90)
    ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
    plt.show()

In [None]:
# grid_params = { 'n_neighbors' : [5,7,9,11,13,15,17],
#                'weights' : ['uniform','distance'],
#                'metric' : ['minkowski','euclidean','manhattan']}

# gs = GridSearchCV(KNeighborsClassifier(), grid_params, verbose = 1, cv=3, n_jobs = -1)

# g_res = gs.fit(train_input, train_output)
# g_res.best_score_
# g_res.best_params_

In [None]:
# grid_params = { 'n_neighbors' : [5,7,9,11,13,15,17],
#                'weights' : ['uniform','distance'],
#                'metric' : ['minkowski','euclidean','manhattan']}

# gs = GridSearchCV(KNeighborsClassifier(), grid_params, verbose = 1, cv=3, n_jobs = -1)

# g_res = gs.fit(train_input, train_output)
# g_res.best_score_
# g_res.best_params_