# Importings

In [1]:
import pandas as pd
import numpy as np
import ast
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [2]:
def get_words(df):
    words = []
    for i in range(len(df)):
        words.append(ast.literal_eval(df['claim_segments'][i]))
    words = [item for sublist in words for item in sublist]
    words = [*set(words)]
    # add ENDPAD so that all sentences will have the same length
    words.append("ENDPAD")
    return words
def FindMaxLength(lst):
    maxList = max(lst, key = lambda i: len(i))
    maxLength = len(maxList)
     
    return maxLength

In [23]:
df = pd.read_csv('./uspto_df_final.csv')
words = get_words(df)
sentences = df['claim_segments'].to_list()
sentences = [ast.literal_eval(x) for x in sentences]
tags = df['claim_segments_binary'].to_list()
tags = [ast.literal_eval(x) for x in tags]

## Prepare the data:
- this model gets as an input a list of all the words that are in the claim sentences. Same for the tags

In [24]:
sentences = [sent for sentence in sentences for sent in sentence ]
tags = [t for tag in tags for t in tag ]

In [25]:
tags = tags[:len(sentences)]

- Here we create a MemoryTagger class that takes as an input a list of words and a list of tags and tries to memorize for each word its tag.
- To evaluate the model with cross validation we need to write two methods in the class which are: fit and predict.

In [26]:
class MemoryTagger(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y):
        voc = {}
        self.tags = []
        for x, t in zip(X, y):
            if t not in self.tags:
                self.tags.append(t)
            if x in voc:
                if t in voc[x]:
                    voc[x][t] += 1
                else:
                    voc[x][t] = 1
            else:
                voc[x] = {t: 1}
        self.memory = {}
        for k, d in voc.items():
            self.memory[k] = max(d, key=d.get)
    
    def predict(self, X, y=None):
        return [self.memory.get(x, 0) for x in X]

In [41]:
tagger = MemoryTagger()

In [28]:
tagger.fit(sentences, tags)

In [29]:
tagger.tags

[0, 1]

Cross validation

In [42]:
pred = cross_val_predict(estimator=MemoryTagger(), X=sentences, y=tags, cv=3)

In [43]:
report = classification_report(y_pred=pred, y_true=tags)

In [44]:
print(report)

              precision    recall  f1-score   support

           0       0.87      1.00      0.93  15609842
           1       0.13      0.00      0.00   2232757

    accuracy                           0.87  17842599
   macro avg       0.50      0.50      0.47  17842599
weighted avg       0.78      0.87      0.82  17842599



In [45]:
def feature_map(word):
    #this function creates feature to words for further context
    return np.array([word.istitle(), word.islower(), word.isupper(), len(word), word.isdigit(), word.isalpha()])


In [46]:
words = [feature_map(w) for w in sentences]


In [47]:
classifier = RandomForestClassifier(n_estimators=10)

In [48]:
pred = cross_val_predict(classifier, X=words, y=tags, cv=3)

In [49]:
report = classification_report(y_pred=pred, y_true=tags)
print(report)


              precision    recall  f1-score   support

           0       0.87      1.00      0.93  15609842
           1       0.07      0.00      0.00   2232757

    accuracy                           0.87  17842599
   macro avg       0.47      0.50      0.47  17842599
weighted avg       0.77      0.87      0.82  17842599



- There is a lack a lot of information necessary for the decision about the features. So now we enhance our simple features by both memory and context information.

In [50]:
class FeatureTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        self.memory_tagger = MemoryTagger()
        self.tag_encoder = LabelEncoder()
        
    def fit(self, X, y):
        words = X
        tags = y
        self.memory_tagger.fit(words, tags)
        self.tag_encoder.fit(tags)
        return self
    
    def transform(self, X, y=None):
        def pos_default(p):
            if p in self.pos:
                return self.pos_encoder.transform([p])[0]
            else:
                return -1
        
        words = X
        out = []
        for i in range(len(words)):
            w = words[i]
            if i < len(words) - 1:
                wp = self.tag_encoder.transform(self.memory_tagger.predict([words[i+1]]))[0]
            else:
                wp = self.tag_encoder.transform([0])[0]
            if i > 0:
                if words[i-1] != ".":
                    wm = self.tag_encoder.transform(self.memory_tagger.predict([words[i-1]]))[0]
                else:
                    wm = self.tag_encoder.transform([0])[0]
            else:
                wm = self.tag_encoder.transform([0])[0]
            out.append(np.array([w.istitle(), w.islower(), w.isupper(), len(w), w.isdigit(), w.isalpha(),
                                 self.tag_encoder.transform(self.memory_tagger.predict([w]))[0], wp, wm]))
        return out


In [51]:
pred = cross_val_predict(Pipeline([("feature_map", FeatureTransformer()), 
                                   ("clf", RandomForestClassifier(n_estimators=10, n_jobs=3))]),
                         X=sentences, y=tags, cv=5)


In [52]:
report = classification_report(y_pred=pred, y_true=tags)
print(report)

              precision    recall  f1-score   support

           0       0.87      1.00      0.93  15609842
           1       0.14      0.00      0.00   2232757

    accuracy                           0.87  17842599
   macro avg       0.51      0.50      0.47  17842599
weighted avg       0.78      0.87      0.82  17842599

