venv (base)

# Imports

In [None]:
import pandas as pd
import ast
from sklearn_crfsuite import CRF
import eli5
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report

# Helper functions

In [2]:
def get_words(df):
    """Return the unique tokens of ``df['claim_segments']`` plus a padding marker.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``claim_segments`` column whose cells are string
        representations of Python lists of tokens (parsed with
        ``ast.literal_eval``).

    Returns
    -------
    list
        Every distinct token exactly once, in order of first appearance,
        followed by the literal ``"ENDPAD"``.
    """
    # Iterate over the column values directly: indexing with
    # df['claim_segments'][i] is a label lookup and raises KeyError as soon as
    # the frame has a non-default index (e.g. after filtering or shuffling).
    # dict.fromkeys de-duplicates while keeping a deterministic order, unlike
    # set(), whose string ordering changes between interpreter runs.
    unique_tokens = dict.fromkeys(
        token
        for cell in df['claim_segments']
        for token in ast.literal_eval(cell)
    )
    words = list(unique_tokens)
    words.append("ENDPAD")
    return words

def FindMaxLength(lst):
    """Return the length of the longest element of ``lst``.

    Parameters
    ----------
    lst : iterable of sized objects
        For example a list of sentences, each itself a list.

    Returns
    -------
    int
        ``len`` of the longest element, or 0 when ``lst`` is empty
        (a bare ``max`` would raise ``ValueError`` in that case).
    """
    return max(map(len, lst), default=0)

In [3]:
# Load the dataset from a CSV located in the current working directory.
df = pd.read_csv('./uspto_df_final.csv')
# Vocabulary: unique tokens of the 'claim_segments' column plus "ENDPAD" (see get_words).
words = get_words(df)

# Data preparation:
* Prepare two lists: one list of claim sentences and another list which contains, for each sentence, a list of binary values stating whether each word in the sentence ends a segment in the claim.

In [4]:
# Parse the stringified token lists and their matching binary label lists.
sentences = [ast.literal_eval(raw) for raw in df['claim_segments'].to_list()]
labels = [ast.literal_eval(raw) for raw in df['claim_segments_binary'].to_list()]
# Turn every sentence into a list of (token, label) pairs; zip stops at the
# shorter of the two lists for a given row.
sentences = [list(zip(tokens, flags)) for tokens, flags in zip(sentences, labels)]

An example of the input:

In [5]:
# Show one prepared sample (position 1000) as a list of (token, label) pairs.
print(sentences[1000])

[('A', 0), ('multi-modality', 0), ('medical', 0), ('imaging', 0), ('system', 0), ('comprising', 1), (':,a', 0), ('first', 0), ('module', 0), ('having', 1), ('a', 0), ('first', 0), ('catcher', 0), ('detector,', 1), ('a', 0), ('position', 0), ('for', 1), ('a', 0), ('first', 0), ('scatter', 0), ('detector', 0), ('spaced', 0), ('from', 0), ('the', 0), ('catcher', 0), ('detector,', 0), ('and', 1), ('a', 0), ('position', 0), ('for', 1), ('a', 0), ('first', 0), ('physical', 0), ('aperture', 0), ('between', 1), ('a', 0), ('patient', 0), ('space', 0), ('and', 0), ('the', 0), ('first', 0), ('catcher', 0), ('detector;', 1), ('and,an', 0), ('image', 0), ('processor', 0), ('configured', 0), ('to', 0), ('determine', 1), ('angles', 0), ('of', 0), ('incidence', 0), ('for', 0), ('Compton', 0), ('events', 0), ('where', 0), ('the', 0), ('first', 0), ('scatter', 0), ('detector', 0), ('is', 0), ('included', 0), ('in', 0), ('the', 0), ('first', 0), ('module', 0), ('and', 0), ('to', 0), ('count', 0), ('photo

In [6]:
# Label set as strings, matching the str() conversion done in sent2labels.
tags = ['0','1']

In [7]:
# Basic dataset statistics.
max_len = FindMaxLength(sentences)  # number of tokens in the longest sentence
n_words = len(words)                # vocabulary size, including "ENDPAD"
n_tags = len(tags)                  # number of distinct labels

* Add features to each word to improve the model

In [9]:
def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Parameters
    ----------
    sent : sequence
        Sentence as a sequence of items whose first element is the token
        string (e.g. ``(token, label)`` pairs). Only the token is read, so
        labels never leak into the features.
    i : int
        Position of the token to describe.

    Returns
    -------
    dict
        Features of the token itself (lower-cased form, 2- and 3-character
        suffixes, casing/digit flags, constant bias) plus lower-cased form and
        casing flags of the previous and next tokens. ``'BOS'`` / ``'EOS'``
        flags replace the neighbour features at the sentence boundaries.
    """
    word = sent[i][0]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }

    # Left context, or a beginning-of-sentence marker for the first token.
    if i > 0:
        prev_word = sent[i-1][0]
        features.update({
            '-1:word.lower()': prev_word.lower(),
            '-1:word.istitle()': prev_word.istitle(),
            '-1:word.isupper()': prev_word.isupper(),
        })
    else:
        features['BOS'] = True

    # Right context, or an end-of-sentence marker for the last token.
    # The next item's label is deliberately not read here: it was previously
    # fetched into an unused local, which broke on token-only input.
    if i < len(sent)-1:
        next_word = sent[i+1][0]
        features.update({
            '+1:word.lower()': next_word.lower(),
            '+1:word.istitle()': next_word.istitle(),
            '+1:word.isupper()': next_word.isupper(),
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    """Return one feature dict per token of ``sent``, in sentence order."""
    features = []
    for position in range(len(sent)):
        features.append(word2features(sent, position))
    return features

def sent2labels(sent):
    """Return the labels of a sentence of (token, label) pairs as strings."""
    tags_as_text = []
    for _, tag in sent:
        tags_as_text.append(str(tag))
    return tags_as_text

def sent2tokens(sent):
    """Return only the tokens of a sentence of (token, label) pairs."""
    tokens = []
    for word, _ in sent:
        tokens.append(word)
    return tokens

In [10]:
# Per-sentence feature dicts (X) and string labels (y) for the CRF.
X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))

In [11]:
# Baseline CRF trained with L-BFGS.
# NOTE(review): c1 / c2 are taken to be the L1 / L2 regularisation
# coefficients as described in the sklearn-crfsuite docs - confirm.
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)


In [None]:
# Cross-validated predictions over the whole dataset using 5 folds (cv=5).
pred = cross_val_predict(crf, X=X, y=y, cv=5)

In [13]:
# Token-level classification report comparing the cross-validated predictions with the gold labels.
report = flat_classification_report(y_pred=pred, y_true=y)
print(report)



              precision    recall  f1-score   support

           0       0.97      0.98      0.97  46061881
           1       0.84      0.74      0.78   6158955

    accuracy                           0.95  52220836
   macro avg       0.90      0.86      0.88  52220836
weighted avg       0.95      0.95      0.95  52220836



In [14]:
# Fit on the full dataset so the learned weights can be inspected below.
# NOTE(review): scikit-learn-style fit() conventionally returns the estimator
# itself, so `history` is presumably just `crf`, not a training log - confirm.
history = crf.fit(X, y)

- The learned transition weights from one tag to another are shown below. We can also see which features matter most for predicting each tag. We use the eli5 library for this inspection.

In [15]:
# Render the fitted model's transition weights and its top-weighted features (top=30).
eli5.show_weights(crf, top=30)

From \ To,0,1
0,0.177,-0.421
1,-0.693,0.209

Weight?,Feature
Weight?,Feature
+0.996,word[-2:]:ed
+0.962,+1:word.lower():of
+0.722,+1:word.lower():to
+0.665,bias
+0.633,word[-3:]:ion
+0.606,+1:word.lower():and
+0.583,word[-2:]:er
+0.582,word[-2:]:st
… 392793 more positive …,… 392793 more positive …
… 79881 more negative …,… 79881 more negative …

Weight?,Feature
+0.996,word[-2:]:ed
+0.962,+1:word.lower():of
+0.722,+1:word.lower():to
+0.665,bias
+0.633,word[-3:]:ion
+0.606,+1:word.lower():and
+0.583,word[-2:]:er
+0.582,word[-2:]:st
… 392793 more positive …,… 392793 more positive …
… 79881 more negative …,… 79881 more negative …

Weight?,Feature
+1.990,+1:word.lower():an
+1.917,+1:word.lower():a
+1.262,word.lower():comprising
+1.063,word[-2:]:e;
+0.810,EOS
+0.806,+1:word.lower():e
+0.777,-1:word.lower():e
+0.737,word[-2:]:s;
+0.731,word.lower():e
+0.727,word[-2:]:e


# Improve the model:
- We change the regularization parameters (here `c1` is raised from 0.1 to 10) and re-evaluate the CRF model.

In [16]:
# Same configuration as the baseline CRF, except that c1 is raised from 0.1 to 10.
crf = CRF(
    algorithm='lbfgs',
    c1=10,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=False,
)

In [None]:
# Cross-validated predictions (cv=5) for the re-configured CRF; overwrites the earlier `pred`.
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)


In [None]:
# Classification report for the re-configured model, to compare with the baseline report.
report = flat_classification_report(y_pred=pred, y_true=y)
print(report)


In [73]:
# Fit the re-configured CRF on the full dataset before inspecting its weights.
# NOTE(review): fit() presumably returns the estimator itself, so `history` is not a training log - confirm.
history = crf.fit(X, y)


In [None]:
# Render the re-configured model's transition weights and its top-weighted features (top=30).
eli5.show_weights(crf, top=30)
