virtual env: tmp

# Requirements: 

* !pip install keras==2.4
* pip install -U scikit-learn scipy matplotlib
* pip install pandas==1.3.4
* To use GRU and LSTM for the criteria of using cudnn kernel: https://www.tensorflow.org/api_docs/python/tf/keras/layers/GRU#used-in-the-notebooks_1
* For LSTM recurrent_dropout = 0

For performance issues, we are going to use tensorflow-gpu library for this model

In [None]:
# check for physical devices
import tensorflow as tf
from tensorflow.python.client import device_lib
tf.config.list_physical_devices('GPU')
device_lib.list_local_devices()

# Description of the model:
* Our model is a NN_Bi-LSTM
## - Input: 
X = an array of sentences where a sentence is a list of claim segments. To prepare data for the model we add "PAD" values to make all of them the same length then we convert sentences to numerical values using  word2idx, then we use pad_sequence method to get a 2D array shape
## - Output:
y = an array of tags where a tag is a list containing binary values for each element in the sequence

# Preparing the imports

In [2]:
import ast
import pandas as pd
from keras_preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input
from keras.models import Model
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
import keras
import numpy as np
import tensorflow as tf
from sklearn.metrics import recall_score,classification_report,f1_score, accuracy_score, average_precision_score
from matplotlib import pyplot as plt

Using TensorFlow backend.


In [3]:
def get_words(df):
    words = []
    for i in range(len(df)):
        words.append(ast.literal_eval(df['claim_segments'][i]))
    words = [item for sublist in words for item in sublist]
    words = [*set(words)]
    # add ENDPAD so that all sentences will have the same length
    words.append("ENDPAD")
    return words

def FindMaxLength(lst):
    maxList = max(lst, key = lambda i: len(i))
    maxLength = len(maxList)
     
    return maxLength

## Reading the dataset: This data is based on the rule based method
we are interested in 2 columns in the data:\
*Claim segments: each value is a list contains all the words from first claim split by ' '\
*Claim segments binary: each value has the same length as claim segments list and conatins binary values stating if each eleent in the list ends a segment or not

In [4]:
df = pd.read_csv('./uspto_df_final.csv')
words = get_words(df)
sentences = df['claim_segments'].to_list()
sentences = [ast.literal_eval(x) for x in sentences]
tags = df['claim_segments_binary'].to_list()
tags = [ast.literal_eval(x) for x in tags]
for i in range(len(sentences)):
    sentences[i] = list(zip(sentences[i], tags[i]))

In [9]:
import statistics
statistics.mean([len(s) for s in sentences])

180.77500069234816

* For the use of neural nets we need to use equal-lenght input sequences. So we are going to pad our sentences to a length of 200. But first we need dictionaries of words and tags.

In [30]:
tags = [0,1]
max_len = 200
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}
n_words = len(words)
n_tags = len(tags)

# Prepare Input and Output data
* convert each word in sentences to a numerical value
* Pad sentences and tags: 
* convert tags to categorical

In [31]:
X = [[word2idx[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=n_words - 1)
y = [[tag2idx[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx[0])
y = [to_categorical(i, num_classes=n_tags) for i in y]

# Split data to train and test data

In [32]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3)


# Preparing the model Architecture
* First layer: takes an input of size max len where max len is the mean length of sentences.
* Second layers: The data is embedded with a 50-dimension embedding
* Third layer:  For an effective technique for regularization and preventing the co-adaptation of neurons
* Fouth layer: Our BiLSTM model with a number of units set to 100 as the preliminary value
* Fifth layer:the output of LSTM is not softmax, so the dimensionality of this output is equal to the number of units, which is not the dimensionality of your desired target, itâ€™s common to add this layer as a softmax output layer

In [33]:
input = Input(shape=(max_len,))
model = Embedding(input_dim=n_words, output_dim=50, input_length=max_len)(input)  # 50-dim embedding
model = Dropout(0.1)(model)
model = Bidirectional(LSTM(units=20, return_sequences=True, recurrent_dropout=0))(model)  # variational biLSTM units=100
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)  # softmax output layer


In [34]:
model = Model(input, out)


# Specifications:
* Optimizer: Adam, rmsprop
* loss: categorical cross entropy
* metrics : accuracy, Precision, Recall

In [37]:
# parameters:
epochs = 1
batch_size = 64
optimizer = "Adam"
loss="categorical_crossentropy"

In [38]:
model.compile(optimizer = optimizer, loss = loss, metrics=["accuracy"])


# Parameters will change depending on the experiments:
* batch size = [16,8]
* epochs = [5,4,3,2,1]
* validation split = 0.1

In [39]:
history = model.fit(X_tr, np.array(y_tr), batch_size=batch_size, epochs=epochs, validation_split=0.1, verbose=1)



# Plotting the model metrics

In [40]:
hist = pd.DataFrame(history.history)


In [41]:
hist

Unnamed: 0,loss,accuracy,val_loss,val_accuracy
0,0.01317,0.996118,0.00512,0.998156


In [None]:
plt.figure(figsize=(10,5))
plt.plot(hist["accuracy"],label = 'acc')
plt.plot(hist["val_accuracy"], label = 'val_acc')
plt.legend()
plt.show()


* Test example

In [43]:
i = 600
p = model.predict(np.array([X_te[i]]))
p = np.argmax(p, axis=-1)
print("{:15} ({:5}): {}".format("Word", "True", "Pred"))
for w, pred in zip(X_te[i], p[0]):
    print("{:15}: {}".format(words[w], tags[pred]))


Word            (True ): Pred
An             : 0
image          : 0
projection     : 0
apparatus      : 0
comprising     : 1
:,one          : 0
or             : 0
more           : 0
sensors        : 0
configured     : 0
to             : 0
acquire        : 0
information    : 0
relating       : 0
to             : 1
a              : 0
vehicle;       : 1
a              : 0
projector      : 0
configured     : 0
to             : 0
project        : 1
an             : 0
image          : 0
on             : 1
a              : 0
road           : 0
surface,[sub]  : 0
a              : 0
processor      : 0
communicatively: 0
coupled        : 0
to             : 1
a              : 0
navigation     : 0
device,        : 0
the            : 0
one            : 0
or             : 0
more           : 0
sensors        : 0
and            : 0
the            : 0
projector,wherein: 0
the            : 0
projector      : 0
is             : 0
further        : 0
configured     : 0
to             : 0
project        : 0

# Model evaluation on the test set

* Prepare test values for prediction: convert to numpy arrays

In [44]:
pred = model.predict(np.array(X_te))

In [45]:
pred

array([[[9.9962914e-01, 3.7088938e-04],
        [9.9973625e-01, 2.6379031e-04],
        [9.9470562e-01, 5.2944366e-03],
        ...,
        [9.9998868e-01, 1.1304548e-05],
        [9.9997115e-01, 2.8887353e-05],
        [9.9992657e-01, 7.3446783e-05]],

       [[9.9891126e-01, 1.0887396e-03],
        [9.9031454e-01, 9.6853981e-03],
        [4.8654133e-01, 5.1345873e-01],
        ...,
        [9.9998868e-01, 1.1304548e-05],
        [9.9997115e-01, 2.8887353e-05],
        [9.9992657e-01, 7.3446783e-05]],

       [[9.9811268e-01, 1.8873377e-03],
        [9.9898201e-01, 1.0179820e-03],
        [5.8734089e-01, 4.1265908e-01],
        ...,
        [9.9998868e-01, 1.1304548e-05],
        [9.9997115e-01, 2.8887353e-05],
        [9.9992657e-01, 7.3446783e-05]],

       ...,

       [[9.9936765e-01, 6.3238665e-04],
        [9.9610859e-01, 3.8913980e-03],
        [9.8386174e-01, 1.6138269e-02],
        ...,
        [9.9998868e-01, 1.1304548e-05],
        [9.9997115e-01, 2.8887353e-05],
        [

In [46]:
pred_ = np.argmax(pred, axis=-1)
t = np.argmax(y_te, axis=-1)

In [47]:
from sklearn.metrics import precision_score

* Getting the evaluation metrics

In [48]:
print('epochs =',epochs,'|', 'batch_size=', batch_size, '|', 'optimizer: ',optimizer,'|', 'loss:', loss, '\n')
print('recall_score', recall_score(t, pred_, average='macro', labels=np.unique(pred_)))
print('f1_score',f1_score(t, pred_, average='macro', labels=np.unique(pred_)))
print('accuracy_score',accuracy_score(t, pred_))
print('precision_score',precision_score(t, pred_, average='macro'))

epochs = 1 | batch_size= 64 | optimizer:  Adam | loss: categorical_crossentropy 

recall_score 0.743415894966704
f1_score 0.8065033400283892
accuracy_score 0.19132953312870693
precision_score 0.9620631682824221


In [49]:
report = classification_report(t, pred_, labels = np.unique(pred_))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Print all the evaluation metrics

In [31]:
print(report)

              precision    recall  f1-score   support

           0       0.89      0.89      0.89      7496
           1       0.94      0.94      0.94     15277

   micro avg       0.92      0.92      0.92     22773
   macro avg       0.92      0.91      0.91     22773
weighted avg       0.92      0.92      0.92     22773
 samples avg       0.21      0.21      0.21     22773

