# **Notebook C**: Patent Classification with CNN 
---- 



# C.1. Load Packages 
---

In [None]:
# General Packages #
import os
import pandas as pd
import numpy as np

# Load TQDM to Show Progress Bars #
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix

# Keras Packages #

import tensorflow as tf

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras import layers
from keras.layers import Dense, Dropout, Activation, GlobalMaxPooling1D, LSTM, Bidirectional, BatchNormalization
from keras.layers.convolutional import Conv1D, MaxPooling1D


from textblob import TextBlob

import zipfile

In [None]:
# Fetch the NLTK 'punkt' tokenizer data (reported as already up-to-date when present).
# NOTE(review): presumably needed by the TextBlob tokenization used in string_cleaner — confirm.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Silence every Python warning for the rest of the session so the cell outputs stay readable.
# This also hides deprecation notices from the libraries used here, so remove it when testing your own code #
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Mount your personal Google Drive into the Colab runtime at /content/drive.
# On the first run you have to follow the authorisation link to log in; the data-loading
# cell changes into a project folder underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# C.2. Load Training Data ##
---------------- 

We are going to use the data stored on Google Drive. The data is in a CSV file, so we load it as a Pandas DataFrame and then convert the main fields (patent IDs, the AI / non-AI indicator, and the patent abstract) into lists, which are easier to use in later sections.

In [None]:
# Work relative to the project folder on the mounted Drive #
PROJECT_DIR = "/content/drive/MyDrive/USPTO AI Patent Classification/"
TRAINING_CSV = "./Training Data/4K Patents - AI 20p.csv"
os.chdir(PROJECT_DIR)

# Read the labelled patents #
TrainingData = pd.read_csv(TRAINING_CSV)

# Pull out the three columns used for text classification #
app_numbers, Abstract_Text, Classes = (
    TrainingData[column].values.tolist()
    for column in ('app number', 'abstract', 'actual')
)
IDs = np.array(app_numbers)

In [None]:
# We need to use the Keras based tokenizer. We also need to define a custom text cleaner function which is going 
# to stem the words (this is what we have been doing for all of the models)

# Vocabulary cap handed to the Keras tokenizer as `num_words`.
n_words = 2000

# Define Tokenizer Function #
# Word-level tokenizer (char_level=False) that lower-cases the text and strips the
# punctuation listed in `filters`. Note that this filter list does not contain the
# underscore or the apostrophe, so those characters are left in place.

tokenizer = Tokenizer(num_words=n_words, lower = True,
                        filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n',
                        char_level=False)
                        

# Define String Cleaner Function 
def string_cleaner(str_input):
    """Lower-case, tokenize and stem a piece of text.

    Parameters
    ----------
    str_input : str
        Raw text, e.g. a patent abstract.

    Returns
    -------
    list
        The stemmed word tokens in their original order. An empty list is
        returned when ``str_input`` is not a string (for example the NaN that
        pandas uses for a missing CSV cell), so a single blank abstract does
        not abort the whole preprocessing run with an AttributeError.
    """
    if not isinstance(str_input, str):
        return []

    blob = TextBlob(str_input.lower())
    # To keep the full (non-stemmed) words instead, return `list(blob.words)` here.
    return [token.stem() for token in blob.words]


In [None]:
# Convert the Abstracts to a Keras Format #

# Stem every abstract and glue its tokens back into one space-separated string #
Abstracts_Lemmatized = [" ".join(string_cleaner(text)) for text in Abstract_Text]
Abstracts = Abstracts_Lemmatized

# Learn the word index from the cleaned abstracts; it is reused by the later cells #
tokenizer.fit_on_texts(Abstracts)

# One extra slot because index 0 is reserved and never assigned to a word #
vocab_size = 1 + len(tokenizer.word_index)

Once we have the list of words that occur in our corpus of abstracts (i.e. the word index), we can map those words to embedding vectors. Below we define the functions that go through each word in the word index, extract the corresponding embedding vector, and save it to an embedding matrix. That matrix is then used as a layer in a subsequent convolutional neural network (CNN) model.

In [None]:
# Create Embedding Matrix by Loading Embedding File and Mapping it to Word Index #

def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix for ``word_index`` from a text embedding file.

    Each line of the file is expected to hold a word followed by its vector
    components, separated by whitespace.

    Parameters
    ----------
    filepath : str
        Path to the UTF-8 encoded embedding file.
    word_index : dict
        Maps each word to its integer row index (index 0 is reserved).
    embedding_dim : int
        Number of vector components to keep per word. Longer vectors in the
        file are truncated to their first ``embedding_dim`` components.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, embedding_dim)``. Rows of words
        that have no usable vector in the file stay all-zero.
    """
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding='utf-8') as f:
        for line in f:
            parts = line.split()

            # Skip blank lines and lines too short to hold a full vector, such as
            # the "<count> <dim>" header of .vec files. Without this check a blank
            # line raises on unpacking, and a two-token line would broadcast its
            # single value over the whole row.
            if len(parts) < embedding_dim + 1:
                continue

            word = parts[0]
            if word not in word_index:
                continue

            try:
                vector = np.array(parts[1:embedding_dim + 1], dtype=np.float32)
            except ValueError:
                # Non-numeric components (e.g. a token that itself contains
                # spaces): ignore the line instead of wiping a vector that an
                # earlier, well-formed line may already have stored.
                continue

            embedding_matrix[word_index[word]] = vector
    return embedding_matrix

# Function to Create Empty Embedding Matrix -- For model without embeddings #
def create_empty_matrix(word_index, embedding_dim):
    """Return a placeholder matrix filled with ones.

    The result has one row per entry of ``word_index`` plus one extra row for
    the reserved index 0, and ``embedding_dim`` columns.
    """
    n_rows = 1 + len(word_index)  # extra row for the reserved 0 index
    return np.full((n_rows, embedding_dim), 1.0)

In [None]:
# Embedding configurations to compare #
# Each entry is [display name, path to the embedding file, vector size];
# the path 'NONE' marks the run that loads no embedding file.

CLASSIFIERS = [
    ['No Embeddings', 'NONE', 50],
    ['GLOVE (6B - 50)', './Embeddings/glove.6B.50d.txt', 50],
    ['GLOVE (840B - 300)', './Embeddings/glove.840B.300d.txt', 300],
    ['Doc2Vec USPTO Patent Embeddings',
     './Embeddings/W2V Pat Abstracts 50 - AI Bigrams[Lemma].txt',
     50],
    ['FastText', './Embeddings/FastText.en.300.vec', 300],
]

In [None]:
# Define CNN model Parameters #

maxlen = 200          # every abstract is padded / truncated to this many tokens
batch_size = 50
epochs = 20
NUM_OF_SPLITS = 5     # number of cross-validation folds
RANDOM_SEED = 42      # fixes the fold assignment so every embedding is scored on the same splits


def _build_model(vocab_size, embedding_dim, maxlen, embedding_matrix=None):
    """Build and compile the classifier used for every fold.

    Architecture: Embedding -> Dropout -> Conv1D -> MaxPooling1D -> Dense ->
    LSTM -> Dense(1, sigmoid). When ``embedding_matrix`` is given it is used as
    the initial weights of the (trainable) embedding layer; otherwise the layer
    starts from Keras' default initialisation.

    An alternative head (Conv1D(128, 5) -> GlobalMaxPooling1D -> Dense(50) ->
    Dropout) was previously kept here as commented-out code.
    """
    embedding_kwargs = dict(input_length=maxlen, trainable=True)
    if embedding_matrix is not None:
        embedding_kwargs['weights'] = [embedding_matrix]

    model = Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim, **embedding_kwargs))
    model.add(Dropout(0.2))
    model.add(layers.Conv1D(filters=64, kernel_size=2, activation='relu'))
    model.add(layers.MaxPooling1D(pool_size=4))
    model.add(layers.Dense(512, activation='sigmoid'))
    model.add(layers.LSTM(100))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=[tf.metrics.Recall()])
    return model


# Tokenize and pad every abstract once (using Keras tools, different from sklearn).
# The result does not depend on the fold or on the embedding, so it is computed
# outside both loops and simply indexed per fold.
X_all = pad_sequences(tokenizer.texts_to_sequences(Abstracts),
                      padding='post', maxlen=maxlen)
y_all = np.array(Classes)
ids_all = np.array(IDs)

# Build the stratified folds once so that all embeddings are compared on identical splits #
folds = list(StratifiedKFold(n_splits=NUM_OF_SPLITS,
                             shuffle=True,
                             random_state=RANDOM_SEED).split(X_all, y_all))

# Define arrays in which to store classification outputs #
RESULTS = []
Classified_Values = []

for params in tqdm_notebook(CLASSIFIERS,
                            desc='Loop Through Embeddings',
                            leave=True):
    name, path, embedding_dim = params

    # Load Embedding Matrix (the 'NONE' run trains its embeddings from scratch) #
    if path != "NONE":
        embedding_matrix = create_embedding_matrix(path, tokenizer.word_index, embedding_dim)
    else:
        embedding_matrix = None

    # Define Lists to Store the Out-Of-Fold Data #
    y_actual = []
    y_predicted = []
    y_scores = []
    id_s = []

    for train, test in tqdm_notebook(folds,
                                     desc='Cross-Validating',
                                     leave=False,
                                     total=NUM_OF_SPLITS):

        model = _build_model(vocab_size, embedding_dim, maxlen, embedding_matrix)

        # Run Model #
        # No validation_data is passed: the fit history is not used, and the
        # held-out fold is scored explicitly right below.
        model.fit(X_all[train],
                  y_all[train],
                  batch_size=batch_size,
                  epochs=epochs,
                  verbose=False)

        # `Sequential.predict_classes` no longer exists in current Keras; for a
        # single sigmoid output it was equivalent to thresholding at 0.5.
        # ravel() turns the (n, 1) output into a flat vector so the stored
        # predictions are plain numbers rather than 1-element arrays.
        y_pred_p = model.predict(X_all[test]).ravel()
        y_pred = (y_pred_p > 0.5).astype("int32")

        # Add to List with Final Results #
        y_actual.extend(y_all[test].tolist())
        y_predicted.extend(y_pred.tolist())
        y_scores.extend(y_pred_p.tolist())
        id_s.extend(ids_all[test].tolist())

    # ---------------------------------------------------------- #
    # This runs only after all of the folds have been classified #
    # ---------------------------------------------------------- #

    # Compute the Share of AI Patents #
    # NOTE(review): assumes the positive label 1 means "AI patent" — confirm against the data.
    Share = np.round(np.mean(y_predicted), 3)

    # Calculate Model Performance Metrics #
    Accuracy = accuracy_score(y_actual, y_predicted)
    # AUC is a ranking metric, so it is computed from the predicted
    # probabilities rather than from the thresholded labels.
    ROC = roc_auc_score(y_actual, y_scores)
    Precision = precision_score(y_actual, y_predicted)
    Recall = recall_score(y_actual, y_predicted)
    F1 = f1_score(y_actual, y_predicted)
    CM = confusion_matrix(y_actual, y_predicted)

    # sklearn's confusion matrix has one row per ACTUAL class and one column
    # per PREDICTED class, i.e. for labels (0, 1):
    #     [[TN, FP],
    #      [FN, TP]]
    # Each cell is expressed as a share of its predicted class (column total).
    predicted_negative = CM[0][0] + CM[1][0]
    predicted_positive = CM[0][1] + CM[1][1]

    TN = np.round(CM[0][0] / predicted_negative, 3)
    FP = np.round(CM[0][1] / predicted_positive, 3)
    FN = np.round(CM[1][0] / predicted_negative, 3)
    TP = np.round(CM[1][1] / predicted_positive, 3)

    # Add Classification Performance Metrics to List #
    RESULTS.append([name, Share, TP, FN, FP, TN,
                    np.round(Accuracy, 3),
                    np.round(ROC, 3),
                    np.round(Precision, 3),
                    np.round(Recall, 3),
                    np.round(F1, 3)])

    # Add Classification Results to List #
    Classified_Values.append(list(zip(len(id_s) * [name], id_s, y_actual, y_predicted)))


HBox(children=(FloatProgress(value=0.0, description='Loop Through Embeddings', max=5.0, style=ProgressStyle(de…

HBox(children=(FloatProgress(value=0.0, description='Cross-Validating', max=5.0, style=ProgressStyle(descripti…

HBox(children=(FloatProgress(value=0.0, description='Cross-Validating', max=5.0, style=ProgressStyle(descripti…

HBox(children=(FloatProgress(value=0.0, description='Cross-Validating', max=5.0, style=ProgressStyle(descripti…

HBox(children=(FloatProgress(value=0.0, description='Cross-Validating', max=5.0, style=ProgressStyle(descripti…

HBox(children=(FloatProgress(value=0.0, description='Cross-Validating', max=5.0, style=ProgressStyle(descripti…




In [None]:
# Convert List to Dataframe #
METRIC_COLUMNS = ["Share", "True-Positives",
                  "False-Negatives", "False-Positives",
                  "True-Negatives", "Accuracy", "AUC",
                  "Precision", "Recall", "F1"]

RESULTS_TABLE = pd.DataFrame(RESULTS, columns=["Name"] + METRIC_COLUMNS)

# Tag every row with the model family and move that tag next to the name #
RESULTS_TABLE["Type"] = "CNN"
RESULTS_TABLE = RESULTS_TABLE[["Name", "Type"] + METRIC_COLUMNS]

# Rank once by accuracy (best first); the same ranking is written out and displayed #
RESULTS_RANKED = RESULTS_TABLE.sort_values("Accuracy", ascending=False)

# Output Results #
RESULTS_RANKED.to_csv("./Output/Model Performance/CNN Model Classification Performance.csv")

# Display Results -- Out of Sample (Holdout) prediction -- Sorted by Accuracy #
RESULTS_RANKED

Unnamed: 0,Name,Type,Share,True-Positives,False-Negatives,False-Positives,True-Negatives,Accuracy,AUC,Precision,Recall,F1
3,Doc2Vec USPTO Patent Embeddings,CNN,0.203,0.842,0.963,0.158,0.037,0.938,0.906,0.842,0.853,0.848
1,GLOVE (6B - 50),CNN,0.19,0.855,0.953,0.145,0.047,0.934,0.888,0.855,0.81,0.832
2,GLOVE (840B - 300),CNN,0.194,0.834,0.952,0.166,0.048,0.929,0.884,0.834,0.808,0.821
4,FastText,CNN,0.193,0.811,0.946,0.189,0.054,0.92,0.868,0.811,0.782,0.796
0,No Embeddings,CNN,0.193,0.777,0.938,0.223,0.062,0.907,0.848,0.777,0.749,0.763


In [None]:
# Output Classification Results for Training Dataset -- PREDICTED VALUES -- Out Of Sample (Holdout) Prediction # 

# Build one wide table: id, Actual, then one prediction column per model #
for position, model_rows in enumerate(Classified_Values):

    model_frame = pd.DataFrame(model_rows,
                               columns=['Model', 'id', 'Actual', 'Predicted'])

    # Every row carries the same model name; use it as the prediction column header
    model_name = model_frame.loc[0, 'Model']
    predictions = model_frame.rename(columns={'Predicted': model_name})

    if position == 0:
        # The first model also contributes the shared 'Actual' column
        Final = predictions[['id', 'Actual', model_name]]
        continue

    # Later models only add their own prediction column, matched on the patent id
    Final = Final.merge(predictions[['id', model_name]], on=['id'])

# Save Data Frame # 
Final.to_csv("./Output/Classification Output/CNN Classification Results.csv")