In [None]:
# !pip install torch
# !pip install sentence_transformers

In [1]:
import numpy as np
import pandas as pd
import torch
from collections import Counter
import re
import time

from sentence_transformers import SentenceTransformer, util

# text processing
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords

import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Description of the data fields ---
# patent_train and patent_test
    # id - unique identifier for a pair of phrases
    # anchor - first phrase
    # target - second phrase
    # context - CPC classification which indicates the context which the similarity is to be scored
    # score - similarity between the two phrases
    
# patent_titles
    # code - hierarchical code used to categorize the patent; corresponds to the context field in patent_train and patent_test dataframe
    # title - description of the code field
    # section - first symbol in the code field; ranges from A - H and Y
    # class - 2 digit class
    # subclass - 1 letter code subclass
    # group - 1-3 digit group code value
    # main_group - 2+ digit main or subgroup after the / symbol
    # EXAMPLE: patent_titles.loc[3,'code'] = 'A01B1/00'
        # title = 'Hand tools (edge trimmers for lawns A01G3/06  {; machines for working soil A01B35/00; making hand tools B21D})'
        # section = A
        # class = 1.0
        # subclass = B
        # group = 1.0
        # main_group = 00
        
# --- Description of the data fields ---

# Preprocessing of the patents

## Organizing and defining the model and datasets

### Read in the BERT transformer model

In [2]:
# import the BERT model
model = SentenceTransformer('bert-base-nli-mean-tokens')

### Read in the data that has been uploaded to the GCP bucket

In [3]:
# Read in the three competition CSVs from a single data directory.
# DATA_DIR is the only place that has to change when the notebook is run
# on a machine where the files live somewhere else.
DATA_DIR = '/home/jupyter/uspto_analysis'
patent_train = pd.read_csv(f'{DATA_DIR}/train.csv')
patent_test = pd.read_csv(f'{DATA_DIR}/test.csv')
patent_titles = pd.read_csv(f'{DATA_DIR}/titles.csv')

### Join the training and testing datasets with the titles csv
titles.csv contains more information on the context of the patent

In [4]:
# Join the training pairs with the CPC titles (context -> code).
# validate='many_to_one' makes pandas raise a MergeError if a code occurs more
# than once in patent_titles; without it a left join would silently duplicate
# training rows.
patents_combined = patent_train.merge(
    patent_titles,
    how = 'left',
    left_on = 'context',
    right_on = 'code',
    validate = 'many_to_one',
)
# keep only the columns used downstream
patents_combined = patents_combined[['id', 'anchor', 'target', 'context', 'title', 'score']]

In [5]:
# Join the testing pairs with the CPC titles (context -> code).
# validate='many_to_one' makes pandas raise a MergeError if a code occurs more
# than once in patent_titles; duplicated test rows would otherwise misalign the
# ids and scores written to the submission file.
testing_combined = patent_test.merge(
    patent_titles,
    how = 'left',
    left_on = 'context',
    right_on = 'code',
    validate = 'many_to_one',
)
# keep only the columns used downstream (the test set has no score)
testing_combined = testing_combined[['id', 'anchor', 'target', 'context', 'title']]

In [6]:
# create a copy of the original dataframe and set the id as the index
text_processing_frame = patents_combined.copy()
text_processing_frame = text_processing_frame.set_index('id')
text_processing_frame.columns

Index(['anchor', 'target', 'context', 'title', 'score'], dtype='object')

## Processing the text of the patents

In [13]:
# Lowercase the three free-text columns (anchor, target and title) in place
# so that later token handling is case-insensitive.
for text_column in ('anchor', 'target', 'title'):
    text_processing_frame[text_column] = text_processing_frame[text_column].str.lower()

In [14]:
# Replace every run of non-word characters (anything that is not a letter,
# digit or underscore) with a single space in the anchor, target and title fields.
# regex=True is required: without it newer pandas versions treat the pattern
# as a literal string and nothing is cleaned. The raw string keeps '\W' from
# being read as a string escape.
text_processing_frame['anchor'] = text_processing_frame.anchor.str.replace(r'\W+', ' ', regex=True)
text_processing_frame['target'] = text_processing_frame.target.str.replace(r'\W+', ' ', regex=True)
# the cleaned title goes into a new column; the original title column is kept
text_processing_frame['title_alpha'] = text_processing_frame.title.str.replace(r'\W+', ' ', regex=True)

  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [15]:
# Lemmatization of the data fields
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Tokenize `text` and return the list of lemmatized tokens.

    Uses nltk's `word_tokenize` for splitting and the module-level
    `lemmatizer` (a WordNetLemmatizer) for each token.
    """
    lemmas = []
    for raw_token in word_tokenize(text):
        lemmas.append(lemmatizer.lemmatize(raw_token))
    return lemmas

In [16]:
# Applying the lemmatization function to the text datafields
text_processing_frame['lemmatized_anchor'] = text_processing_frame.anchor.apply(lemmatize_text)
text_processing_frame['lemmatized_target'] = text_processing_frame.target.apply(lemmatize_text)
text_processing_frame['lemmatized_title'] = text_processing_frame.title_alpha.apply(lemmatize_text)

In [17]:
# Define a new dataframe with only the needed datafields
patent_select = pd.DataFrame(text_processing_frame[['lemmatized_anchor','lemmatized_target','lemmatized_title', 'context', 'score']])

In [18]:
patent_select

Unnamed: 0_level_0,lemmatized_anchor,lemmatized_target,lemmatized_title,context,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
37d61fd2272659b1,[abatement],"[abatement, of, pollution]","[furniture, domestic, article, or, appliance, ...",A47,0.50
7b9652b17b68b7a4,[abatement],"[act, of, abating]","[furniture, domestic, article, or, appliance, ...",A47,0.75
36d72442aefd8232,[abatement],"[active, catalyst]","[furniture, domestic, article, or, appliance, ...",A47,0.25
5296b0c19e1ce60e,[abatement],"[eliminating, process]","[furniture, domestic, article, or, appliance, ...",A47,0.50
54c1e3b9184cb5b6,[abatement],"[forest, region]","[furniture, domestic, article, or, appliance, ...",A47,0.00
...,...,...,...,...,...
8e1386cbefd7f245,"[wood, article]","[wooden, article]","[decorative, art]",B44,1.00
42d9e032d1cd3242,"[wood, article]","[wooden, box]","[decorative, art]",B44,0.50
208654ccb9e14fa3,"[wood, article]","[wooden, handle]","[decorative, art]",B44,0.50
756ec035e694722b,"[wood, article]","[wooden, material]","[decorative, art]",B44,0.75


### Process to create a word dictionary for further filtering out tokens

In [19]:
# create list of all tokens - lemmatized versions
anchor_list = list(text_processing_frame['lemmatized_anchor'])
target_list = list(text_processing_frame['lemmatized_target'])
title_list = list(text_processing_frame['lemmatized_title'])

In [20]:
combined_list = (anchor_list + target_list + title_list)

In [21]:
# total number of tokens = 386,751
# number of unique tokens
    # lemmatization = 8,031
# Flatten the list of token lists into one flat list of tokens.
all_words = [word for item in combined_list for word in item]

In [22]:
len(all_words)

386751

In [23]:
# frequency of each individual token
# convert to a dictionary
token_count = FreqDist(all_words)
len(token_count)
token_list = list(token_count)
token_count_dict = dict(token_count)

sorted_dict = sorted(token_count_dict.items(), key = lambda x: x[1], reverse = True)

In [24]:
len(token_list)

8031

In [25]:
# create list of tokens that are to be removed
stop_words = list(stopwords.words('english'))
token_len_one = [w for w in token_list if len(w) == 1]
token_len_two = [w for w in token_list if len(w) == 2]
numeric_tokens = [num for num in token_list if any(c.isdigit() for c in num)]

In [26]:
# token_len_two

### EDA - Token Descriptions
- lengths for each unique token
- avg, max, min, sd of tokens
- number of consonants in each token
- percentiles of token lengths

In [27]:
# Character length of every unique token, in the same order as token_list
token_length_list = [len(word) for word in token_list]

In [28]:
# Average length of tokens
# max length of tokens
# standard deviation of tokens
avg_token = sum(map(len, token_list))/len(token_list)
avg_token
max_length = max(token_length_list)
max_length
token_sd = np.std(token_length_list)
token_sd

2.9313652766188376

In [29]:
# Number of consonants in tokens
# Builds one (token, token_length, consonant_count) tuple per unique token.
vowels = ['a', 'e', 'i', 'o', 'u']
token_data = []
for token in token_list:
    token_len = len(token)
    # Only alphabetic characters can be consonants. Digits and underscores
    # survive the \W+ cleanup and were previously counted as consonants too,
    # which inflated the consonant percentage of tokens containing them.
    consonants = sum(1 for letter in token if letter.isalpha() and letter not in vowels)
    token_data.append((token, token_len, consonants))

In [30]:
# create a dataframe with metadata about each of the unique tokens
token_count_df = pd.DataFrame(token_count_dict.items(), columns = ['token', 'token_count'])
token_metadata = pd.DataFrame(token_data, columns = ['token', 'token_length', 'consonant_count'])
token_metadata = token_metadata.merge(token_count_df, left_on = 'token', right_on = 'token')
token_metadata['consonant_percentage'] = token_metadata['consonant_count']/token_metadata['token_length']

high_consonants = token_metadata[token_metadata['consonant_percentage'] > 0.9]
consonant_list = list(high_consonants['token'])

In [31]:
# check for any tokens containing numeric values
numeric_tokens = [num for num in token_list if any(c.isdigit() for c in num)]       

In [32]:
# Percentiles of token lengths
twentyfive_percentile = np.percentile(token_length_list, 25)
fifty_percentile = np.percentile(token_length_list, 50)
sevenfive_percentile = np.percentile(token_length_list, 75)
twentyfive_percentile
fifty_percentile
sevenfive_percentile

9.0

In [33]:
# Playing around with mean and standard deviation of the token lengths
# Testing if long tokens should be removed - NO
    # Many important long tokens
max_length = max(token_length_list)
dictionary_lengths = dict(zip(token_list, token_length_list))

sd_value = avg_token + (token_sd *2)
sd_value_three = avg_token + (token_sd *3)
sd_min = avg_token - (token_sd*2)
two_sd = [k for k,v in dictionary_lengths.items() if v >= sd_value]
three_sd = [k for k,v in dictionary_lengths.items() if v >= sd_value_three]

In [34]:
# Define a list of tokens to be removed
remove_list = stop_words + token_len_one + consonant_list + numeric_tokens

In [35]:
lemmatized_anchor = patent_select['lemmatized_anchor']
lemmatized_target = patent_select['lemmatized_target']
lemmatized_title = patent_select['lemmatized_title']

In [36]:
# function to apply the word dictionary to the anchor, target, and title data fields
def word_dictionary_apply(patent_datafield, tokens_to_remove=None):
    """Drop unwanted tokens from every token list in `patent_datafield`.

    Parameters
    ----------
    patent_datafield : iterable of iterables of str
        One token list per patent entry (e.g. a column of lemmatized tokens).
    tokens_to_remove : iterable of str, optional
        Tokens to filter out. Defaults to the module-level `remove_list`,
        which is what the original single-argument call used.

    Returns
    -------
    list of list of str
        One filtered token list per input entry, in the same order, with the
        surviving tokens in their original order.
    """
    if tokens_to_remove is None:
        tokens_to_remove = remove_list
    # Build the lookup once as a set: testing membership against a list for
    # every token of every entry is needlessly slow.
    blocked = set(tokens_to_remove)
    return [
        [token for token in elements if token not in blocked]
        for elements in patent_datafield
    ]

In [37]:
# apply the word dictionary to the specified text fields
anchor_filter = word_dictionary_apply(lemmatized_anchor)
target_filter = word_dictionary_apply(lemmatized_target)
title_filter = word_dictionary_apply(lemmatized_title)

In [38]:
# append the text lists to the patent dataframe after the word dictionary has been applied
patent_select['anchor_dict'] = anchor_filter
patent_select['target_dict'] = target_filter
patent_select['title_dict'] = title_filter

In [39]:
# The embedding model takes plain strings, not token lists, so each
# tokenized phrase is joined back into one space-separated string.
def list_to_string(datafield_list):
    """Join every token list in `datafield_list` into a single string.

    Returns a list of strings, one per input token list, with the tokens
    separated by single spaces. An empty token list yields an empty string.
    """
    return [' '.join(phrase) for phrase in datafield_list]

In [40]:
# apply the function to the tokenized text lists
anchor_list = list_to_string(anchor_filter)
target_list = list_to_string(target_filter)
title_list = list_to_string(title_filter)

In [41]:
# append back to the dataframe
patent_select['anchor_list'] = anchor_list
patent_select['target_list'] = target_list
patent_select['title_list'] = title_list

In [42]:
patent_select = patent_select.drop(columns = ['lemmatized_anchor','lemmatized_target','lemmatized_title'])

In [43]:
# create a new datafield - 'target_title_combined'
# concatenate the target and title datafields in order to calculate cosine similarity later on
patent_select['target_title_combined'] = patent_select['target_list'] + ' ' + patent_select['title_list']

In [44]:
patent_select['target_title_combined']

id
37d61fd2272659b1    abatement pollution furniture domestic article...
7b9652b17b68b7a4    act abating furniture domestic article applian...
36d72442aefd8232    active catalyst furniture domestic article app...
5296b0c19e1ce60e    eliminating process furniture domestic article...
54c1e3b9184cb5b6    forest region furniture domestic article appli...
                                          ...                        
8e1386cbefd7f245                        wooden article decorative art
42d9e032d1cd3242                            wooden box decorative art
208654ccb9e14fa3                         wooden handle decorative art
756ec035e694722b                       wooden material decorative art
8d135da0b55b8c88                      wooden substrate decorative art
Name: target_title_combined, Length: 36473, dtype: object

# Implementation of the BERT Model

## Testing the cosine similarity between a sample anchor, target, and title

In [45]:
patent_select.iloc[0,:]

context                                                                A47
score                                                                  0.5
anchor_dict                                                    [abatement]
target_dict                                         [abatement, pollution]
title_dict               [furniture, domestic, article, appliance, coff...
anchor_list                                                      abatement
target_list                                            abatement pollution
title_list               furniture domestic article appliance coffee mi...
target_title_combined    abatement pollution furniture domestic article...
Name: 37d61fd2272659b1, dtype: object

In [46]:
# Cosine similarity helper for two 1-D vectors
def cosine(u, v):
    """Return the cosine similarity of vectors `u` and `v`.

    Computed as dot(u, v) divided by the product of the two Euclidean norms.
    No guard is applied for zero-length vectors.
    """
    dot_product = np.dot(u, v)
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    return dot_product / norm_product

In [None]:
# Evaluation of similarity between the first patent entry:
    # anchor - 'abatement'
    # target and title - 'abatement pollution furniture domestic article appliance coffee mill spice mill suction cleaner general'
    # actual score - 0.5
    # cosine similarity value - 0.26190528

In [47]:
### encoding tests
# Pick the fields of the first patent entry by column label rather than by
# position: positional indices silently point at the wrong column as soon as
# the column layout of patent_select changes.
first_patent = patent_select.iloc[0]
sample_anchor = first_patent['anchor_list']
sample_combined = first_patent['target_title_combined']
sample_target = first_patent['target_list']
# embed each sample phrase with the sentence-transformer model
sample_embed1 = model.encode(sample_anchor)
sample_embed2 = model.encode(sample_combined)
sample_embed3 = model.encode(sample_target)

In [48]:
from sklearn.metrics.pairwise import cosine_similarity

In [49]:
sklearn_reshape1 = sample_embed1.reshape(1, -1)
sklearn_reshape2 = sample_embed2.reshape(1, -1)
sklearn_reshape3 = sample_embed3.reshape(1, -1)

In [50]:
# cosine similarity between anchor and target/title combined
cosine_function = cosine(sample_embed1, sample_embed2)
sklearn_cosine = cosine_similarity(sklearn_reshape1, sklearn_reshape2)

In [51]:
cosine_function

0.26190525

In [52]:
sklearn_cosine

array([[0.26190528]], dtype=float32)

In [53]:
# cosine similarity between only anchor and target
cosine_function1 = cosine(sample_embed1, sample_embed3)
sklearn_cosine2 = cosine_similarity(sklearn_reshape1, sklearn_reshape3)

In [54]:
cosine_function1

0.8251631

In [55]:
sklearn_cosine2

array([[0.8251631]], dtype=float32)

## Embed anchor, target, and target/title datafields

In [65]:
# Embed one text column of patent_select and shape the vectors for sklearn
def variable_embedding(datafield):
    """Encode the `datafield` column of `patent_select` with `model`.

    Reads the module-level `patent_select` dataframe (whichever frame that
    name is bound to when the function is called) and the module-level
    sentence-transformer `model`.

    Returns a list with one entry per row: each embedding reshaped to
    (1, -1), the 2-D layout sklearn's `cosine_similarity` takes.
    """
    phrases = list(patent_select[datafield])
    embeddings = model.encode(phrases)
    return [vector.reshape(1, -1) for vector in embeddings]

In [66]:
patent_select.columns

Index(['context', 'score', 'anchor_dict', 'target_dict', 'title_dict',
       'anchor_list', 'target_list', 'title_list', 'target_title_combined'],
      dtype='object')

In [67]:
# Embed the three text fields and report the elapsed wall-clock time.
start = time.time()
anchor_embedding = variable_embedding('anchor_list')
target_embedding = variable_embedding('target_list')
target_title_embedding = variable_embedding('target_title_combined')
# State the unit explicitly: this cell reports seconds.
print('Time to embed (seconds): ', (time.time() - start))

Time to embed:  601.8831436634064


### Process to calculate the cosine similarity between the anchor and target/title combined fields

# Processing and Handling on the Testing Dataset

In [68]:
# create a copy of the original dataframe and set the id as the index
text_processing_frame = testing_combined.copy()
text_processing_frame = text_processing_frame.set_index('id')
text_processing_frame.columns

Index(['anchor', 'target', 'context', 'title'], dtype='object')

In [70]:
text_processing_frame.shape

(36, 4)

## Processing the text of the patents

In [71]:
# Lowercase the three free-text columns (anchor, target and title) in place
# so that later token handling is case-insensitive.
for text_column in ('anchor', 'target', 'title'):
    text_processing_frame[text_column] = text_processing_frame[text_column].str.lower()

In [72]:
# Replace every run of non-word characters (anything that is not a letter,
# digit or underscore) with a single space in the anchor, target and title fields.
# regex=True is required: without it newer pandas versions treat the pattern
# as a literal string and nothing is cleaned. The raw string keeps '\W' from
# being read as a string escape.
text_processing_frame['anchor'] = text_processing_frame.anchor.str.replace(r'\W+', ' ', regex=True)
text_processing_frame['target'] = text_processing_frame.target.str.replace(r'\W+', ' ', regex=True)
# the cleaned title goes into a new column; the original title column is kept
text_processing_frame['title_alpha'] = text_processing_frame.title.str.replace(r'\W+', ' ', regex=True)

  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [73]:
# Lemmatization of the data fields
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Tokenize `text` and return the list of lemmatized tokens.

    Uses nltk's `word_tokenize` for splitting and the module-level
    `lemmatizer` (a WordNetLemmatizer) for each token.
    NOTE(review): this repeats the definition from the training section of
    the notebook; a single shared definition would be enough.
    """
    lemmas = []
    for raw_token in word_tokenize(text):
        lemmas.append(lemmatizer.lemmatize(raw_token))
    return lemmas

In [74]:
# Applying the lemmatization function to the text datafields
text_processing_frame['lemmatized_anchor'] = text_processing_frame.anchor.apply(lemmatize_text)
text_processing_frame['lemmatized_target'] = text_processing_frame.target.apply(lemmatize_text)
text_processing_frame['lemmatized_title'] = text_processing_frame.title_alpha.apply(lemmatize_text)

In [76]:
text_processing_frame.columns

Index(['anchor', 'target', 'context', 'title', 'title_alpha',
       'lemmatized_anchor', 'lemmatized_target', 'lemmatized_title'],
      dtype='object')

In [77]:
# Define a new dataframe with only the needed datafields
patent_select = pd.DataFrame(text_processing_frame[['lemmatized_anchor','lemmatized_target','lemmatized_title', 'context']])

In [None]:
patent_select

In [79]:
lemmatized_anchor = patent_select['lemmatized_anchor']
lemmatized_target = patent_select['lemmatized_target']
lemmatized_title = patent_select['lemmatized_title']

In [80]:
# apply the word dictionary to the specified text fields
anchor_filter = word_dictionary_apply(lemmatized_anchor)
target_filter = word_dictionary_apply(lemmatized_target)
title_filter = word_dictionary_apply(lemmatized_title)

In [81]:
# append the text lists to the patent dataframe after the word dictionary has been applied
patent_select['anchor_dict'] = anchor_filter
patent_select['target_dict'] = target_filter
patent_select['title_dict'] = title_filter

In [82]:
# apply the function to the tokenized text lists
anchor_list = list_to_string(anchor_filter)
target_list = list_to_string(target_filter)
title_list = list_to_string(title_filter)

In [83]:
# append back to the dataframe
patent_select['anchor_list'] = anchor_list
patent_select['target_list'] = target_list
patent_select['title_list'] = title_list

In [84]:
patent_select = patent_select.drop(columns = ['lemmatized_anchor','lemmatized_target','lemmatized_title'])

In [85]:
# create a new datafield - 'target_title_combined'
# concatenate the target and title datafields in order to calculate cosine similarity later on
patent_select['target_title_combined'] = patent_select['target_list'] + ' ' + patent_select['title_list']

In [86]:
patent_select.columns

Index(['context', 'anchor_dict', 'target_dict', 'title_dict', 'anchor_list',
       'target_list', 'title_list', 'target_title_combined'],
      dtype='object')

In [87]:
# Embed the three text fields and report the elapsed wall-clock time.
start = time.time()
anchor_embedding = variable_embedding('anchor_list')
target_embedding = variable_embedding('target_list')
target_title_embedding = variable_embedding('target_title_combined')
# The elapsed seconds are divided by 60, so state the unit explicitly.
print('Time to embed (minutes): ', (time.time() - start)/60)

Time to embed:  0.1024384339650472


In [91]:
# Cosine similarity between each anchor embedding and the matching
# target/title embedding; every result is a 1x1 array from sklearn.
cosine_similarity_list = []
for anchor_val, target_title_val in zip(anchor_embedding, target_title_embedding):
    cosine_similarity_list.append(cosine_similarity(anchor_val, target_title_val))

In [108]:
# Snap similarity values onto a fixed grid (e.g. quarters) for submission
def quarter_round(cosine_list, round_val):
    """Round every value in `cosine_list` to the nearest multiple of `round_val`.

    Each value is divided by `round_val`, rounded with Python's built-in
    `round` (ties go to the even integer) and scaled back. Returns a new
    list in the same order as the input.
    """
    return [round(num / round_val) * round_val for num in cosine_list]

In [118]:
# Convert each 1x1 similarity array to a nested Python list ...
unwrap_list = [similarity.tolist() for similarity in cosine_similarity_list]

# ... and pull the single scalar out of every [[value]] wrapper.
cosine_list = [item[0][0] for item in unwrap_list]

In [120]:
score_rounded = quarter_round(cosine_list, 0.25)

In [121]:
score_rounded

[0.5,
 0.75,
 0.75,
 0.5,
 0.75,
 0.5,
 0.75,
 0.5,
 0.75,
 0.75,
 0.75,
 0.5,
 0.75,
 1.0,
 0.5,
 0.75,
 0.5,
 0.5,
 0.25,
 0.5,
 0.5,
 0.5,
 0.75,
 0.25,
 0.5,
 0.5,
 0.75,
 0.5,
 0.5,
 1.0,
 0.25,
 0.25,
 0.75,
 0.5,
 0.5,
 0.25]

# Put together file for submission

In [128]:
# sample submission file
sample_submission = pd.read_csv('/home/jupyter/uspto_analysis/sample_submission.csv')

In [125]:
# Pair every test id with its rounded score.
testing_ids = list(testing_combined['id'])
# Building from a dict makes pandas raise a ValueError if the two lists have
# different lengths; zip() would silently drop the unmatched tail instead.
submission_df = pd.DataFrame({'id': testing_ids, 'score': score_rounded})

In [130]:
# index=False keeps the file to the two columns id and score; the default
# would also write the dataframe's row index as an extra unnamed first column.
submission_df.to_csv('/home/jupyter/uspto_analysis/submission.csv', index=False)

In [None]:
# Process for stemming the content of the patents
# Too aggressive for my liking
# Worse performance when calculating the cosine similarity between the patents later on

In [None]:
'''
stemmer = PorterStemmer()
def stemmer_text(text_field):
    return [stemmer.stem(w) for w in word_tokenize(text_field)]
'''

In [None]:
'''
text_processing_frame['stemmed_anchor'] = text_processing_frame.anchor.apply(stemmer_text)
text_processing_frame['stemmed_target'] = text_processing_frame.target.apply(stemmer_text)
text_processing_frame['stemmed_title'] = text_processing_frame.title_alpha.apply(stemmer_text)


patent_select = pd.DataFrame(text_processing_frame[['lemmatized_anchor', 'lemmatized_target', 'lemmatized_title', 'code', 'score']])
patent_select = pd.DataFrame(text_processing_frame[['stemmed_anchor', 'stemmed_target', 'stemmed_title', 'code', 'score']])
'''

In [None]:
'''# create list of all tokens - stemmed versions
anchor_list = list(text_processing_frame['stemmed_anchor'])
target_list = list(text_processing_frame['stemmed_target'])
title_list = list(text_processing_frame['stemmed_title'])'''

# Patent EDA

In [None]:
# --- Description of the data fields ---
# patent_train and patent_test
    # id - unique identifier for a pair of phrases
    # anchor - first phrase
    # target - second phrase
    # context - CPC classification which indicates the context which the similarity is to be scored
    # score - similarity between the two phrases
    
# patent_titles
    # code - hierarchical code used to categorize the patent; corresponds to the context field in patent_train and patent_test dataframe
    # title - description of the code field
    # section - first symbol in the code field; ranges from A - H and Y
    # class - 2 digit class
    # subclass - 1 letter code subclass
    # group - 1-3 digit group code value
    # main_group - 2+ digit main or subgroup after the / symbol
    # EXAMPLE: patent_titles.loc[3,'code'] = 'A01B1/00'
        # title = 'Hand tools (edge trimmers for lawns A01G3/06  {; machines for working soil A01B35/00; making hand tools B21D})'
        # section = A
        # class = 1.0
        # subclass = B
        # group = 1.0
        # main_group = 00
        
# --- Description of the data fields ---

In [2]:
patent_train.head()
patent_test.head()
# patent_titles.head()
# patent_cpc.head()

patent_train.shape

# data fields
patent_train.columns
patent_titles.columns
# patent_cpc.columns

NameError: name 'patent_train' is not defined

In [3]:
# function to view the first 10 rows of selected columns of a dataframe
def view_data(dataframe, *args):
    """Return the first 10 rows of each requested column.

    Each positional argument after `dataframe` is a column position (or any
    other positional indexer accepted by both `DataFrame.iloc` and
    `dataframe.columns[...]`).

    Returns a list of (column_header, column_data) tuples, one per argument,
    in the order the arguments were given.
    """
    return [
        (dataframe.columns[position], dataframe.iloc[0:10, position])
        for position in args
    ]

patent_titles.shape
view_data(patent_titles,range(0,7))
patent_titles.iloc[0:10,0:6]

NameError: name 'patent_titles' is not defined