In [1]:
import numpy as np
import pandas as pd
import torch
from collections import Counter
import re
import time

import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim import corpora, models, similarities
from gensim.utils import simple_preprocess

import sklearn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

# text processing
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords

import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Description of the data fields ---
# patent_train and patent_test
    # id - unique identifier for a pair of phrases
    # anchor - first phrase
    # target - second phrase
    # context - CPC classification which indicates the context which the similarity is to be scored
    # score - similarity between the two phrases
    
# patent_titles
    # code - hierarchical code used to categorize the patent; corresponds to the context field in patent_train and patent_test dataframe
    # title - description of the code field
    # section - first symbol in the title field; ranges from A - H and Y
    # class - 2 digit class
    # subclass - 1 letter code subclass
    # group - 1-3 digit group code value
    # main_group - 2+ sigit main or subgroup after the / symbol
    # EXAMPLE: patent_titles.loc[3,'code'] = 'A01B1/00'
        # title = 'Hand tools (edge trimmers for lawns A01G3/06  {; machines for working soil A01B35/00; making hand tools B21D})'
        # section = A
        # class = 1.0
        # subclass = B
        # group = 1.0
        # main_group = 00
        
# --- Description of the data fields ---

# Preprocessing of the patents

### Read in the data that has been uploaded to the GCP bucket

In [3]:
# read in the data from the folder
patent_train = pd.read_csv('/home/jupyter/uspto_analysis/train.csv')
patent_test = pd.read_csv('/home/jupyter/uspto_analysis/test.csv')
patent_titles = pd.read_csv('/home/jupyter/uspto_analysis/titles.csv')

### Join the training and testing datasets with the titles csv
titles.csv contains more information on the context of the patent

In [4]:
# joining the training dataset
patents_combined = patent_train.merge(patent_titles, how = 'left', left_on = 'context', right_on = 'code')
patents_combined = patents_combined[['id', 'anchor', 'target', 'context', 'title', 'score']]

In [5]:
# joining the testing dataset
testing_combined = patent_test.merge(patent_titles, how = 'left', left_on = 'context', right_on = 'code')
testing_combined = testing_combined[['id', 'anchor', 'target', 'context', 'title']]

In [6]:
# create a copy of the original dataframe and set the id as the index
text_processing_frame = patents_combined.copy()
text_processing_frame = text_processing_frame.set_index('id')
text_processing_frame.columns

Index(['anchor', 'target', 'context', 'title', 'score'], dtype='object')

## Processing the text of the patents

In [7]:
# convert all of the text fields to lowercase 
    # anchor, target, and code
text_processing_frame['anchor'] = text_processing_frame['anchor'].str.lower()
text_processing_frame['target'] = text_processing_frame['target'].str.lower()
text_processing_frame['title'] = text_processing_frame['title'].str.lower()

In [8]:
# remove all non-alphabetic characters from the anchor, target, and title fields
text_processing_frame['anchor'] = text_processing_frame.anchor.str.replace('\W+', ' ')
text_processing_frame['target'] = text_processing_frame.target.str.replace('\W+', ' ')
text_processing_frame['title_alpha'] = text_processing_frame.title.str.replace('\W+', ' ')

  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [9]:
# Lemmatization of the data fields
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in word_tokenize(text)]

In [10]:
# Applying the lemmatization function to the text datafields
text_processing_frame['lemmatized_anchor'] = text_processing_frame.anchor.apply(lemmatize_text)
text_processing_frame['lemmatized_target'] = text_processing_frame.target.apply(lemmatize_text)
text_processing_frame['lemmatized_title'] = text_processing_frame.title_alpha.apply(lemmatize_text)

In [11]:
# Define new dataframe with the the needed datafields
patent_select = pd.DataFrame(text_processing_frame[['lemmatized_anchor','lemmatized_target','lemmatized_title', 'context', 'score']])

In [12]:
patent_select

Unnamed: 0_level_0,lemmatized_anchor,lemmatized_target,lemmatized_title,context,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
37d61fd2272659b1,[abatement],"[abatement, of, pollution]","[furniture, domestic, article, or, appliance, ...",A47,0.50
7b9652b17b68b7a4,[abatement],"[act, of, abating]","[furniture, domestic, article, or, appliance, ...",A47,0.75
36d72442aefd8232,[abatement],"[active, catalyst]","[furniture, domestic, article, or, appliance, ...",A47,0.25
5296b0c19e1ce60e,[abatement],"[eliminating, process]","[furniture, domestic, article, or, appliance, ...",A47,0.50
54c1e3b9184cb5b6,[abatement],"[forest, region]","[furniture, domestic, article, or, appliance, ...",A47,0.00
...,...,...,...,...,...
8e1386cbefd7f245,"[wood, article]","[wooden, article]","[decorative, art]",B44,1.00
42d9e032d1cd3242,"[wood, article]","[wooden, box]","[decorative, art]",B44,0.50
208654ccb9e14fa3,"[wood, article]","[wooden, handle]","[decorative, art]",B44,0.50
756ec035e694722b,"[wood, article]","[wooden, material]","[decorative, art]",B44,0.75


### Process to create a word dictionary for further filtering out tokens

In [13]:
# create list of all tokens - lemmatized versions
anchor_list = list(text_processing_frame['lemmatized_anchor'])
target_list = list(text_processing_frame['lemmatized_target'])
title_list = list(text_processing_frame['lemmatized_title'])

In [14]:
combined_list = (anchor_list + target_list + title_list)

In [15]:
# total number of tokens = 386,751
# number of unique tokens
    # lemmatization = 8,031
all_words = []
for item in combined_list:
    for word in item:
        all_words.append(word)

In [16]:
len(all_words)

386751

In [17]:
# frequency of each individual token
# convert to a dictionary
token_count = FreqDist(all_words)
len(token_count)
token_list = list(token_count)
token_count_dict = dict(token_count)

sorted_dict = sorted(token_count_dict.items(), key = lambda x: x[1], reverse = True)

In [18]:
len(token_list)

8031

In [19]:
# create list of tokens that are to be removed
stop_words = list(stopwords.words('english'))
token_len_one = [w for w in token_list if len(w) == 1]
token_len_two = [w for w in token_list if len(w) == 2]
numeric_tokens = [num for num in token_list if any(c.isdigit() for c in num)]

In [20]:
# token_len_two

### EDA - Token Descriptions
- lengths for each unique token
- avg, max, min, sd of tokens
- number of consonants in each token
- percentiles of token lengths

In [21]:
# Create list of lengths for each of the tokens
token_length_list = []
for word in token_list:
    token_len = len(word)
    token_length_list.append(token_len)

In [22]:
# Average length of tokens
# max length of tokens
# standard deviation of tokens
avg_token = sum(map(len, token_list))/len(token_list)
avg_token
max_length = max(token_length_list)
max_length
token_sd = np.std(token_length_list)
token_sd

2.9313652766188376

In [23]:
# Number of consonants in tokens
vowels = ['a', 'e', 'i', 'o', 'u']
token_data = []
for token in token_list:
    token_len = len(token)
    consonants = 0
    for letter in token:
        if letter not in vowels:
            consonants = consonants + 1
    token_data.append((token, token_len, consonants))

In [24]:
# create a dataframe with metadata about each of the unique tokens
token_count_df = pd.DataFrame(token_count_dict.items(), columns = ['token', 'token_count'])
token_metadata = pd.DataFrame(token_data, columns = ['token', 'token_length', 'consonant_count'])
token_metadata = token_metadata.merge(token_count_df, left_on = 'token', right_on = 'token')
token_metadata['consonant_percentage'] = token_metadata['consonant_count']/token_metadata['token_length']

high_consonants = token_metadata[token_metadata['consonant_percentage'] > 0.9]
consonant_list = list(high_consonants['token'])

In [25]:
# check for any tokens containing numeric values
numeric_tokens = [num for num in token_list if any(c.isdigit() for c in num)]       

In [26]:
# Percentiles of token lengths
twentyfive_percentile = np.percentile(token_length_list, 25)
fifty_percentile = np.percentile(token_length_list, 50)
sevenfive_percentile = np.percentile(token_length_list, 75)
twentyfive_percentile
fifty_percentile
sevenfive_percentile

9.0

In [27]:
# Playing around with mean and standard deviation of the token lengths
# Testing if long tokens should be removed - NO
    # Many important long tokens
max_length = max(token_length_list)
dictionary_lengths = dict(zip(token_list, token_length_list))

sd_value = avg_token + (token_sd *2)
sd_value_three = avg_token + (token_sd *3)
sd_min = avg_token - (token_sd*2)
two_sd = [k for k,v in dictionary_lengths.items() if v >= sd_value]
three_sd = [k for k,v in dictionary_lengths.items() if v >= sd_value_three]

In [28]:
# Define a list of tokens to be removed
remove_list = stop_words + token_len_one + consonant_list + numeric_tokens

In [29]:
lemmatized_anchor = patent_select['lemmatized_anchor']
lemmatized_target = patent_select['lemmatized_target']
lemmatized_title = patent_select['lemmatized_title']

In [30]:
# funcion to apply the word dictionary to the anchor, target, and title data fields
def word_dictionary_apply(patent_datafield):
    filter_list = []
    for elements in patent_datafield:
        inner_filter = []
        for token in elements:
            if token not in remove_list:
                inner_filter.append(token)
        filter_list.append(inner_filter)
    return filter_list

In [31]:
# apply the word dictionary to the specified text fields
anchor_filter = word_dictionary_apply(lemmatized_anchor)
target_filter = word_dictionary_apply(lemmatized_target)
title_filter = word_dictionary_apply(lemmatized_title)

In [32]:
# append the text lists to the patent dataframe after the word dictionary has been applied
patent_select['anchor_dict'] = anchor_filter
patent_select['target_dict'] = target_filter
patent_select['title_dict'] = title_filter

In [33]:
# future modelling needs input as combined string instead of tokenized text
# function to combine the tokenized text fields into a non-tokenized version
# add each as a datafield in the dataframe
def list_to_string(datafield_list):
    string_list = []
    for phrase in datafield_list:
        new_string = ' '.join(phrase)
        string_list.append(new_string)
    return string_list

In [34]:
# apply the function to the tokenized text lists
anchor_list = list_to_string(anchor_filter)
target_list = list_to_string(target_filter)
title_list = list_to_string(title_filter)

In [35]:
# append back to the dataframe
patent_select['anchor_list'] = anchor_list
patent_select['target_list'] = target_list
patent_select['title_list'] = title_list

In [36]:
patent_select = patent_select.drop(columns = ['lemmatized_anchor','lemmatized_target','lemmatized_title'])

In [37]:
# create a new datafield - 'target_title_combined'
# concatenate the target and title datafields in order to calculate cosine similarity later on
patent_select['target_title_combined'] = patent_select['target_list'] + ' ' + patent_select['title_list']

In [38]:
patent_select['target_title_combined']

id
37d61fd2272659b1    abatement pollution furniture domestic article...
7b9652b17b68b7a4    act abating furniture domestic article applian...
36d72442aefd8232    active catalyst furniture domestic article app...
5296b0c19e1ce60e    eliminating process furniture domestic article...
54c1e3b9184cb5b6    forest region furniture domestic article appli...
                                          ...                        
8e1386cbefd7f245                        wooden article decorative art
42d9e032d1cd3242                            wooden box decorative art
208654ccb9e14fa3                         wooden handle decorative art
756ec035e694722b                       wooden material decorative art
8d135da0b55b8c88                      wooden substrate decorative art
Name: target_title_combined, Length: 36473, dtype: object

## Process for tokenization

In [39]:
anchor_list = list(patent_select['anchor_list'])
target_list = list(patent_select['target_list'])
title_list = list(patent_select['title_list'])
target_title_combined = list(patent_select['target_title_combined'])

In [40]:
anchor_tokenize = []
target_tokenize = []
title_tokenize = []
target_title_tokenize = []
for item in anchor_list:
    anchor_token = word_tokenize(item)
    anchor_tokenize.append(anchor_token)
for item in target_list:
    target_token = word_tokenize(item)
    target_tokenize.append(target_token)
for item in title_list:
    title_token = word_tokenize(item)
    title_tokenize.append(title_tokenize)
for item in target_title_combined:
    target_title_token = word_tokenize(item)
    target_title_tokenize.append(target_title_token)

In [41]:
'''patent_select['anchor_tokenize'] = anchor_tokenize
patent_select['target_tokenize'] = target_tokenize
patent_select['title_tokenize'] = title_tokenize
patent_select['target_title_tokenize'] = target_title_tokenize'''

"patent_select['anchor_tokenize'] = anchor_tokenize\npatent_select['target_tokenize'] = target_tokenize\npatent_select['title_tokenize'] = title_tokenize\npatent_select['target_title_tokenize'] = target_title_tokenize"

In [42]:
patent_select

Unnamed: 0_level_0,context,score,anchor_dict,target_dict,title_dict,anchor_list,target_list,title_list,target_title_combined
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
37d61fd2272659b1,A47,0.50,[abatement],"[abatement, pollution]","[furniture, domestic, article, appliance, coff...",abatement,abatement pollution,furniture domestic article appliance coffee mi...,abatement pollution furniture domestic article...
7b9652b17b68b7a4,A47,0.75,[abatement],"[act, abating]","[furniture, domestic, article, appliance, coff...",abatement,act abating,furniture domestic article appliance coffee mi...,act abating furniture domestic article applian...
36d72442aefd8232,A47,0.25,[abatement],"[active, catalyst]","[furniture, domestic, article, appliance, coff...",abatement,active catalyst,furniture domestic article appliance coffee mi...,active catalyst furniture domestic article app...
5296b0c19e1ce60e,A47,0.50,[abatement],"[eliminating, process]","[furniture, domestic, article, appliance, coff...",abatement,eliminating process,furniture domestic article appliance coffee mi...,eliminating process furniture domestic article...
54c1e3b9184cb5b6,A47,0.00,[abatement],"[forest, region]","[furniture, domestic, article, appliance, coff...",abatement,forest region,furniture domestic article appliance coffee mi...,forest region furniture domestic article appli...
...,...,...,...,...,...,...,...,...,...
8e1386cbefd7f245,B44,1.00,"[wood, article]","[wooden, article]","[decorative, art]",wood article,wooden article,decorative art,wooden article decorative art
42d9e032d1cd3242,B44,0.50,"[wood, article]","[wooden, box]","[decorative, art]",wood article,wooden box,decorative art,wooden box decorative art
208654ccb9e14fa3,B44,0.50,"[wood, article]","[wooden, handle]","[decorative, art]",wood article,wooden handle,decorative art,wooden handle decorative art
756ec035e694722b,B44,0.75,"[wood, article]","[wooden, material]","[decorative, art]",wood article,wooden material,decorative art,wooden material decorative art


In [43]:
anchor_tokenize[0]

['abatement']

# Implementation of the Gensim tfidf module

In [44]:
all_patents = anchor_list + target_title_combined

In [45]:
# apply the gensim simple_preprocess function to get the text in the correct format for future modelling
all_patent_process = [simple_preprocess(item) for item in all_patents]

In [46]:
# create a dictionary of all the unique tokens
dictionary = corpora.Dictionary(all_patent_process)

In [47]:
# embed the anchor and target/title fields based on the defined dictionary of tokens
corpus = [dictionary.doc2bow(text) for text in all_patent_process]

In [48]:
all_patent_process[50]

['abnormal', 'position']

In [49]:
corpus[50]

[(1, 1), (2, 1)]

In [50]:
# define the gensim tfidf model on the corpus
tfidf = models.TfidfModel(corpus)

In [51]:
# apply a tfidf transformation to the entire corpus of documents
corpus_tfidf = tfidf[corpus]

## Compute a tfidf matrix

In [52]:
vocab = [dictionary[i] for i in range(len(dictionary))]

In [53]:
index = list(range(len(corpus)))

In [54]:
tfidf_matrix = pd.DataFrame(np.zeros((len(corpus), len(vocab)), dtype = np.float16), index = index, columns = vocab)

In [61]:
for idx in index:
    for id_val, freq in tfidf[corpus[idx]]:
        tfidf_matrix[dictionary[id_val]][idx] = freq

In [56]:
# tfidf_matrix.to_csv('/home/jupyter/uspto_analysis/tfidf_matrix.csv')

In [57]:
tfidf_matrix

Unnamed: 0,abatement,abnormal,position,absorbent,property,acan,accept,information,achieve,authentication,...,beamsplitter,union,influent,elevating,universal,chase,fixedly,creosote,bat,lumber
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72943,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Test the tfidf matrix on a few sample patents

In [58]:
anchor_tfidf = tfidf_matrix.iloc[0:36473,:]

In [59]:
combined_tfidf = tfidf_matrix.iloc[36473:,:]
combined_tfidf = combined_tfidf.reset_index(drop = True)

In [60]:
combined_tfidf

Unnamed: 0,abatement,abnormal,position,absorbent,property,acan,accept,information,achieve,authentication,...,beamsplitter,union,influent,elevating,universal,chase,fixedly,creosote,bat,lumber
0,0.337158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36468,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36469,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36470,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36471,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [61]:
anchor_list[0]

'abatement'

In [62]:
target_title_combined[0]

'abatement pollution furniture domestic article appliance coffee mill spice mill suction cleaner general'

In [63]:
anchor_test = anchor_tfidf.iloc[0,:]
combined_test = combined_tfidf.iloc[0,:]
anchor_testarray = anchor_test.to_numpy()
combined_testarray = combined_test.to_numpy()

In [64]:
anchor_testarray = anchor_testarray.reshape(1, -1)
combined_testarray = combined_testarray.reshape(1, -1)

In [65]:
test_cosine_sum = cosine_similarity(anchor_testarray, combined_testarray)
test_cosine_sum

array([[0.33717669]])

In [None]:
# Actual similarity score = 0.5
# Cosine similarity score = 0.3372

# Calculate cosine similarity between all patent anchors and combined target/titles

In [58]:
# split the large tfidf matrix into two separate matrices
    # anchor_tfidf = anchor matrix
    # combined_tfidf = target/title matrix
anchor_tfidf = tfidf_matrix.iloc[0:36473,:]
combined_tfidf = tfidf_matrix.iloc[36473:,:]
combined_tfidf = combined_tfidf.reset_index(drop = True)

In [59]:
# function to round output to nearest quarter decimal for submission
def quarter_round(cosine_list, round_val):
    cosine_round = []
    for num in cosine_list:
        num_rounded = round(num/round_val) * round_val
        cosine_round.append(num_rounded)
    return cosine_round

In [77]:
anchor_tfidf.iloc[1,:]

abatement    1.0
abnormal     0.0
position     0.0
absorbent    0.0
property     0.0
            ... 
chase        0.0
fixedly      0.0
creosote     0.0
bat          0.0
lumber       0.0
Name: 1, Length: 7694, dtype: float16

In [78]:
cosine_simlist = []
for i in range(len(anchor_list)):
    anchor_test = anchor_tfidf.iloc[i,:]
    combined_test = combined_tfidf.iloc[i,:]
    anchor_testarray = anchor_test.to_numpy()
    combined_testarray = combined_test.to_numpy()
    anchor_testarray = anchor_testarray.reshape(1, -1)
    combined_testarray = combined_testarray.reshape(1, -1)
    semantic_cosine = cosine_similarity(anchor_testarray, combined_testarray)
    cosine_simlist.append(semantic_cosine)

In [83]:
cosine_simlist[0:10]

[array([[0.33717669]]),
 array([[0.]]),
 array([[0.]]),
 array([[0.]]),
 array([[0.]]),
 array([[0.]]),
 array([[0.]]),
 array([[0.]]),
 array([[0.]]),
 array([[0.]])]

In [88]:
cosine_list = []
for i in range(len(cosine_simlist)):
    # test_list = np.array2string(cosine_simlist[i][0][0])
    value = cosine_simlist[i][0][0]
    cosine_list.append(value)

In [89]:
cosine_list[0:10]

[0.3371766931452356, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [91]:
max(cosine_list)

0.9256036652485101

In [64]:
# should not need this
# min(cosine_list) = 0
'''cosine_positive = []
for val in cosine_list:
    pos_val = val + 1
    cosine_positive.append(pos_val)'''

In [67]:
max(cosine_positive)

1.3371766931452356

In [75]:
# round each of the scores to the nearest quarter value
score_rounded = quarter_round(cosine_positive, 0.25)

array([[0.33717669]])

In [None]:
score_rounded[0:10]

In [79]:
patent_train

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.50
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.50
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.00
...,...,...,...,...,...
36468,8e1386cbefd7f245,wood article,wooden article,B44,1.00
36469,42d9e032d1cd3242,wood article,wooden box,B44,0.50
36470,208654ccb9e14fa3,wood article,wooden handle,B44,0.50
36471,756ec035e694722b,wood article,wooden material,B44,0.75


In [None]:
from google.cloud import storage
import os

client = storage.Client()
bucket = client.get_bucket('cliffm_uspto_kaggle_data')
bucket.blob('tfidf_matrix.parquet.gzip').upload_from_string(tfidf_matrix.to_parquet(), 'parquet')

In [None]:
tfidf_matrix = pd.read_csv('/home/jupyter/uspto_analysis/tfidf_matrix.csv')
tf_idf_matrix

In [76]:
corpus_tfidf[50]

[(1, 0.7990465392774274), (2, 0.6012691810402117)]

In [88]:
len(corpus_tfidf)

72946

In [89]:
all_patent_process[71]

['abnormal', 'position']

In [92]:
all_patent_process[36546]

['abnormal', 'placement', 'weaving']

## Testing the model on a few sample patents

In [102]:
all_patent_process[36544]

['abnormal', 'breathing', 'weaving']

In [103]:
# test cosine similarity
    # anchor 1 = abnormal position
    # target/title 1 = abnormal breathing/weaving
    # score 1 = 0
    # anchor 2 = abnormal position
    # target/title 2 = abnormal placement/weaving
    # score 2 = 0.75
train_anchor = corpus_tfidf[71]
train_combined = corpus_tfidf[36544]
train_anchor2 = corpus_tfidf[73]
train_combined2 = corpus_tfidf[36546]

In [104]:
train_anchor

[(1, 0.7990465392774274), (2, 0.6012691810402117)]

In [105]:
train_combined

[(1, 0.5027760784033785),
 (1096, 0.7413125030310371),
 (1097, 0.44460318019013145)]

In [106]:
cos_distance1 = cosine_similarity(train_anchor, train_combined)
cos_distance1

array([[0.97838527, 0.78165408, 0.78148496],
       [0.98493007, 0.95785337, 0.95777547]])

In [98]:
cos_distance2 = cosine_similarity(train_anchor2, train_combined2)
cos_distance2

array([[0.97703633, 0.78148093, 0.78165835],
       [0.98602106, 0.95777361, 0.95785534]])

# 

In [107]:
# define the gensim tfidf model on the corpus
tfidf2 = models.TfidfModel(corpus, smartirs = 'ntc')

In [None]:
# obtain the word ids and their frequencies in the tfidf model
for doc in tfidf2[corpus]:
    