In [1]:
# !pip install torch
!pip install gensim



In [1]:
import numpy as np
import pandas as pd
import torch
from collections import Counter
import re
import time

import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim import corpora, models, similarities
from gensim.utils import simple_preprocess

import sklearn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

# text processing
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords

import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# --- Description of the data fields ---
# patent_train and patent_test
    # id - unique identifier for a pair of phrases
    # anchor - first phrase
    # target - second phrase
    # context - CPC classification which indicates the context which the similarity is to be scored
    # score - similarity between the two phrases
    
# patent_titles
    # code - hierarchical code used to categorize the patent; corresponds to the context field in patent_train and patent_test dataframe
    # title - description of the code field
    # section - first symbol in the title field; ranges from A - H and Y
    # class - 2 digit class
    # subclass - 1 letter code subclass
    # group - 1-3 digit group code value
    # main_group - 2+ sigit main or subgroup after the / symbol
    # EXAMPLE: patent_titles.loc[3,'code'] = 'A01B1/00'
        # title = 'Hand tools (edge trimmers for lawns A01G3/06  {; machines for working soil A01B35/00; making hand tools B21D})'
        # section = A
        # class = 1.0
        # subclass = B
        # group = 1.0
        # main_group = 00
        
# --- Description of the data fields ---

# Preprocessing of the patents

### Read in the data that has been uploaded to the GCP bucket

In [2]:
# read in the data from the folder
patent_train = pd.read_csv('/home/jupyter/uspto_analysis/train.csv')
patent_test = pd.read_csv('/home/jupyter/uspto_analysis/test.csv')
patent_titles = pd.read_csv('/home/jupyter/uspto_analysis/titles.csv')

### Join the training and testing datasets with the titles csv
titles.csv contains more information on the context of the patent

In [3]:
# joining the training dataset
patents_combined = patent_train.merge(patent_titles, how = 'left', left_on = 'context', right_on = 'code')
patents_combined = patents_combined[['id', 'anchor', 'target', 'context', 'title', 'score']]

In [4]:
# joining the testing dataset
testing_combined = patent_test.merge(patent_titles, how = 'left', left_on = 'context', right_on = 'code')
testing_combined = testing_combined[['id', 'anchor', 'target', 'context', 'title']]

In [5]:
# create a copy of the original dataframe and set the id as the index
text_processing_frame = patents_combined.copy()
text_processing_frame = text_processing_frame.set_index('id')
text_processing_frame.columns

Index(['anchor', 'target', 'context', 'title', 'score'], dtype='object')

## Processing the text of the patents

In [6]:
# convert all of the text fields to lowercase 
    # anchor, target, and code
text_processing_frame['anchor'] = text_processing_frame['anchor'].str.lower()
text_processing_frame['target'] = text_processing_frame['target'].str.lower()
text_processing_frame['title'] = text_processing_frame['title'].str.lower()

In [7]:
# remove all non-alphabetic characters from the anchor, target, and title fields
text_processing_frame['anchor'] = text_processing_frame.anchor.str.replace('\W+', ' ')
text_processing_frame['target'] = text_processing_frame.target.str.replace('\W+', ' ')
text_processing_frame['title_alpha'] = text_processing_frame.title.str.replace('\W+', ' ')

  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [8]:
# Lemmatization of the data fields
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in word_tokenize(text)]

In [9]:
# Applying the lemmatization function to the text datafields
text_processing_frame['lemmatized_anchor'] = text_processing_frame.anchor.apply(lemmatize_text)
text_processing_frame['lemmatized_target'] = text_processing_frame.target.apply(lemmatize_text)
text_processing_frame['lemmatized_title'] = text_processing_frame.title_alpha.apply(lemmatize_text)

In [10]:
# Define new dataframe with the the needed datafields
patent_select = pd.DataFrame(text_processing_frame[['lemmatized_anchor','lemmatized_target','lemmatized_title', 'context', 'score']])

In [11]:
patent_select

Unnamed: 0_level_0,lemmatized_anchor,lemmatized_target,lemmatized_title,context,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
37d61fd2272659b1,[abatement],"[abatement, of, pollution]","[furniture, domestic, article, or, appliance, ...",A47,0.50
7b9652b17b68b7a4,[abatement],"[act, of, abating]","[furniture, domestic, article, or, appliance, ...",A47,0.75
36d72442aefd8232,[abatement],"[active, catalyst]","[furniture, domestic, article, or, appliance, ...",A47,0.25
5296b0c19e1ce60e,[abatement],"[eliminating, process]","[furniture, domestic, article, or, appliance, ...",A47,0.50
54c1e3b9184cb5b6,[abatement],"[forest, region]","[furniture, domestic, article, or, appliance, ...",A47,0.00
...,...,...,...,...,...
8e1386cbefd7f245,"[wood, article]","[wooden, article]","[decorative, art]",B44,1.00
42d9e032d1cd3242,"[wood, article]","[wooden, box]","[decorative, art]",B44,0.50
208654ccb9e14fa3,"[wood, article]","[wooden, handle]","[decorative, art]",B44,0.50
756ec035e694722b,"[wood, article]","[wooden, material]","[decorative, art]",B44,0.75


### Process to create a word dictionary for further filtering out tokens

In [12]:
# create list of all tokens - lemmatized versions
anchor_list = list(text_processing_frame['lemmatized_anchor'])
target_list = list(text_processing_frame['lemmatized_target'])
title_list = list(text_processing_frame['lemmatized_title'])

In [13]:
combined_list = (anchor_list + target_list + title_list)

In [14]:
# total number of tokens = 386,751
# number of unique tokens
    # lemmatization = 8,031
all_words = []
for item in combined_list:
    for word in item:
        all_words.append(word)

In [15]:
len(all_words)

386751

In [16]:
# frequency of each individual token
# convert to a dictionary
token_count = FreqDist(all_words)
len(token_count)
token_list = list(token_count)
token_count_dict = dict(token_count)

sorted_dict = sorted(token_count_dict.items(), key = lambda x: x[1], reverse = True)

In [17]:
len(token_list)

8031

In [18]:
# create list of tokens that are to be removed
stop_words = list(stopwords.words('english'))
token_len_one = [w for w in token_list if len(w) == 1]
token_len_two = [w for w in token_list if len(w) == 2]
numeric_tokens = [num for num in token_list if any(c.isdigit() for c in num)]

In [19]:
# token_len_two

### EDA - Token Descriptions
- lengths for each unique token
- avg, max, min, sd of tokens
- number of consonants in each token
- percentiles of token lengths

In [20]:
# Create list of lengths for each of the tokens
token_length_list = []
for word in token_list:
    token_len = len(word)
    token_length_list.append(token_len)

In [21]:
# Average length of tokens
# max length of tokens
# standard deviation of tokens
avg_token = sum(map(len, token_list))/len(token_list)
avg_token
max_length = max(token_length_list)
max_length
token_sd = np.std(token_length_list)
token_sd

2.9313652766188376

In [22]:
# Number of consonants in tokens
vowels = ['a', 'e', 'i', 'o', 'u']
token_data = []
for token in token_list:
    token_len = len(token)
    consonants = 0
    for letter in token:
        if letter not in vowels:
            consonants = consonants + 1
    token_data.append((token, token_len, consonants))

In [23]:
# create a dataframe with metadata about each of the unique tokens
token_count_df = pd.DataFrame(token_count_dict.items(), columns = ['token', 'token_count'])
token_metadata = pd.DataFrame(token_data, columns = ['token', 'token_length', 'consonant_count'])
token_metadata = token_metadata.merge(token_count_df, left_on = 'token', right_on = 'token')
token_metadata['consonant_percentage'] = token_metadata['consonant_count']/token_metadata['token_length']

high_consonants = token_metadata[token_metadata['consonant_percentage'] > 0.9]
consonant_list = list(high_consonants['token'])

In [24]:
# check for any tokens containing numeric values
numeric_tokens = [num for num in token_list if any(c.isdigit() for c in num)]       

In [25]:
# Percentiles of token lengths
twentyfive_percentile = np.percentile(token_length_list, 25)
fifty_percentile = np.percentile(token_length_list, 50)
sevenfive_percentile = np.percentile(token_length_list, 75)
twentyfive_percentile
fifty_percentile
sevenfive_percentile

9.0

In [26]:
# Playing around with mean and standard deviation of the token lengths
# Testing if long tokens should be removed - NO
    # Many important long tokens
max_length = max(token_length_list)
dictionary_lengths = dict(zip(token_list, token_length_list))

sd_value = avg_token + (token_sd *2)
sd_value_three = avg_token + (token_sd *3)
sd_min = avg_token - (token_sd*2)
two_sd = [k for k,v in dictionary_lengths.items() if v >= sd_value]
three_sd = [k for k,v in dictionary_lengths.items() if v >= sd_value_three]

In [27]:
# Define a list of tokens to be removed
remove_list = stop_words + token_len_one + consonant_list + numeric_tokens

In [28]:
lemmatized_anchor = patent_select['lemmatized_anchor']
lemmatized_target = patent_select['lemmatized_target']
lemmatized_title = patent_select['lemmatized_title']

In [29]:
# funcion to apply the word dictionary to the anchor, target, and title data fields
def word_dictionary_apply(patent_datafield):
    filter_list = []
    for elements in patent_datafield:
        inner_filter = []
        for token in elements:
            if token not in remove_list:
                inner_filter.append(token)
        filter_list.append(inner_filter)
    return filter_list

In [30]:
# apply the word dictionary to the specified text fields
anchor_filter = word_dictionary_apply(lemmatized_anchor)
target_filter = word_dictionary_apply(lemmatized_target)
title_filter = word_dictionary_apply(lemmatized_title)

In [31]:
# append the text lists to the patent dataframe after the word dictionary has been applied
patent_select['anchor_dict'] = anchor_filter
patent_select['target_dict'] = target_filter
patent_select['title_dict'] = title_filter

In [32]:
# future modelling needs input as combined string instead of tokenized text
# function to combine the tokenized text fields into a non-tokenized version
# add each as a datafield in the dataframe
def list_to_string(datafield_list):
    string_list = []
    for phrase in datafield_list:
        new_string = ' '.join(phrase)
        string_list.append(new_string)
    return string_list

In [33]:
# apply the function to the tokenized text lists
anchor_list = list_to_string(anchor_filter)
target_list = list_to_string(target_filter)
title_list = list_to_string(title_filter)

In [34]:
# append back to the dataframe
patent_select['anchor_list'] = anchor_list
patent_select['target_list'] = target_list
patent_select['title_list'] = title_list

In [35]:
patent_select = patent_select.drop(columns = ['lemmatized_anchor','lemmatized_target','lemmatized_title'])

In [36]:
# create a new datafield - 'target_title_combined'
# concatenate the target and title datafields in order to calculate cosine similarity later on
patent_select['target_title_combined'] = patent_select['target_list'] + ' ' + patent_select['title_list']

In [37]:
patent_select['target_title_combined']

id
37d61fd2272659b1    abatement pollution furniture domestic article...
7b9652b17b68b7a4    act abating furniture domestic article applian...
36d72442aefd8232    active catalyst furniture domestic article app...
5296b0c19e1ce60e    eliminating process furniture domestic article...
54c1e3b9184cb5b6    forest region furniture domestic article appli...
                                          ...                        
8e1386cbefd7f245                        wooden article decorative art
42d9e032d1cd3242                            wooden box decorative art
208654ccb9e14fa3                         wooden handle decorative art
756ec035e694722b                       wooden material decorative art
8d135da0b55b8c88                      wooden substrate decorative art
Name: target_title_combined, Length: 36473, dtype: object

## Process for tokenization

In [38]:
anchor_list = list(patent_select['anchor_list'])
target_list = list(patent_select['target_list'])
title_list = list(patent_select['title_list'])
target_title_combined = list(patent_select['target_title_combined'])

In [39]:
anchor_tokenize = []
target_tokenize = []
title_tokenize = []
target_title_tokenize = []
for item in anchor_list:
    anchor_token = word_tokenize(item)
    anchor_tokenize.append(anchor_token)
for item in target_list:
    target_token = word_tokenize(item)
    target_tokenize.append(target_token)
for item in title_list:
    title_token = word_tokenize(item)
    title_tokenize.append(title_tokenize)
for item in target_title_combined:
    target_title_token = word_tokenize(item)
    target_title_tokenize.append(target_title_token)

In [40]:
'''patent_select['anchor_tokenize'] = anchor_tokenize
patent_select['target_tokenize'] = target_tokenize
patent_select['title_tokenize'] = title_tokenize
patent_select['target_title_tokenize'] = target_title_tokenize'''

"patent_select['anchor_tokenize'] = anchor_tokenize\npatent_select['target_tokenize'] = target_tokenize\npatent_select['title_tokenize'] = title_tokenize\npatent_select['target_title_tokenize'] = target_title_tokenize"

In [41]:
patent_select

Unnamed: 0_level_0,context,score,anchor_dict,target_dict,title_dict,anchor_list,target_list,title_list,target_title_combined
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
37d61fd2272659b1,A47,0.50,[abatement],"[abatement, pollution]","[furniture, domestic, article, appliance, coff...",abatement,abatement pollution,furniture domestic article appliance coffee mi...,abatement pollution furniture domestic article...
7b9652b17b68b7a4,A47,0.75,[abatement],"[act, abating]","[furniture, domestic, article, appliance, coff...",abatement,act abating,furniture domestic article appliance coffee mi...,act abating furniture domestic article applian...
36d72442aefd8232,A47,0.25,[abatement],"[active, catalyst]","[furniture, domestic, article, appliance, coff...",abatement,active catalyst,furniture domestic article appliance coffee mi...,active catalyst furniture domestic article app...
5296b0c19e1ce60e,A47,0.50,[abatement],"[eliminating, process]","[furniture, domestic, article, appliance, coff...",abatement,eliminating process,furniture domestic article appliance coffee mi...,eliminating process furniture domestic article...
54c1e3b9184cb5b6,A47,0.00,[abatement],"[forest, region]","[furniture, domestic, article, appliance, coff...",abatement,forest region,furniture domestic article appliance coffee mi...,forest region furniture domestic article appli...
...,...,...,...,...,...,...,...,...,...
8e1386cbefd7f245,B44,1.00,"[wood, article]","[wooden, article]","[decorative, art]",wood article,wooden article,decorative art,wooden article decorative art
42d9e032d1cd3242,B44,0.50,"[wood, article]","[wooden, box]","[decorative, art]",wood article,wooden box,decorative art,wooden box decorative art
208654ccb9e14fa3,B44,0.50,"[wood, article]","[wooden, handle]","[decorative, art]",wood article,wooden handle,decorative art,wooden handle decorative art
756ec035e694722b,B44,0.75,"[wood, article]","[wooden, material]","[decorative, art]",wood article,wooden material,decorative art,wooden material decorative art


In [42]:
anchor_tokenize[0]

['abatement']

# Implementation of the Gensim module

### Determining the hyperparameters

In [43]:
# number of topics - how many context labels exist in the dataset?
num_titles = len(pd.unique(patent_titles['section']))
num_titles

9

In [44]:
patent_titles

Unnamed: 0,code,title,section,class,subclass,group,main_group
0,A,HUMAN NECESSITIES,A,,,,
1,A01,AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...,A,1.0,,,
2,A01B,SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...,A,1.0,B,,
3,A01B1/00,Hand tools (edge trimmers for lawns A01G3/06 ...,A,1.0,B,1.0,0.0
4,A01B1/02,Spades; Shovels {(hand-operated dredgers E02F3...,A,1.0,B,1.0,2.0
...,...,...,...,...,...,...,...
260471,Y10T483/1864,including tool pot or adapter,Y,10.0,T,483.0,1864.0
260472,Y10T483/1873,Indexing matrix,Y,10.0,T,483.0,1873.0
260473,Y10T483/1882,Rotary disc,Y,10.0,T,483.0,1882.0
260474,Y10T483/1891,Chain or belt,Y,10.0,T,483.0,1891.0


In [45]:
all_patents = anchor_list + target_title_combined

In [46]:
all_patent_process = [simple_preprocess(item) for item in all_patents]

In [47]:
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(all_patent_process)]

In [67]:
len(tagged_data)

72946

In [87]:
tagged_data[0]

TaggedDocument(words=['abatement'], tags=[0])

In [68]:
# define the hyperparameters to be used for the doc2vec model
model = gensim.models.doc2vec.Doc2Vec(vector_size = 500, epochs = 100)

In [69]:
model.build_vocab(tagged_data)

In [50]:
print(f"Word 'abatement' appeared {model.wv.get_vecattr('abatement', 'count')} times in the training corpus.")

Word 'abatement' appeared 63 times in the training corpus.


In [70]:
# train the doc2vec model

start = time.time()
model.train(tagged_data, total_examples = model.corpus_count, epochs = model.epochs)
print('time (minutes) to train doc2vec: ', (time.time()-start)/60)

time (minutes) to train doc2vec:  5.220850129922231


## Testing the model on a few sample patents

In [135]:
# test cosine similarity
    # anchor 1 = abnormal position
    # target/title 1 = abnormal breathing/weaving
    # score 1 = 0
    # anchor 2 = abnormal position
    # target/title 2 = abnormal placemenet/weaving
    # score 2 = 0.75
train_anchor = anchor_list[71]
train_combined = target_title_combined[71]
train_anchor2 = anchor_list[73]
train_combined2 = target_title_combined[73]

In [134]:
train_combined

'abnormal breathing weaving'

In [136]:
anchor_vec1 = model.infer_vector(train_anchor.split())
combined_vec1 = model.infer_vector(train_anchor.split())
anchor_vec2 = model.infer_vector(train_anchor2.split())
combined_vec2 = model.infer_vector(train_combined2.split())

anchor_vec1 = anchor_vec1.reshape(1, -1)
combined_vec1 = combined_vec1.reshape(1, -1)
anchor_vec2 = anchor_vec2.reshape(1, -1)
combined_vec2 = combined_vec2.reshape(1, -1)

In [137]:
cos_distance1 = cosine_similarity(anchor_vec1, combined_vec1)
cos_distance1

array([[0.97753304]], dtype=float32)

In [138]:
cos_distance2 = cosine_similarity(anchor_vec2, combined_vec2)
cos_distance2

array([[0.42563236]], dtype=float32)

<gensim.models.doc2vec.Doc2Vec at 0x7fdcc5ed8f50>

# Score the Doc2Vec model on the training data

In [71]:
cosine_score_list = []
for i in range(len(patent_select)):
    patent_anchor = anchor_list[i]
    patent_combined = target_title_combined[i]
    patent_anchor_vec = model.infer_vector(patent_anchor.split())
    patent_anchor_vec = patent_anchor_vec.reshape(1, -1)
    patent_combined_vec = model.infer_vector(patent_combined.split())
    patent_combined_vec = patent_combined_vec.reshape(1, -1)
    cos_distance = cosine_similarity(patent_anchor_vec, patent_combined_vec)
    cosine_score_list.append(cos_distance)

In [72]:
cosine_list = []
for i in range(len(cosine_score_list)):
    test_list = np.array2string(cosine_score_list[i][0][0])
    cosine_list.append(test_list)

In [73]:
cosine_list = [float(i) for i in cosine_list]

In [74]:
max(cosine_list)

0.98755324

In [75]:
cosine_positive = []
for val in cosine_list:
    pos_val = val + 1
    cosine_positive.append(pos_val)

In [76]:
max(cosine_positive)

1.98755324

In [77]:
normalized_cosine = [cos_val/max(cosine_positive) for cos_val in cosine_positive]

In [78]:
max(normalized_cosine)

1.0

In [79]:
# function to round output to nearest quarter decimal for submission
def quarter_round(cosine_list, round_val):
    cosine_round = []
    for num in cosine_list:
        num_rounded = round(num/round_val) * round_val
        cosine_round.append(num_rounded)
    return cosine_round

In [80]:
cosine_rounded = quarter_round(normalized_cosine, 0.25)

In [81]:
training_eval = patent_select[['score']]
training_eval['model_score'] = cosine_rounded

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [82]:
training_eval['correct_results'] = np.where(training_eval['score'] == training_eval['model_score'], 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [83]:
training_eval['difference'] = abs(training_eval['model_score'] - training_eval['score'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [84]:
training_eval

Unnamed: 0_level_0,score,model_score,correct_results,difference
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
37d61fd2272659b1,0.50,0.75,0,0.25
7b9652b17b68b7a4,0.75,0.75,1,0.00
36d72442aefd8232,0.25,0.50,0,0.25
5296b0c19e1ce60e,0.50,0.75,0,0.25
54c1e3b9184cb5b6,0.00,0.75,0,0.75
...,...,...,...,...
8e1386cbefd7f245,1.00,0.75,0,0.25
42d9e032d1cd3242,0.50,0.75,0,0.25
208654ccb9e14fa3,0.50,0.75,0,0.25
756ec035e694722b,0.75,0.50,0,0.25


In [86]:
# percent predicted correctly
training_eval.sum(axis = 0)[2]/len(training_eval)

0.19650152167356674

In [71]:
training_eval.groupby('difference').count()

Unnamed: 0_level_0,score,model_score,correct_results
difference,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,7308,7308,7308
0.25,13113,13113,13113
0.5,10087,10087,10087
0.75,4947,4947,4947
1.0,1018,1018,1018


In [80]:
len(training_eval)

36473

# Improving Model Performance

### Testing out different hyperparameter values

In [None]:
# iterating through different combinations of vector_size and epoch values

In [None]:
tagged_data = tagged_data

In [None]:
vec_values = [200, 400, 500]
num_epochs = 40

In [None]:
start = time.time()

for values in vec_values:
    model = gensim.models.doc2vec.Doc2Vec(vector_size = values, epochs = num_epochs)
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples = model.corpus_count, epochs = model.epochs)
    cosine_score_list = []
    for i in range(len(patent_select)):
        patent_anchor = anchor_list[i]
        patent_combined = target_title_combined[i]
        patent_anchor_vec = model.infer_vector(patent_anchor.split())
        patent_anchor_vec = patent_anchor_vec.reshape(1, -1)
        patent_combined_vec = model.infer_vector(patent_combined.split())
        patent_combined_vec = patent_combined_vec.reshape(1, -1)
        cos_distance = cosine_similarity(patent_anchor_vec, patent_combined_vec)
        cosine_score_list.append(cos_distance)
print('time(minutes) to run models: ', , (time.time()-start)/60)

In [None]:
vec1 = cosine_score_list[0]
vec2 = cosine_score_list[1]
vec3 = cosine_score_list[2]

In [None]:
all_vecs = []
for vec in cosine_score_list:
    cosine_list = []
    for i in range(len(vec)):
        test_list = np.array2string(cosine_score_list[i][0][0])
        cosine_list.append(test_list)
    all_vecs.append(cosine_list)

In [None]:
float_cosine = []
for cosine_vals in all_vecs:
    cosine_list = [float(i) for i in cosine_vals]
    float_cosine.append(cosine_list)

In [None]:
float_cosine2 = []
for float_cosine1 in float_cosine:
    cosine_positive = []
    for val in cosine_list:
        pos_val = val + 1
        cosine_positive.append(pos_val)
    float_cosine2.append(cosine_positive)

In [None]:
cosine_normalized = []
for positive in float_cosine2:
    normalized_cosine = [cos_val/max(positive) for cos_val in positive]
    cosine_normalized.append(normalized_cosine)

In [None]:
rounded_cosine = []
for normalized_list in cosine_normalized:
    cosine_rounded = quarter_round(normalized_list, 0.25)
    rounded_cosine.append(cosine_rounded)

# Processing and Handling on the Testing Dataset

In [151]:
# create a copy of the original dataframe and set the id as the index
text_processing_frame = testing_combined.copy()
text_processing_frame = text_processing_frame.set_index('id')
text_processing_frame.columns

Index(['anchor', 'target', 'context', 'title'], dtype='object')

In [152]:
text_processing_frame.shape

(36, 4)

## Processing the text of the patents

In [153]:
# convert all of the text fields to lowercase 
    # anchor, target, and code
text_processing_frame['anchor'] = text_processing_frame['anchor'].str.lower()
text_processing_frame['target'] = text_processing_frame['target'].str.lower()
text_processing_frame['title'] = text_processing_frame['title'].str.lower()

In [154]:
# remove all non-alphabetic characters from the anchor, target, and title fields
text_processing_frame['anchor'] = text_processing_frame.anchor.str.replace('\W+', ' ')
text_processing_frame['target'] = text_processing_frame.target.str.replace('\W+', ' ')
text_processing_frame['title_alpha'] = text_processing_frame.title.str.replace('\W+', ' ')

  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [155]:
# Lemmatization of the data fields
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in word_tokenize(text)]

In [156]:
# Applying the lemmatization function to the text datafields
text_processing_frame['lemmatized_anchor'] = text_processing_frame.anchor.apply(lemmatize_text)
text_processing_frame['lemmatized_target'] = text_processing_frame.target.apply(lemmatize_text)
text_processing_frame['lemmatized_title'] = text_processing_frame.title_alpha.apply(lemmatize_text)

In [157]:
text_processing_frame.columns

Index(['anchor', 'target', 'context', 'title', 'title_alpha',
       'lemmatized_anchor', 'lemmatized_target', 'lemmatized_title'],
      dtype='object')

In [158]:
# Define new dataframe with the the needed datafields
patent_select = pd.DataFrame(text_processing_frame[['lemmatized_anchor','lemmatized_target','lemmatized_title', 'context']])

In [None]:
patent_select

In [160]:
lemmatized_anchor = patent_select['lemmatized_anchor']
lemmatized_target = patent_select['lemmatized_target']
lemmatized_title = patent_select['lemmatized_title']

In [161]:
# apply the word dictionary to the specified text fields
anchor_filter = word_dictionary_apply(lemmatized_anchor)
target_filter = word_dictionary_apply(lemmatized_target)
title_filter = word_dictionary_apply(lemmatized_title)

In [162]:
# append the text lists to the patent dataframe after the word dictionary has been applied
patent_select['anchor_dict'] = anchor_filter
patent_select['target_dict'] = target_filter
patent_select['title_dict'] = title_filter

In [163]:
# apply the function to the tokenized text lists
anchor_list = list_to_string(anchor_filter)
target_list = list_to_string(target_filter)
title_list = list_to_string(title_filter)

In [164]:
# append back to the dataframe
patent_select['anchor_list'] = anchor_list
patent_select['target_list'] = target_list
patent_select['title_list'] = title_list

In [165]:
patent_select = patent_select.drop(columns = ['lemmatized_anchor','lemmatized_target','lemmatized_title'])

In [166]:
# create a new datafield - 'target_title_combined'
# concatenate the target and title datafields in order to calculate cosine similarity later on
patent_select['target_title_combined'] = patent_select['target_list'] + ' ' + patent_select['title_list']

In [167]:
patent_select.columns

Index(['context', 'anchor_dict', 'target_dict', 'title_dict', 'anchor_list',
       'target_list', 'title_list', 'target_title_combined'],
      dtype='object')

In [168]:
anchor_list = list(patent_select['anchor_list'])
target_list = list(patent_select['target_list'])
title_list = list(patent_select['title_list'])
target_title_combined = list(patent_select['target_title_combined'])

In [172]:
cosine_score_list = []
for i in range(len(patent_select)):
    patent_anchor = anchor_list[i]
    patent_combined = target_title_combined[i]
    patent_anchor_vec = model.infer_vector(patent_anchor.split())
    patent_anchor_vec = patent_anchor_vec.reshape(1, -1)
    patent_combined_vec = model.infer_vector(patent_combined.split())
    patent_combined_vec = patent_combined_vec.reshape(1, -1)
    cos_distance = cosine_similarity(patent_anchor_vec, patent_combined_vec)
    cosine_score_list.append(cos_distance)

In [None]:
cosine_score_list

In [175]:
cosine_list = []
for i in range(len(cosine_score_list)):
    test_list = np.array2string(cosine_score_list[i][0][0])
    cosine_list.append(test_list)

In [176]:
cosine_list = [float(i) for i in cosine_list]

In [184]:
min(cosine_list)

-0.5176584

In [185]:
cosine_positive = []
for val in cosine_list:
    pos_val = val + 1
    cosine_positive.append(pos_val)

In [186]:
min(cosine_positive)

0.48234160000000004

In [187]:
normalized_cosine = [cos_val/max(cosine_positive) for cos_val in cosine_positive]

In [188]:
max(normalized_cosine)

1.0

In [189]:
cosine_rounded = quarter_round(normalized_cosine, 0.25)

In [None]:
cosine_rounded

# Put together file for submission

In [191]:
testing_ids = list(testing_combined['id'])
submission_df = pd.DataFrame(list(zip(testing_ids, cosine_rounded)), columns = ['id', 'score'])

In [130]:
submission_df.to_csv('/home/jupyter/uspto_analysis/submission.csv')

In [131]:
from google.cloud import storage
import os
import pandas as pd

In [132]:
# write pandas df to GCP 
client = storage.Client()
bucket = client.get_bucket('cliffm_uspto_kaggle_data')
    
bucket.blob('submission.csv').upload_from_string(submission_df.to_csv(), 'text/csv')

In [None]:
# Process for stemming the content of the patents
# Too aggressive for my liking
# Worse performance when calculating the cosine similarity between the patents later on

In [None]:
'''
stemmer = PorterStemmer()
def stemmer_text(text_field):
    return [stemmer.stem(w) for w in word_tokenize(text_field)]
'''

In [None]:
'''
text_processing_frame['stemmed_anchor'] = text_processing_frame.anchor.apply(stemmer_text)
text_processing_frame['stemmed_target'] = text_processing_frame.target.apply(stemmer_text)
text_processing_frame['stemmed_title'] = text_processing_frame.title_alpha.apply(stemmer_text)


patent_select = pd.DataFrame(text_processing_frame[['lemmatized_anchor', 'lemmatized_target', 'lemmatized_title', 'code', 'score']])
patent_select = pd.DataFrame(text_processing_frame[['stemmed_anchor', 'stemmed_target', 'stemmed_title', 'code', 'score']])
'''

In [None]:
'''# create list of all tokens - stemmed versions
anchor_list = list(text_processing_frame['stemmed_anchor'])
target_list = list(text_processing_frame['stemmed_target'])
title_list = list(text_processing_frame['stemmed_title'])'''

# Patent EDA

In [None]:
# --- Description of the data fields ---
# patent_train and patent_test
    # id - unique identifier for a pair of phrases
    # anchor - first phrase
    # target - second phrase
    # context - CPC classification which indicates the context which the similarity is to be scored
    # score - similarity between the two phrases
    
# patent_titles
    # code - hierarchical code used to categorize the patent; corresponds to the context field in patent_train and patent_test dataframe
    # title - description of the code field
    # section - first symbol in the title field; ranges from A - H and Y
    # class - 2 digit class
    # subclass - 1 letter code subclass
    # group - 1-3 digit group code value
    # main_group - 2+ sigit main or subgroup after the / symbol
    # EXAMPLE: patent_titles.loc[3,'code'] = 'A01B1/00'
        # title = 'Hand tools (edge trimmers for lawns A01G3/06  {; machines for working soil A01B35/00; making hand tools B21D})'
        # section = A
        # class = 1.0
        # subclass = B
        # group = 1.0
        # main_group = 00
        
# --- Description of the data fields ---

In [2]:
patent_train.head()
patent_test.head()
# patent_titles.head()
# patent_cpc.head()

patent_train.shape

# data fields
patent_train.columns
patent_titles.columns
# patent_cpc.columns

NameError: name 'patent_train' is not defined

In [3]:
# function to view the first 10 columns of the titles dataframe
def view_data(dataframe, *args):
    col_data_list = []
    for arg in args:
        col_data = dataframe.iloc[0:10, arg]
        col_header = dataframe.columns[arg]
        col_data_list.append((col_header, col_data))
    return col_data_list

patent_titles.shape
view_data(patent_titles,range(0,7))
patent_titles.iloc[0:10,0:6]

NameError: name 'patent_titles' is not defined