# Model Application
1. Pre-process our labels.
2. Compare any two labels (with or without the parentheses considered separately).
3. Find all pairs of nodes whose similarity is above a certain threshold.

In [1]:
import pandas as pd
from NodeSim import NodeSim
from gensim.models import Word2Vec
from timeit import default_timer as timer
import seaborn as sns

## Read in Nodes and Process Labels
Here we will be considering the information inside and outside the parentheses separately.

In [2]:
# Load the node table and keep only the raw label values for processing.
df = pd.read_csv("siren_network_nodes 121520.csv")
slim = df.loc[:, 'label'].tolist()

## processStrings
**FUNCTION :** processStrings  

**INPUT:**  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **labels:** (list), list of labels or strings to be processed for comparisons. [ Required Param ]   
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; - If element of list is not of type = string it will be ignored.  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **removeParentheses:** (boolean), if True, text inside parentheses will be removed and processed separately. [ default = False ]  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **termsToBeRemoved:** (list), list of predetermined words to be removed. [default = [] ]  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **removeStopWords:** (boolean), if True given set of stopwords will be removed. [ default = True ]  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **stopWords:** (set), set of stopwords to be removed. [ default = [nltk's English stop words](https://gist.github.com/sebleier/554280) ]  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **locateBigrams:** (boolean), if True, will locate and concatenate bigrams that meet the given minimum count. [ default = False ]  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **bigramMinCount:** (int), minimum number of times a bigram must appear to be processed as a bigram (combined). [ default = 5 ]  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **Ex:** if adjacent terms ['artificial', 'sweetener'] are found > 5 times, all instances will be concatenated to ['artificial sweetener']  

**OUTPUT:** pandas dataframe containing raw input labels, cleaned labels, and parentheses labels (if removeParentheses = True)  

**DESCRIPTION:** this is a pre-processing step to get data ready for the model training and comparisons.  


In [3]:
# Hand-picked removal terms, chosen after inspecting some of the most frequent words.
termsToBeRemoved = [
    "(efsa foodex2)",
    "(efsa foodex)",
    "and similar",
    "probably",
    "other",
    "food",
    "plant",
    "raw",
    "(us cfr)",
    "(gs gpc)",
    "products",
    "product",
    "obsolete",
]

NS = NodeSim()

# Pre-process the raw labels: strip the terms above and stop words, split out
# parenthetical text, and merge bigrams that occur at least `bigramMinCount` times.
labels = NS.processStrings(
    labels=slim,
    removeParentheses=True,
    termsToBeRemoved=termsToBeRemoved,
    removeStopWords=True,
    locateBigrams=True,
    bigramMinCount=5,
)



In [4]:
# Preview the processed labels: raw text next to the cleaned tokens and the
# tokens that were pulled out of parentheses.
labels.head(50)

Unnamed: 0,raw,clean,parentheses
0,whole wheat crispbread,"[whole wheat, crispbread]",[]
1,USDA SR sweets (1900),"[usda sr, sweets]",[]
2,gruenland cheese,"[gruenland, cheese]",[]
3,citron melon food product,"[citron, melon]",[]
4,blueflag plant,[blueflag],[]
5,CCFAC beverages; excluding dairy products,"[ccfac, beverages, excluding, dairy]",[]
6,independent continuant,"[independent, continuant]",[]
7,habanero pepper plant,"[habanero, pepper]",[]
8,bullhead,[bullhead],[]
9,lisita (raw),[lisita],[]


# Now let's import our model and try using it.

In [5]:
# Load the saved gensim Word2Vec model used for the label comparisons below.
model = Word2Vec.load("final.model")

## Sim
**FUNCTION :** sim  

**INPUT:**  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **model:** Gensim Word2Vec Model to be used for comparison      
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **considerParentheses:** (boolean) if True we will consider terms within parentheses separately. [default= False]     
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **termsA:** (list), list of strings for label_A; if considering parentheses, do not include words within parentheses.  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **termsB:** (list), list of strings for label_B; if considering parentheses, do not include words within parentheses.  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **termsA_insideP:**  (list), list of strings inside parentheses from label A. [default = [  ]]     
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **termsB_insideP:** (list), list of strings inside parentheses from label B. [default = [  ]]  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **pWeight:** (float)  0.0 - 1.0  weight to apply to the information within the parentheses. [default = 0.1]  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **considerRemoved:** (boolean), if True, the ratio of removed terms will be considered. [Default = True]  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **removedWeight:** (float) 0.0 - 1.0 weight for how the removed terms should affect the similarity value. [default = 0.1]  

**OUTPUT:** (float) 0.0 - 1.0 : similarity value 

**DESCRIPTION:** this function will take the given model and use it to compare the two labels.





In [6]:
# Example usage: score the labels at rows 13 and 22, passing the cleaned
# terms and the parenthetical terms of each label separately.
simVal2 = NS.sim(
    model=model,
    termsA=labels['clean'][13],
    termsA_insideP=labels['parentheses'][13],
    termsB=labels['clean'][22],
    termsB_insideP=labels['parentheses'][22],
    considerParentheses=True,
    pWeight=0.1,
    considerRemoved=True,
    removedWeight=0.1,
)


In [7]:
# Report the example similarity alongside the two raw labels it was computed for.
raw_label_a = labels['raw'][13]
raw_label_b = labels['raw'][22]
print("sim : '{}' vs '{}' = {}".format(raw_label_a, raw_label_b, simVal2))

sim : 'pudding sugar-free instant' vs 'fruit sherbet (artificially sweetened)' = 0.7262362271547317


# SimAllNodes

**FUNCTION :** simAll  

**INPUT:**  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **model:** Gensim Word2Vec Model to be used for comparison      
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **labels:** (list), list of labels to be compared.      
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; - these should be pre-processed, each label should be a list of words.    
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **parenthesis:** (list), list of parenthetical data from labels to be compared. [default = []]       
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; - Each i'th element should correspond to the i'th label from labels.  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; - These should be pre-processed, each element of list should be a list of words.  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **considerParenthesis:** (boolean) if True we will consider terms within parentheses separately. [default= False]     
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **pWeight:** (float)  0.0 - 1.0  weight of which to apply to information within the parenthesis. [default = 0.1]  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **considerRemoved:** (boolean), if True, the ratio of removed terms will be considered. [Default = True]  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **removedWeight:** (float) 0.0 - 1.0 weight for how the removed terms should affect the similarity value. [default = 0.1]   
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **includeLabels:** (boolean) If True, resulting pandas dataframe will include the raw labels. [default= False]  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **includeThreshold:** (boolean) If True, only similarities greater than or equal to the given threshold will be added to the output.     
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **threshold:**  (float)  Only similarities greater or equal to the given threshold will be added to the output . [default = 0.9]    

**OUTPUT:** pandas dataFrame with all the compared values, of given labels.     

**DESCRIPTION:** this function will compare all the given labels and will output a pandas DF with calculated similarity values.  




In [19]:
# Start with a 2,000-row sample; the fixed random_state makes the draw repeatable.
labels_sample = (
    labels
    .sample(n=2000, random_state=13)
    .reset_index()
)


In [20]:
# Only pairs scoring at or above this similarity are kept in the output.
THRESHOLD = 0.99

# Time the all-pairs comparison over the sampled labels.
# NOTE(review): `considerParentheses` is not passed here; the parameter notes
# above list its default as False — confirm the parenthetical terms are used.
start_time = timer()
results = NS.simAll(
    model,
    labels_sample['clean'].tolist(),
    labels_sample['parentheses'].tolist(),
    pWeight=0.1,
    removedWeight=0.01,
    includeLabels=True,
    includeThreshold=True,
    threshold=THRESHOLD,
)
# Full-dataset variant kept for reference (note it uses a different method name and column name):
# results = NS.SimAllNodes(model, labels['clean'].tolist(), labels['parenthesis'].tolist(), pWeight = 0.1, removedWeight = 0.1, includeLabels = True, includeThreshold = True, threshold = THRESHOLD)
elapsed_time = timer() - start_time  # in seconds

print('elapsed time : {}'.format(elapsed_time))

elapsed time : 161.04571916700002


In [None]:
# results.to_csv("sims_threshold_0999.csv")

In [None]:
# plot = sns.displot(results, x="simVal", bins=200).set(xlim=(-0.75,1.0), ylim=(0,160000))
# plot = sns.displot(results, x="simVal", bins=200)


# plot.fig.set_figwidth(10)
# plot.fig.set_figheight(10)
# plot



In [None]:
# results.sort_values("simVal", ascending = False).head(50)
# results.sort_values("simVal").head(50)

In [18]:
# Preview the first pairs that met the similarity threshold.
# NOTE(review): this cell's execution count (18) is lower than that of the
# cell that builds `results` (20), so the displayed output may be stale —
# re-run after the simAll cell.
results.head(25)

Unnamed: 0,label_A,index_A,label_B,index_B,simVal
0,milkfish family (),6,bluefish family (),478,0.990112
1,chilipepper fish (),56,fish (),69,0.995179
2,chilipepper fish (),56,fish cypriniform (),337,0.992691
3,chilipepper fish (),56,fish perciform (),347,0.992691
4,chilipepper fish (),56,fish (hot process smoked),369,0.990203
5,chilipepper fish (),56,fish lamniform (),372,0.992691
6,chilipepper fish (),56,fish (portion cut),388,0.991198
7,fish (),69,fish cypriniform (),337,0.996667
8,fish (),69,fish perciform (),347,0.996667
9,fish (),69,fish (hot process smoked),369,0.994


In [None]:
# why is scallop 1.0 ...
# - oh, because I only compare if they're both non-empty... this should affect the score...
# but also they are very similar..
# this brings up the question of how we would handle specificity
# talk about plant...