# Model Application
1. Pre-process our labels.
2. Compare any two labels (with or without the text in parentheses considered separately).
3. Find all pairs of nodes whose similarity is above a certain threshold.

In [1]:
import pandas as pd
from NodeSim import NodeSim
from gensim.models import Word2Vec
from timeit import default_timer as timer
import seaborn as sns


## Read in Nodes and Process Labels
Here we will be considering the information inside and outside the parentheses separately.

In [2]:
# Load the node table from CSV, then pull the 'label' column out as a plain
# Python list so it can be handed to NodeSim.processStrings.
df = pd.read_csv("siren_network_nodes 121520.csv")
slim = df["label"].tolist()

## processStrings
**FUNCTION :** processStrings  
**INPUT:**  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **labels:** list of labels or strings to be processed for model comparisons. (Elements that are not of type string will be ignored.)  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **removeParenthesis:** Boolean; default = True. If True, parenthesized text will be removed and processed separately.  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **termsToBeRemoved:** list of strings which will be removed from the labels (default = [])  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **removeStopWords:** Boolean; default = True. If True, the given stop words will be removed from labels.  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **stopWords:** set of stop words to be removed (default is [nltk's English stop words](https://gist.github.com/sebleier/554280)).  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **locateBigrams:** Boolean; default = True. If True, will locate and concatenate bigrams with the given minimum count.  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **bigramMinCount:** int; minimum count for bigrams to be processed. (default = 5).  

**OUTPUT:** pandas dataframe containing raw input labels, cleaned labels, and parenthesis labels (if removeParenthesis = True)  
**DESCRIPTION:** this is a pre-processing step to get data ready for the model training and comparisons.  


In [3]:
# Noise terms stripped from every label before comparison. They were chosen by
# hand after inspecting the most frequent words in the labels.
# The order is kept exactly as written (e.g. "products" precedes "product").
termsToBeRemoved = [
    "(efsa foodex2)",
    "(efsa foodex)",
    "and similar",
    "probably",
    "other",
    "food",
    "plant",
    "raw",
    "(us cfr)",
    "(gs gpc)",
    "products",
    "product",
    "obsolete",
]

# A single NodeSim instance is reused by the cells that follow.
NS = NodeSim()

# Only the label list and the custom removal list are passed; every other
# processStrings option is left at its default.
labels = NS.processStrings(labels=slim, termsToBeRemoved=termsToBeRemoved)



In [4]:
# Preview the first rows of the processed labels (raw / clean / parenthesis columns).
labels.head(n=25)

Unnamed: 0,raw,clean,parenthesis
0,whole wheat crispbread,"[whole wheat, crispbread]",[]
1,USDA SR sweets (1900),"[usda sr, sweets]",[]
2,gruenland cheese,"[gruenland, cheese]",[]
3,citron melon food product,"[citron, melon]",[]
4,blueflag plant,[blueflag],[]
5,CCFAC beverages; excluding dairy products,"[ccfac, beverages, excluding, dairy]",[]
6,independent continuant,"[independent, continuant]",[]
7,habanero pepper plant,"[habanero, pepper]",[]
8,bullhead,[bullhead],[]
9,lisita (raw),[lisita],[]


# Now let's import our model and try using it.

In [5]:
# Load the saved Word2Vec model from "final.model"; it is passed to every NodeSim comparison call.
# NOTE(review): gensim's save/load format is pickle-based as far as I know — only load model files from a trusted source.
model = Word2Vec.load("final.model")

## Sim
**FUNCTION :** Sim  
**INPUT:**  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **model:** Gensim Word2Vec Model to be used for comparison      
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **termsA:** list of strings to be compared  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **termsB:** list of strings to be compared  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **considerRemoved:** Boolean, if True, the ratio of removed terms will be considered.(Default = True)  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **removedWeight:** float : 0.0 - 1.0 weight for how the removed terms should affect the total sim value   

**OUTPUT:** Similarity Value : 0.0 - 1.0  
**DESCRIPTION:** This function will take the given model and use it to compare the two given lists of terms. This is for comparing labels (without considering parenthesis separately). 




In [6]:
# Example: score two processed labels against each other with NS.Sim,
# passing only their cleaned term lists (parenthesis terms are not used here).
terms_a = labels.loc[11, "clean"]
terms_b = labels.loc[15, "clean"]
simVal = NS.Sim(model=model, termsA=terms_a, termsB=terms_b, considerRemoved=True)

# Show the raw labels next to the score so the result can be sanity-checked by eye.
message = "sim : '{}' vs '{}' = {}".format(labels.loc[11, "raw"], labels.loc[15, "raw"], simVal)
print(message)

sim : '33770 - sangria (efsa foodex2)' vs 'soup base flavored with beef extract' = 0.7669535279273987


## SimWithParenthesis
**FUNCTION :** SimWithParenthesis  
**INPUT:**  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **model:** Gensim Word2Vec Model to be used for comparison      
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **termsA_outsideP:** list of strings outside parenthesis from label A  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **termsA_insideP:** list of strings inside parenthesis from label A    
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **termsB_outsideP:** list of strings outside parenthesis from label B   
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **termsB_insideP:** list of strings inside parenthesis from label B  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **pWeight:** float 0.0 - 1.0 weight at which to apply to information within the parenthesis  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **considerRemoved:** Boolean, if True, the ratio of removed terms will be considered.(Default = True)  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **removedWeight:** float : 0.0 - 1.0 weight for how the removed terms should affect the total sim value   

**OUTPUT:** Similarity Value : float 0.0 - 1.0  
**DESCRIPTION:** this function will take the given model and use it to compare the two labels, treating the information inside the parenthesis separately and weighted. 





In [7]:
# Example: the same pair of labels (rows 11 and 15), this time passing the
# terms found inside the parenthesis separately, weighted by pWeight.
simVal2 = NS.SimWithParenthesis(
    model=model,
    termsA_outsideP=labels.loc[11, "clean"],
    termsA_insideP=labels.loc[11, "parenthesis"],
    termsB_outsideP=labels.loc[15, "clean"],
    termsB_insideP=labels.loc[15, "parenthesis"],
    pWeight=0.2,
    considerRemoved=True,
    removedWeight=1.0,
)


In [8]:
# Report the parenthesis-aware score alongside the two raw labels it compares.
print("sim : '{}' vs '{}' = {}".format(labels.loc[11, "raw"], labels.loc[15, "raw"], simVal2))

sim : '33770 - sangria (efsa foodex2)' vs 'soup base flavored with beef extract' = 0.7669535279273987


# SimAllNodes

In [9]:
# let's start with a sample.
# labels_sample = labels.sample(n=2000, random_state=13).reset_index()


In [10]:
# Tunable settings for the all-pairs run.
THRESHOLD = 0.999       # similarity cut-off handed to SimAllNodes
P_WEIGHT = 0.2          # weight applied to the terms inside the parenthesis
REMOVED_WEIGHT = 0.01   # weight of the removed-terms ratio

# Time the whole comparison: the recorded run of this cell took about 12,000 s (~3.3 h).
start_time = timer()
results = NS.SimAllNodes(
    model,
    labels["clean"].tolist(),
    labels["parenthesis"].tolist(),
    pWeight=P_WEIGHT,
    removedWeight=REMOVED_WEIGHT,
    includeLabels=False,
    includeThreshold=True,
    threshold=THRESHOLD,
)
# Alternative parameter set kept for reference:
#   pWeight = 0.3, removedWeight = 0.3, includeLabels = True (same threshold).
elapsed_time = timer() - start_time  # in seconds

print('elapsed time : {}'.format(elapsed_time))

elapsed time : 11998.635222417


In [11]:
# results.to_csv("sims_threshold_0999.csv")

In [12]:
# plot = sns.displot(results, x="simVal", bins=200).set(xlim=(-0.75,1.0), ylim=(0,160000))
# plot = sns.displot(results, x="simVal", bins=200)


# plot.fig.set_figwidth(10)
# plot.fig.set_figheight(10)
# plot



In [13]:
# results.sort_values("simVal", ascending = False).head(50)
# results.sort_values("simVal").head(50)

In [14]:
# Number of entries SimAllNodes returned for the THRESHOLD used in the run
# (presumably one per retained node pair — confirm against NodeSim.SimAllNodes).
len(results)

11416

In [15]:
# why is scallop 1.0 ...
# - oh, because I only compare if they're both non-empty... this should affect the score...
# but also they are very similar..
# this brings up the point of how we would handle specificity
# talk about plant...