# Word2Vec Model training

We are going to train our model on four data sources
1. The nodes themselves
2. recipe data (external)
3. groceryDB
4. text8 Corpus (Wikipedia dump)

In [1]:
# NodeSim Tool:
from NodeSim import NodeSim


# Word2Vec:
import gensim.downloader as api
import gensim
from gensim.models import Word2Vec
from timeit import default_timer as timer

# other
import json
import pandas as pd
from nltk.corpus import stopwords


## Read in Nodes

**Note:** for model training we are using the labels, including the information in the parentheses. We will differentiate the information within the parentheses when we actually run labels through the model.

In [2]:
# read in data
# nodes.csv is resolved relative to the notebook's working directory
df = pd.read_csv("nodes.csv")
# plain Python list of every value in the 'label' column, in file order
slim = df['label'].tolist()

In [13]:
# Clean the raw node labels with the project's NodeSim helper.
NS = NodeSim()

# Hand-picked boilerplate terms, chosen after looking at some of the most
# frequent words; passed to processStrings as `termsToBeRemoved`.
termsToBeRemoved = [
    "(efsa foodex2)",
    "(efsa foodex)",
    "and similar",
    "probably",
    "other",
    "food",
    "(us cfr)",
    "(gs gpc)",
    "products",
    "product",
    "obsolete",
]

# One option per line so each processing switch is easy to spot and toggle.
labels = NS.processStrings(
    labels=slim,
    removeParentheses=True,
    termsToBeRemoved=termsToBeRemoved,
    removeStopWords=True,
    locateBigrams=True,
    bigramMinCount=5,
)



#### Preview:

In [14]:
# report how many labels came out of processing, then display a slice of them
print('size:', len(labels))
labels[10:25]

size: 21675


Unnamed: 0,raw,clean,parentheses
10,33770 - sangria (efsa foodex2),[sangria],[]
11,coffee bean (whole or ground),"[coffee, bean]","[whole, ground]"
12,pudding sugar-free instant,"[pudding, sugar, free, instant]",[]
13,vegetable shortening,"[vegetable, shortening]",[]
14,soup base flavored with beef extract,"[soup, base, flavored, with, beef, extract]",[]
15,beef eye round (roasted),"[beef, eye, round]",[roasted]
16,laminate tube; unlined aluminum ends,"[laminate tube, unlined, aluminum, ends]",[]
17,43050 - macadamia flavour (efsa foodex2),"[macadamia, flavour]",[]
18,ice krill,"[ice, krill]",[]
19,neogastropod,[neogastropod],[]


## Read Recipe Data

In [5]:
# Load the external recipe data.
# JSON text is UTF-8, so the encoding is set explicitly instead of relying on the
# platform default (which would mis-decode non-ASCII ingredient names on some systems).
with open('../recipes.json', encoding='utf-8') as f:
    data = json.load(f)

# just need to read the json data into a list of words:
# each recipe becomes one list holding its cuisine followed by its ingredients.
recipes = [[r['cuisine'], *r['ingredients']] for r in data]

# this data is already clean, so no further processing is necessary.

#### Preview

In [6]:
# report how many recipes were loaded, then display the first one
print('size:', len(recipes))
recipes[:1]

size: 39774


[['greek',
  'romaine lettuce',
  'black olives',
  'grape tomatoes',
  'garlic',
  'pepper',
  'purple onion',
  'seasoning',
  'garbanzo beans',
  'feta cheese crumbles']]

## Read Grocery DB

In [7]:
# Load the GroceryDB spreadsheet. The raw frame keeps its own name so that the
# processed result assigned to `grocery` further down does not overwrite it.
groceryRaw = pd.read_excel("../GroceryDB.xlsx", engine='openpyxl')

# we are going to use the name and group of each listing, so we concat those two
# strings per row and then run them through our processor.
# Rows where either field is not a string (for example a missing cell, which
# pandas reads as NaN) are skipped.
groceryClean = [
    name + " " + group
    for name, group in zip(groceryRaw['name'], groceryRaw['group'])
    if isinstance(name, str) and isinstance(group, str)
]

# unit words that carry no meaning for the embedding
termsToBeRemoved = ['each', 'lb']

grocery = NS.processStrings(labels = groceryClean, removeParentheses = False, 
                            removeStopWords = True, termsToBeRemoved = termsToBeRemoved,
                            locateBigrams = True, bigramMinCount = 5)


#### Preview:

In [8]:
# report how many grocery entries came out of processing, then display the first ten
print('size:', len(grocery))
grocery[:10]

size: 14823


Unnamed: 0,raw,clean
0,"Bananas, each Bananas & Plantains","[bananas, bananas, plantains]"
1,Fresh Red Seedless Grapes bag Grapes,"[fresh, red, seedless, grapes, bag, grapes]"
2,"Hass Avocados, each Avocados","[hass, avocados, avocados]"
3,"Lemons, each Citrus","[lemons, citrus]"
4,"Cantaloupe, each Melons","[cantaloupe, melons]"
5,"Limes, each Citrus","[limes, citrus]"
6,Pineapple Tropical & Exotic Fruit,"[pineapple, tropical exotic, fruit]"
7,"Fresh Strawberries, 1 lb Berries","[fresh, strawberries, berries]"
8,"Yellow Mangoes, each Tropical & Exotic Fruit","[yellow, mangoes, tropical exotic, fruit]"
9,"Fresh Mangoes, each Tropical & Exotic Fruit","[fresh, mangoes, tropical exotic, fruit]"


In [9]:
# spot-check the cleaned tokens of a single grocery entry (label 5 of the 'clean' column)
grocery['clean'][5]

['limes', 'citrus']

# Model Training


**FUNCTION :** trainModel  

**INPUT:**  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **includeText8:** (boolean), if True, text8 corpus (wikipedia word dump) will be used in model training. [default = True]  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; - it is recommended to use the text8 corpus unless you have a large enough corpus to successfully train the model.  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; - More on text8 data : http://mattmahoney.net/dc/textdata  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **corpora:** (list), list of documents on which the model will be trained.  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; - Corpora should be a list of documents; each document is a list of sentences, and each sentence is a list of words.  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; - An example input of a single document might look like : [[['the', 'cat', 'in', 'the', 'hat'],['the', 'grinch']]]  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; - These documents should be pre-processed.  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **epochsForTraining:** (int), number of epochs desired for model training.  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **word2VecArgs:** (dictionary), dictionary of Word2Vec model arguments; please see the Word2Vec documentation for argument details: https://radimrehurek.com/gensim/models/word2vec.html  

**OUTPUT:** gensim Word2Vec Model   

**DESCRIPTION:** This function allows for basic training of a gensim Word2Vec model. Please see the gensim documentation for more details on
    model training: [https://radimrehurek.com/gensim/models/word2vec.html]





In [10]:
# Arguments described in the notes above as gensim Word2Vec model arguments.
# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size` — confirm against the
# installed gensim version and how NS.trainModel forwards these arguments.
word2VecArgs = dict(
    min_count=2,
    size=100,
    workers=3,
    window=5,
    sg=1,
)

# Everything trainModel needs, gathered in one mapping and unpacked into the call.
parameters = dict(
    includeText8=True,
    # one entry per data source: node labels, recipes, grocery listings
    corpora=[
        labels['clean'].tolist(),
        recipes,
        grocery['clean'].tolist(),
    ],
    epochsForTraining=10,
    word2VecArgs=word2VecArgs,
)

model = NS.trainModel(**parameters)