# Word2Vec Model training

We are going to train our model on four data sources:
1. The nodes themselves
2. Recipe data (external)
3. GroceryDB
4. text8 corpus (Wikipedia dump)

In [1]:
# NodeSim Tool:
from NodeSim import NodeSim


# Word2Vec:
import gensim.downloader as api
import gensim
from gensim.models import Word2Vec
from timeit import default_timer as timer

# other
import json
import pandas as pd
from nltk.corpus import stopwords


## Read in Nodes

**Note:** for model training we are using the labels as they are, including the information in the parentheses. We will differentiate the information within the parentheses later, when we actually run labels through the model.

In [2]:
# Load the node table; only the raw label strings are needed downstream.
df = pd.read_csv("nodes.csv")
slim = df.loc[:, 'label'].tolist()

In [3]:
# process

# these terms were hand-picked after looking at some of the most frequent words..
termsToBeRemoved = ["(efsa foodex2)","(efsa foodex)","and similar","probably", "other","food","(us cfr)","(gs gpc)","products", "product","obsolete"]

# a single NodeSim instance is all that is needed for string processing
NS = NodeSim()

# time only the processing call itself (construction above is excluded)
start_time = timer()

labels = NS.processStrings(labels = slim, removeParenthesis = False, 
                           termsToBeRemoved = termsToBeRemoved, removeStopWords = True, 
                           locateBigrams = True, bigramMinCount = 5)

elapsed_time = timer() - start_time # in seconds

print('elapsed time : {}'.format(elapsed_time))

elapsed time : 0.460900458


#### Preview:

In [4]:
# number of processed labels, followed by a peek at the first 25 entries
print('size: {}'.format(len(labels)))
labels[:25]

size: 21675


Unnamed: 0,raw,clean
0,whole wheat crispbread,"[whole, wheat, crispbread]"
1,USDA SR sweets (1900),"[usda sr, sweets]"
2,gruenland cheese,"[gruenland, cheese]"
3,citron melon food product,"[citron, melon]"
4,blueflag plant,[blueflag]
5,CCFAC beverages; excluding dairy products,"[ccfac, beverages, excluding, dairy, s]"
6,habanero pepper plant,"[habanero, pepper]"
7,bullhead,[bullhead]
8,lisita (raw),"[lisita, raw]"
9,light cream,"[light, cream]"


## Read Recipe Data

In [5]:
# JSON text is UTF-8; state the encoding explicitly so the read does not
# depend on the platform's default locale encoding.
with open('../recipes.json', encoding='utf-8') as f:
    data = json.load(f)

# just need to read the json data into a list of words: each recipe becomes
# one "sentence" made of its cuisine followed by its ingredients.
# (the file is already closed at this point; parsing needs only `data`.)
recipes = [[r['cuisine'], *r['ingredients']] for r in data]

# this data is already clean, so no further processing is necessary.

#### Preview

In [6]:
# number of recipes, followed by the first recipe as a sample
print('size: {}'.format(len(recipes)))
recipes[0:1]

size: 39774


[['greek',
  'romaine lettuce',
  'black olives',
  'grape tomatoes',
  'garlic',
  'pepper',
  'purple onion',
  'seasoning',
  'garbanzo beans',
  'feta cheese crumbles']]

## Read Grocery DB

In [7]:
# keep the raw spreadsheet under its own name so that `grocery` only ever
# refers to the processed result produced at the end of this cell
grocery_raw = pd.read_excel("../GroceryDB.xlsx", engine='openpyxl')

# we are going to use the name and group of each listing.. so we are just going to concat 
# those strings and then run them through our processor.
# rows where either field is not a string (presumably empty cells) are skipped.
groceryClean = [
    name + " " + group
    for name, group in zip(grocery_raw['name'], grocery_raw['group'])
    if isinstance(name, str) and isinstance(group, str)
]

termsToBeRemoved = ['each', 'lb']

grocery = NS.processStrings(labels = groceryClean, removeParenthesis = False, 
                            removeStopWords = True, termsToBeRemoved = termsToBeRemoved,
                            locateBigrams = True, bigramMinCount = 5)


#### Preview:

In [8]:
# number of processed grocery listings, followed by the first ten
print('size: {}'.format(len(grocery)))
grocery[0:10]

size: 14823


Unnamed: 0,raw,clean
0,"Bananas, each Bananas & Plantains","[bananas, bananas, plantains]"
1,Fresh Red Seedless Grapes bag Grapes,"[fresh, red, seedless, grapes, bag, grapes]"
2,"Hass Avocados, each Avocados","[hass, avocados, avocados]"
3,"Lemons, each Citrus","[lemons, citrus]"
4,"Cantaloupe, each Melons","[cantaloupe, melons]"
5,"Limes, each Citrus","[limes, citrus]"
6,Pineapple Tropical & Exotic Fruit,"[pineapple, tropical exotic, fruit]"
7,"Fresh Strawberries, 1 lb Berries","[fresh, strawberries, berries]"
8,"Yellow Mangoes, each Tropical & Exotic Fruit","[yellow, mangoes, tropical exotic, fruit]"
9,"Fresh Mangoes, each Tropical & Exotic Fruit","[fresh, mangoes, tropical exotic, fruit]"


In [9]:
# spot-check: the cleaned token list stored under 'clean' for entry 5
grocery['clean'][5]

['limes', 'citrus']

## Read Text8 Corpus

Note: I had trouble running this on my new machine. Had to follow this StackOverflow Answer:
https://stackoverflow.com/questions/62861346/why-cant-i-download-a-dataset-with-the-gensim-download-api

In [10]:
# load the text8 corpus through gensim's downloader module (imported as `api`);
# this is the corpus the base model is trained on
text8 = api.load('text8')


# Model Training
First I will train a base model on text8, and then continue training it on the three other sources in turn: recipes, GroceryDB, and the node labels.


In [16]:
# Base model, fitted on the text8 corpus.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4.x appears to call it
# `vector_size` -- confirm against the installed gensim version before re-running.
model = Word2Vec(
    text8,
    size=100,
    window=5,
    min_count=2,
    sg=1,
    workers=3,
)

### Build On Recipes

In [17]:
# add the recipe tokens to the existing vocabulary, then continue training on them
model.build_vocab(recipes, update=True)
model.train(
    recipes,
    epochs=10,
    total_examples=len(recipes),
)

(3680169, 4680490)

### Build on GroceryDB

In [18]:
# add the GroceryDB tokens to the existing vocabulary, then continue training on them
grocery_sentences = grocery['clean']
model.build_vocab(grocery_sentences, update=True)
model.train(
    grocery_sentences,
    epochs=10,
    total_examples=len(grocery_sentences),
)

(967412, 1175170)

### Build On Labels

In [19]:
# add the node-label tokens to the existing vocabulary, then continue training on them
label_sentences = labels['clean']
model.build_vocab(label_sentences, update=True)
model.train(
    label_sentences,
    epochs=10,
    total_examples=len(label_sentences),
)

(497041, 570020)

In [20]:
# Saving is left disabled so that re-running the notebook does not overwrite an
# existing model file; uncomment the line below to write the trained model out.
# model.save("Final.Model")