# Imports

In [7]:
# Basics
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re
import string
import csv
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS
import pickle as pkl

# MongoDB
from pymongo import MongoClient

# natural language processing
import nltk
from textblob import TextBlob
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.corpus import stopwords
from ingreedypy import Ingreedy

# sklearn
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import scale
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

from sklearn import metrics
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score

# gensim
from gensim import corpora, models, similarities, matutils

# Other
import warnings
warnings.filterwarnings('ignore')


# Data from 02_cbow_process

In [77]:
# Load the recipe DataFrame from a local pickle
# (presumably the output of 02_cbow_process, per the section heading — TODO confirm).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle('df_07_2.pkl')

In [78]:
# Preview the first rows of the loaded frame (title / ingredients / directions / desc).
df.head()

Unnamed: 0,title,ingredients,directions,desc
0,"[lentil, apple, and, turkey, wrap]","[[low, sodium, vegetable, chicken, stock], [dr...","[[place, stock, lentils, celery_carrot, thyme_...",[]
1,"[boudin_blanc, terrine, with, red, onion_confit]","[[whipping_cream], [onions], [salt], [bay], [c...","[[combine, first, ingredients, heavy, medium_s...","[uses, ingredients, found, boudin_blanc, class..."
2,"[potato, and, fennel_soup, hodge]","[[fennel_bulb, anise, stalks, bulb, feathery, ...","[[large, heavy, saucepan, cook, diced, fennel,...",[]
3,"[mahi_mahi, in, tomato, olive, sauce]","[[extra, virgin, olive_oil], [onion], [dry, wh...","[[heat, oil, heavy, large, skillet, medium, hi...","[sicilian, style, tomato_sauce, tons, mediterr..."
4,"[spinach, noodle_casserole]","[[spinach, soufflé], [extra, wide, egg_noodles...","[[preheat_oven, lightly, grease, inch, glass, ...",[]


In [79]:
# Inspect the first recipe's ingredients: a list with one token list per ingredient line.
df.ingredients[0]

[['low', 'sodium', 'vegetable', 'chicken', 'stock'],
 ['dried', 'brown', 'lentils'],
 ['dried', 'french', 'green', 'lentils'],
 ['stalks_celery'],
 ['carrot'],
 ['thyme'],
 ['kosher_salt'],
 ['tomato'],
 ['fuji', 'apple'],
 ['squeezed', 'lemon_juice'],
 ['extra', 'virgin', 'olive_oil'],
 ['black', 'pepper', 'taste'],
 ['wheat', 'lavash', 'crosswise', 'flour_tortillas'],
 ['turkey_breast'],
 ['bibb_lettuce']]

# Word2Vec Custom Model

In [8]:
# NOTE(review): import placed mid-notebook; the top import cell only pulls
# submodules from gensim, not the package name used below.
import gensim

# Load the corpus that is later passed to Word2Vec as training sentences.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('epicurious_texts.pkl', 'rb') as f:
    epicurious_texts = pkl.load(f)

In [9]:
# Train the custom Word2Vec embedding on the pickled recipe corpus.
model = gensim.models.Word2Vec(
    epicurious_texts,
    size=200,     # dimensionality of each word vector
    window=5,     # context window size
    min_count=1,  # keep every token, including ones seen only once
    workers=2,    # training threads
    sg=1,         # 1 = skip-gram training algorithm
)

In [30]:
def document_vector(word2vec_model, doc):
    """Return the mean embedding of the in-vocabulary words of ``doc``.

    Parameters
    ----------
    word2vec_model
        A trained model exposing its word vectors as ``.wv`` (e.g. a gensim
        ``Word2Vec``), or the word-vector object itself. The vectors must
        support ``word in vectors`` and indexing with a list of words.
    doc : iterable of str
        Tokens of a single document.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the tokens found in the vocabulary.

    Raises
    ------
    ValueError
        If none of the tokens is in the vocabulary (nothing to average).
    """
    # Filter against the model that was passed in, not the notebook-global
    # `model`, so the vocabulary check and the lookup use the same vectors.
    vectors = getattr(word2vec_model, 'wv', word2vec_model)

    # Drop out-of-vocabulary words.
    known_words = [word for word in doc if word in vectors]
    if not known_words:
        # Fail explicitly instead of relying on the lookup to choke on [].
        raise ValueError('document contains no in-vocabulary words')

    return np.mean(vectors[known_words], axis=0)

In [48]:
# Unit-of-measure tokens (full names and abbreviations), grouped by unit.
# The concatenation order matches the original flat list exactly.
measurements = (
    ['cup', 'cups', 'C', 'c']
    + ['gram', 'grams', 'g']
    + ['kilogram', 'kilograms', 'kg']
    + ['liter', 'liters', 'L', 'l']
    + ['pound', 'pounds', 'lb']
    + ['milliliter', 'milliliters', 'ml', 'mL']
    + ['ounce', 'ounces', 'oz']
    + ['pint', 'pints', 'pt']
    + ['teaspoon', 'teaspoons', 't', 'tsp']
    + ['tablespoon', 'tablespoons', 'T', 'TB', 'Tbl', 'Tbsp', 'tbsp']
    + ['quart', 'quarts', 'qt']
    + ['dash', 'pinch']
    + ['piece', 'pieces', 'slice', 'slices', 'sheet', 'sheets']
    + ['log', 'stick', 'sticks']
)

## Create Document Vectors

In [71]:
# Embed every ingredient line of every recipe.
# Result: ingredient_vectors[k] is the list of vectors for recipe k's
# ingredient lines; lines that cannot be embedded are left out.
ingredient_vectors = []
# Iterate the column values directly so this does not depend on the
# DataFrame having a 0..n-1 integer index.
for ing_list in df.ingredients:  # each recipe's list of ingredient lines
    ing_vec_temp = []
    for ingredient_tokens in ing_list:  # tokens of one ingredient line
        try:
            ing_vec_temp.append(document_vector(model, ingredient_tokens))
        except Exception:
            # Best effort: skip lines that cannot be embedded (e.g. no
            # in-vocabulary tokens). `Exception` rather than a bare
            # `except` so KeyboardInterrupt can still stop the loop.
            continue
    ingredient_vectors.append(ing_vec_temp)
    

In [72]:
# Collapse each recipe's ingredient-line vectors into a single mean vector.
# Recipes with no usable ingredient vectors get NaN as a placeholder.
ingredient_list_vectors = []
for recipe_ing_vectors in ingredient_vectors:
    try:
        ing_mean = np.mean(np.vstack(recipe_ing_vectors), axis=0)
    except ValueError:
        # np.vstack raises ValueError when given an empty sequence
        # (no ingredient line of this recipe could be embedded).
        ing_mean = float('NaN')
    ingredient_list_vectors.append(ing_mean)

In [38]:
# Embed every direction step of every recipe.
# Result: direction_vectors[k] is the list of vectors for recipe k's
# direction steps; steps that cannot be embedded are left out.
direction_vectors = []
# Iterate the column values directly so this does not depend on the
# DataFrame having a 0..n-1 integer index.
for dir_list in df.directions:  # each recipe's list of direction steps
    dir_vec_temp = []
    for step_tokens in dir_list:  # tokens of one direction step
        try:
            dir_vec_temp.append(document_vector(model, step_tokens))
        except Exception:
            # Best effort: skip steps that cannot be embedded (e.g. no
            # in-vocabulary tokens). `Exception` rather than a bare
            # `except` so KeyboardInterrupt can still stop the loop.
            continue
    direction_vectors.append(dir_vec_temp)

In [39]:
# Collapse each recipe's direction-step vectors into a single mean vector.
# Recipes with no usable direction vectors get NaN as a placeholder.
direction_list_vectors = []
for recipe_dir_vectors in direction_vectors:
    try:
        dir_mean = np.mean(np.vstack(recipe_dir_vectors), axis=0)
    except ValueError:
        # np.vstack raises ValueError when given an empty sequence
        # (no direction step of this recipe could be embedded).
        dir_mean = float('NaN')
    direction_list_vectors.append(dir_mean)

In [40]:
# Embed each recipe title (a list of tokens) as one vector.
# Titles that cannot be embedded get NaN so the list stays aligned with df.
title_vectors = []
# Iterate the column values directly so this does not depend on the
# DataFrame having a 0..n-1 integer index.
for title_tokens in df.title:
    try:
        title_vectors.append(document_vector(model, title_tokens))
    except Exception:
        # `Exception` rather than a bare `except` so KeyboardInterrupt
        # can still stop the loop.
        title_vectors.append(float('NaN'))


In [41]:
# Recipe vector = mean of the available title, direction-list and
# ingredient-list vectors. Components that are NaN placeholders are ignored.
recipe_vectors = []
for i in range(len(df)):
    components = [title_vectors[i], direction_list_vectors[i], ingredient_list_vectors[i]]
    available = [vec for vec in components if not np.isnan(vec).all()]
    if available:
        recipe_vectors.append(np.mean(np.vstack(available), axis=0))
    else:
        # All three components are missing: store NaN (the same placeholder
        # the component lists use) instead of crashing in np.vstack([]).
        recipe_vectors.append(float('NaN'))


In [42]:
# Sanity check: shape of the first recipe vector.
recipe_vectors[0].shape

(200,)

# df_03: collect the vectors into a DataFrame and save it

In [73]:
# Empty frame that will hold the title, per-item, averaged and recipe vectors.
df_03 = pd.DataFrame(
    columns=[
        'title',
        'ingredient_ind',
        'ingredient_avg',
        'directions_ind',
        'directions_avg',
        'recipe_vector',
    ]
)

In [74]:
# Fill each column of df_03 from the matching list of vectors,
# in the same order as the columns were declared.
for _column, _values in (
    ('title', title_vectors),
    ('ingredient_ind', ingredient_vectors),
    ('ingredient_avg', ingredient_list_vectors),
    ('directions_ind', direction_vectors),
    ('directions_avg', direction_list_vectors),
    ('recipe_vector', recipe_vectors),
):
    df_03[_column] = _values

In [75]:
# Preview the first rows of the assembled vector frame.
df_03.head()

Unnamed: 0,title,ingredient_ind,ingredient_avg,directions_ind,directions_avg,recipe_vector
0,"[0.215455, -0.0678674, -0.0536728, -0.0384151,...","[[0.091462, 0.141202, 0.103957, 0.00484193, 0....","[0.323847, -0.113021, 0.0213143, -0.135363, 0....","[[0.224048, -0.101772, 0.178376, -0.0839364, -...","[0.257222, -0.0911494, 0.0382795, -0.0387829, ...","[0.296192, -0.0795992, -0.00273143, -0.0739488..."
1,"[0.318823, -0.100801, 0.0378541, -0.177314, -0...","[[-0.217296, 0.676479, -0.406482, 0.09348, 0.3...","[0.282012, -0.0117083, 0.0157072, -0.0991843, ...","[[0.287888, -0.112491, 0.0919362, -0.129807, -...","[0.253184, -0.0988187, 0.0498208, -0.0839749, ...","[0.319925, -0.0805943, 0.0112368, -0.122873, -..."
2,"[0.179084, 0.0596837, 0.118409, -0.00933529, -...","[[0.344291, -0.155895, -0.122205, -0.326594, 0...","[0.259661, 0.0358039, 0.179575, -0.0221815, 0....","[[0.254357, -0.0437917, 0.143034, -0.0979843, ...","[0.290386, -0.0688211, 0.100292, -0.182143, -0...","[0.265184, 0.00772217, 0.119431, -0.0645404, -..."
3,"[0.241015, -0.0255318, 0.0375358, -0.132623, -...","[[0.514625, 0.127052, 0.312318, 0.259462, 0.15...","[0.348406, -0.0545045, -0.00216654, -0.132842,...","[[0.218503, 0.0109341, 0.090625, -0.00890265, ...","[0.21065, -0.0409501, 0.0920256, -0.0774544, 0...","[0.284226, -0.0446817, 0.0150394, -0.0866968, ..."
4,"[0.141338, 0.127031, 0.0356138, -0.062714, -0....","[[0.202019, 0.187605, 0.0542731, -0.105495, -0...","[0.249164, 0.0869428, 0.0621178, -0.127689, -0...","[[0.295774, -0.106614, 0.135902, 0.013157, -0....","[0.295774, -0.106614, 0.135902, 0.013157, -0.0...","[0.263629, 0.00999229, 0.0272844, -0.0439009, ..."


In [76]:
# Persist the vector frame to a local pickle file.
df_03.to_pickle('df_07_2_vecs.pkl')