# Imports

In [1]:
# Basics
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re
import string
import csv
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS

# MongoDB
from pymongo import MongoClient

# natural language processing
import nltk
from textblob import TextBlob
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.corpus import stopwords
from ingreedypy import Ingreedy

# sklearn
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import scale
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

from sklearn import metrics
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score

# gensim
from gensim import corpora, models, similarities, matutils

# Other
import warnings
warnings.filterwarnings('ignore')


# Data from 02_cbow_process

In [2]:
# Load the recipe DataFrame saved as a pickle (per the section heading it
# comes from the 02_cbow_process step).
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# you created yourself or otherwise trust.
recipes_pickle_path = 'df_02.pkl'
df = pd.read_pickle(recipes_pickle_path)

In [3]:
# Preview the first five recipes to confirm the tokenised columns loaded.
df.head(n=5)

Unnamed: 0,title,ingredients,directions,categories,desc
0,"[lentil, apple, turkey, wrap]","[[cups, low, sodium, vegetable, or, chicken, s...","[[place, stock, lentils, celery, carrot, thyme...","[sandwich, bean, fruit, tomato, turkey, vegeta...",[]
1,"[boudin, blanc, terrine, red, onion, confit, b...","[[cups, whipping, cream, whipping_cream], [med...","[[combine, first, ingredients, heavy, medium, ...","[onion, pork, bake, port, winter, chill]","[uses, ingredients, found, boudin, blanc, clas..."
2,"[potato, fennel, soup, hodge, fennel_soup]","[[fennel, bulb, sometimes, called, anise, stal...","[[large, heavy, saucepan, cook, diced, fennel,...","[dairy, potato, vegetable, fennel, gourmet]",[]
3,"[mahi, mahi, tomato, olive, sauce, mahi_mahi]","[[tablespoons, extra, virgin, olive, oil, oliv...","[[heat, oil, heavy, large, skillet, medium, hi...","[fish, olive, tomato, sauté, dinner, healthy, ...","[style, tomato, sauce, tons, mediterranean, fl..."
4,"[spinach, noodle, casserole, noodle_casserole]","[[ounce, package, frozen, spinach, soufflé, th...","[[preheat, oven, lightly, grease, inch, glass,...","[cheese, dairy, pasta, vegetable, side, bake, ...",[]


# Word2Vec Google Model

In [4]:
import os

import gensim

# Location of the pretrained word2vec binary.
# Set the GOOGLE_NEWS_VECTORS environment variable to point at your own copy
# of the file; when it is unset, the original hard-coded download path is used
# so existing runs keep working unchanged.
google_vec_file = os.environ.get(
    'GOOGLE_NEWS_VECTORS',
    '/Users/carliebadder/Downloads/GoogleNews-vectors-negative300.bin',
)

In [5]:
# Read the pretrained vectors stored in binary word2vec format at
# `google_vec_file` into a gensim KeyedVectors object.
model = gensim.models.KeyedVectors.load_word2vec_format(
    google_vec_file,
    binary=True,
)

In [6]:
def document_vector(word2vec_model, doc):
    """Return the mean embedding of the in-vocabulary words of ``doc``.

    Parameters
    ----------
    word2vec_model
        Word-vector lookup object. It must support ``word in word2vec_model``
        and indexing with a list of words (``word2vec_model[words]``) that
        yields one row per word. This notebook passes a gensim
        ``KeyedVectors`` instance.
    doc : iterable of str
        The tokens of one document.

    Returns
    -------
    numpy.ndarray
        Element-wise mean (``axis=0``) of the vectors of the tokens that the
        model knows.

    Raises
    ------
    ValueError
        If none of the tokens are in the model vocabulary, because there is
        nothing to average.
    """
    # Filter against the model that was passed in, not a notebook-level global,
    # so the function works with any model and on a fresh kernel.
    known_words = [word for word in doc if word in word2vec_model]
    if not known_words:
        raise ValueError('document has no in-vocabulary words')
    return np.mean(word2vec_model[known_words], axis=0)

## Create Document Vectors

In [7]:
# Embed every ingredient line of every recipe.
# Each ingredient line (a list of tokens) becomes one vector via
# document_vector; the per-recipe mean is taken in a later cell.
ingredient_vectors = []  # one list of per-ingredient vectors per recipe
# Iterate over the column values directly so the result does not depend on
# the DataFrame having a default 0..n-1 index.
for ing_list in df.ingredients:
    ing_vec_temp = []
    for ingredient_tokens in ing_list:
        try:
            ing_vec_temp.append(document_vector(model, ingredient_tokens))
        except ValueError:
            # Raised when none of the tokens are in the model vocabulary
            # (nothing to average); such ingredient lines are skipped.
            # Any other exception is a real problem and is allowed to surface.
            continue
    ingredient_vectors.append(ing_vec_temp)
    

In [8]:
# Collapse each recipe's per-ingredient vectors into a single mean vector.
ingredient_list_vectors = []
for recipe_ing_vecs in ingredient_vectors:
    if len(recipe_ing_vecs) > 0:
        ing_mean = np.mean(np.vstack(recipe_ing_vecs), axis=0)
    else:
        # No ingredient line of this recipe could be embedded; keep a NaN
        # placeholder so the list stays aligned with the recipes.
        ing_mean = float('NaN')
    ingredient_list_vectors.append(ing_mean)

In [9]:
# Embed every direction step of every recipe.
# Each step (a list of tokens) becomes one vector via document_vector;
# the per-recipe mean is taken in a later cell.
direction_vectors = []  # one list of per-step vectors per recipe
# Iterate over the column values directly so the result does not depend on
# the DataFrame having a default 0..n-1 index.
for dir_list in df.directions:
    dir_vec_temp = []
    for step_tokens in dir_list:
        try:
            dir_vec_temp.append(document_vector(model, step_tokens))
        except ValueError:
            # Raised when none of the tokens are in the model vocabulary
            # (nothing to average); such steps are skipped.
            # Any other exception is a real problem and is allowed to surface.
            continue
    direction_vectors.append(dir_vec_temp)

In [10]:
# Collapse each recipe's per-step direction vectors into a single mean vector.
direction_list_vectors = []
for recipe_dir_vecs in direction_vectors:
    if len(recipe_dir_vecs) > 0:
        dir_mean = np.mean(np.vstack(recipe_dir_vecs), axis=0)
    else:
        # No direction step of this recipe could be embedded; keep a NaN
        # placeholder so the list stays aligned with the recipes.
        dir_mean = float('NaN')
    direction_list_vectors.append(dir_mean)

In [11]:
# Embed each recipe title (a list of tokens) as one vector.
title_vectors = []
# Iterate over the column values directly so the result does not depend on
# the DataFrame having a default 0..n-1 index.
for title_tokens in df.title:
    try:
        title_vec = document_vector(model, title_tokens)
    except ValueError:
        # Raised when none of the title tokens are in the model vocabulary;
        # keep a NaN placeholder so the list stays aligned with the recipes.
        # Any other exception is a real problem and is allowed to surface.
        title_vec = float('NaN')
    title_vectors.append(title_vec)


In [12]:
# Recipe vector = mean of the title, direction-list and ingredient-list
# vectors, ignoring whichever of the three are NaN placeholders.
recipe_vectors = []
for title_vec, dir_vec, ing_vec in zip(
    title_vectors, direction_list_vectors, ingredient_list_vectors
):
    doc_temp = [
        vec for vec in (title_vec, dir_vec, ing_vec)
        if not np.isnan(vec).all()
    ]
    if doc_temp:
        recipe_vec = np.mean(np.vstack(doc_temp), axis=0)
    else:
        # None of the three parts could be embedded. np.vstack([]) would
        # raise here, so store the same NaN placeholder used for the parts.
        recipe_vec = float('NaN')
    recipe_vectors.append(recipe_vec)


In [13]:
# Sanity check: shape of the first recipe vector.
recipe_vectors[0].shape

(300,)

# df_03: Assemble and Save the Vector DataFrame

In [14]:
# Start from an empty frame that only fixes the column names and their order;
# the columns are filled in a later cell.
df_03_columns = [
    'title',
    'ingredient_ind',
    'ingredient_avg',
    'directions_ind',
    'directions_avg',
    'recipe_vector',
]
df_03 = pd.DataFrame(columns=df_03_columns)

In [15]:
# Fill each df_03 column from the matching per-recipe list, in column order.
for column_name, column_values in (
    ('title', title_vectors),
    ('ingredient_ind', ingredient_vectors),
    ('ingredient_avg', ingredient_list_vectors),
    ('directions_ind', direction_vectors),
    ('directions_avg', direction_list_vectors),
    ('recipe_vector', recipe_vectors),
):
    df_03[column_name] = column_values

In [16]:
# Preview the first five rows of the assembled vector frame.
df_03.head(n=5)

Unnamed: 0,title,ingredient_ind,ingredient_avg,directions_ind,directions_avg,recipe_vector
0,"[-0.0663452, 0.0281372, -0.101227, 0.201782, -...","[[-0.102923, -0.0281982, 0.0750384, 0.196359, ...","[-0.106555, 0.0587182, 0.0666594, 0.13566, -0....","[[-0.0359828, 0.140685, 0.0033493, 0.0847643, ...","[-0.0537251, 0.098973, 0.0104921, 0.0944859, -...","[-0.0755416, 0.0619428, -0.00802511, 0.143976,..."
1,"[-0.1325, -0.0630264, 0.0566177, 0.227844, 0.0...","[[-0.161743, 0.211548, 0.183624, -0.0588074, 0...","[-0.0981199, 0.0860157, 0.0853733, 0.08912, 0....","[[0.00524581, 0.15945, 0.000435277, 0.0132478,...","[-0.046243, 0.123296, 0.0316817, 0.0581252, -0...","[-0.0922875, 0.0487617, 0.0578909, 0.12503, 0...."
2,"[-0.162451, 0.156934, 0.0261841, 0.230518, 0.0...","[[0.0104607, 0.0896768, 0.0837606, 0.12384, -0...","[-0.155297, 0.11129, 0.0650542, 0.141659, 0.02...","[[-0.0479048, 0.177701, 0.0575639, 0.0630222, ...","[0.00466088, 0.150056, 0.064866, 0.11978, -0.0...","[-0.104362, 0.139427, 0.0520347, 0.163986, 0.0..."
3,"[-0.298177, 0.0159505, 0.0141602, 0.207845, 0....","[[0.00515111, 0.0625407, -0.0951742, 0.0750326...","[-0.13089, 0.0155496, 0.0651881, 0.107567, -0....","[[-0.0551779, 0.102122, 0.0181274, 0.046172, -...","[-0.061394, 0.108771, 0.0319948, 0.0565655, -0...","[-0.163487, 0.0467572, 0.0371144, 0.123992, 0...."
4,"[-0.16748, 0.1474, 0.167816, 0.279053, -0.0443...","[[-0.133799, 0.0292562, 0.137736, 0.265137, -0...","[-0.131535, 0.0166664, 0.110015, 0.164179, 0.0...","[[-0.128431, 0.0915348, 0.0722405, 0.146545, 0...","[-0.128431, 0.0915348, 0.0722405, 0.146545, 0....","[-0.142482, 0.0852004, 0.116691, 0.196592, 0.0..."


In [17]:
# Persist the vector frame to disk as a pickle file.
df_03_pickle_path = 'df_03.pkl'
df_03.to_pickle(df_03_pickle_path)