In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import zipfile
from collections import defaultdict
import ast
from gensim.models import Word2Vec
import math

In [2]:
# Folder containing the two raw CSV files read below.
# NOTE(review): this is an absolute, machine-specific path; adjust before running elsewhere.
data_dir = "C:/Users/samsung/UCSD/Fall22/CSE258/archive"
interactions_csv = f"{data_dir}/RAW_interactions.csv"
recipes_csv = f"{data_dir}/RAW_recipes.csv"
df_interactions = pd.read_csv(interactions_csv)
df_recipes = pd.read_csv(recipes_csv)
# Show the recipe table's dimensions and column names.
for recipe_summary in (df_recipes.shape, df_recipes.columns):
    print(recipe_summary)

(231637, 12)
Index(['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
       'nutrition', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients'],
      dtype='object')


In [3]:
# Names given to the comma-separated fields of the `nutrition` string, in order.
nutrition_cols = ['calories','total fat','sugar','sodium','protein','saturated fat','carbohydrates']

# One row per interaction, with the matching recipe's columns attached.
data = pd.merge(df_recipes, df_interactions, left_on='id', right_on='recipe_id')
# Split the nutrition string into one column per field.
data[nutrition_cols] = data.nutrition.str.split(",",expand=True)
# The first and last fields still carry the enclosing list brackets.
data['calories'] = data['calories'].map(lambda text: text.replace("[" ,""))
data['carbohydrates'] = data['carbohydrates'].map(lambda text: text.replace("]" ,""))
data[nutrition_cols] = data[nutrition_cols].astype(float)

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the merged interactions; the fixed seed keeps the split reproducible.
train, test = train_test_split(data, test_size=0.2, random_state=6752)
# Fold every held-out row except the last 2000 back into the training set.
# pd.concat appends rows. The previous `train + test[:-2000]` performed an
# index-aligned element-wise addition, which does not append anything and
# yields NaN wherever the two frames' indices do not overlap.
train = pd.concat([train, test[:-2000]])
# Evaluation rows: skip the first 10% of the held-out split, then keep at most the last 2000.
testData = test[int(len(test)*0.1):][-2000:]

### Ingredient Similarity (Word2Vec)

In [None]:
# Index the training interactions (training rows only).
userHistory = defaultdict(list)       # user_id -> recipe ids that user interacted with
allInteractions = defaultdict(float)  # (user_id, recipe_id) -> rating
# u, i, r stay module-level names, exactly as the row-by-row loop left them.
for u, i, r in zip(train['user_id'], train['id'], train['rating']):
    allInteractions[(u, i)] = r
    userHistory[u].append(i)

In [6]:
# Lookup tables keyed by recipe id.
recipeIngredients = defaultdict(list)  # recipe id -> list of ingredient names
recipeSteps = defaultdict(list)        # recipe id -> the row's raw `steps` value
for i, ingList, stepsList in zip(data['id'], data['ingredients'], data['steps']):
    # When the ingredients column still holds the stringified list read from
    # the CSV, parse it here. Otherwise consumers that iterate over
    # recipeIngredients[i] would walk the string character by character
    # instead of ingredient by ingredient. Already-parsed lists pass through.
    if isinstance(ingList, str):
        ingList = ast.literal_eval(ingList)
    recipeIngredients[i] = ingList
    recipeSteps[i] = stepsList

In [7]:
# Average rating over the training rows (accumulate, then divide by the row count).
mean_rating = 0
for rating in train['rating']:
    mean_rating += rating
mean_rating /= len(train)

print(mean_rating)

4.412119312104189


In [8]:
def MSE(predictions, labels):
    """Return the mean squared error between two sequences.

    Values are paired positionally with ``zip``, so any surplus entries in the
    longer sequence are ignored. Raises ``ZeroDivisionError`` when no pairs
    are produced.
    """
    squared_errors = []
    for predicted, actual in zip(predictions, labels):
        squared_errors.append((predicted - actual) ** 2)
    return sum(squared_errors) / len(squared_errors)

In [9]:
def Cosine(x1,x2):
    """Return the cosine similarity of two equal-length numeric sequences.

    Components are paired with ``zip``. Returns 0 when the product of the two
    squared norms is zero (i.e. at least one vector is all zeros or empty).
    """
    dot = 0
    sq_len1 = 0
    sq_len2 = 0
    for left, right in zip(x1, x2):
        dot += left * right
        sq_len1 += left ** 2
        sq_len2 += right ** 2
    scale = sq_len1 * sq_len2
    # Guard against dividing by zero for degenerate vectors.
    if not scale:
        return 0
    return dot / math.sqrt(scale)

In [10]:
# Turn each stringified ingredient list into a real Python list.
# Values that are already parsed are passed through untouched, so re-running
# this cell no longer fails by calling ast.literal_eval on a list.
data["ingredients"] = data.ingredients.apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)
data["ingredients"][0]

['winter squash',
 'mexican seasoning',
 'mixed spice',
 'honey',
 'butter',
 'olive oil',
 'salt']

In [11]:
# Train 10-dimensional skip-gram (sg=1) embeddings, treating every row's
# ingredient list as one "sentence"; min_count=500 is gensim's frequency
# cut-off for keeping a token in the vocabulary.
ingredient_sentences = data["ingredients"].tolist()
model = Word2Vec(
    sentences=ingredient_sentences,
    vector_size=10,
    min_count=500,
    sg=1,
)

In [12]:
# Sanity check on the embeddings: list the vocabulary entries most similar to "butter".
model.wv.similar_by_word("butter")

[('eggs', 0.9305537343025208),
 ('flour', 0.9250773787498474),
 ('milk', 0.9156212210655212),
 ('walnuts', 0.9009474515914917),
 ('nutella', 0.8821560740470886),
 ('self rising flour', 0.8730486035346985),
 ('pie crust', 0.8584103584289551),
 ('pecans', 0.8509156107902527),
 ('jam', 0.8476229906082153),
 ('self-rising flour', 0.8476076126098633)]

In [13]:
def get_ingList_vector(s):
    """Return the element-wise sum of the Word2Vec vectors of the ingredients in ``s``.

    Ingredients missing from the model's vocabulary contribute a zero vector.
    An empty ``s`` yields a zero vector of the model's dimensionality (rather
    than a scalar), so the result can always be passed to ``Cosine``.
    """
    dim = model.wv.vector_size
    # key_to_index is a dict, so membership is O(1); the vocabulary list is no
    # longer rebuilt and scanned for every single ingredient.
    vocab = model.wv.key_to_index
    vectors = [model.wv[i] if i in vocab else np.zeros(dim) for i in s]
    if not vectors:
        return np.zeros(dim)
    return np.sum(np.array(vectors), axis=0)

# Quick check: compare a baking-style ingredient list against a savoury one and a sweet one.
v1 = get_ingList_vector(["butter", "sugar", "flour"])
v2 = get_ingList_vector(["salt", "chicken", "paprika"])
v3 = get_ingList_vector(["eggs", "chocolate", "sprinkles"])

print(Cosine(v1, v2))
print(Cosine(v1, v3))

0.39788652463925933
0.9116139954718935


In [16]:
# Predict each held-out rating as the similarity-weighted average of the
# ratings the same user gave to other recipes in the training data, where
# similarity is the cosine between summed ingredient embeddings.
pred = []
for _, row in testData.iloc[-2000:].iterrows():
    u = row['user_id']
    i = row['id']
    if u in userHistory:
        r_pred = 0
        norm_fact = 0
        # The target recipe's vector is the same for every history item, so
        # build it once per row instead of once per inner iteration.
        v1 = get_ingList_vector(recipeIngredients[i])
        for i2 in userHistory[u]:
            if (u, i2) in allInteractions and i2 != i:
                v2 = get_ingList_vector(recipeIngredients[i2])
                sim = Cosine(v1, v2)
                r_pred += sim*allInteractions[(u, i2)]
                norm_fact += sim
        if norm_fact > 0:
            r_pred /= norm_fact
        else:
            # No positive total similarity to normalise by.
            r_pred = 0.0
    else:
        # User has no training history.
        r_pred = 3.0

    # Store the prediction next to THIS row's true rating. The previous code
    # appended the module-level `r`, a stale value left over from an earlier
    # loop, so every line of the result file carried the same "label".
    pred.append([r_pred, row['rating']])

# Write "prediction,true_rating" lines to the result file.
with open('./ingredient_result.csv', 'w') as f:
    for p,t in pred:
        # write() is the call for a single string; writelines() would iterate
        # over the string one character at a time.
        f.write(f"{p},{t}\n")

In [25]:
# Keep only the predicted value from each [prediction, rating] pair, then
# score against the ratings of the same test rows.
a_pred = [pair[0] for pair in pred]
true_ratings = testData.iloc[-2000:]['rating'].tolist()
MSE(a_pred, true_ratings)

1.7090711784415222

### Recipe Steps (TF-IDF)

In [29]:
import string
from gensim.parsing.preprocessing import remove_stopwords

# Set of the characters in string.punctuation, for fast membership tests when stripping punctuation from text.
punctuation = set(string.punctuation)


In [50]:
# Count how often each word occurs across the `steps` text of every row
# (lower-cased, punctuation removed, stopwords removed).
wordCount = defaultdict(int)
punctuation = set(string.punctuation)

for steps_raw in data['steps']:
    no_punct = ''.join(ch for ch in steps_raw.lower() if ch not in punctuation)
    for token in remove_stopwords(no_punct).split():
        wordCount[token] += 1
# Number of distinct words seen.
print(len(wordCount))

56468


In [82]:
# Rank (count, word) pairs from most to least frequent and keep the 1000
# most frequent words as the TF-IDF vocabulary.
counts = sorted(((wordCount[w], w) for w in wordCount), reverse=True)
words = [word for _, word in counts[:1000]]
print(words[:10])

['add', 'minutes', 'heat', '1', 'stir', 'mix', 'bowl', 'pan', 'mixture', '2']


In [83]:
# Document frequency: for each word, the number of rows whose cleaned
# `steps` text contains it at least once.
df = defaultdict(int)

for steps_raw in data['steps']:
    no_punct = ''.join(ch for ch in steps_raw.lower() if ch not in punctuation)
    distinct_words = set(remove_stopwords(no_punct).split())
    for w in distinct_words:
        df[w] += 1

In [85]:
def getTFIDF(i, steps):
    """Return a TF-IDF vector for ``steps`` over the global ``words`` vocabulary.

    ``steps`` is lower-cased and stripped of punctuation and stopwords. Term
    frequency is binary (1 if the word occurs, else 0); inverse document
    frequency is ``log2(len(data) / df[w])``. The argument ``i`` is not used.
    """
    cleaned = ''.join(ch for ch in steps.lower() if ch not in punctuation)
    present = set(remove_stopwords(cleaned).split())
    n_docs = len(data)
    # One entry per vocabulary word, in the order of `words`.
    return [(1 if w in present else 0) * math.log2(n_docs / df[w]) for w in words]

In [91]:
# Predict each held-out rating as the similarity-weighted average of the
# user's training ratings, with similarity taken as the cosine between
# TF-IDF vectors of the recipes' steps text.
pred = []

for _, row in testData.iloc[-200:].iterrows():
    u = row['user_id']
    i = row['id']
    if u in userHistory:
        # Only needed when the user has history, so compute it here.
        t1 = getTFIDF(i, recipeSteps[i])
        r_pred = 0
        norm_fact = 0
        for i2 in userHistory[u]:
            if (u, i2) in allInteractions:
                t2 = getTFIDF(i2, recipeSteps[i2])
                sim = Cosine(t1, t2)
                r_pred += sim*allInteractions[(u, i2)]
                norm_fact += sim
        if norm_fact > 0:
            r_pred /= norm_fact
        else:
            # Every similarity was zero: dividing would raise
            # ZeroDivisionError, so use the same fallback as for unknown users.
            r_pred = mean_rating
    else:
        # User has no training history.
        r_pred = mean_rating

    pred.append(r_pred)

In [4]:
# Score the TF-IDF predictions against the ratings of the same 200 test rows.
tfidf_true_ratings = testData.iloc[-200:]['rating'].tolist()
MSE(pred, tfidf_true_ratings)

NameError: name 'MSE' is not defined

In [3]:
# Peek at the first 10 predictions and the first 10 true ratings of the same slice.
# NOTE(review): a notebook cell renders only its last expression, so the bare
# `pred[:10]` line below produces no visible output.
pred[:10]
testData.iloc[-200:]['rating'][:10]

NameError: name 'pred' is not defined

In [88]:
# Spot-check getTFIDF on the first row of the evaluation set.
rev = testData.iloc[0]
rev_id, rev_steps = rev['id'], rev['steps']
rev_tfidf = getTFIDF(rev_id, rev_steps)
print(rev_tfidf)

[0.7070826095559978, 0.0, 1.287804213789717, 0.0, 1.4171510010391948, 0.0, 0.0, 0.0, 0.0, 0.0, 1.545525423564627, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7418880027456294, 0.0, 0.0, 0.0, 1.6308536916276215, 0.0, 1.8440733235322981, 0.0, 2.064532183217921, 0.0, 0.0, 2.36981192064371, 2.192827278958934, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.605974841235293, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.16032011194608, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0539338443813904, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.12285796521402, 3.175227146861053, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.3339503192830753, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.317103689103838, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.023737432388487