In [212]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import json
import matplotlib.pyplot as plt
import scipy
import dateutil.parser
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import requests
import json
from collections import defaultdict
import gzip
import csv
import random
from sklearn.metrics import jaccard_score 
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV

In [213]:
fp = os.path.join('data', 'trainInteractions.csv')
df = pd.read_csv(fp)

In [214]:
#TASKS 1 & 2
train = df[0:400000]
validation = df[400000:]


In [215]:
#ALL UNIQUE RECIPES IN BOTH TRAINING AND VALIDATION SET
all_recipes = list(df['recipe_id'].unique())


In [216]:
#CREATING NEGATIVE SAMPLE
validation_will_cook = validation.drop(['rating', 'date'], axis = 1).reset_index(drop = True)


ids = []
recipes = []
binary = []
for u in validation_will_cook['user_id']:
    while True:
        recipe = random.choice(all_recipes)
        if recipe not in list(validation_will_cook[validation_will_cook['user_id'] == u]['recipe_id']):
            ids.append(u)
            recipes.append(recipe)
            binary.append(0)
            break

In [217]:
negative_samples = pd.DataFrame()
negative_samples['user_id'], negative_samples['recipe_id'], negative_samples['will_cook'] = ids, recipes, binary


In [218]:
validation_will_cook['will_cook'] = 1
validation_will_cook_combined = pd.concat([negative_samples, validation_will_cook], ignore_index=True)

In [219]:
def readGz(path):
    for l in gzip.open(path, 'rt', encoding = "UTF-8"):
        yield eval(l)

def readCSV(path):
    f = gzip.open(path, 'rt')
    c = csv.reader(f)
    header = next(c)
    for l in c:
        d = dict(zip(header,l))
        yield d['user_id'],d['recipe_id'],d

In [220]:
#QUESTION 1

recipeCount = defaultdict(int)
totalCooked = 0

for user,recipe,_ in readCSV(os.path.join('data', 'trainInteractions.csv.gz')):
    recipeCount[recipe] += 1
    totalCooked += 1

mostPopular = [(recipeCount[x], x) for x in recipeCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(int(i))
    if count > totalCooked/2: break

predictions_will_cook = pd.DataFrame(columns = ['user_id', 'recipe_id', 'will_cook'])
predictions_will_cook    

ids = []
recipes = []
binary = []
for n in range(validation_will_cook_combined.shape[0]):
    result = validation_will_cook_combined.iloc[n]
    if result['recipe_id'] in return1:
        ids.append(result['user_id'])
        recipes.append(result['recipe_id'])
        binary.append(1)
    else:
        ids.append(result['user_id'])
        recipes.append(result['recipe_id'])
        binary.append(0)
predictions_will_cook = pd.DataFrame()
predictions_will_cook['user_id'], predictions_will_cook['recipe_id'], predictions_will_cook['will_cook'] = ids, recipes, binary

In [221]:
#CALCULATING ACCURACY
predictions_will_cook['actual'] = validation_will_cook_combined['will_cook']
pred = predictions_will_cook[['actual',  'will_cook']]

In [222]:
def calculate_accuracy(df):
    TP = df[(df['actual'] == 1) & (df['will_cook'] == 1)].shape[0]
    TN = df[(df['actual'] == 0) & (df['will_cook'] == 0)].shape[0]
    FP = df[(df['actual'] == 0) & (df['will_cook'] == 1)].shape[0]
    FN = df[(df['actual'] == 1) & (df['will_cook'] == 0)].shape[0]
    P = TP + FN
    N = TN + FP
    Accuracy = (TP + TN)/(P + N)
    return Accuracy *100

In [223]:
calculate_accuracy(pred)

69.47

In [224]:
#QUESTION 2

def made_cook_model(n):
    recipeCount = defaultdict(int)
    totalCooked = 0

    for user,recipe,_ in readCSV(os.path.join('data', 'trainInteractions.csv.gz')):
        recipeCount[recipe] += 1
        totalCooked += 1

    mostPopular = [(recipeCount[x], x) for x in recipeCount]
    mostPopular.sort()
    mostPopular.reverse()

    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(int(i))
        if count > totalCooked/n: break

    predictions_will_cook = pd.DataFrame(columns = ['user_id', 'recipe_id', 'will_cook'])
    predictions_will_cook    

    ids = []
    recipes = []
    binary = []
    for n in range(validation_will_cook_combined.shape[0]):
        result = validation_will_cook_combined.iloc[n]
        if result['recipe_id'] in return1:
            ids.append(result['user_id'])
            recipes.append(result['recipe_id'])
            binary.append(1)
        else:
            ids.append(result['user_id'])
            recipes.append(result['recipe_id'])
            binary.append(0)
    predictions_will_cook = pd.DataFrame()
    predictions_will_cook['user_id'], predictions_will_cook['recipe_id'], predictions_will_cook['will_cook'] = ids, recipes, binary
    return predictions_will_cook

In [225]:
accuracy_list = []
for i in range(2, 10):
    res = made_cook_model(i)
    res['actual'] = validation_will_cook_combined['will_cook']
    pred = res[['actual',  'will_cook']]
    accuracy_list.append(calculate_accuracy(pred))

In [226]:
accuracy_list

[69.47,
 64.7155,
 61.55500000000001,
 59.399,
 57.8825,
 56.8365,
 56.007,
 55.364000000000004]

In [227]:
accuracy_list = []
for i in [3/2,4/3,5/4,6/5,7/6,8/7,9/8,10/9]:
    res = made_cook_model(i)
    res['actual'] = validation_will_cook_combined['will_cook']
    pred = res[['actual',  'will_cook']]
    accuracy_list.append(calculate_accuracy(pred))

In [228]:
accuracy_list

[71.273, 70.34, 68.7355, 67.63550000000001, 66.447, 64.3985, 62.761, 61.5]

In [229]:
#dividing by 3/2, eg. including 66.66 percent of data gave the best accuracy

In [230]:
train_id = train[['recipe_id', "user_id"]].reset_index(drop = True)


In [231]:
#QUESTION 3
usersPerRecipe = defaultdict(set) # Maps an item to the users who rated it
recipePerUser = defaultdict(set) # Maps a user to the items that they rated
recipeNames = {}


for d in range(0,train_id.shape[0]):
    user,recipe = train_id['user_id'][d], train_id['recipe_id'][d]
    usersPerRecipe[recipe].add(user)
    recipePerUser[user].add(recipe)

def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

def jaccardModel(u,r,N):
    similarities = set()
    users = usersPerRecipe[r]
    recipes_u_has_cooked = recipePerUser[u]
    for r2 in recipes_u_has_cooked:
        if r2 == r: continue
        sim = Jaccard(users, usersPerRecipe[r2])
        similarities.add(abs(sim))
    if len(similarities) == 0: return 0
    if max(similarities) > N:
        return 1
    else:
        return 0
        
    
    
    

In [232]:
def claculate_pred(df,N):
    pred = []
    for i in range(df.shape[0]):
        pred.append(jaccardModel(df['user_id'][i], df['recipe_id'][i],N))
    return pred
   

In [233]:
pred= claculate_pred(validation_will_cook_combined,0.01)


In [234]:
df = pd.DataFrame(list(zip(list(validation_will_cook_combined['will_cook']), pred)),columns =['actual', 'will_cook'])
calculate_accuracy(df)

59.239

In [235]:
#QUESTION 4

recipeCount = defaultdict(int)
totalCooked = 0

for user,recipe,_ in readCSV(os.path.join('data', 'trainInteractions.csv.gz')):
    recipeCount[recipe] += 1
    totalCooked += 1

mostPopular = [(recipeCount[x], x) for x in recipeCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(int(i))
    if count > totalCooked/(3/2): break

#RUN THROUGH JACCARD FIRST, IF THE JACARD VALUE IS 0, CHECK POPULARITY        
def jaccardPopularModel1(u,r,N):      
    similarities = set()
    users = usersPerRecipe[r]
    recipes_u_has_cooked = recipePerUser[u]
    for r2 in recipes_u_has_cooked:
        if r2 == r: continue
        sim = Jaccard(users, usersPerRecipe[r2])
        similarities.add(abs(sim))
    if len(similarities) == 0:
        if r in return1:
            return 1
        else: return 0
    if max(similarities) > N:
        return 1
    else:
        return 0

In [236]:
def claculate_pred(df,N):
    pred = []
    for i in range(df.shape[0]):
        pred.append(jaccardPopularModel1(df['user_id'][i], df['recipe_id'][i],N))
    return pred

pred= claculate_pred(validation_will_cook_combined,0.01)

In [237]:
df = pd.DataFrame(list(zip(list(validation_will_cook_combined['will_cook']), pred)),columns =['actual', 'will_cook'])
calculate_accuracy(df)

59.2905

In [238]:
#RUN THROUGH POPULARITY, IF THE RECIPE IS NOT POPULAR THEN DO A JACCARD SIMILARITY
def jaccardPopularModel2(u,r,N): 
    if r in return1:
        return 1
    similarities = set()
    users = usersPerRecipe[r]
    recipes_u_has_cooked = recipePerUser[u]
    for r2 in recipes_u_has_cooked:
        if r2 == r: continue
        sim = Jaccard(users, usersPerRecipe[r2])
        similarities.add(abs(sim))
    if len(similarities) == 0:
        return 0
    if max(similarities) > N:
        return 1
    else:
        return 0
    
def claculate_pred(df,N):
    pred = []
    for i in range(df.shape[0]):
        pred.append(jaccardPopularModel2(df['user_id'][i], df['recipe_id'][i],N))
    return pred

pred= claculate_pred(validation_will_cook_combined,0.01)

df = pd.DataFrame(list(zip(list(validation_will_cook_combined['will_cook']), pred)),columns =['actual', 'will_cook'])
calculate_accuracy(df)

62.041500000000006

In [239]:
#Running through the popularity model and then checking the Jaccard similarity 
#after it determines that its not popular seems to give the best result.
# This can however be improved by simutaneously looking at both models and improving the thresholds

In [240]:
#QUESTION 5

user_recipe_pair = []
for l in open(os.path.join('data', "stub_Made.txt")):
    if l.startswith("user_id"):continue
    u,i = l.strip().split('-')
    user_recipe_pair.append((u,i))
    
test = pd.DataFrame(user_recipe_pair, columns = ['user_id', 'recipe_id'])
test['user_id'] = test['user_id'].apply(int)
test['recipe_id'] = test['recipe_id'].apply(int)


In [241]:
def claculate_pred(df,N):
    pred = []
    for i in range(df.shape[0]):
        pred.append(jaccardPopularModel2(df['user_id'][i], df['recipe_id'][i],N))
    return pred

pred= claculate_pred(test,0.01)


In [242]:
sum(pred)

10875

In [243]:
test['cook'] = pred
test

Unnamed: 0,user_id,recipe_id,cook
0,23872231,98373850,0
1,19934813,86257276,1
2,96078190,35608339,0
3,18107115,2502174,0
4,57470081,20443676,0
...,...,...,...
19995,11943204,27685220,1
19996,27612552,70864986,1
19997,39776297,45646968,1
19998,25489508,91585618,0


In [244]:
predictions = open("predictions_Made.txt", 'w')
for l in open(os.path.join('data', "stub_Made.txt")):
    if l.startswith("user_id"):
        #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    predictions.write(u + '-' + i + "," + str(jaccardPopularModel2(int(u), int(i),0.01)) + "\n")
predictions.close()
                      
#Kaggle Username: xXepicgamergirl_69Xx (nice.)                      

In [245]:
#QUESTION 6

X = []

for d in readGz(os.path.join('data', "trainRecipes.json.gz")): 
    X.append([d['steps'], d['minutes']])

In [246]:
steps = pd.DataFrame(X, columns = ['steps', 'minutes'])

In [247]:
train = steps[0:190000]
validation = steps[190000:]

In [248]:
def formatWords(df):
    df['steps'] = df['steps'].apply(lambda x: re.sub("[^0-9a-zA-Z]+"," ", x).lower())
    return df


In [249]:
trainFormatted = formatWords(train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['steps'] = df['steps'].apply(lambda x: re.sub("[^0-9a-zA-Z]+"," ", x).lower())


In [250]:
countVec = CountVectorizer(ngram_range=(1,1), max_features = 1000, tokenizer=lambda txt: txt.split())
#transform
countData = countVec.fit_transform(trainFormatted['steps'])
 
#create dataframe
cvDataframe=pd.DataFrame(countData.toarray(),columns=countVec.get_feature_names())


In [251]:
cvDataframe.sum().sort_values(ascending = False)[:10]

and        931712
the        866806
in         488266
a          473406
to         469849
with       313578
until      284239
add        259497
minutes    240162
of         235831
dtype: int64

In [252]:
#QUESTION 7

In [253]:
regr = linear_model.LinearRegression()
regr.fit(cvDataframe, train['minutes'])


LinearRegression()

In [254]:
validationFormatted = formatWords(validation)

countVecValid = CountVectorizer(ngram_range=(1,1), max_features = 1000, tokenizer=lambda txt: txt.split())
#transform
countDataValid = countVecValid.fit_transform(validationFormatted['steps'])
 
#create dataframe
cvDataframe=pd.DataFrame(countDataValid.toarray(),columns=countVecValid.get_feature_names())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['steps'] = df['steps'].apply(lambda x: re.sub("[^0-9a-zA-Z]+"," ", x).lower())


In [255]:
MSE = np.mean((validation['minutes'] - regr.predict(cvDataframe))**2)
MSE

7503.729766534698

In [256]:
#QUESTION 8

countVec = CountVectorizer(ngram_range=(1,1), max_features = 100, tokenizer=lambda txt: txt.split(), stop_words = 'english')
#transform
countData = countVec.fit_transform(trainFormatted['steps'])
 
#create dataframe
cvDataframeTrain=pd.DataFrame(countData.toarray(),columns=countVec.get_feature_names())
#cvDataframeTrain = cvDataframeTrain.reindex(columns=cvDataframeTrain.sum().sort_values(ascending = False).index).iloc[10:110]


#regr = linear_model.LinearRegression()
regr = linear_model.Ridge(alpha = 50)
regr.fit(cvDataframeTrain, train['minutes'])




validationFormatted = formatWords(validation)


countDataValid = countVec.fit_transform(validationFormatted['steps'])
 
#create dataframe
cvDataframe=pd.DataFrame(countDataValid.toarray(),columns=countVec.get_feature_names())
#cvDataframe = cvDataframe.reindex(columns=cvDataframe.sum().sort_values(ascending = False).index).iloc[10:110]


MSE = np.mean((validation['minutes'] - regr.predict(cvDataframe))**2)
MSE


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['steps'] = df['steps'].apply(lambda x: re.sub("[^0-9a-zA-Z]+"," ", x).lower())


4032.469132231415

In [257]:
parameters = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 20, 30, 40, 50, 100, 150, 120],

}
clf = GridSearchCV(regr, parameters, cv = 5)
clf.fit(cvDataframe, validation['minutes'])

GridSearchCV(cv=5, estimator=Ridge(alpha=50),
             param_grid={'alpha': [0.001, 0.01, 0.1, 1, 10, 20, 30, 40, 50, 100,
                                   150, 120]})

In [258]:
clf.best_params_

{'alpha': 50}

In [259]:
XTest = []

for d in readGz(os.path.join('data', "testRecipes.json.gz")): 
    XTest.append([d['steps'], d['recipe_id']])
    
stepsTest = pd.DataFrame(XTest, columns = ['steps', 'recipe_id'])

In [260]:
stepsTestFormatted = formatWords(stepsTest)

In [263]:
countDataTest = countVec.fit_transform(stepsTestFormatted['steps'])
cvDataframeTest = pd.DataFrame(countData.toarray(),columns=countVec.get_feature_names())

stepsTestFormatted['pred'] = regr.predict(countDataTest)

In [264]:
predictions = open("predictions_Minutes.txt", 'w')
predictions.write("recipe_id,prediction\n")
for n in range(0, stepsTestFormatted.shape[0]):
    predictions.write(stepsTestFormatted['recipe_id'].iloc[n] + ',' + str(stepsTestFormatted['pred'].iloc[n]) + '\n')
predictions.close()
