In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import json
import matplotlib.pyplot as plt
import scipy
import dateutil.parser
from sklearn.preprocessing import OneHotEncoder
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import requests
import json
from collections import defaultdict
import gzip
import csv
import random
from sklearn.metrics import jaccard_score 
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
def readGz(path):
    for l in gzip.open(path, 'rt', encoding = "UTF-8"):
        yield eval(l)

def readCSV(path):
    f = gzip.open(path, 'rt')
    c = csv.reader(f)
    header = next(c)
    for l in c:
        d = dict(zip(header,l))
        yield d['user_id'],d['recipe_id'],d

In [3]:
fp = os.path.join('data', 'trainInteractions.csv')
df = pd.read_csv(fp)


#TASKS 1 
train = df[0:400000]
validation = df[400000:]


In [4]:
#ALL UNIQUE RECIPES IN BOTH TRAINING AND VALIDATION SET
all_recipes = list(df['recipe_id'].unique())

In [5]:
#CREATING NEGATIVE SAMPLE
validation_will_cook = validation.drop(['rating', 'date'], axis = 1).reset_index(drop = True)


ids = []
recipes = []
binary = []
for u in validation_will_cook['user_id']:
    while True:
        recipe = random.choice(all_recipes)
        if recipe not in list(validation_will_cook[validation_will_cook['user_id'] == u]['recipe_id']):
            ids.append(u)
            recipes.append(recipe)
            binary.append(0)
            break

In [6]:
negative_samples = pd.DataFrame()
negative_samples['user_id'], negative_samples['recipe_id'], negative_samples['will_cook'] = ids, recipes, binary


In [7]:
validation_will_cook['will_cook'] = 1
validation_will_cook_combined = pd.concat([negative_samples, validation_will_cook], ignore_index=True)

In [8]:
def calculate_accuracy(df):
    TP = df[(df['actual'] == 1) & (df['will_cook'] == 1)].shape[0]
    TN = df[(df['actual'] == 0) & (df['will_cook'] == 0)].shape[0]
    FP = df[(df['actual'] == 0) & (df['will_cook'] == 1)].shape[0]
    FN = df[(df['actual'] == 1) & (df['will_cook'] == 0)].shape[0]
    P = TP + FN
    N = TN + FP
    Accuracy = (TP + TN)/(P + N)
    return Accuracy *100

In [9]:
train_id = train[['recipe_id', "user_id"]].reset_index(drop = True)

usersPerRecipe = defaultdict(set) # Maps an item to the users who rated it
recipePerUser = defaultdict(set) # Maps a user to the items that they rated
recipeNames = {}


for d in range(0,train_id.shape[0]):
    user,recipe = train_id['user_id'][d], train_id['recipe_id'][d]
    usersPerRecipe[recipe].add(user)
    recipePerUser[user].add(recipe)

def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

In [10]:
#EDA LOOKING AT RATING DISTRIBUTION
train['rating'].value_counts()

5    297664
4     72698
3     14551
0      9383
2      3851
1      1853
Name: rating, dtype: int64

In [11]:
# Making a dictionary which takes all user's average rating of the recipe
ratingDict = {}
for recipe in all_recipes:
    temp = df[df['recipe_id'] == recipe]
    ratingDict[recipe] = temp['rating'].mean()
    
# Rating mean    
ratingMean = np.mean(list(ratingDict.values()))

In [12]:
ratingMean

4.506993301907747

In [13]:
recipeCount = defaultdict(int)
totalCooked = 0

for user,recipe,_ in readCSV(os.path.join('data', 'trainInteractions.csv.gz')):
    recipeCount[recipe] += 1
    totalCooked += 1

mostPopular = [(recipeCount[x], x) for x in recipeCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(int(i))
    if count > totalCooked/(3/2): break


In [20]:
def model4(u,r):      
    similarities = []
    users = usersPerRecipe[r]
    recipes_u_has_cooked = recipePerUser[u]
    recipes = []
    for r2 in recipes_u_has_cooked:
        if r2 == r: continue
        sim = Jaccard(users, usersPerRecipe[r2])
        similarities.append(sim)
        recipes.append(r2)
        
    if r not in ratingDict.keys():
        if r in return1:
            return 1
        else: return 0
        
    if len(similarities) == 0: #Handle empty cases
        if ratingDict[r] >= ratingMean and r in return1:
            return 1
        else: return 0


    if max(similarities) < 0.4:
        if ratingDict[r] >= ratingMean - 2 and r in return1:
            return 1
        else: return 0
    
    if max(similarities) < 0.5:
        if ratingDict[r] >= ratingMean - 2.5 and r in return1:
            return 1
        else: return 0
        
    
    if ratingDict[r] >= ratingMean- 1.7 and r in return1:
        return 1
    else: return 0

In [17]:
def claculate_pred(df):
    pred = []
    for i in range(df.shape[0]):
        pred.append(model4(df['user_id'][i], df['recipe_id'][i]))
    return pred

pred= claculate_pred(validation_will_cook_combined)

pred = pd.DataFrame(list(zip(list(validation_will_cook_combined['will_cook']), pred)),columns =['actual', 'will_cook'])
calculate_accuracy(pred)

71.1895

In [21]:
predictions = open("predictions_Made.txt", 'w')
for l in open(os.path.join('data', "stub_Made.txt")):
    if l.startswith("user_id"):
        #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    predictions.write(u + '-' + i + "," + str(model4(int(u), int(i))) + "\n")
predictions.close()

In [1143]:
# TASK 2

X = []

for d in readGz(os.path.join('data', "trainRecipes.json.gz")): 
    X.append([d['name'], d['steps'],d['ingredients'], d['recipe_id'], d['minutes']])
    
steps = pd.DataFrame(X, columns = ['name', 'steps', 'ingredients', 'recipe_id', 'minutes'])

In [1144]:
#Average Cook Time
avgTime = np.mean(steps['minutes'])
avgTime

# Max Cook Time
maxTime = max(steps['minutes'])
maxTime

600

In [1145]:
def timeFinder(string):
    mins = re.findall("([0-9]+) mins*", string)
    hour = re.findall("([0-9]+) hour*", string)
    oneHalf = re.findall("1 2 hours*", string)
    
    if len(hour) > 0:
        hourTotal = [int(i) for i in hour]
        for n in range(len(hourTotal)):
            if hourTotal[n] >= 7:
                return 600
        
        hourTotal = sum(hourTotal) * 60
    else: 
        hourTotal = 0
    if len(oneHalf) > 0:
        hourTotal -= 30 * len(oneHalf)
        
    if len(mins) > 0:
        minTotal = sum([int(i) for i in mins])
    else: 
        minTotal = 0
        
    if minTotal + hourTotal == 0:
        return avgTime
    else: return minTotal + hourTotal

In [1146]:
def colTransform(df):
    df['steps'] = df['steps'].apply(lambda x: re.sub("[^0-9a-zA-Z]+"," ", x).lower())
    df['num_ingredients'] = df['ingredients'].apply(len)

    df['minAgg'] = df['steps'].apply(lambda x: timeFinder(x))
    df['minAgg'] = df['minAgg'].apply(lambda x : avgTime if x > maxTime else x)
    df['stepLength'] = df['steps'].apply(lambda x: len(x.split()))
    df['oven'] = df['steps'].apply(lambda x: 1 if len(re.findall('oven', x)) > 0 else 0)
    df['cook'] = df['steps'].apply(lambda x: 1 if len(re.findall('cook', x)) > 0 else 0)
    df['overnight'] = df['steps'].apply(lambda x: 1 if len(re.findall('overnight', x)) > 0 else 0)
    df['refrigerate'] = df['steps'].apply(lambda x: 1 if len(re.findall('refrigerate', x)) > 0 else 0)
    return df

In [1219]:
X = []

for d in readGz(os.path.join('data', "trainRecipes.json.gz")): 
    X.append([d['name'], d['steps'],d['ingredients'], d['recipe_id'], d['minutes']])
    
steps = pd.DataFrame(X, columns = ['name', 'steps', 'ingredients', 'recipe_id', 'minutes'])
y = steps['minutes']
steps = steps.drop(columns = ['minutes'])

steps = colTransform(steps)

In [1220]:
countVec = CountVectorizer(ngram_range=(1,1), max_features = 1000, tokenizer=lambda txt: txt.split(), stop_words = 'english')
    #transform
countData = countVec.fit_transform(steps['steps'].replace('\d+', '', regex=True))

    #create dataframe
cvDataframeTrain=pd.DataFrame(countData.toarray(),columns=countVec.get_feature_names())
cvDataframeTrain = pd.concat([cvDataframeTrain, steps[['num_ingredients', 'minAgg', 'stepLength', 'oven', 'cook', 'overnight']]], axis = 1)

In [1221]:
X = []

for d in readGz(os.path.join('data', "testRecipes.json.gz")): 
    X.append([d['name'], d['steps'],d['ingredients'], d['recipe_id']])
    
testStub = pd.DataFrame(X, columns = ['name', 'steps', 'ingredients', 'recipe_id'])

testStub = colTransform(testStub)

In [1222]:
countStub = countVec.transform(testStub['steps'].replace('\d+', '', regex=True))
cvDataframeStub=pd.DataFrame(countStub.toarray(),columns=countVec.get_feature_names())
cvDataframeStub = pd.concat([cvDataframeStub, testStub[['num_ingredients', 'minAgg', 'stepLength', 'oven', 'cook', 'overnight']]], axis = 1)

In [1223]:
t = linear_model.Ridge()
train = cvDataframeTrain[:190000]
test = cvDataframeTrain[190000:]

t.fit(train, y[:190000])

pred = t.predict(test)


In [1224]:
np.average((pred - y[190000:])**2)

2970.427634260682

In [1225]:
# STUB

t = linear_model.Ridge()
train = cvDataframeTrain

t.fit(train, y)

pred = t.predict(cvDataframeStub)

In [1226]:
testStub['pred'] = pred

In [1227]:
predictions = open("predictions_Minutes.txt", 'w')
predictions.write("recipe_id,prediction\n")
for n in range(testStub.shape[0]):
    predictions.write(testStub['recipe_id'].iloc[n] + ',' + str(testStub['pred'].iloc[n]) + '\n')
predictions.close()

In [1228]:
train

Unnamed: 0,absorbed,according,add,added,adding,addition,additional,adjust,ahead,air,...,yolks,zest,zip,zucchini,num_ingredients,minAgg,stepLength,oven,cook,overnight
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,9,5.0000,145,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,5,30.0000,47,1,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,11,30.0000,78,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,4,60.2356,38,0,1,0
4,0,0,3,0,0,0,0,0,0,0,...,1,0,0,0,18,30.0000,169,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,11,40.0000,125,0,1,0
199996,0,1,5,0,0,0,0,0,0,0,...,0,0,0,0,15,13.0000,126,1,1,0
199997,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,13,48.0000,83,1,1,0
199998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,11,187.0000,172,1,0,0
