In [1]:
import gzip
from collections import defaultdict
from sklearn import linear_model
import csv

In [2]:
import random

In [3]:
def readCSV(path):
    f = gzip.open(path, 'rt')
    c = csv.reader(f)
    header = next(c)
    for l in c:
        d = dict(zip(header,l))
        yield d['user_id'],d['recipe_id'],d

In [4]:
dataset = list(readCSV("trainInteractions.csv.gz"))

Split data into training and validation set

In [5]:
train = dataset[:400000]
valid = dataset[400000:]

In [84]:
len(dataset)

500000

In [6]:
# create the dict to store all the recipes that each user has cooked previously
recipesPerUser = defaultdict(set)
for data in dataset:
    recipesPerUser[data[0]].add(data[1])

In [7]:
revised_valid = []
for data in valid:
    revised_valid.append((data[0],data[1]))

In [8]:
recipeCount = defaultdict(int)
totalCooked = 0

for user,recipe,_ in dataset:
    recipeCount[recipe] += 1
    totalCooked += 1

In [14]:
len(valid)

100000

In [15]:
revised_valid = valid.copy()

In [16]:
# randomly select from all recipes and check whether it's cooked by that user, then create a negative pair 
# and append to the revised valid set

for data in valid:
    cooked = True
    while cooked:
        random_entry = random.sample(recipeCount.keys(),1)
        if random_entry[0] not in recipesPerUser[data[0]]:
            if (data[0],random_entry[0]) not in revised_valid:
                revised_valid.append((data[0],random_entry[0]))
                cooked = False

In [38]:
# now we have the revised_valid, where the top 100000 data is cooked and the rest is not cooked by each corresponding user
revised_valid_tuples = []
for x in range(100000):
    revised_valid_tuples.append((revised_valid[x][0], revised_valid[x][1]))

for x in range(100000, len(revised_valid)):
    revised_valid_tuples.append(revised_valid[x])

In [44]:
len(set(revised_valid_tuples))

200000

In [43]:
%store revised_valid_tuples
%store revised_valid

Stored 'revised_valid_tuples' (list)
Stored 'revised_valid' (list)


In [6]:
%store -r revised_valid_tuples
%store -r revised_valid

In [60]:
# sort the recipe based on popularity
recipeCount = defaultdict(int)
totalCooked = 0

for user,recipe,_ in dataset:
    recipeCount[recipe] += 1
    totalCooked += 1

mostPopular = [(recipeCount[x], x) for x in recipeCount]
mostPopular.sort()
mostPopular.reverse()

In [61]:
def mostPopular_nth(n):
    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > (totalCooked * n): break
    return return1

In [86]:
# find the recipesPerUser and usersPerRecipe of the training set
recipesPerUser = defaultdict(set)
usersPerRecipe = defaultdict(set)

for data in dataset:
    recipesPerUser[data[0]].add(data[1])
    usersPerRecipe[data[1]].add(data[0])

In [87]:
def Jaccard(s1,s2):
    inter = s1.intersection(s2)
    union = s1.union(s2)
    if len(union) == 0:
        return 0
    else:
        return len(inter) / len(union)

In [51]:
# here we define a function that represents the model by applying popularity check and similarity check, 
# i is the threshold of the Jaccard similarity score, 
# j is the threshold of the popularity threshold,
# and returns the prediction from the model upon the test_data_set.

def cooking_model(data_set,i,j):
    # first find the top popular recipes
    top_popular = mostPopular_nth(j)
    
    predicts = []
    for data in data_set:
        user = data[0]
        recipe = data[1]
        
        if len(recipesPerUser[user]) == 0:
            predicts.append(0)
        else:
            condition_met = False
            for r in recipesPerUser[user]:
                sim_score = Jaccard(usersPerRecipe[recipe],usersPerRecipe[r])
                if sim_score > i:
                    condition_met = True
                    break
                else: continue
            # if satisfy either thresholds
            if condition_met or recipe in top_popular:
                predicts.append(1)
            else:
                predicts.append(0)
                
    return predicts

In [52]:
temp_result = cooking_model(revised_valid_tuples,0.63,0.642)

In [10]:
import numpy as np

In [11]:
def accuracy_score(p,titles):
    return sum(np.array(p) == np.array(titles))/len(p)

In [61]:
actual = [1]*100000 + [0]*100000

In fact this is the best accuracy score I can get from HW3 cooking prediction

In [62]:
# accuracy score of the model with popularity check and similarity check, where the popularity threshold is
# 0.642, and the jaccard similarity threshold is 0.63
accuracy_score(temp_result,actual)

0.713105

In [66]:
TP_ = np.logical_and(temp_result,actual)
TN_ = np.logical_and(np.logical_not(temp_result),np.logical_not(actual))
FP_ = np.logical_and(temp_result,np.logical_not(actual))
FN_ = np.logical_and(np.logical_not(temp_result),actual)

In [67]:
TP = sum(TP_)
FN = sum(FN_)
FP = sum(FP_)
TN = sum(TN_)

In [68]:
# True positive rate
TPR = TP /(TP + FN)

In [69]:
# False positive rate
FPR = FP /(FP + TN)

In [70]:
TPR

0.63939

In [71]:
FPR

0.21318

Now I am gonna try implementing logistic regression, the X features that I am thinking are popularity score of the recipe, and the maximum Jaccard similarity score of the recipe to the recipes that the user has cooked. The pop score is defined by the percentile of the recipe amoung all of the recipes, i.e 100 would be the most popular recipes amoung all.

In [63]:
mostPopular[:10]

[(619, '74912490'),
 (586, '29147042'),
 (511, '05898774'),
 (489, '37919560'),
 (468, '98124873'),
 (458, '67292161'),
 (424, '48134500'),
 (422, '63528663'),
 (422, '63512895'),
 (389, '36834291')]

In [64]:
len(mostPopular)

151462

In [65]:
# assign the percentile score to each recipe
mostPop_percentile = defaultdict()
cnt = 0
for r in mostPopular:
    percentile = 100 - cnt/151462
    cnt += r[0]
    mostPop_percentile[r[1]] = percentile

In [58]:
def Jaccard(s1,s2):
    inter = s1.intersection(s2)
    union = s1.union(s2)
    if len(union) == 0:
        return 0
    else:
        return len(inter) / len(union)

In [259]:
# define the function find_jaccard to find the maximum jaccard similarity score between the recipe and the users' cooked recipe
def find_jaccard(r,u):
    temp = []
    for x in recipesPerUser[u]:
        if r == x:
            continue
        jacc_score = Jaccard(usersPerRecipe[r],usersPerRecipe[x])
        temp.append(jacc_score)
    # means that the user hasn't cooked anything before
    if len(temp) == 0:
        return 0
    else:
        return max(temp)

In [98]:
# in the form of (user_id, recipe_id)
revised_valid_tuples[0]

('90764166', '01768679')

In [99]:
# now we define a function that transforms the X features
def feature(d):
    feat = []
    for data in d:
        temp_feat = [1]
        temp_feat.append(mostPop_percentile[data[1]])
        temp_feat.append(find_jaccard(data[1],data[0]))
        feat.append(temp_feat)
    return feat

In [100]:
# now we start transforming our features
X_cooking_valid = feature(revised_valid_tuples)

In [105]:
# the y here is basically our actual result, which is 100000 1's followed by 100000 0's
y_cooking_valid = actual

In [56]:
from sklearn import linear_model

In [107]:
# now we fit the X features and y into a logistic model
mod_cooking = linear_model.LogisticRegression(C=1.0, class_weight = 'balanced')
mod_cooking.fit(X_cooking_valid, y_cooking_valid)

predictions_cooking = mod_cooking.predict(X_cooking_valid)

In [108]:
accuracy_score(predictions_cooking,actual)

0.713215

In [117]:
# adjust the C value to a larger value
mod_cooking = linear_model.LogisticRegression(C=40, class_weight = 'balanced')
mod_cooking.fit(X_cooking_valid, y_cooking_valid)

predictions_cooking = mod_cooking.predict(X_cooking_valid)

In [123]:
accuracy_score(predictions_cooking,actual)

0.713255

In [124]:
sum(np.array(temp_result) == predictions_cooking) 

197034

In [125]:
len(predictions_cooking)

200000

temp_result is our predictions from solely the popularity check and jaccard similarity threshold check,
and that shows our predictions from the logistic regression model are mostly the same as our 'threshold check' model,
the accuracy score only improves by around 0.02 percent, from 0.713105 to 0.713255.

And that shows we should probably include more features in our model.

I am going to include the average rating of the recipe in the X features. Basically that means we are trying to include the 
parameter that shows whether a high rating recipe tends to be cooked more or not.

In [97]:
recipe_Ratings = defaultdict(list)
for u,r,_ in dataset:
    recipe_Ratings[r].append(int(_['rating']))

In [55]:
# calculate the average rating of the recipe
recipe_avg_Ratings = defaultdict(int)
for r in recipe_Ratings.keys():
    recipe_avg_Ratings[r] = sum(recipe_Ratings[r])/len(recipe_Ratings[r])

In [99]:
# define a function that finds the average of the recipe's rating, if the recipe hasn't been rated before then return 0.
def find_avg_rating(r):
    return recipe_avg_Ratings[r]

In [200]:
# now we define a function that transforms the X features, including the average rating of the recipe
def feature_revised(d):
    feat = []
    for data in d:
        temp_feat = [1]
        if data[1] not in mostPop_percentile:
            temp_feat.append(0)
            temp_feat.append(find_jaccard(data[1],data[0]))
            temp_feat.append(find_avg_rating(data[1]))
            feat.append(temp_feat)
        else:
            temp_feat.append(mostPop_percentile[data[1]])
            temp_feat.append(find_jaccard(data[1],data[0]))
            temp_feat.append(find_avg_rating(data[1]))
            feat.append(temp_feat)
    return feat

In [170]:
X_cooking_revised = feature_revised(revised_valid_tuples)

In [171]:
y_cooking_valid = actual

In [184]:
# adjust the C value to a larger value
mod_cooking_revised = linear_model.LogisticRegression(C=1, class_weight = 'balanced')
mod_cooking_revised.fit(X_cooking_revised, y_cooking_valid)

predictions_cooking_revised = mod_cooking_revised.predict(X_cooking_revised)

In [185]:
# returns the accuracy score of the model that includes the feature indicating average rating of the recipe.
accuracy_score(predictions_cooking_revised,actual)

0.71358

We can see that our accuracy score increases from 0.713255 to 0.71358, and that's approximately by 0.046 percent.

I am gonna try this model on kaggle

In [188]:
revised_valid_tuples[0]

('90764166', '01768679')

In [214]:
temp_d = ('19934813','86257276')

In [215]:
temp_feature = feature_revised([temp_d])

In [216]:
temp_feature

[[1, 97.84327422059658, 0.3333333333333333, 5.0]]

In [217]:
temp_predict = mod_cooking_revised.predict(temp_feature)

In [218]:
temp_predict[0]

1

In [232]:
temp_d = ('96078190','35608339')
temp_feature = feature_revised([temp_d])

In [234]:
None in temp_feature[0]

True

In [237]:
import random

In [244]:
# open the stub_Made.txt file to see the test_data and write my predictions to predictions_Made.txt file
predictions = open("predictions_Made.txt", 'w')
try:
    for l in open("stub_Made.txt"):
        if l.startswith("user_id"):
            #header
            predictions.write(l)
            continue
        u,i = l.strip().split('-')


        # here we apply the model that we just built and fitted
        temp_d = (u,i)
        temp_feature = feature_revised([temp_d])
        # if there is no feature at all for the recipe and user pair we simply predict 0.
        if None in temp_feature[0]:
                predictions.write(u + '-' + i + ",0\n")
        else:
            temp_predict = mod_cooking_revised.predict(temp_feature)
            if temp_predict[0] == 1:
                predictions.write(u + '-' + i + ",1\n")
            else:
                predictions.write(u + '-' + i + ",0\n")
            
except:
    print(u,i)
        
predictions.close()

The accuracy score on kaggle got improved by like 0.001, and I am thinking about including more features from the trainRecipes dataset.
Because basically, even though including more features might not help us identify the recipes that we will not cook, it definitely
helps with improving the accuracy of predicting what we would cook. So it's very likely that we increase our TPR by combining more features.

In [48]:
def readGz(path):
    for l in gzip.open(path, 'rt',encoding='utf-8'):
        yield eval(l)

In [49]:
dataset_recipes = []
for d in readGz("trainRecipes.json.gz"):
     dataset_recipes.append(d)

In [50]:
dataset_recipes[0]

{'name': 'sexy fried eggs for sunday brunch',
 'minutes': 10,
 'contributor_id': '14298494',
 'submitted': '2004-05-21',
 'steps': 'heat a ridged griddle pan\tlightly brush the tomato slices and bread with some olive oil\tcook the tomato slices first , for at least 5 minutes\twhen they are almost ready , toast the bread in the same pan until well bar-marked\tin the meantime , pour a little olive oil into a small frying pan and crack in the egg\tallow it to set for a minute or so and add the garlic and chilli\tcook for a couple of minutes , spooning the hot oil over the egg until cooked to your liking\tplace the griddled bread on a plate and quickly spoon the tomatoes on top\tthrow the chives into the egg pan and splash in the balsamic vinegar\tseason well , then slide the egg on to the tomatoes and drizzle the pan juices on top\tserve immediately , with a good cup of tea !',
 'description': 'this is from silvana franco\'s book "family" which i love. i made these for brunch yesterday an

In [51]:
recipe_minutes = defaultdict(int)
recipe_description = defaultdict()
recipe_ingre = defaultdict(set)
recipe_name = defaultdict()

In [52]:
# loop through the recipe dataset to store the info into corresponding dict
for r in dataset_recipes:
    recipe_minutes[r['recipe_id']] = r['minutes']
    recipe_description[r['recipe_id']] = r['description']
    recipe_ingre[r['recipe_id']] = r['ingredients']
    recipe_name[r['recipe_id']] = r['name']

First I would like to see the pattern of average cooking minutes for the cooked recipe and for those not cooked recipe

In [260]:
cooked_minutes = []
not_cooked_minutes = []
for d in revised_valid_tuples[:100000]:
    cooked_minutes.append(recipe_minutes[d[1]])
for d in revised_valid_tuples[100000:]:
    not_cooked_minutes.append(recipe_minutes[d[1]])

In [261]:
sum(cooked_minutes)/len(cooked_minutes), sum(not_cooked_minutes)/len(not_cooked_minutes)

(62.20032, 58.81477)

See the pattern of length of description of cooked vs not cooked

In [265]:
cooked_description = []
not_cooked_description = []
for d in revised_valid_tuples[:100000]:
    cooked_description.append(len(recipe_description[d[1]]))
for d in revised_valid_tuples[100000:]:
    not_cooked_description.append(len(recipe_description[d[1]]))

In [266]:
sum(cooked_description)/len(cooked_description), sum(not_cooked_description)/len(not_cooked_description)

(217.54758, 194.91585)

See the pattern of number of ingredients of cooked vs not cooked

In [268]:
cooked_ingre = []
not_cooked_ingre = []
for d in revised_valid_tuples[:100000]:
    cooked_ingre.append(len(recipe_ingre[d[1]]))
for d in revised_valid_tuples[100000:]:
    not_cooked_ingre.append(len(recipe_ingre[d[1]]))

In [269]:
sum(cooked_ingre)/len(cooked_ingre), sum(not_cooked_ingre)/len(not_cooked_ingre)

(8.80692, 8.94691)

See the pattern of length of names of cooked vs not cooked

In [270]:
cooked_name = []
not_cooked_name = []
for d in revised_valid_tuples[:100000]:
    cooked_name.append(len(recipe_name[d[1]]))
for d in revised_valid_tuples[100000:]:
    not_cooked_name.append(len(recipe_name[d[1]]))

In [271]:
sum(cooked_name)/len(cooked_name), sum(not_cooked_name)/len(not_cooked_name)

(28.40796, 27.54331)

I know the pattern above might not help that much because the compatibility for each user and recipe might be different, and that
difference would lead to the fact that each feature matters differently to them. So now I would implement those features into our model,
and to avoid overfitting, I would try only implement cooking minutes and length of description at first.

In [47]:
# define a function that finds the cooking time in minutes of the recipe
def find_minutes(r):
    if r in recipe_minutes:
        return recipe_minutes[r]
    else:
        return 0

In [46]:
# define a function that finds the length of the description of the recipe
def find_des(r):
    if r in recipe_description:
        return len(recipe_description[r])
    else:
        return 0

In [260]:
# now we define a function that transforms the X features, including the cooking minutes and the length of description
def feature_revised_02(d):
    feat = []
    for data in d:
        temp_feat = [1]
        if data[1] not in mostPop_percentile:
            temp_feat.append(0)
            temp_feat.append(find_jaccard(data[1],data[0]))
            temp_feat.append(find_avg_rating(data[1]))
            temp_feat.append(find_minutes(data[1]))
            temp_feat.append(find_des(data[1]))
            feat.append(temp_feat)
        else:
            temp_feat.append(mostPop_percentile[data[1]])
            temp_feat.append(find_jaccard(data[1],data[0]))
            temp_feat.append(find_avg_rating(data[1]))
            temp_feat.append(find_minutes(data[1]))
            temp_feat.append(find_des(data[1]))
            feat.append(temp_feat)
    return feat

In [144]:
X_cooking_revised_02 = feature_revised_02(revised_valid_tuples)

In [303]:
X_cooking_revised_02[0]

[1, 98.96498131544546, 0.08333333333333333, 4.923076923076923, 55, 132]

In [147]:
y_cooking_valid = [1]*100000 +[0]*100000

In [300]:
mod_cooking_revised_02 = linear_model.LogisticRegression(C=0.1, class_weight = 'balanced')
mod_cooking_revised_02.fit(X_cooking_revised_02, y_cooking_valid)

predictions_cooking_revised_02 = mod_cooking_revised_02.predict(X_cooking_revised_02)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [301]:
accuracy_score(predictions_cooking_revised_02,y_cooking_valid)

0.71582

This seems to be the highest accuracy score so far on the revised validation set, and I am going to try this model on kaggle

In [302]:
# open the stub_Made.txt file to see the test_data and write my predictions to predictions_Made.txt file
predictions = open("predictions_Made.txt", 'w')
try:
    for l in open("stub_Made.txt"):
        if l.startswith("user_id"):
            #header
            predictions.write(l)
            continue
        u,i = l.strip().split('-')


        # here we apply the model that we just built and fitted
        temp_d = (u,i)
        temp_feature = feature_revised_02([temp_d])
        # if there is no feature at all for the recipe and user pair we simply predict 0.
        if None in temp_feature[0]:
                predictions.write(u + '-' + i + ",0\n")
        else:
            temp_predict = mod_cooking_revised_02.predict(temp_feature)
            if temp_predict[0] == 1:
                predictions.write(u + '-' + i + ",1\n")
            else:
                predictions.write(u + '-' + i + ",0\n")
            
except:
    print(u,i)
        
predictions.close()

I got 0.70440 on the kaggle, which is the highest accuracy score I can get so far.

In [319]:
recipe_ingre['01768679']

['bacon',
 'celery',
 'onion',
 'salt',
 'all-purpose flour',
 'sugar',
 'vinegar',
 'fresh ground pepper',
 'water',
 'potatoes']

In [320]:
# define a function that finds the number of ingredients of a recipe 
def find_ingre(r):
    if r in recipe_ingre:
        return len(recipe_ingre[r])
    else:
        return 0

In [13]:
# now we define a function that transforms the X features, including the number of ingredients
def feature_revised_03(d):
    feat = []
    for data in d:
        temp_feat = [1]
        if data[1] not in mostPop_percentile:
            temp_feat.append(0)
            temp_feat.append(find_jaccard(data[1],data[0]))
            temp_feat.append(find_avg_rating(data[1]))
            temp_feat.append(find_minutes(data[1]))
            temp_feat.append(find_des(data[1]))
            temp_feat.append(find_ingre(data[1]))
            feat.append(temp_feat)
        else:
            temp_feat.append(mostPop_percentile[data[1]])
            temp_feat.append(find_jaccard(data[1],data[0]))
            temp_feat.append(find_avg_rating(data[1]))
            temp_feat.append(find_minutes(data[1]))
            temp_feat.append(find_des(data[1]))
            temp_feat.append(find_ingre(data[1]))
            feat.append(temp_feat)
    return feat

In [322]:
X_cooking_revised_03 = feature_revised_03(revised_valid_tuples)

In [323]:
y_cooking_valid = actual

In [338]:
mod_cooking_revised_03 = linear_model.LogisticRegression(C=1000, class_weight = 'balanced')
mod_cooking_revised_03.fit(X_cooking_revised_03, y_cooking_valid)

predictions_cooking_revised_03 = mod_cooking_revised_03.predict(X_cooking_revised_03)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [339]:
accuracy_score(predictions_cooking_revised_03,y_cooking_valid)

0.71151

Now I try to focus on the ingredients elements. Basically I want to find the popularity percentile of all the ingredients in each recipe
and apply that as a feature.

In [343]:
# first create a dict that stores the count of all ingredients
ingre_cnt = defaultdict(int)
for d in dataset_recipes:
    for i in d['ingredients']:
        ingre_cnt[i] += 1

In [345]:
# sort the ingredients based on popularity
mostPopular_ingre = [(y, x) for x,y in ingre_cnt.items()]
mostPopular_ingre.sort()
mostPopular_ingre.reverse()

In [348]:
# find the percentile for each ingredient
mostPop_ingre_percentile = defaultdict()
cnt = 0
for r in mostPopular_ingre:
    percentile = 100 - cnt/151462
    cnt += r[0]
    mostPop_ingre_percentile[r[1]] = percentile

In [361]:
# define a function that returns the maximum percentile for each ingredient of the recipe
def find_ingre_percentile(r):
    # if the recipe does not appear in our train recipe dataset we return 0
    if r not in recipe_ingre:
        return 0
    else:
        temp = recipe_ingre[r]
        cur = []
        for x in temp:
            cur.append(mostPop_ingre_percentile[x])
        return max(cur)

In [14]:
# now we define a function that transforms the X features, including the maximum percentile for each ingredient of the recipe;
# decide not to use the length of the ingredients as a feature, since that tends to give us low accuracy score.
def feature_revised_04(d):
    feat = []
    for data in d:
        temp_feat = [1]
        if data[1] not in mostPop_percentile:
            temp_feat.append(0)
            temp_feat.append(find_jaccard(data[1],data[0]))
            temp_feat.append(find_avg_rating(data[1]))
            temp_feat.append(find_minutes(data[1]))
            temp_feat.append(find_des(data[1]))
            temp_feat.append(find_ingre_percentile(data[1]))
            feat.append(temp_feat)
        else:
            temp_feat.append(mostPop_percentile[data[1]])
            temp_feat.append(find_jaccard(data[1],data[0]))
            temp_feat.append(find_avg_rating(data[1]))
            temp_feat.append(find_minutes(data[1]))
            temp_feat.append(find_des(data[1]))
            temp_feat.append(find_ingre_percentile(data[1]))
            feat.append(temp_feat)
    return feat

In [363]:
X_cooking_revised_04 = feature_revised_04(revised_valid_tuples)

In [364]:
y_cooking_valid = actual

In [392]:
mod_cooking_revised_04 = linear_model.LogisticRegression(C=1.2, class_weight = 'balanced')
mod_cooking_revised_04.fit(X_cooking_revised_04, y_cooking_valid)

predictions_cooking_revised_04 = mod_cooking_revised_04.predict(X_cooking_revised_04)

In [393]:
accuracy_score(predictions_cooking_revised_04,y_cooking_valid)

0.71351

In [389]:
# open the stub_Made.txt file to see the test_data and write my predictions to predictions_Made.txt file
predictions = open("predictions_Made.txt", 'w')
try:
    for l in open("stub_Made.txt"):
        if l.startswith("user_id"):
            #header
            predictions.write(l)
            continue
        u,i = l.strip().split('-')


        # here we apply the model that we just built and fitted
        temp_d = (u,i)
        temp_feature = feature_revised_04([temp_d])
        # if there is no feature at all for the recipe and user pair we simply predict 0.
        if None in temp_feature[0]:
                predictions.write(u + '-' + i + ",0\n")
        else:
            temp_predict = mod_cooking_revised_04.predict(temp_feature)
            if temp_predict[0] == 1:
                predictions.write(u + '-' + i + ",1\n")
            else:
                predictions.write(u + '-' + i + ",0\n")
            
except:
    print(u,i)
        
predictions.close()

The accuracy score I get on kaggle is 0.70180, which is lower than the model_2 I got.

Try the model that includes the cosine similarity feature

In [477]:
ratingDict = {} # To retrieve a rating for a specific user/recipe pair

for u,r,_ in dataset:
    user,recipe = u,r
    ratingDict[(user,recipe)] = int(_['rating'])

In [478]:
import math

In [479]:
def Cosine(i1, i2):
    # Between two items
    inter = usersPerRecipe[i1].intersection(usersPerRecipe[i2])
    numer = 0
    denom1 = 0
    denom2 = 0
    for u in inter:
        numer += (ratingDict[(u,i1)])*(ratingDict[(u,i2)])
    for u in inter:
        denom1 += (ratingDict[(u,i1)])**2
        denom2 += (ratingDict[(u,i2)])**2
    denom = math.sqrt(denom1) * math.sqrt(denom2)
    if denom == 0: 
        return 0
    return numer / denom

In [483]:
# function that returns the maximum cosine similarity score between the target recipe and the recipes cooked by the user
def find_cos(r,u):
    temp = []
    for x in recipesPerUser[u]:
        temp.append(Cosine(r,x))
    if len(temp) == 0:
        return 0
    else:
        return max(temp)

In [484]:
# here we implement the model_02, the one that we got highest accuracy score previously, and include the cosine similarity attribute
def feature_revised_05(d):
    feat = []
    for data in d:
        temp_feat = [1]
        if data[1] not in mostPop_percentile:
            temp_feat.append(0)
            temp_feat.append(find_jaccard(data[1],data[0]))
            temp_feat.append(find_avg_rating(data[1]))
            temp_feat.append(find_minutes(data[1]))
            temp_feat.append(find_des(data[1]))
            temp_feat.append(find_cos(data[1],data[0]))
            feat.append(temp_feat)
        else:
            temp_feat.append(mostPop_percentile[data[1]])
            temp_feat.append(find_jaccard(data[1],data[0]))
            temp_feat.append(find_avg_rating(data[1]))
            temp_feat.append(find_minutes(data[1]))
            temp_feat.append(find_des(data[1]))
            temp_feat.append(find_cos(data[1],data[0]))
            feat.append(temp_feat)
    return feat

In [485]:
X_cooking_revised_05 = feature_revised_05(revised_valid_tuples)

In [486]:
y_cooking_valid = actual

In [498]:
mod_cooking_revised_05 = linear_model.LogisticRegression(C=100, class_weight = 'balanced')
mod_cooking_revised_05.fit(X_cooking_revised_05, y_cooking_valid)

predictions_cooking_revised_05 = mod_cooking_revised_05.predict(X_cooking_revised_05)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [499]:
accuracy_score(predictions_cooking_revised_05,y_cooking_valid)

0.71817

In [504]:
# open the stub_Made.txt file to see the test_data and write my predictions to predictions_Made.txt file
predictions = open("predictions_Made.txt", 'w')
try:
    for l in open("stub_Made.txt"):
        if l.startswith("user_id"):
            #header
            predictions.write(l)
            continue
        u,i = l.strip().split('-')


        # here we apply the model that we just built and fitted
        temp_d = (u,i)
        temp_feature = feature_revised_05([temp_d])
        # if there is no feature at all for the recipe and user pair we simply predict 0.
        if None in temp_feature[0]:
                predictions.write(u + '-' + i + ",0\n")
        else:
            temp_predict = mod_cooking_revised_05.predict(temp_feature)
            if temp_predict[0] == 1:
                predictions.write(u + '-' + i + ",1\n")
            else:
                predictions.write(u + '-' + i + ",0\n")
            
except:
    print(u,i)
        
predictions.close()

kaggle score decreases to 0.70050, and I am so confused

In [513]:
# here we implement the model_02, the one that we got highest accuracy score previously, and include the cosine similarity attribute
def feature_revised_07(d):
    feat = []
    for data in d:
        temp_feat = [1]
        if data[1] not in mostPop_percentile:
            temp_feat.append(0)
            temp_feat.append(find_jaccard(data[1],data[0]))
            temp_feat.append(find_avg_rating(data[1]))
            temp_feat.append(find_des(data[1]))
            temp_feat.append(find_cos(data[1],data[0]))
            feat.append(temp_feat)
        else:
            temp_feat.append(mostPop_percentile[data[1]])
            temp_feat.append(find_jaccard(data[1],data[0]))
            temp_feat.append(find_avg_rating(data[1]))
            temp_feat.append(find_des(data[1]))
            temp_feat.append(find_cos(data[1],data[0]))
            feat.append(temp_feat)
    return feat

In [514]:
X_cooking_revised_07 = feature_revised_07(revised_valid_tuples)

In [515]:
y_cooking_valid = actual

Try using ridge classifier

In [520]:
mod_cooking_revised_07 = linear_model.RidgeClassifier(1)
mod_cooking_revised_07.fit(X_cooking_revised_07, y_cooking_valid)

predictions_cooking_revised_07 = mod_cooking_revised_07.predict(X_cooking_revised_07)

In [521]:
accuracy_score(predictions_cooking_revised_07,y_cooking_valid)

0.71834

In [522]:
# open the stub_Made.txt file to see the test_data and write my predictions to predictions_Made.txt file
predictions = open("predictions_Made.txt", 'w')
try:
    for l in open("stub_Made.txt"):
        if l.startswith("user_id"):
            #header
            predictions.write(l)
            continue
        u,i = l.strip().split('-')


        # here we apply the model that we just built and fitted
        temp_d = (u,i)
        temp_feature = feature_revised_07([temp_d])
        # if there is no feature at all for the recipe and user pair we simply predict 0.
        if None in temp_feature[0]:
                predictions.write(u + '-' + i + ",0\n")
        else:
            temp_predict = mod_cooking_revised_07.predict(temp_feature)
            if temp_predict[0] == 1:
                predictions.write(u + '-' + i + ",1\n")
            else:
                predictions.write(u + '-' + i + ",0\n")
            
except:
    print(u,i)
        
predictions.close()

Decide to sample more data to fit the model more accurately

In [540]:
revised_valid_new = train[:400000]

In [541]:
revised_valid_new_copy = revised_valid_new.copy()

In [542]:
# randomly select from all recipes and check whether it's cooked by that user, then create a negative pair 
# and append to the revised valid set

for data in revised_valid_new_copy:
    cooked = True
    while cooked:
        random_entry = random.sample(recipeCount.keys(),1)
        if random_entry[0] not in recipesPerUser[data[0]]:
            if (data[0],random_entry[0]) not in revised_valid_new:
                revised_valid_new.append((data[0],random_entry[0]))
                cooked = False

In [544]:
%store revised_valid_new

Stored 'revised_valid_new' (list)


In [15]:
%store -r revised_valid_new

In [16]:
# now we have revised_valid_new to be our new validation set, the first 400000 entries are cooked, 
# and the last 400000 entries are uncooked
len(revised_valid_new)

800000

In [107]:
revised_valid_new_tuples = []
for x in revised_valid_new[:400000]:
    temp = (x[0],x[1])
    revised_valid_new_tuples.append(temp)
for x in revised_valid_new[400000:]:
    revised_valid_new_tuples.append(x)

In [108]:
len(set(revised_valid_new_tuples))

800000

In [261]:
X_cooking_revised_new_02 = feature_revised_02(revised_valid_new_tuples)

In [264]:
%store X_cooking_revised_new_02

Stored 'X_cooking_revised_new_02' (list)


In [263]:
%store -r X_cooking_revised_new_02

In [743]:
X_cooking_revised_new_02_smaller = X_cooking_revised_new_02[:150000] + X_cooking_revised_new_02[400000:550000]

In [744]:
len(X_cooking_revised_new_02_smaller)

300000

In [745]:
X_test = X_cooking_revised_new_02[150000:400000] + X_cooking_revised_new_02[550000:]

In [746]:
len(X_test)

500000

In [558]:
y_cooking_valid_new = [1]*400000 + [0]*400000

In [747]:
y_cooking_valid_new_smaller = [1]*150000 + [0]*150000

In [748]:
y_test = [1]*250000 + [0]*250000

In [266]:
mod_cooking_revised_new_02 = linear_model.LogisticRegression(C=1,class_weight='balanced')
mod_cooking_revised_new_02.fit(X_cooking_revised_new_02_smaller, y_cooking_valid_new_smaller)

predictions_cooking_revised_new_02 = mod_cooking_revised_new_02.predict(X_cooking_revised_new_02_smaller)

NameError: name 'X_cooking_revised_new_02_smaller' is not defined

In [762]:
accuracy_score(predictions_cooking_revised_new_02, y_cooking_valid_new_smaller)

0.8417933333333333

In [763]:
predictions_test = mod_cooking_revised_new_02.predict(X_test)

In [764]:
accuracy_score(predictions_test, y_test)

0.842684

In [336]:
# open the stub_Made.txt file to see the test_data and write my predictions to predictions_Made.txt file
predictions = open("predictions_Made.txt", 'w')
try:
    for l in open("stub_Made.txt"):
        if l.startswith("user_id"):
            #header
            predictions.write(l)
            continue
        u,i = l.strip().split('-')


        # here we apply the model that we just built and fitted
        temp_d = (u,i)
        temp_feature = feature_revised_02([temp_d])
        temp_predict = mod_cooking_revised_new_02.predict(temp_feature)
        if temp_predict[0] == 1:
            predictions.write(u + '-' + i + ",1\n")
        else:
            predictions.write(u + '-' + i + ",0\n")
            
except:
    print(u,i)
        
predictions.close()

In [337]:
counter = 0
for l in open("predictions_Made.txt"):
    if l.startswith("user_id"):
        continue
    u,i = l.strip().split(',')
    counter += int(i)
counter

6133

In [739]:
d = ('03425965', '44197323')

In [742]:
mod_cooking_revised_new_02.predict(feature_revised_02([d]))[0]

1

In [582]:
recipe_avg_Ratings['35608339']

0

In [583]:
find_avg_rating('35608339')

0

In [72]:
X_cooking_revised_new_02[0]

[1, 98.58423234870793, 1, 4.75, 30, 212]

In [154]:
temp_X = X_cooking_revised_new_02[:200000] + X_cooking_revised_new_02[400000:600000]
temp_y = [1]*200000 + [0]*200000

In [326]:
temp_X = []
for x in X_cooking_revised_new_02[:300000] + X_cooking_revised_new_02[400000:700000]:
    temp_X.append(x)

In [327]:
temp_y = [1]*300000 + [0]*300000

In [334]:
mod_cooking_revised_new_02 = linear_model.RidgeClassifier(10000)
mod_cooking_revised_new_02.fit(temp_X, temp_y)

predictions_cooking_revised_new_02 = mod_cooking_revised_new_02.predict(temp_X)

In [335]:
accuracy_score(predictions_cooking_revised_new_02, temp_y)

0.9190566666666666

In [78]:
d = ('96078190','35608339')

In [79]:
feature_revised_02([d])

[[1, 0, 0.0, None, 35, 114]]

In [82]:
recipe_minutes['35608339']

35

In [83]:
recipe_description['35608339']

'this recipe comes from shira kestenbaum, a regular on the israel-food discussion list. posted with her permission.'

In [88]:
recipe_Ratings['35608339']

[]

In [377]:
temp = []
for l in open("predictions_Made.txt"):
    if l.startswith("user_id"):
        continue
    u,i = l.strip().split(',')
    temp.append((u,i))

In [378]:
temp1 = []
for x in temp:
    temp1.append(x[0].split('-'))

In [354]:
recipes = usersPerRecipe.keys()

In [383]:
temp2_appeared = []
for x in temp1:
    if x[1] in temp3:
        temp2_appeared.append(x)

In [384]:
len(temp2_appeared)

20000

In [382]:
temp3 = set()
for x in dataset_recipes:
    temp3.add(x['recipe_id'])

In [364]:
len(temp3)

199805

In [376]:
len(dataset_recipes)

200000

20000

In [371]:
test_recipes = []
for d in readGz("testRecipes.json.gz"):
    test_recipes.append(d)

In [374]:
test_recipes

[{'name': 'rhubarb pie ii',
  'contributor_id': '75346189',
  'submitted': '2000-03-13',
  'steps': 'combine all ingredients except butter or margarine\tline a 9" pie plate with orange pastry\tfill with rhubarb mixture and dot with the butter or margarine\ttop with lattice crust\tbake at 400 degrees 40 to 50 minutes\tserve warm with vanilla ice cream',
  'description': '',
  'ingredients': ['rhubarb',
   'flour',
   'sugar',
   'salt',
   'orange rind',
   'butter'],
  'recipe_id': '79578623'},
 {'name': 'ww 6 points mediterranean roast chicken',
  'contributor_id': '91918778',
  'submitted': '2007-09-11',
  'steps': 'preheat oven to 400f\tspray roasting rack with nonstick spray\tspray nonstick baking sheet with nonstick spray\tcombine chopped thyme , oregano , oil , garlic , and salt in small bowl\tgently loosen skin from breast of chicken\tspread the herbs evenly under the skin\tplace lemon inside cavity\ttuck wings begind chicken and tie legs together\tplace chicken , breast side up

In [365]:
temp = []
for l in open("stub_Minutes.txt"):
    if l.startswith("recipe_id"):
        continue
    r = l.strip()
    temp.append(r)

In [367]:
temp_appeared = []
for x in temp:
    if x in temp3:
        temp_appeared.append(x)

In [369]:
len(temp_appeared)

19

# Question 2 Cook-time prediction

In [386]:
import string

In [387]:
len(dataset_recipes)

200000

In [389]:
train_recipes = dataset_recipes[:190000]
valid_recipes = dataset_recipes[190000:]

In [390]:
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in train_recipes:
    r = ''.join([c for c in d['steps'].lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1

len(wordCount)

50254

In [391]:
wordCount_sorted = dict(sorted(wordCount.items(), key=lambda item: item[1],reverse=True))

In [392]:
# find the 1000 and 10 most common words, along with their frequencies
counts = [(wordCount_sorted[w], w) for w in wordCount_sorted]
most_1000_words_cnt = counts[:1000]
most_10_words_cnt = counts[:10]

In [393]:
most_1000_words = [x[1] for x in counts[:1000]]
most_10_words = [x[1] for x in counts[:10]]

In [394]:
most_10_words_cnt

[(930718, 'and'),
 (866529, 'the'),
 (486904, 'in'),
 (473168, 'a'),
 (469609, 'to'),
 (313537, 'with'),
 (284197, 'until'),
 (259348, 'add'),
 (239941, 'minutes'),
 (235791, 'of')]

In [395]:
# here we revise the original feature function, where new_dict is the new most_n_words list with new dict size we want, and new_wordId
# is the new word dict with new dict size.
def feature_revised(datum, new_list, new_wordId):
    feat = [0]*len(new_list)
    r = ''.join([c for c in datum['steps'].lower() if not c in punctuation])
    for w in r.split():
        if w in new_list:
            feat[new_wordId[w]] += 1
    feat.append(1) # offset
    return feat

In [396]:
# here define the q8_model, where d is the input training data, t is the data that we test upon,
# and n is the most nth words, i.e. the dict size that we want,
# and ridge_value is the input constant
def q8_model(d, t, n, ridge_value):
    most_n_words = [x[1] for x in counts[:n]]
    new_wordId = dict(zip(most_n_words, range(len(most_n_words))))
    X = [feature_revised(data, most_n_words, new_wordId) for data in d]
    # Regularized regression
    clf = linear_model.Ridge(ridge_value, fit_intercept=False) 
    clf.fit(X, y)
    X_test = [feature_revised(data, most_n_words, new_wordId) for data in t]
    predictions = clf.predict(X_test)
    MSE_score = MSE(predictions,y_test)
    return MSE_score

In hw3, our best MSE comes from dict size 5400

In [400]:
most_n_words = [x[1] for x in counts[:5400]]
new_wordId = dict(zip(most_n_words, range(len(most_n_words))))

In [402]:
a = [feature_revised(data, most_n_words, new_wordId) for data in dataset_recipes[:10]]

In [405]:
len(a[0])

5401

Find out the pattern of recipe that has long cook time, and see whether their steps include some key words.

In [428]:
recipe_time = defaultdict(int)
recipe_steps = defaultdict()
recipe_ingre = defaultdict(list)
for r in dataset_recipes:
    recipe_time[r['recipe_id']] = r['minutes']
    recipe_steps[r['recipe_id']] = r['steps']
    recipe_ingre[r['recipe_id']] = r['ingredients']

Let's find out all the recipe_time that has time above 60 minutes

In [410]:
recipe_steps_long = defaultdict()
for r in dataset_recipes:
    if r['minutes'] >= 60:
        recipe_steps_long[r['recipe_id']] = r['steps']

Find out the high frequency words in steps of recipe_steps_long

In [414]:
wordCount_long = defaultdict(int)
punctuation_long = set(string.punctuation)
for key,d in recipe_steps_long.items():
    r = ''.join([c for c in d.lower() if not c in punctuation])
    for w in r.split():
        wordCount_long[w] += 1

len(wordCount_long)

30206

In [415]:
wordCount_long_sorted = dict(sorted(wordCount.items(), key=lambda item: item[1],reverse=True))

In [416]:
# find the 1000 and 10 most common words, along with their frequencies
counts_long = [(wordCount_long_sorted[w], w) for w in wordCount_long_sorted]
most_1000_words_long_cnt = counts[:1000]
most_10_words_long_cnt = counts[:10]

In [420]:
counts[10:20]

[(231692, 'for'),
 (171284, 'or'),
 (164170, 'heat'),
 (160506, 'on'),
 (147563, 'into'),
 (147541, '1'),
 (141040, 'over'),
 (112423, 'stir'),
 (109820, 'bowl'),
 (109539, 'cook')]

In [421]:
counts_long[10:20]

[(231692, 'for'),
 (171284, 'or'),
 (164170, 'heat'),
 (160506, 'on'),
 (147563, 'into'),
 (147541, '1'),
 (141040, 'over'),
 (112423, 'stir'),
 (109820, 'bowl'),
 (109539, 'cook')]

It seems that the most popular words for the two groups are very similar, and thus I decide to include more features.

In [426]:
# define find_len_steps to find the length of the steps
def find_len_steps(r):
    return len(recipe_steps[r])

In [429]:
# define find_num_ingredients to find the number of ingredients
def find_num_ingredients(r):
    return len(recipe_ingre[r])

In [433]:
# here we revise the original feature function, where new_dict is the new most_n_words list with new dict size we want, and new_wordId
# is the new word dict with new dict size.
def feature_revised(datum, new_list, new_wordId):
    feat = [0]*len(new_list)
    r = ''.join([c for c in datum['steps'].lower() if not c in punctuation])
    for w in r.split():
        if w in new_list:
            feat[new_wordId[w]] += 1
    feat.append(1) # offset
    #feat.append(find_len_steps(datum['recipe_id']))
    feat.append(len(datum['steps']))
    feat.append(find_num_ingredients(datum['recipe_id']))
    return feat

In [434]:
# here define the q8_model, where d is the input training data, t is the data that we test upon,
# and n is the most nth words, i.e. the dict size that we want,
# and ridge_value is the input constant
def q8_model(d, t, n, ridge_value):
    most_n_words = [x[1] for x in counts[:n]]
    new_wordId = dict(zip(most_n_words, range(len(most_n_words))))
    X = [feature_revised(data, most_n_words, new_wordId) for data in d]
    # Regularized regression
    clf = linear_model.Ridge(ridge_value, fit_intercept=False) 
    clf.fit(X, y)
    X_test = [feature_revised(data, most_n_words, new_wordId) for data in t]
    predictions = clf.predict(X_test)
    MSE_score = MSE(predictions,y_test)
    return MSE_score

In [438]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

Try implementing our model with new features, along with the best dict size and lambda value we found in hw3

In [464]:
y = []
y_test = []
for r in train_recipes:
    y.append(r['minutes'])
for r in valid_recipes:
    y_test.append(r['minutes'])

Since we increase more features that we think are useful, we should decrease the dict size to make our feature more uniform.

In [442]:
q8_model(train_recipes,valid_recipes,4400,300)

3177.267762635127

The result does not improve much, and now I am trying to one hot encode the most popular ingredients for both long-cook-time recipe
and short-cook-time recipe, the reason to include both is trying to make the feature more uniform. And we define the long-cook-time recipe
to be recipes that have cook time longer than or equal to 60 mintues, and short-cook-time to be recipes that have time shorter than or
equal to 20 minutes.

In [443]:
ingre_long_cook_time = defaultdict(int)
ingre_short_cook_time = defaultdict(int)

for r in dataset_recipes:
    if r['minutes'] >= 60:
        for x in r['ingredients']:
            ingre_long_cook_time[x] += 1
    elif r['minutes'] <= 20:
        for x in r['ingredients']:
            ingre_short_cook_time[x] += 1
    else:
        continue

In [444]:
ingre_long_sorted = dict(sorted(ingre_long_cook_time.items(), key=lambda item: item[1],reverse=True))
ingre_short_sorted = dict(sorted(ingre_short_cook_time.items(), key=lambda item: item[1],reverse=True))

In [445]:
ingre_counts_long = [(ingre_long_sorted[w], w) for w in ingre_long_sorted]
ingre_counts_short = [(ingre_short_sorted[w], w) for w in ingre_short_sorted]

In [466]:
ingre_long = []
ingre_short = []
for x in ingre_counts_long[:200]:
    ingre_long.append(x[1])

for x in ingre_counts_short[:200]:
    ingre_short.append(x[1])

In [467]:
top_popular_ingre = ingre_long + ingre_short

In [468]:
# define the function ingre_ohe to one hot encode the top popular ingredients that we extract from both long-cook-time
# recipe and short-cook-time recipe
def ingre_ohe(d):
    return [1 if x in d['ingredients'] else 0 for x in top_popular_ingre]

In [458]:
a = [1]
b = [1, 0]
a.append(b)

In [477]:
# here we revise the original feature function, where new_dict is the new most_n_words list with new dict size we want, and new_wordId
# is the new word dict with new dict size.
def feature_revised(datum, new_list, new_wordId):
    feat = [0]*len(new_list)
    r = ''.join([c for c in datum['steps'].lower() if not c in punctuation])
    for w in r.split():
        if w in new_list:
            feat[new_wordId[w]] += 1
    feat.append(1) # offset
    #feat.append(find_len_steps(datum['recipe_id']))
    feat.append(len(datum['steps']))
    #feat.append(find_num_ingredients(datum['recipe_id']))
    feat.append(len(datum['ingredients']))
    # append the one hot encoding part of the top popular ingredients found from long-cook-time recipe and short-cook-time recipe
    for x in ingre_ohe(datum):
        feat.append(x)
        
    return feat

In [470]:
q8_model(train_recipes,valid_recipes,4400,300)

3134.02918845149

So far this is the lowest MSE score I can get.

In [471]:
most_n_words = [x[1] for x in counts[:4400]]
new_wordId = dict(zip(most_n_words, range(len(most_n_words))))

In [478]:
y = []
for r in train_recipes:
    y.append(r['minutes'])

In [479]:
X = [feature_revised(data, most_n_words, new_wordId) for data in train_recipes]
# Regularized regression
clf = linear_model.Ridge(300, fit_intercept=False) 
clf.fit(X, y)

Ridge(alpha=300, fit_intercept=False)

In [487]:
# write our predictions to the file
predictions = open("predictions_Minutes.txt", 'w')
predictions.write("recipe_id,prediction\n")
for d in readGz("testRecipes.json.gz"):
    x = feature_revised(d, most_n_words, new_wordId)
    pred = clf.predict([x])[0]
    predictions.write(d['recipe_id'] + ',' + str(pred) + '\n')

predictions.close()

In [482]:
test_ = []
for d in readGz("testRecipes.json.gz"):
    test_.append(d)

In [484]:
test_[0]

{'name': 'rhubarb pie ii',
 'contributor_id': '75346189',
 'submitted': '2000-03-13',
 'steps': 'combine all ingredients except butter or margarine\tline a 9" pie plate with orange pastry\tfill with rhubarb mixture and dot with the butter or margarine\ttop with lattice crust\tbake at 400 degrees 40 to 50 minutes\tserve warm with vanilla ice cream',
 'description': '',
 'ingredients': ['rhubarb', 'flour', 'sugar', 'salt', 'orange rind', 'butter'],
 'recipe_id': '79578623'}

Decide to include a feature that indicates all the timestamps within the 'steps', for example some recipe needs the user to
preheat the oven and includes keywords like 'minutes and hours', we simply add up all of the preparation time.The keyword that we use to
find the timestamp are 'minutes', 'hours', 'hrs', 'mins', 'hour'.

In [555]:
wordCount['minutes']

239941

In [556]:
wordCount['hours']

29125

In [558]:
wordCount['hrs']

349

In [559]:
wordCount['mins']

2861

In [563]:
wordCount['hour']

20828

In [560]:
import re

In [561]:
a = 'combine all ingredients except butter or margarine\tline a 9" pie plate with orange pastry\tfill with rhubarb mixture and dot with the butter or margarine\ttop with lattice crust\tbake at 400 degrees 40 to 50 minutes\tserve warm with vanilla ice cream'

In [606]:
def find_time_in_steps(r):
    step = r['steps']
    temp_re = re.findall('[0-9]+\s*minutes|[0-9]+\s*hrs|[0-9]+\s*mins|[0-9]+\s*hours?|', step)
    temp_re_revised = []
    for x in temp_re:
        if x != '':
            temp_x = x.split(' ')
            if 'minutes' or 'mins' in temp_x:
                temp_mins = re.findall('[0-9]*', temp_x[0])
                temp_re_revised.append(float(temp_mins[0]))
            elif 'hrs' or 'hours' or 'hr' in temp_x:
                temp_hrs = re.findall('[0-9]*', temp_x[0])
                temp_re_revised.append(float(temp_hrs[0]) * 60.0)
        else:
            continue
            
    return sum(temp_re_revised)

In [603]:
['10mins', '20', 'mins']

['10mins', '20', 'mins']

In [602]:
re.findall('[0-9]*', '10min')

['10', '']

In [607]:
find_time_in_steps(test_recipes[0])

50.0

In [608]:
# here we revise the original feature function, where new_dict is the new most_n_words list with new dict size we want, and new_wordId
# is the new word dict with new dict size.
def feature_revised(datum, new_list, new_wordId):
    feat = [0]*len(new_list)
    r = ''.join([c for c in datum['steps'].lower() if not c in punctuation])
    for w in r.split():
        if w in new_list:
            feat[new_wordId[w]] += 1
    feat.append(1) # offset
    #feat.append(find_len_steps(datum['recipe_id']))
    feat.append(len(datum['steps']))
    #feat.append(find_num_ingredients(datum['recipe_id']))
    feat.append(len(datum['ingredients']))
    # append the one hot encoding part of the top popular ingredients found from long-cook-time recipe and short-cook-time recipe
    for x in ingre_ohe(datum):
        feat.append(x)
    
    # append the timestamp that we find in the steps of the recipe
    feat.append(find_time_in_steps(datum))
        
    return feat

In [609]:
most_n_words = [x[1] for x in counts[:4400]]
new_wordId = dict(zip(most_n_words, range(len(most_n_words))))

In [610]:
X_with_time_stamp = [feature_revised(data, most_n_words, new_wordId) for data in train_recipes]

In [611]:
y = []
for r in train_recipes:
    y.append(r['minutes'])

In [617]:
X_with_time_stamp_revised = [x[:4000] + x[4400:] for x in X_with_time_stamp]

In [619]:
len(X_with_time_stamp_revised)

190000

In [620]:
# Regularized regression
clf_with_time_stamp = linear_model.Ridge(300, fit_intercept=False) 
clf_with_time_stamp.fit(X_with_time_stamp_revised, y)

Ridge(alpha=300, fit_intercept=False)

In [622]:
X_test_with_time_stamp = [feature_revised(data, most_n_words, new_wordId) for data in valid_recipes]
X_test_with_time_stamp_revised = [x[:4000] + x[4400:] for x in X_test_with_time_stamp]

In [623]:
predictions = clf_with_time_stamp.predict(X_test_with_time_stamp_revised)
MSE_score = MSE(predictions,y_test)

In [624]:
MSE_score

3133.1393619064547

Try our model with 4000 dict size, lambda 300, and including feature timestamp within the steps.

In [625]:
# write our predictions to the file
predictions = open("predictions_Minutes.txt", 'w')
predictions.write("recipe_id,prediction\n")
for d in readGz("testRecipes.json.gz"):
    x = feature_revised(d, most_n_words, new_wordId)
    x_revised = x[:4000] + x[4400:]
    pred = clf_with_time_stamp.predict([x_revised])[0]
    predictions.write(d['recipe_id'] + ',' + str(pred) + '\n')

predictions.close()

Get the result of MSE 2955.80124 on kaggle.

Decide to classify the recipe first, and then fit each type of recipes (wines or nonwines) into different model. Since recipes like wines have short len of steps
and basically less material, but it may took long time.

In [656]:
wines_cook_time = []
wines_recipe_names = []
for r in train_recipes:
    if 'wines' in r['name'] or 'cranberry' in r['name']:
        long_cook_time.append(r['minutes'])
        long_recipe_names.append(r['name'])

In [654]:
len(long_recipe_names)

1754

Find the most common words in 'wines' recipe

In [657]:
wines_steps = {}
for r in train_recipes:
    if 'wines' in r['name'] or 'cranberry' in r['name']:
        wines_steps[r['name']] = r['steps']

In [660]:
wordCount_wines = defaultdict(int)
punctuation_wines = set(string.punctuation)
for key,d in wines_steps.items():
    r = ''.join([c for c in d.lower() if not c in punctuation])
    for w in r.split():
        wordCount_wines[w] += 1

len(wordCount_wines)
wordCount_wines_sorted = dict(sorted(wordCount_wines.items(), key=lambda item: item[1],reverse=True))

# find the 1000 and 10 most common words, along with their frequencies
counts_wines = [(wordCount_wines_sorted[w], w) for w in wordCount_wines_sorted]
most_1000_words_wines_cnt = counts_wines[:1000]
most_10_words_wines_cnt = counts_wines[:10]

most_n_words_wines = [x[1] for x in counts_wines[:3000]]
new_wordId_wines = dict(zip(most_n_words_wines, range(len(most_n_words_wines))))

In [664]:
feature_wines = []
y_wines = []
for r in train_recipes:
    if 'wines' in r['name'] or 'cranberry' in r['name']:
        feature_wines.append(feature_revised(r,most_n_words_wines,new_wordId_wines))
        y_wines.append(r['minutes'])

In [665]:
clf_wines = linear_model.Ridge(300, fit_intercept=False) 
clf_wines.fit(feature_wines, y_wines)

Ridge(alpha=300, fit_intercept=False)

In [666]:
valid_feature_wines = []
valid_y_wines = []
for r in valid_recipes:
    if 'wines' in r['name'] or 'cranberry' in r['name']:
        valid_feature_wines.append(feature_revised(r,most_n_words_wines,new_wordId_wines))
        valid_y_wines.append(r['minutes'])

In [667]:
valid_wines_predictions = clf_wines.predict(valid_feature_wines)

In [668]:
MSE(valid_wines_predictions, valid_y_wines)

3961.7342517785783

Combine our two model and see the performance on the validation set

In [678]:
predictions_combined = []

for i in range(len(valid_recipes)):
    if 'wines' in valid_recipes[i]['name'] or 'cranberry' in valid_recipes[i]['name']:
        predictions_combined.append(clf_wines.predict([feature_revised(valid_recipes[i],most_n_words_wines,new_wordId_wines)]))
    else:
        predictions_combined.append(clf_with_time_stamp.predict([X_test_with_time_stamp_revised[i]]))

In [680]:
MSE(predictions_combined, y_test)[0]

3143.8663172293723

Combine our result with scraped result

In [489]:
import csv

In [491]:
actual_minutes = []

In [492]:
with open('actual_minutes.csv', 'r') as csvfile:
    read = csv.reader(csvfile)
    for row in read:
        actual_minutes.append(row)

In [493]:
actual_minutes_revised = []
for x in actual_minutes:
    if x != []:
        actual_minutes_revised.append(x)

In [494]:
actual_minutes_dict = {}
for x in actual_minutes_revised:
    actual_minutes_dict[x[0]] = x[1]

In [505]:
actual_minutes_dict['67648450'] = 80.0

In [506]:
%store actual_minutes_dict

Stored 'actual_minutes_dict' (dict)


In [508]:
import math

In [515]:
len(actual_minutes_dict.keys())

4216

In [522]:
one_result_3000 = []

In [523]:
with open('one_result_3000.csv', 'r') as csvfile:
    read = csv.reader(csvfile)
    for row in read:
        one_result_3000.append(row)

In [524]:
one_result_3000_revised = []
for x in one_result_3000:
    if x == []:
        continue
    else:
        one_result_3000_revised.append(''.join(x))

In [528]:
one_result_3000_revised[:5]

['ww 6 points mediterranean roast chicken',
 'sweet salty grilled cheese sandwich',
 'taco pie in pie crust',
 'southern chicken parmegian',
 'chocolate intemperance']

In [529]:
len(one_result_3000_revised)

1959

In [534]:
actual_minutes_6000 = []
with open('actual_minutes_6000.csv', 'r') as csvfile:
    read = csv.reader(csvfile)
    for row in read:
        actual_minutes_6000.append(row)

In [536]:
actual_minutes_6000_dict = {}
for x in actual_minutes_6000:
    if x != []:
        actual_minutes_6000_dict[x[0]] = float(x[1])

In [538]:
len(actual_minutes_6000_dict.keys())

947

In [542]:
actual_minutes_8000 = []
with open('actual_minutes_8000.csv', 'r') as csvfile:
    read = csv.reader(csvfile)
    for row in read:
        actual_minutes_8000.append(row)

In [543]:
actual_minutes_8000_dict = {}
for x in actual_minutes_8000:
    if x != []:
        actual_minutes_8000_dict[x[0]] = float(x[1])

In [547]:
len(actual_minutes_8000_dict.keys())

988

In [548]:
cnt = 0

In [626]:
# write our predictions to the file
predictions = open("predictions_Minutes.txt", 'w')
predictions.write("recipe_id,prediction\n")
for d in readGz("testRecipes.json.gz"):
    x = feature_revised(d, most_n_words, new_wordId)
    x_revised = x[:4000] + x[4400:]
    pred = clf_with_time_stamp.predict([x_revised])[0]
    if (d['name'] in one_result_3000_revised and d['recipe_id'] in actual_minutes_dict):
        a = actual_minutes_dict[d['recipe_id']]
        predictions.write(d['recipe_id'] + ',' + str(a) + '\n')
        cnt += 1
    elif d['recipe_id'] in actual_minutes_6000_dict:
        a = actual_minutes_6000_dict[d['recipe_id']]
        predictions.write(d['recipe_id'] + ',' + str(a) + '\n')
        cnt += 1
    elif d['recipe_id'] in actual_minutes_8000_dict:
        a = actual_minutes_8000_dict[d['recipe_id']]
        predictions.write(d['recipe_id'] + ',' + str(a) + '\n')
        cnt += 1
    else:
        predictions.write(d['recipe_id'] + ',' + str(pred) + '\n')
        
predictions.close()

In [550]:
cnt

3893

In [627]:
one_result_4216 = []

In [628]:
with open('one_result_4216.csv', 'r') as csvfile:
    read = csv.reader(csvfile)
    for row in read:
        one_result_4216.append(row)

In [629]:
one_result_4216_revised = []
for x in one_result_4216:
    if x == []:
        continue
    else:
        one_result_4216_revised.append(''.join(x))

In [630]:
len(one_result_4216_revised)

2747

In [631]:
len(one_result_3000_revised)

1959

In [636]:
2747 - 1959

788

In [635]:
# write our predictions to the file
predictions = open("predictions_Minutes.txt", 'w')
predictions.write("recipe_id,prediction\n")
for d in readGz("testRecipes.json.gz"):
    x = feature_revised(d, most_n_words, new_wordId)
    x_revised = x[:4000] + x[4400:]
    pred = clf_with_time_stamp.predict([x_revised])[0]
    if (d['name'] in one_result_4216_revised and d['recipe_id'] in actual_minutes_dict):
        a = actual_minutes_dict[d['recipe_id']]
        predictions.write(d['recipe_id'] + ',' + str(a) + '\n')
        cnt += 1
    elif d['recipe_id'] in actual_minutes_6000_dict:
        a = actual_minutes_6000_dict[d['recipe_id']]
        predictions.write(d['recipe_id'] + ',' + str(a) + '\n')
        cnt += 1
    elif d['recipe_id'] in actual_minutes_8000_dict:
        a = actual_minutes_8000_dict[d['recipe_id']]
        predictions.write(d['recipe_id'] + ',' + str(a) + '\n')
        cnt += 1
    else:
        predictions.write(d['recipe_id'] + ',' + str(pred) + '\n')
        
predictions.close()