In [75]:
import json
from matplotlib import pyplot as plt
from collections import defaultdict
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import numpy
import random
import gzip
import math
import sklearn
import pandas
import ast
import csv

In [76]:
import warnings
warnings.filterwarnings("ignore")

In [77]:
filePath = "recipes.csv"
recipes = pandas.read_csv(filePath)

recipesDataset = {}
for _, row in recipes.iterrows():
    recipesDataset[row["id"]] = {
        "name": row["name"],
        "minutes": row["minutes"],
        "contributor_id": row["contributor_id"],
        "submitted": row["submitted"],
        "tags": row["tags"],
        "nutrition": row["nutrition"],
        "n_steps": row["n_steps"],
        "steps": row["steps"],
        "description": row["description"],
        "ingredients": row["ingredients"],
        "n_ingredients": row["n_ingredients"]
    }

In [78]:
for recipe_id, attributes in list(recipesDataset.items())[:1]:
    print(f"Recipe ID: {recipe_id}")
    print(json.dumps(attributes, indent=4))
    print()

Recipe ID: 137739
{
    "name": "arriba   baked winter squash mexican style",
    "minutes": 55,
    "contributor_id": 47892,
    "submitted": "2005-09-16",
    "tags": "['60-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'cuisine', 'preparation', 'occasion', 'north-american', 'side-dishes', 'vegetables', 'mexican', 'easy', 'fall', 'holiday-event', 'vegetarian', 'winter', 'dietary', 'christmas', 'seasonal', 'squash']",
    "nutrition": "[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",
    "n_steps": 11,
    "steps": "['make a choice and proceed with recipe', 'depending on size of squash , cut into half or fourths', 'remove seeds', 'for spicy squash , drizzle olive oil or melted butter over each cut squash piece', 'season with mexican seasoning mix ii', 'for sweet squash , drizzle melted honey , butter , grated piloncillo over each cut squash piece', 'season with sweet mexican spice mix', 'bake at 350 degrees , again depending on size , for 40 minutes up to an hour , until a fork

In [79]:
filePath = "interactions.csv"
interactions = pandas.read_csv(filePath)

interactionsDataset = []
for _, row in interactions.iterrows():
    interactionsDataset.append({
        "user_id": row["user_id"],
        "recipe_id": row["recipe_id"],
        "date": row["date"],
        "rating": row["rating"],
        "review": row["review"],
    })

In [80]:
print(json.dumps(interactionsDataset[0], indent=4))

{
    "user_id": 38094,
    "recipe_id": 40893,
    "date": "2003-02-17",
    "rating": 4,
    "review": "Great with a salad. Cooked on top of stove for 15 minutes.Added a shake of cayenne and a pinch of salt.  Used low fat sour cream.  Thanks."
}


In [81]:
allRatings = [
    (entry["user_id"], entry["recipe_id"], entry["rating"])
    for entry in interactionsDataset
]

print(allRatings[1])
print(len(allRatings))

(1293707, 40893, 5)
1132367


In [82]:
ratingsTrain = allRatings[:950000]
ratingsValid = allRatings[950000:1000000]
ratingsTest = allRatings[1000000:]
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
for u,b,r in ratingsTrain:
    ratingsPerUser[u].append((b,r))
    ratingsPerItem[b].append((u,r))

In [83]:
filename = "pairs_Rating.csv"

with open(filename, mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["userID", "recipeID", "prediction"])
    for u, r, _ in ratingsTest:
        writer.writerow([u, r, ""])

In [84]:
betaU = {u: numpy.random.normal(0, 0.01) for u in ratingsPerUser}
betaI = {b: numpy.random.normal(0, 0.01) for b in ratingsPerItem}

ratings = numpy.array([r for _, _, r in ratingsTrain])
alpha = numpy.mean(ratings)

In [85]:
def iterate(lamb):
    newAlpha = 0
    for u,b,r in ratingsTrain:
        newAlpha += r - (betaU[u] + betaI[b])
    alpha = newAlpha / len(ratingsTrain)
    for u in ratingsPerUser:
        newBetaU = 0
        for b,r in ratingsPerUser[u]:
            newBetaU += r - (alpha + betaI[b])
        betaU[u] = newBetaU / (lamb + len(ratingsPerUser[u]))
    for b in ratingsPerItem:
        newBetaI = 0
        for u,r in ratingsPerItem[b]:
            newBetaI += r - (alpha + betaU[u])
        betaI[b] = newBetaI / (lamb + len(ratingsPerItem[b]))
    mse = 0
    for u,b,r in ratingsTrain:
        prediction = alpha + betaU[u] + betaI[b]
        mse += (r - prediction) ** 2
    regularizer = 0
    for u in betaU:
        regularizer += betaU[u] ** 2
    for b in betaI:
        regularizer += betaI[b] ** 2
    mse /= len(ratingsTrain)
    return mse, mse + lamb * regularizer

In [86]:
mse, objective = iterate(1)
newMSE, newObjective = iterate(1)

while objective - newObjective > 0.00001:
    mse, objective = newMSE, newObjective
    newMSE, newObjective = iterate(4.7)

In [101]:
predicted = []

predictions = open("predictions_Rating.csv", 'w')
for l in open("pairs_Rating.csv"):
    if l.startswith("userID"): # header
        predictions.write(l)
        continue
    parts = l.strip().split(',')
    u, b = int(parts[0]), int(parts[1])

    wu, wi = 1.1, 0.8
    
    bu = 0
    bi = 0
    if u in betaU:
        bu = betaU[u]
    if b in betaI:
        bi = betaI[b]
    prediction = alpha + wu * bu + wi * bi

    if prediction > 5: prediction = 5
    predicted.append(prediction)
    predictions.write(f"{u},{b},{prediction}\n")

predictions.close()

In [102]:
actual = [rating for _, _, rating in ratingsTest]
mse = mean_squared_error(actual, predicted)

print(mse)

1.4933149453451318


In [89]:
!jupyter nbconvert --to python assignment2.ipynb

[NbConvertApp] Converting notebook assignment2.ipynb to python
[NbConvertApp] Writing 4434 bytes to assignment2.py
