In [None]:
import ast

import numpy as np
import pandas as pd
import sklearn
import sklearn.linear_model

In [None]:
# Interaction tables, one CSV per split.
interactions_train = pd.read_csv('interactions_train.csv')
interactions_validation = pd.read_csv('interactions_validation.csv')
interactions_test = pd.read_csv('interactions_test.csv')

# Pre-processed recipe and user tables: the 'i' / 'u' column becomes the
# index and rows are sorted by it.
PP_recipes = (
    pd.read_csv('PP_recipes.csv')
    .set_index('i')
    .sort_index()
)
PP_users = (
    pd.read_csv('PP_users.csv')
    .set_index('u')
    .sort_index()
)

# Raw tables, loaded unchanged.
RAW_recipes = pd.read_csv('RAW_recipes.csv')
RAW_interactions = pd.read_csv('RAW_interactions.csv')

In [None]:
# Count the non-null 'rating' entries recorded for each recipe_id in the raw
# interactions; the result is a Series indexed by recipe_id.
recipe_review_counts = (
    RAW_interactions
    .groupby('recipe_id')['rating']
    .count()
)
recipe_review_counts

In [None]:
# Show the dtype of every PP_recipes column.
PP_recipes.dtypes

In [None]:
# Number of ratings for each recipe, looked up by the recipe's 'id' column.
# Label-based .loc raises KeyError if an id has no entry in
# recipe_review_counts, matching a plain per-row lookup.
PP_recipes['n_reviewers'] = recipe_review_counts.loc[PP_recipes['id'].to_numpy()].to_numpy()

# These columns are read from CSV as string literals (e.g. "[1, 2, 3]").
# Parse them into Python objects; cells that are no longer strings are left
# alone so that re-running this cell does not fail inside ast.literal_eval.
for list_column in ('name_tokens', 'ingredient_tokens', 'steps_tokens', 'techniques', 'ingredient_ids'):
    PP_recipes[list_column] = PP_recipes[list_column].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )

# n_steps is the length of the parsed steps_tokens value of each recipe.
# (Each element handed to apply is the parsed value itself, so it must be
# measured directly rather than indexed by column name.)
PP_recipes['n_steps'] = PP_recipes['steps_tokens'].apply(len)

PP_recipes

In [None]:
# These PP_users columns are read from CSV as string literals. Parse them
# into Python objects; cells that are no longer strings are left alone so
# that re-running this cell does not fail inside ast.literal_eval.
for list_column in ('techniques', 'items', 'ratings'):
    PP_users[list_column] = PP_users[list_column].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )

PP_users

In [None]:
# Show the dtype of every PP_recipes column again; this cell sits after the
# cells that parse the list columns and add n_reviewers / n_steps.
PP_recipes.dtypes

In [None]:
def Jaccard(s1, s2):
    """Jaccard similarity of two sets: |s1 ∩ s2| / |s1 ∪ s2|.

    Returns 0 when the union is empty (both sets empty).
    """
    union_size = len(s1.union(s2))
    if union_size <= 0:
        return 0
    shared_size = len(s1.intersection(s2))
    return shared_size / union_size

def MSE(predictions, y):
    """Mean squared error between paired predictions and targets.

    The two inputs are paired with zip, so any surplus elements of the longer
    one are ignored. Empty input raises ZeroDivisionError.
    """
    squared_errors = []
    for predicted, actual in zip(predictions, y):
        squared_errors.append((predicted - actual) ** 2)
    return sum(squared_errors) / len(squared_errors)

def euclidean_distance(arr1, arr2):
    """Euclidean (L2) distance between two 1-D numeric arrays.

    The squared differences are added up with Python's built-in sum, which
    folds over the first axis only, so the result is a scalar for 1-D input.
    """
    offsets = arr1 - arr2
    squared_offsets = offsets ** 2
    return np.sqrt(sum(squared_offsets))

def normalize(arr):
    """Min-max scale ``arr`` so its smallest value maps to 0 and largest to 1.

    Parameters
    ----------
    arr : array-like of numbers (e.g. a numpy array or pandas Series).

    Returns
    -------
    The element-wise result of ``(arr - min) / (max - min)``. When every
    element is equal (max == min) the scaling is undefined (0/0); zeros of
    the same shape are returned instead of NaN.
    """
    # Local names deliberately avoid shadowing the built-ins min() / max().
    lowest = np.min(arr)
    highest = np.max(arr)
    span = highest - lowest

    if span == 0:
        # Constant input: every element sits at the minimum, so map all to 0.
        return (arr - lowest) * 0.0

    return (arr - lowest) / span

In [None]:
# For each split, X is the interaction table without its 'rating' column and
# y is the 'rating' column on its own.
train_X, train_y = interactions_train.drop(columns='rating'), interactions_train['rating']
validation_X, validation_y = interactions_validation.drop(columns='rating'), interactions_validation['rating']
test_X, test_y = interactions_test.drop(columns='rating'), interactions_test['rating']

In [None]:
# Display the PP_recipes table.
PP_recipes

In [None]:
# Display the PP_users table.
PP_users

In [None]:
# Baseline 1: Always Predict Global Average
# The mean training rating is used as the prediction for every test row.

global_average = interactions_train['rating'].mean()
print('Mean Squared Error for Baseline 1 (Always Predict Global Average):', MSE([global_average] * test_y.shape[0], test_y))

# Baseline 2: Regression With Two Predictors
# Predictors are the recipe's calorie_level and the user's n_items.

def baseline_features(X):
    """Return one [calorie_level, n_items] row per interaction in ``X``.

    ``X`` must have integer columns 'i' and 'u'; they are used as row
    positions into PP_recipes and PP_users respectively. Each column is
    gathered in a single positional lookup rather than one row at a time.
    """
    calorie_levels = PP_recipes['calorie_level'].iloc[X['i'].to_numpy()].to_numpy()
    user_item_counts = PP_users['n_items'].iloc[X['u'].to_numpy()].to_numpy()
    return np.column_stack([calorie_levels, user_item_counts]).tolist()

baseline_train_X_features = baseline_features(train_X)
baseline_test_X_features = baseline_features(test_X)

# sklearn.linear_model must be imported explicitly (see the import cell);
# `import sklearn` alone does not guarantee the submodule is loaded.
baseline_model = sklearn.linear_model.LinearRegression()
baseline_model.fit(baseline_train_X_features, train_y)
baseline_model_preds = baseline_model.predict(baseline_test_X_features)
print('Mean Squared Error for Baseline 2 (Regression With Two Predictors):', MSE(baseline_model_preds, test_y))

In [None]:
# Display the test-split interaction table.
interactions_test

In [None]:
def get_familiarity(datum, col):
    """Mean Jaccard similarity between a recipe and the user's known recipes.

    Parameters
    ----------
    datum : mapping with integer entries 'u' and 'i', used as row positions
        into PP_users and PP_recipes respectively.
    col : name of a PP_recipes column whose cells are iterables (each cell is
        turned into a set before comparison).

    Returns
    -------
    The mean of Jaccard(target recipe, known recipe) over every recipe
    position listed in the user's 'items' cell, or 0.0 when that list is
    empty (np.mean of an empty list would be NaN).
    """
    # Select the column once; indexing a whole row for every known recipe
    # would rebuild a row Series each time just to read one cell.
    column_values = PP_recipes[col]
    target_set = set(column_values.iloc[datum['i']])

    known_recipes = PP_users.iloc[datum['u']]['items']
    similarities = [
        Jaccard(target_set, set(column_values.iloc[known]))
        for known in known_recipes
    ]

    if not similarities:
        return 0.0
    return np.mean(similarities)

In [None]:
def feature(datum):
    """Build the feature vector for one (user, recipe) interaction.

    Parameters
    ----------
    datum : mapping with integer entries 'u' and 'i', used as row positions
        into PP_users and PP_recipes respectively.

    Returns
    -------
    list
        [user_num_reviews, recipe_num_reviewers, recipe_calorie_level,
         recipe_num_steps, ingredient_familiarity, steps_familiarity,
         technique_familiarity]
    """
    # Fetch each row once and read every field from it.
    user_row = PP_users.iloc[datum['u']]
    recipe_row = PP_recipes.iloc[datum['i']]

    user_num_reviews = user_row['n_ratings']
    recipe_num_reviewers = recipe_row['n_reviewers']
    recipe_calorie_level = recipe_row['calorie_level']
    recipe_num_steps = recipe_row['n_steps']

    # mean of jaccards might not be the best way to evaluate familiarity
    # maybe modify to address people with low review counts?
    # modify to reduce the impact of very common ingredients like salt?
    ingredient_familiarity = get_familiarity(datum, 'ingredient_ids')
    steps_familiarity = get_familiarity(datum, 'steps_tokens')

    # Share of the recipe's technique weight that the user's technique vector
    # covers: sum(user * recipe) / sum(recipe).
    user_techniques = np.array(user_row['techniques'])
    recipe_techniques = np.array(recipe_row['techniques'])
    recipe_technique_total = recipe_techniques.sum()
    if recipe_technique_total == 0:
        # The ratio would be 0/0 (NaN); report zero familiarity instead.
        technique_familiarity = 0.0
    else:
        technique_familiarity = (user_techniques * recipe_techniques).sum() / recipe_technique_total

    return [
        user_num_reviews,
        recipe_num_reviewers,
        recipe_calorie_level,
        recipe_num_steps,
        ingredient_familiarity,
        steps_familiarity,
        technique_familiarity,
    ]
