# Section 1 (Regression)

In [81]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
import numpy
import string
import random
from sklearn import linear_model
import dateutil
import functools
import warnings
import numpy as np
warnings.filterwarnings('ignore')

In [48]:
def parse(f):
    """Lazily yield one parsed record per line of a gzip-compressed file.

    Parameters
    ----------
    f : str or path-like
        Path of the gzip file; every line must hold one Python literal.

    Yields
    ------
    object
        The value obtained by evaluating each line.
    """
    # The context manager closes the gzip handle once the generator is
    # exhausted (or discarded) instead of leaving it open.
    with gzip.open(f) as handle:
        for line in handle:
            # NOTE(security): eval() executes whatever the file contains.
            # Only use this on trusted data files; for untrusted input a
            # literal-only parser (e.g. ast.literal_eval on decoded text)
            # should be used instead.
            yield eval(line)

In [49]:
# Download the data file from:
# https://cseweb.ucsd.edu/classes/fa21/cse258-b/files/
# The file name is a relative path, and list(...) materializes every parsed
# record in memory so the data can be sliced and re-iterated below.
dataset = list(parse("trainRecipes.json.gz"))

In [50]:
# Sanity check: number of records that were loaded.
len(dataset)

200000

In [51]:
# Positional split (no shuffling): the first 150,000 records are used for
# training, the next 25,000 for validation and everything after that for test.
train, valid, test = dataset[:150000], dataset[150000:175000], dataset[175000:]

In [52]:
# Display one record to inspect the available fields.
dataset[1]

{'name': 'double delicious cookie bars',
 'minutes': 40,
 'contributor_id': '26865936',
 'submitted': '2007-08-27',
 'steps': 'preheat oven to 350f\tin 13x9-inch baking pan , melt butter in oven\tsprinkle crumbs evenly over butter\tpour milk evenly over crumbs\ttop with remaining ingredients\tpress down firmly\tbake 25-30 minutes or until lightly browned\tcool completely , chill if desired , and cut into bars',
 'description': 'from "all time favorite recipes". for fun, try substituting butterscotch or white chocolate chips for the semi-sweet and/or peanut butter chips. make sure you cool it completely or the bottom will crumble!',
 'ingredients': ['butter',
  'graham cracker crumbs',
  'sweetened condensed milk',
  'semi-sweet chocolate chips',
  'peanut butter chips'],
 'recipe_id': '98015212'}

### Question 1

In [54]:
def feat1a(d):
    """Return the two length features of a recipe record.

    The result is ``[len(d['steps']), len(d['ingredients'])]``. ``len`` is
    applied to the fields as stored, so for a string-valued 'steps' field the
    first entry is a character count.
    """
    steps_length = len(d['steps'])
    ingredient_count = len(d['ingredients'])
    return [steps_length, ingredient_count]

In [55]:
def get_month(d):
    """Return the month (1-12) parsed from the record's 'submitted' date string."""
    # Import the submodule explicitly: a bare `import dateutil` does not
    # guarantee that `dateutil.parser` is available as an attribute, so the
    # original only worked when something else had already imported it.
    import dateutil.parser
    return dateutil.parser.parse(d['submitted']).month

def get_year(d):
    """Return the four-digit year parsed from the record's 'submitted' date string."""
    # Import the submodule explicitly: a bare `import dateutil` does not
    # guarantee that `dateutil.parser` is available as an attribute, so the
    # original only worked when something else had already imported it.
    import dateutil.parser
    return dateutil.parser.parse(d['submitted']).year

# Find the range of submission years over the whole dataset. The -1 default
# is the sentinel used when the dataset is empty.
submission_years = [get_year(record) for record in dataset]
min_year = min(submission_years, default=-1)
max_year = max(submission_years, default=-1)

# Number of distinct category values for each one-hot group.
num_years = max_year - min_year + 1
num_months = 12

# One slot is dropped per group: the first year and January are the reference
# categories and are encoded as all zeros.
num_year_encoding = num_years - 1
num_months_encoding = num_months - 1

def feat1b(d):
    """One-hot encode the submission year and month of a record.

    The earliest year (``min_year``) and January are the reference categories
    and map to all-zero groups. The result is the year group
    (``num_year_encoding`` slots) followed by the month group
    (``num_months_encoding`` slots).
    """
    # Offsets relative to the reference category of each group.
    year_offset = get_year(d) - min_year
    month_offset = get_month(d) - 1

    year_slots = [0] * num_year_encoding
    month_slots = [0] * num_months_encoding

    # Offset 0 is the reference category, so only positive offsets set a slot.
    if year_offset > 0:
        year_slots[year_offset - 1] = 1
    if month_offset > 0:
        month_slots[month_offset - 1] = 1

    return year_slots + month_slots

In [62]:
# Count how many times each ingredient appears across the whole dataset.
ingredients_dict = defaultdict(int)
for recipe in dataset:
    for name in recipe['ingredients']:
        ingredients_dict[name] += 1

# (count, ingredient) pairs, in the dict's iteration order; sorted later.
ingredients_list = [(count, name) for name, count in ingredients_dict.items()]

def ingredient_comparator(tuple1, tuple2):
    """Three-way comparison of ``(count, name)`` pairs for ``cmp_to_key``.

    Pairs are ordered by count descending, with ties broken by name
    ascending.

    Returns
    -------
    int
        -1 if ``tuple1`` sorts first, 1 if ``tuple2`` sorts first, and 0 when
        the two pairs are equivalent.
    """
    if tuple1[0] > tuple2[0]:
        return -1
    elif tuple1[0] < tuple2[0]:
        return 1
    elif tuple1[1] < tuple2[1]:
        return -1
    elif tuple1[1] > tuple2[1]:
        return 1
    # Equal pairs must compare as 0; the previous version returned 1 here,
    # which made the comparison inconsistent (cmp(a, a) != 0).
    return 0

# Sort in place (most frequent first) so later cells can read the ranking
# straight from ingredients_list.
ingredients_list.sort(key=functools.cmp_to_key(ingredient_comparator))

# ingredient name -> 0-based rank in the sorted list.
ingredients_idx_dict = {
    name: rank for rank, (_, name) in enumerate(ingredients_list)
}

def feat1c(d):
    """Binary indicator vector over the 50 highest-ranked ingredients.

    Slot ``k`` is 1 when the recipe contains the ingredient whose rank in
    ``ingredients_idx_dict`` is ``k`` (for ``k`` in 0..49), otherwise 0.
    Ingredients missing from the index are ignored.
    """
    top_n = 50
    binary_vector = [0] * top_n
    for ingredient in d['ingredients']:
        # .get() keeps a record with an ingredient that is not in the index
        # from raising KeyError; such an ingredient cannot be in the top 50.
        rank = ingredients_idx_dict.get(ingredient)
        if rank is not None and rank < top_n:
            binary_vector[rank] = 1
    return binary_vector

In [63]:
def feat(d, a = True, b = True, c = True):
    """Assemble the feature vector of a record from the selected groups.

    The vector always starts with a constant 1 (offset term). The flags
    ``a``, ``b`` and ``c`` switch on the outputs of ``feat1a``, ``feat1b``
    and ``feat1c``, which are appended in that order.
    """
    # Each extractor is wrapped in a lambda so it is only looked up and run
    # when its group is enabled.
    groups = (
        (a, lambda: feat1a(d)),
        (b, lambda: feat1b(d)),
        (c, lambda: feat1c(d)),
    )

    feature = [1]
    for enabled, build in groups:
        if enabled:
            feature += build()
    return feature

In [113]:
def MSE(y, ypred):
    """Mean squared error between targets and predictions.

    Parameters
    ----------
    y, ypred : array-like
        Values of equal length. Lists, 1-D arrays, and ``np.matrix`` or 2-D
        column/row vectors are all accepted; inputs are flattened first.

    Returns
    -------
    float
        ``sum((y - ypred) ** 2) / n``.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    # Flatten to 1-D so the result does not depend on the container type. The
    # previous matrix-product form silently misbehaved for plain ndarrays.
    actual = np.asarray(y, dtype=float).ravel()
    predicted = np.asarray(ypred, dtype=float).ravel()
    if actual.size != predicted.size:
        raise ValueError(
            'y and ypred must have the same number of elements '
            '({} != {})'.format(actual.size, predicted.size))
    if actual.size == 0:
        raise ValueError('MSE is undefined for empty input')
    diff = actual - predicted
    return float(np.dot(diff, diff) / diff.size)

In [120]:
def experiment(a = True, b = True, c = True):
    """Fit least-squares regression on the train split and score it on test.

    Parameters
    ----------
    a, b, c : bool
        Feature-group switches, forwarded unchanged to ``feat``.

    Returns
    -------
    float
        MSE of the predicted 'minutes' on the test split.

    Notes
    -----
    A later cell defines another function that is also named ``experiment``;
    once that cell has run, this definition is shadowed.
    """
    X_train = [feat(d, a, b, c) for d in train]
    y_train = [d['minutes'] for d in train]
    X_test = [feat(d, a, b, c) for d in test]
    y_test = [d['minutes'] for d in test]

    # rcond=None selects NumPy's documented machine-precision cutoff. Leaving
    # it out makes the result depend on the NumPy version and raises a
    # FutureWarning on older releases.
    theta, _residuals, _rank, _singular = np.linalg.lstsq(X_train, y_train, rcond=None)

    # Column-vector matrices are kept because that is the form this function
    # has always passed to MSE.
    theta = np.matrix(theta).T
    X_test = np.matrix(X_test)
    y_test = np.matrix(y_test).T
    y_pred = X_test*theta

    return MSE(y_test, y_pred)

In [104]:
# Show the output of each Question 1 feature group for a single record.
# NOTE(review): the labels say "test sample", but the record printed here is
# train[0] -- confirm which one is intended before relying on this output.
print('First test sample 1 a) feature:\n{}'.format(feat1a(train[0])))
print('\nFirst test sample 1 b) feature:\n{}'.format(feat1b(train[0])))
print('\nFirst test sample 1 c) feature:\n{}'.format(feat1c(train[0])))

First test sample 1 a) feature:
[743, 9]

First test sample 1 b) feature:
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]

First test sample 1 c) feature:
[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [118]:
# Question 1: fit one model per feature group in isolation (exactly one of
# a/b/c enabled); each call returns the MSE on the test split.
mse1a = experiment(a=True, b=False, c=False)
mse1b = experiment(a=False, b=True, c=False)
mse1c = experiment(a=False, b=False, c=True)

In [119]:
# Report the test MSE of each single-feature-group model.
print('MSE on test data for 1 a) model: {}'.format(mse1a))
print('MSE on test data for 1 b) model: {}'.format(mse1b))
print('MSE on test data for 1 c) model: {}'.format(mse1c))

MSE on test data for 1 a) model: 6169.549296366498
MSE on test data for 1 b) model: 6396.833687711815
MSE on test data for 1 c) model: 6000.948439855973


### Question 2

- TODO: write about ablation studies here

In [123]:
# Question 2 (ablation): fit the full model, then refit with exactly one
# feature group removed at a time; each call returns the MSE on the test split.
mseall = experiment(a=True,b=True,c=True)
mse1a_excluded = experiment(a=False,b=True,c=True)
mse1b_excluded = experiment(a=True,b=False,c=True)
mse1c_excluded = experiment(a=True,b=True,c=False)

In [124]:
# Report the test MSE of the full model and of each ablated model.
print('MSE on test data for model including all features: {}'.format(mseall))
print('MSE on test data for model excluding 1 a) features: {}'.format(mse1a_excluded))
print('MSE on test data for model excluding 1 b) features: {}'.format(mse1b_excluded))
print('MSE on test data for model excluding 1 c) features: {}'.format(mse1c_excluded))

MSE on test data for model including all features: 5861.253905671382
MSE on test data for model excluding 1 a) features: 5992.6635101007005
MSE on test data for model excluding 1 b) features: 5870.11506165606
MSE on test data for model excluding 1 c) features: 6157.754094366207


### Question 4
- TODO: write

# Section 2 (Classification)

### Question 5

In [146]:
def BER(predictions, y):
    """Balanced error rate of binary predictions.

    Computed as ``1 - 0.5 * (TPR + TNR)``, where TPR and TNR are the true
    positive and true negative rates. Predictions and labels are interpreted
    by truthiness.

    Parameters
    ----------
    predictions : iterable
        Predicted labels.
    y : iterable
        Ground-truth labels, aligned with ``predictions``.

    Returns
    -------
    float
        Balanced error rate in [0, 1].

    Raises
    ------
    ValueError
        If the inputs differ in length, or if ``y`` has no positive or no
        negative examples (one of the two rates is then undefined).
    """
    predictions = list(predictions)
    y = list(y)
    # zip() would silently drop the unmatched tail and hide the mismatch.
    if len(predictions) != len(y):
        raise ValueError(
            'predictions and y must have the same length '
            '({} != {})'.format(len(predictions), len(y)))

    # Single pass over the pairs to fill the confusion matrix.
    TP = FP = TN = FN = 0
    for p, l in zip(predictions, y):
        if p:
            if l:
                TP += 1
            else:
                FP += 1
        elif l:
            FN += 1
        else:
            TN += 1

    positives = TP + FN
    negatives = TN + FP
    # Fail with an explanation instead of a bare ZeroDivisionError.
    if positives == 0 or negatives == 0:
        raise ValueError(
            'BER is undefined: y must contain both positive and negative labels')
    return 1 - 0.5 * (TP / positives + TN / negatives)

In [152]:
def get_mostPopularInd(dict_size):
    """Map the top-ranked non-butter ingredients to consecutive indices.

    Walks ``ingredients_list`` in its current order, skips 'butter', and
    assigns indices 0, 1, ... to at most ``dict_size`` ingredient names.
    """
    # Names in ranking order, with butter removed.
    candidates = [entry[1] for entry in ingredients_list if entry[1] != 'butter']
    # A non-positive size selects nothing.
    selected = candidates[:max(dict_size, 0)]
    return {name: position for position, name in enumerate(selected)}

def feat2(d, dict_size, mostPopularInd):
    """Binary ingredient-indicator vector of length ``dict_size``.

    Slot ``mostPopularInd[name]`` is set to 1 for every ingredient of the
    record that appears in ``mostPopularInd``. 'butter' is always skipped,
    even when the mapping contains it.
    """
    indicator = [0] * dict_size
    for name in d['ingredients']:
        slot = None if name == 'butter' else mostPopularInd.get(name)
        if slot is not None:
            indicator[slot] = 1
    return indicator

def hasButter(d):
    """Return 1 if 'butter' is one of the record's ingredients, else 0."""
    return int('butter' in d['ingredients'])

In [166]:
def experiment(reg = 1, dict_size = 50, mode='valid'):
    """Train the butter classifier and return ``(train BER, held-out BER)``.

    Parameters
    ----------
    reg : float
        Passed to ``LogisticRegression`` as ``C``.
    dict_size : int
        Number of ingredient indicator features.
    mode : str
        'valid' evaluates on the validation split; any other value evaluates
        on the test split.

    Notes
    -----
    This reuses the name of the regression ``experiment`` defined in an
    earlier cell and replaces it once this cell has run.
    """
    mostPopularInd = get_mostPopularInd(dict_size)

    def featurize(records):
        # One indicator vector per record, all using the same index mapping.
        return [feat2(d, dict_size, mostPopularInd) for d in records]

    def labels(records):
        return [hasButter(d) for d in records]

    X_train = featurize(train)
    Y_train = labels(train)
    mod = linear_model.LogisticRegression(C=reg, class_weight='balanced', solver = 'lbfgs')
    mod.fit(X_train, Y_train)
    ber_train = BER(mod.predict(X_train), Y_train)

    # Pick the evaluation split once instead of duplicating the scoring code.
    held_out = valid if mode == 'valid' else test
    ber_held_out = BER(mod.predict(featurize(held_out)), labels(held_out))
    return ber_train, ber_held_out

In [167]:
# Question 5: C=1 with a 50-ingredient dictionary, evaluated on the test split.
# The train BER is returned as well but only the test BER is printed.
ber5_train, ber5_test = experiment(reg=1, dict_size=50, mode='test')
print('Balanced Error Rate of the classifier on the test set: {}'.format(ber5_test))

Balanced Error Rate of the classifier on the test set: 0.28930328282968243


### Question 6

In [176]:
def pipeline():
    """Grid-search C and dictionary size by validation BER.

    Runs ``experiment`` in 'valid' mode for every combination, prints one line
    per run, and returns ``(C, dict_size)`` for the lowest validation BER.
    """
    best = None  # (validation BER, C, dict size) of the best run so far
    for C in [1e-3, 1e-2, 1e-1, 1e0, 10, 100, 1000]:
        for dsize in [50, 100, 500]:
            ber_train, ber_valid = experiment(reg=C, dict_size=dsize, mode='valid')
            print('C: {0:5}, dict size: {1:5}, BER on train set: {2:10}, BER on valid set: {3:10}'
                  .format(C, dsize, round(ber_train, 5), round(ber_valid, 5)))
            # Strict "<" keeps the earliest combination when two runs tie.
            if best is None or ber_valid < best[0]:
                best = (ber_valid, C, dsize)

    if best is None:
        return None, None
    return best[1], best[2]

In [177]:
# Run the grid search; it prints one line per (C, dict size) combination and
# returns the pair with the lowest validation BER.
C_opt, dsize_opt = pipeline()

C: 0.001, dict size:    50, BER on train set:    0.30158, BER on valid set:    0.30632
C: 0.001, dict size:   100, BER on train set:    0.28364, BER on valid set:    0.28685
C: 0.001, dict size:   500, BER on train set:     0.2635, BER on valid set:    0.26701
C:  0.01, dict size:    50, BER on train set:    0.29019, BER on valid set:    0.29033
C:  0.01, dict size:   100, BER on train set:    0.26405, BER on valid set:    0.26473
C:  0.01, dict size:   500, BER on train set:    0.22835, BER on valid set:    0.23016
C:   0.1, dict size:    50, BER on train set:    0.28989, BER on valid set:    0.28972
C:   0.1, dict size:   100, BER on train set:    0.26198, BER on valid set:      0.264
C:   0.1, dict size:   500, BER on train set:    0.22355, BER on valid set:    0.22469
C:   1.0, dict size:    50, BER on train set:    0.28988, BER on valid set:    0.28952
C:   1.0, dict size:   100, BER on train set:     0.2622, BER on valid set:    0.26447
C:   1.0, dict size:   500, BER on train se

In [182]:
# Refit with the selected hyperparameters and report the BER on the test split.
ber6_train, ber6_test = experiment(reg=C_opt, dict_size=dsize_opt, mode='test')
print('best value of C: {0:5}, best value of dict size: {1:5}'.format(C_opt, dsize_opt))
print('Balanced Error Rate of the classifier on the test set for selected model: {0:10}'.format(ber6_test))

best value of C:   0.1, best value of dict size:   500
Balanced Error Rate of the classifier on the test set for selected model: 0.22504702608007565


# Section 3 (Recommender Systems)

### Question 8

In [132]:
# Utility data structures (filled in by the next cell):
#   ingsPerItem: recipe_id  -> set of ingredients used by that recipe
#   itemsPerIng: ingredient -> set of recipe_ids that use that ingredient
ingsPerItem = defaultdict(set)
itemsPerIng = defaultdict(set)

In [133]:
# Fill both lookup tables in a single pass over the full dataset.
for recipe in dataset:
    recipe_key = recipe['recipe_id']
    for ing_name in recipe['ingredients']:
        ingsPerItem[recipe_key].add(ing_name)
        itemsPerIng[ing_name].add(recipe_key)

In [134]:
def Jaccard(s1, s2):
    """Jaccard similarity |s1 ∩ s2| / |s1 ∪ s2| of two sets.

    Returns 0 when the union is empty.
    """
    union_size = len(s1.union(s2))
    if not union_size:
        return 0
    return len(s1.intersection(s2)) / union_size

In [139]:
def similarity_comparator(tuple1, tuple2):
    """Three-way comparison of ``(similarity, id)`` pairs for ``cmp_to_key``.

    Pairs are ordered by similarity descending, with ties broken by id
    ascending.

    Returns
    -------
    int
        -1 if ``tuple1`` sorts first, 1 if ``tuple2`` sorts first, and 0 when
        the two pairs are equivalent.
    """
    if tuple1[0] > tuple2[0]:
        return -1
    elif tuple1[0] < tuple2[0]:
        return 1
    elif tuple1[1] < tuple2[1]:
        return -1
    elif tuple1[1] > tuple2[1]:
        return 1
    # Equal pairs must compare as 0; the previous version returned 1 here,
    # which made the comparison inconsistent (cmp(a, a) != 0).
    return 0

def mostSimilar8(i, N):
    """Return the ``N`` recipes most similar to recipe ``i``.

    Similarity is the Jaccard similarity between ingredient sets.

    Parameters
    ----------
    i : str
        Recipe id of the query recipe.
    N : int
        Number of results to return.

    Returns
    -------
    list of (float, str)
        ``(similarity, recipe_id)`` pairs, most similar first, with ties
        broken by recipe id; the query recipe itself is excluded.
    """
    # .get() avoids the defaultdict side effect of inserting an empty entry
    # when the query id is unknown, which would otherwise stay in the index
    # and show up as a candidate in later queries.
    ings = ingsPerItem.get(i, set())
    similarities = [
        (Jaccard(ings, other_ings), i2)
        for i2, other_ings in ingsPerItem.items()
        if i2 != i
    ]
    similarities.sort(key=functools.cmp_to_key(similarity_comparator))
    return similarities[:N]

In [141]:
# Question 8: the five recipes most similar to the query recipe by ingredient
# overlap. The last expression is the cell output.
recipe_id = '06432987'
print('Printing (similarity, recipe id) values')
mostSimilar8(recipe_id, 5)

Printing (similarity, recipe id) values


[(0.4166666666666667, '68523854'),
 (0.38461538461538464, '12679596'),
 (0.36363636363636365, '56301588'),
 (0.36363636363636365, '79675099'),
 (0.35714285714285715, '87359281')]

### Question 9

In [142]:
def mostSimilar9(i, N):
    """Return the ``N`` ingredients most similar to ingredient ``i``.

    Similarity is the Jaccard similarity between the sets of recipes that use
    each ingredient.

    Parameters
    ----------
    i : str
        Name of the query ingredient.
    N : int
        Number of results to return.

    Returns
    -------
    list of (float, str)
        ``(similarity, ingredient)`` pairs, most similar first, with ties
        broken by ingredient name; the query ingredient itself is excluded.
    """
    # .get() avoids the defaultdict side effect of inserting an empty entry
    # when the query ingredient is unknown, which would otherwise stay in the
    # index and show up as a candidate in later queries.
    items = itemsPerIng.get(i, set())
    similarities = [
        (Jaccard(items, other_items), i2)
        for i2, other_items in itemsPerIng.items()
        if i2 != i
    ]
    similarities.sort(key=functools.cmp_to_key(similarity_comparator))
    return similarities[:N]

In [144]:
# Question 9: the five ingredients whose recipe sets overlap most with the
# query ingredient's. The last expression is the cell output.
ingredient_id = 'butter'
print('Printing (similarity, ingredient id) values')
mostSimilar9(ingredient_id, 5)

Printing (similarity, ingredient id) values


[(0.22315311514274808, 'salt'),
 (0.2056685424969639, 'flour'),
 (0.19100394157199166, 'eggs'),
 (0.17882420717656095, 'sugar'),
 (0.17040052045973944, 'milk')]

In [None]:
### Question 10
#(open ended)