In [35]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy as np
import string
import random
import string
from sklearn import linear_model
from tqdm import tqdm

In [36]:
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

In [37]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

In [38]:
def readJSON(path):
    f = gzip.open(path, 'rt', encoding='utf8')
    f.readline()
    for l in f:
        d = eval(l)
        u = d['userID']
        g = d['gameID']
        yield u,g,d

In [39]:
answers = {}

In [40]:
# Some data structures that will be useful

In [41]:
allHours = []
for l in readJSON("train.json.gz"):
    allHours.append(l)

In [42]:
hoursTrain = allHours[:165000]
hoursValid = allHours[165000:]

In [43]:
##################################################
# Play prediction                                #
##################################################

In [44]:
# Any other preprocessing...
itemset = set()
userset = set()
user_stoi = dict()
user_itos = []
item_stoi = dict()
item_itos = []
for user, item, review in allHours:
    itemset.add(item)
    userset.add(item)
    if user not in user_stoi:
        user_stoi[user] = len(user_itos)
        user_itos.append(user)
    if item not in item_stoi:
        item_stoi[item] = len(item_itos)
        item_itos.append(item)


U = defaultdict(set)
I = defaultdict(set)
validPairs_part_1 = []
for review in hoursTrain:
    user = review[0]
    item = review[1]
    U[item].add(user)
    I[user].add(item)

I_arr = np.array([len(I[user_itos[u]]) for u in range(len(I))])
U_arr = np.array([len(U[item_itos[i]]) for i in range(len(U))])

validPairs_part_1 = [[user_stoi[user], item_stoi[item]] for user, item, review_body in hoursValid]
validLabels_part_1 = np.array([1] * len(hoursValid) + [0] * len(hoursValid))

validPairs_part_2 = validPairs_part_1.copy()
validPairs_part_2 = np.array(validPairs_part_2)
validLabels_part_2 = np.array([review['hours_transformed'] for user, item, review in hoursValid])

# Construct a new validation set w/ negative pairs
for user, item, review in hoursValid:
    sample = random.sample(itemset.difference(I[user]), 1)[0]
    validPairs_part_1.append([user_stoi[user], item_stoi[sample]])

validPairs_part_1 = np.array(validPairs_part_1)

In [45]:
# Baseline played

def make_baseline_set(thresh: float):
    gameCount = defaultdict(int)
    totalPlayed = 0

    for user,game,_ in readJSON("train.json.gz"):
        gameCount[game] += 1
        totalPlayed += 1

    mostPopular = [(gameCount[x], x) for x in gameCount]
    mostPopular.sort()
    mostPopular.reverse()

    top_thresh_percentile = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        top_thresh_percentile.add(i)
        if count > thresh * totalPlayed: break

    return top_thresh_percentile

In [46]:
### Question 1

In [47]:
# Evaluate baseline strategy
return1 = make_baseline_set(0.5)
def baseline_predict(user: int, item: int, top_percentile: set):
    if item_itos[item] in top_percentile:
        return 1
    else:
        return 0

preds = np.zeros(len(validLabels_part_1))
for i, (user, item) in enumerate(validPairs_part_1):
    preds[i] = baseline_predict(user, item, return1)


In [48]:
answers['Q1'] = np.sum(preds == validLabels_part_1) / len(preds)
answers['Q1']

0.6832183218321832

In [49]:
assertFloat(answers['Q1'])

In [50]:
### Question 2

In [51]:
# Improved strategy
return2 = make_baseline_set(0.7)
preds = np.zeros(len(validLabels_part_1))
for i, (user, item) in enumerate(validPairs_part_1):
    preds[i] = baseline_predict(user, item, return2)

In [52]:
answers['Q2'] = [np.sum(preds == validLabels_part_1) / len(preds), 0.7]
answers['Q2']

[0.6999699969997, 0.7]

In [53]:
assertFloatList(answers['Q2'], 2)

In [54]:
### Question 3/4

In [55]:
def jaccard_baseline_predict(user: int, item: int, thresh: float):
    jaccard = lambda a, b: len(a.intersection(b)) / len(a.union(b))
    jaccard_max = 0
    for i in I[user_itos[user]]:
        jaccard_max = max(jaccard_max, jaccard(U[item_itos[item]], U[i]))
    if jaccard_max > thresh:
        return 1
    else:
        return 0

def jaccard_popularity_baseline(user: int, item: int, thresh: float, top_percentile: set):
    jaccard = lambda a, b: len(a.intersection(b)) / len(a.union(b))
    jaccard_max = 0
    for i in I[user_itos[user]]:
        jaccard_max = max(jaccard_max, jaccard(U[item_itos[item]], U[i]))
    if jaccard_max > thresh and item_itos[item] in top_percentile:
        return 1
    else:
        return 0

In [56]:
preds = np.zeros(len(validLabels_part_1))
for i, (user, item) in enumerate(validPairs_part_1):
    preds[i] = jaccard_baseline_predict(user, item, 0.03)
q3_acc = np.sum(preds == validLabels_part_1) / len(preds)

In [57]:
preds = np.zeros(len(validLabels_part_1))
return3 = make_baseline_set(0.7)
for i, (user, item) in enumerate(validPairs_part_1):
    preds[i] = jaccard_popularity_baseline(user, item, 0.03, return3)
q4_acc = np.sum(preds == validLabels_part_1) / len(preds)

In [58]:
print(q3_acc)
print(q4_acc)

0.6776677667766776
0.6967696769676968


In [59]:
answers['Q3'] = q3_acc
answers['Q4'] = q4_acc

In [60]:
assertFloat(answers['Q3'])
assertFloat(answers['Q4'])

In [61]:
predictions = open("HWpredictions_Played.csv", 'w')
for l in open("./pairs_Played.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    user, item = l.strip().split(',')
    # Logic...
    if user in user_stoi:
        pred = str(jaccard_popularity_baseline(user_stoi[user], item_stoi[item], 0.03, return3))
    else:
        pred = '0'
    _ = predictions.write(user + ',' + item + ',' + pred  + '\n')

predictions.close()

In [62]:
answers['Q5'] = "I confirm that I have uploaded an assignment submission to gradescope"

In [63]:
##################################################
# Hours played prediction                        #
##################################################

In [64]:
trainHours = [r[2]['hours_transformed'] for r in hoursTrain]
globalAverage = sum(trainHours) * 1.0 / len(trainHours)

In [65]:
### Question 6

In [66]:
trainPairs = [[user_stoi[user], item_stoi[item]] for user, item, review in hoursTrain]
trainPairs = np.array(trainPairs)

In [74]:
beta_u = np.zeros(len(I))
beta_i = np.zeros(len(U))
alpha = globalAverage # Could initialize anywhere, this is a guess
lr = 0.001
def iterate(lamb):
    global alpha
    global beta_u
    global beta_i
    alpha_grad = 0
    beta_u_grad = np.zeros_like(beta_u)
    beta_i_grad = np.zeros_like(beta_i)
    for i, (user, item) in enumerate(trainPairs):
        alpha_grad += (2 * (alpha + beta_u[user] + beta_i[item] - trainHours[i])) / len(trainPairs)
        beta_u_grad[user] += (2 * (alpha + beta_u[user] + beta_i[item] - trainHours[i]) + (2 * lamb * beta_u[user])) / I_arr[user]
        beta_i_grad[item] += (2 * (alpha + beta_u[user] + beta_i[item] - trainHours[i]) + (2 * lamb * beta_i[item])) / U_arr[item]
    beta_u -= lr * beta_u_grad
    beta_i -= lr * beta_i_grad
    alpha -= lr * alpha_grad

def closed_form(lamb):
    global alpha
    global beta_u
    global beta_i

    new_beta_u = np.zeros_like(beta_u)
    new_beta_i = np.zeros_like(beta_i)
    new_alpha = 0

    for i, (user, item) in enumerate(trainPairs):
        new_alpha += (trainHours[i] - beta_u[user] - beta_i[item]) / len(trainPairs)
    alpha = new_alpha
    for i, (user, item) in enumerate(trainPairs):
        new_beta_u[user] += (trainHours[i] - alpha - beta_i[item]) / (lamb + I_arr[user])
    beta_u = new_beta_u
    for i, (user, item) in enumerate(trainPairs):
        new_beta_i[item] += (trainHours[i] - alpha - beta_u[user]) / (lamb + U_arr[item])
    beta_i = new_beta_i

for i in tqdm(range(300)):
    closed_form(5)
    if i % 100 == 0:
        validMSE = 0
        for i, (user, item) in enumerate(validPairs_part_2):
            validMSE += (validLabels_part_2[i] - alpha - beta_u[user] - beta_i[item]) ** 2
        validMSE /= len(validPairs_part_2)
        print(validMSE)

3.716088074007024


  0%|          | 1/300 [00:03<19:35,  3.93s/it]

3.1347934454377326


 34%|███▎      | 101/300 [04:14<05:47,  1.75s/it]

2.9906297574462624


 67%|██████▋   | 201/300 [08:17<04:23,  2.66s/it]

2.990628067045836


 68%|██████▊   | 205/300 [08:28<03:55,  2.48s/it]


KeyboardInterrupt: 

In [70]:
closed_form(5)
print(alpha)

3.684224766504498


In [71]:
beta_u

array([ 0.51915976, -0.29995939, -0.71692046, ...,  0.48210574,
        0.46233988,  0.46227292])

In [None]:
validMSE = 0
for i, (user, item) in enumerate(validPairs_part_2):
    validMSE += (validLabels_part_2[i] - alpha - beta_u[user] - beta_i[item]) ** 2
validMSE /= len(validLabels_part_2)
print(validMSE)
print(alpha)
print(globalAverage)

2.9906280646889387
3.1154418803025523
3.716088074007024


In [None]:
answers['Q6'] = validMSE

In [None]:
assertFloat(answers['Q6'])

In [None]:
### Question 7

In [None]:
betaUs = [(beta_u[u], user_itos[u]) for u in range(len(beta_u))]
betaIs = [(beta_i[i], item_itos[i]) for i in range(len(beta_i))]
betaUs.sort()
betaIs.sort()

print("Maximum betaU = " + str(betaUs[-1][1]) + ' (' + str(betaUs[-1][0]) + ')')
print("Maximum betaI = " + str(betaIs[-1][1]) + ' (' + str(betaIs[-1][0]) + ')')
print("Minimum betaU = " + str(betaUs[0][1]) + ' (' + str(betaUs[0][0]) + ')')
print("Minimum betaI = " + str(betaIs[0][1]) + ' (' + str(betaIs[0][0]) + ')')

Maximum betaU = u14947742 (5.028992741427703)
Maximum betaI = g17604638 (4.958229977012988)
Minimum betaU = u13037838 (-2.8747411504511153)
Minimum betaI = g84397720 (-2.964508870386239)


In [None]:
answers['Q7'] = [betaUs[-1][0], betaUs[0][0], betaIs[-1][0], betaIs[0][0]]

In [None]:
assertFloatList(answers['Q7'], 4)

In [None]:
### Question 8

In [None]:
# Better lambda...


In [None]:
beta_u = np.zeros(len(I))
beta_i = np.zeros(len(U))
alpha = globalAverage # Could initialize anywhere, this is a guess
for i in tqdm(range(10)):
    closed_form(2)
    loss = 0
    for i, (user, item) in enumerate(trainPairs):
        loss += (trainHours[i] - alpha - beta_u[user] - beta_i[item]) ** 2
    loss /= len(trainPairs)
    print(loss)
validMSE = 0
for i, (user, item) in enumerate(validPairs_part_2):
    validMSE += (validLabels_part_2[i] - alpha - beta_u[user] - beta_i[item]) ** 2
validMSE /= len(validLabels_part_2)
print(validMSE)

  0%|          | 0/10 [00:00<?, ?it/s]

 10%|█         | 1/10 [00:01<00:16,  1.86s/it]

2.9355175579739337


 20%|██        | 2/10 [00:03<00:14,  1.77s/it]

2.7692616791758646


 30%|███       | 3/10 [00:05<00:12,  1.73s/it]

2.761292137124928


 40%|████      | 4/10 [00:06<00:10,  1.67s/it]

2.7604697423423135


 50%|█████     | 5/10 [00:08<00:08,  1.64s/it]

2.760301578016814


 60%|██████    | 6/10 [00:09<00:06,  1.60s/it]

2.7602324512268606


 70%|███████   | 7/10 [00:11<00:04,  1.56s/it]

2.7601850719671064


 80%|████████  | 8/10 [00:12<00:03,  1.57s/it]

2.76014409052146


 90%|█████████ | 9/10 [00:14<00:01,  1.58s/it]

2.7601059094538947


100%|██████████| 10/10 [00:16<00:00,  1.62s/it]

2.7600696019887234
2.9991601635049583





In [None]:
answers['Q8'] = (2, validMSE)

In [None]:
assertFloatList(answers['Q8'], 2)

In [None]:
predictions = open("HWpredictions_Hours.csv", 'w')
for l in open("./pairs_Hours.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    user,item = l.strip().split(',')

    # Logic...

    _ = predictions.write(user + ',' + item + ',' + str(alpha + beta_u[user_stoi[user]] + beta_i[item_stoi[item]]) + '\n')

predictions.close()

In [None]:
f = open("answers_hw3.txt", 'w')
f.write(str(answers) + '\n')
f.close()