In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy as np
import string
import random
import string
from sklearn import linear_model
from tqdm import tqdm

In [2]:
def assertFloat(x):
    """Check that ``x`` converts to a built-in ``float``.

    ``float(x)`` raises on its own (``TypeError`` / ``ValueError``) when the
    value is not convertible; otherwise the converted type is asserted.
    """
    converted = float(x)
    assert type(converted) == float

def assertFloatList(items, N):
    """Check that ``items`` has exactly ``N`` entries, each convertible to ``float``.

    The length is checked first; a non-convertible entry makes ``float()``
    raise before the type assertion is evaluated.
    """
    assert len(items) == N
    assert all(type(float(entry)) == float for entry in items)

In [3]:
def readGz(path):
    """Yield one evaluated Python object per line of a gzip-compressed text file.

    Args:
        path: Location of the ``.gz`` file; it is opened in text mode.

    Yields:
        The result of ``eval`` applied to each line, in file order.
    """
    # NOTE(review): eval() executes whatever the file contains, so this must
    # only be pointed at trusted data. If the lines are plain literals,
    # ast.literal_eval would be a safer parser -- confirm before switching.
    # The context manager closes the handle once the generator is exhausted
    # or discarded; the bare gzip.open() call previously left it open.
    with gzip.open(path, 'rt') as f:
        for l in f:
            yield eval(l)

In [4]:
def readJSON(path):
    """Yield ``(userID, gameID, record)`` for each record line of a gzip file.

    The first line of the file is read and discarded before any record is
    parsed. NOTE(review): presumably that line is a header; if the file has
    no header the first record is silently dropped -- confirm against the data.

    Args:
        path: Location of the ``.gz`` file; it is opened as UTF-8 text.

    Yields:
        Tuples ``(d['userID'], d['gameID'], d)`` where ``d`` is the evaluated line.
    """
    # NOTE(review): eval() executes whatever the file contains, so this must
    # only be pointed at trusted data.
    # The context manager closes the handle once the generator is exhausted
    # or discarded; the bare gzip.open() call previously left it open.
    with gzip.open(path, 'rt', encoding='utf8') as f:
        f.readline()
        for l in f:
            d = eval(l)
            yield d['userID'], d['gameID'], d

In [5]:
# Collects the answer to every question, keyed 'Q1', 'Q2', ...
answers = dict()

In [6]:
# Some data structures that will be useful

In [7]:
# Materialise every (userID, gameID, record) tuple from the training file.
allHours = list(readJSON("train.json.gz"))

In [8]:
# The first 165000 interactions are used for training, the rest for validation.
hoursTrain, hoursValid = allHours[:165000], allHours[165000:]

In [9]:
##################################################
# Play prediction                                #
##################################################

In [10]:
# Any other preprocessing...
# Index users and games over the full interaction log.
#   *_stoi: string id -> dense integer index (assigned in first-seen order)
#   *_itos: dense integer index -> string id
itemset = set()
userset = set()
user_stoi = dict()
user_itos = []
item_stoi = dict()
item_itos = []
playedAll = defaultdict(set)  # userID -> every game seen for that user in allHours
for user, item, review in allHours:
    itemset.add(item)
    userset.add(user)  # previously added `item`, so userset held game ids
    playedAll[user].add(item)
    if user not in user_stoi:
        user_stoi[user] = len(user_itos)
        user_itos.append(user)
    if item not in item_stoi:
        item_stoi[item] = len(item_itos)
        item_itos.append(item)

# Adjacency sets built from the training split only.
U = defaultdict(set)  # gameID -> users who played it
I = defaultdict(set)  # userID -> games they played
for user, item, review in hoursTrain:
    U[item].add(user)
    I[user].add(item)

# Give every indexed id an entry (empty when it never occurs in training).
# Later cells size the per-user / per-item parameter arrays with len(I) and
# len(U); the negative-sampling loop used to grow I as a side effect of
# `I[user]` lookups, which is now done explicitly and for both dictionaries.
for user in user_itos:
    I[user]
for item in item_itos:
    U[item]

# Training interaction counts, aligned with the dense indices.
I_arr = np.array([len(I[user]) for user in user_itos])
U_arr = np.array([len(U[item]) for item in item_itos])

# Validation positives: the held-out (user, game) pairs and their hours.
validPositives = [[user_stoi[user], item_stoi[item]] for user, item, review in hoursValid]
validPairs_part_2 = np.array(validPositives)
validLabels_part_2 = np.array([review['hours_transformed'] for user, item, review in hoursValid])

# Validation negatives: one game per held-out row that the user has not played
# anywhere in allHours. Excluding only the training games could hand back the
# held-out positive itself and label it 0.
# Candidates are drawn from the ordered item_itos list with a dedicated seeded
# generator: random.sample() no longer accepts a set (TypeError on Python 3.11+)
# and an unseeded draw made the validation set differ between runs.
negRng = random.Random(0)
validNegatives = []
for user, item, review in hoursValid:
    played = playedAll[user]
    if len(played) >= len(item_itos):
        raise ValueError("user " + str(user) + " has played every game; no negative can be sampled")
    sample = negRng.choice(item_itos)
    while sample in played:
        sample = negRng.choice(item_itos)
    validNegatives.append([user_stoi[user], item_stoi[sample]])

# Positives first, negatives second -- the label vector follows the same layout.
validPairs_part_1 = np.array(validPositives + validNegatives)
validLabels_part_1 = np.array([1] * len(hoursValid) + [0] * len(hoursValid))

In [11]:
# Baseline played

def make_baseline_set(thresh: float, interactions=None):
    """Return the most-played games that together exceed ``thresh`` of all plays.

    Games are visited from most to least played (ties broken by descending
    game id) and added to the result until the accumulated play count is
    strictly greater than ``thresh * total_plays``. The game that crosses the
    threshold is included.

    Args:
        thresh: Fraction of total plays the returned set has to exceed.
        interactions: Optional iterable of ``(user, game, record)`` tuples to
            count. When omitted, ``train.json.gz`` is read through
            ``readJSON`` on every call, exactly as before; that file is also
            the source of the validation rows, so pass a training-only
            iterable to keep held-out plays out of the counts.

    Returns:
        A set of game ids; empty when there are no interactions.
    """
    if interactions is None:
        interactions = readJSON("train.json.gz")

    gameCount = defaultdict(int)
    totalPlayed = 0
    for _, game, _ in interactions:
        gameCount[game] += 1
        totalPlayed += 1

    # Descending by (count, game id).
    mostPopular = sorted(((plays, game) for game, plays in gameCount.items()), reverse=True)

    top_thresh_percentile = set()
    covered = 0
    for plays, game in mostPopular:
        covered += plays
        top_thresh_percentile.add(game)
        if covered > thresh * totalPlayed:
            break

    return top_thresh_percentile

In [12]:
### Question 1

In [13]:
# Evaluate baseline strategy
# Popular-game set built with a 0.5 coverage threshold.
return1 = make_baseline_set(thresh=0.5)
def baseline_predict(user: int, item: int, top_percentile: set):
    """Predict 1 when the game is in ``top_percentile``, otherwise 0.

    ``item`` is a dense game index that is translated back to its string id
    through ``item_itos``; ``user`` is accepted but not used.
    """
    return 1 if item_itos[item] in top_percentile else 0

# One baseline prediction per validation pair (positives and negatives).
preds = np.array(
    [baseline_predict(user, item, return1) for user, item in validPairs_part_1],
    dtype=float,
)


In [14]:
# Accuracy = fraction of validation pairs whose prediction matches the label.
answers['Q1'] = np.mean(preds == validLabels_part_1)
answers['Q1']

0.6818181818181818

In [15]:
# Sanity check: the Q1 answer must be convertible to float.
assertFloat(answers['Q1'])

In [16]:
### Question 2

In [17]:
# Improved strategy
# Same popularity baseline, with the coverage threshold raised to 0.7.
return2 = make_baseline_set(thresh=0.7)
preds = np.array(
    [baseline_predict(user, item, return2) for user, item in validPairs_part_1],
    dtype=float,
)

In [18]:
# [validation accuracy, threshold that produced it]
answers['Q2'] = [np.mean(preds == validLabels_part_1), 0.7]
answers['Q2']

[0.703020302030203, 0.7]

In [19]:
# Sanity check: Q2 must hold exactly two float-convertible values.
assertFloatList(answers['Q2'], 2)

In [20]:
### Question 3/4

In [21]:
def jaccard_baseline_predict(user: int, item: int, thresh: float):
    """Predict 1 when the candidate game is similar enough to one the user played.

    For every game in the user's training set ``I``, the Jaccard similarity
    between that game's player set and the candidate's player set (both from
    ``U``) is computed. The prediction is 1 when the largest similarity is
    strictly greater than ``thresh`` and 0 otherwise, including when the user
    has no training games.

    ``user`` and ``item`` are dense indices, translated through ``user_itos``
    and ``item_itos``.
    """
    def similarity(left, right):
        return len(left.intersection(right)) / len(left.union(right))

    best = max(
        (similarity(U[item_itos[item]], U[owned]) for owned in I[user_itos[user]]),
        default=0,
    )
    return 1 if best > thresh else 0

def jaccard_popularity_baseline(user: int, item: int, thresh: float, top_percentile: set):
    """Predict 1 only when the game passes both the Jaccard and popularity tests.

    The Jaccard test is the one implemented by ``jaccard_baseline_predict``
    (maximum similarity strictly above ``thresh``); it is evaluated first and
    the popularity lookup in ``top_percentile`` only happens when it passes.
    """
    similar_enough = jaccard_baseline_predict(user, item, thresh) == 1
    if similar_enough and item_itos[item] in top_percentile:
        return 1
    return 0

In [22]:
# Jaccard-only predictor at a similarity threshold of 0.03.
preds = np.array(
    [jaccard_baseline_predict(user, item, 0.03) for user, item in validPairs_part_1],
    dtype=float,
)
q3_acc = np.mean(preds == validLabels_part_1)

In [23]:
# Jaccard (threshold 0.03) combined with the 0.7-coverage popularity set.
return3 = make_baseline_set(thresh=0.7)
preds = np.array(
    [jaccard_popularity_baseline(user, item, 0.03, return3) for user, item in validPairs_part_1],
    dtype=float,
)
q4_acc = np.mean(preds == validLabels_part_1)

In [24]:
# Show both accuracies, one per line.
print(q3_acc, q4_acc, sep='\n')

0.6723172317231724
0.6936693669366937


In [25]:
# Record the Jaccard-only (Q3) and Jaccard + popularity (Q4) accuracies.
answers.update({'Q3': q3_acc, 'Q4': q4_acc})

In [26]:
# Sanity checks: both accuracies must be convertible to float.
assertFloat(answers['Q3'])
assertFloat(answers['Q4'])

In [27]:
# Write a played / not-played prediction for every pair in pairs_Played.csv.
# Both files are managed by `with`, so they are closed even when a row fails;
# the input handle was previously never closed at all.
with open("./pairs_Played.csv") as pairs, open("HWpredictions_Played.csv", 'w') as predictions:
    for l in pairs:
        if l.startswith("userID"):
            # Header row is copied through unchanged.
            predictions.write(l)
            continue
        user, item = l.strip().split(',')
        # A pair can only be scored when both ids are in the index maps.
        # Unknown users were already answered with '0'; unknown games used to
        # raise KeyError on item_stoi[item] and now get the same default.
        if user in user_stoi and item in item_stoi:
            pred = str(jaccard_popularity_baseline(user_stoi[user], item_stoi[item], 0.03, return3))
        else:
            pred = '0'
        predictions.write(user + ',' + item + ',' + pred + '\n')

In [28]:
# Q5 is answered with a fixed confirmation string rather than a computed value.
answers['Q5'] = "I confirm that I have uploaded an assignment submission to gradescope"

In [29]:
##################################################
# Hours played prediction                        #
##################################################

In [30]:
# Regression target for every training interaction, in hoursTrain order.
trainHours = [record['hours_transformed'] for _, _, record in hoursTrain]
# Mean target over the training split.
globalAverage = float(sum(trainHours)) / len(trainHours)

In [31]:
### Question 6

In [32]:
# Training interactions as rows of dense (user index, game index) pairs.
trainPairs = np.array(
    [[user_stoi[user], item_stoi[item]] for user, item, review in hoursTrain]
)

In [33]:
# Bias model: prediction = alpha + beta_u[user] + beta_i[item].
lr = 0.001  # step size used by iterate()
alpha = globalAverage  # could start anywhere; the training mean is a guess
beta_u, beta_i = np.zeros(len(I)), np.zeros(len(U))
def iterate(lamb):
    """Apply one full-batch gradient step to ``alpha``, ``beta_u`` and ``beta_i``.

    Every gradient is accumulated from the parameter values held at the start
    of the call; the three parameters are only updated (scaled by the global
    ``lr``) once the pass over ``trainPairs`` is complete. The offset gradient
    is averaged over all training pairs, while each bias gradient is divided
    by that user's / game's interaction count from ``I_arr`` / ``U_arr``.

    Args:
        lamb: Regularisation weight applied to the bias terms.
    """
    global alpha, beta_u, beta_i

    n_pairs = len(trainPairs)
    grad_alpha = 0
    grad_u = np.zeros_like(beta_u)
    grad_i = np.zeros_like(beta_i)

    for (user, item), target in zip(trainPairs, trainHours):
        # Twice the signed prediction error, shared by all three gradients.
        err2 = 2 * (alpha + beta_u[user] + beta_i[item] - target)
        grad_alpha += err2 / n_pairs
        grad_u[user] += (err2 + (2 * lamb * beta_u[user])) / I_arr[user]
        grad_i[item] += (err2 + (2 * lamb * beta_i[item])) / U_arr[item]

    beta_u -= lr * grad_u
    beta_i -= lr * grad_i
    alpha -= lr * grad_alpha

def closed_form(lamb):
    """Run one round of alternating closed-form updates on the bias model.

    The three parameters are refreshed in sequence, each update using the
    values produced by the ones before it:

    1. ``alpha``  <- mean of ``target - beta_u - beta_i`` over training pairs
    2. ``beta_u`` <- per-user sum of ``target - alpha - beta_i`` / (lamb + count)
    3. ``beta_i`` <- per-game sum of ``target - alpha - beta_u`` / (lamb + count)

    Counts come from ``I_arr`` / ``U_arr``. The global arrays are rebound to
    freshly built arrays rather than modified in place.

    Args:
        lamb: Regularisation weight added to each denominator.
    """
    global alpha, beta_u, beta_i

    n_pairs = len(trainPairs)

    # 1) global offset
    offset = 0
    for (user, item), target in zip(trainPairs, trainHours):
        offset += (target - beta_u[user] - beta_i[item]) / n_pairs
    alpha = offset

    # 2) user biases, using the new alpha and the previous game biases
    user_bias = np.zeros_like(beta_u)
    for (user, item), target in zip(trainPairs, trainHours):
        user_bias[user] += (target - alpha - beta_i[item]) / (lamb + I_arr[user])
    beta_u = user_bias

    # 3) game biases, using the new alpha and the new user biases
    game_bias = np.zeros_like(beta_i)
    for (user, item), target in zip(trainPairs, trainHours):
        game_bias[item] += (target - alpha - beta_u[user]) / (lamb + U_arr[item])
    beta_i = game_bias

# Ten rounds of alternating updates with lambda = 1.
# The validation MSE is only reported when i is a multiple of 100, which for
# ten rounds means the first round (i == 0) alone.
for i in tqdm(range(10)):
    closed_form(1)
    if i % 100 != 0:
        continue
    validMSE = 0
    for (user, item), target in zip(validPairs_part_2, validLabels_part_2):
        validMSE += (target - alpha - beta_u[user] - beta_i[item]) ** 2
    validMSE /= len(validPairs_part_2)
    print(validMSE)

  0%|          | 0/10 [00:00<?, ?it/s]

 10%|█         | 1/10 [00:02<00:18,  2.08s/it]

3.208496521757075


100%|██████████| 10/10 [00:21<00:00,  2.14s/it]


In [34]:
# Mean squared error of the fitted bias model on the validation pairs.
validMSE = 0
for (user, item), target in zip(validPairs_part_2, validLabels_part_2):
    residual = target - alpha - beta_u[user] - beta_i[item]
    validMSE += residual ** 2
validMSE /= len(validLabels_part_2)
# MSE, fitted offset and plain training mean, one per line.
print(validMSE, alpha, globalAverage, sep='\n')

3.0071926265486777
3.641698452865309
3.716088074007024


In [35]:
# Record the most recently computed validation MSE as the Q6 answer.
answers['Q6'] = validMSE

In [36]:
# Sanity check: the Q6 answer must be convertible to float.
assertFloat(answers['Q6'])

In [37]:
### Question 7

In [38]:
# (bias, id) pairs in ascending order: smallest bias first, largest last.
betaUs = sorted((bias, user_itos[u]) for u, bias in enumerate(beta_u))
betaIs = sorted((bias, item_itos[i]) for i, bias in enumerate(beta_i))

# Report the extreme user and game biases together with their ids.
for label, ranking, pos in (
    ("Maximum betaU = ", betaUs, -1),
    ("Maximum betaI = ", betaIs, -1),
    ("Minimum betaU = ", betaUs, 0),
    ("Minimum betaI = ", betaIs, 0),
):
    bias, ident = ranking[pos]
    print(label + str(ident) + ' (' + str(bias) + ')')

Maximum betaU = u60898505 (5.817089020336196)
Maximum betaI = g17604638 (4.962308784862177)
Minimum betaU = u13037838 (-3.013519718914663)
Minimum betaI = g84397720 (-3.3393017612671665)


In [39]:
# Order: max user bias, min user bias, max game bias, min game bias.
answers['Q7'] = [ranking[pos][0] for ranking in (betaUs, betaIs) for pos in (-1, 0)]

In [40]:
# Sanity check: Q7 must hold exactly four float-convertible values.
assertFloatList(answers['Q7'], 4)

In [41]:
### Question 8

In [42]:
# Better lambda...


In [43]:
# Restart from scratch and refit with lambda = 2.
alpha = globalAverage  # could start anywhere; the training mean is a guess
beta_u, beta_i = np.zeros(len(I)), np.zeros(len(U))

for i in tqdm(range(10)):
    closed_form(2)
    # Training MSE after this round.
    loss = 0
    for (user, item), target in zip(trainPairs, trainHours):
        loss += (target - alpha - beta_u[user] - beta_i[item]) ** 2
    loss /= len(trainPairs)
    print(loss)

# Validation MSE of the final parameters.
validMSE = 0
for (user, item), target in zip(validPairs_part_2, validLabels_part_2):
    validMSE += (target - alpha - beta_u[user] - beta_i[item]) ** 2
validMSE /= len(validLabels_part_2)
print(validMSE)

 10%|█         | 1/10 [00:02<00:23,  2.59s/it]

2.9355175579739337


 20%|██        | 2/10 [00:04<00:18,  2.33s/it]

2.7692616791758646


 30%|███       | 3/10 [00:06<00:15,  2.15s/it]

2.761292137124928


 40%|████      | 4/10 [00:08<00:12,  2.11s/it]

2.7604697423423135


 50%|█████     | 5/10 [00:11<00:11,  2.22s/it]

2.760301578016814


 60%|██████    | 6/10 [00:13<00:08,  2.24s/it]

2.7602324512268606


 70%|███████   | 7/10 [00:16<00:07,  2.44s/it]

2.7601850719671064


 80%|████████  | 8/10 [00:18<00:04,  2.48s/it]

2.76014409052146


 90%|█████████ | 9/10 [00:21<00:02,  2.47s/it]

2.7601059094538947


100%|██████████| 10/10 [00:23<00:00,  2.38s/it]

2.7600696019887234
2.9991601635049583





In [44]:
# (lambda, validation MSE). The literal 2 repeats the value passed to
# closed_form() in the fitting cell above and has to be kept in sync by hand.
answers['Q8'] = (2, validMSE)

In [45]:
# Sanity check: Q8 must hold exactly two float-convertible values.
assertFloatList(answers['Q8'], 2)

In [46]:
# Write an hours prediction for every pair in pairs_Hours.csv.
# Both files are managed by `with`, so they are closed even when a row fails;
# the input handle was previously never closed at all.
with open("./pairs_Hours.csv") as pairs, open("HWpredictions_Hours.csv", 'w') as predictions:
    for l in pairs:
        if l.startswith("userID"):
            # Header row is copied through unchanged.
            predictions.write(l)
            continue
        user, item = l.strip().split(',')

        # Ids missing from the index maps used to raise KeyError; they now
        # contribute a zero bias, so the prediction falls back towards alpha.
        user_bias = beta_u[user_stoi[user]] if user in user_stoi else 0.0
        item_bias = beta_i[item_stoi[item]] if item in item_stoi else 0.0

        predictions.write(user + ',' + item + ',' + str(alpha + user_bias + item_bias) + '\n')

In [47]:
# Dump the answers dictionary as a single line of text.
with open("answers_hw3.txt", 'w') as f:
    f.write(str(answers) + '\n')