In [91]:
import ast
import gzip
import json
import math
import random
import statistics
from collections import defaultdict

import numpy as np
from sklearn import linear_model

In [101]:
def assertFloat(x):
    """Check that `x` can be converted to a Python float.

    Propagates whatever `float(x)` raises (e.g. ValueError or TypeError)
    when the value is not convertible.
    """
    converted = float(x)
    assert type(converted) == float

def assertFloatList(items, N):
    """Check that `items` has exactly `N` entries, each convertible to float.

    Raises AssertionError on a length mismatch; conversion errors from
    `float(...)` propagate unchanged.
    """
    assert len(items) == N
    converted_types = [type(float(value)) for value in items]
    assert converted_types == [float] * N

In [102]:
# Accumulates the per-question results; written to disk in the final cell.
answers = dict()

In [103]:
# From https://cseweb.ucsd.edu/classes/fa24/cse258-b/files/steam.json.gz
# NOTE(review): this is an absolute, machine-specific path, so the notebook will
# not run on another machine without editing it; consider a configurable data dir.
# The handle is left open on purpose: a later cell iterates over it and another
# cell calls z.close().
z = gzip.open("/Users/danielzhao/Documents/GitHub/cse258-r/midterm/steam.json.gz")

In [7]:
# Parse the file one record per line into a list of dicts.
# SECURITY: the previous version called eval() on every line, which would execute
# any code embedded in the downloaded file. ast.literal_eval accepts only Python
# literals (dicts, strings, numbers, True/False/None), so it is safe here.
# The lines are presumably Python dict literals rather than strict JSON (they were
# parsed with eval, not json.loads) — confirm before switching parsers.
dataset = []
for line in z:
    # gzip.open defaults to binary mode, so each line is bytes; literal_eval needs str.
    dataset.append(ast.literal_eval(line.decode("utf-8").strip()))

In [8]:
# Release the gzip handle opened earlier; the records now live in `dataset`.
z.close()

In [14]:
### Question 1
# Show one raw record to see which fields are available.
print(dataset[0])

{'hours': 0.3, 'gameID': 'g35322304', 'hours_transformed': 0.37851162325372983, 'early_access': False, 'date': '2015-04-08', 'text': '+1', 'userID': 'u55351001'}


In [104]:
def MSE(y, ypred):
    """Return the mean squared error between targets and predictions.

    Parameters
    ----------
    y : array-like of numbers
        Ground-truth values (list, tuple or ndarray).
    ypred : array-like of numbers
        Predicted values, same shape as `y`.

    Returns
    -------
    numpy floating scalar
        The mean of (y - ypred) ** 2.
    """
    # Coerce both sides to arrays: with two plain Python lists `y - ypred`
    # raises TypeError, and the subtraction only worked before when at least
    # one argument happened to be an ndarray.
    y = np.asarray(y, dtype=float)
    ypred = np.asarray(ypred, dtype=float)
    return ((y - ypred) ** 2).mean()

In [105]:
# One feature per review (its character length) and the raw hours as target.
X, y = [], []
for review in dataset:
    X.append([len(review['text'])])
    y.append(review['hours'])

In [106]:
# Spot-check the first target value.
y[0]

0.3

In [107]:
# Q1: least-squares fit of hours played on review length, with an intercept.
model = linear_model.LinearRegression(fit_intercept=True).fit(X, y)

y_pred = model.predict(X)
theta_1 = model.coef_[0]  # coefficient of the single review-length feature
mse1 = MSE(y, y_pred)     # error measured on the same data the model was fit on

(theta_1, mse1)

(np.float64(0.0010422806169490813), np.float64(75735.70018272949))

In [108]:
# Store native Python floats rather than (e.g.) np.float64.
answers['Q1'] = [float(value) for value in (theta_1, mse1)]

In [109]:
# Q1 must hold exactly two float-convertible values.
assertFloatList(answers['Q1'], 2)

In [110]:
### Question 2

In [252]:
# Sequential (unshuffled) split: first 80% of records train, the rest test.
split_idx = int(len(dataset)*0.8)
dataTrain, dataTest = dataset[:split_idx], dataset[split_idx:]

In [253]:
# Review-length feature matrices for each split.
X_train = [[len(rec['text'])] for rec in dataTrain]
X_test = [[len(rec['text'])] for rec in dataTest]

# Raw hours played as regression targets.
y_train = [rec['hours'] for rec in dataTrain]
y_test = [rec['hours'] for rec in dataTest]

In [254]:
# Q2: refit on the training split only and score on the held-out split.
# NOTE: `model` is read again later (the Q3c cell takes its intercept), so this
# name must not be reassigned in between.
model = linear_model.LinearRegression().fit(X_train, y_train)

y_pred_test = model.predict(X_test)
mse_test = MSE(y_test, y_pred_test)

In [255]:
# Count strict under- and over-predictions on the test set (exact ties are
# counted in neither). Summing literal 1s keeps the totals native Python ints.
under = sum(1 for actual, predicted in zip(y_test, y_pred_test) if predicted < actual)
over = sum(1 for actual, predicted in zip(y_test, y_pred_test) if predicted > actual)

In [256]:
# Test-set MSE of the Q2 model (same inputs as the earlier `mse_test`).
mse2 = MSE(y_test, y_pred_test)

In [257]:
# Display the three Q2 results together.
mse2, under, over

(np.float64(76047.19578054463), 5249, 29751)

In [117]:
# mse2 is cast to a native float; under/over are used as computed by the counting cell.
answers['Q2'] = [float(mse2), under, over]

In [118]:
# Q2 must hold exactly three float-convertible values.
assertFloatList(answers['Q2'], 3)

In [119]:
### Question 3

In [120]:
# Sorted copy of the targets; `y` itself is left untouched.
# NOTE(review): `y` covers the whole dataset, so the threshold is computed over
# train and test records together — confirm that is intended.
y2 = sorted(y)
perc90 = y2[int(len(y2)*0.9)] # 90th percentile

In [261]:
# Q3a: drop training rows whose hours exceed the 90th percentile, then refit.
kept_3a = [(feat, hrs) for feat, hrs in zip(X_train, y_train) if hrs <= perc90]
X3a = [pair[0] for pair in kept_3a]
y3a = [pair[1] for pair in kept_3a]

mod3a = linear_model.LinearRegression(fit_intercept=True).fit(X3a, y3a)
pred3a = mod3a.predict(X_test)  # evaluated on the full, unfiltered test set

In [262]:
# Strict under-/over-prediction counts for the outlier-trimmed model.
# Summing literal 1s keeps the totals native Python ints.
under3a = sum(1 for actual, predicted in zip(y_test, pred3a) if predicted < actual)
over3a = sum(1 for actual, predicted in zip(y_test, pred3a) if predicted > actual)

under3a, over3a

(13084, 21916)

In [123]:
# etc. for 3b and 3c

In [124]:
# Q3b: use the dataset's pre-computed `hours_transformed` field as the target.
y_train3b = [rec['hours_transformed'] for rec in dataTrain]
y_test3b = [rec['hours_transformed'] for rec in dataTest]

In [125]:
# Same review-length feature, transformed-hours target.
model_b = linear_model.LinearRegression().fit(X_train, y_train3b)
pred3b = model_b.predict(X_test)

In [126]:
# Strict under-/over-prediction counts against the transformed test targets.
under3b = sum(1 for actual, predicted in zip(y_test3b, pred3b) if predicted < actual)
over3b = sum(1 for actual, predicted in zip(y_test3b, pred3b) if predicted > actual)

In [127]:
# Display the Q3b counts.
under3b, over3b

(15941, 19059)

In [258]:
# Q3c: reuse the intercept of `model` and pick the slope so that the line passes
# through (median review length, median hours) of the training split.
# NOTE(review): `model` is presumably still the Q2 regression (its last
# assignment in notebook order) — confirm before re-running cells out of order.
median_review_length = np.median([len(d['text']) for d in dataTrain])
median_hours = np.median(y_train)

theta_0c = model.intercept_
theta_1c = (median_hours - theta_0c) / median_review_length

pred3c = [theta_0c + theta_1c * features[0] for features in X_test]

# Strict comparisons: exact ties count as neither under nor over.
under3c = sum(1 for actual, predicted in zip(y_test, pred3c) if predicted < actual)
over3c = sum(1 for actual, predicted in zip(y_test, pred3c) if predicted > actual)

under3c, over3c

(20808, 14192)

In [129]:
# Under/over counts for variants a, b and c, in that order.
answers['Q3'] = [under3a, over3a, under3b, over3b, under3c, over3c]

In [130]:
# Q3 must hold exactly six float-convertible values.
assertFloatList(answers['Q3'], 6)

In [131]:
### Question 4

In [135]:
from sklearn.metrics import confusion_matrix

In [132]:
# Binary labels: 1 when hours are strictly above the training-set median, else 0.
y_train4 = [int(hours > median_hours) for hours in y_train]
y_test4 = [int(hours > median_hours) for hours in y_test]

In [136]:
# Q4: logistic regression on review length predicting above-median play time.
mod = linear_model.LogisticRegression(C=1).fit(X_train, y_train4)
predictions4 = mod.predict(X_test) # Binary vector of predictions

In [141]:
# For binary 0/1 labels sklearn's matrix is [[TN, FP], [FN, TP]] (rows = true
# class, columns = predicted class), so ravel() yields TN, FP, FN, TP.
TN, FP, FN, TP = confusion_matrix(y_test4, predictions4).ravel()

In [142]:
# Balanced error rate: the average of the two per-class error rates.
fnr = FN / (FN + TP)  # share of actual positives that were missed
fpr = FP / (FP + TN)  # share of actual negatives that were flagged
BER = 0.5 * (fpr + fnr)

In [146]:
# Cast numpy scalars to native ints/floats before storing.
answers['Q4'] = [int(TP), int(TN), int(FP), int(FN), float(BER)]

In [144]:
# Q4 must hold exactly five float-convertible values.
assertFloatList(answers['Q4'], 5)

In [None]:
### Question 5

In [150]:
# Q5 reuses the false-positive / false-negative counts from the Q4 confusion matrix.
answers['Q5'] = [int(FP), int(FN)]

In [None]:
# Q5 must hold exactly two float-convertible values.
assertFloatList(answers['Q5'], 2)

In [None]:
### Question 6

In [155]:
def compute_ber(X_train, y_train, X_test, y_test):
    """Fit a logistic regression (C=1) and return its balanced error rate.

    Parameters
    ----------
    X_train, y_train : training features and binary (0/1) labels.
    X_test, y_test   : evaluation features and binary (0/1) labels.

    Returns
    -------
    float-like
        (FPR + FNR) / 2 on the test data. A rate whose denominator is zero
        (no negatives / no positives in `y_test`) is taken as 0.
    """
    model = linear_model.LogisticRegression(C=1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Pin the label order so the matrix is always 2x2. Without `labels`, a test
    # slice in which only one class appears yields a 1x1 matrix and the
    # four-way unpacking below raises ValueError — which would also make the
    # zero-denominator guards unreachable.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

    negatives = fp + tn
    positives = fn + tp
    fpr = fp / negatives if negatives > 0 else 0
    fnr = fn / positives if positives > 0 else 0
    ber = (fpr + fnr) / 2
    return ber

In [156]:
def _review_year(record):
    """Return the four-digit year at the start of the record's 'date' string."""
    return int(record['date'][:4])


def _length_features_and_labels(records):
    """Build review-length features and above-median (1/0) labels for `records`."""
    features = [[len(r['text'])] for r in records]
    labels = [1 if r['hours'] > median_hours else 0 for r in records]
    return features, labels


# Reviews dated 2014 or earlier.
X2014, y2014 = _length_features_and_labels(
    [r for r in dataTrain if _review_year(r) <= 2014])
X2014test, y2014test = _length_features_and_labels(
    [r for r in dataTest if _review_year(r) <= 2014])

# Reviews dated 2015 or later.
X2015, y2015 = _length_features_and_labels(
    [r for r in dataTrain if _review_year(r) >= 2015])
X2015test, y2015test = _length_features_and_labels(
    [r for r in dataTest if _review_year(r) >= 2015])

In [158]:
# A: train <=2014, test <=2014    B: train >=2015, test >=2015
# C: train <=2014, test >=2015    D: train >=2015, test <=2014
BER_A = compute_ber(X2014, y2014, X2014test, y2014test)
BER_B = compute_ber(X2015, y2015, X2015test, y2015test)
BER_C = compute_ber(X2014, y2014, X2015test, y2015test)
BER_D = compute_ber(X2015, y2015, X2014test, y2014test)

BER_A, BER_B, BER_C, BER_D

(np.float64(0.4799670470952742),
 np.float64(0.47394608476712863),
 np.float64(0.4820528229832485),
 np.float64(0.4722496441821391))

In [159]:
# Cast the four balanced error rates to native floats before storing.
answers['Q6'] = [float(BER_A), float(BER_B), float(BER_C), float(BER_D)]

In [160]:
# Q6 must hold exactly four float-convertible values.
assertFloatList(answers['Q6'], 4)

In [None]:
### Question 7

In [178]:
# Inspect one training record (field names and the date format).
dataTrain[5]

{'hours': 1.7,
 'gameID': 'g25723374',
 'hours_transformed': 1.4329594072761063,
 'early_access': False,
 'date': '2015-01-17',
 'text': 'Never knew a guns had THAT many parts!',
 'userID': 'u01499286'}

In [202]:
# Interaction indexes built from the training split only.
usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated
reviewsPerUser = defaultdict(dict) # user -> {item: hours_transformed}
reviewsPerItem = defaultdict(dict) # item -> {user: hours_transformed}

for d in dataTrain:
    user = d['userID']
    item = d['gameID']
    review = d['hours_transformed']
    # The review year was previously parsed here on every record but never
    # used; the time-aware predictor builds its own year indexes.

    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    # If a (user, item) pair occurs more than once, the later record wins.
    reviewsPerUser[user][item] = review
    reviewsPerItem[item][user] = review

In [203]:
def jaccard_similarity(set1, set2):
    """Return |set1 ∩ set2| / |set1 ∪ set2|, or 0 when both sets are empty."""
    union_size = len(set1.union(set2))
    if union_size == 0:
        return 0
    shared = len(set1.intersection(set2))
    return shared / union_size

In [204]:
# Q7: rank every other training user by Jaccard similarity (over played items)
# to the user of the first training record.
first_user = dataTrain[0]['userID']
first_user_items = itemsPerUser[first_user]

# Stable descending sort: ties keep the iteration order of itemsPerUser.
similarities = sorted(
    ((other, jaccard_similarity(first_user_items, their_items))
     for other, their_items in itemsPerUser.items()
     if other != first_user),
    key=lambda pair: pair[1],
    reverse=True,
)
top_10_similar = similarities[:10]

# Scores of the most similar and the tenth most similar user.
first = top_10_similar[0][1]
tenth = top_10_similar[9][1]

In [205]:
# Display the top and tenth similarity scores.
first, tenth

(0.10909090909090909, 0.08235294117647059)

In [206]:
# Similarity of the most similar and the tenth most similar user.
answers['Q7'] = [first, tenth]

In [207]:
# Q7 must hold exactly two float-convertible values.
assertFloatList(answers['Q7'], 2)

In [208]:
### Question 8

In [209]:
# Mean transformed hours over the training split; the similarity-based
# predictors return it when they have no usable neighbours.
avg_hrs = np.mean([d['hours_transformed'] for d in dataTrain])

In [210]:
def predict_hrs_user(user, item):
    """Predict transformed hours for (user, item) from other users of `item`.

    Returns the Jaccard-weighted average of the transformed hours that the
    other training users logged on `item`, where the weight is the similarity
    between their item set and `user`'s item set. Falls back to the global
    mean `avg_hrs` when the weights sum to zero (no other users, or no overlap).
    """
    # Use .get() rather than indexing: these indexes are defaultdicts, and
    # indexing an unseen test user/item would silently insert an empty entry
    # into the training indexes during evaluation.
    user_items = itemsPerUser.get(user, set())  # loop-invariant, looked up once
    num = 0
    den = 0
    for other in usersPerItem.get(item, ()):
        if other == user:
            continue
        similarity = jaccard_similarity(user_items, itemsPerUser[other])
        num += reviewsPerUser[other][item] * similarity
        den += similarity
    return num / den if den != 0 else avg_hrs

def predict_hrs_item(user, item):
    """Predict transformed hours for (user, item) from the user's other items.

    Returns the Jaccard-weighted average of the transformed hours `user`
    logged on their other training items, where the weight is the similarity
    between each item's user set and `item`'s user set. Falls back to the
    global mean `avg_hrs` when the weights sum to zero.
    """
    # Use .get() rather than indexing: these indexes are defaultdicts, and
    # indexing an unseen test user/item would silently insert an empty entry
    # into the training indexes during evaluation.
    item_users = usersPerItem.get(item, set())  # loop-invariant, looked up once
    num = 0
    den = 0
    for other in itemsPerUser.get(user, ()):
        if other == item:
            continue
        similarity = jaccard_similarity(item_users, usersPerItem[other])
        num += reviewsPerItem[other][user] * similarity
        den += similarity
    return num / den if den != 0 else avg_hrs

def calculate_mse(predictor, test_data):
    """Return the mean squared error of `predictor` over `test_data`.

    `predictor` is called as predictor(userID, gameID) for each record, and
    its result is compared with the record's 'hours_transformed' value.
    """
    squared_errors = [
        (row['hours_transformed'] - predictor(row['userID'], row['gameID'])) ** 2
        for row in test_data
    ]
    return np.mean(squared_errors)

# Test-set MSE of the user-based and item-based similarity predictors.
MSEU = calculate_mse(predict_hrs_user, dataTest)
MSEI = calculate_mse(predict_hrs_item, dataTest)

In [211]:
# Display both Q8 errors.
MSEU, MSEI

(np.float64(3.281076845941175), np.float64(4.915274596519426))

In [212]:
# Cast to native floats before storing.
answers['Q8'] = [float(MSEU), float(MSEI)]

In [213]:
# Q8 must hold exactly two float-convertible values.
assertFloatList(answers['Q8'], 2)

In [214]:
### Question 9

In [244]:
# Review year for every (user, item) pair. Built from the full dataset (train
# and test) so the year of a test pair can be looked up at prediction time.
reviewYearUserItem = defaultdict(dict)
reviewYearItemUser = defaultdict(dict)
for rec in dataset:
    review_year = int(rec['date'][:4])
    reviewYearUserItem[rec['userID']][rec['gameID']] = review_year
    reviewYearItemUser[rec['gameID']][rec['userID']] = review_year

In [246]:
def predict_hrs_user_time(user, item):
    """User-based prediction of transformed hours with a time-decay weight.

    Each other training user of `item` contributes their transformed hours,
    weighted by Jaccard similarity to `user` times exp(-|year difference|)
    between the two reviews of `item`. Falls back to the global mean
    `avg_hrs` when the weights sum to zero.

    Raises KeyError if the year of the (user, item) review is not in
    `reviewYearUserItem`.
    """
    target_year = reviewYearUserItem[user][item]

    # Use .get() rather than indexing: these indexes are defaultdicts, and
    # indexing an unseen test user/item would silently insert an empty entry
    # into the training indexes during evaluation.
    user_items = itemsPerUser.get(user, set())  # loop-invariant, looked up once

    numerator = 0
    denominator = 0
    for other in usersPerItem.get(item, ()):
        if other == user:
            continue
        similarity = jaccard_similarity(user_items, itemsPerUser[other])
        year_diff = abs(target_year - reviewYearUserItem[other][item])
        time_weight = np.exp(-year_diff)  # reviews from the same year get weight 1
        numerator += reviewsPerUser[other][item] * similarity * time_weight
        denominator += similarity * time_weight

    return numerator / denominator if denominator != 0 else avg_hrs

# Test-set MSE of the time-weighted user-based predictor.
MSE9 = calculate_mse(predict_hrs_user_time, dataTest)

In [247]:
# Display the Q9 error.
MSE9

np.float64(3.310021092937096)

In [248]:
# Q9 is a single native float, not a list.
answers['Q9'] = float(MSE9)

In [249]:
# Q9 must be float-convertible.
assertFloat(answers['Q9'])

In [250]:
# numpy scalars print as e.g. "np.float64(...)", so a type name appearing in the
# string form of `answers` means some value was not cast to a native type.
answers_repr = str(answers)
if any(type_name in answers_repr for type_name in ("float", "int")):
    print("it seems that some of your answers are not native python ints/floats;")
    print("the autograder will not be able to read your solution unless you convert them to ints/floats")

In [251]:
# Write the answers dict (its str() form) to the submission file. The context
# manager closes the handle even if the write raises.
with open("answers_midterm.txt", 'w') as f:
    f.write(str(answers) + '\n')