In [155]:
import json
import gzip
import math
from collections import defaultdict
import numpy as np
from sklearn import linear_model

In [156]:
# This will suppress any warnings, comment out if you'd like to preserve them
import warnings
warnings.filterwarnings("ignore")

In [157]:
# Check formatting of submissions
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

In [158]:
answers = {}

In [159]:
f = open("spoilers.json.gz", 'r')

In [160]:
dataset = []
for l in f:
    d = eval(l)
    dataset.append(d)

In [161]:
f.close()

In [162]:
# A few utility data structures
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)

for d in dataset:
    u,i = d['user_id'],d['book_id']
    reviewsPerUser[u].append(d)
    reviewsPerItem[i].append(d)

# Sort reviews per user by timestamp
for u in reviewsPerUser:
    reviewsPerUser[u].sort(key=lambda x: x['timestamp'])
    
# Same for reviews per item
for i in reviewsPerItem:
    reviewsPerItem[i].sort(key=lambda x: x['timestamp'])

In [163]:
# E.g. reviews for this user are sorted from earliest to most recent
[d['timestamp'] for d in reviewsPerUser['b0d7e561ca59e313b728dc30a5b1862e']]

['2012-03-13',
 '2013-05-06',
 '2013-09-03',
 '2015-04-05',
 '2016-02-10',
 '2016-05-29']

In [164]:
# check if user-item combo is unique
user_items = set()
for d in dataset:
  user_items.add((d['user_id'], d['book_id']))

len(user_items) == len(dataset)

True

In [165]:
### 1a

In [166]:
dataset[0]
for user in reviewsPerUser.keys():
  print(reviewsPerUser[user][0])
  break

{'user_id': 'b0d7e561ca59e313b728dc30a5b1862e', 'timestamp': '2012-03-13', 'review_sentences': [[0, 'The Grapes of Wrath is set during the Great Depression when times were terribly hard for the farmers in the Dust Bowl.'], [0, 'Drought, inability to pay back loans, and the movement of large agricultural companies to take over the small farms all led to a bad economic situation.'], [0, 'The Joads can no longer farm in Oklahoma, and they have piled their possessions on top of an old truck and headed down Route 66 to California.'], [0, 'They are hoping for high pay picking crops, but there are so many workers heading west that the owners of the large farms are only giving them a pittance.'], [0, 'People are starving and dying while the corporate farmers are in collusion with the police to arrest anyone who objects or tries to unionize.'], [0, 'A bright spot is their stay at a federal camp operated by a New Deal agency that helps the migrant workers.'], [0, 'They are in a heartbreaking sit

In [167]:
def q_1a(user):
  ratings = [d['rating'] for d in reviewsPerUser[user]]
  if len(ratings) ==1:
    return [np.nan,np.nan]
  return [ratings[-1],np.mean(ratings[:-1])]

def q_1b(item):
  ratings = [d['rating'] for d in reviewsPerItem[item]]
  if len(ratings) ==1:
    return [np.nan,np.nan]
  return [ratings[-1],np.mean(ratings[:-1])]
  
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [168]:
y = [q_1a(d) for d in reviewsPerUser]
y_user = [i[0] for i in y if i!=[np.nan,np.nan]]
y_pred_user = [i[1] for i in y if i!=[np.nan,np.nan]]

In [169]:
answers['Q1a'] = MSE(y_user,y_pred_user)

In [170]:
assertFloat(answers['Q1a'])

In [171]:
answers

{'Q1a': 1.970416294395752}

In [172]:
### 1b

In [173]:
y = [q_1b(d) for d in reviewsPerItem]
y_user = [i[0] for i in y if i!=[np.nan,np.nan]]
y_pred_user = [i[1] for i in y if i!=[np.nan,np.nan]]

In [174]:
answers['Q1b'] = MSE(y_user,y_pred_user)

In [175]:
assertFloat(answers['Q1b'])

In [176]:
answers

{'Q1a': 1.970416294395752, 'Q1b': 2.051966103395068}

In [177]:
### 2

In [178]:
def avg_N(user, N):
  ratings = [d['rating'] for d in reviewsPerUser[user]]
  if len(ratings) == 1:
    return [np.nan, np.nan]
  if len(ratings) <= N:
    return [ratings[-1],np.mean(ratings[:-1])]
  return [ratings[-1],np.mean(ratings[-(N+1):-1])]
 

In [179]:
answers['Q2'] = []

for N in [1,2,3]:
    y = [avg_N(d,N) for d in reviewsPerUser]
    y_user = [i[0] for i in y if i!=[np.nan,np.nan]]
    y_pred_user = [i[1] for i in y if i!=[np.nan,np.nan]]
    answers['Q2'].append(MSE(y_user,y_pred_user))

In [180]:
assertFloatList(answers['Q2'], 3)

In [181]:
answers

{'Q1a': 1.970416294395752,
 'Q1b': 2.051966103395068,
 'Q2': [2.666035950804163, 2.1542691579943236, 2.0280931357090237]}

In [182]:
### 3a

In [199]:
def feature3(N, u): # For a user u and a window size of N
    ratings = [d['rating'] for d in reviewsPerUser[u]]
    feat = [1]
    last_N = ratings[-(N+1):-1][::-1]
    return feat + last_N


In [190]:
answers

{'Q1a': 1.970416294395752,
 'Q1b': 2.051966103395068,
 'Q2': [2.666035950804163, 2.1542691579943236, 2.0280931357090237],
 'Q3a': [[1, 4, 4], [1, 4, 4, 4]],
 'Q3b': []}

In [191]:
answers['Q3a'] = [feature3(2,dataset[0]['user_id']), feature3(3,dataset[0]['user_id'])]

In [192]:
assert len(answers['Q3a']) == 2
assert len(answers['Q3a'][0]) == 3
assert len(answers['Q3a'][1]) == 4

In [193]:
### 3b

In [200]:
answers['Q3b'] = []

for N in [1,2,3]:
    mod = linear_model.LinearRegression()
    X, y = [],[]
    for u in reviewsPerUser:
      ratings = [d['rating'] for d in reviewsPerUser[u]]
      x = feature3(N, u)
      r = ratings[-1]
      if len(x) <=N:
        continue
      X.append(x)
      y.append(r)
    print(len(X), len(y))
    mod.fit(X,y)
    mse = MSE(y, mod.predict(X))
    answers['Q3b'].append(mse)

4228 4228
3143 3143
2272 2272


In [201]:
assertFloatList(answers['Q3b'], 3)

In [202]:
answers

{'Q1a': 1.970416294395752,
 'Q1b': 2.051966103395068,
 'Q2': [2.666035950804163, 2.1542691579943236, 2.0280931357090237],
 'Q3a': [[1, 4, 4], [1, 4, 4, 4]],
 'Q3b': [1.5608319121482233, 1.540951237331577, 1.5396484853948436]}

In [None]:
### 4a

In [203]:
globalAverage = [d['rating'] for d in dataset]
globalAverage = sum(globalAverage) / len(globalAverage)

In [222]:
def featureMeanValue(N, u): # For a user u and a window size of N
    ratings = [d['rating'] for d in reviewsPerUser[u]]
    feat = []
    if len(ratings)==1:
      return feat + [globalAverage]*(N+1 - len(ratings))
    if len(ratings)<=N:
      avg = np.mean(ratings[:-1])
      diff = N+1 - len(ratings)
      return feat + feature3(N,u) + [avg]*diff 
    return feat + feature3(N,u)


In [223]:
def featureMissingValue(N, u):
    ratings = [d['rating'] for d in reviewsPerUser[u]]
    for i in range(1, len(feature3(N,u))):
      feat.append([0,feature3(N,u)[i]])
    if len(ratings) <=N:
      feat += [1,0]*(N+1 - len(ratings))
    return feat
    

In [224]:
answers['Q4a'] = [featureMeanValue(10, dataset[0]['user_id']), featureMissingValue(10, dataset[0]['user_id'])]

UnboundLocalError: ignored

In [225]:
assert len(answers['Q4a']) == 2
assert len(answers['Q4a'][0]) == 11
assert len(answers['Q4a'][1]) == 21

AssertionError: ignored

In [None]:
### 4b

In [None]:
answers['Q4b'] = []

for featFunc in [featureMeanValue, featureMissingValue]:
    # etc.
    answers['Q4b'].append(mse)

In [None]:
assertFloatList(answers["Q4b"], 2)

In [None]:
### 5

In [None]:
def feature5(sentence):
    feat = [1]
    f1 = len(sentence)
    f2 = sentence.count()
    f3 = 

In [14]:
y = []
X = []

for d in dataset:
    for spoiler,sentence in d['review_sentences']:
        X.append(feature5(sentence))
        y.append(spoiler)

NameError: ignored

In [None]:
answers['Q5a'] = X[0]

In [None]:
answers['Q5b'] = [TP, TN, FP, FN, BER]

In [None]:
assert len(answers['Q5a']) == 4
assertFloatList(answers['Q5b'], 5)

In [None]:
### 6

In [None]:
def feature6(review):
    

In [None]:
y = []
X = []

for d in dataset:
    sentences = d['review_sentences']
    if len(sentences) < 6: continue
    X.append(feature6(d))
    y.append(sentences[5][0])

#etc.

In [None]:
answers['Q6a'] = X[0]

In [None]:
answers['Q6b'] = BER

In [None]:
assert len(answers['Q6a']) == 9
assertFloat(answers['Q6b'])

In [None]:
### 7

In [18]:
# 50/25/25% train/valid/test split
Xtrain, Xvalid, Xtest = X[:len(X)//2], X[len(X)//2:(3*len(X))//4], X[(3*len(X))//4:]
ytrain, yvalid, ytest = y[:len(X)//2], y[len(X)//2:(3*len(X))//4], y[(3*len(X))//4:]

In [None]:
for c in [0.01, 0.1, 1, 10, 100]:
    # etc.

In [None]:
answers['Q7'] = bers + [bestC] + [ber]

In [None]:
assertFloatList(answers['Q7'], 7)

In [None]:
### 8

In [25]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

In [26]:
# 75/25% train/test split
print(len(dataset))
dataTrain = dataset[:15000]
dataTest = dataset[15000:]

20000


In [27]:
# A few utilities

itemAverages = defaultdict(list)
ratingMean = []
userAverages = defaultdict(list)

for d in dataTrain:
    itemAverages[d['book_id']].append(d['rating'])
    userAverages[d['user_id']].append(d['rating'])
    ratingMean.append(d['rating'])

for i in itemAverages:
    itemAverages[i] = sum(itemAverages[i]) / len(itemAverages[i])

for u in userAverages:
    userAverages[u] = sum(userAverages[u]) / len(userAverages[u])

ratingMean = sum(ratingMean) / len(ratingMean)

In [28]:
reviewsPerUser = defaultdict(list)
usersPerItem = defaultdict(set)
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated
ratingDict = {} # To retrieve a rating for a specific user/item pair
for d in dataTrain:
    u,i= d['user_id'], d['book_id']
    reviewsPerUser[u].append(d)
    usersPerItem[i].add(u)
    itemsPerUser[u].add(i)
    ratingDict[(u,i)] =d['rating']

In [12]:
# From my HW2 solution, welcome to reuse
# def predictRating(user,item):
#     ratings = []
#     similarities = []
#     for d in reviewsPerUser[user]:
#         i2 = d['book_id']
#         if i2 == item: continue
#         ratings.append(d['rating'] - itemAverages[i2])
#         similarities.append(Jaccard(usersPerItem[item],usersPerItem[i2]))
#     if (sum(similarities) > 0):
#         weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
#         return itemAverages[item] + sum(weightedRatings) / sum(similarities)
#     else:
#         # User hasn't rated any similar items
#         if item in itemAverages:
#             return itemAverages[item]
#         else:
#             return ratingMean
def predictRating(user,item):
  ratings = []
  similarities = []
  for d in usersPerItem[item]:
    #d is user
    if d == user: continue
    ratings.append(ratingDict[(d,item)]- userAverages[d])
    similarities.append(Jaccard(itemsPerUser[user],itemsPerUser[d]))
  if (sum(similarities) > 0):
    weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
    return itemAverages[item] + sum(weightedRatings) / sum(similarities)
  else:
    # User hasn't rated any similar items
    if item in itemAverages:
        return itemAverages[item]
    return ratingMean
 
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [13]:
predictions = [predictRating(d['user_id'], d['book_id']) for d in dataTest]
labels = [d['rating'] for d in dataTest]

In [14]:
answers["Q8"] = MSE(predictions, labels)

In [15]:
assertFloat(answers["Q8"])

In [16]:
answers

{'Q8': 1.8218848124540317}

In [17]:
### 9

In [29]:
# instances i was never in training set
items_in_train = set(usersPerItem.keys()) #items
items_in_test_only = []
r_items_in_test_only = []


for x in dataTest:
  if x['book_id'] not in items_in_train:
    items_in_test_only.append(x)
    r_items_in_test_only.append(x['rating'])
# instances i was less than 5 in training set
item_in_train_leq_5 = set()
item_in_train_more_5 = set()
for d in usersPerItem: #d is items
  if 1<=len(usersPerItem[d])<=5:
    item_in_train_leq_5.add(d)
  elif len(usersPerItem[d])>5:
    item_in_train_more_5.add(d)

items_in_test_lt_5_train = []
r_items_in_test_lt_5_train = []
items_in_test_more_5_train = []
r_items_in_test_more_5_train = []
for x in dataTest:
  if x['book_id'] in item_in_train_leq_5:
    items_in_test_lt_5_train.append(x)
    r_items_in_test_lt_5_train.append(x['rating'])
  elif x['book_id'] in item_in_train_more_5:
    items_in_test_more_5_train.append(x)
    r_items_in_test_more_5_train.append(x['rating'])

print(len(items_in_test_only),len(items_in_test_lt_5_train), len(items_in_test_more_5_train))

360 2860 1780


In [30]:
predictions = [predictRating(d['user_id'], d['book_id']) for d in items_in_test_only]
labels = [r for r in r_items_in_test_only]
mse0 = MSE(predictions, labels)

predictions = [predictRating(d['user_id'], d['book_id']) for d in items_in_test_lt_5_train]
labels = [r for r in r_items_in_test_lt_5_train]
mse1to5 = MSE(predictions, labels)

predictions = [predictRating(d['user_id'], d['book_id']) for d in items_in_test_more_5_train]
labels = [r for r in r_items_in_test_more_5_train]
mse5 = MSE(predictions, labels)


In [31]:
[mse0, mse1to5, mse5]

[1.742012484444442, 2.073713661093682, 1.4334148860349525]

In [32]:
answers["Q9"] = [mse0, mse1to5, mse5]

In [33]:
assertFloatList(answers["Q9"], 3)

In [89]:
### 10

In [38]:
def predictRating_coldstart(user,item):
    ratings = []
    similarities = []
    for d in usersPerItem[item]:
      #d is user
      if d == user: continue
      ratings.append(ratingDict[(d,item)]- userAverages[d])
      similarities.append(Jaccard(itemsPerUser[user],itemsPerUser[d]))
    if (sum(similarities) > 0):
      weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
      return itemAverages[item] + sum(weightedRatings) / sum(similarities)
    else:
      # User hasn't rated any similar items
      if item in itemAverages:
          return itemAverages[item]
      elif user in userAverages:
          return userAverages[user]
      return ratingMean
  


In [None]:
for d in items_in_test_only:
  if itemsPerUser[d['user_id']]:
    print(itemsPerUser[d['user_id']])


In [39]:
predictions = [predictRating_coldstart(d['user_id'], d['book_id']) for d in items_in_test_only]
labels = [r for r in r_items_in_test_only]
mse0 = MSE(predictions, labels)

In [42]:
itsMSE = mse0

In [43]:
answers["Q10"] = ("Since item does not exist in training set, we can just return average for that user", itsMSE)

In [None]:
assert type(answers["Q10"][0]) == str
assertFloat(answers["Q10"][1])

In [35]:
f = open("answers_midterm.txt", 'w')
f.write(str(answers) + '\n')
f.close()

In [133]:
answers

{'Q8': 1.774765822204667,
 'Q9': [1.8357575313080152, 2.0072803882444763, 1.406769789656592]}

In [34]:
answers

{'Q8': 1.8218848124540317,
 'Q9': [1.742012484444442, 2.073713661093682, 1.4334148860349525]}