In [55]:
import json
from collections import defaultdict
from sklearn import linear_model
import numpy
import random
import gzip
import dateutil.parser
import math

In [56]:
# Maps a question label (e.g. 'Q1') to its answer; serialized to a text file in the last cell.
answers = {}

In [57]:
def assertFloat(x):
    """Check that ``x`` can be converted to a float.

    ``float(x)`` itself raises (ValueError/TypeError) when ``x`` is not
    convertible; the assertion only confirms the converted type.
    """
    converted = float(x)
    assert isinstance(converted, float)

def assertFloatList(items, N):
    """Check that ``items`` has exactly ``N`` entries, each convertible to float.

    Raises AssertionError on a length mismatch; a non-convertible entry makes
    ``float()`` raise (ValueError/TypeError).
    """
    assert len(items) == N
    assert all(type(float(entry)) is float for entry in items)

In [58]:
### Question 1

In [59]:
# Load the review sample: the gzip archive holds one JSON object per line.
# The context manager closes the handle even if a line fails to parse.
with gzip.open("fantasy_10000.json.gz") as f:
    dataset = [json.loads(l) for l in f]

In [60]:
# Longest review text in the dataset (0 when the dataset is empty);
# used to scale review lengths in the feature functions.
max_len = max((len(review['review_text']) for review in dataset), default=0)

In [61]:
def feature(datum):
    """Return ``[1, length_ratio]`` for one review.

    ``1`` is the offset term; ``length_ratio`` is the review text length
    divided by the module-level ``max_len``.
    """
    return [1, len(datum['review_text']) / max_len]

In [62]:
# Design matrix (one feature row per review) and the rating targets.
X = list(map(feature, dataset))
Y = [review['rating'] for review in dataset]

In [63]:
# Ordinary least squares: rating ~ theta[0] + theta[1] * (scaled review length).
# rcond=None selects NumPy's machine-precision cutoff explicitly and avoids the
# FutureWarning that older NumPy versions emit when rcond is omitted.
theta, residuals, rank, s = numpy.linalg.lstsq(X, Y, rcond=None)
Y_pred = [theta[0] + theta[1] * x[1] for x in X]
# Compute the MSE from the predictions rather than from `residuals`: lstsq
# returns an empty residual array when the design matrix is rank-deficient,
# which would turn the mean into NaN.
MSE = float(numpy.mean([(p - y) ** 2 for p, y in zip(Y_pred, Y)]))

  theta, residuals, rank, s = numpy.linalg.lstsq(X,Y)


In [64]:
# Store built-in floats: `answers` is later written with str(), and NumPy 2
# renders NumPy scalars as "np.float64(...)" inside containers.
answers['Q1'] = [float(theta[0]), float(theta[1]), float(MSE)]

In [65]:
# Q1 must hold exactly three float-convertible values: theta[0], theta[1], MSE.
assertFloatList(answers['Q1'], 3)

In [66]:
### Question 2

In [67]:
# Attach [weekday, month] to every review, parsed from its 'date_added' string.
for review in dataset:
    added = dateutil.parser.parse(review['date_added'])
    review['parsed_date'] = [added.weekday(), added.month]

In [68]:
def feature(datum):
    """Return the 19-entry one-hot feature vector for one review.

    Layout: ``[1, length_ratio]`` + 6 weekday indicators + 11 month indicators.
    Weekday 0 and month 1 are the dropped reference categories, so they map to
    all-zero indicator groups. ``length_ratio`` is the review text length
    divided by the module-level ``max_len``.
    """
    length_ratio = len(datum['review_text']) / max_len
    weekday, month = datum['parsed_date']
    weekday_flags = [1 if weekday == k else 0 for k in range(1, 7)]
    month_flags = [1 if month == k else 0 for k in range(2, 13)]
    return [1, length_ratio] + weekday_flags + month_flags

In [69]:
# Rebuild the design matrix with the one-hot feature function; targets are the ratings.
X = list(map(feature, dataset))
Y = [review['rating'] for review in dataset]

In [70]:
# Record the feature vectors of the first two reviews.
answers['Q2'] = [X[0], X[1]]

In [71]:
# Each recorded vector must have 19 float-convertible entries
# (offset + length + 6 weekday indicators + 11 month indicators).
assertFloatList(answers['Q2'][0], 19)
assertFloatList(answers['Q2'][1], 19)

In [72]:
### Question 3

In [73]:
def feature3(datum):
    """Return ``[1, length_ratio, weekday, month]`` for one review.

    Unlike the one-hot ``feature``, the weekday and month values from
    ``datum['parsed_date']`` are used directly as numeric features.
    ``length_ratio`` is the review text length divided by ``max_len``.
    """
    length_ratio = len(datum['review_text']) / max_len
    weekday, month = datum['parsed_date']
    return [1, length_ratio, weekday, month]

In [74]:
# Design matrix with weekday/month used directly as numeric features, plus the rating targets.
X3 = list(map(feature3, dataset))
Y3 = [review['rating'] for review in dataset]

In [75]:
# Fit both models by least squares: (X, Y) uses the one-hot date encoding,
# (X3, Y3) uses weekday/month directly.
# rcond=None selects NumPy's machine-precision cutoff explicitly and avoids the
# FutureWarning that older NumPy versions emit when rcond is omitted.
theta, residuals, rank, s = numpy.linalg.lstsq(X, Y, rcond=None)
theta3, residuals3, rank3, s3 = numpy.linalg.lstsq(X3, Y3, rcond=None)
# Compute each MSE from the predictions: lstsq returns an empty residual array
# for a rank-deficient design (e.g. an indicator column that is all zeros),
# which would make numpy.mean(residuals) NaN.
mse2 = float(numpy.mean((numpy.dot(X, theta) - numpy.asarray(Y)) ** 2))
mse3 = float(numpy.mean((numpy.dot(X3, theta3) - numpy.asarray(Y3)) ** 2))

  theta, residuals, rank, s = numpy.linalg.lstsq(X,Y)
  theta3, residuals3, rank3, s3 = numpy.linalg.lstsq(X3,Y3)


In [76]:
# Store built-in floats: `answers` is later written with str(), and NumPy 2
# renders NumPy scalars as "np.float64(...)" inside containers.
answers['Q3'] = [float(mse2), float(mse3)]

In [77]:
# Q3 must hold exactly two float-convertible values (the two MSEs).
assertFloatList(answers['Q3'], 2)

In [78]:
### Question 4

In [79]:
# Seed the global RNG so the shuffle order is reproducible.
random.seed(0)
# Shuffles `dataset` in place; re-running this cell alone reshuffles the
# already-shuffled list, so re-run from the data-loading cell for a clean state.
random.shuffle(dataset)

In [80]:
# Recompute both feature matrices and the targets from the shuffled dataset.
X2 = list(map(feature, dataset))
X3 = list(map(feature3, dataset))
Y = [review['rating'] for review in dataset]

In [81]:
# 50/50 split: the first half of each sequence is training data, the rest is test data.
cut2, cut3, cutY = len(X2) // 2, len(X3) // 2, len(Y) // 2
train2, test2 = X2[:cut2], X2[cut2:]
train3, test3 = X3[:cut3], X3[cut3:]
trainY, testY = Y[:cutY], Y[cutY:]

In [82]:
# Fit on the TRAINING half only. The previous version fitted on the test half
# and reported that fit's own residuals, so the "test" MSE was really an
# in-sample error and train2/train3/trainY were never used.
# rcond=None selects NumPy's machine-precision cutoff explicitly and avoids the
# FutureWarning that older NumPy versions emit when rcond is omitted.
theta2, residuals2, rank2, s2 = numpy.linalg.lstsq(train2, trainY, rcond=None)
theta3, residuals3, rank3, s3 = numpy.linalg.lstsq(train3, trainY, rcond=None)
# Evaluate on the held-out half: mean squared difference between the
# predictions for the test rows and the true test ratings.
test_mse2 = float(numpy.mean((numpy.dot(test2, theta2) - numpy.asarray(testY)) ** 2))
test_mse3 = float(numpy.mean((numpy.dot(test3, theta3) - numpy.asarray(testY)) ** 2))

  theta2, residuals2, rank2, s2 = numpy.linalg.lstsq(test2,testY)
  theta3, residuals3, rank3, s3 = numpy.linalg.lstsq(test3,testY)


In [83]:
# Store built-in floats: `answers` is later written with str(), and NumPy 2
# renders NumPy scalars as "np.float64(...)" inside containers.
answers['Q4'] = [float(test_mse2), float(test_mse3)]

In [84]:
# Q4 must hold exactly two float-convertible values (the two test MSEs).
assertFloatList(answers['Q4'], 2)

In [85]:
### Question 5

In [86]:
# Load the beer reviews, one record per line. The context manager closes the
# file even if a line fails to evaluate.
# SECURITY NOTE(review): eval() executes whatever expression each line contains,
# so only run this on a trusted copy of the file. If every line is a plain
# Python literal, ast.literal_eval would be a safer parser -- confirm before switching.
with open("beer_50000.json") as f:
    dataset = [eval(l) for l in f]

In [87]:
# Features: a constant 1 plus the review text length.
X = [[1, len(beer['review/text'])] for beer in dataset]
# Label: True when the overall rating is at least 4.
y = [beer['review/overall'] >= 4 for beer in dataset]

In [88]:
# Class-balanced logistic regression, fitted and evaluated on the same data.
mod = linear_model.LogisticRegression(C=1.0, class_weight='balanced')
# fit() returns the fitted estimator, so prediction can be chained;
# tolist() turns the NumPy prediction array into a plain Python list.
pred = mod.fit(X, y).predict(X).tolist()

In [89]:
# Tally the confusion matrix in one pass, keyed by (predicted, actual).
confusion = defaultdict(int)
for predicted, actual in zip(pred, y):
    confusion[(bool(predicted), bool(actual))] += 1
TP = confusion[(True, True)]
TN = confusion[(False, False)]
FP = confusion[(True, False)]
FN = confusion[(False, True)]
# True-positive rate and true-negative rate.
TPR = TP / (TP + FN)
TNR = TN / (TN + FP)

In [90]:
# Balanced error rate: one minus the average of the true-positive and true-negative rates.
BER = 1 - (TPR + TNR) / 2

In [91]:
# Record the four confusion-matrix counts followed by the balanced error rate.
answers['Q5'] = [TP, TN, FP, FN, BER]

In [92]:
# Q5 must hold exactly five float-convertible values.
assertFloatList(answers['Q5'], 5)

In [93]:
### Question 6

In [94]:
# Rank the reviews by classifier confidence, highest score first.
scores = mod.decision_function(X)
# NOTE(review): whole (score, label) tuples are compared, so reviews with equal
# scores are ordered by label (True first under reverse=True). Confirm that this
# tie-breaking is intended for the precision@K numbers.
scores_labels = sorted(zip(scores, y), reverse=True)
sorted_labels = [label for _score, label in scores_labels]

In [95]:
# Collects precision@K values, in the order the K values are evaluated.
precs = []

In [96]:
# Precision@K: fraction of positive labels among the K highest-scored reviews.
# Rebinding the list (instead of appending to one created in another cell) keeps
# this cell idempotent: re-running it can no longer leave duplicate entries in
# precs and break the length check on the recorded answer.
precs = [sum(sorted_labels[:k]) / k for k in [1, 100, 1000, 10000]]

In [97]:
# Record the precision@K list (stored by reference, not copied).
answers['Q6'] = precs

In [98]:
# Q6 must hold exactly four float-convertible values, one per K.
assertFloatList(answers['Q6'], 4)

In [99]:
### Question 7

In [100]:
# Extend the features with the taste and aroma sub-ratings, then refit the
# same estimator object (`mod`) on the wider design matrix.
X = [
    [1, len(beer['review/text']), beer['review/taste'], beer['review/aroma']]
    for beer in dataset
]
pred = mod.fit(X, y).predict(X).tolist()

# Tally the confusion matrix in one pass, keyed by (predicted, actual).
confusion = defaultdict(int)
for predicted, actual in zip(pred, y):
    confusion[(bool(predicted), bool(actual))] += 1
TP = confusion[(True, True)]
TN = confusion[(False, False)]
FP = confusion[(True, False)]
FN = confusion[(False, True)]

# Balanced error rate from the true-positive and true-negative rates.
TPR = TP / (TP + FN)
TNR = TN / (TN + FP)
BER = 1 - (TPR + TNR) / 2

In [101]:
# Record a free-text description of the change together with the resulting balanced error rate.
answers['Q7'] = ["I added more features to X (review of aroma and taste) to increase the accuracy of the predictor", BER]

In [102]:
# Serialize all recorded answers as the string form of the dict, followed by a
# newline. The context manager flushes and closes the file even if the write fails.
with open("answers_hw1.txt", 'w') as f:
    f.write(str(answers) + '\n')