In [968]:
import random
from sklearn import linear_model
from matplotlib import pyplot as plt
from collections import defaultdict
import gzip
from datetime import date
import math

In [909]:
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

In [910]:
answers = {}

In [911]:
def parseData(fname):
    for l in open(fname):
        yield eval(l)

In [912]:
data = list(parseData("beer_50000.json"))
max_review_len = 0
for d in data:
    max_review_len = max(max_review_len,len(d['review/text']))

In [913]:
random.seed(0)
random.shuffle(data)

In [914]:
dataTrain = data[:25000]
dataValid = data[25000:37500]
dataTest = data[37500:]

In [915]:
yTrain = [d['beer/ABV'] > 7 for d in dataTrain]
yValid = [d['beer/ABV'] > 7 for d in dataValid]
yTest = [d['beer/ABV'] > 7 for d in dataTest]

In [916]:
def feature(datum,dict):
    a = [0] * len(dict)
    if datum['beer/style'] in dict:
        index = dict[datum['beer/style']]
        a[index] = 1 
    return [1] + a

def feature2(datum,dict):
    a = [0] * len(dict)    
    if datum['beer/style'] in dict:
        index = dict[datum['beer/style']]
        a[index] = 1 
    r = [datum['review/appearance'],datum['review/palate'],datum['review/taste'],datum['review/overall'],datum['review/aroma'],len(datum['review/text'])/max_review_len]
    
    return [1] + a + r

def feature_no_cat(datum):
    r = [datum['review/appearance'],datum['review/palate'],datum['review/taste'],datum['review/overall'],datum['review/aroma'],len(datum['review/text'])/max_review_len]
    return [1] + r

def feature_no_review(datum,dict):
    a = [0] * len(dict)
    if datum['beer/style'] in dict:
        index = dict[datum['beer/style']]
        a[index] = 1 
    return [1] + a + [len(datum['review/text'])/max_review_len]

def feature_no_length(datum,dict):
    a = [0] * len(dict)    
    if datum['beer/style'] in dict:
        index = dict[datum['beer/style']]
        a[index] = 1 
    r = [datum['review/appearance'],datum['review/palate'],datum['review/taste'],datum['review/overall'],datum['review/aroma']]
    return [1] + a + r

    
def getBER(pred,y):
    TP = sum([(a and b) for (a,b) in zip(pred,y)])
    TN = sum([(not a and not b) for (a,b) in zip(pred,y)])
    FP = sum([(a and not b) for (a,b) in zip(pred,y)])
    FN = sum([(not a and b) for (a,b) in zip(pred,y)])
    TPR = TP / (TP + FN)
    TNR = TN / (TN + FP)
    BER = 1 - 0.5 * (TPR + TNR)
    return BER
    

In [917]:
categoryCounts = defaultdict(int)
for d in data:
    categoryCounts[d['beer/style']] += 1

In [918]:
categories = [c for c in categoryCounts if categoryCounts[c] > 1000]

In [919]:
catID = dict(zip(list(categories),range(len(categories))))

In [920]:
xTrain = [feature(d,catID) for d in dataTrain]
xValid = [feature(d,catID) for d in dataValid]
xTest = [feature(d,catID) for d in dataTest]
mod = linear_model.LogisticRegression(C=10,class_weight='balanced')
mod.fit(xTrain,yTrain)
pred = mod.predict(xValid)
pred = pred.tolist()
validBER = getBER(pred,yValid)
pred = mod.predict(xTest)
pred = pred.tolist()
testBER = getBER(pred,yTest)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [921]:
def feat(d, includeCat = True, includeReview = True, includeLength = True):
    pass

In [922]:
def pipeline(reg, includeCat = True, includeReview = True, includeLength = True):
    pass

In [923]:
### Question 1

In [924]:
answers['Q1'] = [validBER, testBER]
answers

{'Q1': [0.16130237168160533, 0.1607838024608832]}

In [925]:
assertFloatList(answers['Q1'], 2)

In [926]:
### Question 2

In [927]:
xTrain = [feature2(d,catID) for d in dataTrain]
xValid = [feature2(d,catID) for d in dataValid]
xTest = [feature2(d,catID) for d in dataTest]
mod = linear_model.LogisticRegression(C=10,class_weight='balanced')
mod.fit(xTrain,yTrain)
pred = mod.predict(xValid)
pred = pred.tolist()
validBER = getBER(pred,yValid)
pred = mod.predict(xTest)
pred = pred.tolist()
testBER = getBER(pred,yTest)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [928]:
answers['Q2'] = [validBER, testBER]

In [929]:
assertFloatList(answers['Q2'], 2)

In [930]:
### Question 3

In [931]:
bestValidBER = float('inf')
bestTestBER = float('inf')
bestC = None
for c in [0.001, 0.01, 0.1, 1, 10]:
    xTrain = [feature2(d,catID) for d in dataTrain]
    xValid = [feature2(d,catID) for d in dataValid]
    xTest = [feature2(d,catID) for d in dataTest]
    mod = linear_model.LogisticRegression(C=c,class_weight='balanced')
    mod.fit(xTrain,yTrain)
    pred = mod.predict(xValid)
    pred = pred.tolist()
    validBER = getBER(pred,yValid)
    pred = mod.predict(xTest)
    pred = pred.tolist()
    testBER = getBER(pred,yTest)
    if validBER < bestValidBER:
        bestValidBER = validBER
        bestTestBER = testBER
        bestC = c

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [932]:
answers['Q3'] = [bestC, validBER, testBER]
answers['Q3']

[0.1, 0.14178602689235864, 0.1432448216318316]

In [933]:
assertFloatList(answers['Q3'], 3)

In [934]:
### Question 4

In [935]:
xTrain = [feature_no_cat(d) for d in dataTrain]
xValid = [feature_no_cat(d) for d in dataValid]
xTest = [feature_no_cat(d) for d in dataTest]
mod = linear_model.LogisticRegression(C=1,class_weight='balanced')
mod.fit(xTrain,yTrain)
pred = mod.predict(xValid)
pred = pred.tolist()
validBER = getBER(pred,yValid)
pred = mod.predict(xTest)
pred = pred.tolist()
testBER = getBER(pred,yTest)
testBER_noCat = testBER

In [936]:
xTrain = [feature_no_review(d,catID) for d in dataTrain]
xValid = [feature_no_review(d,catID) for d in dataValid]
xTest = [feature_no_review(d,catID) for d in dataTest]
mod = linear_model.LogisticRegression(C=1,class_weight='balanced')
mod.fit(xTrain,yTrain)
pred = mod.predict(xValid)
pred = pred.tolist()
validBER = getBER(pred,yValid)
pred = mod.predict(xTest)
pred = pred.tolist()
testBER = getBER(pred,yTest)
testBER_noReview = testBER

In [937]:
xTrain = [feature_no_length(d,catID) for d in dataTrain]
xValid = [feature_no_length(d,catID) for d in dataValid]
xTest = [feature_no_length(d,catID) for d in dataTest]
mod = linear_model.LogisticRegression(C=1,class_weight='balanced')
mod.fit(xTrain,yTrain)
pred = mod.predict(xValid)
pred = pred.tolist()
validBER = getBER(pred,yValid)
pred = mod.predict(xTest)
pred = pred.tolist()
testBER = getBER(pred,yTest)
testBER_noLength = testBER

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [938]:
answers['Q4'] = [testBER_noCat, testBER_noReview, testBER_noLength]
answers['Q4']

[0.3138624152215086, 0.16109632033831978, 0.14635254958101407]

In [939]:
assertFloatList(answers['Q4'], 3)

In [940]:
### Question 5

In [941]:
path = "amazon_reviews_us_Musical_Instruments_v1_00.tsv"
f = open(path, 'rt', encoding="utf8")

header = f.readline()
header = header.strip().split('\t')

In [942]:
header

['marketplace',
 'customer_id',
 'review_id',
 'product_id',
 'product_parent',
 'product_title',
 'product_category',
 'star_rating',
 'helpful_votes',
 'total_votes',
 'vine',
 'verified_purchase',
 'review_headline',
 'review_body',
 'review_date']

In [943]:
dataset = []

pairsSeen = set()

for line in f:
    fields = line.strip().split('\t')
    d = dict(zip(header, fields))
    ui = (d['customer_id'], d['product_id'])
    if ui in pairsSeen:
        print("Skipping duplicate user/item:", ui)
        continue
    pairsSeen.add(ui)
    d['star_rating'] = int(d['star_rating'])
    d['helpful_votes'] = int(d['helpful_votes'])
    d['total_votes'] = int(d['total_votes'])
    dataset.append(d)


Skipping duplicate user/item: ('46953315', 'B00QM3CNN6')
Skipping duplicate user/item: ('31616428', 'B0026RB0G8')
Skipping duplicate user/item: ('47240912', 'B008I653SC')
Skipping duplicate user/item: ('14503091', 'B003FRMRC4')
Skipping duplicate user/item: ('38538360', 'B00HVLUR86')
Skipping duplicate user/item: ('43448024', 'B00HVLUR86')
Skipping duplicate user/item: ('51525270', 'B00HVLUR86')
Skipping duplicate user/item: ('20652160', 'B004OU2IQG')
Skipping duplicate user/item: ('10964440', 'B00HVLUR86')
Skipping duplicate user/item: ('20043677', 'B00HVLUR86')
Skipping duplicate user/item: ('44796499', 'B00HVLUSGM')
Skipping duplicate user/item: ('29066899', 'B0002CZSYO')
Skipping duplicate user/item: ('10385056', 'B004OU2IQG')
Skipping duplicate user/item: ('1658551', 'B00HVLURL8')
Skipping duplicate user/item: ('907433', 'B00N9Q2E5G')
Skipping duplicate user/item: ('39412969', 'B00HVLUR86')
Skipping duplicate user/item: ('4901688', 'B00HVLUR86')
Skipping duplicate user/item: ('234

In [944]:
dataTrain = dataset[:int(len(dataset)*0.9)]
dataTest = dataset[int(len(dataset)*0.9):]
dataTrain[0]

{'marketplace': 'US',
 'customer_id': '45610553',
 'review_id': 'RMDCHWD0Y5OZ9',
 'product_id': 'B00HH62VB6',
 'product_parent': '618218723',
 'product_title': 'AGPtek® 10 Isolated Output 9V 12V 18V Guitar Pedal Board Power Supply Effect Pedals with Isolated Short Cricuit / Overcurrent Protection',
 'product_category': 'Musical Instruments',
 'star_rating': 3,
 'helpful_votes': 0,
 'total_votes': 1,
 'vine': 'N',
 'verified_purchase': 'N',
 'review_headline': 'Three Stars',
 'review_body': 'Works very good, but induces ALOT of noise.',
 'review_date': '2015-08-31'}

In [945]:
# Feel free to keep or discard

usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated
itemNames = {}
ratingDict = {} # To retrieve a rating for a specific user/item pair
reviewsPerUser = defaultdict(list)

for d in dataset:
    user,item = d['customer_id'], d['product_id']
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    ratingDict[(user,item)] = d['star_rating']
    itemNames[item] = d['product_title']

In [946]:
userAverages = {}
itemAverages = {}

for u in itemsPerUser:
    rs = [ratingDict[(u,i)] for i in itemsPerUser[u]]
    userAverages[u] = sum(rs) / len(rs)
    
for i in usersPerItem:
    rs = [ratingDict[(u,i)] for u in usersPerItem[i]]
    itemAverages[i] = sum(rs) / len(rs)
ratingMean = sum([d['star_rating'] for d in dataset]) / len(dataset)

In [947]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

In [948]:
def mostSimilar(i, N):
    similarities = []
    users = usersPerItem[i]
    for i2 in usersPerItem:
        if i2 == i: continue
        sim = Jaccard(users, usersPerItem[i2])
        #sim = Pearson(i, i2) # Could use alternate similarity metrics straightforwardly
        similarities.append((sim,i2))
    similarities.sort(reverse=True)
    return similarities[:N]

In [949]:
query = 'B00KCHRKD6'

In [950]:
ms = mostSimilar(query, 10)

In [951]:
answers['Q5'] = ms
answers['Q5']

[(0.015228426395939087, 'B00H7NFDKA'),
 (0.014492753623188406, 'B00QKVV3HC'),
 (0.014492753623188406, 'B00GXRMD7W'),
 (0.014084507042253521, 'B00H7ILRRI'),
 (0.014084507042253521, 'B0057RUMPO'),
 (0.014084507042253521, 'B000B6DTYW'),
 (0.013888888888888888, 'B00L2708TI'),
 (0.013513513513513514, 'B009Z1KKWI'),
 (0.013333333333333334, 'B003F2BDZQ'),
 (0.013333333333333334, 'B000VYINCW')]

In [952]:
assertFloatList([m[0] for m in ms], 10)

In [953]:
### Question 6
usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated
itemNames = {}
ratingDict = {} # To retrieve a rating for a specific user/item pair
reviewsPerUser = defaultdict(list)

for d in dataTrain:
    user,item = d['customer_id'], d['product_id']
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    ratingDict[(user,item)] = d['star_rating']
    itemNames[item] = d['product_title']
    reviewsPerUser[user].append(d)
    
userAverages = {}
itemAverages = {}

for u in itemsPerUser:
    rs = [ratingDict[(u,i)] for i in itemsPerUser[u]]
    userAverages[u] = sum(rs) / len(rs)
    
for i in usersPerItem:
    rs = [ratingDict[(u,i)] for u in usersPerItem[i]]
    itemAverages[i] = sum(rs) / len(rs)
    
ratingMean = sum([d['star_rating'] for d in dataTrain]) / len(dataTrain)

In [954]:
def MSE(pred, labels):
    differences = [(x-y)**2 for x,y in zip(pred,labels)]
    return sum(differences) / len(differences)

In [955]:
def predictRating(user,item):
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['product_id']
        if i2 == item: continue
        ratings.append(d['star_rating'] - itemAverages[i2])
        similarities.append(Jaccard(usersPerItem[item],usersPerItem[i2]))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return itemAverages[item] + sum(weightedRatings) / sum(similarities)
    elif item not in itemAverages:
        return ratingMean 
    else:
        # User hasn't rated any similar items
        return itemAverages[item]

In [956]:
alwaysPredictMean = [ratingMean for d in dataTest]

In [957]:
simPredictions = [predictRating(d['customer_id'], d['product_id']) for d in dataTest]

In [958]:
labels = [d['star_rating'] for d in dataTest]

In [959]:
answers['Q6'] = MSE(simPredictions, labels)
answers['Q6']

1.7165666373341593

In [960]:
assertFloat(answers['Q6'])

In [961]:
### Question 7

In [995]:
for d in dataTest:
    arr = d['review_date'].split('-')
    year, month, day = arr[0], arr[1], arr[2]
    t = date(int(year), int(month), int(day))
    timeDict[(d['customer_id'], d['product_id'])] = t
    
def predictRating(user,item):
    ratings = []
    similarities = []
    times = []
    for d in reviewsPerUser[user]:
        i2 = d['product_id']
        if i2 == item: continue
        ratings.append(d['star_rating'] - itemAverages[i2])
        similarities.append(Jaccard(usersPerItem[item],usersPerItem[i2]))
        arr = d['review_date'].split('-')
        year, month, day = arr[0], arr[1], arr[2]
        t = date(int(year), int(month), int(day))
        times.append(t)
    if (sum(similarities) > 0):
        t = timeDict[(user,item)]
        weightedRatings = [x*y*abs((t-z).seconds) for x,y,z in zip(ratings,similarities,times)]
        return itemAverages[item] + sum(weightedRatings) / sum([y*abs((t-z).seconds+1) for y,z in zip(similarities,times)])
    elif item not in itemAverages:
        return ratingMean 
    else:
        # User hasn't rated any similar items
        return itemAverages[item]
simPredictions = [predictRating(d['customer_id'], d['product_id']) for d in dataTest]
labels = [d['star_rating'] for d in dataTest]
itsMSE =  MSE(simPredictions, labels)

In [997]:
answers['Q7'] = ["Items that are reviewed more closely with each other in terms of time holds more weight. My function is f(|t(u,i) − t(u,j)|)", itsMSE]
answers['Q7']

['Items that are reviewed more closely with each other in terms of time holds more weight. My function is f(|t(u,i) − t(u,j)|)',
 1.6993689339769356]

In [998]:
assertFloat(answers['Q7'][1])

In [999]:
f = open("answers_hw2.txt", 'w')
f.write(str(answers) + '\n')
f.close()