In [1]:
import gzip
import math
import numpy
import random
import sklearn
import string
from collections import defaultdict
from gensim.models import Word2Vec
from nltk.stem.porter import *
from sklearn import linear_model
from sklearn.manifold import TSNE
import dateutil

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

In [4]:
dataset = []

f = gzip.open("young_adult_20000.json.gz")
for l in f:
    d = eval(l)
    dataset.append(d)
    if len(dataset) >= 20000:
        break
        
f.close()

In [5]:
answers = {}

In [6]:
dataset[0]

{'user_id': 'dc3763cdb9b2cae805882878eebb6a32',
 'book_id': '18471619',
 'review_id': '66b2ba840f9bd36d6d27f46136fe4772',
 'rating': 3,
 'review_text': 'Sherlock Holmes and the Vampires of London \n Release Date: April 2014 \n Publisher: Darkhorse Comics \n Story by: Sylvain Cordurie \n Art by: Laci \n Colors by: Axel Gonzabo \n Cover by: Jean Sebastien Rossbach \n ISDN: 9781616552664 \n MSRP: $17.99 Hardcover \n "Sherlock Holmes died fighting Professor Moriarty in the Reichenbach Falls. \n At least, that\'s what the press claims. \n However, Holmes is alive and well and taking advantage of his presumed death to travel the globe. \n Unfortunately, Holmes\'s plans are thwarted when a plague of vampirism haunts Britain. \n This book collects Sherlock Holmes and the Vampires of London Volumes 1 and 2, originally created by French publisher Soleil." - Darkhorse Comics \n When I received this copy of "Sherlock Holmes and the Vampires of London" I was Ecstatic! The cover art was awesome and 

In [7]:
### Question 1

In [8]:
def mostCommonUnigrams():
    wordCount = defaultdict(int)
    punctuation = set(string.punctuation)
    for d in dataset:
        r = ''.join([c for c in d['review_text'].lower() if not c in punctuation])
        ws = r.split()
        for w in ws:
            wordCount[w] += 1

    counts = [(wordCount[w], w) for w in wordCount]
    counts.sort()
    counts.reverse()

    words = [x[1] for x in counts[:1000]]

    wordId = dict(zip(words, range(len(words))))
    wordSet = set(words)

    def feature(datum):
        feat = [0]*len(words)
        r = ''.join([c for c in datum['review_text'].lower() if not c in punctuation])
        ws = r.split()
        for w in ws:
            if w in words:
                feat[wordId[w]] += 1
        feat.append(1) #offset
        return feat

    X = [feature(d) for d in dataset]
    y = [d['rating'] for d in dataset]

    Xtrain = X[:10000]
    Xtext = X[10000:]
    ytrain = y[:10000]
    ytest = y[10000:]

    clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
    clf.fit(Xtrain, ytrain)
    theta = clf.coef_
    predictions = clf.predict(Xtext)

    wordSort = list(zip(theta[:-1], words))
    wordSort.sort()

    mse = sum((ytest - predictions)**2)/len(ytest)

    return mse, wordSort

In [9]:
def mostCommonBigrams():
    wordCount = defaultdict(int)
    punctuation = set(string.punctuation)
    for d in dataset:
        r = ''.join([c for c in d['review_text'].lower() if not c in punctuation])
        ws = r.split()
        ws2 = [' '.join(x) for x in list(zip(ws[:-1],ws[1:]))]
        for w in ws2:
            wordCount[w] += 1

    counts = [(wordCount[w], w) for w in wordCount]
    counts.sort()
    counts.reverse()

    words = [x[1] for x in counts[:1000]]

    wordId = dict(zip(words, range(len(words))))
    wordSet = set(words)

    def feature(datum):
        feat = [0]*len(words)
        r = ''.join([c for c in datum['review_text'].lower() if not c in punctuation])
        ws = r.split()
        ws2 = [' '.join(x) for x in list(zip(ws[:-1],ws[1:]))]
        for w in ws2:
            if w in words:
                feat[wordId[w]] += 1
        feat.append(1) #offset
        return feat

    X = [feature(d) for d in dataset]
    y = [d['rating'] for d in dataset]

    Xtrain = X[:10000]
    Xtext = X[10000:]
    ytrain = y[:10000]
    ytest = y[10000:]

    clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
    clf.fit(Xtrain, ytrain)
    theta = clf.coef_
    predictions = clf.predict(Xtext)

    wordSort = list(zip(theta[:-1], words))
    wordSort.sort()

    mse = sum((ytest - predictions)**2)/len(ytest)

    return mse, wordSort

In [10]:
def mostCommonBoth():
    wordCount = defaultdict(int)
    punctuation = set(string.punctuation)
    for d in dataset:
        r = ''.join([c for c in d['review_text'].lower() if not c in punctuation])
        ws = r.split()
        ws2 = [' '.join(x) for x in list(zip(ws[:-1],ws[1:]))]
        for w in ws + ws2:
            wordCount[w] += 1

    counts = [(wordCount[w], w) for w in wordCount]
    counts.sort()
    counts.reverse()

    words = [x[1] for x in counts[:1000]]

    wordId = dict(zip(words, range(len(words))))
    wordSet = set(words)

    def feature(datum):
        feat = [0]*len(words)
        r = ''.join([c for c in datum['review_text'].lower() if not c in punctuation])
        ws = r.split()
        ws2 = [' '.join(x) for x in list(zip(ws[:-1],ws[1:]))]
        for w in ws + ws2:
            if w in words:
                feat[wordId[w]] += 1
        feat.append(1) #offset
        return feat

    X = [feature(d) for d in dataset]
    y = [d['rating'] for d in dataset]

    Xtrain = X[:10000]
    Xtext = X[10000:]
    ytrain = y[:10000]
    ytest = y[10000:]

    clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
    clf.fit(Xtrain, ytrain)
    theta = clf.coef_
    predictions = clf.predict(Xtext)

    wordSort = list(zip(theta[:-1], words))
    wordSort.sort()

    mse = sum((ytest - predictions)**2)/len(ytest)

    return mse, wordSort

In [11]:
for q,wList in ('Q1a', mostCommonUnigrams), ('Q1b', mostCommonBigrams), ('Q1c', mostCommonBoth):
    mse, wordSort = wList()

    answers[q] = [float(mse), [x[1] for x in wordSort[:5]], [x[1] for x in wordSort[-5:]]]

print(answers['Q1a'])
print(answers['Q1b'])
print(answers['Q1c'])

[1.2390553477075859, ['boring', 'disappointing', 'says', 'worst', 'basically'], ['5', 'yourself', 'beautifully', 'mix', 'wait']]
[1.2930626118603759, ['tuned for', 'miss your', 'the worst', 'a bad', 'too many'], ['reviews as', '5 stars', 'stay tuned', 'cant wait', 'forget to']]
[1.2366939869514826, ['katies corner', 'share', 'what is', 'least', 'able to'], ['at least', 'excellent', 'wait', 'able', 'katies']]


In [12]:
for q in 'Q1a', 'Q1b', 'Q1c':
    assert len(answers[q]) == 3
    assertFloat(answers[q][0])
    assert [type(x) for x in answers[q][1]] == [str]*5
    assert [type(x) for x in answers[q][2]] == [str]*5

In [13]:
### Question 2

In [14]:
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in dataset:
    r = d['review_text']
    ws = r.split()
    for w in ws:
        wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

words = [x[1] for x in counts[:1000]]

df = defaultdict(int)
for d in dataset:
    r = d['review_text']
    for w in set(r.split()):
        df[w] += 1

rev = dataset[9]

tf = defaultdict(int)
r = rev['review_text']
for w in r.split():
    # Note = rather than +=, different versions of tf could be used instead
    tf[w] = 1
    
tfidf = dict(zip(words,[tf[w] * math.log2(len(dataset) / df[w]) for w in words]))
tfidfQuery = [tf[w] * math.log2(len(dataset) / df[w]) for w in words]

def Cosine(x1,x2):
    numer = 0
    norm1 = 0
    norm2 = 0
    for a1,a2 in zip(x1,x2):
        numer += a1*a2
        norm1 += a1**2
        norm2 += a2**2
    if norm1*norm2:
        return numer / math.sqrt(norm1*norm2)
    return 0

similarities = []
for rev2 in dataset:
    tf = defaultdict(int)
    r = rev2['review_text']
    for w in r.split():
        # Note = rather than +=
        tf[w] = 1
    tfidf2 = [tf[w] * math.log2(len(dataset) / df[w]) for w in words]
    similarities.append((Cosine(tfidfQuery, tfidf2), rev2['review_text']))

similarities.sort(reverse=True)
sim, review = similarities[0]

In [15]:
answers['Q2'] = [sim, review]
answers['Q2']

[1.0,
 'I checked this out of the library as their Valentine\'s Day "Blind Date with a Book" display, and I\'m glad I hooked up with this graphic novel. It\'s been decades since I read any of King\'s Dark Tower books, and although this jumped into the plot, I was able to catch up and enjoy this slice of the story. \n The art is really nice, a the narration/dialogue balance works. It made me want to re-read King\'s books, and it definitely made me excited for the upcoming film.']

In [16]:
assert len(answers['Q2']) == 2
assertFloat(answers['Q2'][0])
assert type(answers['Q2'][1]) == str

In [17]:
### Question 3

In [18]:
reviewsPerUser = defaultdict(list)

In [19]:
for d in dataset:
    reviewsPerUser[d['user_id']].append((dateutil.parser.parse(d['date_added']), d['book_id']))

In [20]:
reviewLists = []
for u in reviewsPerUser:
    rl = list(reviewsPerUser[u])
    rl.sort()
    reviewLists.append([x[1] for x in rl])

model10 = Word2Vec(reviewLists,
                 min_count=1, # Words/items with fewer instances are discarded
                 vector_size=10, # Model dimensionality
                 window=3, # Window size
                 sg=1) # Skip-gram model

In [21]:
reviewLists[0][0]

'18471619'

In [22]:
similarities = model10.wv.similar_by_word(reviewLists[0][0])[:5]
similarities

[('8497638', 0.9409928917884827),
 ('25032624', 0.9308857321739197),
 ('21519210', 0.8949795365333557),
 ('22752448', 0.8882659673690796),
 ('5497136', 0.8640972375869751)]

In [23]:
answers['Q3'] = similarities # probably want model10.wv.similar_by_word(...)[:5]
answers['Q3']

[('8497638', 0.9409928917884827),
 ('25032624', 0.9308857321739197),
 ('21519210', 0.8949795365333557),
 ('22752448', 0.8882659673690796),
 ('5497136', 0.8640972375869751)]

In [24]:
assert len(answers['Q3']) == 5
assert [type(x[0]) for x in answers['Q3']] == [str]*5
assertFloatList([x[1] for x in answers['Q3']], 5)

In [25]:
### Question 4

In [26]:
ratingMean = sum([d['rating'] for d in dataset]) / len(dataset)

bookAverages = defaultdict(list)
reviewsPerUser = defaultdict(list)
    
for d in dataset:
    b = d['book_id']
    u = d['user_id']
    bookAverages[b].append(d['rating'])
    reviewsPerUser[u].append(d)
    
for b in bookAverages:
    bookAverages[b] = sum(bookAverages[b]) / len(bookAverages[b])

def predictRating(user,item):
    ratings = []
    similarities = []
    if not str(item) in model10.wv:
        return ratingMean
    for d in reviewsPerUser[user]:
        i2 = d['book_id']
        if i2 == item: continue
        ratings.append(d['rating'] - bookAverages[i2])
        if str(i2) in model10.wv:
            similarities.append(model10.wv.distance(str(item), str(i2)))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return bookAverages[item] + sum(weightedRatings) / sum(similarities)
    else:
        return ratingMean

In [27]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [28]:
predictions = [predictRating(d['user_id'],d['book_id']) for d in dataset]
labels = [d['rating'] for d in dataset]

In [29]:
mse4 = MSE(predictions, labels)

In [30]:
answers['Q4'] = mse4
answers['Q4']

0.43403542771743414

In [31]:
assertFloat(answers['Q4'])

In [32]:
### Q5

In [86]:
reviewsPerItem = defaultdict(list)
for d in dataset:
    reviewsPerItem[d['book_id']].append((dateutil.parser.parse(d['date_added']), d['user_id']))

In [87]:
reviewLists = []
for i in reviewsPerItem:
    rl = list(reviewsPerItem[i])
    rl.sort()
    reviewLists.append([x[1] for x in rl])

In [88]:
model = Word2Vec(reviewLists,
                 min_count=1, # Words/items with fewer instances are discarded
                 vector_size=10, # Model dimensionality
                 window=3, # Window size
                 sg=1) # Skip-gram model

In [89]:
reviewLists[0]

['dc3763cdb9b2cae805882878eebb6a32', 'eaa54d876d841293059657fb80a9bba6']

In [90]:
model.wv.similar_by_word(reviewLists[0][0])

[('efeb828154cb6024ae06501b582915fb', 0.9248121380805969),
 ('6bc8beca6151bf938e83d506e17c30e6', 0.8022053241729736),
 ('83cb3d94e518148de88b5873fc26d89c', 0.800977349281311),
 ('aa233705616224ecdeb8222f31403d02', 0.7624194025993347),
 ('b5e5c3932577b12cba9f3e84d029ddc3', 0.7350650429725647),
 ('74b6e3f6ec6370dd790b695198b0ea1b', 0.7223799228668213),
 ('1aaa886facd8dcd72be8f224bdd57ace', 0.7197946310043335),
 ('b9696fd94e05afe6df130405f2c4e056', 0.7197611331939697),
 ('db1737a0303404b846a7142fcd3bb05d', 0.7026280760765076),
 ('fa0a93ee673955268086e6b61a4b03ee', 0.7003586888313293)]

In [91]:
ratingMean = sum([d['rating'] for d in dataset]) / len(dataset)

userAverages = defaultdict(list)
reviewsPerItem = defaultdict(list)
    
for d in dataset:
    b = d['book_id']
    u = d['user_id']
    userAverages[u].append(d['rating'])
    reviewsPerItem[b].append(d)
    
for u in userAverages:
    userAverages[u] = sum(userAverages[u]) / len(userAverages[u])

def predictRating(user,item):
    ratings = []
    similarities = []
    if not str(user) in model.wv:
        return ratingMean
    for d in reviewsPerItem[item]:
        u2 = d['user_id']
        if u2 == user: continue
        ratings.append(d['rating'] - userAverages[u2])
        if str(u2) in model.wv:
            similarities.append(model.wv.distance(str(user), str(u2)))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return userAverages[user] + sum(weightedRatings) / sum(similarities)
    else:
        return ratingMean

In [92]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [93]:
predictions = [predictRating(d['user_id'],d['book_id']) for d in dataset]
labels = [d['rating'] for d in dataset]

In [94]:
mse5 = MSE(predictions, labels)
mse5

1.2322892157283016

****

In [61]:
reviewsPerUser = defaultdict(list)

In [62]:
for d in dataset:
    reviewsPerUser[d['user_id']].append((dateutil.parser.parse(d['date_added']), d['book_id']))

In [63]:
reviewLists = []
for u in reviewsPerUser:
    rl = list(reviewsPerUser[u])
    rl.sort()
    reviewLists.append([x[1] for x in rl])

model10 = Word2Vec(reviewLists,
                 min_count=1, # Words/items with fewer instances are discarded
                 vector_size=10, # Model dimensionality
                 window=3, # Window size
                 sg=5) # Skip-gram model

In [64]:
reviewLists[0]

['18471619']

In [65]:
similarities = model10.wv.similar_by_word(reviewLists[0][0])[:5]
similarities

[('8497638', 0.9406467080116272),
 ('25032624', 0.9298714399337769),
 ('21519210', 0.8964695334434509),
 ('22752448', 0.8873355388641357),
 ('5497136', 0.8650296926498413)]

In [66]:
ratingMean = sum([d['rating'] for d in dataset]) / len(dataset)

bookAverages = defaultdict(list)
reviewsPerUser = defaultdict(list)
    
for d in dataset:
    b = d['book_id']
    u = d['user_id']
    bookAverages[b].append(d['rating'])
    reviewsPerUser[u].append(d)
    
for b in bookAverages:
    bookAverages[b] = sum(bookAverages[b]) / len(bookAverages[b])

def predictRating(user,item):
    ratings = []
    similarities = []
    if not str(item) in model10.wv:
        return ratingMean
    for d in reviewsPerUser[user]:
        i2 = d['book_id']
        if i2 == item: continue
        ratings.append(d['rating'] - bookAverages[i2])
        if str(i2) in model10.wv:
            similarities.append(model10.wv.distance(str(item), str(i2)))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return bookAverages[item] + sum(weightedRatings) / sum(similarities)
    else:
        return ratingMean

In [67]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [68]:
predictions = [predictRating(d['user_id'],d['book_id']) for d in dataset]
labels = [d['rating'] for d in dataset]

In [69]:
mse5 = MSE(predictions, labels)
mse5

0.43398476240642303

In [70]:
answers['Q5'] = ["I change the parameters sg(Skip-gram) of model from 1 to 5. The skip-gram model builds a model by predicting surrounding words given the current word.",
                 mse5]

In [72]:
assert len(answers['Q5']) == 2
assert type(answers['Q5'][0]) == str
assertFloat(answers['Q5'][1])

In [73]:
f = open("answers_hw4.txt", 'w')
f.write(str(answers) + '\n')
f.close()