In [306]:
import gzip
import math
import numpy
import random
import sklearn
import string
import numpy as np
from collections import defaultdict
from nltk.stem.porter import *
from sklearn import linear_model
from gensim.models import Word2Vec
import dateutil
from scipy.sparse import lil_matrix # To build sparse feature matrices, if you like
from sklearn.metrics import accuracy_score

In [3]:
answers = {}

In [5]:
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

### Question 1
Using the Steam category data, build training/test sets consisting of 10,000 reviews each. Code to do so is provided in the stub. We'll start by building features to represent the common words. Start by removing punctuation and capitalization, and finding the 1,000 most common words across all reviews ('text' field) in the training set. See the 'text mining' lectures for code for this process. Report the 10 most common words, along with their frequencies, as a list of (frequncy, word) tuples.

In [8]:
dataset = []

f = gzip.open("steam_category.json.gz")
for l in f:
    d = eval(l)
    dataset.append(d)
    if len(dataset) >= 20000:
        break
f.close()

In [9]:
#Split data in training and test set
Ntrain = 10000
Ntest = 10000

dataTrain = dataset[:Ntrain]
dataTest = dataset[Ntrain:Ntrain + Ntest]

In [10]:
#initialize dict of word:appearance_count
wordCount = defaultdict(int)

#set of punctuation to reference
sp = set(string.punctuation)

for d in dataTrain: #for review in dataset
    r = ''.join([c for c in d['text'].lower() if not c in sp]) #forms a list of all words (remove capitalization) that are not punctuation
    ws = r.split() #splits that list into words
    for w in ws: #traverses through words & adds a count to dict. for each appearance
        wordCount[w]+=1

In [11]:
#create list of dict. entries as can't .sort() dict. type -> sort from most to least common
counts = [(wordCount[w], w) for w in wordCount]
counts.sort(reverse = True)

#grab list of words where x is in top 100 spots of sorted list, this will be a b.o.w feature vector for models
common_words = [x[1] for x in counts[:1000]]

In [14]:
answers['Q1'] = counts[:10]

In [18]:
answers['Q1']

[(34211, 'the'),
 (19392, 'and'),
 (18791, 'a'),
 (18077, 'to'),
 (15043, 'game'),
 (14095, 'of'),
 (13000, 'is'),
 (12735, 'you'),
 (12204, 'i'),
 (11824, 'it')]

In [20]:
assertFloatList([x[0] for x in answers['Q1']], 10)

### Question 2
Build bag-of-words feature vectors by counting the instances of these 1,000 words in each review. Set the labels (y) to be the 'genreID' column for the training instances. You may use these labels directly with sklearn's LogisticRegression model, which will automatically perform multiclass classification. Report performance (accuracy) on your test set.

In [114]:
wordId = dict(zip(common_words, range(len(common_words))))
wordSet = set(common_words)

In [116]:
#building X
def feature(datum): #function that creates b.o.w feature
    feat = [0]*len(common_words) #initialize feature vector of length 1000, all currently 0 count
    review = ''.join([c for c in datum['text'].lower() if not c in sp])
    for w in review.split():
        if w in wordSet:
            feat[wordId[w]] += 1
    feat.append(1) #offset during loop
    return feat

X = [feature(d) for d in dataset] #feature vectors for entire dataset

In [26]:
y = [d['genre'] for d in dataset] #response variable for entire dataset

In [118]:
#subset for train/test split
Xtrain = X[:Ntrain]
ytrain = y[:Ntrain]
Xtest = X[Ntrain:]
ytest = y[Ntrain:]

In [28]:
mod = linear_model.LogisticRegression(C=1)
mod.fit(Xtrain, ytrain)
y_pred = mod.predict(Xtest);

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
correct = (y_pred == ytest)
#accuracy = .6368

In [30]:
answers['Q2'] = sum(correct) / len(correct)
answers['Q2']

0.6368

In [31]:
assertFloat(answers['Q2'])

### Question 3
What is the inverse document frequency of the words 'character', 'game', 'length', 'a', and 'it'? What are their td-idf scores in the first (training) review (using log base 10, unigrams only, following the first definition of tf-idf given in the slides)? All frequencies etc. should be calculated using the training data only. Your answer should be a list of five (idf, tfidf) pairs.

In [39]:
#document frequency = # of documents that contain the term t
df = defaultdict(int)
for d in dataTrain:
    r = ''.join([c for c in d['text'].lower() if not c in sp])
    for w in set(r.split()):
        df[w] += 1 

In [41]:
#idf - take the log base 10 of (# of documents / # of documents that contain that term)
idf = {}
for w, freq in df.items():
    idf[w] = math.log10(len(dataTrain) / freq)

In [192]:
#first (training) data review
first_review = dataTrain[0]

tf = defaultdict(int) #initialize term freq. dictionary - how often term t appears in that document
r = ''.join([c for c in first_review['text'].lower() if not c in sp])
for w in r.split():
    tf[w] += 1 

In [45]:
#words to calculate TF-IDF for
words_to_calculate = ['character', 'game', 'length', 'a', 'it']
tf_idf = {}
idf_tfidf_pairs = []

for word in words_to_calculate:
    tf_value = tf.get(word)  #get the term frequency of the word, default to 0 if not present
    idf_value = idf.get(word)  #get the IDF value of the word, default to 0 if not present
    tf_idf[word] = tf_value * idf_value  #compute TF-IDF
    tfidf_value = tf_value * idf_value 
    idf_tfidf_pairs.append((idf_value, tfidf_value))

In [47]:
answers['Q3'] = idf_tfidf_pairs

In [49]:
answers['Q3']

[(1.453457336521869, 1.453457336521869),
 (0.22951619056889208, 0.45903238113778416),
 (2.2441251443275085, 4.488250288655017),
 (0.3047810810948491, 2.4382486487587927),
 (0.376647318462008, 1.129941955386024)]

In [51]:
assertFloatList([x[0] for x in answers['Q3']], 5)
assertFloatList([x[1] for x in answers['Q3']], 5)

### Question 4
Adapt your unigram model to use the tdidf scores of words, rather than a bag-of-words representation. That is, rather than your features containing the word counts for the 1000 most common unigrams. Report the accuracy of this new model.

In [268]:
# Get the 1000 most common words from document frequency
top_unigrams = sorted(df.items(), key=lambda x: x[1], reverse=True)[:1000]
vocab = [word for word, freq in top_unigrams]  # The 1000 most common words
X = []
for d in dataset:
    #compute term freq for that document
    tf = defaultdict(int)
    r = ''.join([c for c in d['text'].lower() if c not in sp])
    for w in r.split():
        tf[w] += 1

    #compute tf-idf for the document (use idf that we already calculated in Q3)
    tfidf = {word: tf[word] * idf.get(word, 0) for word in tf}
    
    #create a vector using the top 1000 unigrams
    vector = [tfidf.get(word, 0) for word in vocab]
    X.append(vector)

In [140]:
y = [d['genre'] for d in dataset]

In [142]:
Xtrain = X[:Ntrain]
ytrain = y[:Ntrain]
Xtest = X[Ntrain:]
ytest = y[Ntrain:]

In [150]:
mod = linear_model.LogisticRegression(C=1)
mod.fit(Xtrain, ytrain)
y_pred = mod.predict(Xtest)
correct = (y_pred == ytest)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [158]:
answers['Q4'] = sum(correct) / len(correct)
answers['Q4']

0.6082

In [154]:
assertFloat(answers['Q4'])

### Question 5
Which review in the test set has the highest cosine similarity compared to the first review in the training set, in terms of their tf-idf representation (considering unigrams only). Provide the cosine similarity score and the reviewID.

In [212]:
def Cosine(x1,x2):
    numer = 0
    norm1 = 0
    norm2 = 0 
    for a1, a2 in zip(x1, x2):
        numer += a1 * a2
        norm1 += a1**2
        norm2 += a2**2
    if norm1*norm2:
        return numer / math.sqrt(norm1 * norm2)
    return 0

tfidf_train_first = Xtrain[0]

In [274]:
#recreating test dataset with reviewID for reference
Xtest = []  # This will contain tuples of (reviewID, tfidf_vector)

for d in dataTest:
    reviewID = d['reviewID']  #xxtract the reviewID
    tf = defaultdict(int)
    r = ''.join([c for c in d['text'].lower() if c not in sp])
    for w in r.split():
        tf[w] += 1
    
    tfidf = {word: tf[word] * idf.get(word, 0) for word in tf}

    vector = [tfidf.get(word, 0) for word in vocab]

    Xtest.append((reviewID, vector))

In [288]:
similarities = []
for review in Xtest:
    similarity = Cosine(review[1], tfidf_train_first)
    similarities.append((similarity, reviewID))
    

In [222]:
for reviewID, tfidf_test in enumerate(Xtest):  #assuming Xtest contains test set tfidf vectors
    similarity = Cosine(tfidf_train_first, tfidf_test)  #compute cosine similarity
    similarities.append((reviewID, int(similarity)))  #store reviewID and similarity score

In [300]:
similarities.sort(key = lambda x: x[0], reverse=True)

In [292]:
answers['Q5'] = similarities[0]
answers['Q5']

(0.5016951620600787, 'r84358104')

In [294]:
assertFloat(answers['Q5'][0])

### Question 6
Try to improve upon the performance of the above classifiers from questions 2 and 4 by using different dictionary sizes, or changing the regularization constant C passed to the logistic regression model. Report the performance of your solution.

Use the first half (10,000) of the corpus for training and the rest for testing (code to read the data is provided in the stub). Process review without capitalization or punctuation (and without using stemming or stopwords).

In [318]:
#Model in question 2
wordId = dict(zip(common_words, range(len(common_words))))
wordSet = set(common_words)
#building X
def feature(datum): #function that creates b.o.w feature
    feat = [0]*len(common_words) #initialize feature vector of length 1000, all currently 0 count
    review = ''.join([c for c in datum['text'].lower() if not c in sp])
    for w in review.split():
        if w in wordSet:
            feat[wordId[w]] += 1
    feat.append(1) #offset during loop
    return feat

X = [feature(d) for d in dataset] #feature vectors for entire dataset
y = [d['genre'] for d in dataset] #response variable for entire dataset

#subset for train/test split
Xtrain = X[:Ntrain]
ytrain = y[:Ntrain]
Xtest = X[Ntrain:]
ytest = y[Ntrain:]

C_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
results=[]
for C in C_values:
    mod = linear_model.LogisticRegression(C=C, max_iter=5000) 
    mod.fit(Xtrain, ytrain)
    

    y_pred = mod.predict(Xtest)

    accuracy = accuracy_score(ytest, y_pred)

    results.append((C, accuracy))
    print(f"C: {C}, Accuracy: {accuracy}")
#accuracy = .6368
#C: 0.2, Accuracy: 0.6446

C: 0.1, Accuracy: 0.6439
C: 0.2, Accuracy: 0.6446
C: 0.3, Accuracy: 0.644
C: 0.4, Accuracy: 0.6409
C: 0.5, Accuracy: 0.6395
C: 0.6, Accuracy: 0.6381
C: 0.7, Accuracy: 0.6366
C: 0.8, Accuracy: 0.6348


AttributeError: 'list' object has no attribute 'max'

In [332]:
#Model in question 4
top_unigrams = sorted(df.items(), key=lambda x: x[1], reverse=True)[:1000]
vocab = [word for word, freq in top_unigrams]  # The 1000 most common words
X = []
for d in dataset:
    #compute term freq for that document
    tf = defaultdict(int)
    r = ''.join([c for c in d['text'].lower() if c not in sp])
    for w in r.split():
        tf[w] += 1

    #compute tf-idf for the document (use idf that we already calculated in Q3)
    tfidf = {word: tf[word] * idf.get(word, 0) for word in tf}
    
    #create a vector using the top 1000 unigrams
    vector = [tfidf.get(word, 0) for word in vocab]
    X.append(vector)
y = [d['genre'] for d in dataset]

Xtrain = X[:Ntrain]
ytrain = y[:Ntrain]
Xtest = X[Ntrain:]
ytest = y[Ntrain:]

C_values = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08]
results2 = []

for C in C_values:
  
    mod = linear_model.LogisticRegression(C=C, max_iter=5000) 
    mod.fit(Xtrain, ytrain)
    

    y_pred = mod.predict(Xtest)
    accuracy = accuracy_score(ytest, y_pred)

    results2.append((C, accuracy))
    print(f"C: {C}, Accuracy: {accuracy}")

C: 0.01, Accuracy: 0.6337
C: 0.02, Accuracy: 0.6383
C: 0.03, Accuracy: 0.6386
C: 0.04, Accuracy: 0.6377
C: 0.05, Accuracy: 0.6387
C: 0.06, Accuracy: 0.6372
C: 0.07, Accuracy: 0.6358
C: 0.08, Accuracy: 0.6343


In [360]:
modelq2 = results[1]
modelq4 = results2[4]
#used model with better performance below

In [364]:
answers['Q6'] = (modelq2[1])
answers['Q6']

0.6446

In [366]:
assertFloat(answers['Q6'])

### Question 7
This task should be completed using the entire dataset of 20,000 reviews from Goodreads:

Using the word2vec library in gensim, fit an item2vec model, treating each sentence as a temporally-ordered list of items per user. Use parameters min_count = 1, size = 5, window = 3, sg = 1. Report the 5 most similar items to the book from the first review along with their similarity scores (your answer can be the output of the similar_by_word function).

In [370]:
import dateutil.parser

In [536]:
dataset = []
reviews = []
reviewsPerUser = defaultdict(list)

f = gzip.open("young_adult_20000.json.gz")
for l in f:
    d = eval(l)
    d['datetime'] = dateutil.parser.parse(d['date_added'])
    reviewsPerUser[d['user_id']].append((d['datetime'], d['book_id']))
    reviews.append(d['review_text'])
    dataset.append(d)
    if len(dataset) >= 20000:
        break
        
f.close()

In [556]:
dataset[0]

{'user_id': 'dc3763cdb9b2cae805882878eebb6a32',
 'book_id': '18471619',
 'review_id': '66b2ba840f9bd36d6d27f46136fe4772',
 'rating': 3,
 'review_text': 'Sherlock Holmes and the Vampires of London \n Release Date: April 2014 \n Publisher: Darkhorse Comics \n Story by: Sylvain Cordurie \n Art by: Laci \n Colors by: Axel Gonzabo \n Cover by: Jean Sebastien Rossbach \n ISDN: 9781616552664 \n MSRP: $17.99 Hardcover \n "Sherlock Holmes died fighting Professor Moriarty in the Reichenbach Falls. \n At least, that\'s what the press claims. \n However, Holmes is alive and well and taking advantage of his presumed death to travel the globe. \n Unfortunately, Holmes\'s plans are thwarted when a plague of vampirism haunts Britain. \n This book collects Sherlock Holmes and the Vampires of London Volumes 1 and 2, originally created by French publisher Soleil." - Darkhorse Comics \n When I received this copy of "Sherlock Holmes and the Vampires of London" I was Ecstatic! The cover art was awesome and 

In [538]:
reviewTokens = []

In [540]:
for r in reviews:
    r = ''.join([c for c in r.lower() if not c in sp])
    tokens = []
    for w in r.split():
        tokens.append(w)
    reviewTokens.append(tokens)

In [541]:
reviewTokens[0]

['sherlock',
 'holmes',
 'and',
 'the',
 'vampires',
 'of',
 'london',
 'release',
 'date',
 'april',
 '2014',
 'publisher',
 'darkhorse',
 'comics',
 'story',
 'by',
 'sylvain',
 'cordurie',
 'art',
 'by',
 'laci',
 'colors',
 'by',
 'axel',
 'gonzabo',
 'cover',
 'by',
 'jean',
 'sebastien',
 'rossbach',
 'isdn',
 '9781616552664',
 'msrp',
 '1799',
 'hardcover',
 'sherlock',
 'holmes',
 'died',
 'fighting',
 'professor',
 'moriarty',
 'in',
 'the',
 'reichenbach',
 'falls',
 'at',
 'least',
 'thats',
 'what',
 'the',
 'press',
 'claims',
 'however',
 'holmes',
 'is',
 'alive',
 'and',
 'well',
 'and',
 'taking',
 'advantage',
 'of',
 'his',
 'presumed',
 'death',
 'to',
 'travel',
 'the',
 'globe',
 'unfortunately',
 'holmess',
 'plans',
 'are',
 'thwarted',
 'when',
 'a',
 'plague',
 'of',
 'vampirism',
 'haunts',
 'britain',
 'this',
 'book',
 'collects',
 'sherlock',
 'holmes',
 'and',
 'the',
 'vampires',
 'of',
 'london',
 'volumes',
 '1',
 'and',
 '2',
 'originally',
 'created'

In [544]:
model5 = Word2Vec(reviewTokens,
                  min_count=1, # Words/items with fewer instances are discarded
                  vector_size=5, # Model dimensionality
                  window=3, # Window size
                  sg=1) # Skip-gram model

In [545]:
model5.wv.similar_by_word('sherlock')

[('arrows', 0.9992873072624207),
 ('spiderwoman', 0.9991419911384583),
 ('priest', 0.9988325238227844),
 ('affair', 0.9988138675689697),
 ('sketch', 0.9987421631813049),
 ('benefits', 0.998714029788971),
 ('potter', 0.9986669421195984),
 ('principals', 0.998612642288208),
 ('inferno', 0.9984824061393738),
 ('trio', 0.9984773993492126)]

In [550]:
#item2vec implementation
reviewLists = []
for u in reviewsPerUser:
    rl = list(reviewsPerUser[u])
    rl.sort()
    reviewLists.append([x[1] for x in rl])

In [554]:
model6 = Word2Vec(reviewLists,
                  min_count=1, # Words/items with fewer instances are discarded
                  vector_size=5, # Model dimensionality
                  window=3, # Window size
                  sg=1) # Skip-gram model

In [562]:
res = model6.wv.similar_by_word('18471619')

In [564]:
answers['Q7'] = res[:5]
answers['Q7']

[('841237', 0.9972667694091797),
 ('34658929', 0.9909661412239075),
 ('10555316', 0.9896025061607361),
 ('13449407', 0.9894213080406189),
 ('16002011', 0.9874613285064697)]

In [566]:
assertFloatList([x[1] for x in answers['Q7']], 5)

In [568]:
f = open("answers_hw4.txt", 'w')
f.write(str(answers) + '\n')
f.close()