In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy
import string
import random
import string
from sklearn import linear_model

In [2]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides any warnings raised by the
# libraries used below (e.g. during model fitting) -- consider narrowing it.
warnings.filterwarnings("ignore")

In [3]:
def readGz(path):
    """Lazily yield one parsed record per line of a gzip-compressed text file.

    Each line is expected to hold a single Python literal (for this notebook,
    a dict describing one review).

    Parameters
    ----------
    path : str
        Location of the ``.gz`` file to read.

    Yields
    ------
    object
        The value obtained by evaluating each line.
    """
    # The context manager guarantees the file handle is released once the
    # generator is exhausted, closed, or garbage-collected.
    with gzip.open(path, 'rt') as f:
        for l in f:
            # SECURITY: eval() executes arbitrary code contained in the file.
            # Only use this on trusted data; ast.literal_eval is the safe
            # alternative if every line is a plain Python literal.
            yield eval(l)

In [4]:
def readCSV(path):
    """Lazily yield ``(user, book, rating)`` rows from a gzip-compressed CSV.

    The first line is treated as a header and skipped. Every remaining
    non-blank line must contain exactly three comma-separated fields, the
    last of which is converted to ``int``.

    Parameters
    ----------
    path : str
        Location of the ``.csv.gz`` file to read.

    Yields
    ------
    tuple of (str, str, int)
        User ID, book ID and integer rating for each row.
    """
    # The context manager closes the handle even if the consumer stops early
    # or a malformed row raises.
    with gzip.open(path, 'rt') as f:
        f.readline()  # discard the header row
        for l in f:
            l = l.strip()
            if not l:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            u,b,r = l.split(',')
            yield u,b,int(r)

<h3 style="font-size: 30px; font-weight: bold;"> Read Prediction </h3>
<p style="font-size: 15px;"> The following code uses Jaccard similarity and tuned thresholds to predict, for each (user, book) pair in pairs_Read.csv, whether the user read that book (1) or not (0). The fraction of correct predictions on a held-out validation set is used to measure the accuracy of the model.</p>

In [5]:
# Materialise every (user, book, rating) row of the interactions file.
allRatings = list(readCSV("train_Interactions.csv.gz"))

In [6]:
# The first 190,000 interactions are used for training; the rest are held out.
ratingsTrain, ratingsValid = allRatings[:190000], allRatings[190000:]

# Index the training interactions in both directions:
#   user -> [(book, rating), ...]   and   book -> [(user, rating), ...]
ratingsPerUser = defaultdict(list)
ratingsPerBook = defaultdict(list)
for userID, bookID, stars in ratingsTrain:
    ratingsPerBook[bookID].append((userID, stars))
    ratingsPerUser[userID].append((bookID, stars))

In [7]:
# Create a new validation set that also contains negative samples
# (one randomly chosen unread book per positive interaction).

# Unique book IDs in first-seen order. dict.fromkeys de-duplicates in a single
# linear pass instead of scanning a growing list for every interaction.
allBooks = list(dict.fromkeys(book for user, book, rating in allRatings))

# Every book each user has interacted with anywhere in the data. Using the
# full data set (not just the training split) prevents a book the user read
# in the validation split from being sampled as a "not read" example.
booksReadByUser = defaultdict(set)
for user, book, rating in allRatings:
    booksReadByUser[user].add(book)

# Dedicated, seeded generator so the sampled negatives are reproducible on a
# fresh kernel without disturbing the global random state.
negSampler = random.Random(0)

negRatings = []
for user, book, rating in ratingsValid:
    userRatings = booksReadByUser[user]
    # Rejection-sample until we hit a book this user has never read.
    randomBook = negSampler.choice(allBooks)
    while randomBook in userRatings:
        randomBook = negSampler.choice(allBooks)
    negRatings.append((user, randomBook, 0))

# Positives (held-out interactions) followed by the sampled negatives.
newValid = ratingsValid + negRatings

In [8]:
def Jaccard(s1, s2):
    """Return the Jaccard similarity |s1 & s2| / |s1 | s2| of two collections.

    Parameters
    ----------
    s1, s2 : iterable of hashable
        The two collections to compare. Lists are accepted; duplicates are
        ignored because both inputs are converted to sets.

    Returns
    -------
    float or int
        A value in [0, 1]; 0 when both inputs are empty.
    """
    # Python's `and` / `or` merely return one of their operands, so they
    # cannot be used for intersection / union -- use real set operations.
    a, b = set(s1), set(s2)
    denom = len(a | b)
    if denom > 0:
        return len(a & b) / denom
    return 0

In [9]:
# Books predicted as "read". Membership is decided per book ID, not per
# (user, book) pair.
return2 = set()

for u, b, r in ratingsValid:
    # Users who read the candidate book in the training data.
    users = [reader for reader, _stars in ratingsPerBook[b]]
    # Similarity between the candidate book and every book this user read in
    # training, measured on the sets of users who read each of them.
    similarities = [
        Jaccard([reader for reader, _stars in ratingsPerBook[other]], users)
        for other, _stars in ratingsPerUser[u]
    ]
    if not similarities:
        # The user has no training history, so nothing to compare against.
        continue
    # NOTE(review): a Jaccard index cannot exceed 1, so `> 5` is only
    # reachable if Jaccard() returns something other than a true set-overlap
    # ratio -- re-tune this threshold. The second clause is a popularity cut.
    if max(similarities) > 5 or len(ratingsPerBook[b]) > 35:
        return2.add(b)

In [10]:
# Validation accuracy = fraction of pairs whose predicted label is correct.
# The true label comes from which list a pair belongs to: held-out
# interactions (ratingsValid) are positives, sampled pairs (negRatings) are
# negatives. A prediction is "read" exactly when the book is in return2.
c = 0
for user,book,rating in ratingsValid:
    if book in return2:
        c += 1  # true positive
for user,book,rating in negRatings:
    if book not in return2:
        c += 1  # true negative

acc = c/len(newValid)

In [11]:
# Write the read predictions: one "user,book,label" row per pair listed in
# pairs_Read.csv, with the header row copied through unchanged.
# Both files are managed by `with` so they are closed even if a row is malformed.
with open("pairs_Read.csv") as pairs, open("predictions_Read.csv", 'w') as predictions:
    for l in pairs:
        if l.startswith("userID"):
            predictions.write(l)
            continue
        u,b = l.strip().split(',')
        # Predict 1 ("read") when the book was selected into return2, else 0.
        label = "1" if b in return2 else "0"
        predictions.write(u + ',' + b + "," + label + "\n")

<h3 style="font-size: 30px; font-weight: bold;"> Category Prediction </h3>
<p style="font-size: 15px;"> The following code builds a bag-of-words feature matrix and fits a logistic regression model to predict the genre of books from user reviews.</p>

In [12]:
# Load every training review record into memory.
data = list(readGz("train_Category.json.gz"))

In [13]:
# Display the first record to show which fields each review carries.
data[0]

{'user_id': 'u75242413',
 'review_id': 'r45843137',
 'rating': 4,
 'review_text': "a clever book with a deeply troubling premise and an intriguing protagonist. Thompson's clean, sparse prose style kept each page feeling light even as some rather heavy existential questions dropped upon them. I enjoyed it. \n and that cover design is boom-pow gorgeous.",
 'n_votes': 1,
 'genre': 'mystery_thriller_crime',
 'genreID': 3}

In [14]:
# Corpus-wide word frequencies over lower-cased, punctuation-stripped reviews.
punctuation = set(string.punctuation)
wordCount = defaultdict(int)
for review in data:
    text = review['review_text'].lower()
    cleaned = ''.join(ch for ch in text if ch not in punctuation)
    for token in cleaned.split():
        wordCount[token] += 1

# (count, word) pairs ordered from most to least frequent; equal counts are
# ordered by word, descending.
counts = sorted(((n, word) for word, n in wordCount.items()), reverse=True)

In [15]:
# Vocabulary: the 4000 most frequent words, most frequent first.
commonWords = [word for _count, word in counts[:4000]]

In [16]:
# Fast membership set and word -> column-index lookup for the vocabulary.
wordSet = set(commonWords)
wordId = {word: idx for idx, word in enumerate(commonWords)}

In [17]:
def feature(d):
    """Build the bag-of-words feature vector for one review record.

    Parameters
    ----------
    d : dict
        A record with a ``'review_text'`` string.

    Returns
    -------
    list of int
        ``len(commonWords)`` word counts (indexed via ``wordId``) followed by
        a constant 1 as the offset term.
    """
    feat = [0]*len(commonWords)
    r = ''.join([c for c in d['review_text'].lower() if not c in punctuation])
    for w in r.split():
        # A dict lookup is O(1); testing `w in commonWords` would scan the
        # whole vocabulary list for every token of every review.
        idx = wordId.get(w)
        if idx is not None:
            feat[idx] += 1
    feat.append(1)  # constant (offset) feature
    return feat

In [18]:
# Feature matrix and genre labels for every training record.
X = list(map(feature, data))
y = [record['genreID'] for record in data]

In [19]:
# 90% / 10% split in file order; X and y have one entry per record, so a
# single cut point serves both.
nTrain = 9*len(X)//10
Xtrain, Xvalid = X[:nTrain], X[nTrain:]
ytrain, yvalid = y[:nTrain], y[nTrain:]

In [20]:
def accuracy(predictions, y):
    """Return the fraction of predictions that equal their true labels.

    Labels are compared by value, so this is correct for multi-class labels
    (such as genre IDs) as well as for binary 0/1 or boolean labels, where it
    equals (TP + TN) / (TP + FP + TN + FN).

    Parameters
    ----------
    predictions : array-like
        Predicted labels.
    y : array-like
        True labels, same shape as ``predictions``.

    Returns
    -------
    float
        Accuracy in [0, 1]; 0.0 when there are no predictions.

    Raises
    ------
    ValueError
        If ``predictions`` and ``y`` have different shapes.
    """
    predictions = numpy.asarray(predictions)
    y = numpy.asarray(y)
    if predictions.shape != y.shape:
        raise ValueError("predictions and y must have the same shape")
    if predictions.size == 0:
        return 0.0
    # Truthiness-based logical_and/logical_not would treat any two non-zero
    # class IDs as a match; element-wise equality does not.
    return float(numpy.mean(predictions == y))

In [21]:
# Logistic regression classifier over the bag-of-words features.
# C is the inverse regularisation strength.
mod = linear_model.LogisticRegression(C=0.01)
mod.fit(X=Xtrain, y=ytrain)

In [22]:
# Load every test review record into memory.
dataTest = list(readGz("test_Category.json.gz"))

In [23]:
# Featurise the test reviews and predict one genre ID per review.
Xtest = list(map(feature, dataTest))
preds = mod.predict(X=Xtest)

In [24]:
# Write the category predictions: one "user,review,genreID" row per pair
# listed in pairs_Category.csv, with the header row copied through unchanged.
# `preds` is consumed positionally, so the i-th data row receives preds[i].
# NOTE(review): this assumes pairs_Category.csv lists the reviews in the same
# order as test_Category.json.gz -- confirm.
# Both files are managed by `with` so they are closed even on error.
pos = 0

with open("pairs_Category.csv") as pairs, open("predictions_Category.csv", 'w') as predictions:
    for l in pairs:
        if l.startswith("userID"):
            predictions.write(l)
            continue
        u,b = l.strip().split(',')
        predictions.write(u + ',' + b + "," + str(preds[pos]) + "\n")
        pos += 1

In [25]:
# Write the README describing both approaches. The context manager ensures
# the file is flushed and closed even if a write fails part-way through.
with open("README.md", 'w') as f:
    f.write("Read Prediction: To predict whether or not a book was read, I iterated through the validation set to collect the following information. For the book in the validation set, I found the users that had read that book in the training set and for the user, I found the books that user read in the training set. Now, for each of these books from the training set that the user had read, I determined the users that had read them. Then I found the Jaccard similarities between the users in the training set that had read the book in the validation set and the users that read each book read by the user in the validation set. If the maximum similarity was greater than 5 or the number of times the book had been read was greater than 35 (I tried many thresholds, and this was the most optimal), I added the book to a set called return2. To make the predictions, I checked if the book was in the set return2. If the book was in the return2 set, I predicted 1 (meaning that the book is predicted to have been read) and if the book was not in the set, I predicted 0 (meaning that the book is predicted to be unread).")
    f.write("\n")
    f.write("\n")
    f.write("Category Prediction: To predict the category of books, I created a feature function that creates a feature vector for a data point in which every time of the 4000 most common words is seen in the review's text, 1 is added to the location in the feature vector that corresponds to that common word. Also, a 1 is added to the end of the feature vector for the constant term. I created a matrix of all the feature vectors for all data points in the dataset and did a linear regression model on the matrix and the genreIDs for each data point. I was able to then create a feature matrix for the test set and use the model to predict the genres of the test set. For the model, I tried many different values for C and found that 0.01 had the best accuracy.")