In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy as np
import string
import random
from sklearn import linear_model
import tensorflow as tf

import warnings
warnings.filterwarnings("ignore")

def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

def readCSV(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        u,b,r = l.strip().split(',')
        r = int(r)
        yield u,b,r

# Some data structures that will be useful

allRatings = []
for l in readCSV("train_Interactions.csv.gz"):
    allRatings.append(l)

random.shuffle(allRatings)
ratingsTrain = allRatings[:190000]
ratingsValid = allRatings[190000:]
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
for u,b,r in ratingsTrain:
    ratingsPerUser[u].append((b,r))
    ratingsPerItem[b].append((u,r))

2023-03-30 05:00:07.091068: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-30 05:00:07.660401: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-30 05:00:07.660441: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-03-30 05:00:07.720335: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-03-30 05:00:09.913767: W tensorflow/stream_executor/platform/de

In [2]:
##################################################
# Read prediction                                #
##################################################

In [3]:
# Preparing data set for read task
readValid = []
readTrain = []
readBooksPerUser = defaultdict(set)
allBooks = set()

for user, book, _ in allRatings:
    allBooks.add(book)
    readBooksPerUser[user].add(book)

# Construct training set
for user, book, _ in ratingsTrain:
    readTrain.append([user, book, 1])

    # Randomly sample a book not read by user
    randomBook = book
    while(randomBook in readBooksPerUser[user]):
        randomBook = random.sample(allBooks, 1)[0]
        
    readTrain.append([user, randomBook, 0])
    
# Construct validation set
for user, book, _ in ratingsValid:
    readValid.append([user, book, 1])

    # Randomly sampling a book not read by user
    randomBook = book
    while(randomBook in readBooksPerUser[user]):
        randomBook = random.sample(allBooks, 1)[0]
        
    readValid.append([user, randomBook, 0])

In [4]:
# Determine most popular books
bookCount = defaultdict(int)
totalRead = 0

for user,book,_ in readCSV("train_Interactions.csv.gz"):
    bookCount[book] += 1
    totalRead += 1

# Sort based on popularity
mostPopular = [(bookCount[x], x) for x in bookCount]
mostPopular.sort()
mostPopular.reverse()

# Utility data structures
booksPerUser = defaultdict(set)
usersPerBook = defaultdict(set)

for u,b,_ in ratingsTrain:
    booksPerUser[u].add(b)
    usersPerBook[b].add(u)

In [5]:
# Function to compute jaccard similarity
def Jaccard(set1, set2):
    denom = len(set1.union(set2))
    numer = len(set1.intersection(set2))
    
    if(denom == 0):
        return 0
    
    return numer/denom

In [6]:
# Construct set of Popular books
popularBooks = set()
count = 0
popThreshold = 0.75
bookJaccardThreshold = 0.003
userJaccardThreshold = 0.25

for ic, i in mostPopular:
    count += ic
    popularBooks.add(i)
    if (count > popThreshold*totalRead): 
        break
        
# Construct feature vector
def GetFeatureVector(u, b):
    featVec = []
    bookSimList = []
    userSimList = []
    
    for otherBook in booksPerUser[u]:
        if(otherBook == b):
            continue
            
        bookSimList.append(Jaccard(usersPerBook[b], usersPerBook[otherBook]))
        
    for otherUser in usersPerBook[b]:
        if(otherUser == u):
            continue
            
        userSimList.append(Jaccard(booksPerUser[u], booksPerUser[otherUser]))
    
#     if(len(bookSimList)==0):
#         featVec.extend([0, 0])
#     else:
#         featVec.extend([sum(bookSimList)/len(bookSimList), max(bookSimList)])
        
    featVec.append(sum((np.array(bookSimList) > bookJaccardThreshold).astype(int)))
        
#     if(len(userSimList)==0):
#         featVec.extend([0, 0])
#     else:
#         featVec.extend([sum(userSimList)/len(userSimList), max(userSimList)])

    featVec.append(sum((np.array(userSimList) > userJaccardThreshold).astype(int)))
        
    featVec.append(int(b in popularBooks))
        
    return featVec
        
X = []
Y = []

for u,b,l in readTrain:
    X.append(GetFeatureVector(u, b))
    Y.append(l)
    
# for u, b, l in readValid:
#     X.append(GetFeatureVector(u,b))
#     Y.append(l)
    
X = np.array(X)
Y = np.array(Y)
# print(np.mean(X[:,1]))
# print(np.mean(X[:,2]))
# meanVec = np.mean(X, axis=0)
# stdVec = np.std(X, axis=0)
# X = (X - meanVec)/stdVec

readModel = linear_model.LogisticRegression(C=1.0, class_weight='balanced', max_iter=5000).fit(X,Y)
   
# Compute accuracy on training set
accTrain = sum((Y - readModel.predict(X) == 0).astype(int))/len(Y)
print(accTrain)
    
# Compute accuracy on validation set
Xvalid = []
Yvalid = []

for u, b, l in readValid:
    Xvalid.append(GetFeatureVector(u,b))
    Yvalid.append(l)

Xvalid = np.array(Xvalid)
Yvalid = np.array(Yvalid)
#Xvalid = (Xvalid - meanVec)/stdVec

acc = sum((Yvalid - readModel.predict(Xvalid) == 0).astype(int))/len(Yvalid)
print(acc)

0.9257526315789474
0.6365


In [7]:
predictions = open("predictions_Read.csv", 'w')
testDict = defaultdict(list)
# testList = []
# Xtest = []

for l in open("pairs_Read.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,b = l.strip().split(',')
    
    testDict[u].append(b)

for usr in testDict:
    bookList = testDict[usr]
    Xtest = [GetFeatureVector(usr,bk) for bk in bookList]
    scoreList = readModel.predict_proba(Xtest)[:,1]
    combList = list(zip(scoreList, bookList))
    combList.sort(reverse=True)
    half1 = combList[:int(len(combList)/2)]
    half2 = combList[int(len(combList)/2):]
    for scr, bk in half1:
        predictions.write(usr+","+bk+","+"1\n")
        
    for scr, bk in half2:
        predictions.write(usr+","+bk+","+"0\n")

predictions.close()