In [2]:
import gzip
import random
from collections import defaultdict

import implicit

## Read Prediction Task
____

In [26]:
# Pull every row yielded by readCSV into memory so the list can be sliced below.
allRatings = list(readCSV("train_Interactions.csv.gz"))

In [28]:
# First 190000 rows are used for training; whatever follows is held out for validation.
ratingsTrain, ratingsValid = allRatings[:190000], allRatings[190000:]

# Index the training rows both ways:
#   ratingsPerUser[user] -> list of (book, rating)
#   ratingsPerItem[book] -> list of (user, rating)
ratingsPerUser, ratingsPerItem = defaultdict(list), defaultdict(list)
for user, book, rating in ratingsTrain:
    ratingsPerUser[user].append((book, rating))
    ratingsPerItem[book].append((user, rating))

In [32]:
# Every distinct book that appears in the training split.
allBooks = {book for _, book, _ in ratingsTrain}
# Fixed ordering so sampling does not depend on set iteration order
# (assumes book ids are mutually comparable, e.g. all strings -- TODO confirm).
sortedBooks = sorted(allBooks)

# Books each user is paired with in the validation split. They are known
# positives, so they must never be drawn as a "negative" for that user.
validBooksPerUser = defaultdict(set)
for u, b, _ in ratingsValid:
    validBooksPerUser[u].add(b)

# Dedicated seeded generator: repeatable across runs and leaves the global
# `random` state untouched for other cells.
NEGATIVE_SAMPLING_SEED = 0
negativeRng = random.Random(NEGATIVE_SAMPLING_SEED)

# For each validation row, pair the user with one book they have no recorded
# interaction with (neither in training nor in validation).
negativeSamples = []
for u, _, _ in ratingsValid:
    # Use .get() instead of ratingsPerUser[u]: indexing a defaultdict inserts an
    # empty entry for users absent from training, which would make every later
    # `u not in ratingsPerUser` check false.
    seenBooks = {book for book, _ in ratingsPerUser.get(u, ())}
    seenBooks |= validBooksPerUser[u]

    unreadBooks = [book for book in sortedBooks if book not in seenBooks]
    if unreadBooks:
        # Users who have interacted with every known book get no negative sample.
        negativeSamples.append((u, negativeRng.choice(unreadBooks)))

In [None]:
# Best cut-off found so far and the accuracy it reached.
bestThreshold, bestAccuracy = None, 0

# Candidate cut-offs expressed as a fraction of totalRead: 0.1, 0.2, ..., 0.9.
thresholdPercentages = [0.1 * step for step in range(1, 10)]

for fraction in thresholdPercentages:
    currentThreshold = totalRead * fraction

    # Walk mostPopular in its given order, collecting books until the running
    # count first exceeds the cut-off (the book that crosses it is included).
    returnSet = set()
    runningCount = 0
    for bookCount, book in mostPopular:
        runningCount += bookCount
        returnSet.add(book)
        if runningCount > currentThreshold:
            break

    # A validation row is a hit when its book is in the set; a negative sample
    # is a hit when its book is not.
    hitsOnPositives = sum(1 for _, book, _ in ratingsValid if book in returnSet)
    hitsOnNegatives = sum(1 for _, book in negativeSamples if book not in returnSet)

    accuracy = (hitsOnPositives + hitsOnNegatives) / (len(ratingsValid) + len(negativeSamples))

    # Strict '>' means ties keep the earlier (smaller) cut-off.
    if accuracy > bestAccuracy:
        bestThreshold, bestAccuracy = currentThreshold, accuracy

print("Best threshold (in terms of totalRead):", bestThreshold / totalRead)
print("Best accuracy of the model with improved threshold:", bestAccuracy)
threshold = bestThreshold / totalRead

In [None]:
# Nothing evaluated yet: no winning (popularity, Jaccard) pair, and 0 is the accuracy to beat.
bestCombinedThresholds, bestCombinedAccuracy = None, 0

def create_popular_set(percentage, popularity=None, total=None):
    """Collect leading books until their cumulative count exceeds a share of the total.

    Args:
        percentage: Fraction of `total` that the running count must exceed.
        popularity: Iterable of (count, book) pairs, consumed in the order
            given. Defaults to the notebook-level `mostPopular`.
        total: Overall count the fraction is applied to. Defaults to the
            notebook-level `totalRead`.

    Returns:
        Set of books taken from the front of `popularity`. The book that pushes
        the running count past the cut-off is included; if the cut-off is never
        exceeded, every book is returned.
    """
    # Fall back to the notebook globals so existing one-argument calls behave as before.
    if popularity is None:
        popularity = mostPopular
    if total is None:
        total = totalRead

    cutoff = total * percentage
    popularSet = set()
    runningCount = 0
    for bookCount, book in popularity:
        runningCount += bookCount
        popularSet.add(book)
        if runningCount > cutoff:
            break
    return popularSet

# Candidate cut-offs for both signals: 0.1, 0.2, ..., 0.9.
popularityThresholds = [0.1 * i for i in range(1, 10)]
jaccardThresholds = [0.1 * i for i in range(1, 10)]


def _scored_pairs(pairs):
    """Return [(book, maxSimilarity)] for each (user, book) pair whose user is in ratingsPerUser.

    maxSimilarity is the largest jaccard_similarity between the candidate book
    and any book in the user's training entries (0 when that list is empty).
    """
    scored = []
    for u, b in pairs:
        if u not in ratingsPerUser:
            continue  # Skip users who do not appear in the training data
        userBooks = [book for book, _ in ratingsPerUser[u]]
        maxSimilarity = max((jaccard_similarity(b, b_prime) for b_prime in userBooks), default=0)
        scored.append((b, maxSimilarity))
    return scored


# The similarity of a pair does not depend on either threshold, so compute it
# once here instead of once per grid cell (81 times in the nested loops).
# NOTE(review): assumes jaccard_similarity is a pure function -- confirm.
positiveScores = _scored_pairs((u, b) for u, b, _ in ratingsValid)
negativeScores = _scored_pairs(negativeSamples)
totalPredictions = len(positiveScores) + len(negativeScores)

for popThreshold in popularityThresholds:
    popularSet = create_popular_set(popThreshold)

    for jaccardThreshold in jaccardThresholds:
        # Positives: correct when predicted 'read' (popular OR similar enough).
        correctPredictions = sum(
            1 for b, similarity in positiveScores
            if b in popularSet or similarity > jaccardThreshold
        )
        # Negatives: correct when predicted 'not read' (neither popular nor similar enough).
        correctPredictions += sum(
            1 for b, similarity in negativeScores
            if b not in popularSet and similarity <= jaccardThreshold
        )

        accuracy = correctPredictions / totalPredictions

        # Strict '>' keeps the first combination that reaches a given accuracy.
        if accuracy > bestCombinedAccuracy:
            bestCombinedAccuracy = accuracy
            bestCombinedThresholds = (popThreshold, jaccardThreshold)

print("Best popularity threshold (in terms of totalRead):", bestCombinedThresholds[0])
print("Best Jaccard similarity threshold:", bestCombinedThresholds[1])
print("Best accuracy of the combined model:", bestCombinedAccuracy)


In [16]:
# Write one 0/1 'read' prediction per (user, book) pair listed in pairs_Read.csv.
# `with` guarantees both handles are closed, including when a line fails to
# parse; previously the input file was never closed.
with open("data/predictions_Read.csv", 'w') as predictions, open("data/pairs_Read.csv") as pairs:
    for l in pairs:
        if l.startswith("userID"):
            # Header row: copy through unchanged.
            predictions.write(l)
            continue
        u, b = l.strip().split(',')

        # Score = global offset + per-user offset + per-book offset; ids missing
        # from either table contribute 0.
        score = best_alpha + best_beta_u.get(u, 0) + best_beta_i.get(b, 0)

        # Output 1 when the score is positive, otherwise 0.
        # NOTE(review): if these parameters come from a rating model, the score
        # may be positive for nearly every pair -- confirm the intended model/threshold.
        prediction = 1 if score > 0 else 0

        predictions.write(f"{u},{b},{prediction}\n")

## Rating Prediction Task
___

In [None]:
# Write one rating prediction per (user, book) pair listed in pairs_Rating.csv.
# `with` guarantees both handles are closed, including when a line fails to
# parse; previously the input file was never closed.
with open("data/predictions_Rating.csv", 'w') as predictions, open("data/pairs_Rating.csv") as pairs:
    for l in pairs:
        if l.startswith("userID"):  # header
            predictions.write(l)
            continue
        u, b = l.strip().split(',')

        # Global offset + per-user offset + per-book offset. .get(..., 0) keeps
        # ids missing from either table from raising KeyError, matching how the
        # read-prediction cell looks up the same tables.
        prediction = best_alpha + best_beta_u.get(u, 0) + best_beta_i.get(b, 0)

        # Clip into the [1, 5] range.
        prediction = max(1, min(5, prediction))

        predictions.write(f"{u},{b},{prediction}\n")