In [131]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy
import string
import random
from sklearn import linear_model
import torch
import numpy as np

import warnings
warnings.filterwarnings("ignore")

def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

def readCSV(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        u,b,r = l.strip().split(',')
        r = int(r)
        yield u,b,r

allRatings = []
for l in readCSV("train_Interactions.csv.gz"):
    allRatings.append(l)

ratingsTrain = allRatings[:190000]
ratingsValid = allRatings[190000:]
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
bookUsers = defaultdict(set)
userBooks = defaultdict(set)

userInteractionCounts = defaultdict(int)
bookInteractionCounts = defaultdict(int)

for u,b,r in ratingsTrain:
    ratingsPerUser[u].append((b,r))
    ratingsPerItem[b].append((u,r))
    bookUsers[b].add(u)
    userBooks[u].add(b)
    userInteractionCounts[u] += 1
    bookInteractionCounts[b] += 1
    

medianUserInteractions = np.median(list(userInteractionCounts.values()))
medianBookInteractions = np.median(list(bookInteractionCounts.values()))

bookCount = defaultdict(int)
totalRead = 0

for user,book,_ in readCSV("train_Interactions.csv.gz"):
    bookCount[book] += 1
    totalRead += 1

mostPopular = [(bookCount[x], x) for x in bookCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalRead/2: break


In [132]:
import math

In [133]:
negative_valid_samples = []

for user, book, _ in ratingsValid:
     negative_books = set()
     while len(negative_books) < 1:
        negative_book = random.choice(list(bookCount.keys()))
        if all(b != negative_book for b, _ in ratingsPerUser[user]) and negative_book not in negative_books:
            negative_books.add(negative_book)
            negative_valid_samples.append((user, negative_book, 0))  # 0 for negative sample

validation_with_negatives = [(u, b, 1) for u, b, _ in ratingsValid] + negative_valid_samples

In [134]:
def jaccardSim(book1, book2):
    users1 = bookUsers.get(book1, set())
    users2 = bookUsers.get(book2, set())
    if not users1 or not users2:
        return 0
    intersection = len(users1.intersection(users2))
    union = len(users1.union(users2))
    return intersection / union

def maxJaccardSimilarity(user, book):
    return max([jaccardSim(book, b_read) for b_read in userBooks[user] if b_read != book], default=0)

def avgJaccardSimilarity(user, book):
    ans = [jaccardSim(book, b_read) for b_read in userBooks[user] if b_read != book]
    return np.mean(ans) if ans else 0

def cosineSim(book1, book2):
    users1 = bookUsers.get(book1, set())
    users2 = bookUsers.get(book2, set())
    if not users1 or not users2:
        return 0
    intersection = len(users1.intersection(users2))
    magnitude1 = math.sqrt(len(users1))
    magnitude2 = math.sqrt(len(users2))
    return intersection / (magnitude1 * magnitude2)


def maxCosineSimilarity(user, book):
    return max([cosineSim(book, b_read) for b_read in userBooks[user]], default=0)

def avgCosineSimilarity(user, book):
    ans = [cosineSim(book, b_read) for b_read in userBooks[user] if b_read != book]
    return np.mean(ans) if ans else 0

In [135]:
import gzip
from collections import defaultdict
import random
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

trainRecords = []

for u, b, _ in ratingsTrain:
    numBookInteractions = bookInteractionCounts.get(b, medianBookInteractions)
    numUserInteractions = userInteractionCounts.get(u, medianUserInteractions)
    maxJaccardSim = maxJaccardSimilarity(u, b)
    maxCosineSim = maxCosineSimilarity(u, b)
    avgJaccardSim = avgJaccardSimilarity(u, b)
    avgCosineSim = avgCosineSimilarity(u, b)
    isPopular = 1 if b in mostPopular else 0
    
    trainRecords.append({
        'user': u,
        'book': b,
        'book_popularity': numBookInteractions,
        'user_interactions': numUserInteractions,
        'is_popular': isPopular,
        'max_jaccard_sim': maxJaccardSim,
        'max_cosine_sim': maxCosineSim,
        'avg_jaccard_sim': avgJaccardSim,
        'avg_cosine_sim': avgCosineSim,
        'label': 1
    })
    
    while True:
        negBook = random.choice(list(bookCount.keys()))
        if negBook not in userBooks[u]:
            numBookInteractions = bookInteractionCounts.get(negBook, medianBookInteractions)
            numUserInteractions = userInteractionCounts.get(u, medianUserInteractions)
            maxJaccardSim = maxJaccardSimilarity(u, negBook)
            maxCosineSim = maxCosineSimilarity(u, negBook)
            avgJaccardSim = avgJaccardSimilarity(u, negBook)
            avgCosineSim = avgCosineSimilarity(u, negBook)
            isPopular = 1 if b in mostPopular else 0

            trainRecords.append({
                'user': u,
                'book': negBook,
                'book_popularity': numBookInteractions,
                'user_interactions': numUserInteractions,
                'is_popular': isPopular,
                'max_jaccard_sim': maxJaccardSim,
                'max_cosine_sim': maxCosineSim,
                'avg_jaccard_sim': avgJaccardSim,
                'avg_cosine_sim': avgCosineSim,
                'label': 0
            })
            break

trainDf = pd.DataFrame(trainRecords)

validRecords = []

for u, b, label in validation_with_negatives:
    numBookInteractions = bookInteractionCounts.get(b, medianBookInteractions)
    numUserInteractions = userInteractionCounts.get(u, medianUserInteractions)
    isPopular = 1 if b in mostPopular else 0
    maxJaccardSim = maxJaccardSimilarity(u, b)
    maxCosineSim = maxCosineSimilarity(u, b)
    avgJaccardSim = avgJaccardSimilarity(u, b)
    avgCosineSim = avgCosineSimilarity(u, b)

    validRecords.append({
        'user': u,
        'book': b,
        'book_popularity': numBookInteractions,
        'user_interactions': numUserInteractions,
        'is_popular': isPopular,
        'max_jaccard_sim': maxJaccardSim,
        'max_cosine_sim': maxCosineSim,
        'avg_jaccard_sim': avgJaccardSim,
        'avg_cosine_sim': avgCosineSim,
        'label': label
    })

validDf = pd.DataFrame(validRecords)

In [136]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [137]:
# Fill missing values and standardize features
featureCols = ['book_popularity', 'user_interactions', 'max_jaccard_sim', 'max_cosine_sim', 'avg_jaccard_sim', 'avg_cosine_sim']
trainDf[featureCols] = trainDf[featureCols].fillna(0)
validDf[featureCols] = validDf[featureCols].fillna(0)

trainDf[['book_popularity_scaled', 'user_interactions_scaled']] = scaler.fit_transform(trainDf[['book_popularity', 'user_interactions']])
validDf[['book_popularity_scaled', 'user_interactions_scaled']] = scaler.transform(validDf[['book_popularity', 'user_interactions']])

In [201]:
featureCols = ['book_popularity_scaled', 'user_interactions_scaled', 'is_popular', 'max_jaccard_sim', 'avg_cosine_sim']
trainFeatures = np.array(trainDf[featureCols])
validFeatures = np.array(validDf[featureCols])

In [218]:
model = LogisticRegression(max_iter=10000, C = 0.005)
model.fit(trainFeatures, trainDf['label'])

validProbs = model.predict_proba(validFeatures)[:, 1]
n_test = len(validProbs)

n_positive = n_test // 2
sorted_indices = np.argsort(-validProbs)

validPreds = np.zeros(n_test, dtype=int)
validPreds[sorted_indices[:n_positive]] = 1

validPreds = (validProbs >= 0.35).astype(int)

accuracy = accuracy_score(validDf['label'], validPreds)
auc = roc_auc_score(validDf['label'], validProbs)

print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Validation AUC: {auc:.4f}")
print("\nClassification Report:")
print(classification_report(validDf['label'], validPreds))

Validation Accuracy: 0.7556
Validation AUC: 0.8226

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.80      0.77     10000
           1       0.78      0.71      0.74     10000

    accuracy                           0.76     20000
   macro avg       0.76      0.76      0.76     20000
weighted avg       0.76      0.76      0.76     20000



In [191]:
validProbs[:10]

array([0.40851065, 0.99743297, 0.32158174, 0.70294376, 0.84803255,
       0.54243706, 0.63005725, 0.88574091, 0.99146794, 0.36917487])

In [126]:
validProbs[:100]

array([0.44711378, 0.9983031 , 0.31994058, 0.70821143, 0.8546782 ,
       0.54027241, 0.6531626 , 0.89921993, 0.99289062, 0.34979513,
       0.99665277, 0.25373251, 0.99998982, 0.90485009, 0.66688095,
       0.45956429, 0.4601712 , 0.90869636, 0.27798971, 0.98132337,
       0.36442192, 0.70186892, 0.25306995, 0.47631094, 0.91252158,
       0.28119819, 0.99704291, 0.99647736, 0.99971581, 0.35990137,
       0.24717274, 0.98871317, 0.35699037, 0.24612964, 0.34387864,
       0.21602708, 0.42118277, 0.21873706, 0.82156536, 0.78262547,
       0.7936214 , 0.99169941, 0.38185029, 0.8188856 , 0.99974792,
       0.96538959, 0.88107921, 0.68963043, 0.43442545, 0.37923922,
       0.41334405, 0.27130194, 0.99962363, 0.39061746, 0.67356401,
       0.22982065, 0.30574848, 0.90013181, 0.53200398, 0.9999986 ,
       0.22701326, 0.48456844, 0.26228966, 0.46058446, 0.5511289 ,
       0.65723254, 0.30978371, 0.46992026, 0.99994631, 0.98116958,
       0.91232414, 0.73598245, 0.70925671, 0.24670752, 0.26961

In [9]:
validProbs

array([0.00155755, 0.011709  , 0.001266  , ..., 0.00099842, 0.00072374,
       0.00074837])

In [10]:
import numpy as np

In [217]:
test_pairs = []

with open("pairs_Read.csv", 'r') as f:
    for l in f:
        if l.startswith("userID"):
            continue
        u, b = l.strip().split(',')
        test_pairs.append((u, b))

test_records = []

for u, b in test_pairs:
    # Book popularity from training data
    numBookInteractions = bookInteractionCounts.get(b, medianBookInteractions)
    numUserInteractions = userInteractionCounts.get(u, medianUserInteractions)
    isPopular = 1 if b in mostPopular else 0

    maxJaccardSim = maxJaccardSimilarity(u, b)
    maxCosineSim = maxCosineSimilarity(u, b)
    avgJaccardSim = avgJaccardSimilarity(u, b)
    avgCosineSim = avgCosineSimilarity(u, b)
    
    test_records.append({
        'user': u,
        'book': b,
        'book_popularity': numBookInteractions,
        'user_interactions': numUserInteractions,
        'is_popular': isPopular,
        'max_jaccard_sim': maxJaccardSim,
        'max_cosine_sim': maxCosineSim,
        'avg_jaccard_sim': avgJaccardSim,
        'avg_cosine_sim': avgCosineSim,
    })

test_df = pd.DataFrame(test_records)

test_df[['book_popularity_scaled', 'user_interactions_scaled']] = scaler.transform(test_df[['book_popularity', 'user_interactions']])
feature_cols = ['book_popularity_scaled', 'user_interactions_scaled', 'is_popular', 'max_jaccard_sim', 'avg_jaccard_sim']
test_df[feature_cols] = test_df[feature_cols].fillna(0)
test_features = test_df[feature_cols]

test_probs = model.predict_proba(test_features)[:, 1]
test_preds = (test_probs >= 0.35).astype(int)
# n_test = len(test_probs)

# n_positive = n_test // 2
# sorted_indices = np.argsort(-test_probs)

# test_preds = np.zeros(n_test, dtype=int)
# test_preds[sorted_indices[:n_positive]] = 1

with open("predictions_Read.csv", 'w') as predictions:
    predictions.write("userID,bookID,prediction\n")
    
    for idx, (u, b) in enumerate(test_pairs):
        prediction = test_preds[idx]
        predictions.write(f"{u},{b},{prediction}\n")

