In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy
import string
import random
from sklearn import linear_model
import torch
import pandas as pd
from surprise import Dataset, Reader, SVDpp, accuracy, KNNBaseline
from surprise.model_selection import train_test_split


import warnings
warnings.filterwarnings("ignore")

def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

def readCSV(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        u,b,r = l.strip().split(',')
        r = int(r)
        yield u,b,r

allRatings = []
for l in readCSV("train_Interactions.csv.gz"):
    allRatings.append(l)

ratingsTrain = allRatings[:190000]
ratingsValid = allRatings[190000:]
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
bookUsers = defaultdict(set)
userBooks = defaultdict(set)
for u,b,r in ratingsTrain:
    ratingsPerUser[u].append((b,r))
    ratingsPerItem[b].append((u,r))
    bookUsers[b].add(u)
    userBooks[u].add(b)

bookCount = defaultdict(int)
totalRead = 0

for user,book,_ in readCSV("train_Interactions.csv.gz"):
    bookCount[book] += 1
    totalRead += 1
    


In [2]:
from itertools import product
from surprise import Dataset, Reader, SVDpp, accuracy

In [3]:
# Create DataFrames from the training and validation data
df_train = pd.DataFrame(ratingsTrain, columns=['userID', 'itemID', 'rating'])
df_valid = pd.DataFrame(ratingsValid, columns=['userID', 'itemID', 'rating'])

reader = Reader(rating_scale=(1, 5))

# Prepare the training data
train_data = Dataset.load_from_df(df_train[['userID', 'itemID', 'rating']], reader)
trainset = train_data.build_full_trainset()

# Prepare the testset (validation data)
testset = list(zip(df_valid['userID'], df_valid['itemID'], df_valid['rating']))


In [None]:
from surprise import SVD

In [58]:
param_grid = {
    'n_factors': [1],
    'n_epochs': [19],
    'lr_bu': [0.01],
    'lr_bi': [0.005],
    'lr_pu': [0.001],
    'lr_qi': [0.005],
    'reg_bu': [0.15],
    'reg_bi': [0.08],
    'reg_pu': [0.1],
    'reg_qi': [0.1],
}




In [59]:
# Generate all combinations of parameters
param_names = list(param_grid.keys())
param_values = list(param_grid.values())
param_combinations = [dict(zip(param_names, v)) for v in product(*param_values)]

best_rmse = float('inf')
best_params = None

for params in param_combinations:
    algo_params = {k: v for k, v in params.items() if v is not None}
    algo_params['random_state'] = 420
    algo = SVD(**algo_params)
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=False)
    print(f"Parameters: {algo_params} => RMSE: {rmse:.4f}")
    if rmse < best_rmse:
        best_rmse = rmse
        best_params = algo_params

print(f"\nBest RMSE: {best_rmse:.4f} with parameters: {best_params}")

#Parameters: {'n_factors': 1, 'n_epochs': 20, 'lr_bu': 0.005, 'lr_bi': 0.005, 'lr_pu': 0.005, 'lr_qi': 0.001, 'reg_bu': 0.01, 'reg_bi': 0.05, 'reg_pu': 0.05, 'reg_qi': 0.01, 'random_state': 100} => RMSE: 1.1920


Parameters: {'n_factors': 1, 'n_epochs': 19, 'lr_bu': 0.01, 'lr_bi': 0.005, 'lr_pu': 0.001, 'lr_qi': 0.005, 'reg_bu': 0.15, 'reg_bi': 0.08, 'reg_pu': 0.1, 'reg_qi': 0.1, 'random_state': 420} => RMSE: 1.1879

Best RMSE: 1.1879 with parameters: {'n_factors': 1, 'n_epochs': 19, 'lr_bu': 0.01, 'lr_bi': 0.005, 'lr_pu': 0.001, 'lr_qi': 0.005, 'reg_bu': 0.15, 'reg_bi': 0.08, 'reg_pu': 0.1, 'reg_qi': 0.1, 'random_state': 420}


In [63]:
best_pp_params = {'n_factors': 3, 'n_epochs': 20, 'lr_bu': 0.01, 'lr_bi': 0.006, 'lr_pu': 0.001, 'lr_qi': 0.005, 'reg_bu': 0.15, 'reg_bi': 0.1, 'reg_pu': 0.1, 'reg_qi': 0.1}
best_svd_params = {'n_factors': 1, 'n_epochs': 19, 'lr_bu': 0.01, 'lr_bi': 0.005, 'lr_pu': 0.001, 'lr_qi': 0.005, 'reg_bu': 0.15, 'reg_bi': 0.08, 'reg_pu': 0.1, 'reg_qi': 0.1, 'random_state': 139}
#algo = SVDpp(**best_params)
algo = SVD(**best_svd_params)
# Train the algorithm
algo.fit(trainset)

with open("pairs_Rating.csv", 'r') as pairs_file, open("predictions_Rating.csv", 'w') as predictions:
    for line in pairs_file:
        if line.startswith("userID"):
            predictions.write(line)
            continue
        userID, itemID = line.strip().split(',')

        # Predict the rating
        pred = algo.predict(userID, itemID)
        predRating = pred.est

        # Write the prediction
        predictions.write(f"{userID},{itemID},{predRating}\n")

In [61]:
best_params1 = {'n_factors': 0, 'n_epochs': 0, 'lr_all': 0.007394651229831707,  'reg_all': 0.07947268945576047}
algo = SVD(**best_params1)
algo.fit(trainset)
predictions = algo.test(testset)
rmse = accuracy.rmse(predictions, verbose=False)
print(f"Parameters: {best_params1} => RMSE: {rmse:.4f}")


Parameters: {'n_factors': 0, 'n_epochs': 0, 'lr_all': 0.007394651229831707, 'reg_all': 0.07947268945576047} => RMSE: 1.3142


In [30]:
from surprise import SVDpp, KNNBaseline

# Train SVD model
algo1 = SVDpp(**best_pp_params)
algo1.fit(trainset)

# Train KNNBaseline model
algo2 = SVD(**best_svd_params)
algo2.fit(trainset)

# Get predictions
predictions1 = algo1.test(testset)
predictions2 = algo2.test(testset)

# Combine predictions
def combine_predictions(preds1, preds2):
    combined_preds = []
    for p1, p2 in zip(preds1, preds2):
        est = p1.est * 0.8 + p2.est * 0.2
        combined_preds.append(p1._replace(est=est))
    return combined_preds

combined_predictions = combine_predictions(predictions1, predictions2)
rmse = accuracy.rmse(combined_predictions)
print(f"Ensemble Validation RMSE: {rmse}")


RMSE: 1.1882
Ensemble Validation RMSE: 1.188164237663758


In [33]:
with open("pairs_Rating.csv", 'r') as pairs_file, open("predictions_Rating.csv", 'w') as predictions:
    for line in pairs_file:
        if line.startswith("userID"):
            predictions.write(line)
            continue
        userID, itemID = line.strip().split(',')
        # Predict the rating
        pred1 = algo1.predict(userID, itemID)
        pred2 = algo2.predict(userID, itemID)
        predRating = pred1.est * 0.8 + pred2.est * 0.2

        # Write the prediction
        predictions.write(f"{userID},{itemID},{predRating}\n")

In [42]:
from surprise import SVD
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

def objective(params):
    algo = SVD(random_state=420, **params)
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=False)
    return {'loss': rmse, 'status': STATUS_OK}

param_space = {
    'n_factors': hp.choice('n_factors', [1, 10, 20]),
    'n_epochs': hp.choice('n_epochs', [20]),
    'lr_all': hp.loguniform('lr_all', -5, -2),
    'reg_all': hp.loguniform('reg_all', -5, -2),
}

trials = Trials()
best = fmin(fn=objective, space=param_space, algo=tpe.suggest, max_evals=50, trials=trials)
print(f"Best parameters: {best}")


100%|██████████| 50/50 [00:40<00:00,  1.23trial/s, best loss: 1.189038056796004] 
Best parameters: {'lr_all': 0.007394651229831707, 'n_epochs': 0, 'n_factors': 0, 'reg_all': 0.07947268945576047}
