In [1]:
# 80/20 train/test split of the ratings data for the SVD model

# @article{Hug2020,
#   doi = {10.21105/joss.02174},
#   url = {https://doi.org/10.21105/joss.02174},
#   year = {2020},
#   publisher = {The Open Journal},
#   volume = {5},
#   number = {52},
#   pages = {2174},
#   author = {Nicolas Hug},
#   title = {Surprise: A Python library for recommender systems},
#   journal = {Journal of Open Source Software}
# }



import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split

# Single seed shared by the hold-out split and the SVD initialisation so that
# a Restart & Run All reproduces the same partition and starting factors.
SEED = 42

# Load the joined ratings table. low_memory=False turns off pandas' chunked
# parsing so column dtypes are inferred from the whole file.
df = pd.read_csv('joined_dataset.csv', low_memory=False)

# NOTE(review): the scale is hard-coded to 1-10; confirm that every value in
# 'Book-Rating' actually falls inside that range (e.g. no 0 ratings).
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df[['User-ID', 'ISBN', 'Book-Rating']], reader)

# Hold out 20% of the ratings as a test set (seeded for reproducibility).
trainset, testset = train_test_split(data, test_size=0.2, random_state=SEED)

# Matrix-factorisation model; seeded so repeated fits start from the same state.
model = SVD(random_state=SEED)

In [2]:
from surprise.model_selection import cross_validate

# 5-fold cross-validation of the SVD model, reporting RMSE and MAE per fold.
# Left as the cell's last expression so the results are displayed.
cross_validate(
    model,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.6396  1.6364  1.6355  1.6298  1.6421  1.6367  0.0042  
MAE (testset)     1.2643  1.2651  1.2632  1.2592  1.2642  1.2632  0.0021  
Fit time          6.71    6.80    6.81    6.93    6.94    6.84    0.09    
Test time         0.85    0.86    0.92    0.50    0.50    0.72    0.19    


{'test_rmse': array([1.63961336, 1.63638737, 1.63548877, 1.62981898, 1.64212033]),
 'test_mae': array([1.26434266, 1.26508775, 1.26315825, 1.25919327, 1.26419115]),
 'fit_time': (6.706929445266724,
  6.798751592636108,
  6.808389902114868,
  6.926517486572266,
  6.937875986099243),
 'test_time': (0.8484663963317871,
  0.8584671020507812,
  0.9170818328857422,
  0.4975879192352295,
  0.5018248558044434)}

In [4]:
# Random predictor baseline
from surprise import NormalPredictor

# Baseline that makes random predictions based on a normal distribution.
random_model = NormalPredictor()

# Same 5-fold RMSE/MAE protocol as the other models, for a like-for-like comparison.
cross_validate(
    random_model,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)


Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    2.5045  2.4972  2.4985  2.4974  2.4928  2.4981  0.0037  
MAE (testset)     1.9985  1.9964  1.9953  1.9929  1.9880  1.9942  0.0036  
Fit time          0.57    0.75    0.72    0.73    0.72    0.70    0.07    
Test time         0.47    0.55    0.52    0.50    0.44    0.50    0.04    


{'test_rmse': array([2.50445139, 2.49723811, 2.49850709, 2.49742461, 2.49276651]),
 'test_mae': array([1.99854087, 1.99638929, 1.99526808, 1.99290275, 1.98797308]),
 'fit_time': (0.5674049854278564,
  0.7508313655853271,
  0.7169806957244873,
  0.7335541248321533,
  0.7161374092102051),
 'test_time': (0.4708833694458008,
  0.5506687164306641,
  0.5193257331848145,
  0.5004458427429199,
  0.4353368282318115)}

In [3]:
from surprise import AlgoBase
from surprise import Dataset
from surprise.model_selection import cross_validate

# Custom baseline algorithm class
class MeanPredictor(AlgoBase):
    """Baseline algorithm that predicts the training set's global mean rating
    for every (user, item) pair."""

    def __init__(self):
        # Nothing to configure; only the base-class initialisation is needed.
        super().__init__()

    def fit(self, trainset):
        """Fit on ``trainset`` by remembering its global mean rating.

        Runs the base-class ``fit`` first, then stores
        ``trainset.global_mean`` on the instance. Returns ``self``.
        """
        super().fit(trainset)
        self.global_mean = trainset.global_mean
        return self

    def estimate(self, u, i):
        """Return the stored global mean; ``u`` and ``i`` are not used."""
        return self.global_mean


# Instantiate the global-mean baseline
baseline_model = MeanPredictor()

# Score it with 5-fold cross-validation on RMSE and MAE
cv_results = cross_validate(
    baseline_model,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

# Print the raw results dictionary
print(cv_results)


Evaluating RMSE, MAE of algorithm MeanPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.8356  1.8445  1.8396  1.8464  1.8405  1.8413  0.0038  
MAE (testset)     1.4920  1.4960  1.4909  1.4998  1.4925  1.4942  0.0033  
Fit time          0.20    0.35    0.40    0.35    0.38    0.34    0.07    
Test time         0.27    0.23    0.75    0.27    0.78    0.46    0.25    
{'test_rmse': array([1.83555965, 1.8445041 , 1.83962357, 1.84638696, 1.84054816]), 'test_mae': array([1.49196806, 1.49602446, 1.49087695, 1.49976284, 1.49253408]), 'fit_time': (0.2002556324005127, 0.3512086868286133, 0.39946413040161133, 0.35322093963623047, 0.37673234939575195), 'test_time': (0.26767563819885254, 0.23459315299987793, 0.7517728805541992, 0.26772093772888184, 0.7840862274169922)}


In [11]:
from surprise import KNNBaseline
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate

# Load your dataset
df = pd.read_csv('joined_dataset.csv', low_memory=False)

# Keep only users with at least `min_ratings` ratings. A vectorised
# group-size mask is used instead of groupby().filter(lambda ...), which
# invokes a Python callback once per user; the surviving rows are the same.
min_ratings = 3
user_rating_counts = df.groupby('User-ID')['User-ID'].transform('size')
df_filtered = df[user_rating_counts >= min_ratings]

# Then keep only books with at least `min_item_ratings` ratings among the
# remaining rows. The user filter is not re-applied afterwards, so a user can
# end up with fewer than `min_ratings` rows once rare books are removed.
min_item_ratings = 10
item_rating_counts = df_filtered.groupby('ISBN')['ISBN'].transform('size')
df_filtered = df_filtered[item_rating_counts >= min_item_ratings]

# Take both ends of the rating scale from the frame that is actually loaded
# (previously the minimum came from df_filtered but the maximum from df).
filtered_ratings = df_filtered['Book-Rating']
reader = Reader(rating_scale=(filtered_ratings.min(), filtered_ratings.max()))
data = Dataset.load_from_df(df_filtered[['User-ID', 'ISBN', 'Book-Rating']], reader)

# Set the similarity options
sim_options = {
    'name': 'pearson_baseline',  # Shrunk Pearson correlation centred on baseline estimates
    'user_based': True  # If True, computes similarities between users. False for items.
}

# Initialize the KNNBaseline algorithm
knn_baseline = KNNBaseline(sim_options=sim_options)

# Perform cross-validation (5 folds, RMSE and MAE)
cv_results_knn = cross_validate(knn_baseline, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Output results
print(cv_results_knn)


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.7265  1.7464  1.7309  1.7374  1.7393  1.7361  0.0069  
MAE (testset)     1.3197  1.3296  1.3187  1.3186  1.3238  1.3221  0.0042  
Fit time          28.52   30.61   23.00   23.45   26.40   26.40   2.92    
Test time         1.27    1.62    1.59    1.28    1.3

In [9]:
# svd_model.pkl has already been created; the save below is commented out to prevent overwriting it.
# import pickle

# # Save the model
# with open('svd_model.pkl', 'wb') as file:
#     pickle.dump(model, file)