In [1]:
!pip install scikit-surprise


Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/772.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/772.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m768.0/772.0 kB[0m [31m11.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3163763 sha256=991e76acd233dbdbb3460780aa55c01612389b8adaafb7c022d87ed980f73b24
  Stored in directory: /root/.cache/pip/wheels

In [8]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate
import pandas as pd

In [18]:
# The path to the dataset file
file_path = Dataset.load_builtin('ml-100k')

# As in your previous code, define the reader with the correct format
reader = Reader(line_format='user item rating timestamp', sep='\t')

# Define the SVD algorithm
algo = SVD()

# Run 5-fold cross-validation and print results
cross_validate(algo, file_path, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9399  0.9436  0.9301  0.9310  0.9332  0.9356  0.0053  
MAE (testset)     0.7426  0.7429  0.7347  0.7332  0.7349  0.7376  0.0042  
Fit time          1.43    1.43    1.42    1.46    2.15    1.58    0.29    
Test time         0.20    0.13    0.44    0.12    0.22    0.22    0.12    


{'test_rmse': array([0.93985157, 0.94361622, 0.93008772, 0.93099408, 0.9332488 ]),
 'test_mae': array([0.74261409, 0.74285568, 0.73465993, 0.73315292, 0.73486236]),
 'fit_time': (1.430532455444336,
  1.4265687465667725,
  1.4150314331054688,
  1.4574902057647705,
  2.15321683883667),
 'test_time': (0.20369362831115723,
  0.1315441131591797,
  0.4414336681365967,
  0.12229442596435547,
  0.22421526908874512)}

In [45]:
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split
import numpy as np

# Load the dataset
data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=0.2)

# Define and train the SVD model
algo = SVD()
algo.fit(trainset)

# Predict on the test set
predictions = algo.test(testset)

# Custom function to calculate NDCG
def calculate_ndcg(predictions, k=10):
    # Group the prediction scores by user
    user_est_true = {}
    for uid, _, true_r, est, _ in predictions:
        if uid not in user_est_true:
            user_est_true[uid] = []
        user_est_true[uid].append((est, true_r))

    ndcg = 0
    for uid, user_ratings in user_est_true.items():
        # Keep only the top k items
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        user_ratings = user_ratings[:k]

        # Calculate DCG (Discounted Cumulative Gain) and IDCG (Ideal DCG)
        dcg = sum([true_r / np.log2(i + 2) for i, (_, true_r) in enumerate(user_ratings)])
        idcg = sum([np.log2(i + 2) for i in range(len(user_ratings))])
        ndcg += dcg / idcg if idcg > 0 else 0

    # Calculate the average NDCG
    return ndcg / len(user_est_true)

# Calculate NDCG
ndcg_value = calculate_ndcg(predictions, k=10)
print(f'NDCG: {ndcg_value}')


NDCG: 0.9209504395469672


In [44]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import GridSearchCV

# Load the movielens-100k dataset
data = Dataset.load_builtin('ml-100k')

# Define a parameter grid to search over
param_grid = {
    'n_epochs': [5, 10], # Number of epochs. You can try different numbers here.
    'lr_all': [0.002, 0.005], # Learning rate. You can try different values here.
    'reg_all': [0.4, 0.6] # Regularization term. You can try different values here.
}

# Setup the grid search
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

# Perform the grid search
gs.fit(data)

# Best RMSE score
print(gs.best_score['rmse'])


0.9645225862412641


In [27]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
import numpy as np

# Load the dataset and train the model using the best parameters found by GridSearchCV
data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=0.2)

best_params = gs.best_params['rmse']
algo = SVD(n_epochs=best_params['n_epochs'], lr_all=best_params['lr_all'], reg_all=best_params['reg_all'])
algo.fit(trainset)

# Make predictions on the testset
predictions = algo.test(testset)

# calculate_ndcg
def calculate_ndcg(predictions, k=10):
    user_est_true = {}
    for uid, _, true_r, est, _ in predictions:
        if uid not in user_est_true:
            user_est_true[uid] = []
        user_est_true[uid].append((est, true_r))

    ndcg = 0
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        user_ratings = user_ratings[:k]

        dcg = sum([true_r / np.log2(i + 2) for i, (_, true_r) in enumerate(user_ratings)])
        idcg = sum([np.log2(i + 2) for i in range(len(user_ratings))])
        ndcg += dcg / idcg if idcg > 0 else 0

    return ndcg / len(user_est_true)

ndcg_value = calculate_ndcg(predictions, k=10)
print(f'NDCG: {ndcg_value}')


NDCG: 0.9182108319784339


**Stacking Ensemble SVD and KNN**



In [32]:
from surprise import Dataset, Reader, SVD, KNNBasic, accuracy
from surprise.model_selection import KFold
import numpy as np

# Load the MovieLens 100k dataset
data = Dataset.load_builtin('ml-100k')

# Setting up five-fold cross-validation
kf = KFold(n_splits=5)

# Initialize SVD and KNN Basic algorithms
algo_svd = SVD()
algo_knn = KNNBasic()

# Lists to store predictions from SVD and KNN
svd_predictions = []
knn_predictions = []
actual_ratings = []  # List to store actual ratings

# For each train/test split in the dataset
for trainset, testset in kf.split(data):

    # Train SVD model and make predictions
    algo_svd.fit(trainset)
    predictions_svd = algo_svd.test(testset)
    svd_predictions.extend(predictions_svd)

    # Train KNN model and make predictions
    algo_knn.fit(trainset)
    predictions_knn = algo_knn.test(testset)
    knn_predictions.extend(predictions_knn)

    # Extract actual ratings from the testset
    actual_ratings.extend([rating for (_, _, rating) in testset])

# Combine predictions from SVD and KNN
combined_predictions = []
for svd_pred, knn_pred in zip(svd_predictions, knn_predictions):
    combined_pred = (svd_pred.est + knn_pred.est) / 2
    combined_predictions.append(combined_pred)

# Convert the predictions and actual ratings to numpy arrays for calculation
combined_predictions_array = np.array(combined_predictions)
actual_ratings_array = np.array(actual_ratings)

# Manually calculate RMSE
mse = np.mean((actual_ratings_array - combined_predictions_array) ** 2)
rmse = np.sqrt(mse)
print(f'Combined RMSE: {rmse}')


Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Combined RMSE: 0.9350701259965264


In [43]:
from collections import defaultdict

def calculate_ndcg(predictions, k=10):
    # Create a dictionary to store predicted ratings and true ratings grouped by user ID
    user_est_true = defaultdict(list)

    # Iterate through all prediction results and group them by user ID
    for uid, _, true_r, est in predictions:
        user_est_true[uid].append((est, true_r))

    # Initialize the sum of NDCG
    ndcg = 0
    # Iterate through predicted ratings and true ratings for each user
    for uid, user_ratings in user_est_true.items():
        # Sort by predicted ratings in descending order
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        user_ratings_k = user_ratings[:k]

        # Calculate DCG (Discounted Cumulative Gain)
        dcg = sum([true_r / np.log2(i + 2) for i, (_, true_r) in enumerate(user_ratings_k)])

        # Sort by true ratings in descending order to calculate IDCG (Ideal Discounted Cumulative Gain)
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        ideal_ratings_k = user_ratings[:k]
        idcg = sum([true_r / np.log2(i + 2) for i, (_, true_r) in enumerate(ideal_ratings_k)])

        # Calculate NDCG (Normalized Discounted Cumulative Gain) and add it to the sum
        ndcg += dcg / idcg if idcg > 0 else 0

    # Calculate the average NDCG for all users, return 0 if there are no users
    return ndcg / len(user_est_true) if user_est_true else 0

# Calculate NDCG for the combined predictions
ndcg_value = calculate_ndcg(combined_predictions_with_details, k=10)
print(f'NDCG: {ndcg_value}')


NDCG: 0.8638025105186771
