# In [4]:
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# Load datasets
# All paths are relative to the current working directory.
# Movie table read from an Excel workbook; not referenced by the cells shown here.
movies_df = pd.read_excel('data/movies_df.xlsx')
# Movie table read from CSV; not referenced by the cells shown here.
movies_csv = pd.read_csv('data/movies.csv')
# Training ratings; used below via the 'userId', 'movieId' and 'rating' columns.
train_ratings = pd.read_csv('data/train_ratings.csv')
# Held-out ratings used to score predictions; same three columns are read below.
valid_data = pd.read_csv('data/valid_data.csv')

def create_user_item_matrix(df):
    """Build a sparse user-by-item rating matrix from a ratings table.

    Args:
        df: DataFrame with 'userId', 'movieId' and 'rating' columns.

    Returns:
        A tuple ``(matrix, user_id_to_idx, movie_id_to_idx)``. ``matrix`` is a
        CSR matrix with one row per distinct user and one column per distinct
        movie. The two dicts map raw ids to row / column positions, numbered
        in order of first appearance in ``df``.
    """
    def _index_by_first_appearance(column):
        # Number the distinct ids 0..n-1 in the order pandas reports them.
        distinct = df[column].unique()
        return dict(zip(distinct, range(len(distinct))))

    user_id_to_idx = _index_by_first_appearance('userId')
    movie_id_to_idx = _index_by_first_appearance('movieId')

    # Translate every rating's raw ids into matrix coordinates.
    row_positions = df['userId'].map(user_id_to_idx)
    col_positions = df['movieId'].map(movie_id_to_idx)
    coordinates = (row_positions, col_positions)
    dimensions = (len(user_id_to_idx), len(movie_id_to_idx))

    ratings_matrix = csr_matrix((df['rating'], coordinates), shape=dimensions)
    return ratings_matrix, user_id_to_idx, movie_id_to_idx

# Create user-item matrices for train and validation data
# Each call numbers users and movies from its own DataFrame, so row/column
# positions in train_matrix and valid_matrix are independent of each other;
# always go through the matching *_to_idx dict for the matrix being indexed.
train_matrix, train_user_id_to_idx, train_movie_id_to_idx = create_user_item_matrix(train_ratings)
# The validation matrix and its index maps are not referenced by the cells shown here.
valid_matrix, valid_user_id_to_idx, valid_movie_id_to_idx = create_user_item_matrix(valid_data)

# In [5]:
from scipy.sparse import coo_matrix
import numpy as np
from sklearn.metrics import mean_squared_error

def calculate_biases(ratings_df):
    """Compute the global mean rating and per-user / per-movie offsets from it.

    Args:
        ratings_df: DataFrame with 'userId', 'movieId' and 'rating' columns.

    Returns:
        A tuple ``(global_mean, user_bias, item_bias)``. ``global_mean`` is the
        mean of the 'rating' column. ``user_bias`` and ``item_bias`` are Series
        indexed by userId / movieId holding that group's mean rating minus
        ``global_mean``.
    """
    global_mean = ratings_df['rating'].mean()

    def _mean_offset(key):
        # Average rating within each group, expressed relative to the overall mean.
        return ratings_df.groupby(key)['rating'].mean() - global_mean

    return global_mean, _mean_offset('userId'), _mean_offset('movieId')

def predict_with_biases(U, sigma, Vt, user_id_to_idx, movie_id_to_idx, global_mean, user_bias, item_bias, userId, movieId):
    """Predict one rating from the SVD factors plus the bias baseline.

    Args:
        U, sigma, Vt: Factor matrices; the latent score is
            ``U[user_row] . sigma . Vt[:, movie_col]``.
        user_id_to_idx: Mapping from raw user id to row of ``U``.
        movie_id_to_idx: Mapping from raw movie id to column of ``Vt``.
        global_mean: Baseline added to every prediction.
        user_bias: Mapping (anything with ``.get``) from user id to offset.
        item_bias: Mapping (anything with ``.get``) from movie id to offset.
        userId: Raw id of the user to predict for.
        movieId: Raw id of the movie to predict for.

    Returns:
        The latent score plus ``global_mean`` and whichever biases are known
        (a missing bias counts as 0). If either the user or the movie has no
        entry in its index mapping, ``global_mean`` alone is returned.
    """
    user_row = user_id_to_idx.get(userId)
    movie_col = movie_id_to_idx.get(movieId)

    # Without factors for both sides there is no latent score to compute.
    if user_row is None or movie_col is None:
        return global_mean

    weighted_user_factors = np.dot(U[user_row, :], sigma)
    latent_score = np.dot(weighted_user_factors, Vt[:, movie_col])

    return (
        latent_score
        + global_mean
        + user_bias.get(userId, 0)
        + item_bias.get(movieId, 0)
    )

# Calculate biases (from the training ratings only)
global_mean, user_bias, item_bias = calculate_biases(train_ratings)

# Perform SVD
# svds needs a floating-point matrix and a rank strictly below the smaller
# matrix dimension, so cast explicitly and cap the requested 500 factors
# instead of failing on integer ratings or on a small training set.
n_factors = min(500, min(train_matrix.shape) - 1)
U, sigma, Vt = svds(train_matrix.astype(np.float64), k=n_factors)
# predict_with_biases multiplies by sigma as a matrix, so expand the 1-D
# singular values into a diagonal matrix.
sigma = np.diag(sigma)

# NOTE(review): train_matrix is built from the raw 'rating' column, yet
# predict_with_biases adds global_mean and both biases on top of the SVD
# reconstruction. That looks like it double-counts the baseline; consider
# factorising the bias-removed residuals instead - TODO confirm intent.

# Make predictions and calculate RMSE
# Zip the two id columns directly: iterrows() builds a Series per row and
# upcasts the integer ids to float alongside the float rating.
predicted_ratings = [
    predict_with_biases(U, sigma, Vt, train_user_id_to_idx, train_movie_id_to_idx,
                        global_mean, user_bias, item_bias, user_id, movie_id)
    for user_id, movie_id in zip(valid_data['userId'], valid_data['movieId'])
]

rmse = np.sqrt(mean_squared_error(valid_data['rating'], predicted_ratings))
rmse
