In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
DATA_DIR = 'data/'

In [3]:
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# Load datasets
movies_df = pd.read_excel(f'{DATA_DIR}movies_df.xlsx')
movies_csv = pd.read_csv(f'{DATA_DIR}movies.csv')
train_ratings = pd.read_csv(f'{DATA_DIR}train_ratings.csv')
# split train data into train and validation data
valid_data = train_ratings.iloc[int(len(train_ratings)*0.8):]
train_ratings = train_ratings.iloc[:int(len(train_ratings)*0.8)]

def create_user_item_matrix(df):
    """ Create a user-item matrix for collaborative filtering """
    user_ids = df['userId'].unique()
    movie_ids = df['movieId'].unique()
    
    user_id_to_idx = {user_id: idx for idx, user_id in enumerate(user_ids)}
    movie_id_to_idx = {movie_id: idx for idx, movie_id in enumerate(movie_ids)}
    
    rows = df['userId'].map(user_id_to_idx)
    cols = df['movieId'].map(movie_id_to_idx)
    values = df['rating']
    
    return csr_matrix((values, (rows, cols)), shape=(len(user_ids), len(movie_ids))), user_id_to_idx, movie_id_to_idx

# Create user-item matrices for train and validation data
train_matrix, train_user_id_to_idx, train_movie_id_to_idx = create_user_item_matrix(train_ratings)
valid_matrix, valid_user_id_to_idx, valid_movie_id_to_idx = create_user_item_matrix(valid_data)

In [4]:
# Calculate item and user similarities
item_sim = train_matrix.T.dot(train_matrix).toarray()
user_sim = train_matrix.dot(train_matrix.T).toarray()

In [5]:
def calculate_biases(ratings_df):
    global_mean = ratings_df['rating'].mean()

    user_bias = ratings_df.groupby('userId')['rating'].mean() - global_mean
    item_bias = ratings_df.groupby('movieId')['rating'].mean() - global_mean

    return global_mean, user_bias, item_bias

In [6]:
# Calculate biases
global_mean, user_bias, item_bias = calculate_biases(train_ratings)

In [8]:
# Predict using item similarities
def predict_item_based(user_id, movie_id, k=500, item_bias=item_bias, global_mean=global_mean, user_bias=user_bias):
    """ Predict rating for a given user and movie using item-based collaborative filtering """
    if user_id not in train_user_id_to_idx:
        return global_mean
    if movie_id not in train_movie_id_to_idx:
        return global_mean
    # Get index of user and movie
    user_idx = train_user_id_to_idx[user_id]
    movie_idx = train_movie_id_to_idx[movie_id]
    
    # Get k most similar items to movie
    similar_items = np.argsort(item_sim[movie_idx])[-k:]
    
    # Get ratings for similar items
    similar_ratings = train_matrix[user_idx, similar_items].toarray().ravel()
    
    # Get similarities for similar items
    similarities = item_sim[movie_idx, similar_items]
    
    # Predict rating as weighted average of ratings of similar items
    prediction = np.sum(similar_ratings * similarities) / np.sum(similarities)
    
    # Add item bias to prediction
    prediction += item_bias[movie_id]
    
    # Add global mean and user bias to prediction
    prediction += global_mean + user_bias[user_id]
    
    return prediction

predictions = []
actuals = []
for _, row in valid_data.iterrows():
    #print(row['userId'], row['movieId'])
    predictions.append(predict_item_based(row['userId'], row['movieId']))
    actuals.append(row['rating'])

print('Item-based CF MSE:', mean_squared_error(actuals, predictions))

Item-based CF MSE: 1.7926032366667275


In [9]:
# Predict using user similarities
def predict_user_based(user_id, movie_id, k=500, user_bias=user_bias, global_mean=global_mean, item_bias=item_bias):
    """ Predict rating for a given user and movie using user-based collaborative filtering """
    if user_id not in train_user_id_to_idx:
        return global_mean
    if movie_id not in train_movie_id_to_idx:
        return global_mean
    # Get index of user and movie
    user_idx = train_user_id_to_idx[user_id]
    movie_idx = train_movie_id_to_idx[movie_id]
    
    # Get k most similar users to user
    similar_users = np.argsort(user_sim[user_idx])[-k:]
    
    # Get ratings for similar users
    similar_ratings = train_matrix[similar_users, movie_idx].toarray().ravel()
    
    # Get similarities for similar users
    similarities = user_sim[user_idx, similar_users]
    
    # Predict rating as weighted average of ratings of similar users
    prediction = np.sum(similar_ratings * similarities) / np.sum(similarities)
    
    # Add user bias to prediction
    prediction += user_bias[user_id]
    
    # Add global mean and item bias to prediction
    prediction += global_mean + item_bias[movie_id]
    
    return prediction

predictions = []
actuals = []
for _, row in valid_data.iterrows():
    predictions.append(predict_user_based(row['userId'], row['movieId']))
    actuals.append(row['rating'])

print('User-based CF MSE:', mean_squared_error(actuals, predictions))

User-based CF MSE: 1.390212936000763
