In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
def load_ratings(file_path):
    """Read the ratings CSV and hand it back as a DataFrame.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Anything ``pandas.read_csv`` accepts.

    Returns
    -------
    pandas.DataFrame
        The parsed file, with pandas' default parsing options.
    """
    return pd.read_csv(file_path)
    
def calculate_user_similarity(ratings_matrix):
    """Pairwise Pearson similarity between the rows (users) of a ratings frame.

    Parameters
    ----------
    ratings_matrix : pandas.DataFrame
        One row per user, one column per movie; a value of 0 marks "not rated".

    Returns
    -------
    pandas.DataFrame
        Square frame indexed by the user labels on both axes. Pairs whose
        correlation is undefined are reported as 0.
    """
    # A 0 is a placeholder, not a score: as NaN it is skipped pairwise by corr().
    observed = ratings_matrix.replace(0, np.nan)
    # corr() correlates columns, so put the users on the column axis first.
    users_as_columns = observed.transpose()
    pairwise = users_as_columns.corr(method='pearson')
    # Undefined correlations come back as NaN; treat them as "no similarity".
    return pairwise.fillna(0)

def predict_ratings(ratings_matrix, user_similarity):
    """
    Fill in the unrated cells of a ratings matrix with user-based
    collaborative filtering.

    For an unrated (user u, movie m) cell the estimate is

        mean(u) + sum_v sim(u, v) * (r[v, m] - mean(v)) / sum_v |sim(u, v)|

    where v ranges ONLY over the users who actually rated movie m.

    - Ratings are mean-centred per user so that generous and harsh raters are
      comparable; means are taken over rated movies only.
    - Users who have not rated the movie contribute neither to the weighted
      sum nor to the normalising weight. In particular the target user, whose
      self-similarity is 1, never pulls their own prediction towards 0.
    - If no rater of the movie has a non-zero similarity to the user, the
      prediction falls back to the user's mean rating.
    - A user with no ratings at all has no mean; the mean of all observed
      ratings is used for them instead.

    Parameters
    ----------
    ratings_matrix : pandas.DataFrame
        Users in rows, movies in columns; 0 marks "not rated". NaN cells are
        treated as missing: they are not predicted and not used as evidence.
    user_similarity : pandas.DataFrame or array-like, shape (n_users, n_users)
        Similarity between users, in the same row order as ``ratings_matrix``.
        NaN similarities are treated as 0.

    Returns
    -------
    pandas.DataFrame
        Float copy of ``ratings_matrix`` (same index and columns) in which
        every 0 has been replaced by its predicted rating. Existing ratings
        are returned unchanged; the input frame is not modified.
    """
    as_float = ratings_matrix.astype(float)
    to_predict = (ratings_matrix == 0).to_numpy()
    if not to_predict.any():
        # Nothing to estimate: the float copy is already the answer.
        return as_float

    values = as_float.to_numpy()
    # Only genuine ratings may act as evidence (neither the 0 placeholder nor NaN).
    rated = ~to_predict & ~np.isnan(values)

    # Per-user mean over rated movies, computed positionally so that any row
    # index (user ids, strings, ...) works.
    rating_counts = rated.sum(axis=1)
    rating_sums = np.where(rated, values, 0.0).sum(axis=1)
    total_count = rating_counts.sum()
    global_mean = rating_sums.sum() / total_count if total_count else np.nan
    user_means = np.full(values.shape[0], global_mean, dtype=float)
    has_ratings = rating_counts > 0
    user_means[has_ratings] = rating_sums[has_ratings] / rating_counts[has_ratings]

    # Centred ratings; unrated cells contribute exactly 0 to the weighted sum.
    centered = np.where(rated, values - user_means[:, None], 0.0)

    similarity = np.asarray(user_similarity, dtype=float)
    similarity = np.where(np.isnan(similarity), 0.0, similarity)

    # Cell [u, m] of each product aggregates over all neighbours v at once.
    weighted_sum = similarity @ centered
    sum_of_weights = np.abs(similarity) @ rated.astype(float)

    # Where no rater has any similarity the adjustment stays 0, i.e. the
    # prediction is just the user's mean.
    adjustment = np.zeros_like(weighted_sum)
    np.divide(weighted_sum, sum_of_weights, out=adjustment, where=sum_of_weights > 0)
    estimates = user_means[:, None] + adjustment

    completed = np.where(to_predict, estimates, values)
    return pd.DataFrame(
        completed, index=ratings_matrix.index, columns=ratings_matrix.columns
    )

def calculate_rmse(original_ratings, predicted_ratings):
    """Root-mean-square error over the entries that carry a real rating.

    Only cells where ``original_ratings > 0`` are scored, and the squared
    error is averaged over exactly those cells. Unrated cells are left out of
    the mean entirely, so they do not inflate the denominator.

    Parameters
    ----------
    original_ratings : pandas.DataFrame or array-like
        Ground-truth ratings; values <= 0 (or NaN) mean "not rated".
    predicted_ratings : pandas.DataFrame or array-like
        Predictions with the same shape and the same row/column order.

    Returns
    -------
    float
        The RMSE over rated cells. ``nan`` if there are no rated cells, since
        the error is undefined in that case. A NaN prediction at a rated cell
        propagates to the result.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    actual = np.asarray(original_ratings, dtype=float)
    predicted = np.asarray(predicted_ratings, dtype=float)
    if actual.shape != predicted.shape:
        raise ValueError(
            f"shape mismatch: original {actual.shape} vs predicted {predicted.shape}"
        )

    # Only consider non-zero ratings in the original matrix.
    rated = actual > 0
    if not rated.any():
        return float("nan")

    errors = actual[rated] - predicted[rated]
    return sqrt(float(np.mean(errors ** 2)))

def recommend_movies(original_ratings, predicted_ratings, user_index, n=3):
    """Print the highest-predicted movies the given user has not rated yet.

    Parameters
    ----------
    original_ratings : pandas.DataFrame
        Ratings as loaded; a 0 in the user's row marks an unrated movie.
    predicted_ratings : pandas.DataFrame
        Frame with the same columns holding the predicted scores.
    user_index : int
        Positional row of the user (the heading shows it 1-based).
    n : int, default 3
        Maximum number of movies to list.

    Returns
    -------
    None
        The ranking is written to stdout only.
    """
    seen = original_ratings.iloc[user_index]
    scores = predicted_ratings.iloc[user_index]

    # Candidates are the columns that were 0 in the input data for this user.
    candidates = seen.index[(seen == 0).to_numpy()]
    ranked = scores[candidates].sort_values(ascending=False)
    print(f"len(recommendations): {len(ranked)}")

    print(f"Top {n} recommended movies for User {user_index+1}:")
    # The Series index holds the movie names; a non-positive n lists nothing.
    for title, score in ranked.iloc[:max(n, 0)].items():
        print(f"{title}: {score:.2f}")


In [3]:
# Driver cell: load the ratings, build the user-user similarity matrix, fill in
# the unrated cells, report an RMSE and print recommendations for one user.
if __name__ == "__main__":
    # Load ratings from the CSV file (relative path, resolved against the
    # current working directory).
    # The frame is used as-is as the user x movie matrix, so every column is
    # treated as a movie -- assumes the CSV has no user-id column; TODO confirm.
    ratings_file = "ratingsTest.csv"
    orig_ratings = load_ratings(ratings_file)
    
    print("Original Ratings Matrix(6 entries):")
    print(orig_ratings.head(6))

    # Calculate user similarity (square frame, one row/column per user)
    user_similarity = calculate_user_similarity(orig_ratings)
    print("\nUser Similarity Matrix:")
    print(user_similarity)

    # Predict ratings: only the cells that are 0 in orig_ratings get replaced
    predicted_ratings = predict_ratings(orig_ratings, user_similarity)
    print("\nPredicted Ratings Matrix(6 entries):")
    print(predicted_ratings.head(6))

    # Evaluate the model performance
    # NOTE(review): predict_ratings copies the known ratings and only overwrites
    # cells equal to 0, while calculate_rmse only scores cells > 0. The scored
    # cells are therefore the original ratings compared with themselves, so this
    # value does not measure prediction quality. A held-out split (hide some
    # known ratings, predict them, score those) would be needed for that.
    rmse = calculate_rmse(orig_ratings, predicted_ratings)
    print(f"\nRMSE: {rmse:.4f}")

    # Recommend top 3 movies for User 1 (index 0)
    recommend_movies(orig_ratings, predicted_ratings, user_index=0, n=3)
    

Original Ratings Matrix(6 entries):
   Movie1  Movie2  Movie3  Movie4  Movie5  Movie6  Movie7
0       0       5       3       0       0       2       0
1       0       0       0       5       3       4       1
2       3       4       5       4       3       3       4
3       3       1       5       4       4       0       5
4       2       4       3       1       3       0       4
5       4       0       4       3       0       0       1

User Similarity Matrix:
          0         1             2         3         4         5  \
0  1.000000  0.000000  3.273268e-01 -1.000000  1.000000  0.000000   
1  0.000000  1.000000 -1.690309e-01 -0.866025 -0.981981  1.000000   
2  0.327327 -0.169031  1.000000e+00  0.294118  0.189389  0.000000   
3 -1.000000 -0.866025  2.941176e-01  1.000000 -0.151511 -0.492366   
4  1.000000 -0.981981  1.893885e-01 -0.151511  1.000000 -0.547723   
5  0.000000  1.000000  0.000000e+00 -0.492366 -0.547723  1.000000   
6  0.944911 -1.000000 -9.468014e-17 -0.866025  0.8