In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.metrics.pairwise import cosine_similarity

movie_genres = pd.read_csv('movie_reviews/movie_genres.csv')
user_reviews = pd.read_csv('movie_reviews/user_reviews.csv')

In [38]:
R = user_reviews.iloc[:, 2:].copy().to_numpy()
G = movie_genres.iloc[:, 2:].copy()
n_users, n_movies = R.shape

In [10]:
def create_train_test_masks(ratings, split_ratio=0.8, seed=42):
    """
    Creates training and test masks by splitting nonzero ratings in the matrix.

    Parameters:
    - ratings: np.array (User-Movie Rating Matrix)
    - split_ratio: float (default 0.8) - fraction of nonzero entries to retain in train mask
    - seed: int (random seed for reproducibility)

    Returns:
    - train_matrix: np.array with 80% of ratings, 0 elsewhere
    - test_matrix: np.array with 20% of ratings, 0 elsewhere
    """
    np.random.seed(seed)

    # Find indices of all nonzero ratings
    nonzero_indices = np.argwhere(ratings > 0)  # Get (row, col) pairs

    # Shuffle indices
    np.random.shuffle(nonzero_indices)

    # Split indices into training (80%) and test (20%)
    split_idx = int(len(nonzero_indices) * split_ratio)
    train_indices = nonzero_indices[:split_idx]
    test_indices = nonzero_indices[split_idx:]

    # Create copies of the original matrix
    train_matrix = np.zeros_like(ratings)  # Empty training matrix
    test_matrix = np.zeros_like(ratings)   # Empty test matrix

    # Fill training matrix
    for row, col in train_indices:
        train_matrix[row, col] = ratings[row, col]

    # Fill test matrix
    for row, col in test_indices:
        test_matrix[row, col] = ratings[row, col]

    return train_matrix, test_matrix

In [None]:
R_rest, R_test = create_train_test_masks(R, 0.9)

assert R_rest[R_rest != 0].shape[0] + R_test[R_test != 0].shape[0] == R[R != 0].shape[0]

R_train, R_val = create_train_test_masks(R_rest, 0.8)

assert R_train[R_train != 0].shape[0] + R_val[R_val != 0].shape[0] == R_rest[R_rest != 0].shape[0]
assert R_train[R_train != 0].shape[0] + R_val[R_val != 0].shape[0] + R_test[R_test != 0].shape[0] == R[R != 0].shape[0]

# Method 1 : knn approach

In [114]:
def Jaccard_matrix(movies):
    """
    Compute the Jaccard distance matrix between movies based on their genres.

    Parameters:
    - movie_genres: pd.DataFrame (movie_id, genre1, genre2, ...)

    Returns:
    - similarity_matrix: np.array (n_movies, n_movies)
    """

    A = movies.values[:, np.newaxis]  
    B = movies.values

    intersection = np.logical_and(A, B).sum(axis=2)
    union = np.logical_or(A, B).sum(axis=2)
    similarity_matrix = (intersection / np.maximum(union, 1))

    mean_distance = np.mean(similarity_matrix)
    max_distance = np.max(similarity_matrix)
    min_distance = np.min(similarity_matrix)
    #print(mean_distance, max_distance, min_distance)

    return similarity_matrix


def get_nearest_movies(movie_id, similarity_matrix, neigh_distance):

    mask = similarity_matrix[movie_id] >= neigh_distance
    output = np.where(np.array(mask) == 1)[0].tolist()
    return output


def get_movies_recommendations(user, user_reviews, similarity_matrix, neigh_distance):

    candidate_movies = user_reviews.columns[user_reviews.iloc[user] == 0].to_numpy()
    
    b_u = np.nanmean(user_reviews.replace(0, np.nan), axis=0)
    estimated_ratings = {}
    user_ratings = user_reviews.to_numpy()

    for movie_id in candidate_movies:
        output = get_nearest_movies(movie_id, similarity_matrix, neigh_distance)
        
        N_u = np.array([movie for movie in output if user_ratings[user, movie] > 0])

        if N_u.size > 0:
            numerator = np.dot(similarity_matrix[movie_id, N_u], (user_ratings[user, N_u] - b_u[N_u]))
            denominator = np.sum(similarity_matrix[movie_id, N_u])

            estimated_ratings[movie_id] = b_u[movie_id] + (numerator / denominator if denominator != 0 else 0)
        else:
            estimated_ratings[movie_id] = b_u[movie_id]  # Default to baseline if no neighbors exist


    #print(f'user {user} done')
    return estimated_ratings


# def fill_missing_ratings___(user_reviews, distance_matrix, neigh_distance):
#     """
#     Returns a new user_ratings matrix where missing ratings (0s) are replaced
#     with estimated ratings computed using get_movies_recommendations.
#     """
    
#     filled_ratings = user_reviews.copy()  # Copy as NumPy array
    
#     def fill_user_ratings(user_id):
#         estimated_ratings = get_movies_recommendations(user_id, user_reviews, distance_matrix, neigh_distance)
#         for movie_id, rating in estimated_ratings.items():
#             filled_ratings[user_id, movie_id] = rating  # Replace 0s with estimated ratings

#     # Apply function to all users
#     np.apply_along_axis(fill_user_ratings, axis=1, arr=np.arange(user_reviews.shape[0]))
    
#     return pd.DataFrame(filled_ratings, columns=user_reviews.columns, index=user_reviews.index)


def fill_missing_ratings(user_reviews, distance_matrix, neigh_distance):
    """
    Returns a new user_ratings matrix where missing ratings (0s) are replaced
    with estimated ratings computed using get_movies_recommendations.
    """
    filled_ratings = user_reviews.copy()  # Copy as NumPy array
    pbar = tqdm(total=user_reviews.shape[0], desc="User Processed")
    for user_id in range(user_reviews.shape[0]):  # Iterate over users
        estimated_ratings = get_movies_recommendations(user_id, pd.DataFrame(user_reviews), distance_matrix, neigh_distance)
        for movie_id, rating in estimated_ratings.items():
            filled_ratings[user_id, movie_id] = rating  # Replace 0s with estimated ratings
        pbar.update(1)
    pbar.close()
    return pd.DataFrame(filled_ratings, columns=pd.DataFrame(user_reviews).columns, index=pd.DataFrame(user_reviews).index)



In [36]:
neigh_distance = 0.2


In [68]:
distance_matrix = Jaccard_matrix(G)
filled_ratings = fill_missing_ratings(R_train, distance_matrix, neigh_distance)
filled_ratings

  b_u = np.nanmean(user_reviews.replace(0, np.nan), axis=0)


user 0 done
user 1 done
user 2 done
user 3 done
user 4 done
user 5 done
user 6 done
user 7 done
user 8 done
user 9 done
user 10 done
user 11 done
user 12 done
user 13 done
user 14 done
user 15 done
user 16 done
user 17 done
user 18 done
user 19 done
user 20 done
user 21 done
user 22 done
user 23 done
user 24 done
user 25 done
user 26 done
user 27 done
user 28 done
user 29 done
user 30 done
user 31 done
user 32 done
user 33 done
user 34 done
user 35 done
user 36 done
user 37 done
user 38 done
user 39 done
user 40 done
user 41 done
user 42 done
user 43 done
user 44 done
user 45 done
user 46 done
user 47 done
user 48 done
user 49 done
user 50 done
user 51 done
user 52 done
user 53 done
user 54 done
user 55 done
user 56 done
user 57 done
user 58 done
user 59 done
user 60 done
user 61 done
user 62 done
user 63 done
user 64 done
user 65 done
user 66 done
user 67 done
user 68 done
user 69 done
user 70 done
user 71 done
user 72 done
user 73 done
user 74 done
user 75 done
user 76 done
user 77 d

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,4.588884,3.012325,3.914141,4.057475,4.936591,4.738095,3.568338,4.007378,3.156014,4.332298,...,4.157913,3.669157,3.814479,3.793093,4.963191,3.766305,4.340441,2.970434,2.826681,3.874092
1,4.198374,4.800000,4.074849,4.008646,5.171600,2.433333,3.492457,4.049064,2.999289,3.096997,...,4.158986,3.262096,4.872572,3.834778,4.408088,4.060333,4.846321,2.465816,3.017146,4.285714
2,3.447722,4.386459,3.732973,3.811267,4.147644,3.334145,3.160659,3.996604,2.442644,2.915192,...,3.535181,4.066504,3.685446,3.782318,3.777079,3.503739,4.552764,3.836979,2.614969,4.795469
3,4.983321,3.663954,4.595273,3.478105,5.208181,2.927536,3.837135,3.395926,3.032143,3.596820,...,3.760215,3.606819,3.698280,3.181640,4.593239,3.475926,3.875444,2.527944,2.772565,3.834893
4,2.861898,4.327844,4.472222,3.296695,3.737256,2.619048,3.386905,3.391801,3.169643,2.442857,...,3.090183,3.358899,3.386805,3.177516,3.256136,2.812300,4.053542,3.041209,2.929487,5.006389
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,5.000000,3.174118,4.276068,3.420193,4.278952,3.008225,4.389820,2.351759,3.813548,3.832775,...,3.025875,4.158172,2.858693,2.137473,3.903009,2.241060,2.939787,2.059094,3.690735,3.429744
596,3.896203,4.811500,3.934143,4.026946,4.560813,3.000000,3.675822,4.028684,3.016088,3.093506,...,3.687948,4.224558,3.601307,3.814399,3.424033,3.599621,4.516349,3.849580,3.222672,5.196359
597,4.012380,4.105303,4.020202,4.181118,4.685368,2.282051,3.598846,3.851682,3.697917,3.358207,...,4.267874,3.695074,4.732962,3.637396,3.000000,3.696487,4.849178,2.894048,2.997750,4.085227
598,3.477931,4.803743,3.851616,3.978307,3.981283,2.630952,3.229429,3.974747,3.002544,2.811404,...,3.670455,4.242451,3.498291,3.760462,3.373782,3.765988,4.531313,4.174333,2.906056,4.832487


In [69]:
filled_ratings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,4.588884,3.012325,3.914141,4.057475,4.936591,4.738095,3.568338,4.007378,3.156014,4.332298,...,4.157913,3.669157,3.814479,3.793093,4.963191,3.766305,4.340441,2.970434,2.826681,3.874092
1,4.198374,4.800000,4.074849,4.008646,5.171600,2.433333,3.492457,4.049064,2.999289,3.096997,...,4.158986,3.262096,4.872572,3.834778,4.408088,4.060333,4.846321,2.465816,3.017146,4.285714
2,3.447722,4.386459,3.732973,3.811267,4.147644,3.334145,3.160659,3.996604,2.442644,2.915192,...,3.535181,4.066504,3.685446,3.782318,3.777079,3.503739,4.552764,3.836979,2.614969,4.795469
3,4.983321,3.663954,4.595273,3.478105,5.208181,2.927536,3.837135,3.395926,3.032143,3.596820,...,3.760215,3.606819,3.698280,3.181640,4.593239,3.475926,3.875444,2.527944,2.772565,3.834893
4,2.861898,4.327844,4.472222,3.296695,3.737256,2.619048,3.386905,3.391801,3.169643,2.442857,...,3.090183,3.358899,3.386805,3.177516,3.256136,2.812300,4.053542,3.041209,2.929487,5.006389
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,5.000000,3.174118,4.276068,3.420193,4.278952,3.008225,4.389820,2.351759,3.813548,3.832775,...,3.025875,4.158172,2.858693,2.137473,3.903009,2.241060,2.939787,2.059094,3.690735,3.429744
596,3.896203,4.811500,3.934143,4.026946,4.560813,3.000000,3.675822,4.028684,3.016088,3.093506,...,3.687948,4.224558,3.601307,3.814399,3.424033,3.599621,4.516349,3.849580,3.222672,5.196359
597,4.012380,4.105303,4.020202,4.181118,4.685368,2.282051,3.598846,3.851682,3.697917,3.358207,...,4.267874,3.695074,4.732962,3.637396,3.000000,3.696487,4.849178,2.894048,2.997750,4.085227
598,3.477931,4.803743,3.851616,3.978307,3.981283,2.630952,3.229429,3.974747,3.002544,2.811404,...,3.670455,4.242451,3.498291,3.760462,3.373782,3.765988,4.531313,4.174333,2.906056,4.832487


In [112]:
def test_model(predicted_ratings, R, R_test):

    test_mask = R_test > 0
    test_error = np.sqrt(((R - predicted_ratings) ** 2 * test_mask).sum().sum() / np.sum(test_mask))
    print(f"RMSE: {test_error}")

    return test_error


test_model(filled_ratings, R, R_val)

RMSE: 1.149349441689475


np.float64(1.149349441689475)

In [116]:
# Define the range of threshold values to test
def optimal_knn_model(threshold_values, R, R_train, R_val, G):

    #threshold_values = np.linspace(0.1, 0.6, 5)  # Adjust range as needed
    best_threshold = None
    best_error = 1000  # Initialize with a large value
    best_predictions = None

    for threshold in threshold_values:
        print(f"Testing threshold: {threshold}")

        # Compute the distance matrix (assuming threshold affects it)
        distance_matrix = Jaccard_matrix(G)  
        
        # Compute the filled ratings matrix with the current threshold
        predicted_ratings = fill_missing_ratings(R_train, distance_matrix, threshold)
        
        # Compute RMSE
        error = test_model(predicted_ratings, R, R_val)
        print(f"RMSE: {error}")

        # Update the best model if this threshold gives a lower error
        if error < best_error:
            best_error = error
            best_threshold = threshold
            best_predictions = predicted_ratings.copy()

    print(f"\nBest threshold: {best_threshold} with RMSE: {best_error}")

    return best_threshold, best_error, best_predictions


best_threshold, best_error, best_predictions = optimal_knn_model(np.linspace(0.1, 0.6, 5), R_rest, R_train, R_val, G)

Testing threshold: 0.1


  b_u = np.nanmean(user_reviews.replace(0, np.nan), axis=0)
User Processed: 100%|██████████| 600/600 [04:22<00:00,  2.29it/s]


RMSE: 1.1509816259787964
Testing threshold: 0.225


User Processed: 100%|██████████| 600/600 [03:00<00:00,  3.33it/s]


RMSE: 1.1512196055854746
Testing threshold: 0.35


User Processed: 100%|██████████| 600/600 [01:58<00:00,  5.07it/s]


RMSE: 1.203819476448113
Testing threshold: 0.475


User Processed: 100%|██████████| 600/600 [01:17<00:00,  7.71it/s]


RMSE: 1.2166407425260017
Testing threshold: 0.6


User Processed: 100%|██████████| 600/600 [00:45<00:00, 13.16it/s]


RMSE: 1.250750180558447

Best threshold: 0.1 with RMSE: 1.1509816259787964
