In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

movie_genres = pd.read_csv('movie_reviews/movie_genres.csv')
user_reviews = pd.read_csv('movie_reviews/user_reviews.csv')

In [38]:
R = user_reviews.iloc[:, 2:].copy().to_numpy()
G = movie_genres.iloc[:, 2:].copy()
n_users, n_movies = R.shape

In [10]:
def create_train_test_masks(ratings, split_ratio=0.8, seed=42):
    """
    Creates training and test masks by splitting nonzero ratings in the matrix.

    Parameters:
    - ratings: np.array (User-Movie Rating Matrix)
    - split_ratio: float (default 0.8) - fraction of nonzero entries to retain in train mask
    - seed: int (random seed for reproducibility)

    Returns:
    - train_matrix: np.array with 80% of ratings, 0 elsewhere
    - test_matrix: np.array with 20% of ratings, 0 elsewhere
    """
    np.random.seed(seed)

    # Find indices of all nonzero ratings
    nonzero_indices = np.argwhere(ratings > 0)  # Get (row, col) pairs

    # Shuffle indices
    np.random.shuffle(nonzero_indices)

    # Split indices into training (80%) and test (20%)
    split_idx = int(len(nonzero_indices) * split_ratio)
    train_indices = nonzero_indices[:split_idx]
    test_indices = nonzero_indices[split_idx:]

    # Create copies of the original matrix
    train_matrix = np.zeros_like(ratings)  # Empty training matrix
    test_matrix = np.zeros_like(ratings)   # Empty test matrix

    # Fill training matrix
    for row, col in train_indices:
        train_matrix[row, col] = ratings[row, col]

    # Fill test matrix
    for row, col in test_indices:
        test_matrix[row, col] = ratings[row, col]

    return train_matrix, test_matrix

In [None]:
R_rest, R_test = create_train_test_masks(R, 0.9)

assert R_rest[R_rest != 0].shape[0] + R_test[R_test != 0].shape[0] == R[R != 0].shape[0]

R_train, R_val = create_train_test_masks(R_rest, 0.8)

assert R_train[R_train != 0].shape[0] + R_val[R_val != 0].shape[0] == R_rest[R_rest != 0].shape[0]
assert R_train[R_train != 0].shape[0] + R_val[R_val != 0].shape[0] + R_test[R_test != 0].shape[0] == R[R != 0].shape[0]

# Method 1 : knn approach

In [67]:
def Jaccard_matrix(movies):
    """
    Compute the Jaccard distance matrix between movies based on their genres.

    Parameters:
    - movie_genres: pd.DataFrame (movie_id, genre1, genre2, ...)

    Returns:
    - similarity_matrix: np.array (n_movies, n_movies)
    """

    A = movies.values[:, np.newaxis]  
    B = movies.values

    intersection = np.logical_and(A, B).sum(axis=2)
    union = np.logical_or(A, B).sum(axis=2)
    similarity_matrix = (intersection / np.maximum(union, 1))

    mean_distance = np.mean(similarity_matrix)
    max_distance = np.max(similarity_matrix)
    min_distance = np.min(similarity_matrix)
    #print(mean_distance, max_distance, min_distance)

    return similarity_matrix


def get_nearest_movies(movie_id, similarity_matrix, neigh_distance):

    mask = similarity_matrix[movie_id] >= neigh_distance
    output = np.where(np.array(mask) == 1)[0].tolist()
    return output


def get_movies_recommendations(user, user_reviews, similarity_matrix, neigh_distance):

    candidate_movies = user_reviews.columns[user_reviews.iloc[user] == 0].to_numpy()
    
    b_u = np.nanmean(user_reviews.replace(0, np.nan), axis=0)
    estimated_ratings = {}
    user_ratings = user_reviews.to_numpy()

    for movie_id in candidate_movies:
        output = get_nearest_movies(movie_id, similarity_matrix, neigh_distance)
        
        N_u = np.array([movie for movie in output if user_ratings[user, movie] > 0])

        if N_u.size > 0:
            numerator = np.dot(similarity_matrix[movie_id, N_u], (user_ratings[user, N_u] - b_u[N_u]))
            denominator = np.sum(similarity_matrix[movie_id, N_u])

            estimated_ratings[movie_id] = b_u[movie_id] + (numerator / denominator if denominator != 0 else 0)
        else:
            estimated_ratings[movie_id] = b_u[movie_id]  # Default to baseline if no neighbors exist


    print(f'user {user} done')
    return estimated_ratings


# def fill_missing_ratings___(user_reviews, distance_matrix, neigh_distance):
#     """
#     Returns a new user_ratings matrix where missing ratings (0s) are replaced
#     with estimated ratings computed using get_movies_recommendations.
#     """
    
#     filled_ratings = user_reviews.copy()  # Copy as NumPy array
    
#     def fill_user_ratings(user_id):
#         estimated_ratings = get_movies_recommendations(user_id, user_reviews, distance_matrix, neigh_distance)
#         for movie_id, rating in estimated_ratings.items():
#             filled_ratings[user_id, movie_id] = rating  # Replace 0s with estimated ratings

#     # Apply function to all users
#     np.apply_along_axis(fill_user_ratings, axis=1, arr=np.arange(user_reviews.shape[0]))
    
#     return pd.DataFrame(filled_ratings, columns=user_reviews.columns, index=user_reviews.index)


def fill_missing_ratings(user_reviews, distance_matrix, neigh_distance):
    """
    Returns a new user_ratings matrix where missing ratings (0s) are replaced
    with estimated ratings computed using get_movies_recommendations.
    """
    filled_ratings = user_reviews.copy()  # Copy as NumPy array

    for user_id in range(user_reviews.shape[0]):  # Iterate over users
        estimated_ratings = get_movies_recommendations(user_id, pd.DataFrame(user_reviews), distance_matrix, neigh_distance)
        for movie_id, rating in estimated_ratings.items():
            filled_ratings[user_id, movie_id] = rating  # Replace 0s with estimated ratings

    return pd.DataFrame(filled_ratings, columns=pd.DataFrame(user_reviews).columns, index=pd.DataFrame(user_reviews).index)



In [36]:
neigh_distance = 0.2


In [66]:
distance_matrix = Jaccard_matrix(G)
filled_ratings = fill_missing_ratings(R_train, distance_matrix, neigh_distance)
filled_ratings

  b_u = np.nanmean(user_reviews.replace(0, np.nan), axis=0)


user 0 done
user 1 done
user 2 done
user 3 done
user 4 done
user 5 done
user 6 done
user 7 done
user 8 done
user 9 done
user 10 done
user 11 done
user 12 done
user 13 done
user 14 done
user 15 done
user 16 done
user 17 done
user 18 done
user 19 done
user 20 done
user 21 done
user 22 done
user 23 done
user 24 done
user 25 done
user 26 done
user 27 done
user 28 done
user 29 done
user 30 done
user 31 done
user 32 done
user 33 done
user 34 done
user 35 done
user 36 done
user 37 done
user 38 done
user 39 done
user 40 done
user 41 done
user 42 done
user 43 done
user 44 done
user 45 done
user 46 done
user 47 done
user 48 done
user 49 done
user 50 done
user 51 done
user 52 done
user 53 done
user 54 done
user 55 done
user 56 done
user 57 done
user 58 done
user 59 done
user 60 done
user 61 done
user 62 done
user 63 done
user 64 done
user 65 done
user 66 done
user 67 done
user 68 done
user 69 done
user 70 done
user 71 done
user 72 done
user 73 done
user 74 done
user 75 done
user 76 done
user 77 d

AttributeError: 'numpy.ndarray' object has no attribute 'index'

In [None]:
filled_ratings

NameError: name 'filled_ratings' is not defined

In [None]:
# Compute RMSE only on known ratings
best_val_score = 100

predicted_ratings = U @ M.T
train_mask = R_train > 0  # Only compare known ratings
val_mask = R_val > 0
train_error = np.sqrt(np.sum((R - predicted_ratings) ** 2 * train_mask) / np.sum(train_mask))
val_error = np.sqrt(np.sum((R - predicted_ratings) ** 2 * val_mask) / np.sum(val_mask))
print(f"Iteration {iteration}, Train RMSE: {train_error:.4f}, Val RMSE: {val_error:.4f}")
if val_error < best_val_score:
    best_val_score = val_error