In [12]:
import pandas as pd
import numpy as np

# Load the per-movie genre table and the per-user review table from the local
# movie_reviews/ directory (paths are relative to the notebook's working directory).
# NOTE(review): the displayed frames contain "Unnamed: 0.1" and "Unnamed: 0" columns,
# which looks like a row index that was written to CSV and read back (possibly twice)
# — confirm, and consider re-saving the files with index=False.
movie_genres = pd.read_csv('movie_reviews/movie_genres.csv')
user_reviews = pd.read_csv('movie_reviews/user_reviews.csv')

In [13]:
movie_genres

Unnamed: 0.1,Unnamed: 0,movie_title,genre_action,genre_adventure,genre_animation,genre_biography,genre_comedy,genre_crime,genre_documentary,genre_drama,...,genre_mystery,genre_news,genre_reality-tv,genre_romance,genre_sci-fi,genre_short,genre_sport,genre_thriller,genre_war,genre_western
0,0,The Net,1,0,0,0,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
1,1,Happily N'Ever After,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,Tomorrowland,1,1,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
3,3,American Hero,1,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,4,Das Boot,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1995,Big Fish,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1996,1996,Get Real,0,0,0,0,1,0,0,1,...,0,0,0,1,0,0,0,0,0,0
1997,1997,Trading Places,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1998,1998,DOA: Dead or Alive,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
user_reviews.head()

Unnamed: 0.1,Unnamed: 0,User,The Net,Happily N'Ever After,Tomorrowland,American Hero,Das Boot,Final Destination 3,Licence to Kill,The Hundred-Foot Journey,...,The Martian,Micmacs,Solomon and Sheba,In the Company of Men,Silent House,Big Fish,Get Real,Trading Places,DOA: Dead or Alive,Hey Arnold! The Movie
0,0,Vincent,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,Edgar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,Addilyn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Marlee,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,Javier,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# Build the (n_users, n_movies) rating matrix from the review table.
# The non-rating columns are removed by name instead of by position: the displayed
# frame starts with "Unnamed: 0.1", "Unnamed: 0" and "User", so a positional slice
# from column 2 would keep the string "User" column inside the matrix.
# errors="ignore" keeps this working if a leftover index column is absent, and
# dtype=float makes any remaining non-numeric column fail loudly here rather than
# later inside the solver. copy=True guarantees R does not alias the DataFrame.
R = user_reviews.drop(
    columns=["Unnamed: 0.1", "Unnamed: 0", "User"], errors="ignore"
).to_numpy(dtype=float, copy=True)
n_users, n_movies = R.shape

In [97]:
from numpy.linalg import solve
n_factors = 10  # latent dimensionality used for this initialisation

# Random starting points for the user and movie factor matrices.
U = np.random.rand(n_users, n_factors)
M = np.random.rand(n_movies, n_factors)

# Genre indicator matrix, one row per row of movie_genres.
# Non-feature columns are dropped by name: the displayed frame starts with
# "Unnamed: 0.1", "Unnamed: 0" and "movie_title", so slicing from column 2 by
# position would keep the string title column. dtype=float makes any remaining
# non-numeric column raise here instead of in downstream numeric code.
G = movie_genres.drop(
    columns=["Unnamed: 0.1", "Unnamed: 0", "movie_title"], errors="ignore"
).to_numpy(dtype=float, copy=True)


In [79]:
import numpy as np

def create_train_test_masks(ratings, split_ratio=0.8, seed=42):
    """
    Split the strictly positive entries of a rating matrix into two disjoint matrices.

    Parameters:
    - ratings: np.array (User-Movie Rating Matrix); entries <= 0 are treated as "no rating"
    - split_ratio: float in [0, 1] (default 0.8) - fraction of the positive entries that
      is kept in the training matrix; the remaining entries go to the test matrix
    - seed: int (random seed for reproducibility). It is passed to np.random.seed, so it
      reseeds NumPy's global generator and also affects later np.random calls.

    Returns:
    - train_matrix: np.array shaped like ratings, holding the training share, 0 elsewhere
    - test_matrix: np.array shaped like ratings, holding the remaining share, 0 elsewhere

    Raises:
    - ValueError: if split_ratio is not within [0, 1]
    """
    # Written as a negated chained comparison so that NaN is rejected as well.
    if not 0.0 <= split_ratio <= 1.0:
        raise ValueError(f"split_ratio must be within [0, 1], got {split_ratio!r}")

    ratings = np.asarray(ratings)
    np.random.seed(seed)

    # One row of coordinates per positive rating, shuffled along the first axis.
    nonzero_indices = np.argwhere(ratings > 0)
    np.random.shuffle(nonzero_indices)

    # The first `split_idx` shuffled coordinates go to train, the rest to test.
    split_idx = int(len(nonzero_indices) * split_ratio)
    train_indices = nonzero_indices[:split_idx]
    test_indices = nonzero_indices[split_idx:]

    train_matrix = np.zeros_like(ratings)
    test_matrix = np.zeros_like(ratings)

    # Transposing the coordinate list yields one index array per axis, which lets a
    # single fancy-indexing assignment copy all selected ratings at once.
    train_pos = tuple(train_indices.T)
    test_pos = tuple(test_indices.T)
    train_matrix[train_pos] = ratings[train_pos]
    test_matrix[test_pos] = ratings[test_pos]

    return train_matrix, test_matrix



In [93]:
import numpy as np
from numpy.linalg import solve
# Alternating least squares (ALS) factorisation R ≈ U @ M.T, fitted on the training
# split and monitored on the held-out split.
latent_dim = 50   # number of latent factors per user / per movie
reg_param = 0.1   # ridge penalty added to every least-squares solve
max_iters = 1000  # upper bound on the number of ALS sweeps
eval_every = 5    # compute RMSE every this many sweeps
patience = 5      # stop after this many consecutive evaluations without improvement

R_train, R_val = create_train_test_masks(R)
U = np.random.rand(n_users, latent_dim)
M = np.random.rand(n_movies, latent_dim)

# Loop invariants: the ridge term and the known-rating masks do not change between sweeps.
ridge = reg_param * np.eye(latent_dim)
train_mask = R_train > 0  # Only compare known ratings
val_mask = R_val > 0

# Early-stopping state is initialised once, before the loop. If it were reset at the
# top of every sweep the counter could never accumulate and the loop would always run
# for max_iters sweeps.
best_val_score = np.inf
counter = 0

for iteration in range(max_iters):
    # Solve for U while keeping M fixed
    for u in range(n_users):
        relevant_movies = R_train[u, :].nonzero()[0]  # Movies rated by user u
        if len(relevant_movies) > 0:
            M_subset = M[relevant_movies]  # Extract only relevant movie embeddings
            R_u = R_train[u, relevant_movies]  # Get known ratings

            # Solve least squares problem: (M.T @ M + λI)U = M.T @ R
            U[u] = solve(M_subset.T @ M_subset + ridge, M_subset.T @ R_u)

    # Solve for M while keeping U fixed
    for m in range(n_movies):
        relevant_users = R_train[:, m].nonzero()[0]  # Users who rated movie m
        if len(relevant_users) > 0:
            U_subset = U[relevant_users]  # Extract relevant user embeddings
            R_m = R_train[relevant_users, m]  # Get known ratings

            # Solve least squares problem: (U.T @ U + λI)M = U.T @ R
            M[m] = solve(U_subset.T @ U_subset + ridge, U_subset.T @ R_m)

    if iteration % eval_every == 0:
        # Compute RMSE only on known ratings
        predicted_ratings = U @ M.T
        squared_error = (R - predicted_ratings) ** 2
        train_error = np.sqrt(np.sum(squared_error * train_mask) / np.sum(train_mask))
        val_error = np.sqrt(np.sum(squared_error * val_mask) / np.sum(val_mask))
        print(f"Iteration {iteration}, Train RMSE: {train_error:.4f}, Val RMSE: {val_error:.4f}")

        if val_error < best_val_score:
            best_val_score = val_error
            counter = 0
        else:
            counter += 1
            if counter >= patience:
                break
        

Iteration 0, Train RMSE: 0.0281, Val RMSE: 2.5977


KeyboardInterrupt: 

current best: 50 features, 1.3535, lambda=0.1

In [133]:
import numpy as np

# Demo: map each user index to the block of consecutive flat positions it owns when
# every user is allotted `num_elements` slots.
user_indices = np.array([100, 103, 109])
num_elements = 10

# Block starts as a column vector plus within-block offsets as a row vector broadcast
# to one row of positions per user; reshape(-1) lays those rows end to end.
block_starts = (user_indices * num_elements).reshape(-1, 1)
offsets = np.arange(num_elements)
expanded_indices = (block_starts + offsets).reshape(-1)

print(expanded_indices)


[1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1030 1031 1032 1033
 1034 1035 1036 1037 1038 1039 1090 1091 1092 1093 1094 1095 1096 1097
 1098 1099]


In [134]:
# movie_indices[m, u] == m + u * n_movies, i.e. the row-major flat position of
# entry (u, m) in an (n_users, n_movies) array; row m lists that movie's position
# for every user.
movie_indices = np.add.outer(np.arange(n_movies), n_movies * np.arange(n_users))  # Shape: (n_movies, n_users)

In [135]:
movie_indices

array([[      0,    2000,    4000, ..., 1194000, 1196000, 1198000],
       [      1,    2001,    4001, ..., 1194001, 1196001, 1198001],
       [      2,    2002,    4002, ..., 1194002, 1196002, 1198002],
       ...,
       [   1997,    3997,    5997, ..., 1195997, 1197997, 1199997],
       [   1998,    3998,    5998, ..., 1195998, 1197998, 1199998],
       [   1999,    3999,    5999, ..., 1195999, 1197999, 1199999]])

In [137]:
U.shape

(1200000, 50)

In [154]:
print(n_movies)

5


In [103]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Pairwise cosine similarity between the rows of G, so S[i, j] compares row i with row j.
# NOTE(review): assumes G is a purely numeric matrix with one row per movie — confirm how G was built.
S = cosine_similarity(G)

In [160]:
import numpy as np
from numpy.linalg import solve
# Experimental ALS variant. U holds one latent vector per (user, movie) pair, with
# row u * n_movies + m belonging to user u and movie m (that is how the M step and
# the evaluation below index it); M holds one latent vector per movie.
latent_dim = 10   # number of latent factors
reg_param = 0.1   # ridge penalty added to every least-squares solve
max_iters = 1000  # upper bound on the number of ALS sweeps
patience = 5      # stop after this many consecutive evaluations without improvement

R_train, R_val = create_train_test_masks(R)
n_users, n_movies = R.shape
U = np.random.rand(n_users * n_movies, latent_dim)
M = np.random.rand(n_movies, latent_dim)

# Loop invariants: the ridge term and the known-rating masks do not change between sweeps.
ridge = reg_param * np.eye(latent_dim)
train_mask = R_train > 0  # Only compare known ratings
val_mask = R_val > 0

# Early-stopping state is initialised once, before the loop. If it were reset at the
# top of every sweep the counter could never accumulate and the loop would never stop early.
best_val_score = np.inf
counter = 0

for iteration in range(max_iters):
    # Solve for U while keeping M fixed
    # NOTE(review): this step writes row `u % n_movies` of U, while the M step and the
    # evaluation read rows `u * n_movies + m`. The two layouts do not agree, so most of
    # the rows read below keep their random initialisation — confirm the intended design.
    for u in range(n_users):
        user = u % n_movies
        relevant_movies = R_train[user, :].nonzero()[0]  # Movies rated by this user
        if len(relevant_movies) > 0:
            M_subset = M[relevant_movies]  # Extract only relevant movie embeddings
            R_u = R_train[user, relevant_movies]  # Get known ratings

            # Solve least squares problem: (M.T @ M + λI)U = M.T @ R
            U[user] = solve(M_subset.T @ M_subset + ridge, M_subset.T @ R_u)

    # Solve for M while keeping U fixed
    for m in range(n_movies):
        relevant_users = R_train[:, m].nonzero()[0]  # Users who rated movie m
        if len(relevant_users) > 0:
            # All n_movies pair-rows of every relevant user, flattened user by user.
            expanded_users = relevant_users[:, None] * n_movies + np.arange(n_movies)
            expanded_users = expanded_users.flatten()
            U_subset = U[expanded_users]  # Extract relevant user embeddings
            R_m = R_train[relevant_users, m]  # Get known ratings
            # Each rating is repeated once per pair-row so it lines up with U_subset.
            repeated_R_m = np.repeat(R_m, n_movies)
            # Solve least squares problem: (U.T @ U + λI)M = U.T @ R
            M[m] = solve(U_subset.T @ U_subset + ridge, U_subset.T @ repeated_R_m)

    # Per-pair prediction: refined_ratings[u, m] = U[u * n_movies + m] · M[m].
    # It is computed directly from a (n_users, n_movies, latent_dim) view of U. The full
    # product U @ M.T would have n_users * n_movies * n_movies entries, of which only
    # one per (u, m) pair is ever needed.
    refined_ratings = np.einsum(
        "umk,mk->um", U.reshape(n_users, n_movies, latent_dim), M
    )

    # Compute RMSE only on known ratings
    squared_error = (R - refined_ratings) ** 2
    train_error = np.sqrt(np.sum(squared_error * train_mask) / np.sum(train_mask))
    val_error = np.sqrt(np.sum(squared_error * val_mask) / np.sum(val_mask))
    print(f"Iteration {iteration}, Train RMSE: {train_error:.4f}, Val RMSE: {val_error:.4f}")

    if val_error < best_val_score:
        best_val_score = val_error
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            break

: 

In [114]:
m = 10  # movie whose per-user rows are selected below

# Allocate the per-(user, movie) factor matrix first, so that the row selection is
# derived from this U and not from whatever U an earlier cell left in the kernel.
U = np.random.rand(n_users * n_movies, latent_dim)

# Rows whose flat index is congruent to m modulo n_movies (m, m + n_movies, ...).
row_ids = np.arange(U.shape[0])
movie_rows = row_ids[row_ids % n_movies == m]
U_filtered = U[movie_rows]  # Get all user-movie specific latent vectors for movie m

In [122]:
# One flat row index per user position i, following the layout row = m + i * n_movies;
# m is taken from the surrounding notebook state.
indices = np.array([m + i*n_movies for i in range(n_users)])

In [121]:
len(indices)

600

In [123]:
U[indices].shape

(600, 50)

In [None]:
n_

In [None]:
import numpy as np
from numpy.linalg import solve
# ALS factorisation fitted on the training split and monitored on the held-out split.
latent_dim = 50   # number of latent factors
reg_param = 0.1   # ridge penalty added to every least-squares solve
max_iters = 1000  # upper bound on the number of ALS sweeps
eval_every = 5    # compute RMSE every this many sweeps
patience = 5      # stop after this many consecutive evaluations without improvement

R_train, R_val = create_train_test_masks(R)
# NOTE(review): both factor matrices are allocated with n_users * n_movies rows, but the
# loops below only ever write rows 0..n_users-1 of U and rows 0..n_movies-1 of M; the
# remaining rows keep their random initialisation. Confirm whether (n_users, latent_dim)
# and (n_movies, latent_dim) were intended.
U = np.random.rand(n_users * n_movies, latent_dim)
M = np.random.rand(n_users * n_movies, latent_dim)

# Loop invariants: the ridge term and the known-rating masks do not change between sweeps.
ridge = reg_param * np.eye(latent_dim)
train_mask = R_train > 0  # Only compare known ratings
val_mask = R_val > 0

# Early-stopping state is initialised once, before the loop. If it were reset at the
# top of every sweep the counter could never accumulate and the loop would never stop early.
best_val_score = np.inf
counter = 0

for iteration in range(max_iters):
    # Solve for U while keeping M fixed
    for u in range(n_users):
        relevant_movies = R_train[u, :].nonzero()[0]  # Movies rated by user u
        if len(relevant_movies) > 0:
            M_subset = M[relevant_movies]  # Extract only relevant movie embeddings
            R_u = R_train[u, relevant_movies]  # Get known ratings

            # Solve least squares problem: (M.T @ M + λI)U = M.T @ R
            U[u] = solve(M_subset.T @ M_subset + ridge, M_subset.T @ R_u)

    # Solve for M while keeping U fixed
    for m in range(n_movies):
        relevant_users = R_train[:, m].nonzero()[0]  # Users who rated movie m
        if len(relevant_users) > 0:
            U_subset = U[relevant_users]  # Extract relevant user embeddings
            R_m = R_train[relevant_users, m]  # Get known ratings

            # Solve least squares problem: (U.T @ U + λI)M = U.T @ R
            M[m] = solve(U_subset.T @ U_subset + ridge, U_subset.T @ R_m)

    if iteration % eval_every == 0:
        # Compute RMSE only on known ratings.
        # Only the rows trained above are multiplied: the product of the full matrices
        # would be square with n_users * n_movies rows and could not be compared with R.
        predicted_ratings = U[:n_users] @ M[:n_movies].T
        squared_error = (R - predicted_ratings) ** 2
        train_error = np.sqrt(np.sum(squared_error * train_mask) / np.sum(train_mask))
        val_error = np.sqrt(np.sum(squared_error * val_mask) / np.sum(val_mask))
        print(f"Iteration {iteration}, Train RMSE: {train_error:.4f}, Val RMSE: {val_error:.4f}")

        if val_error < best_val_score:
            best_val_score = val_error
            counter = 0
        else:
            counter += 1
            if counter >= patience:
                break