In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [21]:
namesngenre = np.load("data/namesngenre.npy")
ratings_test = np.load("data/ratings_test.npy")
ratings_train = np.load("data/ratings_train.npy")

In [35]:
namesngenre = pd.DataFrame(namesngenre, columns = ["name", "genre"])
print(namesngenre.shape)

ratings_test = pd.DataFrame(ratings_test)
ratings_test = ratings_test.T
print(ratings_test.shape)

ratings_train = pd.DataFrame(ratings_train)
ratings_train = ratings_train.T
print(ratings_train.shape)

(4980, 2)
(4980, 610)
(4980, 610)


In [41]:
ratings_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,600,601,602,603,604,605,606,607,608,609
0,,,,,4.0,,,,,,...,,,,3.0,,,4.0,,,5.0
1,,,,,,,,,,,...,,,,5.0,,,,,,
2,4.0,,,,,,,,,,...,,,,,,,,2.0,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [40]:
# Correct ways to check for NaN values

# Method 1: Using pd.isna() to get boolean mask
print("1. Boolean mask showing where NaN values are:")
nan_mask = ratings_train==0
print(f"Number of NaN values: {nan_mask.sum().sum()}")
print(f"Shape of ratings_train: {ratings_train.shape}")
print()

# Method 2: Check if there are any NaN values at all
print("2. Check if there are any NaN values:")
has_nans = ratings_train.isna().any().any()
print(f"Contains NaN values: {has_nans}")
print()

# Method 3: Count NaN values per column
print("3. NaN count per column (first 10 columns):")
nan_counts = ratings_train.isna().sum()
print(nan_counts.head(10))
print()

# Method 4: Show actual NaN positions (if any exist)
if has_nans:
    print("4. Showing some NaN positions:")
    nan_positions = np.where(pd.isna(ratings_train))
    print(f"Found NaN at positions (first 10): {list(zip(nan_positions[0][:10], nan_positions[1][:10]))}")
else:
    print("4. No NaN values found in the dataset")

1. Boolean mask showing where NaN values are:
Number of NaN values: 0
Shape of ratings_train: (4980, 610)

2. Check if there are any NaN values:
Contains NaN values: True

3. NaN count per column (first 10 columns):
0    4909
1    4966
2    4967
3    4910
4    4963
5    4897
6    4936
7    4968
8    4965
9    4934
dtype: int64

4. Showing some NaN positions:
Found NaN at positions (first 10): [(0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7), (0, 8), (0, 9), (0, 10)]


In [25]:
# Hyperparams
k = 20

# Initialize I and U matrices with small random values
np.random.seed(42)  # For reproducibility
I = pd.DataFrame(np.random.normal(0, 0.1, (ratings_train.shape[1], k)))
U = pd.DataFrame(np.random.normal(0, 0.1, (ratings_train.shape[0], k)))

In [29]:
# ALS optimization
def als_optimization(ratings_matrix, I, U, num_iterations=10, lambda_reg=0.1):
    """
    ALS optimization for matrix factorization
    
    Args:
        ratings_matrix: The ratings matrix (items x users)
        I: Item matrix (items x k factors)
        U: User matrix (users x k factors) 
        num_iterations: Number of ALS iterations
        lambda_reg: Regularization parameter
    
    Returns:
        Optimized I and U matrices
    """
    
    # Convert to numpy for easier computation
    R = ratings_matrix.values
    I_mat = I.values
    U_mat = U.values
    
    # Create mask for observed ratings (non-NaN entries)
    mask = ~np.isnan(R) 
    
    losses = []
    
    for iteration in range(num_iterations):
        # Update I matrix (fix U, optimize I)
        for i in range(I_mat.shape[0]):  # For each item
            # Find users who rated this item
            user_indices = np.where(mask[i, :])[0]
            if len(user_indices) > 0:
                U_users = U_mat[user_indices, :]  # Users who rated item i
                ratings_i = R[i, user_indices]    # Ratings for item i
                
                # Solve: I_i = argmin ||r_i - I_i * U_users^T||^2 + lambda * ||I_i||^2
                A = U_users.T @ U_users + lambda_reg * np.eye(U_users.shape[1])
                b = U_users.T @ ratings_i
                I_mat[i, :] = np.linalg.solve(A, b)
        
        # Update U matrix (fix I, optimize U)
        for u in range(U_mat.shape[0]):  # For each user
            # Find items rated by this user
            item_indices = np.where(mask[:, u])[0]
            if len(item_indices) > 0:
                I_items = I_mat[item_indices, :]  # Items rated by user u
                ratings_u = R[item_indices, u]   # Ratings by user u
                
                # Solve: U_u = argmin ||r_u - I_items * U_u||^2 + lambda * ||U_u||^2
                A = I_items.T @ I_items + lambda_reg * np.eye(I_items.shape[1])
                b = I_items.T @ ratings_u
                U_mat[u, :] = np.linalg.solve(A, b)
        
        # Calculate reconstruction loss
        predicted = I_mat @ U_mat.T
        loss = np.sum((mask * (R - predicted)) ** 2) + lambda_reg * (np.sum(I_mat**2) + np.sum(U_mat**2))
        losses.append(loss)
        
        print(f"Iteration {iteration}, Loss: {loss:.6f}")
    
    return pd.DataFrame(I_mat), pd.DataFrame(U_mat), losses

# Run ALS optimization
print("Starting ALS optimization...")
I_optimized, U_optimized, losses = als_optimization(
    ratings_train.T, I, U, 
    num_iterations=20, 
    lambda_reg=0.1
)

print(f"Final loss: {losses[-1]:.6f}")
print(f"I matrix shape: {I_optimized.shape}")
print(f"U matrix shape: {U_optimized.shape}")

Starting ALS optimization...
Data sparsity: 1.04% observed ratings
Iteration 0, Loss: 7909.016596, RMSE: 0.096537
Iteration 0, Loss: 7909.016596, RMSE: 0.096537
Iteration 1, Loss: 7621.245993, RMSE: 0.093806
Iteration 1, Loss: 7621.245993, RMSE: 0.093806
Iteration 2, Loss: 7355.564311, RMSE: 0.091256
Iteration 2, Loss: 7355.564311, RMSE: 0.091256
Iteration 3, Loss: 7109.513045, RMSE: 0.088861
Iteration 3, Loss: 7109.513045, RMSE: 0.088861
Iteration 4, Loss: 6880.955046, RMSE: 0.086595
Iteration 4, Loss: 6880.955046, RMSE: 0.086595
Iteration 5, Loss: 6668.074712, RMSE: 0.084447
Iteration 5, Loss: 6668.074712, RMSE: 0.084447
Iteration 6, Loss: 6469.444260, RMSE: 0.082429
Iteration 6, Loss: 6469.444260, RMSE: 0.082429
Iteration 7, Loss: 6283.722285, RMSE: 0.080529
Iteration 7, Loss: 6283.722285, RMSE: 0.080529
Iteration 8, Loss: 6109.660258, RMSE: 0.078732
Iteration 8, Loss: 6109.660258, RMSE: 0.078732
Iteration 9, Loss: 5946.196301, RMSE: 0.077029
Iteration 9, Loss: 5946.196301, RMSE: 0.

In [30]:
I_optimized.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1.197549,1.076389,-0.877844,1.761241,0.939502,-3.62651,2.377555,-2.871516,0.450468,0.375773,1.775021,-0.954951,-1.210261,-0.89679,0.987819,-1.225643,-2.194621,0.177502,-1.283243,2.453291
1,0.029218,0.318282,1.143881,1.655315,1.647259,-1.311432,1.146923,-1.050373,-0.943011,-0.271784,1.338992,-0.549165,-2.559961,-0.180592,1.355641,-1.462459,-2.550086,-0.387779,-1.485735,0.666241
2,0.737927,-0.677478,1.716958,0.085743,0.55127,0.229141,0.195821,-1.244964,1.027146,-0.651172,-0.578457,-1.219529,-1.025221,-0.636344,-0.265908,-0.311928,-1.404074,0.152472,-0.684319,1.33339
3,0.13618,0.290699,0.774223,0.583801,0.090883,-0.158947,-0.149805,0.485478,0.499101,-0.39412,-0.174533,-0.341805,-0.635189,0.038587,-0.016704,-0.039728,0.131727,0.485746,0.211047,-0.019649
4,-0.282626,-0.516892,-0.052348,1.042765,0.597845,0.232926,0.743216,-1.886806,0.676745,0.248195,-0.0194,-0.65409,-1.68447,0.253968,0.713526,-0.370756,-1.539511,1.740053,-0.458805,0.297152
5,1.123119,-0.843158,0.201516,0.682254,-0.1706,-2.554566,1.403109,-0.844694,0.957917,-0.751891,0.440856,-0.744025,-2.737736,-1.401545,1.503585,-1.686642,-2.851161,2.163248,-1.534831,0.213899
6,0.096521,0.827151,-0.045612,0.883017,-0.035893,-1.843464,-0.38856,-0.943613,1.325222,-0.382015,-0.215947,-0.946802,-2.590967,-0.755819,0.698836,-0.351802,-0.742237,-0.210462,-1.198322,-0.105505
7,-0.461963,0.156876,0.072056,-0.353672,-0.118669,0.382934,-0.367549,-0.135258,0.791913,0.06368,0.024339,0.254998,-0.272439,-0.159619,-0.274856,-0.224116,-0.582005,-0.014556,-0.456446,1.274075
8,1.003825,0.256388,0.24758,-0.095482,0.724519,0.160287,0.421445,-0.501663,-0.094286,-0.331165,-0.339654,-0.821948,-1.22857,-0.990695,-0.446119,-0.606543,-0.204344,0.078997,0.197899,0.235793
9,-0.351086,-0.893372,0.478291,1.150717,-0.689787,-2.441239,1.353772,-1.401171,1.016039,0.001165,0.197642,-1.257428,-1.487086,-2.093239,0.620404,-1.72135,-1.889687,0.207554,-1.149356,1.194172
