In [1]:
import numpy as np
from tqdm.notebook import tqdm

# The below approach is extremely slow, so ignore it.
def _matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
    '''
    R: rating matrix
    P: |U| * K (User features matrix)
    Q: |D| * K (Item features matrix)
    K: latent features
    steps: iterations
    alpha: learning rate
    beta: regularization parameter'''
    Q = Q.T

    for step in tqdm(range(steps)):
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    # calculate error
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])

                    for k in range(K):
                        # calculate gradient with a and beta parameter
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])

        eR = np.dot(P,Q)

        e = 0

        for i in range(len(R)):

            for j in range(len(R[i])):

                if R[i][j] > 0:

                    e = e + pow(R[i][j] - np.dot(P[i,:],Q[:,j]), 2)

                    for k in range(K):

                        e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))
        # 0.001: local minimum
        if e < 0.001:

            break

    return P, Q.T

In [15]:
import cupy as cp
from tqdm import tqdm

def get_batches(R, batch_size):
    """Generate mini-batches for the non-zero entries in R."""
    non_zero_indices = cp.asarray(R).nonzero()
    num_samples = len(non_zero_indices[0])
    indices = cp.arange(num_samples)
    cp.random.shuffle(indices)
    
    for idx in range(0, num_samples, batch_size):
        batch_indices = indices[idx:idx+batch_size]
        i_indices = non_zero_indices[0][batch_indices]
        j_indices = non_zero_indices[1][batch_indices]
        values = R[i_indices, j_indices]
        yield i_indices, j_indices, values

def matrix_factorization_gpu_batched(R, P, Q, K, batch_size=1000, steps=5000, alpha=0.0002, beta=0.02):
    R = cp.array(R)
    P = cp.array(P)
    Q = cp.array(Q)
    Q = Q.T

    for step in tqdm(range(steps), desc="Epochs"):
        for i_indices, j_indices, values in tqdm(get_batches(R, batch_size), desc="Rows", leave=False):
            errors = values - cp.sum(P[i_indices] * Q[:, j_indices].T, axis=1)
            
            for k in range(K):
                P[i_indices, k] += alpha * (2 * errors * Q[k, j_indices] - beta * P[i_indices, k])
                Q[k, j_indices] += alpha * (2 * errors * P[i_indices, k] - beta * Q[k, j_indices])

        eR = cp.dot(P, Q)
        e = cp.sum((R[R > 0] - eR[R > 0]) ** 2) + beta/2.0 * (cp.sum(P**2) + cp.sum(Q**2))
        
        if e < 0.001:
            break

    return cp.asnumpy(P), cp.asnumpy(Q.T)


In [3]:
import pandas as pd
DATA_DIR = 'data'
train_ratings = pd.read_csv(f'{DATA_DIR}/train_ratings.csv')

In [80]:
#how many unique movies and users do we have?
n_movies = train_ratings.movieId.unique().shape[0]
n_users = train_ratings.userId.unique().shape[0]
print(f'Number of unique movies: {n_movies}')
print(f'Number of unique users: {n_users}')

Number of unique movies: 8983
Number of unique users: 610


In [81]:
train_matrix = train_ratings.pivot_table(index='userId', columns='movieId', values='rating')

In [83]:
N = len(train_matrix)
M = len(train_matrix.columns)
N, M

(610, 8983)

In [19]:
K = 10

P = np.random.rand(N,K)
Q = np.random.rand(M,K)

nP, nQ = matrix_factorization_gpu_batched(train_matrix.values, P, Q, K, 500000)

Epochs: 100%|██████████| 5000/5000 [06:43<00:00, 12.38it/s]


In [84]:
nP.shape, nQ.shape

((610, 10), (8983, 10))

In [140]:
import pandas as pd

# Load the test set
test_set = pd.read_csv(f'{DATA_DIR}/test_set_no_ratings.csv')

# Placeholder list to store the predicted ratings
predicted_ratings = []

# Iterate over each row in the test set
for _, row in test_set.iterrows():
    userId = row['userId']
    movieId = row['movieId']
    
    
    # Check if userId and movieId exist in ratings_matrix
    if userId in train_matrix.index and movieId in train_matrix.columns:
        user_idx = list(train_matrix.index).index(userId)
        movie_idx = list(train_matrix.columns).index(movieId)
        
        predicted_rating = np.dot(P[user_idx, :], Q[movie_idx, :].T)
    else:
        # Use a default value for unknown userIds or movieIds
        predicted_rating = np.mean(train_matrix.values[train_matrix.values > 0])
    
    predicted_ratings.append(predicted_rating)

In [None]:
# Add the predicted ratings to the test set dataframe and save to CSV
test_set['rating'] = predicted_ratings
predictions_df = test_set[['Id', 'rating']]
predictions_df.to_csv('predictions.csv', index=False)