In [1]:
# Mount Google Drive at /content/drive so that files stored on it can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
import pandas as pd
from tabulate import tabulate
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.decomposition import TruncatedSVD

In [13]:
# Directory (under the mounted Drive) that holds the dataset CSV files.
# Defined once so that relocating the data only requires editing this line.
DATA_DIR = '/content/drive/My Drive/MatrixCompletion'

# Paths to the individual CSV files
ratings_path = f'{DATA_DIR}/ratings.csv'
movies_path = f'{DATA_DIR}/movies.csv'
tags_path = f'{DATA_DIR}/tags.csv'
links_path = f'{DATA_DIR}/links.csv'

# Load each CSV into its own DataFrame
ratings = pd.read_csv(ratings_path)
movies = pd.read_csv(movies_path)
tags = pd.read_csv(tags_path)
links = pd.read_csv(links_path)


In [20]:
def als_train_gd(ratings, k=10, steps=20, learning_rate=0.01, reg=0.1, max_gradient=5.0):
    """
    Factorize a ratings matrix as ``U @ V.T`` using alternating gradient-descent sweeps.

    Despite the "ALS" name, no least-squares system is solved. Each step performs
    one sweep of clipped gradient updates over the user factors (item factors held
    fixed), then one sweep over the item factors (user factors held fixed).
    Within a row the k factors are updated one after another, so each update sees
    the factors already changed earlier in the same row.

    Parameters:
        ratings (array-like): 2-D user-item ratings matrix; entries <= 0 are
            treated as unobserved. Anything accepted by ``np.asarray`` works.
        k (int): Number of latent factors.
        steps (int): Number of iterations.
        learning_rate (float): Initial learning rate; decays as
            ``learning_rate / (1 + 0.1 * step)``.
        reg (float): Regularization parameter.
        max_gradient (float): Clipping threshold applied to the data-term
            gradient (the regularization term is not clipped).

    Returns:
        U (ndarray): User latent factors, shape (num_users, k).
        V (ndarray): Item latent factors, shape (num_items, k).

    Notes:
        Initial factors are drawn from NumPy's global random state (U first,
        then V), so call ``np.random.seed`` beforehand for reproducible runs.
        Training stops early if the monitored loss becomes NaN or infinite.
    """
    ratings = np.asarray(ratings, dtype=float)
    num_users, num_items = ratings.shape

    # Initialize user and item latent factors with small random values
    U = np.random.rand(num_users, k) * 0.01
    V = np.random.rand(num_items, k) * 0.01

    # Mask to identify observed ratings (positive entries)
    observed_mask = ratings > 0

    # The sparsity pattern never changes, so look up the observed indices and
    # their ratings once instead of re-scanning the mask on every update.
    user_items = [np.flatnonzero(observed_mask[u]) for u in range(num_users)]
    item_users = [np.flatnonzero(observed_mask[:, i]) for i in range(num_items)]
    user_ratings = [ratings[u, idx] for u, idx in enumerate(user_items)]
    item_ratings = [ratings[idx, i] for i, idx in enumerate(item_users)]
    obs_rows, obs_cols = np.nonzero(observed_mask)
    obs_values = ratings[obs_rows, obs_cols]

    print("Training ALS with Gradient Descent...")

    for step in range(steps):
        # Dynamic learning rate decay
        lr = learning_rate / (1 + step * 0.1)

        # Update U (user latent factors); V is fixed during this sweep.
        # U[u] is a view, so the helper's in-place writes land in U.
        for u in range(num_users):
            _gd_update_row(U[u], V[user_items[u]], user_ratings[u], lr, reg, max_gradient)

        # Update V (item latent factors); U is fixed during this sweep.
        for i in range(num_items):
            _gd_update_row(V[i], U[item_users[i]], item_ratings[i], lr, reg, max_gradient)

        # Regularized squared error over observed entries only (for monitoring).
        # Row-wise dot products avoid materializing the dense U @ V.T matrix.
        predictions = np.einsum('ij,ij->i', U[obs_rows], V[obs_cols])
        loss = np.sum((obs_values - predictions) ** 2)
        loss += reg * (np.sum(U ** 2) + np.sum(V ** 2))

        # Print loss and check for divergence (NaN or +/-inf)
        print(f"Step: {step + 1}, Loss: {loss:.4f}")
        if not np.isfinite(loss):
            print("Divergence detected! Stopping training.")
            break

    return U, V


def _gd_update_row(row, other_factors, observed_ratings, lr, reg, max_gradient):
    """Apply one clipped gradient step to each factor of `row`, in place.

    `other_factors` holds the latent vectors of the counterpart entities with an
    observed rating against this row, and `observed_ratings` the matching ratings.
    """
    for f in range(row.shape[0]):
        # Recomputed for every factor because earlier factors of `row` may
        # already have been updated in this pass.
        residual = observed_ratings - other_factors @ row
        gradient = -np.dot(residual, other_factors[:, f])
        gradient = np.clip(gradient, -max_gradient, max_gradient)
        row[f] -= lr * (gradient + reg * row[f])

def evaluate(reconstructed_matrix, test_matrix, test_mask):
    """
    Score predictions against held-out ratings.

    Only the entries selected by `test_mask` take part in the metrics.

    Parameters:
        reconstructed_matrix (ndarray): Predicted user-item matrix.
        test_matrix (ndarray): True ratings for the test set.
        test_mask (ndarray): Mask selecting the test entries to score.

    Returns:
        rmse (float): Root Mean Squared Error over the selected entries.
        mae (float): Mean Absolute Error over the selected entries.
    """
    # Signed error (actual - predicted) for every selected entry
    residuals = test_matrix[test_mask] - reconstructed_matrix[test_mask]

    squared_error_mean = np.mean(residuals ** 2)
    rmse = np.sqrt(squared_error_mean)
    mae = np.mean(np.abs(residuals))
    return rmse, mae

# Example Usage
if __name__ == "__main__":

    # Pivot the long-format ratings table into a dense user x movie matrix;
    # missing (user, movie) pairs become 0, which marks them as unobserved.
    ratings_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0).values

    # Hold out ~20% of the OBSERVED ratings for testing.
    # The random draw must be intersected with observed_mask: otherwise the test
    # set is dominated by unobserved cells (stored as 0) and RMSE/MAE would
    # measure how well the model predicts zeros rather than real ratings.
    np.random.seed(42)
    observed_mask = ratings_matrix > 0
    test_mask = observed_mask & (np.random.rand(*ratings_matrix.shape) < 0.2)
    train_mask = observed_mask & ~test_mask

    # Entries outside each split are zeroed so they count as unobserved
    train_matrix = np.where(train_mask, ratings_matrix, 0)
    test_matrix = np.where(test_mask, ratings_matrix, 0)

    # Train the factorization model using Gradient Descent
    k = 10  # Number of latent factors
    steps = 100  # Number of iterations
    learning_rate = 0.01
    reg = 0.1

    U, V = als_train_gd(train_matrix, k, steps, learning_rate, reg)

    # Reconstruct the matrix
    reconstructed_matrix = np.dot(U, V.T)

    # Evaluate the model on the held-out observed ratings
    print("\nEvaluating ALS Model:")
    rmse, mae = evaluate(reconstructed_matrix, test_matrix, test_mask)
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")


Training ALS with Gradient Descent...
Step: 1, Loss: 1066503.9081
Step: 2, Loss: 1038919.2534
Step: 3, Loss: 996506.7222
Step: 4, Loss: 943495.8955
Step: 5, Loss: 883396.9696
Step: 6, Loss: 819017.2821
Step: 7, Loss: 752575.1942
Step: 8, Loss: 685812.1577
Step: 9, Loss: 620091.9672
Step: 10, Loss: 556477.2633
Step: 11, Loss: 495790.5625
Step: 12, Loss: 438664.9641
Step: 13, Loss: 385582.3337
Step: 14, Loss: 336902.7841
Step: 15, Loss: 292884.9567
Step: 16, Loss: 253702.0928
Step: 17, Loss: 219438.0540
Step: 18, Loss: 190092.3446
Step: 19, Loss: 165515.0000
Step: 20, Loss: 145490.9033
Step: 21, Loss: 129614.1455
Step: 22, Loss: 117311.2784
Step: 23, Loss: 108048.8288
Step: 24, Loss: 101178.2036
Step: 25, Loss: 96074.1025
Step: 26, Loss: 92223.5744
Step: 27, Loss: 89239.7822
Step: 28, Loss: 86842.3661
Step: 29, Loss: 84803.0970
Step: 30, Loss: 83014.8701
Step: 31, Loss: 81420.4804
Step: 32, Loss: 79984.4651
Step: 33, Loss: 78678.8013
Step: 34, Loss: 77483.2859
Step: 35, Loss: 76382.5333


In [6]:
# NOTE(review): `ratings` is the DataFrame returned by pd.read_csv, so this shape
# is (rows, columns) of that table. Confirm this is intended rather than the
# pivoted user-by-item matrix (`ratings_matrix`).
num_users, num_items = ratings.shape

# Element-wise "greater than zero" comparison, used as the observed-ratings mask
observed_mask = ratings > 0

# Random starting values for the latent factors (U is drawn before V);
# `k` must already be defined by an earlier cell.
U = np.random.rand(num_users, k)
V = np.random.rand(num_items, k)

In [7]:
# Debugging aid: show which container type `observed_mask` currently is.
print(type(observed_mask))

<class 'pandas.core.frame.DataFrame'>
