In [15]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
import pandas as pd
from tabulate import tabulate
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.decomposition import TruncatedSVD
from math import sqrt

In [17]:
# Path to the dataset
ratings_path = '/content/drive/My Drive/MatrixCompletion/ratings.csv'
movies_path = '/content/drive/My Drive/MatrixCompletion/movies.csv'
tags_path = '/content/drive/My Drive/MatrixCompletion/tags.csv'
links_path = '/content/drive/My Drive/MatrixCompletion/links.csv'

# Load the dataset
ratings = pd.read_csv(ratings_path)
movies = pd.read_csv(movies_path)
tags = pd.read_csv(tags_path)
links = pd.read_csv(links_path)


In [18]:
ratings_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')

In [19]:

# ----------------------------
# 1. Data Preparation
# ----------------------------
def prepare_data(matrix, test_size=0.2):
    """
    Splits the observed entries into train and test sets.
    Replaces missing entries with 0 for initial processing.
    """
    observed_mask = ~np.isnan(matrix)
    train_mask = observed_mask & (np.random.rand(*matrix.shape) > test_size)
    test_mask = observed_mask & ~train_mask

    train_matrix = np.where(train_mask, matrix, 0)
    test_matrix = np.where(test_mask, matrix, 0)

    #print("Number of test values:", np.sum(test_mask))


    return train_matrix, test_matrix, train_mask, test_mask



# ----------------------------
# 2. Gradient Descent SVD
# ----------------------------
def gradient_descent_svd(train_matrix, k, lr, reg, steps):
    """
    Performs gradient descent optimization to approximate SVD.
    Args:
        train_matrix: The input matrix with missing values filled as 0.
        k: Number of latent factors.
        lr: Learning rate for gradient descent.
        reg: Regularization parameter.
        steps: Number of iterations.

    Returns:
        U: User latent factor matrix.
        V: Item latent factor matrix.
        reconstructed: Reconstructed matrix.
    """
    num_users, num_items = train_matrix.shape

    # Initialize user and item matrices with small random values
    U = np.random.normal(0, 0.1, (num_users, k))
    V = np.random.normal(0, 0.1, (num_items, k))

    # Gradient Descent Loop
    for step in range(steps):
        for i in range(num_users):
            for j in range(num_items):
                if train_matrix[i, j] > 0:  # Only observed ratings contribute to the update
                    error_ij = train_matrix[i, j] - np.dot(U[i, :], V[j, :].T)

                    # Update user and item matrices with regularization
                    U[i, :] += lr * (error_ij * V[j, :] - reg * U[i, :])
                    V[j, :] += lr * (error_ij * U[i, :] - reg * V[j, :])

        # Compute Loss (optional for debugging)
        if step % 10 == 0:  # Log every 10 steps
            loss = 0
            for i in range(num_users):
                for j in range(num_items):
                    if train_matrix[i, j] > 0:
                        loss += (train_matrix[i, j] - np.dot(U[i, :], V[j, :].T))**2
            loss += reg * (np.sum(U**2) + np.sum(V**2))
            print(f"Step: {step}, Loss: {loss:.4f}")

    reconstructed = np.dot(U, V.T)  # Final reconstruction
    return U, V, reconstructed


# ----------------------------
# 3. Evaluation
# ----------------------------
def evaluate(reconstructed_matrix, test_matrix, test_mask):
    test_predictions = reconstructed_matrix[test_mask]
    actual_test_ratings = test_matrix[test_mask]

    # Check for empty arrays
    if len(actual_test_ratings) == 0 or len(test_predictions) == 0:
        print("Error: No test ratings to evaluate. Check test_mask!")
        return

    rmse = np.sqrt(mean_squared_error(actual_test_ratings, test_predictions))
    mae = mean_absolute_error(actual_test_ratings, test_predictions)
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")



# ----------------------------
# 4. Main Workflow
# ----------------------------
if __name__ == "__main__":
    # Simulated Input Matrix (Example with NaN entries)

    ratings_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')

    # Prepare Data
    train_matrix, test_matrix, train_mask, test_mask = prepare_data(ratings_matrix)

    # SVD with Gradient Descent
    k = 10           # Number of latent factors
    lr = 0.01       # Learning rate
    reg = 0.1       # Regularization
    steps = 100     # Number of iterations

    print("\nTraining Gradient Descent-based SVD...")
    U, V, reconstructed_matrix = gradient_descent_svd(train_matrix, k, lr, reg, steps)

    # Clip predictions to valid range (1-5 in this example)
    reconstructed_matrix = np.clip(reconstructed_matrix, 1, 5)

    # Evaluate Results
    evaluate(reconstructed_matrix, test_matrix, test_mask)

    # Final Reconstructed Matrix
    print("\nReconstructed Matrix:")
    print(np.round(reconstructed_matrix, 2))



Training Gradient Descent-based SVD...
Step: 0, Loss: 780457.8718
Step: 10, Loss: 66516.8333
Step: 20, Loss: 53502.5233
Step: 30, Loss: 46551.2108
Step: 40, Loss: 42125.9379
Step: 50, Loss: 39249.1105
Step: 60, Loss: 37331.3461
Step: 70, Loss: 36013.7555
Step: 80, Loss: 35077.3118
Step: 90, Loss: 34389.8195
RMSE: 0.9849
MAE: 0.7333

Reconstructed Matrix:
[[4.75 4.27 3.85 ... 3.63 3.52 4.23]
 [3.89 3.43 3.15 ... 2.81 2.82 3.27]
 [1.34 1.85 1.63 ... 1.28 1.19 1.46]
 ...
 [3.52 3.12 2.71 ... 3.29 3.42 4.03]
 [3.53 3.22 3.1  ... 2.59 2.43 3.14]
 [4.11 3.61 3.37 ... 3.26 3.2  3.91]]
