## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import random

## Setting path for reading data
   2 ways: either through google drive, or local pc path. Use the cell appropriately. Also change the paths accordingly. 
   Don't run the below 2 cells if not using on Google Drive

In [2]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [3]:
%cd /content/gdrive/MyDrive
%cd CF_Data

/content/gdrive/MyDrive
/content/gdrive/MyDrive/CF_Data


## Nuclear Norm Minimzation

In [None]:
def nuclear_norm_minimization(Y, lambda_, k):
    #initialise X
    X = np.random.rand(Y.shape[0], Y.shape[1])
    #Creating R matrix with shape of Y
    R = np.zeros(Y.shape)
    #Now putting 1s in R where Y is not 0
    R[Y!=0] = 1
    #Now iterating k times
    while(k>0):
        #Putting 0 values in R_X where R is 0
        R_X = np.multiply(R, X)
        #Calculate B
        B = X + Y - R_X
        U, S, V = np.linalg.svd(B, full_matrices=False)
        S = np.diag(S)
        #Making negative values 0 after subtracting lambda/2
        S = np.maximum(S - lambda_/2, 0)
        #Updating X
        X = np.dot(U, np.dot(S, V))
        k = k-1
    return X

In [7]:
#Calculate NMAE for the users present in the test data
def NMAE(test_data, predicted_matrix):
    test_data = test_data[test_data['rating'] > 0]  
    NMAE = 0
    for line in test_data.itertuples():
        NMAE += abs(line[3] - predicted_matrix[line[1]-1, line[2]-1])
    NMAE = NMAE / len(test_data)
    #Normalise the NMAE by the range of the ratings in the test data and return
    return NMAE / (max(test_data['rating']) - min(test_data['rating']))

## Reading Data and running for 5-folds

In [None]:
#Performing ALS on the 5 fold datasets
def NN_folds(fold_index, lambda_, max_iter):
    #Read the data
    train_data = pd.read_csv(f'ml-100k/u{fold_index}.base', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
    test_data = pd.read_csv(f'ml-100k/u{fold_index}.test', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
    #Create a matrix of users and items
    train_data_matrix = np.zeros((943, 1682))
    for line in train_data.itertuples():
        train_data_matrix[line[1]-1, line[2]-1] = line[3]
    #Predict the missing values
    predicted_matrix = nuclear_norm_minimization(train_data_matrix, lambda_, max_iter)
    #Calculate NMAE for the users present in the test data
    NMAE_ = NMAE(test_data, predicted_matrix)
    print(f'NMAE for fold {fold_index} is: {NMAE_}')
    return NMAE_

# Optimization

## Finding the range of hyperparameters, where best results can be expected
### Note: Next couple of cells are just for pattern observation, need to scroll a lot

## Varying Learning Rate

In [None]:
for i in range(1, 6):
    NN_folds(i)

NMAE for fold 1 is: 0.756268895220086
NMAE for fold 2 is: 0.7578381339452863
NMAE for fold 3 is: 0.7531697032291403
NMAE for fold 4 is: 0.7526695514229723
NMAE for fold 5 is: 0.752533496765638


In [None]:
#Optimize the hyperparameter lambda using 5-fold cross validation
#Use the NMAE as the performance metric
#Plot the NMAE vs lambda curve
#Report the best value of lambda and the corresponding NMAE

max_iter = [10, 30, 50]
lamdas = [6, 6.5, 6.7]
NMAE_list = []
for j in max_iter:
    print(f'max_iter= {j}',end=':: ')
    for lr in lamdas:
        NMAE_list.append(NN_folds(4, lr, j))
        print(f'lr= {lr}',end='; ')
    print()

plt.plot(lamdas, NMAE_list)
plt.xlabel('lambda')
plt.ylabel('NMAE')
plt.show()

In [None]:
print(NN_folds(4, 8, 10))

NMAE for fold 4 is: 0.563121681721848
0.563121681721848


In [None]:
NN_folds(4, 6.9, 100)

NMAE for fold 4 is: 0.25224813366040805


0.25224813366040805

## Running for the best Parameters, for all folds

In [4]:
def nuclear_norm_minimization(Y, lambda_, k):
    #initialise X
    X = np.random.rand(Y.shape[0], Y.shape[1])
    #Creating R matrix with shape of Y
    R = np.zeros(Y.shape)
    #Now putting 1s in R where Y is not 0
    R[Y!=0] = 1
    #Now iterating k times
    while(k>0):
        #Putting 0 values in R_X where R is 0
        R_X = np.multiply(R, X)
        #Calculate B
        B = X + Y - R_X
        U, S, V = np.linalg.svd(B, full_matrices=False)
        S = np.diag(S)
        #Making negative values 0 after subtracting lambda/2
        S = np.maximum(S - 6/2, 0)
        #Updating X
        X = np.dot(U, np.dot(S, V))
        k = k-1
    return X

In [None]:
#Performing ALS on the 5 fold datasets
def NN_folds(fold_index, lambda_, max_iter):
    #Read the data
    train_data = pd.read_csv(f'ml-100k/u{fold_index}.base', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
    test_data = pd.read_csv(f'ml-100k/u{fold_index}.test', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
    #Create a matrix of users and items
    train_data_matrix = np.zeros((943, 1682))
    for line in train_data.itertuples():
        train_data_matrix[line[1]-1, line[2]-1] = line[3]
    #Predict the missing values
    predicted_matrix = nuclear_norm_minimization(train_data_matrix, lambda_, max_iter)
    #Calculate NMAE for the users present in the test data
    NMAE_ = NMAE(test_data, predicted_matrix)
    print(f'NMAE for fold {fold_index} is: {NMAE_}')
  
NN_folds(1, 6.9, 150)

NMAE for fold 1 is: 0.2376860394409616


In [None]:
#Performing ALS on the 5 fold datasets
predicted_matrix = []
predicted_matrix_rounded = []

def NN_folds(fold_index, lambda_, max_iter):
    global predicted_matrix
    global predicted_matrix_rounded
    #Read the data
    train_data = pd.read_csv(f'ml-100k/u{fold_index}.base', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
    test_data = pd.read_csv(f'ml-100k/u{fold_index}.test', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
    #Create a matrix of users and items
    train_data_matrix = np.zeros((943, 1682))
    for line in train_data.itertuples():
        train_data_matrix[line[1]-1, line[2]-1] = line[3]
    #Predict the missing values
    predicted_matrix = nuclear_norm_minimization(train_data_matrix, lambda_, max_iter)
    predicted_matrix_rounded = np.round(predicted_matrix)
    #Calculate NMAE for the users present in the test data
    NMAE_ = NMAE(test_data, predicted_matrix)
    print(f'NMAE for fold {fold_index} is: {NMAE_}')
  
NN_folds(1, 6.9, 200)

NMAE for fold 1 is: 0.22464227540395854


# Rounded off results to the nearest rating integer

In [None]:
test_data = pd.read_csv(f'ml-100k/u1.test', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
NMAE_ = NMAE(test_data, predicted_matrix_rounded)
print(f'NMAE for fold 1 is: {NMAE_}')

NMAE for fold 1 is: 0.2167375


In [8]:
#Performing ALS on the 5 fold datasets
predicted_matrix = []
predicted_matrix_rounded = []

def NN_folds(fold_index, lambda_, max_iter):
    global predicted_matrix
    global predicted_matrix_rounded
    #Read the data
    train_data = pd.read_csv(f'ml-100k/u{fold_index}.base', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
    test_data = pd.read_csv(f'ml-100k/u{fold_index}.test', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
    #Create a matrix of users and items
    train_data_matrix = np.zeros((943, 1682))
    for line in train_data.itertuples():
        train_data_matrix[line[1]-1, line[2]-1] = line[3]
    #Predict the missing values
    predicted_matrix = nuclear_norm_minimization(train_data_matrix, lambda_, max_iter)
    predicted_matrix_rounded = np.round(predicted_matrix)
    #Calculate NMAE for the users present in the test data
    NMAE_ = NMAE(test_data, predicted_matrix)
    print(f'NMAE for fold {fold_index} is: {NMAE_}')

NN_folds(1, 0.1, 50)

NMAE for fold 1 is: 0.369687476676441
