In [23]:
import gdown

url = 'https://drive.google.com/uc?id=1moaGNpmZjjN0XH0dcTOybEz8NVS00x-6'
output = 'train_dataset.txt'
gdown.download(url, output, quiet=False)

url = 'https://drive.google.com/uc?id=19ajg9r-QMvDBuVDxBne4Y-vIzSqauAHS'
output = 'test_dataset.txt'
gdown.download(url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1moaGNpmZjjN0XH0dcTOybEz8NVS00x-6
To: /content/train_dataset.txt
100%|██████████| 17.0M/17.0M [00:00<00:00, 96.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=19ajg9r-QMvDBuVDxBne4Y-vIzSqauAHS
To: /content/test_dataset.txt
100%|██████████| 4.25M/4.25M [00:00<00:00, 158MB/s]


'test_dataset.txt'

In [24]:
display(output)

'test_dataset.txt'

In [25]:
import csv
from scipy.sparse import csr_matrix

num_records = 0
record = []
with open('train_dataset.txt', 'r', encoding='utf-8') as file:
    reader = csv.reader(file, delimiter='\t', quoting=csv.QUOTE_ALL, quotechar='"')
    next(reader) # Skip the first line
    for row in reader:
        if len(row) == 4:
            record.append(row)

In [26]:
print(len(record))

23038


In [27]:
# Get unique users and items
unique_users = list(set([row[0] for row in record]))
unique_items = list(set([row[1] for row in record]))

# Create mappings from user_id and item_id to row and column indices
user_mapping = {user: index for index, user in enumerate(unique_users)}
item_mapping = {item: index for index, item in enumerate(unique_items)}

# Map user_id and item_id to row and column indices
record = [[user_mapping[row[0]], item_mapping[row[1]], row[2], row[3]] for row in record]

# Separate the ratings and review texts
ratings = [float(row[2]) for row in record]

# Create a sparse matrix for the ratings
r = csr_matrix((ratings, ([record[i][0] for i in range(len(record))], [record[i][1] for i in range(len(record))])))
R_matrix = r.toarray()
print(f'Number of records: {num_records}')
print(f'R matrix: \n{R_matrix}')

Number of records: 0
R matrix: 
[[1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [28]:
print(r)
print(R_matrix.size)

  (0, 0)	1.0
  (0, 18)	1.0
  (0, 24)	1.0
  (0, 36)	1.0
  (0, 43)	1.0
  (0, 60)	1.0
  (0, 69)	1.0
  (0, 70)	1.0
  (0, 71)	1.0
  (0, 93)	1.0
  (0, 101)	1.0
  (0, 125)	1.0
  (0, 127)	1.0
  (0, 141)	1.0
  (0, 149)	1.0
  (0, 157)	1.0
  (0, 193)	1.0
  (0, 199)	1.0
  (0, 218)	1.0
  (0, 220)	1.0
  (0, 225)	1.0
  (0, 228)	1.0
  (0, 282)	1.0
  (0, 321)	1.0
  (0, 393)	1.0
  :	:
  (1337, 521)	1.0
  (1337, 528)	1.0
  (1337, 535)	1.0
  (1337, 598)	1.0
  (1337, 665)	1.0
  (1337, 670)	1.0
  (1337, 713)	1.0
  (1338, 13)	1.0
  (1338, 88)	1.0
  (1338, 174)	1.0
  (1338, 268)	1.0
  (1338, 323)	1.0
  (1338, 345)	1.0
  (1338, 479)	1.0
  (1338, 598)	1.0
  (1338, 676)	1.0
  (1338, 693)	1.0
  (1339, 44)	1.0
  (1339, 144)	1.0
  (1339, 167)	1.0
  (1339, 181)	1.0
  (1339, 325)	1.0
  (1339, 326)	1.0
  (1339, 442)	1.0
  (1339, 542)	1.0
982220


In [29]:
print(len(unique_users))
print(len(unique_items))

1340
733


In [30]:
print(len(record))

23038


In [31]:
display(R_matrix)

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [32]:
import numpy as np
from tqdm import tqdm

class MatrixFactorization:
    def __init__(self, num_items, num_users, num_factors, learning_rate, regularization_rate, num_iterations):

        self.num_items = num_items
        self.num_users = num_users
        self.num_factors = num_factors
        self.learning_rate = learning_rate
        self.regularization_rate = regularization_rate
        self.num_iterations = num_iterations

        # Initialize Q and P matrices with random values

        self.Q = np.random.randn(num_items, num_factors)
        self.P = np.random.randn(num_users, num_factors)


    def update_parameters(self, R):


        for iterate in tqdm(range(self.num_iterations), desc="iteration"):
           for u in range(self.num_users) :
            interaction_items = np.where(R[u] > 0)[0]

            for i in interaction_items:

                riu = R[u, i]

                qi = self.Q[i]
                pu = self.P[u]

                error_of_predict = riu - qi.dot(pu.T)
                #stochastic gradiant discent

                # Update Q[i]
                Q_gradient = ( error_of_predict * pu.T + 2 * self.regularization_rate * self.Q[i])
                self.Q[i] -= self.learning_rate * Q_gradient

                # Update P[u]
                P_gradient = ( error_of_predict * qi + 2 * self.regularization_rate * self.P[u])
                self.P[u] -= self.learning_rate * P_gradient
        print(self.P)
        print(self.Q)


    def train(self, R):

        self.update_parameters(R)


    def get_P(self):
      return self.P

    def get_Q(self):
      return self.Q


In [33]:
num_items = len(unique_items)
num_users = len(unique_users)
num_factors = 64
learning_rate = 0.001
regularization_rate = 0.001
num_iterations = 3

In [34]:
model = MatrixFactorization(num_items, num_users, num_factors, learning_rate, regularization_rate,
                            num_iterations)

model.train(R_matrix)

iteration: 100%|██████████| 3/3 [00:01<00:00,  1.58it/s]

[[ 3.51783075  0.6064245  -2.02292506 ...  0.20976236  0.69345381
  -0.02856046]
 [ 1.96695074  0.52657848  0.70755064 ...  0.71969507 -0.97452265
   1.66378828]
 [ 2.54970684 -0.72120447 -1.82232335 ...  1.32900908  1.20204859
   1.29135903]
 ...
 [-1.0720609  -1.23200448 -0.49738717 ...  0.08282212 -0.74619024
  -0.22874953]
 [-0.12054816 -0.64567823  1.37858362 ... -1.20786202 -2.34327876
  -0.69787985]
 [ 1.44470751 -0.13274001  0.01391308 ... -0.31950018 -0.76452019
  -0.28451078]]
[[-2.38967811  0.19361862 -1.44360406 ...  1.82344941  2.39312369
   0.57796568]
 [-0.71729929 -1.80530827  1.04587476 ...  1.48085622 -1.06603762
   0.0517786 ]
 [-0.14355219  1.58055507 -0.48520108 ...  0.55667401 -1.1720203
  -0.17833234]
 ...
 [-1.06416574  1.10239394  1.26120919 ...  0.33588752  4.12237216
   0.40466054]
 [ 0.79786213  1.74714935  1.03753069 ...  2.13915393 -2.24842777
  -1.14905943]
 [ 2.24488189 -0.08272796  0.95934063 ... -0.824895   -0.18775621
   2.62215735]]





# **Get P and Q matrix**

In [35]:

Matrix_P = model.get_P()

Matrix_Q = model.get_Q()



In [36]:
#save matrix P and matrix Q for next parts

np.save('P.npy', Matrix_P)
np.save('Q.npy', Matrix_Q)

In [37]:
print("matrix P : \n")
print(Matrix_P)
print(" \n")
print("matrix Q : \n")

print(Matrix_Q )

matrix P : 

[[ 3.51783075  0.6064245  -2.02292506 ...  0.20976236  0.69345381
  -0.02856046]
 [ 1.96695074  0.52657848  0.70755064 ...  0.71969507 -0.97452265
   1.66378828]
 [ 2.54970684 -0.72120447 -1.82232335 ...  1.32900908  1.20204859
   1.29135903]
 ...
 [-1.0720609  -1.23200448 -0.49738717 ...  0.08282212 -0.74619024
  -0.22874953]
 [-0.12054816 -0.64567823  1.37858362 ... -1.20786202 -2.34327876
  -0.69787985]
 [ 1.44470751 -0.13274001  0.01391308 ... -0.31950018 -0.76452019
  -0.28451078]]
 

matrix Q : 

[[-2.38967811  0.19361862 -1.44360406 ...  1.82344941  2.39312369
   0.57796568]
 [-0.71729929 -1.80530827  1.04587476 ...  1.48085622 -1.06603762
   0.0517786 ]
 [-0.14355219  1.58055507 -0.48520108 ...  0.55667401 -1.1720203
  -0.17833234]
 ...
 [-1.06416574  1.10239394  1.26120919 ...  0.33588752  4.12237216
   0.40466054]
 [ 0.79786213  1.74714935  1.03753069 ...  2.13915393 -2.24842777
  -1.14905943]
 [ 2.24488189 -0.08272796  0.95934063 ... -0.824895   -0.18775621
   2

# **Rating Prediction**

In [38]:
import numpy as np
from tqdm import tqdm

def predict_rating(P , Q , i, u):

        return Q[i, :].dot(P[u, :].T)

In [39]:
import pandas as pd

# Load test dataset without header
test_data = pd.read_csv('test_dataset.txt', sep='\t', header=None, skiprows=1)

test_data.columns = ['user_id', 'item_id', 'rate', 'review_text']

ground_truth = test_data.groupby('user_id')['item_id'].apply(list).reset_index()
display(ground_truth)
ground_truth.columns = ['user_id', 'interacted_items']


Unnamed: 0,user_id,item_id
0,A100WO06OQR8BQ,"[B009FKNGGQ, B003U4LI7W, B007JT7ARQ, B001ECQ55..."
1,A100ZQDV7L8PVV,"[B005TI7LHS, B009XRF9M0]"
2,A105S56ODHGJEK,"[B00GTC02LA, B00AWLB9G6, B00AE07CRA, B006L6A06..."
3,A10E3F50DIUJEE,"[B009YSSLAU, B00GTBZI6A]"
4,A10M94ASQEBL56,"[B00GTBZWPW, B00AO4E9L8]"
...,...,...
1335,AYWHCM0TJ4737,"[B00AE07932, B00GTC1JHQ]"
1336,AZ26CDSJ363AH,"[B0073P01RC, B00BB8ZHJY, B00AE0790U]"
1337,AZA595ZPIG240,"[B001ECQ4YO, B00AO4E9MC, B00D6EDGYE, B00AE07FU..."
1338,AZFHSPEZUPGD2,"[B002UUT3YM, B0054MSBZA, B00639DLV2, B009YSSLA..."


In [40]:
all_items = set(unique_items) | set(test_data['item_id'])

train_interacted_items = {user: set(item[1] for item in record if item[0] == user_mapping[user]) for user in unique_users}

predicted_ratings = {}
for user in set(test_data['user_id']):
    user_idx = user_mapping[user]
    predicted_ratings[user] = {item: predict_rating(Matrix_P , Matrix_Q , item_mapping[item], user_idx) for item in all_items if item not in train_interacted_items[user]}



# **Evaluation**

In [41]:
from sklearn.metrics import recall_score

def calculate_recall(predicted_ratings, ground_truth, topk=20):
    recall_scores = []
    for user, ratings in predicted_ratings.items():
        # Get the top 20 items predicted for the user
        top_20_predicted_items = sorted(ratings.items(), key=lambda x: x[1], reverse=True)[:topk]

        # Get the actual items interacted by the user
        actual_items = ground_truth[ground_truth['user_id'] == user]['interacted_items'].values[0]

        actual_items_binary = [1 for item in actual_items]

        # Convert the items to binary format (1 if the item is in the top 20 predicted items, 0 otherwise)
        predicted_ratings_binary = [1 if item in dict(top_20_predicted_items) else 0 for item in actual_items]

        # Calculate the recall for the user
        recall_score_user = recall_score([actual_items_binary], [predicted_ratings_binary], average = 'weighted')
        recall_scores.append(recall_score_user)


    return np.mean(recall_scores)

In [42]:
from sklearn.metrics import ndcg_score

def calculate_ndcg(predicted_ratings , ground_truth, topk=20):

    ndcg_scores = []
    for user, ratings in predicted_ratings.items():

        # Get the top 20 items predicted for the user
        top_20_predicted_items = sorted(ratings.items(), key=lambda x: x[1], reverse=True)[:topk]

        # Get the actual items interacted by the user
        actual_items = list(ground_truth[ground_truth['user_id'] == user]['interacted_items'].values[0])

        actual_items_binary = [1 for item in actual_items]


        predicted_ratings_binary = [dict(top_20_predicted_items).get(item, 0) if item in dict(top_20_predicted_items) else 0 for item in actual_items]


        actual_items_binary = np.asarray([actual_items_binary])
        predicted_ratings_binary = np.asarray([predicted_ratings_binary])

        # Calculate the NDCG for the user
        ndcg_score_user = ndcg_score(actual_items_binary, predicted_ratings_binary)
        ndcg_scores.append(ndcg_score_user)


    return np.mean(ndcg_scores)

In [43]:
from scipy import stats
def calculate_rankcorrelation(predicted_ratings, ground_truth):

    spearman_scores = []
    for user, ratings in predicted_ratings.items():

        predicted_items = sorted(ratings.items(), key=lambda x: x[1], reverse=True)


        # Get the actual items interacted by the user
        actual_items = list(ground_truth[ground_truth['user_id'] == user]['interacted_items'].values[0])

        actual_items_ranking = list(range(1, len(actual_items) + 1)) # Assign ranks to actual items



      # Assign ranks to actual items based on their scores in the predicted list

        actual_items_ranking_in_predicted_list = [next((index + 1 for index, item in enumerate(predicted_items) if item[0] == target), None)  for target in actual_items]

        # Calculate the sperman correlation for the user
        spearman_score_user ,_ = stats.spearmanr(actual_items_ranking, actual_items_ranking_in_predicted_list )
        spearman_scores.append(spearman_score_user)

    return np.mean(spearman_scores)

In [44]:
print("recall score is : " , calculate_recall(predicted_ratings, ground_truth))

print("rank correlation(spearman) is : " , calculate_rankcorrelation(predicted_ratings , ground_truth) )

print("ndcg score is : " , calculate_ndcg(predicted_ratings, ground_truth))


recall score is :  0.03160290440506209
rank correlation(spearman) is :  0.02659669280648883
ndcg score is :  1.0
