**<h1>IMPORT LIBRARIES</h1>**

In [1]:
import chardet
import pandas as pd
import numpy as np
from collections import defaultdict
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split, cross_validate

**<h1>DATASET PRAPARATION</h1>**

In [2]:
movies_path = './movies.dat'
ratings_path = './ratings.dat'
users_path = './users.dat'

In [3]:
with open(movies_path, 'rb') as f:
    movie_encoding = chardet.detect(f.read())
    print(movie_encoding)
with open(ratings_path, 'rb') as f:
    rating_encoding = chardet.detect(f.read())
    print(rating_encoding)
with open(users_path, 'rb') as f:
    user_encoding = chardet.detect(f.read())
    print(user_encoding)

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}
{'encoding': 'ascii', 'confidence': 1.0, 'language': ''}
{'encoding': 'ascii', 'confidence': 1.0, 'language': ''}


In [4]:
movies = pd.read_csv(
    movies_path,
    sep='::',
    engine='python',
    names=['movieId', 'title', 'genres'],
    encoding='ISO-8859-1'
)
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [5]:
ratings = pd.read_csv(
    ratings_path,
    sep='::',
    engine='python',
    names=['userId', 'movieId', 'rating', 'timestamp'],
    encoding='ascii'
)
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [6]:
users = pd.read_csv(
    users_path, 
    sep='::', 
    engine="python", 
    names=['userId', 'gender', 'age', 'occupation', 'zip-code'],
    encoding='ascii'
)
users

Unnamed: 0,userId,gender,age,occupation,zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [7]:
ratings_with_movies = pd.merge(ratings, movies, on='movieId')
full_data = pd.merge(ratings_with_movies, users, on='userId')
full_data = full_data.drop(columns=['timestamp', 'zip-code'])

In [8]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], Reader(rating_scale=(1, 5)))

In [9]:
interaction_counts = full_data['userId'].value_counts()
cold_start_users = interaction_counts[interaction_counts == 0].index

In [10]:
cold_start_data = full_data[full_data['userId'].isin(cold_start_users)]

In [11]:
train_data, test_data = train_test_split(data, test_size=0.2)

In [12]:
model = SVD()
model.fit(train_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a7922b5930>

In [13]:
predictions = model.test(test_data)
print(f"RMSE: {accuracy.rmse(predictions)}")
print(f"MAE: {accuracy.mae(predictions)}")

RMSE: 0.8741
RMSE: 0.8741096919311383
MAE:  0.6863
MAE: 0.6862969933884746


In [14]:
def get_user_item_predictions(predictions):
    user_item_ratings = defaultdict(list)
    for pred in predictions:
        user_item_ratings[pred.uid].append((pred.iid, pred.est, pred.r_ui))
    return user_item_ratings

In [15]:
user_item_ratings = get_user_item_predictions(predictions)

In [16]:
def calculate_ndcg(user_item_ratings, k=10):
    ndcg_scores = []

    for user, items in user_item_ratings.items():
        items_sorted_by_pred = sorted(items, key=lambda x: x[1], reverse=True)

        items_sorted_by_true = sorted(items, key=lambda x: x[2], reverse=True)

        predicted_relevance = [x[2] for x in items_sorted_by_pred[:k]]
        true_relevance = [x[2] for x in items_sorted_by_true[:k]]

        dcg = sum((2**rel - 1) / np.log2(idx + 2) for idx, rel in enumerate(predicted_relevance))

        idcg = sum((2**rel - 1) / np.log2(idx + 2) for idx, rel in enumerate(true_relevance))

        ndcg = dcg / idcg if idcg > 0 else 0
        ndcg_scores.append(ndcg)

    return np.mean(ndcg_scores)

In [17]:
ndcg_score = calculate_ndcg(user_item_ratings, k=10)
print(f"NDCG@10: {ndcg_score:.4f}")

NDCG@10: 0.8261
