In [1]:
import os
import sys

# Get the current working directory.
current_dir = os.getcwd()

# Path of the TP2 folder: two levels above the working directory.
tp2_dir = os.path.join(current_dir, '..', '..')

# Put that folder on the import path.
# NOTE(review): presumably this is what lets `from utils import Dataloader` resolve — confirm.
sys.path.append(tp2_dir)

import pandas as pd
import numpy as np
import os
from joblib import dump, load
from surprise import Dataset,Reader,NMF,accuracy
from surprise.model_selection import train_test_split,KFold
from utils import Dataloader
from surprise.accuracy import rmse, mae

import joblib
from collections import defaultdict
from surprise.model_selection import RandomizedSearchCV

In [5]:
# Load the users, ratings and movies tables through the project's Dataloader helper.
# NOTE(review): only ratings_df is used by the cells visible in this notebook;
# users_df and movies_df may be needed elsewhere — confirm before removing.
DIR_PATH = "../../data"
users_df = Dataloader.load_users(DIR_PATH)
ratings_df = Dataloader.load_ratings(DIR_PATH)
movies_df = Dataloader.load_movies(DIR_PATH)

In [3]:
# Declare a 1-to-5 rating scale and wrap the (userId, movieId, rating) columns
# of ratings_df as a Surprise dataset.
# NOTE(review): assumes the ratings in ratings_df actually fall within 1..5 — confirm.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

In [4]:
# Hold out 20% of the ratings as the test set. The split is seeded so that a
# Restart & Run All produces the same trainset/testset every time.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [5]:
# NMF instance with default hyper-parameters.
# NOTE(review): in the cells visible here this instance is never fitted, and the
# name NMF_model is rebound to a tuned NMF later — likely a removable leftover.
NMF_model = NMF()

In [6]:
# Candidate hyper-parameter values for the NMF randomized search.
param_dist_NMF = {
    'n_factors': [5, 10, 15, 20],
    'n_epochs': [30, 50, 100]
}

# Seed both the parameter sampling (random_state) and the 3-fold split (KFold)
# so that the search is repeatable across kernel restarts. NMF's own factor
# initialisation is still unseeded inside the search, so scores can vary slightly.
# NOTE(review): the search is fitted on the full `data`, which includes the rows
# held out as `testset`; the tuned parameters have therefore seen the test data.
rs_NMF = RandomizedSearchCV(
    NMF,
    param_dist_NMF,
    measures=['rmse'],
    cv=KFold(n_splits=3, random_state=42, shuffle=True),
    random_state=42,
)
rs_NMF.fit(data)

In [7]:
# Report the best cross-validated RMSE found by the search and the parameter
# combination that achieved it.
print("NMF Best RMSE Score:", rs_NMF.best_score['rmse'])
print("NMF Best Params:", rs_NMF.best_params['rmse'])

NMF Best RMSE Score: 0.8943936160293706
NMF Best Params: {'n_factors': 20, 'n_epochs': 100}


In [8]:
# Fit NMF with the hyper-parameters reported by the recorded search run
# (n_factors=20, n_epochs=100 are a hard-coded copy of rs_NMF.best_params['rmse']).
# random_state pins the factor initialisation so the predictions are reproducible.
NMF_model = NMF(n_factors=20, n_epochs=100, random_state=42)
NMF_model.fit(trainset)

# Score the held-out ratings and tabulate the predictions, one row per
# (user, item) pair, with columns named after the tuple fields.
NMF_model_predictions = NMF_model.test(testset)
NMF_model_predictions_df = pd.DataFrame(
    NMF_model_predictions,
    columns=['uid', 'iid', 'r_ui', 'est', 'details'],
)

In [12]:
# Persist the predictions DataFrame (not the fitted model, despite the file name).
# NOTE(review): this writes to the working directory, whereas model_loader()
# reads 'hybrid_models/NMF_model.joblib' — confirm the file is moved or copied there.
dump(NMF_model_predictions_df, 'NMF_model.joblib')

['NMF_model.joblib']

In [6]:
def model_loader(path='hybrid_models/NMF_model.joblib'):
    """Load the persisted NMF predictions DataFrame.

    Despite the function and file names, the artifact dumped by this notebook
    is the DataFrame of test-set predictions, not a fitted NMF model.

    Args:
        path: Location of the joblib file. Defaults to the path this notebook
            has always read from, so existing ``model_loader()`` calls behave
            as before.

    Returns:
        Whatever object was stored in the joblib file (here, the predictions
        DataFrame).

    Note:
        The dump cell in this notebook writes to 'NMF_model.joblib' in the
        working directory, which differs from the default path here; pass
        ``path`` explicitly if the file has not been moved.
    """
    # joblib.load unpickles the file, so only load artifacts from a trusted source.
    return load(path)

In [7]:
# Rebind NMF_model_predictions_df to the object read back from disk by
# model_loader(), replacing any in-memory value from the training cell.
NMF_model_predictions_df = model_loader()

In [8]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Compute per-user precision@k and recall@k.

    Args:
        predictions: Iterable of 5-item records laid out as
            (uid, iid, true_rating, estimated_rating, details).
        k: Number of top-ranked items (by estimated rating) to consider.
        threshold: Minimum rating for an item to count as relevant (true
            rating) or as recommended (estimated rating).

    Returns:
        A ``(precisions, recalls)`` pair of dicts keyed by user id. A metric
        whose denominator would be zero is reported as 0.
    """
    # Group (estimated, true) rating pairs under the user they belong to.
    ratings_by_user = defaultdict(list)
    for user, _item, actual, estimated, _details in predictions:
        ratings_by_user[user].append((estimated, actual))

    precisions = {}
    recalls = {}
    for user, pairs in ratings_by_user.items():
        # Rank the user's items from highest to lowest estimated rating.
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # Relevant items are counted over ALL of the user's items, not just the top k.
        relevant_total = sum(1 for _, actual in ranked if actual >= threshold)

        # Items in the top k whose estimate clears the threshold count as recommended.
        recommended = [pair for pair in top_k if pair[0] >= threshold]

        # Recommended items that are also truly relevant.
        hits = sum(1 for _, actual in recommended if actual >= threshold)

        # Precision@k: share of recommended items that are relevant (0 if none recommended).
        if recommended:
            precisions[user] = hits / len(recommended)
        else:
            precisions[user] = 0

        # Recall@k: share of relevant items that were recommended (0 if none relevant).
        if relevant_total:
            recalls[user] = hits / relevant_total
        else:
            recalls[user] = 0

    return precisions, recalls

In [11]:
# Turn the predictions DataFrame into a list of row lists, the shape
# precision_recall_at_k iterates over.
NMF_model_predictions_list = NMF_model_predictions_df.values.tolist()

# Per-user Precision@K and Recall@K.
NMF_model_precisions, NMF_model_recalls = precision_recall_at_k(
    NMF_model_predictions_list,
    k=10,
    threshold=3.5,
)

# Unweighted mean of each metric across users.
NMF_model_avg_precision = sum(NMF_model_precisions.values()) / len(NMF_model_precisions)
NMF_model_avg_recall = sum(NMF_model_recalls.values()) / len(NMF_model_recalls)

print("Average Precision@10(NMF_model):", NMF_model_avg_precision)
print("Average Recall@10(NMF_model):", NMF_model_avg_recall)

Average Precision@10(NMF_model): 0.7774975613347425
Average Recall@10(NMF_model): 0.5267192749422243
