In [1]:
import pandas as pd
import numpy as np
import os
import joblib
from surprise import Dataset,Reader,KNNBasic,SVDpp,accuracy
from surprise.model_selection import train_test_split,KFold
from utils import Dataloader

from joblib import dump, load
from collections import defaultdict

In [2]:
DIR_PATH = "./data/"
users_df = Dataloader.load_users(DIR_PATH)
ratings_df = Dataloader.load_ratings(DIR_PATH)
movies_df = Dataloader.load_movies(DIR_PATH)

In [3]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

In [12]:
trainset, testset = train_test_split(data, test_size=0.2)

In [13]:
svdpp_model = SVDpp()

n_factors = 잠재요인의 수를 나타냄. 잠재요인은 사용자와 아이템 간의 특성을 나타내는 값. 클수록 복잡성 증가

b_epochs = 모델이 전체 학습 데이터를 몇 번 반복할지 결정.

lr_all = 학습 속도를 결정하는 파라미터 

reg_all = 모델의 복잡성을 제어하는 파라미터. 값이 클수록 모델이 간단해지는데 너무 크면 성능 저하

k = 사용자 또는 아이템 간의 유사성 측정할때 이웃 수

sim_options= 유사성 측정 방법. 코사인 피어슨 등

In [14]:

svdpp = SVDpp(n_factors=150, n_epochs=50, lr_all=0.003, reg_all=0.05)
svdpp.fit(trainset)
svdpp_predictions = svdpp.test(testset)
svdpp_predictions_df = pd.DataFrame(svdpp_predictions, columns=['uid', 'iid', 'r_ui', 'est', 'details'])
dump(svdpp_predictions_df, 'svdpp_model.joblib')

['svdpp_model.joblib']

In [15]:
knn = KNNBasic(k=60, min_k=2)
knn.fit(trainset)
knn_predictions = knn.test(testset)
knn_predictions_df = pd.DataFrame(knn_predictions, columns=['uid', 'iid', 'r_ui', 'est', 'details'])

Computing the msd similarity matrix...
Done computing similarity matrix.


In [4]:
def hybrid_model_loader():

    # SVD모델 불러오기
    svdpp_predictions_df = load('hybrid_models/svdpp_model.joblib')

    # KNN모델 불러오기
    knn_predictions_df = load('hybrid_models/knn_model.joblib')
    
    return svdpp_predictions_df, knn_predictions_df

In [5]:
svdpp_predictions_df, knn_predictions_df = hybrid_model_loader()


In [6]:
weight_svdpp = 0.7  # SVDpp 모델의 가중치
weight_knn = 0.3  # KNN 모델의 가중치

hybrid_preds = (weight_svdpp * svdpp_predictions_df['est']) + (weight_knn * knn_predictions_df['est'])

hybrid_errors = hybrid_preds - svdpp_predictions_df['r_ui'].values
rmse = np.sqrt(np.mean(hybrid_errors ** 2))

print("Hybrid Model RMSE:", rmse)

Hybrid Model RMSE: 0.9016215686658994


In [7]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(
            ((true_r >= threshold) and (est >= threshold))
            for (est, true_r) in user_ratings[:k]
        )

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

In [8]:
svdpp_predictions_list = svdpp_predictions_df.values.tolist()

# Precision@K와 Recall@K 계산
svdpp_precisions, svdpp_recalls = precision_recall_at_k(svdpp_predictions_list, k=10, threshold=3.5)

# Precision과 Recall 평균 계산
svdpp_avg_precision = sum(prec for prec in svdpp_precisions.values()) / len(svdpp_precisions)
svdpp_avg_recall = sum(rec for rec in svdpp_recalls.values()) / len(svdpp_recalls)

print("Average Precision@10(svdpp):", svdpp_avg_precision)
print("Average Recall@10(svdpp):", svdpp_avg_recall)

Average Precision@10(svdpp): 0.7942194146069702
Average Recall@10(svdpp): 0.5540900407011167


In [9]:
hybrid_predictions = []

for (svdpp_idx, svdpp_pred), (knn_idx, knn_pred) in zip(svdpp_predictions_df.iterrows(), knn_predictions_df.iterrows()):
    hybrid_pred = svdpp_pred['est'] * 0.7 + knn_pred['est'] * 0.3  # 가중치 적용
    hybrid_predictions.append((svdpp_pred['uid'], svdpp_pred['iid'], svdpp_pred['r_ui'], hybrid_pred, svdpp_pred['details']))

# Precision@K와 Recall@K 계산
precisions, recalls = precision_recall_at_k(hybrid_predictions, k=5, threshold=3.5)

# Precision과 Recall 평균 계산
avg_precision = sum(prec for prec in precisions.values()) / len(precisions)
avg_recall = sum(rec for rec in recalls.values()) / len(recalls)

print("Average Precision@10:", avg_precision)
print("Average Recall@10:", avg_recall)

Average Precision@10: 0.8109197305951419
Average Recall@10: 0.3960151036353499
