In [1]:
import annoy
import sys
import pandas as pd
import os
import numpy as np
import sre_parse
import implicit
from scipy import sparse
from scipy.sparse import csr_matrix, lil_matrix

In [2]:
data_path = "/opt/ml/input/project/data/"
ratings = pd.read_csv(data_path+"ml-25m/ratings.csv")
movies = pd.read_csv(data_path + "ml-25m/movies.csv")

ratings = ratings.merge(movies[["movieId","title"]],how="left",on = "movieId")                        
ratings.drop(columns = ["timestamp"],inplace = True)
ratings["rating"] = 1

inter_mbti_ml_title_movieid = pd.read_csv(data_path+"inter_mbti_ml_title_movieId.csv")


In [3]:
user2idx = {k:v for v,k in enumerate(ratings['userId'].unique())}
item2idx = {k:v for v,k in enumerate(ratings['movieId'].unique())}
idx2item = {v:k for v,k in enumerate(ratings['movieId'].unique())}

ratings['useridx'] = ratings['userId'].map(user2idx) # userId -> 인덱스로 바꿔줍니다.
ratings['movieidx'] = ratings['movieId'].map(item2idx) # movieId -> 인덱스로 바꿔줍니다.

In [4]:
inter_mbti_ml_movieid = list(inter_mbti_ml_title_movieid['movieId'].values)
ratings = ratings[ratings['movieId'].isin(inter_mbti_ml_movieid)]

In [5]:
import random 

user_list = list(ratings['userId'].unique())
test_size = int(len(user_list)*0.1)
random.seed(42)
test_user = random.sample(user_list, test_size)
train_user = list(set(user_list)-set(test_user))

In [6]:
train_df = ratings[ratings['userId'].isin(train_user)]
test_df = ratings[ratings['userId'].isin(test_user)]

In [7]:
print("총 user 수 :",ratings['userId'].nunique())
print("train user 수 : ",len(train_user))
print("test user 수 : ",len(test_user),"\n")
print("총 movieId 수 :",ratings['movieId'].nunique())
print("train movieId 수 : ",train_df["movieId"].nunique())
print("test movieId 수 : ",test_df["movieId"].nunique(),"\n")
print("총 interaction 수 : ", ratings.shape[0])
print("train interaction 수 : ", train_df.shape[0])
print("test interaction 수 : ", test_df.shape[0])

총 user 수 : 162537
train user 수 :  146284
test user 수 :  16253 

총 movieId 수 : 3213
train movieId 수 :  3212
test movieId 수 :  2950 

총 interaction 수 :  12854635
train interaction 수 :  11562102
test interaction 수 :  1292533


In [8]:
grouped = test_df.groupby('userId')
test_user_movie = grouped['movieId'].apply(list)
test_user_movie_dict = dict(test_user_movie)

In [9]:
annoy_model = annoy.AnnoyIndex(64,"angular")
annoy_model.load("./lgcn_interaction/similarity") # lgcn

True

In [9]:
annoy_model = annoy.AnnoyIndex(64,"angular")
annoy_model.load("./validation_test/ALS_angular_64") # ALS

True

In [9]:
annoy_model = annoy.AnnoyIndex(64,"angular")
annoy_model.load("./validation_test/BPR_64") # BPR

True

In [10]:
from tqdm import tqdm
K = 10
user_scores = {}

for user, movie_list in tqdm(test_user_movie_dict.items()):
    movie_list_len = len(movie_list)
    user_valid_movieids = random.sample(movie_list, min(K,movie_list_len-1)) # 검증할 movieId K개(인터랙션이 K보다 적다면 인터랙션 수-1 만큼만)
    user_prefer_movieids = list(set(movie_list)-set(user_valid_movieids)) # 인터랙션에서 user_valid_movieids를 제외한 영화
    scores = {}

    for mid in user_prefer_movieids: # prefer_movie와 유사도가 높을수록 높은 점수를 부여합니다.
        neighbour, dist = annoy_model.get_nns_by_item(mid,500,include_distances=True)
        norm = 2 # max( sqrt(2-2cos) )
        cnt = 0
        for n,d in zip(neighbour[1:],dist[1:]):
            if cnt == K:
                break
            if n not in user_prefer_movieids: # 유사한 영화가 prefer에 포함되어있으면 안됩니다.
                try:
                    scores[n] += 1-d/norm # angular similarity가 낮을수록 score를 높게 측정합니다.
                except:
                    scores[n] = 0
                    scores[n] += 1-d/norm
                cnt += 1
        assert cnt==K, "갯수가 모자라요"

    sort_scores = sorted(scores.items(),key = lambda x: -x[1])[:K] # score를 내림차순 정렬 해줍니다.
    recall = 0

    for mid,score in sort_scores:
        if mid in user_valid_movieids: # 상위 K개를 뽑았을 때 valid에 포함되어 있으면 +1 해줍니다.
            recall+=1
    user_scores[user] = recall/min(K,movie_list_len-1)

recall10 = sum(list(user_scores.values()))/len(user_scores.keys())

100%|██████████| 16253/16253 [18:57<00:00, 14.29it/s]  


In [15]:
sum(list(user_scores.values()))/len(user_scores.keys()) # ALS recall10

0.14220143973419255

In [12]:
recall10 # lgcn

0.0028004109619811555

In [13]:
recall10 # BPR

0.047163820305706115