In [86]:
# Print the kernel's working directory; the relative '..' appended to sys.path in the import cell resolves against it.
!pwd

/opt/ml/input/fighting/Test


In [1]:
import os
import sys
sys.path.append('..')
from Utils import model_recommend_movies
from interaction_model import Inference
import pandas as pd
from tqdm import tqdm

In [2]:
# Load the MovieLens 25M ratings table (userId, movieId, rating, timestamp).
ratings = pd.read_csv('/opt/ml/input/fighting/CSV/ml-25m/ratings.csv')

In [3]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [4]:
ratings['userId'].nunique(), ratings['movieId'].nunique()

(162541, 59047)

In [5]:
# Directory that holds the project CSV files.
data_path = "/opt/ml/input/fighting/CSV/"
# Table of the movies shared by the MBTI data and MovieLens (it carries a movieId column).
inter_mbti_ml_title_movieid = pd.read_csv(f"{data_path}inter_mbti_ml_title_movieId.csv")

In [6]:
# Plain Python list of the movieIds found in the shared MBTI/MovieLens table.
inter_mbti_ml_movieid = list(inter_mbti_ml_title_movieid['movieId'].values)

In [7]:
# Keep only the interactions for movies that exist in both the MBTI data and MovieLens.
ratings = ratings[ratings['movieId'].isin(inter_mbti_ml_movieid)]

In [8]:
ratings['userId'].nunique(), ratings['movieId'].nunique()

(162537, 3213)

## 9:1로 train, test 유저 나누기

In [9]:
import random

# Hold out 10% of the users as test users; everyone else is a training user.
# The global RNG is seeded first so the sampled test set is reproducible.
random.seed(42)
user_list = list(ratings['userId'].unique())
test_size = int(0.1 * len(user_list))
test_user = random.sample(user_list, k=test_size)

# Training users = all users minus the sampled test users.
all_user_set = set(user_list)
test_user_set = set(test_user)
train_user = list(all_user_set - test_user_set)

In [10]:
ratings['userId'].nunique(), ratings['movieId'].nunique()

(162537, 3213)

In [11]:
# Interactions that belong to the training users.
train_df = ratings[ratings['userId'].isin(train_user)]

In [12]:
train_df['userId'].nunique(), train_df['movieId'].nunique()

(146284, 3212)

In [13]:
# Persist the training interactions (without the DataFrame index) to the working directory;
# the model-training cell reloads this file as ./train_df.csv.
train_df.to_csv("train_df.csv", index=False)

In [14]:
# Interactions that belong to the held-out test users.
test_df = ratings[ratings['userId'].isin(test_user)]

In [15]:
test_df['userId'].nunique(), test_df['movieId'].nunique()

(16253, 2950)

## train_df로 ALS를 학습하고 Annoy 인덱스 만들기

In [16]:
# Print the working directory; './train_df.csv' in the next cell is read relative to it.
!pwd

/opt/ml/input/fighting/Test


In [17]:
from scipy import sparse
from scipy.sparse import csr_matrix, lil_matrix
import implicit
import annoy

# ---- Load inputs -----------------------------------------------------------
data_path = "/opt/ml/input/fighting/CSV/"
movies = pd.read_csv(data_path + "ml-25m/movies.csv")
inter_mbti_ml_title_movieid = pd.read_csv(data_path+"inter_mbti_ml_title_movieId.csv")

# Training interactions: attach the movie title, drop the timestamp and turn
# every rating into an implicit-feedback "1".
ratings = (
    pd.read_csv("./train_df.csv")
    .merge(movies[["movieId", "title"]], how="left", on="movieId")
    .assign(rating=1)
    .drop(columns="timestamp")
)

# ---- Contiguous indices for users and movies -------------------------------
# Indices follow the order of first appearance in the training interactions.
unique_users = ratings['userId'].unique()
unique_movies = ratings['movieId'].unique()
user2idx = {user_id: pos for pos, user_id in enumerate(unique_users)}
item2idx = {movie_id: pos for pos, movie_id in enumerate(unique_movies)}
idx2item = {pos: movie_id for movie_id, pos in item2idx.items()}  # inverse of item2idx

ratings['useridx'] = ratings['userId'].map(user2idx)    # userId  -> row index
ratings['movieidx'] = ratings['movieId'].map(item2idx)  # movieId -> column index

# ---- User x item sparse matrix ---------------------------------------------
interaction_values = ratings['rating'].astype(float)
interaction_coords = (ratings['useridx'], ratings['movieidx'])
sparse_user_item = sparse.csr_matrix((interaction_values, interaction_coords))

# ---- ALS model -------------------------------------------------------------
als_params = dict(
    factors=64,
    regularization=0.1,
    iterations=200,
    calculate_training_loss=False,
    use_gpu=True,
)
als_model = implicit.als.AlternatingLeastSquares(**als_params)  # model init
als_model.fit(sparse_user_item)                                 # training

# Keep only the item vectors.
# NOTE(review): .to_numpy() presumably converts GPU-resident factors to a NumPy array - confirm for the installed implicit version.
item_vecs = als_model.item_factors.to_numpy()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=200.0), HTML(value='')))




In [18]:
# Map the shared MBTI/MovieLens movie table onto the ALS item indices.
# Movies that never occur in the training interactions have no entry in
# item2idx and come out of the map as NaN.
# Only rows without an item index are dropped: a NaN in some unrelated column
# must not silently remove a movie from the search index.
inter_mbti_ml_title_movieid = (
    inter_mbti_ml_title_movieid
    .assign(movieidx=inter_mbti_ml_title_movieid['movieId'].map(item2idx))
    .dropna(subset=['movieidx'])
    .astype({'movieidx': int})
)

# Distance metrics offered by Annoy:
# angular   : cosine similarity
# euclidean : l2-norm distance
# manhattan : l1-norm distance
# hamming   : for binary vectors
similar_items_index = annoy.AnnoyIndex(64,'angular')  # 64 = ALS `factors`
n_trees = 40
for idx in inter_mbti_ml_title_movieid['movieidx']:
    # Items are stored under their movieId (idx -> movieId), not the ALS index.
    similar_items_index.add_item(idx2item[idx], item_vecs[idx])
similar_items_index.build(n_trees)
similar_items_index.save("ALS_angular_64")

True

## test 유저별 interaction 영화 id list

In [16]:
# Group the test interactions by user and collect each user's movieIds into a list
# (result: a Series indexed by userId whose values are Python lists).
grouped = test_df.groupby('userId')
test_user_movie = grouped['movieId'].apply(list)

In [17]:
test_user_movie

userId
3         [1, 32, 50, 111, 173, 293, 296, 318, 356, 442,...
10        [1, 32, 50, 110, 150, 193, 296, 318, 356, 380,...
23        [6, 7, 14, 16, 25, 32, 36, 39, 50, 104, 105, 1...
25        [16, 17, 25, 36, 39, 50, 110, 141, 150, 151, 1...
45        [141, 150, 296, 380, 562, 590, 608, 648, 653, ...
                                ...                        
162472    [1, 16, 32, 50, 110, 111, 163, 198, 223, 288, ...
162476    [1, 2, 17, 25, 32, 34, 36, 39, 50, 110, 141, 1...
162493    [1, 25, 32, 34, 50, 70, 110, 111, 151, 223, 25...
162501    [16, 32, 50, 111, 150, 223, 293, 296, 318, 356...
162513    [16, 50, 104, 110, 150, 173, 235, 266, 296, 31...
Name: movieId, Length: 16253, dtype: object

In [18]:
# Convert the Series into a plain dict: userId -> list of movieIds.
test_user_movie_dict = dict(test_user_movie)

## test_유저의 valid score 확인(한명에 대한 실험)

### 5:5로 test_유저별 영화기록을 입력, valid 나누기

In [19]:
# Share of each test user's movie history that is held out for validation;
# the remaining (1 - valid_ratio) share is used as inference input.
valid_ratio = 0.5

In [20]:
# Put the parent of the current working directory on sys.path, then (re-)import the
# recommender helper, this time from the Utils.utils module.
sys.path.append(os.path.join(os.path.abspath(os.path.curdir),".."))
from Utils.utils import model_recommend_movies

In [21]:
# Model name: relative path of the Annoy index file saved as "ALS_angular_64".
model_name = './ALS_angular_64'

In [22]:
# Movie ids used as inference input: a random (1 - valid_ratio) share of user 3's history.
test_user_prefer_movieids = random.sample(test_user_movie_dict[3], int(len(test_user_movie_dict[3])*(1-valid_ratio)))

In [23]:
# Validation movie ids: everything in user 3's history that was not sampled as inference input.
test_user_valid_movieids = list(set(test_user_movie_dict[3]) - set(test_user_prefer_movieids))

### 추천 영화와 valid 영화 교집합 비율 확인

In [24]:
# Recommended movie ids for user 3.
# NOTE(review): the arguments look like (MBTI type, enneagram type, seed movie ids, a count, index path) - confirm in Utils.utils.
# NOTE(review): the overlap with the validation set computed below is 143, so `10` is not a cap on the total number of ids returned - confirm its meaning.
recommend_movie_ids = model_recommend_movies("INFP","5w4", test_user_prefer_movieids, 10, model_name)


In [25]:
# Number of movies that are both recommended and in the validation set.
len(set(recommend_movie_ids) & set(test_user_valid_movieids))

143

In [26]:
# Size of the validation movie set.
len(test_user_valid_movieids)

170

In [27]:
# Validation score for user 3: the fraction of held-out movies that appear among the recommendations.
len(set(recommend_movie_ids) & set(test_user_valid_movieids)) / len(test_user_valid_movieids)

0.8411764705882353

## test_유저의 valid score 확인(일반화)

In [28]:
# Setup for the all-users evaluation: put the parent of the working directory on sys.path
# and import the recommender helper from Utils.utils.
sys.path.append(os.path.join(os.path.abspath(os.path.curdir),".."))
from Utils.utils import model_recommend_movies
# Model name: relative path of the Annoy index file saved as "ALS_angular_64".
model_name = './ALS_angular_64'

In [29]:
# Hold-out evaluation over every test user.
# For each user the movie history is split at random: a (1 - valid_ratio) share
# is fed to the recommender and the rest is held out. The score is the fraction
# of held-out movies that the recommender returns.
scores = []
for user, movie_list in tqdm(test_user_movie_dict.items()):
    # Same split rule as the single-user experiment, so `valid_ratio` is honoured here too.
    # With valid_ratio = 0.5 this equals len(movie_list) // 2.
    prefer_cnt = int(len(movie_list) * (1 - valid_ratio))
    user_prefer_movieids = random.sample(movie_list, prefer_cnt)            # movie ids used as inference input
    user_valid_movieids = list(set(movie_list) - set(user_prefer_movieids)) # held-out (validation) movie ids
    if not user_valid_movieids:
        # Nothing was held out (e.g. valid_ratio == 0): the score is undefined, skip the user.
        continue
    user_rec_movieids = model_recommend_movies("INFP","5w4",user_prefer_movieids,10,model_name) # recommended movie ids
    inter_cnt = len(set(user_valid_movieids) & set(user_rec_movieids))      # held-out movies that were recommended
    score = inter_cnt / len(user_valid_movieids)                            # fraction of held-out movies recovered
    scores.append(score)

100%|██████████| 16253/16253 [04:26<00:00, 61.01it/s]


In [30]:
# scores

In [31]:
import numpy as np

# Summarise the per-user validation scores: mean and standard deviation (np.std defaults).
mean, stdev = np.mean(scores), np.std(scores)
print(f'test_유저의 valid 스코어의 평균: {mean}')
print(f'test_유저의 valid 스코어의 표준편차: {stdev}')

test_유저의 valid 스코어의 평균: 0.5561527517997764
test_유저의 valid 스코어의 표준편차: 0.2351874599828928
