In [3]:
from scipy import sparse
from scipy.sparse import csr_matrix, lil_matrix
import implicit
import annoy
import sys
import pandas as pd
import os
import numpy as np

## Data Load
- ratings : ml interaction 데이터 
- movies : ml movieId, title 데이터 
- inter_mbti_ml_title_movieid : ml, mbti 겹치는 데이터

In [4]:
data_path = "/opt/ml/input/project/data/"
ratings = pd.read_csv(data_path+"ml-25m/ratings.csv")
movies = pd.read_csv(data_path + "ml-25m/movies.csv")

ratings = ratings.merge(movies[["movieId","title"]],how="left",on = "movieId")
ratings['rating'] = 1                               
ratings.drop(columns = "timestamp",inplace = True)

inter_mbti_ml_title_movieid = pd.read_csv(data_path+"inter_mbti_ml_title_movieId.csv")


In [5]:
user2idx = {k:v for v,k in enumerate(ratings['userId'].unique())}
item2idx = {k:v for v,k in enumerate(ratings['movieId'].unique())}
idx2item = {v:k for v,k in enumerate(ratings['movieId'].unique())}

In [6]:
ratings['useridx'] = ratings['userId'].map(user2idx) # userId -> 인덱스로 바꿔줍니다.
ratings['movieidx'] = ratings['movieId'].map(item2idx) # movieId -> 인덱스로 바꿔줍니다.

## ALS 학습 
- csr_matrix 선언

In [7]:
sparse_user_item = sparse.csr_matrix((ratings['rating'].astype(float),
                                      (ratings['useridx'], ratings['movieidx']))) # csr_matrix 생성

- ALS 모델 init 및 학습

In [18]:
als_model = implicit.als.AlternatingLeastSquares(
                                factors = 64,
                                regularization = 0.1,
                                iterations = 200,
                                calculate_training_loss = False,
                                use_gpu = True
                            ) # 모델 init
als_model.fit(sparse_user_item) # 학습
item_vecs = als_model.item_factors.to_numpy() # item vector만 빼옵니다.
np.save("movie_vecs",item_vecs) # 혹시 몰라 numpy 형태로 item vector를 저장
item_vecs.shape

  0%|          | 0/200 [00:00<?, ?it/s]

(59047, 64)

inter_mbti_ml_title_movieid에 인덱스 column추가, null값을 제거합니다

In [9]:
inter_mbti_ml_title_movieid['movieidx'] = inter_mbti_ml_title_movieid['movieId'].map(item2idx)

In [13]:
display(inter_mbti_ml_title_movieid[inter_mbti_ml_title_movieid['movieidx'].isnull()].head())
print("null값 개수 : ",inter_mbti_ml_title_movieid[inter_mbti_ml_title_movieid['movieidx'].isnull()].shape[0])

Unnamed: 0,title,movieId,movieidx
220,marry me,150834,
270,the three musketeers,150106,
432,the intruder,151939,
435,the kitchen,138086,
671,the last of the mohicans,167876,


null값 개수 :  16


In [14]:
inter_mbti_ml_title_movieid.dropna(inplace = True)
inter_mbti_ml_title_movieid['movieidx'] = inter_mbti_ml_title_movieid['movieidx'].astype(int)

annoy init 부분
- AnnoyIndex(item_vector_size, 유사도 측정 방법)
- 유사도 측정방법
    - angular : cosine similarity
    - euclidean : l2-norm distance
    - manhattan : l1-norm distance
    - hamming : binary vector에만 사용
    - dot : dot product

In [16]:
similar_items_index = annoy.AnnoyIndex(64,'euclidean')

- add_item : movieId를 인덱스로 하는 embedding vector들을 annoy에 하나씩 넣어줍니다.
- build : add_item을 통해 받은 vector들의 유사도를 계산합니다.

In [19]:
n_trees = 20
for idx in inter_mbti_ml_title_movieid['movieidx']:
    similar_items_index.add_item(idx2item[idx],item_vecs[idx]) # idx -> movieId로 바꿔서 annoy에 저장합니다.
similar_items_index.build(n_trees)

True

- save : 파일로 저장합니다.
- load : 저장한 파일을 불러옵니다.

In [37]:
# similar_items_index.save("ALS_euclidean_64")

True

In [38]:
similar_items_index = annoy.AnnoyIndex(64,'angular')
similar_items_index.load("ALS_64")

True

In [39]:
def similar_item(id,N):
    result = []
    neighbour,dist = similar_items_index.get_nns_by_item(id, N+1, include_distances=True)
    for i in range(0,N):
        result.append(inter_mbti_ml_title_movieid[inter_mbti_ml_title_movieid['movieId']==neighbour[i]]['title'].values[0])
    return result[0],result[1:],dist[1:]
    

In [47]:
movieId_list = inter_mbti_ml_title_movieid['movieId'].unique()
title,result,dist = similar_item(movieId_list[2],10)

In [43]:
print("title : ",title)
result,dist

title :  zootopia


(['kubo and the two strings',
  'the secret life of pets',
  'the jungle book',
  'arrival',
  'the good dinosaur',
  'the martian',
  'hidden figures',
  'thor: ragnarok',
  'trolls'],
 [0.5943165421485901,
  0.6597697734832764,
  0.6920070052146912,
  0.7352758049964905,
  0.7466049194335938,
  0.75857013463974,
  0.7666743993759155,
  0.777453601360321,
  0.793731153011322,
  0.7965688109397888])

In [46]:
similar_items_index = annoy.AnnoyIndex(64,'euclidean')
similar_items_index.load("ALS_euclidean_64")

True

In [48]:
print("title : ",title)
result,dist

title :  zootopia


(['big hero 6',
  'the lego movie',
  'wreck-it ralph',
  'brave',
  'monsters university',
  'megamind',
  'tangled',
  'despicable me',
  'cloudy with a chance of meatballs'],
 [1.3419342041015625,
  1.47872793674469,
  1.4849172830581665,
  1.5692397356033325,
  1.5921216011047363,
  1.7869476079940796,
  1.7883639335632324,
  1.8061836957931519,
  1.8138846158981323,
  1.8814114332199097])

In [12]:
sys.path.append(os.path.join(os.path.abspath(os.path.curdir),".."))
from Utils.utils import model_recommend_movies

In [14]:
model_recommend_movies("ISTP", "1w9", [73141], 10, model_path='../interaction_model/ALS_64')

[171779, 33380, 192869, 156732, 152047, 170739, 92660, 116092, 124476, 116702]