In [2]:
from scipy import sparse
from scipy.sparse import csr_matrix, lil_matrix
import implicit
import annoy
import sys
import pandas as pd
import os
import numpy as np

In [3]:
# Root folder of the input data; the MovieLens 25M CSVs live under ml-25m/.
data_path = "/opt/ml/input/project/data/"
ratings = pd.read_csv(data_path+"ml-25m/ratings.csv")
movies = pd.read_csv(data_path + "ml-25m/movies.csv")

# Attach the movie title to every interaction, overwrite the rating with a
# constant 1 (every rated movie counts as one positive interaction) and
# discard the timestamp column.
ratings = (
    ratings
    .merge(movies[["movieId", "title"]], how="left", on="movieId")
    .assign(rating=1)
    .drop(columns="timestamp")
)


In [5]:
# Preview the prepared interactions: userId, movieId, constant rating and title.
ratings.head()

Unnamed: 0,userId,movieId,rating,title
0,1,296,1,Pulp Fiction (1994)
1,1,306,1,Three Colors: Red (Trois couleurs: Rouge) (1994)
2,1,307,1,Three Colors: Blue (Trois couleurs: Bleu) (1993)
3,1,665,1,Underground (1995)
4,1,899,1,Singin' in the Rain (1952)


In [4]:
# Load the list of movies to index for similarity search (columns shown further down: title, movieId).
inter_mbti_ml_title_movieid = pd.read_csv(data_path+"inter_mbti_ml_title_movieId.csv")

In [5]:
# Contiguous 0-based indices, assigned in order of first appearance in `ratings`.
unique_users = ratings['userId'].unique()
unique_movies = ratings['movieId'].unique()

user2idx = {user_id: pos for pos, user_id in enumerate(unique_users)}    # userId  -> index
item2idx = {movie_id: pos for pos, movie_id in enumerate(unique_movies)}  # movieId -> index
idx2item = dict(enumerate(unique_movies))                                 # index   -> movieId

In [6]:
ratings['useridx'] = ratings['userId'].map(user2idx) # convert userId -> index
ratings['movieidx'] = ratings['movieId'].map(item2idx) # convert movieId -> index

In [9]:
# Build the interaction matrix in CSR form: rows are user indices,
# columns are movie indices, values are the (constant) ratings as floats.
interaction_values = ratings['rating'].astype(float)
row_positions = ratings['useridx']
col_positions = ratings['movieidx']
sparse_user_item = csr_matrix((interaction_values, (row_positions, col_positions)))

In [12]:
# Hyper-parameters of the ALS model, gathered in one place so they are easy to tune.
als_params = dict(
    factors=64,
    regularization=0.1,
    iterations=200,
    calculate_training_loss=False,
    use_gpu=True,
)
als_model = implicit.als.AlternatingLeastSquares(**als_params)  # initialise the model

In [14]:
# Train on the user x item matrix.
als_model.fit(sparse_user_item)

# Keep only the item vectors, converted to a NumPy array.
item_factors = als_model.item_factors
item_vecs = item_factors.to_numpy()

  0%|          | 0/200 [00:00<?, ?it/s]

In [18]:
# Sanity check of the item-vector matrix dimensions.
# NOTE(review): the recorded output shows 20 columns although the model above is
# configured with factors = 64 -- the output looks stale; re-run to confirm.
item_vecs.shape

(59047, 20)

In [19]:
np.save("movie_vecs",item_vecs) # just in case, keep a copy of the item vectors in NumPy format

In [14]:
# Reload the item vectors written by np.save (which appends the .npy extension).
item_vecs = np.load("movie_vecs.npy")

In [8]:
# Re-open a previously saved Annoy index from the file "similarity".
# The dimension passed to AnnoyIndex must equal the length of the vectors the
# index was built from, so it is taken from item_vecs instead of a hard-coded 20
# that silently goes out of sync when the ALS `factors` setting changes.
similar_items_index = annoy.AnnoyIndex(item_vecs.shape[1], 'angular')
similar_items_index.load("similarity")

True

In [9]:
# Translate each listed movieId into its ALS item index; ids that are absent from item2idx become NaN.
inter_mbti_ml_title_movieid['movieidx'] = inter_mbti_ml_title_movieid['movieId'].map(item2idx)


In [10]:
# Show the titles whose movieId has no entry in item2idx (movieidx is NaN).
inter_mbti_ml_title_movieid[inter_mbti_ml_title_movieid['movieidx'].isnull()]

Unnamed: 0,title,movieId,movieidx
220,marry me,150834,
270,the three musketeers,150106,
432,the intruder,151939,
435,the kitchen,138086,
671,the last of the mohicans,167876,
1370,the coldest game,208655,
1434,lost,132512,
1482,the father,187779,
1877,the bounty hunter,125179,
2030,homeward bound,161694,


In [11]:
# Discard every row that contains a missing value (this removes the titles
# without an item index) and store movieidx as an integer so it can be used
# to index into the item-vector array.
inter_mbti_ml_title_movieid = (
    inter_mbti_ml_title_movieid
    .dropna()
    .astype({'movieidx': int})
)

In [12]:
# Preview the cleaned title list with its integer movieidx column.
inter_mbti_ml_title_movieid.head()

Unnamed: 0,title,movieId,movieidx
0,moana,73141,20792
1,inside out,122470,24215
2,zootopia,152081,905
3,legally blonde,4447,358
4,dead poets society,1246,107


In [21]:
# Size the index from the vectors themselves. A hard-coded 20 makes add_item fail
# as soon as the ALS `factors` setting (64 in the model cell) differs from it.
similar_items_index = annoy.AnnoyIndex(item_vecs.shape[1], 'angular')

In [22]:
n_trees = 40
# Items are stored under their original movieId (idx -> movieId), so the index
# can later be queried with a movieId directly.
for movie_idx in inter_mbti_ml_title_movieid['movieidx']:
    movie_id = idx2item[movie_idx]
    similar_items_index.add_item(movie_id, item_vecs[movie_idx])
similar_items_index.build(n_trees)

True

In [41]:
# Persist the built index to the file "similarity" (the same name the load cell reads).
similar_items_index.save("similarity")

True

In [18]:
def similar_item(id,N):
    """Return the titles of the movies most similar to the movie ``id``.

    Queries the module-level Annoy index ``similar_items_index`` (whose item
    ids are movieIds) for the ``N`` nearest neighbours of ``id`` and maps them
    to titles via ``inter_mbti_ml_title_movieid``.

    Parameters
    ----------
    id : int
        movieId of the query movie (the name shadows the builtin ``id``; it is
        kept so existing keyword callers keep working).
    N : int
        Number of neighbours requested from the index, query movie included.

    Returns
    -------
    list of str
        At most ``N - 1`` titles, nearest first, never containing the query
        movie itself. Shorter when the index holds fewer than ``N`` items.

    Raises
    ------
    KeyError
        If a returned neighbour id has no row in ``inter_mbti_ml_title_movieid``.
    """
    neighbour_ids = similar_items_index.get_nns_by_item(id, N)

    # One movieId -> title lookup table instead of filtering the whole frame
    # once per neighbour; the first title wins when a movieId is listed twice.
    title_by_movie_id = (
        inter_mbti_ml_title_movieid
        .drop_duplicates(subset='movieId')
        .set_index('movieId')['title']
        .to_dict()
    )

    # Exclude the query explicitly rather than assuming it is the first hit,
    # and iterate over what was actually returned (may be fewer than N).
    titles = [title_by_movie_id[movie_id] for movie_id in neighbour_ids if movie_id != id]
    return titles[:max(N - 1, 0)]
    

In [27]:
# Smoke test: request 10 neighbours for the second distinct movieId in the title list.
movieId_list = inter_mbti_ml_title_movieid['movieId'].unique()
result = similar_item(movieId_list[1],10)

In [28]:
# Titles returned by similar_item for the query above.
result

['rough night',
 'mike & dave need wedding dates',
 'when we first met',
 "isn't it romantic",
 'popstar: never stop never stopping',
 "daddy's home 2",
 'blockers',
 'the emoji movie',
 'uglydolls']