### 장르와 테그로 유사한 비슷한 영화 추천하기
- TF-IDF
- cosine similarity

In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from collections import Counter

In [2]:
path = '../archive/data/movielens/'

ratings_df = pd.read_csv(os.path.join(path,'ratings.csv'), encoding='utf-8')
movies_df = pd.read_csv(os.path.join(path,'movies.csv'),index_col='movieId', encoding='utf-8')
tags_df = pd.read_csv(os.path.join(path,'tags.csv'),encoding='utf-8')

### 장르를 이용한 movie representation

In [3]:
total_count = len(movies_df)
total_genres = set(sum(list(map(lambda x : x.split('|'),movies_df['genres'])),[]))
# sum(list , []) 2차원 리스트를 1차원으로 합해줌

In [4]:
genre_count = Counter(sum(list(map(lambda x : x.split('|'),movies_df['genres'])),[]))

In [43]:
## 카운터 함수를 사용하지않고 genre_count 만들기
# genre_count = dict.fromkeys(total_genres)

# for each_genre_list in movies_df['genres']:
#     for genre in each_genre_list.split('|'):
#         if genre_count[genre] == None:
#             genre_count[genre] = 1
#         else:
#             genre_count[genre] = genre_count[genre] + 1

<br>
자주 나오는 장르는 비슷한 영화를 판단할때 별로 중요하지 않다고 판단하고<br><br>
자주 등장하지 않는 장르는 그 영화를 잘 설명해준다고 판단함 => 가중치를 줌
<br><br>

In [5]:
#장르별 가중치 계산
for each_genre in genre_count:
    genre_count[each_genre] = np.log10(total_count/genre_count[each_genre])

# 전체 갯수 / 해당 장르 갯수를 하고 로그를 취해줘서 어느정도 값을 보정함.

In [6]:
genre_representation = pd.DataFrame(columns=sorted(total_genres), index=movies_df.index)

In [7]:
for index, each_row in tqdm(movies_df.iterrows()):
    dict_temp = {i : genre_count[i] for i in each_row['genres'].split('|')}
    genre_representation.loc[index] = dict_temp

9742it [00:06, 1577.49it/s]


### 태그를 이용한 movie representation

In [8]:
tag_column = list(map(lambda x : x.split(',') , tags_df['tag']))
unique_tags = set(tags_df['tag'].values)

In [9]:
from collections import Counter

total_movie_count = len(set(tags_df['movieId']))
tag_count_dict = Counter(tags_df['tag'].values)

In [10]:
tag_idf = dict()
for each_tag in tag_count_dict:
    tag_idf[each_tag] = np.log10(total_movie_count/tag_count_dict[each_tag])

In [11]:
tag_representation = pd.DataFrame(columns=sorted(unique_tags), index=list(set(tags_df['movieId'])))

In [12]:
for movieId , group in tqdm(tags_df.groupby(by='movieId')):
    dict_temp = {i : tag_idf[i] for i in list(group['tag'])}
    tag_representation.loc[movieId] = dict_temp

100%|██████████| 1572/1572 [00:01<00:00, 975.50it/s] 


In [13]:
movie_representation = pd.concat([genre_representation, tag_representation], axis=1).fillna(0)

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

def cos_sim_matrix(a , b):
    cos_sim = cosine_similarity(a , b)
    result_df = pd.DataFrame(data=cos_sim, index=[a.index])
    
    return result_df

In [20]:
cs_df = cos_sim_matrix(movie_representation,movie_representation)

In [21]:
#cs_df.to_pickle('movie_cos.pkl')
#movie_cos = pd.read_pickle('movie_cos.pkl')

In [22]:
cs_df[0].sort_values(ascending=False)

1         1.000000
122918    0.542857
136016    0.503643
65577     0.503643
2294      0.503643
            ...   
26195     0.000000
26176     0.000000
26172     0.000000
26169     0.000000
7301      0.000000
Name: 0, Length: 9742, dtype: float64

In [34]:
cs_df.loc[1]

title                                Toy Story (1995)
genres    Adventure|Animation|Children|Comedy|Fantasy
Name: 1, dtype: object

In [33]:
print(cs_df.loc[122918])
print(cs_df.loc[136016])
print(cs_df.loc[65577])
print(cs_df.loc[2294])

title     Guardians of the Galaxy 2 (2017)
genres             Action|Adventure|Sci-Fi
Name: 122918, dtype: object
title                        The Good Dinosaur (2015)
genres    Adventure|Animation|Children|Comedy|Fantasy
Name: 136016, dtype: object
title                  Tale of Despereaux, The (2008)
genres    Adventure|Animation|Children|Comedy|Fantasy
Name: 65577, dtype: object
title                                     Antz (1998)
genres    Adventure|Animation|Children|Comedy|Fantasy
Name: 2294, dtype: object
