### 장르와 테그로 유사한 비슷한 영화 추천하기
- TF-IDF
- cosine similarity

In [356]:
import os
import pandas as pd

import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from collections import Counter

In [357]:
path = '../archive/data/movielens/'

ratings_df = pd.read_csv(os.path.join(path,'ratings.csv'), encoding='utf-8')
movies_df = pd.read_csv(os.path.join(path,'movies.csv'),index_col='movieId', encoding='utf-8')
tags_df = pd.read_csv(os.path.join(path,'tags.csv'),encoding='utf-8')

### 장르를 이용한 movie representation

In [358]:
total_count = len(movies_df)
total_genres = set(sum(list(map(lambda x : x.split('|'),movies_df['genres'])),[]))
# sum(list , []) 2차원 리스트를 1차원으로 합해줌

In [359]:
genre_count = Counter(sum(list(map(lambda x : x.split('|'),movies_df['genres'])),[]))

In [360]:
genre_count

Counter({'Adventure': 1263,
         'Animation': 611,
         'Children': 664,
         'Comedy': 3756,
         'Fantasy': 779,
         'Romance': 1596,
         'Drama': 4361,
         'Action': 1828,
         'Crime': 1199,
         'Thriller': 1894,
         'Horror': 978,
         'Mystery': 573,
         'Sci-Fi': 980,
         'War': 382,
         'Musical': 334,
         'Documentary': 440,
         'IMAX': 158,
         'Western': 167,
         'Film-Noir': 87,
         '(no genres listed)': 34})

In [26]:
## 카운터 함수를 사용하지않고 genre_count 만들기
# genre_count = dict.fromkeys(total_genres)

# for each_genre_list in movies_df['genres']:
#     for genre in each_genre_list.split('|'):
#         if genre_count[genre] == None:
#             genre_count[genre] = 1
#         else:
#             genre_count[genre] = genre_count[genre] + 1

<br>
자주 나오는 장르는 비슷한 영화를 판단할때 별로 중요하지 않다고 판단하고<br><br>
자주 등장하지 않는 장르는 그 영화를 잘 설명해준다고 판단함 => 가중치를 줌
<br><br>

In [27]:
#장르별 가중치 계산
for each_genre in genre_count:
    genre_count[each_genre] = np.log10(total_count/genre_count[each_genre])

# 전체 갯수 / 해당 장르 갯수를 하고 로그를 취해줘서 어느정도 값을 보정함.

In [28]:
genre_representation = pd.DataFrame(columns=sorted(total_genres), index=movies_df.index)

In [30]:
for index, each_row in tqdm(movies_df.iterrows()):
    try:
        dict_temp = {i : genre_count[i] for i in each_row['genres'].split('|')}
        genre_representation.loc[index] = dict_temp
    except:
        print(index)

9742it [00:03, 3137.80it/s]


### 태그를 이용한 movie representation

In [31]:
tag_column = list(map(lambda x : x.split(',') , tags_df['tag']))
unique_tags = set(tags_df['tag'].values)

In [34]:
from collections import Counter

total_movie_count = len(set(tags_df['movieId']))
tag_count_dict = Counter(tags_df['tag'].values)

In [35]:
tag_idf = dict()
for each_tag in tag_count_dict:
    tag_idf[each_tag] = np.log10(total_movie_count/tag_count_dict[each_tag])

In [36]:
tag_representation = pd.DataFrame(columns=sorted(unique_tags), index=list(set(tags_df['movieId'])))

In [37]:
for movieId , group in tqdm(tags_df.groupby(by='movieId')):
    dict_temp = {i : tag_idf[i] for i in list(group['tag'])}
    tag_representation.loc[movieId] = dict_temp

100%|██████████| 1572/1572 [00:00<00:00, 2209.72it/s]


In [38]:
movie_representation = pd.concat([genre_representation, tag_representation], axis=1).fillna(0)

In [39]:
movie_representation

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,women,wonderwoman,workplace,writing,wrongful imprisonment,wry,younger men,zither,zoe kazan,zombies
1,0.0,0.000000,0.887245,1.202607,1.16648,0.413923,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.887245,0.000000,1.16648,0.000000,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.349062,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.726672,0.000000,1.202607,0.00000,0.413923,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.000000,0.000000,1.202607,0.00000,0.413923,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0,0.349062,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.726672,0.000000,1.202607,0.00000,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
from sklearn.metrics.pairwise import cosine_similarity

def cos_sim_matrix(a , b):
    cos_sim = cosine_similarity(a , b)
    result_df = pd.DataFrame(data=cos_sim, index=[a.index])
    
    return result_df

In [41]:
cs_df = cos_sim_matrix(movie_representation,movie_representation)

In [42]:
#cs_df.to_pickle('movie_cos.pkl')
#movie_cos = pd.read_pickle('movie_cos.pkl')

In [249]:
cs_df.columns = movie_representation.index

In [251]:
cs_df.loc[1].T.sort_values(by=(1,) , ascending=False)[1:7]


Unnamed: 0,1
122918,0.542857
136016,0.503643
65577,0.503643
2294,0.503643
166461,0.503643
53121,0.503643


In [236]:
cs_df[0].sort_values(ascending=False)[:6]

1         1.000000
122918    0.542857
136016    0.503643
65577     0.503643
2294      0.503643
166461    0.503643
Name: 0, dtype: float64

In [51]:
print(movies_df.loc[122918]['title'])
print(movies_df.loc[136016]['title'])
print(movies_df.loc[65577]['title'])
print(movies_df.loc[2294]['title'])
print(movies_df.loc[166461]['title'])

Guardians of the Galaxy 2 (2017)
The Good Dinosaur (2015)
The Tale of Despereaux (2008)
Antz (1998)
Moana (2016)


In [102]:
# 토이스토리 감독
pick_movie = 1
derector = movies_df.loc[pick_movie]['director']

In [150]:
derector_movies = movies_df[movies_df['director'] == derector].sort_values(by='vote_count',ascending=False)
derector_movies.loc[derector_movies.index != pick_movie][:6]

Unnamed: 0_level_0,title,genres,only_title,year,vote_average,vote_count,director,actors
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
115617,Big Hero 6 (2014),Action|Animation|Comedy,Big Hero 6,2014,7.8,6289.0,John Lasseter,"Scott Adsit,Ryan Potter,Daniel Henney,T.J. Mil..."
106696,Frozen (2013),Adventure|Animation|Comedy|Fantasy|Musical|Rom...,Frozen,2013,7.3,5440.0,John Lasseter,"Kristen Bell,Idina Menzel,Jonathan Groff,Josh ..."
8961,The Incredibles (2004),Action|Adventure|Animation|Children|Comedy,The Incredibles,2004,7.4,5290.0,John Lasseter,"Craig T. Nelson,Holly Hunter,Samuel L. Jackson..."
45517,Cars (2006),Animation|Children|Comedy,Cars,2006,6.6,3991.0,John Lasseter,"Owen Wilson,Paul Newman,Bonnie Hunt,Larry the ..."
81847,Tangled (2010),Animation|Children|Comedy|Fantasy|Musical|Roma...,Tangled,2010,7.4,3419.0,John Lasseter,"Mandy Moore,Zachary Levi,Donna Murphy,Delaney ..."
87876,Cars 2 (2011),Adventure|Animation|Children|Comedy|IMAX,Cars 2,2011,5.8,2088.0,John Lasseter,"Owen Wilson,Larry the Cable Guy,Michael Caine,..."


In [77]:
actors = movies_df.loc[1]['actors'].split(',')

In [78]:
#톰 행크스
actors1 = actors[0]

In [83]:
movies_df = movies_df.fillna('None')

In [92]:
cast_idx = []
for i in movies_df['actors'].index :
    if actors1 in movies_df.loc[i]['actors']:
        cast_idx.append(i)

In [355]:
actor_movies = movies_df.loc[cast_idx].sort_values(by='vote_count',ascending=False)
actor_movies.loc[actor_movies.index != pick_movie][:6]

Unnamed: 0_level_0,title,genres,only_title,year,vote_average,vote_count,director,actors
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
356,Forrest Gump (1994),Comedy|Drama|Romance|War,Forrest Gump,1994,8.2,8147.0,Robert Zemeckis,"Tom Hanks,Robin Wright,Gary Sinise,Mykelti Wil..."
2028,Saving Private Ryan (1998),Action|Drama|War,Saving Private Ryan,1998,7.9,5148.0,Steven Spielberg,"Tom Hanks,Matt Damon,Vin Diesel,Tom Sizemore,B..."
78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3,2010,7.6,4710.0,Andrew Stanton,"Tom Hanks,Tim Allen,Ned Beatty,Joan Cusack,Mic..."
3147,The Green Mile (1999),Crime|Drama,The Green Mile,1999,8.2,4166.0,Thomas Newman,"Tom Hanks,Michael Clarke Duncan,David Morse,Bo..."
5989,Catch Me If You Can (2002),Crime|Drama,Catch Me If You Can,2002,7.7,3917.0,John Williams,"Leonardo DiCaprio,Tom Hanks,Christopher Walken..."
3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2,1999,7.3,3914.0,Andrew Stanton,"Tom Hanks,Tim Allen,Joan Cusack,Kelsey Grammer..."


In [None]:
#컨텐츠가 5개 이상 안나오는 것
#하나밖에 없는 거

#투표수 + 평점 적절한 비율

#감독 영화 행렬
#3개 미만은 그냥 제거

#배우 영화 행렬
#3개 미만은 제거

In [315]:
def recommend_movie(movieId):
    title = movies_df.loc[movieId]['only_title']
    
    genre = cs_df.loc[movieId].T.sort_values(by=(movieId,) , ascending=False)[1:7]
    genre_id = genre.index 
    
    derector = movies_df.loc[movieId]['director']
    derector_movies = movies_df[movies_df['director'] == derector].sort_values(by='vote_count',ascending=False)
    derector_id = derector_movies.loc[derector_movies.index != movieId][:6].index
    
    actors = movies_df.loc[movieId]['actors'].split(',')
    main_character1 = actors[0]
    main_character2 = actors[1]
    
    cast_idx1 = []
    cast_idx2 = []
    for i in movies_df['actors'].index :
        if main_character1 in movies_df.loc[i]['actors']:
            cast_idx1.append(i)
            
        if main_character2 in movies_df.loc[i]['actors']:
            cast_idx2.append(i)
    
   x

In [343]:
recommend_movie(2)

{'title': 'Jumanji',
 'genre': Int64Index([46972, 158813, 119655, 80748, 2161, 2162], dtype='int64'),
 'derector': 'Larry J. Franco',
 'derector_movie': Int64Index([72378, 2, 2288, 2517, 26603], dtype='int64', name='movieId'),
 'actor1': 'Robin Williams',
 'actor2': 'Jonathan Hyde',
 'actor1_movie': Int64Index([588, 46972, 1704, 1246, 68793, 119155], dtype='int64', name='movieId'),
 'actor2_movie': Int64Index([], dtype='int64', name='movieId')}

In [344]:
def show_movies(movie_dict):
    print("{}와 비슷한 장르 추천".format(movie_dict['title']))
    print(movies_df.loc[movie_dict['genre'].values]['title'])
    print()
    print("{} 감독 {} 의 작품 추천".format(movie_dict['title'],movie_dict['derector']))
    print(movies_df.loc[movie_dict['derector_movie'].values]['title'])
    print()
    print("{} 의 주연 {} 의 작품 추천".format(movie_dict['title'], movie_dict['actor1']))
    print(movies_df.loc[movie_dict['actor1_movie'].values]['title'])
    print()
    print("{} 의 주연 {} 의 작품 추천".format(movie_dict['title'], movie_dict['actor2']))
    print(movies_df.loc[movie_dict['actor2_movie'].values]['title'])

In [353]:
show_movies(recommend_movie(58559))

The Dark Knight와 비슷한 장르 추천
movieId
109850      Need for Speed (2014)
1377        Batman Returns (1992)
33794        Batman Begins (2005)
86644            Fast Five (2011)
592                 Batman (1989)
102716    Fast & Furious 6 (2013)
Name: title, dtype: object

The Dark Knight 감독 Lucinda Syson 의 작품 추천
movieId
58559        The Dark Knight (2008)
33794          Batman Begins (2005)
104841               Gravity (2013)
102716      Fast & Furious 6 (2013)
87232     X-Men: First Class (2011)
76251               Kick-Ass (2010)
Name: title, dtype: object

The Dark Knight 의 주연 Christian Bale 의 작품 추천
movieId
91529     The Dark Knight Rises (2012)
33794             Batman Begins (2005)
48780              The Prestige (2006)
106916          American Hustle (2013)
148626            The Big Short (2015)
68791      Terminator Salvation (2009)
Name: title, dtype: object

The Dark Knight 의 주연 Michael Caine 의 작품 추천
movieId
109487                    Interstellar (2014)
91529            The Dark Kni