# Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
data = pd.read_csv('./data/kaggle_movie/ratings_small.csv')
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
# 현재 가지고있는 데이터의 경우 userid 컬럼과 movieid 컬럼이 따로 존재하므로 아이템 기반 필터링을 만들기 위해서는 user - item 테이블을 따로 만들어야 함.
data = data.pivot_table('rating', index = 'userId', columns='movieId')
data.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,4.0,...,,,,,,,,,,
5,,,4.0,,,,,,,,...,,,,,,,,,,


In [5]:
data.shape


(671, 9066)

In [8]:
# 이제 해당 테이블을 통해 사용자 아이디별 영화에 대한 평점을 볼 수 있지만 영화 title이 없다는 문제가 존재
# 따라서 영화 title을 가져와서 merge
ratings = pd.read_csv('./data/kaggle_movie/ratings_small.csv')
movies = pd.read_csv('./data/kaggle_movie/movies_metadata.csv')


  movies = pd.read_csv('./data/kaggle_movie/movies_metadata.csv')


In [9]:
movies.rename(columns={'id':'movieId'}, inplace=True)
# merge를 위한 컬럼 이름 변경

In [13]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [12]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   movieId                45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [17]:
# merge를 위해 그 전에 int형변환에 에러가 되는 행 삭제 
movies.drop([35587,19730,29503], axis=0, inplace=True)

In [18]:
movies['movieId'] = movies['movieId'].astype('int')

In [19]:
ratings_movies = pd.merge(ratings, movies, on='movieId')

In [20]:
ratings_movies.head()

Unnamed: 0,userId,movieId,rating,timestamp,adult,belongs_to_collection,budget,genres,homepage,imdb_id,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,1,1371,2.5,1260759135,False,"{'id': 1575, 'name': 'Rocky Collection', 'post...",17000000,"[{'id': 18, 'name': 'Drama'}]",,tt0084602,...,1982-05-28,270000000.0,99.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The greatest challenge.,Rocky III,False,6.6,894.0
1,4,1371,4.0,949810302,False,"{'id': 1575, 'name': 'Rocky Collection', 'post...",17000000,"[{'id': 18, 'name': 'Drama'}]",,tt0084602,...,1982-05-28,270000000.0,99.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The greatest challenge.,Rocky III,False,6.6,894.0
2,7,1371,3.0,851869160,False,"{'id': 1575, 'name': 'Rocky Collection', 'post...",17000000,"[{'id': 18, 'name': 'Drama'}]",,tt0084602,...,1982-05-28,270000000.0,99.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The greatest challenge.,Rocky III,False,6.6,894.0
3,19,1371,4.0,855193404,False,"{'id': 1575, 'name': 'Rocky Collection', 'post...",17000000,"[{'id': 18, 'name': 'Drama'}]",,tt0084602,...,1982-05-28,270000000.0,99.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The greatest challenge.,Rocky III,False,6.6,894.0
4,21,1371,3.0,853852263,False,"{'id': 1575, 'name': 'Rocky Collection', 'post...",17000000,"[{'id': 18, 'name': 'Drama'}]",,tt0084602,...,1982-05-28,270000000.0,99.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The greatest challenge.,Rocky III,False,6.6,894.0


In [21]:
ratings_movies.shape

(44994, 27)

In [23]:
data = ratings_movies.pivot_table('rating', index = 'userId', columns='title').fillna(0)

In [24]:
data.head()

title,!Women Art Revolution,'Gator Bait,'Twas the Night Before Christmas,...And God Created Woman,00 Schneider - Jagd auf Nihil Baxter,10 Items or Less,10 Things I Hate About You,"10,000 BC",11'09''01 - September 11,12 Angry Men,...,Zodiac,Zombie Flesh Eaters,Zombie Holocaust,Zozo,eXistenZ,xXx,¡Three Amigos!,À nos amours,Ödipussi,Şaban Oğlu Şaban
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
data.shape
# 사용자별 영화 title에 따라 평점을 매긴 정보 확인 가능

(671, 2794)

In [26]:
# 아이템 기반 협업 필터링이므로 row가 user기반이면 암됨
# item이 row가 되어야 하므로 row를 item으로 변경 ( cosine 유사도를 구할 때 row 기반으로 유사도를 측정하기 때문)

data = data.transpose()
data.head(2)

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
!Women Art Revolution,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Gator Bait,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
data.shape

(2794, 671)

# Cosine Similarity

In [28]:
movie_sim = cosine_similarity(data, data)
movie_sim.shape

(2794, 2794)

In [29]:
movie_sim_df = pd.DataFrame(data = movie_sim, index = data.index, columns = data.index)
movie_sim_df.head()

title,!Women Art Revolution,'Gator Bait,'Twas the Night Before Christmas,...And God Created Woman,00 Schneider - Jagd auf Nihil Baxter,10 Items or Less,10 Things I Hate About You,"10,000 BC",11'09''01 - September 11,12 Angry Men,...,Zodiac,Zombie Flesh Eaters,Zombie Holocaust,Zozo,eXistenZ,xXx,¡Three Amigos!,À nos amours,Ödipussi,Şaban Oğlu Şaban
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
!Women Art Revolution,1.0,0.0,0.513704,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.118456,0.0,0.0,0.0,0.0
'Gator Bait,0.0,1.0,0.0,0.0,0.0,0.20739,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.182018,0.0,0.03818,0.0,0.0
'Twas the Night Before Christmas,0.513704,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...And God Created Woman,0.0,0.0,0.0,1.0,0.789352,0.345651,0.0,0.0,0.0,0.0,...,0.0,1.0,0.631169,0.813733,0.339227,0.130013,1.0,0.0,0.0,0.0
00 Schneider - Jagd auf Nihil Baxter,0.0,0.0,0.0,0.789352,1.0,0.27284,0.0,0.0,0.0,0.0,...,0.0,0.789352,0.498214,0.642322,0.267769,0.102626,0.789352,0.210963,0.0,0.0


In [30]:
# 특정 영화와 비교했을 때 그 영화와 유사한 영화들을 추천
movie_sim_df["X-Men Origins: Wolverine"].sort_values(ascending=False)[1:10]

title
Romeo Must Die                        0.649625
The Pawnshop                          0.635039
The Wedding Planner                   0.631669
The Getaway                           0.606240
Dogtown and Z-Boys                    0.501189
The Grapes of Wrath                   0.499450
An Unfinished Life                    0.485643
Conquest of the Planet of the Apes    0.474626
Broken Blossoms                       0.462291
Name: X-Men Origins: Wolverine, dtype: float64

In [31]:
movie_sim_df["Harry Potter and the Half-Blood Prince"].sort_values(ascending=False)[1:10]

title
Harry Potter and the Half-Blood Prince    1.0
The Blue Lagoon                           1.0
Once                                      1.0
The Blue Angel                            1.0
Family Plot                               1.0
I Know Who Killed Me                      1.0
Teorema                                   1.0
Synecdoche, New York                      1.0
A Pyromaniac's Love Story                 1.0
Name: Harry Potter and the Half-Blood Prince, dtype: float64

In [32]:

movie_sim_df["King Kong"].sort_values(ascending=False)[1:10]

title
Soldier of Orange                    0.660110
Fantasia                             0.583460
No End                               0.583460
2046                                 0.583460
Executive Decision                   0.533229
Crusade in Jeans                     0.527759
The Piano Teacher                    0.521862
The Assault                          0.514917
Charlie and the Chocolate Factory    0.505769
Name: King Kong, dtype: float64

In [None]:
# 성능이 별로 좋지 않은 이유는 2개의 데이터 셋이 조금 다르기 때문. ratings와 movies 데이터가 달라서 (ratings_small은 데이터가 적음)
# 따라서 ratings가 모든 영화 정보를 포함하지 않기에 성능이 다름 