## 14-2장. 데이터 분석 예제 - MovieLens

In [1]:
import numpy as np
import pandas as pd 
import matplotlib as mpl 
import matplotlib.pyplot as plt 
import seaborn as sns
sns.set(style='whitegrid', palette="pastel")

import warnings
warnings.filterwarnings("ignore")

###### 6,040명이 3,900개의 영화에 대해 평가한 1,000,209개의 영화평점 데이터. table 파일 형태(.dat)로 되어 있으며, 구분자는 ::, header는 없다.  
<b>1. ratings.dat (UserID::MovieID::Rating::Timestamp) </b>
- UserIDs range between 1 and 6040 
- MovieIDs range between 1 and 3952
- Ratings are made on a 5-star scale (whole-star ratings only)
- Timestamp is represented in seconds since the epoch as returned by time(2)
- Each user has at least 20 ratings

<b>2. users.dat (UserID::Gender::Age::Occupation::Zip-code) </b>
##### column = age 
* 1:  "Under 18"
* 18:  "18-24"
* 25:  "25-34"
* 35:  "35-44"
* 45:  "45-49"
* 50:  "50-55"
* 56:  "56+"

##### column = Occupation 
*  0:  "other" or not specified
*  1:  "academic/educator"
*  2:  "artist"
*  3:  "clerical/admin"
*  4:  "college/grad student"
*  5:  "customer service"
*  6:  "doctor/health care"
*  7:  "executive/managerial"
*  8:  "farmer"
*  9:  "homemaker"
* 10:  "K-12 student"
* 11:  "lawyer"
* 12:  "programmer"
* 13:  "retired"
* 14:  "sales/marketing"
* 15:  "scientist"
* 16:  "self-employed"
* 17:  "technician/engineer"
* 18:  "tradesman/craftsman"
* 19:  "unemployed"
* 20:  "writer"

<b>3. movie.dat (MovieID::Title::Genres) </b>
##### colums = Genres (장르)
* Action
* Adventure
* Animation
* Children's
* Comedy
* Crime
* Documentary
* Drama
* Fantasy
* Film-Noir
* Horror
* Musical
* Mystery
* Romance
* Sci-Fi
* Thriller
* War
* Western

In [2]:
#### 데이터 로딩 : header가 없으므로 컬럼명을 지정해 주고, 구분자도 지정해주어야 함 

unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
mnames = ['movie_id', 'title', 'genres']

users = pd.read_table('datasets/movielens/users.dat', sep='::', header = None, names = unames)
ratings = pd.read_table('datasets/movielens/ratings.dat', sep='::', header = None, names = rnames)
movies = pd.read_table('datasets/movielens/movies.dat', sep='::', header = None, names = mnames)

In [3]:
users.head()
# ratings.head()
# movies.head()

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [4]:
#### merge로 테이블들 병합 
data = pd.merge(pd.merge(users, ratings), movies)
print(data.shape)
data.head()

(1000209, 10)


Unnamed: 0,user_id,gender,age,occupation,zip,movie_id,rating,timestamp,title,genres
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama


In [5]:
#################################################################################################
########### 1. 여성에서 좋은 평점을 받은 영화는 무엇인가? 
#################################################################################################

In [6]:
mean_ratings = data.pivot_table('rating', index = 'title', columns = 'gender', aggfunc = 'mean')
mean_ratings

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",3.375000,2.761905
'Night Mother (1986),3.388889,3.352941
'Til There Was You (1997),2.675676,2.733333
"'burbs, The (1989)",2.793478,2.962085
...And Justice for All (1979),3.828571,3.689024
...,...,...
"Zed & Two Noughts, A (1985)",3.500000,3.380952
Zero Effect (1998),3.864407,3.723140
Zero Kelvin (Kj�rlighetens kj�tere) (1995),,3.500000
Zeus and Roxanne (1997),2.777778,2.357143


In [7]:
#### 영화가 너무 많다. 평점갯수 250개 이상만 추려내자 
rating_by_title = data.groupby('title').size()
print( rating_by_title )

active_titles = rating_by_title.index[rating_by_title > 250]
active_titles

title
$1,000,000 Duck (1971)                         37
'Night Mother (1986)                           70
'Til There Was You (1997)                      52
'burbs, The (1989)                            303
...And Justice for All (1979)                 199
                                             ... 
Zed & Two Noughts, A (1985)                    29
Zero Effect (1998)                            301
Zero Kelvin (Kj�rlighetens kj�tere) (1995)      2
Zeus and Roxanne (1997)                        23
eXistenZ (1999)                               410
Length: 3706, dtype: int64


Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
       '101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
       '13th Warrior, The (1999)', '2 Days in the Valley (1996)',
       '20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
       '2010 (1984)',
       ...
       'X-Men (2000)', 'Year of Living Dangerously (1982)',
       'Yellow Submarine (1968)', 'You've Got Mail (1998)',
       'Young Frankenstein (1974)', 'Young Guns (1988)',
       'Young Guns II (1990)', 'Young Sherlock Holmes (1985)',
       'Zero Effect (1998)', 'eXistenZ (1999)'],
      dtype='object', name='title', length=1214)

In [8]:
mean_ratings = mean_ratings.loc[active_titles]
mean_ratings

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"'burbs, The (1989)",2.793478,2.962085
10 Things I Hate About You (1999),3.646552,3.311966
101 Dalmatians (1961),3.791444,3.500000
101 Dalmatians (1996),3.240000,2.911215
12 Angry Men (1957),4.184397,4.328421
...,...,...
Young Guns (1988),3.371795,3.425620
Young Guns II (1990),2.934783,2.904025
Young Sherlock Holmes (1985),3.514706,3.363344
Zero Effect (1998),3.864407,3.723140


In [9]:
#### 여성들이 매긴 평점 순으로 정렬 
female_like = mean_ratings.sort_values(by='F', ascending = False)
female_like.head(10)

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Close Shave, A (1995)",4.644444,4.473795
"Wrong Trousers, The (1993)",4.588235,4.478261
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),4.57265,4.464589
Wallace & Gromit: The Best of Aardman Animation (1996),4.563107,4.385075
Schindler's List (1993),4.562602,4.491415
"Shawshank Redemption, The (1994)",4.539075,4.560625
"Grand Day Out, A (1992)",4.537879,4.293255
To Kill a Mockingbird (1962),4.536667,4.372611
Creature Comforts (1990),4.513889,4.272277
"Usual Suspects, The (1995)",4.513317,4.518248


In [10]:
#################################################################################################
########### 2. 남녀간 호,불호가 갈리는 영화들은 무엇인가? 
#################################################################################################

In [11]:
mean_ratings['differ'] = mean_ratings['M'] - mean_ratings['F'] 
Man_like = mean_ratings.sort_values(by='differ', ascending = False)
Woman_like = mean_ratings.sort_values(by='differ', ascending = True)

print(Man_like.head(10))
print('-------------------------------------------------------------')
print(Woman_like.head(10))

gender                                         F         M    differ
title                                                               
Good, The Bad and The Ugly, The (1966)  3.494949  4.221300  0.726351
Kentucky Fried Movie, The (1977)        2.878788  3.555147  0.676359
Dumb & Dumber (1994)                    2.697987  3.336595  0.638608
Longest Day, The (1962)                 3.411765  4.031447  0.619682
Cable Guy, The (1996)                   2.250000  2.863787  0.613787
Evil Dead II (Dead By Dawn) (1987)      3.297297  3.909283  0.611985
Hidden, The (1987)                      3.137931  3.745098  0.607167
Rocky III (1982)                        2.361702  2.943503  0.581801
Caddyshack (1980)                       3.396135  3.969737  0.573602
For a Few Dollars More (1965)           3.409091  3.953795  0.544704
-------------------------------------------------------------
gender                                        F         M    differ
title                                     

In [None]:
#################################################################################################
########### 3. 호,불호가 갈리는 영화들은 무엇인가? (평점의 var or std가 높은 영화)
#################################################################################################

In [21]:
rating_var_title = data.groupby('title')['rating'].var()
a = rating_var_title.sort_values(ascending = False)
a[:10]

title
Foreign Student (1994)                                             8.000000
Criminal Lovers (Les Amants Criminels) (1999)                      5.333333
Identification of a Woman (Identificazione di una donna) (1982)    4.500000
Sunset Park (1996)                                                 4.500000
Eaten Alive (1976)                                                 4.500000
Neon Bible, The (1995)                                             4.500000
Talk of Angels (1998)                                              4.500000
Tokyo Fist (1995)                                                  4.500000
Paralyzing Fear: The Story of Polio in America, A (1998)           4.500000
Better Living (1998)                                               4.500000
Name: rating, dtype: float64

In [24]:
rating_std_title = data.groupby('title')['rating'].std()
b = rating_std_title.sort_values(ascending = False)
b[:10]

title
Foreign Student (1994)                                             2.828427
Criminal Lovers (Les Amants Criminels) (1999)                      2.309401
Identification of a Woman (Identificazione di una donna) (1982)    2.121320
Sunset Park (1996)                                                 2.121320
Eaten Alive (1976)                                                 2.121320
Neon Bible, The (1995)                                             2.121320
Talk of Angels (1998)                                              2.121320
Tokyo Fist (1995)                                                  2.121320
Paralyzing Fear: The Story of Polio in America, A (1998)           2.121320
Better Living (1998)                                               2.121320
Name: rating, dtype: float64