# 1장 pandas 기초

- pandas 데이터의 각 column은 단일 data type만 저장한다.
- DataFrame 데이터의 각 column을 선택할 수 있다. 그 결과는 Series를 반환한다.

## 0. import / data 설명

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
%matplotlib inline

#### data 설명
```cmd
color: 흑백/컬러
director_name: 감독이름
num_critic_for_reviews: 비평가 review 개수
duration: 상영시간
director_facebook_likes: 감독 좋아요 개수
actor_3_facebook_likes: actor3 좋아요 개수
actor_2_name: actor2 이름
actor_1_facebook_likes: actor1 좋아요 개수
gross: 총수익
genres: 장르
actor_1_name: actor1 이름
movie_title: 영화제목
num_voted_users: 투표한 user?
cast_total_facebook_likes: 모든 출연자의 좋아요 개수
actor_3_name: actor3 이름
facenumber_in_poster: 영화 포스터에 얼굴 수?
plot_keywords: 키워드
movie_imdb_link: IMDB링크
num_user_for_reviews: user review 개수
language: 언어
country: 국가
content_rating: 심의등급
budget: 제작비
title_year: 출시년도
actor_2_facebook_likes: actor2 좋아요 개수
imdb_score: IMDB Score
aspect_ratio: 종횡비
movie_facebook_likes: 영화 좋아요 개수
```

## 1. pandas 주요 구성 요소
- index, index label
- columns
- axis 0
- axis 1
- data

In [2]:
movie = pd.read_csv('../data/movie.csv')
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


- color, director_name 등을 column name
- 0, 1, 2 등을 index, index는 index label을 총칭하는 단어
- index 방향을 axis 0 (수직축)
- 가로 방향을 axis 1 (수평축)

- head(n) 상위 n개
- tail(n) 하위 n개

## 2. DataFrame의 주요 attribute

### attribute
- index
- columns
- values

### index

In [3]:
# index는 range 와 유사하게 생겼다.
index = movie.index
index

RangeIndex(start=0, stop=4916, step=1)

In [4]:
# list
[val for val in index]

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [5]:
len(index.values)

4916

In [6]:
index.values

array([   0,    1,    2, ..., 4913, 4914, 4915], dtype=int64)

### columns

In [7]:
columns = movie.columns
columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [8]:
len(columns)

28

In [9]:
columns.values

array(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes',
       'actor_2_name', 'actor_1_facebook_likes', 'gross', 'genres',
       'actor_1_name', 'movie_title', 'num_voted_users',
       'cast_total_facebook_likes', 'actor_3_name',
       'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link',
       'num_user_for_reviews', 'language', 'country', 'content_rating',
       'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score',
       'aspect_ratio', 'movie_facebook_likes'], dtype=object)

### values

In [10]:
data = movie.values
data

array([['Color', 'James Cameron', 723.0, ..., 7.9, 1.78, 33000],
       ['Color', 'Gore Verbinski', 302.0, ..., 7.1, 2.35, 0],
       ['Color', 'Sam Mendes', 602.0, ..., 6.8, 2.35, 85000],
       ...,
       ['Color', 'Benjamin Roberds', 13.0, ..., 6.3, nan, 16],
       ['Color', 'Daniel Hsia', 14.0, ..., 6.3, 2.35, 660],
       ['Color', 'Jon Gunn', 43.0, ..., 6.6, 1.85, 456]], dtype=object)

- index, columns, 데이터 모두 Numpy의 ndarray로 만들어져있다.

## 3. pandas의 Data Type

- 중요한거 몇 가지만 기억하자.
    - Boolean: bool
    - Integer: int
    - Float: float
    - Object: 문자열, 다른 파이썬 객체(tuple, list, dict)
    - Datetime
    - Timedelta

In [11]:
# DataFrame 각 열의 Data Type을 표시
movie.dtypes

color                         object
director_name                 object
num_critic_for_reviews       float64
duration                     float64
director_facebook_likes      float64
actor_3_facebook_likes       float64
actor_2_name                  object
actor_1_facebook_likes       float64
gross                        float64
genres                        object
actor_1_name                  object
movie_title                   object
num_voted_users                int64
cast_total_facebook_likes      int64
actor_3_name                  object
facenumber_in_poster         float64
plot_keywords                 object
movie_imdb_link               object
num_user_for_reviews         float64
language                      object
country                       object
content_rating                object
budget                       float64
title_year                   float64
actor_2_facebook_likes       float64
imdb_score                   float64
aspect_ratio                 float64
m

In [12]:
# 각 type 별 개수를 확인한다.
movie.get_dtype_counts()

float64    13
int64       3
object     12
dtype: int64

## 4. 데이터 단일 column을 Series로 선택하기
- indexing operator
- dot notation: 추천하지 않는다.

In [13]:
movie['director_name']

0            James Cameron
1           Gore Verbinski
2               Sam Mendes
3        Christopher Nolan
4              Doug Walker
5           Andrew Stanton
6                Sam Raimi
7             Nathan Greno
8              Joss Whedon
9              David Yates
10             Zack Snyder
11            Bryan Singer
12            Marc Forster
13          Gore Verbinski
14          Gore Verbinski
15             Zack Snyder
16          Andrew Adamson
17             Joss Whedon
18            Rob Marshall
19        Barry Sonnenfeld
20           Peter Jackson
21               Marc Webb
22            Ridley Scott
23           Peter Jackson
24             Chris Weitz
25           Peter Jackson
26           James Cameron
27           Anthony Russo
28              Peter Berg
29         Colin Trevorrow
               ...        
4886            Eric Eason
4887              Uwe Boll
4888     Richard Linklater
4889       Joseph Mazzella
4890          Travis Legge
4891         Alex Kendrick
4

In [14]:
movie.director_name

0            James Cameron
1           Gore Verbinski
2               Sam Mendes
3        Christopher Nolan
4              Doug Walker
5           Andrew Stanton
6                Sam Raimi
7             Nathan Greno
8              Joss Whedon
9              David Yates
10             Zack Snyder
11            Bryan Singer
12            Marc Forster
13          Gore Verbinski
14          Gore Verbinski
15             Zack Snyder
16          Andrew Adamson
17             Joss Whedon
18            Rob Marshall
19        Barry Sonnenfeld
20           Peter Jackson
21               Marc Webb
22            Ridley Scott
23           Peter Jackson
24             Chris Weitz
25           Peter Jackson
26           James Cameron
27           Anthony Russo
28              Peter Berg
29         Colin Trevorrow
               ...        
4886            Eric Eason
4887              Uwe Boll
4888     Richard Linklater
4889       Joseph Mazzella
4890          Travis Legge
4891         Alex Kendrick
4

### tip
- shift + tab + tab

In [15]:
# Series의 name attribute
director = movie['director_name']
director.name

'director_name'

In [16]:
# to_frame() 메서드를 이용하여 Series를 단일 column을 가진 DafaFrame으로 변환한다.
# Series의 이름을 column name으로 사용한다.
director.to_frame().head()

Unnamed: 0,director_name
0,James Cameron
1,Gore Verbinski
2,Sam Mendes
3,Christopher Nolan
4,Doug Walker


### tip. Select row

In [117]:
movie2.loc['Avatar']

color                                                                    Color
director_name                                                    James Cameron
num_critic_for_reviews                                                     723
duration                                                                   178
director_facebook_likes                                                      0
actor_3_facebook_likes                                                     855
actor_2_name                                                  Joel David Moore
actor_1_facebook_likes                                                    1000
gross                                                              7.60506e+08
genres                                         Action|Adventure|Fantasy|Sci-Fi
actor_1_name                                                       CCH Pounder
num_voted_users                                                         886204
cast_total_facebook_likes                           

## 5. Series, DataFrame의 attribute / method
- Series와 DataFrame은 각각 다양한 attribute와 메서드를 가지고 있다.

- dir 함수로 확인할 수 있다.

----
### tip. python 기초: set

In [17]:
set([1, 2, 2, 3, 3])

{1, 2, 3}

In [18]:
(set([1, 2, 3]) & set([2, 3, 4]))

{2, 3}

In [19]:
len((set([1, 2, 3]) & set([2, 3, 4])))

2

----

In [20]:
# Series의 attribute + method
len(dir(pd.Series))

455

In [21]:
# DataFrame의 attribute + method
len(dir(pd.DataFrame))

462

In [22]:
len(set(dir(pd.Series)) & set(dir(pd.DataFrame)))

391

- Series와 DataFrame의 많은 수의 attribute와 메서드를 공유한다는 것을 알 수 있다.

- 1.head() / tail()
- 2.value_counts(): 각 데이터의 빈도 출력, 대략적인 분포 확인, value_counts(normalize=True): 정규화
- 3.size / shape: 데이터의 길이? 개수
- 4.count(): NaN이 아닌 데이터의 개수
- 5.min(), max(), mean(), median(), std(), sum()
- 6.describe(): 숫자 데이터와 아닐 때 각각 결과가 다름
- 7.quantile()
- 8.isnull() / notnull() / isnull().sum() / notnull().sum()
- 9.fillna() / dropna()
- 10.hasnans: True/Fasle

In [23]:
movie = pd.read_csv('../data/movie.csv')

In [24]:
director = movie['director_name']

In [25]:
actor_1_fb_likes = movie['actor_1_facebook_likes']

In [26]:
director.head()

0        James Cameron
1       Gore Verbinski
2           Sam Mendes
3    Christopher Nolan
4          Doug Walker
Name: director_name, dtype: object

In [27]:
actor_1_fb_likes.head()

0     1000.0
1    40000.0
2    11000.0
3    27000.0
4      131.0
Name: actor_1_facebook_likes, dtype: float64

In [28]:
# 각 데이터의 count를 출력
director.value_counts()

Steven Spielberg        26
Woody Allen             22
Clint Eastwood          20
Martin Scorsese         20
Ridley Scott            16
Spike Lee               16
Steven Soderbergh       15
Renny Harlin            15
Tim Burton              14
Oliver Stone            14
Robert Zemeckis         13
Robert Rodriguez        13
Joel Schumacher         13
Ron Howard              13
Barry Levinson          13
Kevin Smith             12
Michael Bay             12
Brian De Palma          12
Tony Scott              12
Francis Ford Coppola    11
Sam Raimi               11
Richard Donner          11
Chris Columbus          11
Shawn Levy              11
Richard Linklater       11
Rob Reiner              11
John Carpenter          10
John McTiernan          10
Stephen Frears          10
Bobby Farrelly          10
                        ..
Allan Dwan               1
Rupert Sanders           1
John D. Hancock          1
Martha Coolidge          1
Hark Tsui                1
Justin Zackham           1
M

In [29]:
actor_1_fb_likes.value_counts()

1000.0     436
11000.0    206
2000.0     189
3000.0     150
12000.0    131
13000.0    123
14000.0    120
10000.0    109
18000.0    106
22000.0     80
15000.0     71
23000.0     55
16000.0     55
4000.0      54
8000.0      51
17000.0     45
26000.0     39
20000.0     38
40000.0     36
21000.0     34
19000.0     31
5000.0      30
24000.0     29
49000.0     27
0.0         26
29000.0     20
6000.0      20
33000.0     18
826.0       17
34000.0     16
          ... 
458.0        1
77000.0      1
763.0        1
961.0        1
701.0        1
123.0        1
575.0        1
481.0        1
107.0        1
279.0        1
188.0        1
619.0        1
652.0        1
237.0        1
764.0        1
335.0        1
494.0        1
732.0        1
712.0        1
91.0         1
437.0        1
406.0        1
762.0        1
432.0        1
644.0        1
362.0        1
216.0        1
859.0        1
225.0        1
334.0        1
Name: actor_1_facebook_likes, Length: 877, dtype: int64

- size, shape, len() 함수 거의 동일한 기능을 한다.

In [30]:
# Series가 속한 DataFrame의 row 수와 동일하다.
director.size

4916

In [31]:
director.shape

(4916,)

In [32]:
len(director)

4916

In [33]:
# NaN 제외
director.count()

4814

In [34]:
len(director) - director.count()

102

In [35]:
director.isnull().sum()

102

In [36]:
(actor_1_fb_likes.min(), actor_1_fb_likes.max(),
 actor_1_fb_likes.mean(), actor_1_fb_likes.median(),
 actor_1_fb_likes.std(), actor_1_fb_likes.sum())

(0.0, 640000.0, 6494.488490527602, 982.0, 15106.986883848185, 31881444.0)

In [37]:
actor_1_fb_likes.describe()

count      4909.000000
mean       6494.488491
std       15106.986884
min           0.000000
25%         607.000000
50%         982.000000
75%       11000.000000
max      640000.000000
Name: actor_1_facebook_likes, dtype: float64

In [38]:
director.describe()

count                 4814
unique                2397
top       Steven Spielberg
freq                    26
Name: director_name, dtype: object

In [39]:
actor_1_fb_likes.quantile(0.2)

510.0

In [40]:
actor_1_fb_likes.quantile([.1, .2, .3, .4, .5, .6, .7, .8, .9])

0.1      240.0
0.2      510.0
0.3      694.0
0.4      854.0
0.5      982.0
0.6     1000.0
0.7     8000.0
0.8    13000.0
0.9    18000.0
Name: actor_1_facebook_likes, dtype: float64

In [41]:
director.isnull()

0       False
1       False
2       False
3       False
4       False
5       False
6       False
7       False
8       False
9       False
10      False
11      False
12      False
13      False
14      False
15      False
16      False
17      False
18      False
19      False
20      False
21      False
22      False
23      False
24      False
25      False
26      False
27      False
28      False
29      False
        ...  
4886    False
4887    False
4888    False
4889    False
4890    False
4891    False
4892    False
4893    False
4894    False
4895    False
4896    False
4897    False
4898    False
4899    False
4900    False
4901    False
4902    False
4903    False
4904    False
4905    False
4906    False
4907    False
4908    False
4909    False
4910    False
4911    False
4912     True
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

In [42]:
director_fill = director.fillna('leesuho')

In [43]:
director_fill.describe()

count        4916
unique       2398
top       leesuho
freq          102
Name: director_name, dtype: object

In [44]:
director_drop = director.dropna()

In [45]:
director_drop.describe()

count                 4814
unique                2397
top       Steven Spielberg
freq                    26
Name: director_name, dtype: object

In [46]:
director.value_counts(normalize=True)

Steven Spielberg        0.005401
Woody Allen             0.004570
Clint Eastwood          0.004155
Martin Scorsese         0.004155
Ridley Scott            0.003324
Spike Lee               0.003324
Steven Soderbergh       0.003116
Renny Harlin            0.003116
Tim Burton              0.002908
Oliver Stone            0.002908
Robert Zemeckis         0.002700
Robert Rodriguez        0.002700
Joel Schumacher         0.002700
Ron Howard              0.002700
Barry Levinson          0.002700
Kevin Smith             0.002493
Michael Bay             0.002493
Brian De Palma          0.002493
Tony Scott              0.002493
Francis Ford Coppola    0.002285
Sam Raimi               0.002285
Richard Donner          0.002285
Chris Columbus          0.002285
Shawn Levy              0.002285
Richard Linklater       0.002285
Rob Reiner              0.002285
John Carpenter          0.002077
John McTiernan          0.002077
Stephen Frears          0.002077
Bobby Farrelly          0.002077
          

In [47]:
director.hasnans

True

In [48]:
director.notnull()

0        True
1        True
2        True
3        True
4        True
5        True
6        True
7        True
8        True
9        True
10       True
11       True
12       True
13       True
14       True
15       True
16       True
17       True
18       True
19       True
20       True
21       True
22       True
23       True
24       True
25       True
26       True
27       True
28       True
29       True
        ...  
4886     True
4887     True
4888     True
4889     True
4890     True
4891     True
4892     True
4893     True
4894     True
4895     True
4896     True
4897     True
4898     True
4899     True
4900     True
4901     True
4902     True
4903     True
4904     True
4905     True
4906     True
4907     True
4908     True
4909     True
4910     True
4911     True
4912    False
4913     True
4914     True
4915     True
Name: director_name, Length: 4916, dtype: bool

In [49]:
director.notnull().sum()

4814

## 6. Series에 연산자 사용하기
- Arithmetic operator
- Comparison operator

In [50]:
movie = pd.read_csv('../data/movie.csv')

In [51]:
imdb = movie['imdb_score']
imdb

0       7.9
1       7.1
2       6.8
3       8.5
4       7.1
5       6.6
6       6.2
7       7.8
8       7.5
9       7.5
10      6.9
11      6.1
12      6.7
13      7.3
14      6.5
15      7.2
16      6.6
17      8.1
18      6.7
19      6.8
20      7.5
21      7.0
22      6.7
23      7.9
24      6.1
25      7.2
26      7.7
27      8.2
28      5.9
29      7.0
       ... 
4886    7.0
4887    6.3
4888    7.1
4889    4.8
4890    3.3
4891    6.9
4892    4.6
4893    3.0
4894    6.6
4895    7.4
4896    6.2
4897    4.0
4898    6.1
4899    6.9
4900    7.5
4901    6.7
4902    7.4
4903    6.1
4904    5.4
4905    6.4
4906    7.0
4907    6.3
4908    6.9
4909    7.8
4910    6.4
4911    7.7
4912    7.5
4913    6.3
4914    6.3
4915    6.6
Name: imdb_score, Length: 4916, dtype: float64

In [52]:
imdb // 7

0       1.0
1       1.0
2       0.0
3       1.0
4       1.0
5       0.0
6       0.0
7       1.0
8       1.0
9       1.0
10      0.0
11      0.0
12      0.0
13      1.0
14      0.0
15      1.0
16      0.0
17      1.0
18      0.0
19      0.0
20      1.0
21      1.0
22      0.0
23      1.0
24      0.0
25      1.0
26      1.0
27      1.0
28      0.0
29      1.0
       ... 
4886    1.0
4887    0.0
4888    1.0
4889    0.0
4890    0.0
4891    0.0
4892    0.0
4893    0.0
4894    0.0
4895    1.0
4896    0.0
4897    0.0
4898    0.0
4899    0.0
4900    1.0
4901    0.0
4902    1.0
4903    0.0
4904    0.0
4905    0.0
4906    1.0
4907    0.0
4908    0.0
4909    1.0
4910    0.0
4911    1.0
4912    1.0
4913    0.0
4914    0.0
4915    0.0
Name: imdb_score, Length: 4916, dtype: float64

In [53]:
imdb > 7

0        True
1        True
2       False
3        True
4        True
5       False
6       False
7        True
8        True
9        True
10      False
11      False
12      False
13       True
14      False
15       True
16      False
17       True
18      False
19      False
20       True
21      False
22      False
23       True
24      False
25       True
26       True
27       True
28      False
29      False
        ...  
4886    False
4887    False
4888     True
4889    False
4890    False
4891    False
4892    False
4893    False
4894    False
4895     True
4896    False
4897    False
4898    False
4899    False
4900     True
4901    False
4902     True
4903    False
4904    False
4905    False
4906    False
4907    False
4908    False
4909     True
4910    False
4911     True
4912     True
4913    False
4914    False
4915    False
Name: imdb_score, Length: 4916, dtype: bool

In [54]:
# James Cameron 감독의 영화를 모두 찾아보자.
movie[movie['director_name']=='James Cameron']

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
26,Color,James Cameron,315.0,194.0,0.0,794.0,Kate Winslet,29000.0,658672302.0,Drama|Romance,...,2528.0,English,USA,PG-13,200000000.0,1997.0,14000.0,7.7,2.35,26000
285,Color,James Cameron,210.0,153.0,0.0,539.0,Jenette Goldstein,780.0,204843350.0,Action|Sci-Fi,...,983.0,English,USA,R,102000000.0,1991.0,604.0,8.5,2.35,13000
288,Color,James Cameron,94.0,141.0,0.0,618.0,Tia Carrere,2000.0,146282411.0,Action|Comedy|Thriller,...,351.0,English,USA,R,115000000.0,1994.0,1000.0,7.2,2.35,0
599,Color,James Cameron,82.0,171.0,0.0,638.0,Todd Graff,2000.0,54222000.0,Adventure|Drama|Sci-Fi|Thriller,...,380.0,English,USA,PG-13,69500000.0,1989.0,650.0,7.6,2.35,0
2455,Color,James Cameron,250.0,154.0,0.0,604.0,Carrie Henn,2000.0,85200000.0,Action|Adventure|Sci-Fi,...,1076.0,English,USA,R,18500000.0,1986.0,626.0,8.4,1.85,18000
3508,Color,James Cameron,204.0,107.0,0.0,255.0,Brian Thompson,2000.0,38400000.0,Action|Sci-Fi,...,692.0,English,UK,R,6500000.0,1984.0,663.0,8.1,1.85,13000


## 7. Series 메서드 체인
- dot notation을 사용해서 메서드를 연속적으로 호출하는 것을 method chaining이라고 한다.
- function chaining이 아니라 method chaining이다.
- Series나 DataFrame 메서드는 또 다른 Series나 DataFrame을 반환하므로 method chaining이 용이하다.

In [55]:
movie = pd.read_csv('../data/movie.csv')

In [56]:
director = movie['director_name']
actor_1_fb_likes = movie['actor_1_facebook_likes']

In [57]:
director.value_counts().head(3)

Steven Spielberg    26
Woody Allen         22
Clint Eastwood      20
Name: director_name, dtype: int64

In [58]:
# NaN을 확인하는 일반적인 방법은 isnull, sum 메서드 체인이다.
director.isnull().sum()

102

- Facebook의 like 개수는 정수다.
- NaN이 존재하는 숫자 column은 datatype이 float이다.
- NaN을 모두 0으로 채운 후에 astype 메서드를 이용하여 정수로 변환할 수 있다.

In [59]:
actor_1_fb_likes.dtype

dtype('float64')

In [60]:
# actor_1_fb_likes.astype(int)
# ValueError: Cannot convert non-finite values (NA or inf) to integer

In [61]:
actor_1_fb_likes.fillna(0)\
                .astype(int)\
                .head()

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int32

- 메서드 체인에 `\` 대신 전체를 `()`로 묶어도 된다.

In [62]:
(actor_1_fb_likes.fillna(0)
                 .astype(int)
                 .head())

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int32

## 8. index를 의미 있게 만들기
- DataFrame의 index는 각 row의 label을 제공한다.
- DataFrame을 생성할 때 명시적인 index를 제공하지 않으면 디폴트로 RangeIndex가 생성된다.
- RangeIndex는 0부터 n-1까지 정수가 부여된다.

In [63]:
movie = pd.read_csv('../data/movie.csv')

- 데이터를 읽어 DataFrame으로 변경 한 후 특정 column을 index로 변환한다.

In [64]:
movie2 = movie.set_index('movie_title')

In [65]:
movie2.head()

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


### tip.Christopher Nolan 감독의 영화를 찾아보자.

In [66]:
movie2[movie2['director_name']=='Christopher Nolan']

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
The Dark Knight,Color,Christopher Nolan,645.0,152.0,22000.0,11000.0,Heath Ledger,23000.0,533316061.0,Action|Crime|Drama|Thriller,...,4667.0,English,USA,PG-13,185000000.0,2008.0,13000.0,9.0,2.35,37000
Interstellar,Color,Christopher Nolan,712.0,169.0,22000.0,6000.0,Anne Hathaway,11000.0,187991439.0,Adventure|Drama|Sci-Fi,...,2725.0,English,USA,PG-13,165000000.0,2014.0,11000.0,8.6,2.35,349000
Inception,Color,Christopher Nolan,642.0,148.0,22000.0,23000.0,Tom Hardy,29000.0,292568851.0,Action|Adventure|Sci-Fi|Thriller,...,2803.0,English,USA,PG-13,160000000.0,2010.0,27000.0,8.8,2.35,175000
Batman Begins,Color,Christopher Nolan,478.0,128.0,22000.0,11000.0,Liam Neeson,23000.0,205343774.0,Action|Adventure,...,2685.0,English,USA,PG-13,150000000.0,2005.0,14000.0,8.3,2.35,15000
Insomnia,Color,Christopher Nolan,185.0,118.0,22000.0,319.0,Maura Tierney,14000.0,67263182.0,Drama|Mystery|Thriller,...,651.0,English,USA,R,46000000.0,2002.0,509.0,7.2,2.35,0
The Prestige,Color,Christopher Nolan,341.0,130.0,22000.0,19000.0,Hugh Jackman,23000.0,53082743.0,Drama|Mystery|Sci-Fi|Thriller,...,1100.0,English,USA,PG-13,40000000.0,2006.0,20000.0,8.5,2.35,49000
Memento,Black and White,Christopher Nolan,274.0,113.0,22000.0,379.0,Thomas Lennon,716.0,25530884.0,Mystery|Thriller,...,2067.0,English,USA,R,9000000.0,2000.0,651.0,8.5,2.35,40000


In [67]:
movie.set_index('movie_title')['director_name'].head()

movie_title
Avatar                                            James Cameron
Pirates of the Caribbean: At World's End         Gore Verbinski
Spectre                                              Sam Mendes
The Dark Knight Rises                         Christopher Nolan
Star Wars: Episode VII - The Force Awakens          Doug Walker
Name: director_name, dtype: object

- 데이터 파일을 로드하는 시점에 특정 column을 index로 지정할 수도 있다.

In [68]:
movie = pd.read_csv('../data/movie.csv', index_col='movie_title')

In [69]:
movie.head()

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


- set_index와 read_csv에서 index_col을 사용하면 해당 column을 DataFrame에서 삭제한다.
- drop keyword param을 False로 설정하면 DataFrame에 그대로 유지된다.

In [70]:
movie2.reset_index()

Unnamed: 0,movie_title,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Avatar,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Spectre,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,131.0,,Rob Walker,131.0,,...,,,,,,,12.0,7.1,,0
5,John Carter,Color,Andrew Stanton,462.0,132.0,475.0,530.0,Samantha Morton,640.0,73058679.0,...,738.0,English,USA,PG-13,263700000.0,2012.0,632.0,6.6,2.35,24000
6,Spider-Man 3,Color,Sam Raimi,392.0,156.0,0.0,4000.0,James Franco,24000.0,336530303.0,...,1902.0,English,USA,PG-13,258000000.0,2007.0,11000.0,6.2,2.35,0
7,Tangled,Color,Nathan Greno,324.0,100.0,15.0,284.0,Donna Murphy,799.0,200807262.0,...,387.0,English,USA,PG,260000000.0,2010.0,553.0,7.8,1.85,29000
8,Avengers: Age of Ultron,Color,Joss Whedon,635.0,141.0,0.0,19000.0,Robert Downey Jr.,26000.0,458991599.0,...,1117.0,English,USA,PG-13,250000000.0,2015.0,21000.0,7.5,2.35,118000
9,Harry Potter and the Half-Blood Prince,Color,David Yates,375.0,153.0,282.0,10000.0,Daniel Radcliffe,25000.0,301956980.0,...,973.0,English,UK,PG,250000000.0,2009.0,11000.0,7.5,2.35,10000


- reset_index를 사용하면 index가 column으로 변환된다.
- index는 RangeIndex로 변환된다.
- index였던 column은 DataFrame의 첫 번째 column이 된다.

## 9. Rename row and column

In [71]:
movie = pd.read_csv('../data/movie.csv', index_col='movie_title')

In [72]:
movie.head()

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [73]:
# 변경하고자하는 데이터를 dict로 만든다.
idx_rename = {'Avatar': 'Evatar', 'Spectre': 'Ertceps'}

In [74]:
# rename 메소드에 index keyword param으로 dict를 전달하여 index를 변경한다.
movie_rename = movie.rename(index=idx_rename)

In [75]:
movie_rename.head()

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Evatar,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Ertceps,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [76]:
col_rename = {'director_name': 'Director Name', 'num_critic_for_reviews': 'Critical Reviews'}

In [77]:
movie_rename = movie.rename(columns=col_rename)

In [78]:
movie_rename.head()

Unnamed: 0_level_0,color,Director Name,Critical Reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [79]:
movie = pd.read_csv('../data/movie.csv', index_col='movie_title')

In [80]:
# DataFrame의 index 객체를 직접 수정하려고 하면 exception이 발생한다.
# movie.index[0] = 'AAAAvatar'
# TypeError: Index does not support mutable operations

In [81]:
# DataFrame의 index 객체를 python list로 변환
index_list = movie.index.tolist()

In [82]:
columns_list = movie.columns.tolist()

In [83]:
# python의 list는 수정할 수 있다.
index_list[0] = 'Ratava'

In [84]:
index_list[2] = 'Ertceps'

In [85]:
columns_list[0] = 'Director Name'

In [86]:
columns_list[2] = 'Critical Reviews'

In [87]:
index_list

['Ratava',
 "Pirates of the Caribbean: At World's End",
 'Ertceps',
 'The Dark Knight Rises',
 'Star Wars: Episode VII - The Force Awakens',
 'John Carter',
 'Spider-Man 3',
 'Tangled',
 'Avengers: Age of Ultron',
 'Harry Potter and the Half-Blood Prince',
 'Batman v Superman: Dawn of Justice',
 'Superman Returns',
 'Quantum of Solace',
 "Pirates of the Caribbean: Dead Man's Chest",
 'The Lone Ranger',
 'Man of Steel',
 'The Chronicles of Narnia: Prince Caspian',
 'The Avengers',
 'Pirates of the Caribbean: On Stranger Tides',
 'Men in Black 3',
 'The Hobbit: The Battle of the Five Armies',
 'The Amazing Spider-Man',
 'Robin Hood',
 'The Hobbit: The Desolation of Smaug',
 'The Golden Compass',
 'King Kong',
 'Titanic',
 'Captain America: Civil War',
 'Battleship',
 'Jurassic World',
 'Skyfall',
 'Spider-Man 2',
 'Iron Man 3',
 'Alice in Wonderland',
 'X-Men: The Last Stand',
 'Monsters University',
 'Transformers: Revenge of the Fallen',
 'Transformers: Age of Extinction',
 'Oz the Gre

In [88]:
columns_list

['Director Name',
 'director_name',
 'Critical Reviews',
 'duration',
 'director_facebook_likes',
 'actor_3_facebook_likes',
 'actor_2_name',
 'actor_1_facebook_likes',
 'gross',
 'genres',
 'actor_1_name',
 'num_voted_users',
 'cast_total_facebook_likes',
 'actor_3_name',
 'facenumber_in_poster',
 'plot_keywords',
 'movie_imdb_link',
 'num_user_for_reviews',
 'language',
 'country',
 'content_rating',
 'budget',
 'title_year',
 'actor_2_facebook_likes',
 'imdb_score',
 'aspect_ratio',
 'movie_facebook_likes']

In [89]:
# DataFrame의 index를 재지정 하는 것은 가능하다. size가 같아야 한다.
movie.index = index_list

In [90]:
movie.columns = columns_list

In [91]:
movie.head()

Unnamed: 0,Director Name,director_name,Critical Reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
Ratava,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Ertceps,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


## 10. Create/Delete columns

In [92]:
movie = pd.read_csv('../data/movie.csv')

- column을 생성하는 가장 간단한 방법은 스칼라 값을 할당하는 것
- 가장 마지막에 추가된다.

In [93]:
movie['has_seen'] = 0

In [94]:
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,has_seen
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,0
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,12.0,7.1,,0,0


In [95]:
# 배우와 감독의 facebook 좋아요를 모두 더하여 새로운 column을 생성한다.
movie['actor_director_facebook_likes'] = \
    (movie['actor_1_facebook_likes'] +
     movie['actor_2_facebook_likes'] +
     movie['actor_3_facebook_likes'] +
     movie['director_facebook_likes'])

### tip. pandas DataFrame 만들기 기초

In [96]:
df1 = DataFrame({'a': [1, 2, 3, 4]})
df1

Unnamed: 0,a
0,1
1,2
2,3
3,4


In [97]:
df2 = DataFrame({'a':[1, 2, np.NaN, 4]})
df2

Unnamed: 0,a
0,1.0
1,2.0
2,
3,4.0


In [98]:
df1 + df2

Unnamed: 0,a
0,2.0
1,4.0
2,
3,8.0


In [99]:
df3 = DataFrame({'a': [1, 2, 3, 4], 'b': [np.NaN, 2, 3, 4]})
df3

Unnamed: 0,a,b
0,1,
1,2,2.0
2,3,3.0
3,4,4.0


In [100]:
df3['c'] = df3['a'] + df3['b']

In [101]:
df3

Unnamed: 0,a,b,c
0,1,,
1,2,2.0,4.0
2,3,3.0,6.0
3,4,4.0,8.0


- 스칼라 값을 더할 때 NaN이 있으면 NaN이 된다.

In [102]:
movie['actor_director_facebook_likes'].isnull().sum()

122

- NaN을 없애기 위해 NaN을 0으로 변환

In [103]:
movie['actor_director_facebook_likes'] =\
    movie['actor_director_facebook_likes'].fillna(0)

In [104]:
movie['actor_director_facebook_likes'].isnull().sum()

0

In [105]:
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,has_seen,actor_director_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,0,2791.0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,0,46563.0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,0,11554.0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,0,95000.0
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,12.0,7.1,,0,0,0.0


- drop() 메서드는 row, columns을 선택하여 삭제할 수 있다.

In [106]:
movie.drop('actor_director_facebook_likes', axis=1)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,has_seen
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,0
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,12.0,7.1,,0,0
5,Color,Andrew Stanton,462.0,132.0,475.0,530.0,Samantha Morton,640.0,73058679.0,Action|Adventure|Sci-Fi,...,English,USA,PG-13,263700000.0,2012.0,632.0,6.6,2.35,24000,0
6,Color,Sam Raimi,392.0,156.0,0.0,4000.0,James Franco,24000.0,336530303.0,Action|Adventure|Romance,...,English,USA,PG-13,258000000.0,2007.0,11000.0,6.2,2.35,0,0
7,Color,Nathan Greno,324.0,100.0,15.0,284.0,Donna Murphy,799.0,200807262.0,Adventure|Animation|Comedy|Family|Fantasy|Musi...,...,English,USA,PG,260000000.0,2010.0,553.0,7.8,1.85,29000,0
8,Color,Joss Whedon,635.0,141.0,0.0,19000.0,Robert Downey Jr.,26000.0,458991599.0,Action|Adventure|Sci-Fi,...,English,USA,PG-13,250000000.0,2015.0,21000.0,7.5,2.35,118000,0
9,Color,David Yates,375.0,153.0,282.0,10000.0,Daniel Radcliffe,25000.0,301956980.0,Adventure|Family|Fantasy|Mystery,...,English,UK,PG,250000000.0,2009.0,11000.0,7.5,2.35,10000,0


In [107]:
movie.drop('has_seen', axis='columns')

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,actor_director_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,2791.0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,46563.0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,11554.0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,95000.0
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,12.0,7.1,,0,0.0
5,Color,Andrew Stanton,462.0,132.0,475.0,530.0,Samantha Morton,640.0,73058679.0,Action|Adventure|Sci-Fi,...,English,USA,PG-13,263700000.0,2012.0,632.0,6.6,2.35,24000,2277.0
6,Color,Sam Raimi,392.0,156.0,0.0,4000.0,James Franco,24000.0,336530303.0,Action|Adventure|Romance,...,English,USA,PG-13,258000000.0,2007.0,11000.0,6.2,2.35,0,39000.0
7,Color,Nathan Greno,324.0,100.0,15.0,284.0,Donna Murphy,799.0,200807262.0,Adventure|Animation|Comedy|Family|Fantasy|Musi...,...,English,USA,PG,260000000.0,2010.0,553.0,7.8,1.85,29000,1651.0
8,Color,Joss Whedon,635.0,141.0,0.0,19000.0,Robert Downey Jr.,26000.0,458991599.0,Action|Adventure|Sci-Fi,...,English,USA,PG-13,250000000.0,2015.0,21000.0,7.5,2.35,118000,66000.0
9,Color,David Yates,375.0,153.0,282.0,10000.0,Daniel Radcliffe,25000.0,301956980.0,Adventure|Family|Fantasy|Mystery,...,English,UK,PG,250000000.0,2009.0,11000.0,7.5,2.35,10000,46282.0


In [108]:
movie.drop(0, axis=0)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,has_seen,actor_director_facebook_likes
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,0,46563.0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,0,11554.0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,0,95000.0
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,12.0,7.1,,0,0,0.0
5,Color,Andrew Stanton,462.0,132.0,475.0,530.0,Samantha Morton,640.0,73058679.0,Action|Adventure|Sci-Fi,...,USA,PG-13,263700000.0,2012.0,632.0,6.6,2.35,24000,0,2277.0
6,Color,Sam Raimi,392.0,156.0,0.0,4000.0,James Franco,24000.0,336530303.0,Action|Adventure|Romance,...,USA,PG-13,258000000.0,2007.0,11000.0,6.2,2.35,0,0,39000.0
7,Color,Nathan Greno,324.0,100.0,15.0,284.0,Donna Murphy,799.0,200807262.0,Adventure|Animation|Comedy|Family|Fantasy|Musi...,...,USA,PG,260000000.0,2010.0,553.0,7.8,1.85,29000,0,1651.0
8,Color,Joss Whedon,635.0,141.0,0.0,19000.0,Robert Downey Jr.,26000.0,458991599.0,Action|Adventure|Sci-Fi,...,USA,PG-13,250000000.0,2015.0,21000.0,7.5,2.35,118000,0,66000.0
9,Color,David Yates,375.0,153.0,282.0,10000.0,Daniel Radcliffe,25000.0,301956980.0,Adventure|Family|Fantasy|Mystery,...,UK,PG,250000000.0,2009.0,11000.0,7.5,2.35,10000,0,46282.0
10,Color,Zack Snyder,673.0,183.0,0.0,2000.0,Lauren Cohan,15000.0,330249062.0,Action|Adventure|Sci-Fi,...,USA,PG-13,250000000.0,2016.0,4000.0,6.9,2.35,197000,0,21000.0


- insert 메서드는 특정한 위치에 데이터를 끼워 넣을 수 있다.

In [109]:
profit_index = movie.columns.get_loc('gross')+1
profit_index

9

In [110]:
movie.insert(loc=profit_index,
             column='profit',
             value=movie['gross'] - movie['budget'])

In [111]:
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,profit,...,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,has_seen,actor_director_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,523505847.0,...,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,0,2791.0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,9404152.0,...,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,0,46563.0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,-44925825.0,...,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,0,11554.0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,198130642.0,...,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,0,95000.0
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,,...,,,,,12.0,7.1,,0,0,0.0


In [112]:
del movie['num_critic_for_reviews']

In [113]:
movie.head()

Unnamed: 0,color,director_name,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,profit,genres,...,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,has_seen,actor_director_facebook_likes
0,Color,James Cameron,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,523505847.0,Action|Adventure|Fantasy|Sci-Fi,...,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,0,2791.0
1,Color,Gore Verbinski,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,9404152.0,Action|Adventure|Fantasy,...,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,0,46563.0
2,Color,Sam Mendes,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,-44925825.0,Action|Adventure|Thriller,...,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,0,11554.0
3,Color,Christopher Nolan,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,198130642.0,Action|Thriller,...,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,0,95000.0
4,,Doug Walker,,131.0,,Rob Walker,131.0,,,Documentary,...,,,,,12.0,7.1,,0,0,0.0
