# ch02. DataFrame 필수 연산

## 0. import

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

## 1. DataFrame에서 복수 column 선택

In [2]:
movie = pd.read_csv('../data/movie.csv')

In [3]:
# Select several columns at once by passing a list of column names to the
# indexing operator; the result is a DataFrame.
movie_actor_director = movie[
    [
        'actor_1_name',
        'actor_2_name',
        'actor_3_name',
        'director_name',
    ]
]

In [4]:
movie_actor_director.head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan
4,Doug Walker,Rob Walker,,Doug Walker


- 단일 column을 선택하더라도 DataFrame으로 만들고 싶다면 indexing operator에 list를 사용한다.

In [21]:
movie[['director_name']].head()

Unnamed: 0_level_0,director_name
movie_title,Unnamed: 1_level_1
Avatar,James Cameron
Pirates of the Caribbean: At World's End,Gore Verbinski
Spectre,Sam Mendes
The Dark Knight Rises,Christopher Nolan
Star Wars: Episode VII - The Force Awakens,Doug Walker


## 2. 메서드를 사용한 column 선택
- select_dtypes
- filter

- get_dtype_counts 메서드를 사용하면 각 data type의 개수를 출력한다. (이 메서드는 pandas 0.25에서 deprecated 되었고 1.0에서 제거되었다. 최신 버전에서는 `dtypes.value_counts()`를 사용한다.)

In [22]:
movie = pd.read_csv('../data/movie.csv', index_col='movie_title')

In [23]:
# Count how many columns there are of each data type.
# DataFrame.get_dtype_counts() was deprecated in pandas 0.25 and removed in
# 1.0; counting the values of `dtypes` is the supported replacement.
# Note: the result is ordered by count (descending), not by dtype name.
movie.dtypes.value_counts()

float64    13
int64       3
object     11
dtype: int64

### tip. 특정 column의 data type을 확인하려면 DataFrame의 dtypes attribute를 사용한다. (Series라면 dtype attribute를 사용한다.)

In [27]:
movie[['director_name']].dtypes

director_name    object
dtype: object

In [28]:
movie.dtypes

color                         object
director_name                 object
num_critic_for_reviews       float64
duration                     float64
director_facebook_likes      float64
actor_3_facebook_likes       float64
actor_2_name                  object
actor_1_facebook_likes       float64
gross                        float64
genres                        object
actor_1_name                  object
num_voted_users                int64
cast_total_facebook_likes      int64
actor_3_name                  object
facenumber_in_poster         float64
plot_keywords                 object
movie_imdb_link               object
num_user_for_reviews         float64
language                      object
country                       object
content_rating                object
budget                       float64
title_year                   float64
actor_2_facebook_likes       float64
imdb_score                   float64
aspect_ratio                 float64
movie_facebook_likes           int64
dtype: object

----

- select_dtypes 메서드를 사용하면 특정 data type column만 선택할 수 있다.

In [5]:
movie.select_dtypes(include=['int64']).head()

Unnamed: 0_level_0,num_voted_users,cast_total_facebook_likes,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Avatar,886204,4834,33000
Pirates of the Caribbean: At World's End,471220,48350,0
Spectre,275868,11700,85000
The Dark Knight Rises,1144337,106759,164000
Star Wars: Episode VII - The Force Awakens,8,143,0


- 수치로 된 모든 열을 선택하려면 number 인자를 전달한다.

In [6]:
movie.select_dtypes(include=['number']).head()

Unnamed: 0_level_0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Avatar,723.0,178.0,0.0,855.0,1000.0,760505847.0,886204,4834,0.0,3054.0,237000000.0,2009.0,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,302.0,169.0,563.0,1000.0,40000.0,309404152.0,471220,48350,0.0,1238.0,300000000.0,2007.0,5000.0,7.1,2.35,0
Spectre,602.0,148.0,0.0,161.0,11000.0,200074175.0,275868,11700,1.0,994.0,245000000.0,2015.0,393.0,6.8,2.35,85000
The Dark Knight Rises,813.0,164.0,22000.0,23000.0,27000.0,448130642.0,1144337,106759,0.0,2701.0,250000000.0,2012.0,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,,131.0,,131.0,,8,143,0.0,,,,12.0,7.1,,0


- column을 선택하는 또 다른 방법은 filter 메서드를 사용하는 것이다.

In [8]:
movie.filter(like='facebook').head()

Unnamed: 0_level_0,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,cast_total_facebook_likes,actor_2_facebook_likes,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Avatar,0.0,855.0,1000.0,4834,936.0,33000
Pirates of the Caribbean: At World's End,563.0,1000.0,40000.0,48350,5000.0,0
Spectre,0.0,161.0,11000.0,11700,393.0,85000
The Dark Knight Rises,22000.0,23000.0,27000.0,106759,23000.0,164000
Star Wars: Episode VII - The Force Awakens,131.0,,131.0,143,12.0,0


- filter 메서드에 정규식을 사용할 수 있다. 아래 예제는 이름에 숫자가 하나 이상 포함된 column만 선택한다.

In [11]:
# Keep only the columns whose name contains at least one digit.
# The pattern is a raw string so that `\d` reaches the regex engine unchanged;
# in a normal string literal `\d` is an invalid escape sequence, which newer
# Python versions warn about.
movie.filter(regex=r'\d+').head()

Unnamed: 0_level_0,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,actor_1_name,actor_3_name,actor_2_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Avatar,855.0,Joel David Moore,1000.0,CCH Pounder,Wes Studi,936.0
Pirates of the Caribbean: At World's End,1000.0,Orlando Bloom,40000.0,Johnny Depp,Jack Davenport,5000.0
Spectre,161.0,Rory Kinnear,11000.0,Christoph Waltz,Stephanie Sigman,393.0
The Dark Knight Rises,23000.0,Christian Bale,27000.0,Tom Hardy,Joseph Gordon-Levitt,23000.0
Star Wars: Episode VII - The Force Awakens,,Rob Walker,131.0,Doug Walker,,12.0


- filter() 메서드는 keyword param으로 3 종류를 사용할 수 있다.
    - items
    - like
    - regex

In [31]:
movie.filter(items=['actor_1_facebook_likes', 'actor_2_facebook_likes']).head()

Unnamed: 0_level_0,actor_1_facebook_likes,actor_2_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Avatar,1000.0,936.0
Pirates of the Caribbean: At World's End,40000.0,5000.0
Spectre,11000.0,393.0
The Dark Knight Rises,27000.0,23000.0
Star Wars: Episode VII - The Force Awakens,131.0,12.0


## 3. column name 정렬하기

- 간단한 가이드라인
    - 각 column을 이산형(범주형)과 연속형으로 분류한다.
    - 이산형 column과 연속형 column 각각의 내부에서 공통점이 있는 column끼리 group으로 묶는다.
    - group 안에서는 가장 중요한 column을 맨 앞에 두고, 범주형 column을 연속형 column보다 앞에 둔다.

In [33]:
movie = pd.read_csv('../data/movie.csv')

In [34]:
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


- 가이드라인에 맞춰 column을 리스트로 정리해보자.

In [51]:
# Discrete columns, grouped by theme.
disc_core = [
    'movie_title',
    'title_year',
    'content_rating',
    'genres',
]
disc_people = [
    'director_name',
    'actor_1_name',
    'actor_2_name',
    'actor_3_name',
]
disc_other = [
    'color',
    'country',
    'language',
    'plot_keywords',
    'movie_imdb_link',
]

# Continuous columns, grouped by theme.
cont_fb = [
    'director_facebook_likes',
    'actor_1_facebook_likes',
    'actor_2_facebook_likes',
    'actor_3_facebook_likes',
    'cast_total_facebook_likes',
    'movie_facebook_likes',
]
cont_finance = [
    'budget',
    'gross',
]
cont_num_reviews = [
    'num_voted_users',
    'num_user_for_reviews',
    'num_critic_for_reviews',
]
cont_other = [
    'imdb_score',
    'duration',
    'aspect_ratio',
    'facenumber_in_poster',
]

- 최종적인 column 순서를 만들기 위해 리스트를 모두 합친다.

In [52]:
# Concatenate the column groups into the final column order:
# the discrete groups first, followed by the continuous groups.
new_col_order = (
    disc_core
    + disc_people
    + disc_other
    + cont_fb
    + cont_finance
    + cont_num_reviews
    + cont_other
)

- movie의 모든 column을 포함하고 있는지 확인한다.

In [53]:
# Compare as sets: this confirms both contain exactly the same column names,
# but it ignores order and would not detect a name listed twice.
set(movie.columns) == set(new_col_order)

True

In [54]:
movie2 = movie[new_col_order]

In [55]:
movie2.head()

Unnamed: 0,movie_title,title_year,content_rating,genres,director_name,actor_1_name,actor_2_name,actor_3_name,color,country,...,movie_facebook_likes,budget,gross,num_voted_users,num_user_for_reviews,num_critic_for_reviews,imdb_score,duration,aspect_ratio,facenumber_in_poster
0,Avatar,2009.0,PG-13,Action|Adventure|Fantasy|Sci-Fi,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Color,USA,...,33000,237000000.0,760505847.0,886204,3054.0,723.0,7.9,178.0,1.78,0.0
1,Pirates of the Caribbean: At World's End,2007.0,PG-13,Action|Adventure|Fantasy,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Color,USA,...,0,300000000.0,309404152.0,471220,1238.0,302.0,7.1,169.0,2.35,0.0
2,Spectre,2015.0,PG-13,Action|Adventure|Thriller,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Color,UK,...,85000,245000000.0,200074175.0,275868,994.0,602.0,6.8,148.0,2.35,1.0
3,The Dark Knight Rises,2012.0,PG-13,Action|Thriller,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Color,USA,...,164000,250000000.0,448130642.0,1144337,2701.0,813.0,8.5,164.0,2.35,0.0
4,Star Wars: Episode VII - The Force Awakens,,,Documentary,Doug Walker,Doug Walker,Rob Walker,,,,...,0,,,8,,,7.1,,,0.0


## 4. 전체 DataFrame에 대한 연산

In [56]:
movie = pd.read_csv('../data/movie.csv')

In [57]:
# shape attribute: a tuple of (number of rows, number of columns)
movie.shape

(4916, 28)

In [58]:
# size attribute: total number of elements (number of rows * number of columns)
movie.size

137648

In [59]:
# ndim attribute: a DataFrame has 2 dimensions
movie.ndim

2

In [60]:
# len() of a DataFrame is the number of rows, i.e. the length of its index
len(movie)

4916