## More on DataFrame Operations

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the same movie dataset as before; the path is relative to the
# notebook's working directory.
movie = pd.read_csv('data/movie.csv')

In [3]:
# Raise the column display limit to 30 so wide frames are rendered without
# being truncated.
pd.set_option('display.max_columns', 30)

In [4]:
# Transpose the first five rows so that every column of the frame reads as a row.
movie.head().transpose()

Unnamed: 0,0,1,2,3,4
color,Color,Color,Color,Color,
director_name,James Cameron,Gore Verbinski,Sam Mendes,Christopher Nolan,Doug Walker
num_critic_for_reviews,723,302,602,813,
duration,178,169,148,164,
director_facebook_likes,0,563,0,22000,131
actor_3_facebook_likes,855,1000,161,23000,
actor_2_name,Joel David Moore,Orlando Bloom,Rory Kinnear,Christian Bale,Rob Walker
actor_1_facebook_likes,1000,40000,11000,27000,131
gross,7.60506e+08,3.09404e+08,2.00074e+08,4.48131e+08,
genres,Action|Adventure|Fantasy|Sci-Fi,Action|Adventure|Fantasy,Action|Adventure|Thriller,Action|Thriller,Documentary


In [5]:
# Pull the name / Facebook-likes pairs for the three actors and the director
# into a separate DataFrame to work on.
movie_actor_director_likes = movie[[
    'actor_1_name', 'actor_1_facebook_likes',
    'actor_2_name', 'actor_2_facebook_likes',
    'actor_3_name', 'actor_3_facebook_likes',
    'director_name', 'director_facebook_likes',
]]

In [6]:
# Preview the first five rows of the extracted subset.
movie_actor_director_likes.head()

Unnamed: 0,actor_1_name,actor_1_facebook_likes,actor_2_name,actor_2_facebook_likes,actor_3_name,actor_3_facebook_likes,director_name,director_facebook_likes
0,CCH Pounder,1000.0,Joel David Moore,936.0,Wes Studi,855.0,James Cameron,0.0
1,Johnny Depp,40000.0,Orlando Bloom,5000.0,Jack Davenport,1000.0,Gore Verbinski,563.0
2,Christoph Waltz,11000.0,Rory Kinnear,393.0,Stephanie Sigman,161.0,Sam Mendes,0.0
3,Tom Hardy,27000.0,Christian Bale,23000.0,Joseph Gordon-Levitt,23000.0,Christopher Nolan,22000.0
4,Doug Walker,131.0,Rob Walker,12.0,,,Doug Walker,131.0


In [7]:
# Column names used below to pull a subset out of the movie DataFrame.
cols1 = [
    'movie_title',
    'director_name',
    'duration',
    'imdb_score',
]

In [8]:
# Select the columns listed in cols1 and preview the first five rows.
movie[cols1].head()

Unnamed: 0,movie_title,director_name,duration,imdb_score
0,Avatar,James Cameron,178.0,7.9
1,Pirates of the Caribbean: At World's End,Gore Verbinski,169.0,7.1
2,Spectre,Sam Mendes,148.0,6.8
3,The Dark Knight Rises,Christopher Nolan,164.0,8.5
4,Star Wars: Episode VII - The Force Awakens,Doug Walker,,7.1


In [9]:
# Count how many columns there are of each dtype.
# DataFrame.get_dtype_counts() was deprecated in pandas 0.25 and removed in 1.0;
# dtypes.value_counts() is the supported replacement (its rows are ordered by
# count rather than by dtype name).
movie.dtypes.value_counts()

float64    13
int64       3
object     12
dtype: int64

In [10]:
# Keep only the columns whose dtype is int64.
movie.select_dtypes(include=['int64']).head()

Unnamed: 0,num_voted_users,cast_total_facebook_likes,movie_facebook_likes
0,886204,4834,33000
1,471220,48350,0
2,275868,11700,85000
3,1144337,106759,164000
4,8,143,0


In [11]:
# Several dtypes can be requested at once by passing them as a list.
movie.select_dtypes(include=['int64', 'object']).head()

Unnamed: 0,color,director_name,actor_2_name,genres,actor_1_name,movie_title,num_voted_users,cast_total_facebook_likes,actor_3_name,plot_keywords,movie_imdb_link,language,country,content_rating,movie_facebook_likes
0,Color,James Cameron,Joel David Moore,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,4834,Wes Studi,avatar|future|marine|native|paraplegic,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,English,USA,PG-13,33000
1,Color,Gore Verbinski,Orlando Bloom,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,48350,Jack Davenport,goddess|marriage ceremony|marriage proposal|pi...,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,English,USA,PG-13,0
2,Color,Sam Mendes,Rory Kinnear,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,11700,Stephanie Sigman,bomb|espionage|sequel|spy|terrorist,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,English,UK,PG-13,85000
3,Color,Christopher Nolan,Christian Bale,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,106759,Joseph Gordon-Levitt,deception|imprisonment|lawlessness|police offi...,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,English,USA,PG-13,164000
4,,Doug Walker,Rob Walker,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens,8,143,,,http://www.imdb.com/title/tt5289954/?ref_=fn_t...,,,,0


In [14]:
# 'number' matches every numeric dtype, so both integer and float columns are kept.
movie.select_dtypes(include=['number']).head()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,723.0,178.0,0.0,855.0,1000.0,760505847.0,886204,4834,0.0,3054.0,237000000.0,2009.0,936.0,7.9,1.78,33000
1,302.0,169.0,563.0,1000.0,40000.0,309404152.0,471220,48350,0.0,1238.0,300000000.0,2007.0,5000.0,7.1,2.35,0
2,602.0,148.0,0.0,161.0,11000.0,200074175.0,275868,11700,1.0,994.0,245000000.0,2015.0,393.0,6.8,2.35,85000
3,813.0,164.0,22000.0,23000.0,27000.0,448130642.0,1144337,106759,0.0,2701.0,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,,131.0,,131.0,,8,143,0.0,,,,12.0,7.1,,0


In [20]:
#select all fields with 'actor' in the column name
movie.filter(like='actor').head()

Unnamed: 0,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,actor_1_name,actor_3_name,actor_2_facebook_likes
0,855.0,Joel David Moore,1000.0,CCH Pounder,Wes Studi,936.0
1,1000.0,Orlando Bloom,40000.0,Johnny Depp,Jack Davenport,5000.0
2,161.0,Rory Kinnear,11000.0,Christoph Waltz,Stephanie Sigman,393.0
3,23000.0,Christian Bale,27000.0,Tom Hardy,Joseph Gordon-Levitt,23000.0
4,,Rob Walker,131.0,Doug Walker,,12.0


In [28]:
# filter(items=...) keeps only the listed names that exist as columns; the
# unknown name 'asdf' is dropped without an error (see the output below).
movie.filter(items=['actor_2_name','movie_title', 'asdf']).head()

Unnamed: 0,actor_2_name,movie_title
0,Joel David Moore,Avatar
1,Orlando Bloom,Pirates of the Caribbean: At World's End
2,Rory Kinnear,Spectre
3,Christian Bale,The Dark Knight Rises
4,Rob Walker,Star Wars: Episode VII - The Force Awakens


In [29]:
# Reorder the columns so related fields sit together. The columns are grouped
# into discrete (disc_*) and continuous (cont_*) fields.

# discrete fields
disc_core = ['movie_title', 'title_year', 'content_rating', 'genres']
disc_people = ['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name']
disc_other = ['color', 'country', 'language', 'plot_keywords', 'movie_imdb_link']

# continuous fields
cont_fb = [
    'director_facebook_likes',
    'actor_1_facebook_likes',
    'actor_2_facebook_likes',
    'actor_3_facebook_likes',
    'cast_total_facebook_likes',
    'movie_facebook_likes',
]
cont_finance = ['budget', 'gross']
cont_num_reviews = ['num_voted_users', 'num_user_for_reviews', 'num_critic_for_reviews']
cont_other = ['imdb_score', 'duration', 'aspect_ratio', 'facenumber_in_poster']

In [30]:
# Concatenate the groups: discrete fields first, then the continuous ones.
new_col_order = (
    disc_core + disc_people + disc_other
    + cont_fb + cont_finance + cont_num_reviews + cont_other
)

new_col_order

['movie_title',
 'title_year',
 'content_rating',
 'genres',
 'director_name',
 'actor_1_name',
 'actor_2_name',
 'actor_3_name',
 'color',
 'country',
 'language',
 'plot_keywords',
 'movie_imdb_link',
 'director_facebook_likes',
 'actor_1_facebook_likes',
 'actor_2_facebook_likes',
 'actor_3_facebook_likes',
 'cast_total_facebook_likes',
 'movie_facebook_likes',
 'budget',
 'gross',
 'num_voted_users',
 'num_user_for_reviews',
 'num_critic_for_reviews',
 'imdb_score',
 'duration',
 'aspect_ratio',
 'facenumber_in_poster']

In [34]:
# Sanity check: the new ordering must name exactly the same columns as the
# original frame (a set comparison, so duplicates in the list are not detected).
set(movie.columns) == set(new_col_order)

True

In [35]:
# Build a new frame with the columns in the reordered sequence and preview it.
movie2 = movie[new_col_order]
movie2.head()

Unnamed: 0,movie_title,title_year,content_rating,genres,director_name,actor_1_name,actor_2_name,actor_3_name,color,country,language,plot_keywords,movie_imdb_link,director_facebook_likes,actor_1_facebook_likes,actor_2_facebook_likes,actor_3_facebook_likes,cast_total_facebook_likes,movie_facebook_likes,budget,gross,num_voted_users,num_user_for_reviews,num_critic_for_reviews,imdb_score,duration,aspect_ratio,facenumber_in_poster
0,Avatar,2009.0,PG-13,Action|Adventure|Fantasy|Sci-Fi,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Color,USA,English,avatar|future|marine|native|paraplegic,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,0.0,1000.0,936.0,855.0,4834,33000,237000000.0,760505847.0,886204,3054.0,723.0,7.9,178.0,1.78,0.0
1,Pirates of the Caribbean: At World's End,2007.0,PG-13,Action|Adventure|Fantasy,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Color,USA,English,goddess|marriage ceremony|marriage proposal|pi...,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,563.0,40000.0,5000.0,1000.0,48350,0,300000000.0,309404152.0,471220,1238.0,302.0,7.1,169.0,2.35,0.0
2,Spectre,2015.0,PG-13,Action|Adventure|Thriller,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Color,UK,English,bomb|espionage|sequel|spy|terrorist,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,0.0,11000.0,393.0,161.0,11700,85000,245000000.0,200074175.0,275868,994.0,602.0,6.8,148.0,2.35,1.0
3,The Dark Knight Rises,2012.0,PG-13,Action|Thriller,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Color,USA,English,deception|imprisonment|lawlessness|police offi...,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,22000.0,27000.0,23000.0,23000.0,106759,164000,250000000.0,448130642.0,1144337,2701.0,813.0,8.5,164.0,2.35,0.0
4,Star Wars: Episode VII - The Force Awakens,,,Documentary,Doug Walker,Doug Walker,Rob Walker,,,,,,http://www.imdb.com/title/tt5289954/?ref_=fn_t...,131.0,131.0,12.0,,143,0,,,8,,,7.1,,,0.0


In [38]:
# .size is the total number of cells in the DataFrame (rows x columns);
# here that is 4916 * 28 = 137648
movie2.size

137648

In [41]:
#how many dimensions does this DataFrame have? (rows and columns, so 2)
movie2.ndim

2

In [42]:
# Number of non-null values in each column.
movie2.count()

movie_title                  4916
title_year                   4810
content_rating               4616
genres                       4916
director_name                4814
actor_1_name                 4909
actor_2_name                 4903
actor_3_name                 4893
color                        4897
country                      4911
language                     4904
plot_keywords                4764
movie_imdb_link              4916
director_facebook_likes      4814
actor_1_facebook_likes       4909
actor_2_facebook_likes       4903
actor_3_facebook_likes       4893
cast_total_facebook_likes    4916
movie_facebook_likes         4916
budget                       4432
gross                        4054
num_voted_users              4916
num_user_for_reviews         4895
num_critic_for_reviews       4867
imdb_score                   4916
duration                     4901
aspect_ratio                 4590
facenumber_in_poster         4903
dtype: int64

In [48]:
# Minimum of every numeric column.
# numeric_only=True is passed explicitly: since pandas 2.0 the default is False,
# and taking the minimum of the object columns (strings mixed with NaN) raises
# a TypeError instead of silently skipping them.
movie.min(numeric_only=True)

num_critic_for_reviews          1.00
duration                        7.00
director_facebook_likes         0.00
actor_3_facebook_likes          0.00
actor_1_facebook_likes          0.00
gross                         162.00
num_voted_users                 5.00
cast_total_facebook_likes       0.00
facenumber_in_poster            0.00
num_user_for_reviews            1.00
budget                        218.00
title_year                   1916.00
actor_2_facebook_likes          0.00
imdb_score                      1.60
aspect_ratio                    1.18
movie_facebook_likes            0.00
dtype: float64

In [53]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
movie.describe()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
count,4867.0,4901.0,4814.0,4893.0,4909.0,4054.0,4916.0,4916.0,4903.0,4895.0,4432.0,4810.0,4903.0,4916.0,4590.0,4916.0
mean,137.988905,107.090798,691.014541,631.276313,6494.488491,47644510.0,82644.92,9579.815907,1.37732,267.668846,36547490.0,2002.447609,1621.923516,6.437429,2.222349,7348.294142
std,120.239379,25.286015,2832.954125,1625.874802,15106.986884,67372550.0,138322.2,18164.31699,2.023826,372.934839,100242700.0,12.453977,4011.299523,1.127802,1.40294,19206.016458
min,1.0,7.0,0.0,0.0,0.0,162.0,5.0,0.0,0.0,1.0,218.0,1916.0,0.0,1.6,1.18,0.0
25%,49.0,93.0,7.0,132.0,607.0,5019656.0,8361.75,1394.75,0.0,64.0,6000000.0,1999.0,277.0,5.8,1.85,0.0
50%,108.0,103.0,48.0,366.0,982.0,25043960.0,33132.5,3049.0,1.0,153.0,19850000.0,2005.0,593.0,6.6,2.35,159.0
75%,191.0,118.0,189.75,633.0,11000.0,61108410.0,93772.75,13616.75,2.0,320.5,43000000.0,2011.0,912.0,7.2,2.35,2000.0
max,813.0,511.0,23000.0,23000.0,640000.0,760505800.0,1689764.0,656730.0,43.0,5060.0,4200000000.0,2016.0,137000.0,9.5,16.0,349000.0


In [56]:
# Count the missing values per column: flag each missing cell, then sum the flags.
movie.isna().sum()

color                         19
director_name                102
num_critic_for_reviews        49
duration                      15
director_facebook_likes      102
actor_3_facebook_likes        23
actor_2_name                  13
actor_1_facebook_likes         7
gross                        862
genres                         0
actor_1_name                   7
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                  23
facenumber_in_poster          13
plot_keywords                152
movie_imdb_link                0
num_user_for_reviews          21
language                      12
country                        5
content_rating               300
budget                       484
title_year                   106
actor_2_facebook_likes        13
imdb_score                     0
aspect_ratio                 326
movie_facebook_likes           0
dtype: int64

In [59]:
# Largest (last in sort order) value of every object column. Missing values are
# replaced with '' first so that only strings are compared with each other.
movie.select_dtypes(include=['object']).fillna('').max()

color                                                          Color
director_name                                          Étienne Faure
actor_2_name                                           Zubaida Sahar
genres                                                       Western
actor_1_name                                           Óscar Jaenada
movie_title                                                 Æon Flux
actor_3_name                                           Óscar Jaenada
plot_keywords                                    zombie|zombie spoof
movie_imdb_link    http://www.imdb.com/title/tt5574490/?ref_=fn_t...
language                                                        Zulu
country                                                 West Germany
content_rating                                                     X
dtype: object