# Manejando la data de Netflix

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame,Series

In [2]:
# Load the Netflix titles CSV that was downloaded to the local data folder
# (relative path) and preview its first three rows.
netflix = pd.read_csv(filepath_or_buffer="../../data/netflix_titles.csv")
netflix.head(n=3)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."


## Adding new columns

In [3]:
# Total directors: split the comma-separated `director` string into one
# column per name, then flag which of the resulting cells hold a value.
(
    netflix["director"]
    .str.split(pat=",", expand=True)
    .notna()
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,False,False,False,False,False,False,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7782,True,False,False,False,False,False,False,False,False,False,False,False,False
7783,True,False,False,False,False,False,False,False,False,False,False,False,False
7784,False,False,False,False,False,False,False,False,False,False,False,False,False
7785,False,False,False,False,False,False,False,False,False,False,False,False,False


In [4]:
# Number of directors per title: split `director` on commas and count the
# non-missing pieces in each row (a missing director yields 0).
netflix["num_director"] = (
    netflix["director"]
    .str.split(pat=",", expand=True)
    .notna()
    .sum(axis="columns")
)

In [5]:
# Total countries: inspect the raw `country` column first; a title can list
# several countries separated by commas.
netflix.loc[:, "country"]

0                                                  Brazil
1                                                  Mexico
2                                               Singapore
3                                           United States
4                                           United States
                              ...                        
7782    Sweden, Czech Republic, United Kingdom, Denmar...
7783                                                India
7784                                                  NaN
7785                                            Australia
7786                United Kingdom, Canada, United States
Name: country, Length: 7787, dtype: object

In [6]:
# One column per listed country; shorter lists are padded with missing values.
(
    netflix["country"]
    .str.split(pat=",", expand=True)
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,Brazil,,,,,,,,,,,
1,Mexico,,,,,,,,,,,
2,Singapore,,,,,,,,,,,
3,United States,,,,,,,,,,,
4,United States,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
7782,Sweden,Czech Republic,United Kingdom,Denmark,Netherlands,,,,,,,
7783,India,,,,,,,,,,,
7784,,,,,,,,,,,,
7785,Australia,,,,,,,,,,,


In [7]:
# Number of countries per title: split `country` on commas and count the
# non-missing pieces in each row (a missing country yields 0).
netflix["num_ctry"] = (
    netflix["country"]
    .str.split(pat=",", expand=True)
    .notna()
    .sum(axis="columns")
)

## Filtering NA directors and countries

In [None]:
# A count of 0 means the `country` field was missing, so mark it as NaN and
# let later aggregations skip those rows instead of averaging in zeros.
# `Series.mask` builds a new float column with NaN where the condition holds.
# This avoids assigning `None` into an integer column through `.loc`, which
# relies on implicit int -> float upcasting that pandas has deprecated.
# Re-running the cell is harmless: NaN == 0 is False, so nothing changes.
netflix['num_ctry'] = netflix['num_ctry'].mask(netflix['num_ctry'] == 0)

In [None]:
# A count of 0 means the `director` field was missing, so mark it as NaN and
# let later aggregations skip those rows instead of averaging in zeros.
# `Series.mask` builds a new float column with NaN where the condition holds.
# This avoids assigning `None` into an integer column through `.loc`, which
# relies on implicit int -> float upcasting that pandas has deprecated.
# Re-running the cell is harmless: NaN == 0 is False, so nothing changes.
netflix['num_director'] = netflix['num_director'].mask(netflix['num_director'] == 0)

## Groupby

In [8]:
# Distinct values of the `type` column, in order of first appearance.
pd.unique(netflix["type"])

array(['TV Show', 'Movie'], dtype=object)

In [9]:
# Mean of every numeric column for each title type.
# `numeric_only=True` is needed because the frame also holds text columns
# (title, cast, description, ...). They cannot be averaged, and pandas >= 2.0
# raises a TypeError instead of silently dropping them.
netflix.groupby(["type"]).mean(numeric_only=True)

Unnamed: 0_level_0,release_year,num_director,num_ctry
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Movie,2012.92003,1.092989,1.229868
TV Show,2016.191701,0.098755,1.018257


## Extra examples of group by

Inspiration from this [code](https://stackoverflow.com/questions/19384532/get-statistics-for-each-group-such-as-count-mean-etc-using-pandas-groupby).

In [10]:
# Number of non-null entries of each count column, per title type.
(
    netflix
    .groupby(["type"])[["num_ctry", "num_director"]]
    .agg("count")
)

Unnamed: 0_level_0,num_ctry,num_director
type,Unnamed: 1_level_1,Unnamed: 2_level_1
Movie,5377,5377
TV Show,2410,2410


In [11]:
# Largest country / director count per title type. `reset_index` turns the
# group key `type` (the only index level) back into a regular column.
(
    netflix
    .groupby(["type"])[["num_ctry", "num_director"]]
    .max()
    .reset_index()
)

Unnamed: 0,type,num_ctry,num_director
0,Movie,12,13
1,TV Show,6,7


## Different statistics

In [12]:
# Several statistics at once: passing a list of function names to `agg`
# yields a two-level column index (source column, statistic).
(
    netflix
    .groupby(["type"])[["num_ctry", "num_director"]]
    .agg(func=["min", "max"])
)

Unnamed: 0_level_0,num_ctry,num_ctry,num_director,num_director
Unnamed: 0_level_1,min,max,min,max
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Movie,0,12,0,13
TV Show,0,6,0,7


In [13]:
# Named aggregation: each keyword becomes an output column, defined by a
# (source column, function) pair -- the tuple shorthand for `pd.NamedAgg`.
(
    netflix
    .groupby(["type"])[["num_ctry", "num_director"]]
    .agg(
        num_ctry_min=("num_ctry", "min"),
        num_direc_max=("num_director", "max"),
    )
)

Unnamed: 0_level_0,num_ctry_min,num_direc_max
type,Unnamed: 1_level_1,Unnamed: 2_level_1
Movie,0,13
TV Show,0,7


## Merge 

In [15]:
# Per-type summary table (minimum country count, maximum director count),
# with `type` as a regular column so it can serve as a merge key.
netflix_stats = (
    netflix
    .groupby(["type"])[["num_ctry", "num_director"]]
    .agg(
        num_ctry_min=("num_ctry", "min"),
        num_direc_max=("num_director", "max"),
    )
    .reset_index()
)

In [16]:
# Attach the per-type statistics to every title. With `how="right"` the
# result keeps every key of `netflix_stats`; `indicator=True` adds a `_merge`
# column telling which side(s) each row came from.
pd.merge(
    left=netflix,
    right=netflix_stats,
    how="right",
    on="type",
    indicator=True,
)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,num_director,num_ctry,num_ctry_min,num_direc_max,_merge
0,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...,1,1,0,13,both
1,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow...",1,1,0,13,both
2,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi...",1,1,0,13,both
3,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...,1,1,0,13,both
4,s7,Movie,122,Yasir Al Yasiri,"Amina Khalil, Ahmed Dawood, Tarek Lotfy, Ahmed...",Egypt,"June 1, 2020",2019,TV-MA,95 min,"Horror Movies, International Movies","After an awful accident, a couple admitted to ...",1,1,0,13,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7782,s7768,TV Show,Zindagi Gulzar Hai,,"Sanam Saeed, Fawad Khan, Ayesha Omer, Mehreen ...",Pakistan,"December 15, 2016",2012,TV-PG,1 Season,"International TV Shows, Romantic TV Shows, TV ...","Strong-willed, middle-class Kashaf and carefre...",0,1,0,7,both
7783,s7776,TV Show,Zoids Wild,,"Kensho Ono, Takahiro Sakurai, Mikako Komatsu, ...",Japan,"August 14, 2020",2018,TV-Y7,1 Season,"Anime Series, Kids' TV",A quest for freedom and legendary treasure beg...,0,1,0,7,both
7784,s7778,TV Show,Zombie Dumb,,,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g...",0,0,0,7,both
7785,s7780,TV Show,Zona Rosa,,"Manu NNa, Ana Julia Yeyé, Ray Contreras, Pablo...",Mexico,"November 26, 2019",2019,TV-MA,1 Season,"International TV Shows, Spanish-Language TV Sh...",An assortment of talent takes the stage for a ...,0,1,0,7,both
