In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plot
import seaborn as sns 
%matplotlib inline 

In [2]:
# Load the data set 

netflix = pd.read_csv('Resources/netflix_titles.csv')
netflix.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob..."
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids' TV,When a prison ship crash unleashes hundreds of...
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...


In [3]:
netflix.shape

(6234, 12)

In [4]:
netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6234 entries, 0 to 6233
Data columns (total 12 columns):
show_id         6234 non-null int64
type            6234 non-null object
title           6234 non-null object
director        4265 non-null object
cast            5664 non-null object
country         5758 non-null object
date_added      6223 non-null object
release_year    6234 non-null int64
rating          6224 non-null object
duration        6234 non-null object
listed_in       6234 non-null object
description     6234 non-null object
dtypes: int64(2), object(10)
memory usage: 584.5+ KB


Looks like director, cast, and country have the most missing values. Based on our goals, these columns wont be relevant in our analysis. 

In [5]:
netflix.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [6]:
netflix = netflix.rename({'listed_in': 'genre', 'rating':'film_rating', 'release_year':'original_release_year'}, axis=1)

netflix.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'original_release_year', 'film_rating', 'duration', 'genre',
       'description'],
      dtype='object')

In [7]:
netflix.dtypes

show_id                   int64
type                     object
title                    object
director                 object
cast                     object
country                  object
date_added               object
original_release_year     int64
film_rating              object
duration                 object
genre                    object
description              object
dtype: object

In [8]:
netflix_cleaned = netflix.dropna(subset=['date_added'])
netflix_cleaned.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,original_release_year,film_rating,duration,genre,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob..."
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids' TV,When a prison ship crash unleashes hundreds of...
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...


In [9]:
netflix_cleaned['date_added']

0       September 9, 2019
1       September 9, 2016
2       September 8, 2018
3       September 8, 2018
4       September 8, 2017
5       September 8, 2017
6       September 8, 2017
7       September 8, 2017
8       September 8, 2017
9       September 8, 2017
10      September 8, 2017
11      September 8, 2017
12      September 8, 2017
13      September 8, 2017
14      September 8, 2017
15      September 8, 2017
16      September 8, 2017
17      September 8, 2017
18      September 8, 2017
19      September 8, 2017
20      September 8, 2017
21      September 8, 2017
22      September 8, 2017
23      September 8, 2017
24      September 8, 2017
25      September 8, 2015
26      September 7, 2018
27      September 7, 2018
28      September 7, 2018
29      September 7, 2018
              ...        
6193       April 29, 2019
6194       April 28, 2019
6195       April 27, 2018
6196       April 27, 2017
6197       April 26, 2019
6198       April 26, 2019
6199       April 23, 2019
6200       A

In [10]:
netflix_cleaned['month_added'] = netflix_cleaned['date_added'].str.split(' ').apply(lambda x:x[0])
netflix_cleaned['day_added'] = netflix_cleaned['date_added'].str.replace(',','').str.split(' ').apply(lambda x:x[1])
netflix_cleaned['year_added'] = netflix_cleaned['date_added'].str.split(' ').apply(lambda x:x[2])

netflix_cleaned.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,show_id,type,title,director,cast,country,date_added,original_release_year,film_rating,duration,genre,description,month_added,day_added,year_added
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...,September,9,2019
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...,September,9,2016
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob...",September,8,2018
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids' TV,When a prison ship crash unleashes hundreds of...,September,8,2018
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...,September,8,2017


In [11]:
netflix_cleaned['film_rating'].value_counts()

TV-MA       2025
TV-14       1695
TV-PG        699
R            508
PG-13        286
NR           217
PG           184
TV-Y7        168
TV-G         149
TV-Y         142
TV-Y7-FV      95
G             37
UR             7
NC-17          2
Name: film_rating, dtype: int64

In [12]:
netflix_cleaned['genre'].value_counts()

Documentaries                                                 299
Stand-Up Comedy                                               273
Dramas, International Movies                                  248
Dramas, Independent Movies, International Movies              186
Comedies, Dramas, International Movies                        174
Kids' TV                                                      158
Documentaries, International Movies                           150
Children & Family Movies, Comedies                            129
Comedies, International Movies                                120
Children & Family Movies                                      120
Dramas, International Movies, Romantic Movies                 108
Action & Adventure, Dramas, International Movies              103
Comedies, International Movies, Romantic Movies                97
Dramas, International Movies, Thrillers                        93
Crime TV Shows, International TV Shows, TV Dramas              92
Internatio

In [15]:
netflix_cleaned['season_for_tv_show'] = netflix_cleaned.apply(lambda x : x['duration'].split(' ')[0] if "Season" in x['duration'] else '',axis=1)

netflix_cleaned['duration'] = netflix_cleaned.apply(lambda x : x['duration'].split(' ')[0] if "Season" not in x['duration'] else '',axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [16]:
netflix_cleaned[['type', 'title','director','country','original_release_year', 'film_rating','duration','genre','season_for_tv_show']]

Unnamed: 0,type,title,director,country,original_release_year,film_rating,duration,genre,season_for_tv_show
0,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","United States, India, South Korea, China",2019,TV-PG,90,"Children & Family Movies, Comedies",
1,Movie,Jandino: Whatever it Takes,,United Kingdom,2016,TV-MA,94,Stand-Up Comedy,
2,TV Show,Transformers Prime,,United States,2013,TV-Y7-FV,,Kids' TV,1
3,TV Show,Transformers: Robots in Disguise,,United States,2016,TV-Y7,,Kids' TV,1
4,Movie,#realityhigh,Fernando Lebrija,United States,2017,TV-14,99,Comedies,
5,TV Show,Apaches,,Spain,2016,TV-MA,,"Crime TV Shows, International TV Shows, Spanis...",1
6,Movie,Automata,Gabe Ibáñez,"Bulgaria, United States, Spain, Canada",2014,R,110,"International Movies, Sci-Fi & Fantasy, Thrillers",
7,Movie,Fabrizio Copano: Solo pienso en mi,"Rodrigo Toro, Francisco Schultz",Chile,2017,TV-MA,60,Stand-Up Comedy,
8,TV Show,Fire Chasers,,United States,2017,TV-MA,,"Docuseries, Science & Nature TV",1
9,Movie,Good People,Henrik Ruben Genz,"United States, United Kingdom, Denmark, Sweden",2014,R,90,"Action & Adventure, Thrillers",
