In [141]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plot
import seaborn as sns 
%matplotlib inline 
import plotly.graph_objects as go
import plotly.express as pex
from plotly.offline import init_notebook_mode, iplot
from plotly.subplots import make_subplots

In [2]:
# Load the data set 

netflix = pd.read_csv('Resources/netflix_titles.csv')
netflix.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob..."
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids' TV,When a prison ship crash unleashes hundreds of...
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...


In [3]:
netflix.shape

(6234, 12)

In [4]:
netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6234 entries, 0 to 6233
Data columns (total 12 columns):
show_id         6234 non-null int64
type            6234 non-null object
title           6234 non-null object
director        4265 non-null object
cast            5664 non-null object
country         5758 non-null object
date_added      6223 non-null object
release_year    6234 non-null int64
rating          6224 non-null object
duration        6234 non-null object
listed_in       6234 non-null object
description     6234 non-null object
dtypes: int64(2), object(10)
memory usage: 584.5+ KB


Looks like director, cast, and country have the most missing values. Based on our goals, these columns wont be relevant in our analysis. 

In [5]:
netflix.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [6]:
netflix = netflix.rename({'type': 'show type','listed_in': 'genre', 'rating':'film_rating', 'release_year':'original_release_year'}, axis=1)

netflix.columns

Index(['show_id', 'show type', 'title', 'director', 'cast', 'country',
       'date_added', 'original_release_year', 'film_rating', 'duration',
       'genre', 'description'],
      dtype='object')

In [7]:
netflix.dtypes

show_id                   int64
show type                object
title                    object
director                 object
cast                     object
country                  object
date_added               object
original_release_year     int64
film_rating              object
duration                 object
genre                    object
description              object
dtype: object

In [8]:
netflix_cleaned = netflix.dropna(subset=['date_added'])
netflix_cleaned.head()

Unnamed: 0,show_id,show type,title,director,cast,country,date_added,original_release_year,film_rating,duration,genre,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob..."
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids' TV,When a prison ship crash unleashes hundreds of...
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...


In [9]:
netflix_cleaned['date_added']

0       September 9, 2019
1       September 9, 2016
2       September 8, 2018
3       September 8, 2018
4       September 8, 2017
5       September 8, 2017
6       September 8, 2017
7       September 8, 2017
8       September 8, 2017
9       September 8, 2017
10      September 8, 2017
11      September 8, 2017
12      September 8, 2017
13      September 8, 2017
14      September 8, 2017
15      September 8, 2017
16      September 8, 2017
17      September 8, 2017
18      September 8, 2017
19      September 8, 2017
20      September 8, 2017
21      September 8, 2017
22      September 8, 2017
23      September 8, 2017
24      September 8, 2017
25      September 8, 2015
26      September 7, 2018
27      September 7, 2018
28      September 7, 2018
29      September 7, 2018
              ...        
6193       April 29, 2019
6194       April 28, 2019
6195       April 27, 2018
6196       April 27, 2017
6197       April 26, 2019
6198       April 26, 2019
6199       April 23, 2019
6200       A

In [10]:
netflix_cleaned['date_added'] = pd.to_datetime(netflix_cleaned['date_added'])
netflix_cleaned.dtypes



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



show_id                           int64
show type                        object
title                            object
director                         object
cast                             object
country                          object
date_added               datetime64[ns]
original_release_year             int64
film_rating                      object
duration                         object
genre                            object
description                      object
dtype: object

In [11]:
netflix_cleaned['date_added'].value_counts()

2020-01-01    126
2019-11-01    104
2018-03-01     79
2019-12-31     76
2019-10-01     74
2018-10-01     74
2019-07-01     63
2018-11-01     61
2018-01-01     59
2017-10-01     53
2019-02-01     48
2018-04-01     47
2017-09-01     47
2018-05-01     45
2017-07-01     44
2017-05-01     44
2019-01-01     41
2018-07-01     41
2017-08-01     40
2019-09-01     40
2018-08-01     38
2017-03-10     37
2019-12-01     35
2019-08-01     35
2017-11-01     35
2019-11-20     35
2019-03-01     35
2016-01-01     34
2017-06-01     34
2017-03-31     32
             ... 
2019-02-03      1
2016-09-06      1
2017-10-28      1
2014-12-19      1
2017-06-27      1
2019-10-07      1
2014-01-17      1
2017-06-25      1
2018-06-14      1
2016-03-02      1
2017-04-23      1
2017-08-23      1
2019-01-21      1
2018-08-30      1
2017-08-16      1
2017-11-06      1
2016-11-19      1
2015-02-15      1
2016-05-06      1
2018-05-02      1
2017-07-10      1
2018-10-28      1
2017-09-28      1
2017-07-25      1
2015-05-10

In [12]:
netflix_cleaned['year_released_on_nf'] = pd.DatetimeIndex(netflix_cleaned['date_added']).year
netflix_cleaned['month_released_on_nf'] = pd.DatetimeIndex(netflix_cleaned['date_added']).month
netflix_cleaned['day_released_on_nf'] = pd.DatetimeIndex(netflix_cleaned['date_added']).day
netflix_cleaned.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Unnamed: 0,show_id,show type,title,director,cast,country,date_added,original_release_year,film_rating,duration,genre,description,year_released_on_nf,month_released_on_nf,day_released_on_nf
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China",2019-09-09,2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...,2019,9,9
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,2016-09-09,2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...,2016,9,9
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,2018-09-08,2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob...",2018,9,8
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,2018-09-08,2016,TV-Y7,1 Season,Kids' TV,When a prison ship crash unleashes hundreds of...,2018,9,8
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,2017-09-08,2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...,2017,9,8


In [13]:
netflix_cleaned['film_rating'].value_counts()

TV-MA       2025
TV-14       1695
TV-PG        699
R            508
PG-13        286
NR           217
PG           184
TV-Y7        168
TV-G         149
TV-Y         142
TV-Y7-FV      95
G             37
UR             7
NC-17          2
Name: film_rating, dtype: int64

In [14]:
netflix_cleaned['season_for_tv_show'] = netflix_cleaned.apply(lambda x : x['duration'].split(' ')[0] if "Season" in x['duration'] else '',axis=1)

netflix_cleaned['duration'] = netflix_cleaned.apply(lambda x : x['duration'].split(' ')[0] if "Season" not in x['duration'] else '',axis=1)





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [15]:
netflix_df = netflix_cleaned[['show type', 'title', 'film_rating','duration','genre','season_for_tv_show', 'year_released_on_nf','day_released_on_nf','director','country','original_release_year']]

In [16]:
netflix_df.isnull().sum()

show type                   0
title                       0
film_rating                 9
duration                    0
genre                       0
season_for_tv_show          0
year_released_on_nf         0
day_released_on_nf          0
director                 1958
country                   474
original_release_year       0
dtype: int64

In [17]:
netflix_df.shape

(6223, 11)

In [18]:
netflix_df.head()

Unnamed: 0,show type,title,film_rating,duration,genre,season_for_tv_show,year_released_on_nf,day_released_on_nf,director,country,original_release_year
0,Movie,Norm of the North: King Sized Adventure,TV-PG,90.0,"Children & Family Movies, Comedies",,2019,9,"Richard Finn, Tim Maltby","United States, India, South Korea, China",2019
1,Movie,Jandino: Whatever it Takes,TV-MA,94.0,Stand-Up Comedy,,2016,9,,United Kingdom,2016
2,TV Show,Transformers Prime,TV-Y7-FV,,Kids' TV,1.0,2018,8,,United States,2013
3,TV Show,Transformers: Robots in Disguise,TV-Y7,,Kids' TV,1.0,2018,8,,United States,2016
4,Movie,#realityhigh,TV-14,99.0,Comedies,,2017,8,Fernando Lebrija,United States,2017


In [37]:
netflix_df['year_released_on_nf'].value_counts()

2019    2349
2018    1782
2017    1300
2016     456
2020     184
2015      90
2014      25
2011      13
2013      12
2012       7
2009       2
2008       2
2010       1
Name: year_released_on_nf, dtype: int64

In [42]:
nf_copy = netflix_df.copy()

nf_copy.head()

Unnamed: 0,show type,title,film_rating,duration,genre,season_for_tv_show,year_released_on_nf,day_released_on_nf,director,country,original_release_year
0,Movie,Norm of the North: King Sized Adventure,TV-PG,90.0,"Children & Family Movies, Comedies",,2019,9,"Richard Finn, Tim Maltby","United States, India, South Korea, China",2019
1,Movie,Jandino: Whatever it Takes,TV-MA,94.0,Stand-Up Comedy,,2016,9,,United Kingdom,2016
2,TV Show,Transformers Prime,TV-Y7-FV,,Kids' TV,1.0,2018,8,,United States,2013
3,TV Show,Transformers: Robots in Disguise,TV-Y7,,Kids' TV,1.0,2018,8,,United States,2016
4,Movie,#realityhigh,TV-14,99.0,Comedies,,2017,8,Fernando Lebrija,United States,2017


# Data Analysis

First let's look at the distribution of TV Shows and Movies offered on Netflix

In [159]:
show_types = netflix_df['show type'].value_counts().reset_index()
show_types = show_types.rename(columns={'index':'Type',
                                       'show type':'Count'})

pie_chart = go.Pie(labels=show_types['Type'], values=show_types['Count'],
                   pull=[0.05,0],
                   marker = dict(colors=["#6ad49b", "#1f77b4"]))

fig_layout = go.Layout(title=" Distribution of Show Types on Netflix", height = 400, legend=dict(x=0.1, y=1.1))
fig = go.Figure(data= pie_chart, layout=fig_layout)
iplot(fig)


More than 2/3 (68.5%) are Movies and remaining 31.5% are TV shows. 

In [67]:
year_type_group = nf_copy.groupby(['show type'])
year_type_group = year_type_group['year_released_on_nf'].value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,year_released_on_nf
show type,year_released_on_nf,Unnamed: 2_level_1
Movie,2019,1546
Movie,2018,1290
Movie,2017,913
Movie,2016,264
Movie,2020,147
Movie,2015,58
Movie,2014,19
Movie,2011,13
Movie,2013,6
Movie,2012,4


In [143]:
year_type_df = pd.DataFrame([['Movie', 2019, 1546],
                            ['Movie', 2018, 1290],
                            ['Movie', 2017, 913],
                            ['Movie', 2016, 264],
                            ['Movie', 2020, 147],
                            ['Movie', 2015, 58],
                            ['TV Show', 2019, 803],
                            ['TV Show', 2018, 492],
                            ['TV Show', 2017, 387],
                            ['TV Show', 2016, 192],
                            ['TV Show', 2020, 37],
                            ['TV Show', 2015, 32]],
                           columns =['show type','year','count'])
year_type_df


Unnamed: 0,show type,year,count
0,Movie,2019,1546
1,Movie,2018,1290
2,Movie,2017,913
3,Movie,2016,264
4,Movie,2020,147
5,Movie,2015,58
6,TV Show,2019,803
7,TV Show,2018,492
8,TV Show,2017,387
9,TV Show,2016,192


In [158]:
fig = pex.bar(year_type_df, x='year', y='count', color ='show type', barmode= 'group')
fig.update_layout(title_text ='Distribution of Movies and TV Shows from 2015-2020')
fig.show()