# Netflix TV Shows & Movies - Deep Dive

## Set Up Environment

In [None]:
# Import libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the cleaned Netflix datasets: all titles, movies only, and TV shows only.
# NOTE(review): later cells iterate over the `*_list` columns as if each value
# were a Python list; read_csv loads such columns as plain strings unless they
# are parsed after loading - confirm how these columns are stored.
netflix, movies, tv_shows = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/netflix_titles_clean.csv',
        'data/netflix_movies_clean.csv',
        'data/netflix_tv_shows_clean.csv',
    )
)

## Analysis: Deep Dive

### Possible Analyses with this Dataset
#### Overview
1. Change in number of shows/movies available over time.
2. Proportion of shows vs movies.
3. Number of directors, cast members, countries available, and genres.
4. Average lengths for movies and tenure for shows.
5. Most and least common countries, genres, ratings, directors, and cast members.

#### Deep Dive
6. Length of cast by country.
7. Proportion of shows vs movies by country.
8. Range of directors in terms of number of countries.
9. Average movie length by genre and country.
10. TV show tenure by genre.
11. Deeper dive into specific regions (e.g. Africa).

#### Natural Language Processing
12. Most common words in descriptions overall and by genre.

### 6. Length of cast by country.

In [None]:
# Preview the two columns behind the cast-size-by-country analysis.
netflix.loc[:, ['country', 'cast_size']]

In [None]:
# Collect every distinct country in order of first appearance.
# dict.fromkeys de-duplicates while preserving insertion order.
countries_list = list(dict.fromkeys(
    country
    for entry in netflix.country_list
    for country in entry
))

countries_list

In [None]:
# Average cast size (rounded to 2 decimals) over the titles of each country.
cast_sizes_by_country = {}
for c in countries_list:
    # Flag the titles whose country entry contains this country.
    has_country = [c in countries for countries in netflix.country_list]
    mean_cast = netflix.loc[has_country, 'cast_size'].mean()
    cast_sizes_by_country[c] = round(mean_cast, 2)

cast_sizes_by_country

In [None]:
# One row per country, indexed by country name, holding its average cast size.
country_data = pd.Series(cast_sizes_by_country).to_frame(name='avg_cast_size')
country_data

In [None]:
# Rank countries from the largest to the smallest average cast size.
country_data.sort_values('avg_cast_size', ascending=False)

### 7. Proportion of shows vs movies by country.

In [None]:
# Preview the first five rows of the per-country table.
country_data.iloc[:5]

In [None]:
# Count the TV shows and the movies attributed to each country.
tv_shows_by_country = {}
movies_by_country = {}
for c in countries_list:
    # Flag the titles whose country entry contains this country.
    has_country = [c in countries for countries in netflix.country_list]
    kinds = netflix.loc[has_country, 'type']
    # int() keeps plain Python integers in the dicts, as len() did.
    tv_shows_by_country[c] = int((kinds == 'TV Show').sum())
    movies_by_country[c] = int((kinds == 'Movie').sum())

tv_shows_by_country

In [None]:
# Attach the per-country TV show and movie counts to 'country_data',
# looking each country (the index) up in the matching dict.
for count_col, counts in (
    ('tv_count', tv_shows_by_country),
    ('movie_count', movies_by_country),
):
    country_data[count_col] = country_data.index.map(counts)
country_data.head()

In [None]:
# Share of each country's titles that are TV shows vs. movies, in percent.
title_total = country_data.tv_count + country_data.movie_count
country_data['tv_percent'] = country_data.tv_count / title_total * 100
country_data['movie_percent'] = country_data.movie_count / title_total * 100
country_data.head()

### 8. Range of directors in terms of number of countries.

In [None]:
# Collect every distinct director in order of first appearance.
# dict.fromkeys de-duplicates while preserving insertion order.
directors_list = list(dict.fromkeys(
    director
    for entry in netflix.director_list
    for director in entry
))

directors_list

In [None]:
# Count the distinct countries that each director's titles are associated with.
country_count_by_director = {}
for d in directors_list:
    # Flag the titles that credit this director.
    directed = [d in directors for directors in netflix.director_list]
    # Pool the per-title country lists. The raw `country` column appears to
    # hold one string per title, so looping over its values yields single
    # characters instead of country names; `country_list` is the column the
    # other per-country cells iterate over.
    # NOTE(review): if missing countries are stored as a placeholder entry in
    # `country_list`, that placeholder is counted as a country here - confirm.
    director_countries = set()
    for entry in netflix.loc[directed, 'country_list']:
        director_countries.update(entry)
    country_count_by_director[d] = len(director_countries)

country_count_by_director

In [None]:
# Directors ranked by their country count, showing ranks 2 through 11.
# NOTE(review): the top-ranked entry is skipped - presumably a placeholder for
# titles without a listed director; confirm before relying on this slice.
pd.Series(country_count_by_director).sort_values(ascending=False).iloc[1:11]

### 9. Average movie length by genre and country.

In [None]:
# Preview the first two rows of the movies table.
movies.iloc[:2]

In [None]:
# Movies whose duration value is below 10.
movies.loc[movies['duration'] < 10]

In [None]:
# Movies whose duration value is above 240.
movies.loc[movies['duration'] > 240]

In [None]:
# Collect every distinct genre (across all titles) in order of first appearance.
# dict.fromkeys de-duplicates while preserving insertion order.
genres_list = list(dict.fromkeys(
    genre
    for entry in netflix.listed_in_list
    for genre in entry
))

genres_list

In [None]:
# For each genre: how many movies carry it and their mean duration.
count_by_genre_movie = {}
avg_duration_by_genre = {}
for g in genres_list:
    # Flag the movies whose genre entry contains this genre.
    in_genre = [g in genres for genres in movies.listed_in_list]
    durations = movies.loc[in_genre, 'duration']
    count_by_genre_movie[g] = len(durations)
    avg_duration_by_genre[g] = durations.mean()

genre_duration = pd.Series(avg_duration_by_genre).to_frame(name='avg_duration')
genre_duration['movie_count'] = genre_duration.index.map(count_by_genre_movie)
# Genres without any movie have a NaN average: drop them, then order the
# remaining genres from shortest to longest average duration.
genre_duration = genre_duration.dropna().sort_values(by='avg_duration')
genre_duration

In [None]:
# Horizontal bar chart of the average movie duration per genre.
genre_duration['avg_duration'].sort_values(ascending=False).plot.barh(
    figsize=(6, 6),
    title='Average Movie Length by Genre (in minutes)',
)

In [None]:
# Mean movie duration per country; the movie counts are reused from
# 'movies_by_country', which was computed in an earlier cell.
avg_duration_by_country = {}
for c in countries_list:
    # Flag the movies whose country entry contains this country.
    has_country = [c in countries for countries in movies.country_list]
    avg_duration_by_country[c] = movies.loc[has_country, 'duration'].mean()

country_duration = pd.Series(avg_duration_by_country).to_frame(name='avg_duration')
country_duration['movie_count'] = country_duration.index.map(movies_by_country)
# Countries without any movie have a NaN average: drop them, then order the
# remaining countries from shortest to longest average duration.
country_duration = country_duration.dropna().sort_values(by='avg_duration')
country_duration

In [None]:
# Plot only countries with at least 5 movies.
frequent_country_avg = country_duration.loc[
    country_duration['movie_count'] >= 5, 'avg_duration'
]
frequent_country_avg.sort_values(ascending=False).plot.barh(
    figsize=(8, 16),
    title='Average Movie Length by Country (in minutes)',
)

### 10. TV show tenure by genre.

In [None]:
# Preview the first two rows of the TV shows table.
tv_shows.iloc[:2]

In [None]:
# How many TV shows there are for each season-count label.
tv_shows['duration'].value_counts()

In [None]:
# TV shows labelled with 16 seasons, the largest season count used in this notebook.
tv_shows.loc[tv_shows['duration'] == '16 Seasons']

In [None]:
# Build a genre x season-count table of TV shows, plus a per-genre total.
def _season_sort_key(label):
    """Order season labels by their leading number; other labels sort last."""
    head = str(label).split(' ')[0]
    if head.isdecimal():
        return (0, int(head), '')
    return (1, 0, str(label))


count_by_genre_tv = {}
length_dist_by_genre = {}
for g in genres_list:
    # Flag the TV shows whose genre entry contains this genre.
    in_genre = [g in genres for genres in tv_shows.listed_in_list]
    durations = tv_shows.loc[in_genre, 'duration']
    count_by_genre_tv[g] = len(durations)
    length_dist_by_genre[g] = durations.value_counts().to_dict()

tv_genre_season = pd.DataFrame.from_dict(length_dist_by_genre, orient='index')
# Season labels that always get a column, even when no show has that label.
tv_genre_season_cols = [
    '1 Season',
    '2 Seasons',
    '3 Seasons',
    '4 Seasons',
    '5 Seasons',
    '6 Seasons',
    '7 Seasons',
    '8 Seasons',
    '9 Seasons',
    '10 Seasons',
    '11 Seasons',
    '12 Seasons',
    '13 Seasons',
    '15 Seasons',
    '16 Seasons',
]
# Keep any label found in the data but missing from the list above (e.g.
# '14 Seasons'); reindexing to the fixed list alone would silently drop those
# shows from the columns while 'total' still counted them.
unlisted_labels = [
    label for label in tv_genre_season.columns
    if label not in tv_genre_season_cols
]
tv_genre_season_cols = sorted(
    tv_genre_season_cols + unlisted_labels, key=_season_sort_key
)
tv_genre_season = tv_genre_season.reindex(columns=tv_genre_season_cols)
tv_genre_season['total'] = tv_genre_season.index.map(count_by_genre_tv)
tv_genre_season = tv_genre_season.fillna(0).astype('int')
# 'genres_list' spans all titles, so genres with no TV show would appear as
# all-zero rows and later turn into 0/0 = NaN percentages; drop them.
tv_genre_season = tv_genre_season[tv_genre_season.total > 0]
tv_genre_season = tv_genre_season.sort_values(by='total', ascending=False)
tv_genre_season

In [None]:
# Collapse every column for 4 or more seasons into one '4+ Seasons' column.
short_tenure_cols = ['1 Season', '2 Seasons', '3 Seasons']
# Derive the 4+ columns from the table itself instead of repeating the label
# list by hand: every column that is neither a 1-3 season column nor 'total'
# goes into the aggregate, so no season column can be left out of the sum.
four_plus_seasons = [
    label for label in tv_genre_season.columns
    if label not in short_tenure_cols and label != 'total'
]
over_four_season_sum = tv_genre_season[four_plus_seasons].sum(axis=1)
tv_genre_season_pct = tv_genre_season[short_tenure_cols].copy()
tv_genre_season_pct['4+ Seasons'] = over_four_season_sum
tv_genre_season_pct['total'] = tv_genre_season.total
tv_genre_season_pct

In [None]:
# Express each season bucket as a percentage of the genre's total show count.
tgsp_norm_cols = list(tv_genre_season_pct.columns)
tgsp_norm_cols.remove('total')
for col in tgsp_norm_cols:
    pct_col = '{}_pct'.format(col)
    bucket_share = tv_genre_season_pct[col] / tv_genre_season_pct['total']
    tv_genre_season_pct[pct_col] = bucket_share * 100

# Put each count column next to its percentage, with 'total' last.
tgsp_col_order = [
    '1 Season',
    '1 Season_pct',
    '2 Seasons',
    '2 Seasons_pct',
    '3 Seasons',
    '3 Seasons_pct',
    '4+ Seasons',
    '4+ Seasons_pct',
    'total',
]
tv_genre_season_pct = tv_genre_season_pct.reindex(columns=tgsp_col_order)
tv_genre_season_pct

In [None]:
# Stacked horizontal bars: one bar per genre, split by season-bucket share,
# with genres ordered by their single-season percentage.
stacked_bar_cols = ['1 Season_pct', '2 Seasons_pct', '3 Seasons_pct', '4+ Seasons_pct']
tenure_share_by_genre = tv_genre_season_pct[stacked_bar_cols].sort_values(by='1 Season_pct')
tenure_share_by_genre.plot.barh(
    stacked=True,
    figsize=(8, 10),
    cmap='Accent',
    title='TV Show Tenure Proportion by Genre',
)