# Modeling Template
----

## Step 0 - Import libraries and define pipeline functions

In [None]:
#!pip3 install ipywidgets
#!pip3 install Pyarrow
#!pip3 install ydata-profiling
#!pip3 install --upgrade notebook ipywidgets

In [1]:
import pandas as pd
import sqlite3
from IPython.display import display
import numpy as np
from ydata_profiling import ProfileReport
pd.set_option('display.max_columns', None)

In [18]:
def genLUPs(ser):
    """Build a dimension dictionary and its reverse lookup from a series.

    The distinct values of ``ser`` are numbered from 0 in the order
    ``ser.unique()`` yields them.

    Returns a tuple of two dictionaries:
    1) id -> value, to be used as the basis for a dimension,
    2) value -> id, to be used to encode the fact table.
    """
    id_to_value = {}
    value_to_id = {}
    for idx, label in enumerate(ser.unique()):
        id_to_value[idx] = label
        value_to_id[label] = idx
    return id_to_value, value_to_id

## Step 1 - Import raw data file and load into a temporary df

In [3]:
# Read the raw CSV once and keep it in `netflix_data`; the cells below
# work on the copy `df`, so the raw frame stays available for comparison.
netflix_data = pd.read_csv('./netflix_data.csv')
df = netflix_data.copy()
# Show (rows, columns) of the loaded data.
df.shape

(7787, 11)

## Step 2 - Exploratory Data Analysis (EDA)

In [4]:
# Preview the working frame (the notebook display truncates the middle rows).
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,genres
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,14-Aug-20,2020,TV-MA,4,"International TV Shows, TV Dramas, TV Sci-Fi &..."
1,s10,Movie,1920,Vikram Bhatt,"Rajneesh Duggal, Adah Sharma, Indraneil Sengup...",India,15-Dec-17,2008,TV-MA,143,"Horror Movies, International Movies, Thrillers"
2,s100,Movie,3 Heroines,Iman Brotoseno,"Reza Rahadian, Bunga Citra Lestari, Tara Basro...",Indonesia,05-Jan-19,2016,TV-PG,124,"Dramas, International Movies, Sports Movies"
3,s1000,Movie,Blue Mountain State: The Rise of Thadland,Lev L. Spiro,"Alan Ritchson, Darin Brooks, James Cade, Rob R...",United States,01-Mar-16,2016,R,90,Comedies
4,s1001,TV Show,Blue Planet II,,David Attenborough,United Kingdom,03-Dec-18,2017,TV-G,1,"British TV Shows, Docuseries, Science & Nature TV"
...,...,...,...,...,...,...,...,...,...,...,...
7782,s995,TV Show,Blown Away,,,Canada,12-Jul-19,2019,TV-14,1,"International TV Shows, Reality TV"
7783,s996,TV Show,Blue Exorcist,,"Nobuhiko Okamoto, Jun Fukuyama, Kana Hanazawa,...",Japan,01-Sep-20,2017,TV-MA,2,"Anime Series, International TV Shows"
7784,s997,Movie,Blue Is the Warmest Color,Abdellatif Kechiche,"Léa Seydoux, Adèle Exarchopoulos, Salim Kechio...","France, Belgium, Spain",26-Aug-16,2013,NC-17,180,"Dramas, Independent Movies, International Movies"
7785,s998,Movie,Blue Jasmine,Woody Allen,"Cate Blanchett, Sally Hawkins, Alec Baldwin, L...",United States,08-Mar-19,2013,PG-13,98,"Comedies, Dramas, Independent Movies"


In [5]:
# Column dtypes and non-null counts; columns with fewer non-null entries than rows contain nulls.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7787 entries, 0 to 7786
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       7787 non-null   object
 1   type          7787 non-null   object
 2   title         7787 non-null   object
 3   director      5398 non-null   object
 4   cast          7069 non-null   object
 5   country       7280 non-null   object
 6   date_added    7777 non-null   object
 7   release_year  7787 non-null   int64 
 8   rating        7780 non-null   object
 9   duration      7787 non-null   int64 
 10  genres        7787 non-null   object
dtypes: int64(2), object(9)
memory usage: 669.3+ KB


In [7]:
# Percentage of missing values per column, highest first (columns without nulls are omitted)
nulls = df.isna().sum().sort_values(ascending=False)
has_missing = nulls.gt(0)
nulls.loc[has_missing].div(len(df)).mul(100)

director      30.679337
cast           9.220496
country        6.510851
date_added     0.128419
rating         0.089893
dtype: float64

In [8]:
# ProfileReport(df, title='Netflix Data')

## Step 3 - Set the data type if the type needs to be cast, and set the index if desired

In [9]:
# during EDA, the ratings feature showed some possible errors - isolate these possible errors
#df_tmp = df[df['director'].isnull()]
#df_tmp['rating'].unique()
#df_tmp['rating'].nunique()
#df_tmp['rating'].value_counts()

# write out questionable records to csv for business analysis
#df[df['rating'].isin(['NR', 'R', 'NC-17', 'PG-13'])].to_csv('mis-ratings.csv')

In [10]:
# make date in suitable format for database loading
# The previewed raw values look like '14-Aug-20'. format='mixed' lets pandas
# infer the format element by element, and dayfirst=True makes ambiguous
# day/month orderings resolve as day-first.
# NOTE(review): if every row follows the previewed layout, an explicit
# format='%d-%b-%y' would be stricter - confirm against the full file.
df['date_added'] = pd.to_datetime(df['date_added'], dayfirst=True, format='mixed')

In [11]:
#set index if available
#df.set_index(['show_id'], inplace=True)

## Step 4 - Handle all nulls in the df

In [15]:
# Replace every null with an explicit placeholder so that the lookups and
# dimension tables built from these columns contain no NaN entries.
# date_added has already been converted to datetime64, so its sentinel is a
# real Timestamp rather than a string: the fill then cannot depend on implicit
# string-to-datetime coercion, and the column keeps the datetime dtype that the
# later `.dt.year` / `.dt.month` / `.dt.day` accessors require.
null_fillers = {
    'director': 'TV Directors',
    'cast': 'Cast Not Listed',
    'date_added': pd.Timestamp('1900-01-01'),
    'country': 'Country Not Listed',
    'rating': 'Unknown',
}
for column, filler in null_fillers.items():
    df[column] = df[column].fillna(filler)

In [16]:
# Count remaining nulls per column; every count should be 0 after the fills.
df.isnull().sum()

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
genres          0
dtype: int64

## Step 5 - Identify lookups/dimensions

### Create 'type' dictionary and corresponding lookup

In [19]:
# dict_type maps id -> label (rows of the Type dimension);
# lup_type maps label -> id (used to encode the fact table).
dict_type, lup_type = genLUPs(df['type'])
dict_type, lup_type

({0: 'TV Show', 1: 'Movie'}, {'TV Show': 0, 'Movie': 1})

### Create 'country' dictionary and corresponding lookup

In [20]:
# dict_country maps id -> country string; lup_country is the reverse mapping.
# Each distinct string is one entry, so a multi-country value such as
# 'United States, France' receives its own id rather than being split.
dict_country, lup_country = genLUPs(df['country'])
# NOTE(review): this displays both full dictionaries; a short preview may be enough.
dict_country, lup_country

({0: 'Brazil',
  1: 'India',
  2: 'Indonesia',
  3: 'United States',
  4: 'United Kingdom',
  5: 'United States, France',
  6: 'Germany, United States',
  7: 'Canada',
  8: 'Japan',
  9: 'Thailand',
  10: 'Country Not Listed',
  11: 'Denmark, Sweden, Israel, United States',
  12: 'Ireland, United Kingdom',
  13: 'Spain',
  14: 'United States, Iceland',
  15: 'Colombia',
  16: 'Germany',
  17: 'United Arab Emirates, United Kingdom, India',
  18: 'New Zealand',
  19: 'Netherlands',
  20: 'Egypt',
  21: 'Australia',
  22: 'Sweden',
  23: 'Russia',
  24: 'Norway, Germany, Sweden',
  25: 'Finland, France',
  26: 'Denmark',
  27: 'Philippines',
  28: 'Denmark, Spain',
  29: 'United Kingdom, Russia',
  30: 'Denmark, France, Poland',
  31: 'Lebanon',
  32: 'United States, United Kingdom, Australia',
  33: 'Canada, United States',
  34: 'Singapore',
  35: 'Poland',
  36: 'Bulgaria, United States',
  37: 'South Korea',
  38: 'India, United Kingdom',
  39: 'United States, Canada, Ireland',
  40: 

### Create 'director' dictionary and corresponding lookup

In [21]:
# dict_director maps id -> director string; lup_director is the reverse mapping.
# Each distinct string is one entry, so a co-directed value such as
# 'Bo Burnham, Christopher Storer' receives its own id rather than being split.
dict_director, lup_director = genLUPs(df['director'])
# NOTE(review): this displays both full dictionaries; a short preview may be enough.
dict_director, lup_director

({0: 'TV Directors',
  1: 'Vikram Bhatt',
  2: 'Iman Brotoseno',
  3: 'Lev L. Spiro',
  4: 'Jeremy Saulnier',
  5: 'Les Mayfield',
  6: 'Derek Cianfrance',
  7: 'Rohan Sippy',
  8: 'Barry Avrich',
  9: 'Nawapol Thamrongrattanarit',
  10: 'Bo Burnham, Christopher Storer',
  11: 'Rajkumar Hirani',
  12: 'Jeremy Kenyon Lockyer Corbell',
  13: 'Jay Surridge',
  14: 'Elvira Lind',
  15: 'Kirk Wise',
  16: 'Raj Kapoor',
  17: 'Samar Shaikh',
  18: 'Dawn Porter',
  19: 'Shazia Javed',
  20: 'Gabriel Clarke, Torquil Jones',
  21: 'Brendan Byrne',
  22: 'Siddique',
  23: 'Nisheeta Keni',
  24: 'Jorge M. Fontana',
  25: 'Geoffrey Orthwein, Andrew Sullivan',
  26: 'Luis Alberto Restrepo, Andrés Beltrán, Jaime Rayo',
  27: 'Sinan Akkuş',
  28: 'Nagesh Kukunoor',
  29: 'Byron Howard, Chris Williams',
  30: 'Borja Cobeaga',
  31: 'Pia Sukanya',
  32: 'Anurag Kashyap, Dibakar Banerjee, Karan Johar, Zoya Akhtar',
  33: 'Riccardo Pilizzeri',
  34: 'Alexandra Dean',
  35: 'Jelle de Jonge',
  36: 'Alain 

### Create 'cast' dictionary and corresponding lookup

In [22]:
# dict_cast maps id -> full cast string; lup_cast is the reverse mapping.
# The whole comma-separated cast list of a title is treated as a single value.
dict_cast, lup_cast = genLUPs(df['cast'])
# NOTE(review): this displays both full dictionaries; a short preview may be enough.
dict_cast, lup_cast

({0: 'João Miguel, Bianca Comparato, Michel Gomes, Rodolfo Valente, Vaneza Oliveira, Rafael Lozano, Viviane Porto, Mel Fronckowiak, Sergio Mamberti, Zezé Motta, Celso Frateschi',
  1: 'Rajneesh Duggal, Adah Sharma, Indraneil Sengupta, Anjori Alagh, Rajendranath Zutshi, Vipin Sharma, Amin Hajee, Shri Vallabh Vyas',
  2: 'Reza Rahadian, Bunga Citra Lestari, Tara Basro, Chelsea Islan',
  3: 'Alan Ritchson, Darin Brooks, James Cade, Rob Ramsay, Chris Romano, Frankie Shaw, Omari Newton, Ed Marinaro, Dhani Jones, Ed Amatrudo, Jimmy Tatro',
  4: 'David Attenborough',
  5: 'Macon Blair, Devin Ratray, Amy Hargreaves, Kevin Kolack, Eve Plumb, David W. Thompson, Brent Werzner, Stacy Rock, Sidné Anderson, Sandy Barnett, Bonnie Johnson',
  6: 'Martin Lawrence, Luke Wilson, Peter Greene, Dave Chappelle, Nicole Ari Parker, Graham Beckel, Robert Miranda, Olek Krupa, Saverio Guerra, Richard C. Sarafian, William Forsythe',
  7: 'Ryan Gosling, Michelle Williams, Faith Wladyka, John Doman, Mike Vogel, Ben

### Create 'Date_Added' df - to load to a database for Date/Time hierarchy

In [23]:
# Calendar dimension: one row per distinct date_added value, in ascending date order
df_date_added = (
    pd.DataFrame({'Date_Added': df['date_added'].unique()})
    .sort_values(by='Date_Added')
    .reset_index(drop=True)
)
df_date_added

Unnamed: 0,Date_Added
0,1900-01-01
1,2008-01-01
2,2008-02-04
3,2009-05-05
4,2009-11-18
...,...
1508,2021-01-12
1509,2021-01-13
1510,2021-01-14
1511,2021-01-15


In [24]:
# Break each calendar date into its year, month and day components
added_on = df_date_added['Date_Added'].dt
df_date_added['Year'], df_date_added['Month'], df_date_added['Day'] = (
    added_on.year,
    added_on.month,
    added_on.day,
)
df_date_added

Unnamed: 0,Date_Added,Year,Month,Day
0,1900-01-01,1900,1,1
1,2008-01-01,2008,1,1
2,2008-02-04,2008,2,4
3,2009-05-05,2009,5,5
4,2009-11-18,2009,11,18
...,...,...,...,...
1508,2021-01-12,2021,1,12
1509,2021-01-13,2021,1,13
1510,2021-01-14,2021,1,14
1511,2021-01-15,2021,1,15


## Step 6 - Use lookups to create a 'fact' df

In [25]:
# Replace the descriptive columns with their ids from the label -> id lookups.
# Series.map turns any value that is missing from the lookup into NaN, so
# running this cell a second time would push the already-encoded integers
# through the label lookups and blank out every column. The dtype guard skips
# columns that are already encoded, which makes the cell safe to re-run.
column_lookups = {
    'type': lup_type,
    'country': lup_country,
    'director': lup_director,
    'cast': lup_cast,
}
for column, lookup in column_lookups.items():
    if not pd.api.types.is_integer_dtype(df[column]):
        df[column] = df[column].map(lookup)
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,genres
0,s1,0,3%,0,0,0,2020-08-14,2020,TV-MA,4,"International TV Shows, TV Dramas, TV Sci-Fi &..."
1,s10,1,1920,1,1,1,2017-12-15,2008,TV-MA,143,"Horror Movies, International Movies, Thrillers"
2,s100,1,3 Heroines,2,2,2,2019-01-05,2016,TV-PG,124,"Dramas, International Movies, Sports Movies"
3,s1000,1,Blue Mountain State: The Rise of Thadland,3,3,3,2016-03-01,2016,R,90,Comedies
4,s1001,0,Blue Planet II,0,4,4,2018-12-03,2017,TV-G,1,"British TV Shows, Docuseries, Science & Nature TV"
...,...,...,...,...,...,...,...,...,...,...,...
7782,s995,0,Blown Away,0,9,7,2019-07-12,2019,TV-14,1,"International TV Shows, Reality TV"
7783,s996,0,Blue Exorcist,0,6828,8,2020-09-01,2017,TV-MA,2,"Anime Series, International TV Shows"
7784,s997,1,Blue Is the Warmest Color,4048,6829,681,2016-08-26,2013,NC-17,180,"Dramas, Independent Movies, International Movies"
7785,s998,1,Blue Jasmine,4049,6830,3,2019-03-08,2013,PG-13,98,"Comedies, Dramas, Independent Movies"


## Step 7 - Create fields based on conditionals

In [27]:
# Split 'duration' into two measures: it is stored as 'seasons' for TV shows
# and as 'minutes' for movies; the measure that does not apply is set to 0.
# The type ids are read from lup_type instead of being hard-coded as 0/1,
# because genLUPs numbers values in the order they are first encountered, so
# the ids depend on which type happens to come first in the data.
tv_show_id = lup_type['TV Show']
movie_id = lup_type['Movie']
df['seasons'] = np.where(df['type'] == tv_show_id, df['duration'], 0)
df['minutes'] = np.where(df['type'] == movie_id, df['duration'], 0)
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,genres,seasons,minutes
0,s1,0,3%,0,0,0,2020-08-14,2020,TV-MA,4,"International TV Shows, TV Dramas, TV Sci-Fi &...",4,0
1,s10,1,1920,1,1,1,2017-12-15,2008,TV-MA,143,"Horror Movies, International Movies, Thrillers",0,143
2,s100,1,3 Heroines,2,2,2,2019-01-05,2016,TV-PG,124,"Dramas, International Movies, Sports Movies",0,124
3,s1000,1,Blue Mountain State: The Rise of Thadland,3,3,3,2016-03-01,2016,R,90,Comedies,0,90
4,s1001,0,Blue Planet II,0,4,4,2018-12-03,2017,TV-G,1,"British TV Shows, Docuseries, Science & Nature TV",1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7782,s995,0,Blown Away,0,9,7,2019-07-12,2019,TV-14,1,"International TV Shows, Reality TV",1,0
7783,s996,0,Blue Exorcist,0,6828,8,2020-09-01,2017,TV-MA,2,"Anime Series, International TV Shows",2,0
7784,s997,1,Blue Is the Warmest Color,4048,6829,681,2016-08-26,2013,NC-17,180,"Dramas, Independent Movies, International Movies",0,180
7785,s998,1,Blue Jasmine,4049,6830,3,2019-03-08,2013,PG-13,98,"Comedies, Dramas, Independent Movies",0,98


## Step 8 - Load the dataframes to tables

In [28]:
# convert dictionaries to dataframes
def _to_dimension(id_to_label, label_column):
    """Return a two-column frame ('Id', label_column) with one row per dictionary entry."""
    return pd.DataFrame(list(id_to_label.items()), columns=['Id', label_column])


df_type = _to_dimension(dict_type, 'Type')
df_country = _to_dimension(dict_country, 'Country')
df_director = _to_dimension(dict_director, 'Director')
df_cast = _to_dimension(dict_cast, 'Cast')

In [29]:
# load data frame to db tables
# Dimension tables are written first, then the fact table; each table is
# replaced if it already exists, so the cell can be re-run.
tables = {
    'Calendar': df_date_added,
    'Type': df_type,
    'Country': df_country,
    'Director': df_director,
    'Cast': df_cast,
    'fListings': df,
}

# NOTE(review): sqlite3.connect does not create missing directories, so
# './data' must already exist - confirm for fresh checkouts.
conn = sqlite3.connect('./data/Netflix.db')
try:
    for table_name, frame in tables.items():
        frame.to_sql(table_name, conn, if_exists='replace', index=False)
finally:
    # Close the connection even if one of the writes raises, so the
    # database file is not left open by the kernel.
    conn.close()