In [1]:
import datetime
import os
import warnings
from urllib.parse import quote_plus

# Installed before the third-party imports so their import-time warnings stay suppressed too.
warnings.filterwarnings("ignore")

import pandas as pd
from sqlalchemy import create_engine

# Data Extract

### Import the CSV file from the Resources folder

In [2]:
# Load the raw Netflix titles export and preview the first rows.
netflix_csv_path = 'Resources/netflix_titles.csv'
netflix_all = pd.read_csv(netflix_csv_path)
netflix_all.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [3]:
# (rows, columns) of the raw frame as loaded from the CSV.
netflix_all.shape

(7787, 12)

# Data Transformation 

Netflix Data File Transformation:
1. Checked the data types and converted "date_added" to datetime format
2. Selected only the movies from all show types using the `loc` indexer
3. Renamed, organized, and split the data into two tables: Netflix Movie Basic info and Netflix Movie Date info
4. Checked and handled missing values in each new table: replaced missing "country" values in Netflix Movie Basic with "other" and missing "rating" values with "NR"

In [4]:
# Inspect the column dtypes before any conversion.
netflix_all.dtypes

show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object

In [5]:
# Parse the "date_added" strings into datetime values.
netflix_all = netflix_all.assign(date_added=pd.to_datetime(netflix_all['date_added']))

In [6]:
# Count the missing values in every column of the full data set.
netflix_all.isna().sum()

show_id            0
type               0
title              0
director        2389
cast             718
country          507
date_added        10
release_year       0
rating             7
duration           0
listed_in          0
description        0
dtype: int64

In [7]:
# Keep only the rows whose type is 'Movie', then report the resulting size.
is_movie = netflix_all['type'] == 'Movie'
netflix_movie = netflix_all.loc[is_movie]
netflix_movie.shape

(5377, 12)

In [8]:
# Give the id column a source-specific name.
column_renames = {'show_id': 'netflix_show_id'}
netflix_movie = netflix_movie.rename(columns=column_renames)

In [9]:
# Basic-info table: id, title, country, rating, duration and description.
# .copy() makes this an independent frame, so the later column assignments
# (filling missing country/rating) modify it directly instead of raising
# SettingWithCopyWarning on a frame derived from netflix_movie.
netflix_movie_basic = netflix_movie[['netflix_show_id', 'title', 'country', 'rating', 'duration', 'description']].copy()
# Count missing values per column to see what needs to be filled.
netflix_movie_basic.isnull().sum()

netflix_show_id      0
title                0
country            230
rating               5
duration             0
description          0
dtype: int64

In [10]:
# Frequency of each rating label among the movies.
netflix_movie_basic['rating'].value_counts()

TV-MA       1845
TV-14       1272
R            663
TV-PG        505
PG-13        386
PG           247
TV-Y         117
TV-G         111
TV-Y7         95
NR            79
G             39
UR             5
TV-Y7-FV       5
NC-17          3
Name: rating, dtype: int64

In [11]:
# Replace missing values: country -> "other", rating -> "NR".
# A single fillna with a per-column mapping returns a new frame, so no values
# are assigned into a frame derived from netflix_movie (which would raise
# SettingWithCopyWarning) and the cell can be re-run safely.
netflix_movie_basic = netflix_movie_basic.fillna({'country': 'other', 'rating': 'NR'})

In [12]:
# Re-count missing values per column after the fill step.
netflix_movie_basic.isna().sum()

netflix_show_id    0
title              0
country            0
rating             0
duration           0
description        0
dtype: int64

In [13]:
# Use the show id as the index of the basic-info table and preview it.
netflix_movie_basic = netflix_movie_basic.set_index("netflix_show_id")
netflix_movie_basic.head()

Unnamed: 0_level_0,title,country,rating,duration,description
netflix_show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
s2,7:19,Mexico,TV-MA,93 min,After a devastating earthquake hits Mexico Cit...
s3,23:59,Singapore,R,78 min,"When an army recruit is found dead, his fellow..."
s4,9,United States,PG-13,80 min,"In a postapocalyptic world, rag-doll robots hi..."
s5,21,United States,PG-13,123 min,A brilliant group of students become card-coun...
s7,122,Egypt,TV-MA,95 min,"After an awful accident, a couple admitted to ..."


In [14]:
# Date table: show id with the date it was added and its release year.
date_columns = ['netflix_show_id', 'date_added', 'release_year']
netflix_movie_date = netflix_movie[date_columns]
# Count missing values per column in the date table.
netflix_movie_date.isnull().sum()

netflix_show_id    0
date_added         0
release_year       0
dtype: int64

In [15]:
# Use the show id as the index of the date table and preview it.
netflix_movie_date = netflix_movie_date.set_index("netflix_show_id")
netflix_movie_date.head()

Unnamed: 0_level_0,date_added,release_year
netflix_show_id,Unnamed: 1_level_1,Unnamed: 2_level_1
s2,2016-12-23,2016
s3,2018-12-20,2011
s4,2017-11-16,2009
s5,2020-01-01,2008
s7,2020-06-01,2019


# Load Data to SQL

In [16]:
# Create database connection.
# Settings are read from environment variables so the database password is not
# stored in the notebook. PGPASSWORD must be set; the other settings fall back
# to the values this notebook previously hard-coded.
db_user = os.environ.get("PGUSER", "postgres")
db_password = os.environ.get("PGPASSWORD")
if db_password is None:
    raise RuntimeError("Set the PGPASSWORD environment variable before running this cell.")
db_host = os.environ.get("PGHOST", "localhost")
db_port = os.environ.get("PGPORT", "5432")
db_name = os.environ.get("PGDATABASE", "Netflix_IMDB")
# quote_plus escapes characters such as '@' or '/' that would otherwise break the URL.
connection_string = f"{quote_plus(db_user)}:{quote_plus(db_password)}@{db_host}:{db_port}/{db_name}"
engine = create_engine(f'postgresql://{connection_string}')

In [17]:
# Write the basic-info table to the database, replacing any existing table
# of the same name; the index is written as a column.
netflix_movie_basic.to_sql(
    name='netflix_movie_basic',
    con=engine,
    if_exists='replace',
    index=True,
)

In [18]:
# Write the date table to the database, replacing any existing table
# of the same name; the index is written as a column.
netflix_movie_date.to_sql(
    name='netflix_movie_date',
    con=engine,
    if_exists='replace',
    index=True,
)