In [1]:
import datetime
import os
import warnings
from urllib.parse import quote_plus

# Silence library warnings before the third-party imports below are loaded.
warnings.filterwarnings("ignore")

import pandas as pd
from sqlalchemy import create_engine

In [2]:
# Read the Rotten Tomatoes movies CSV into a DataFrame and preview two rows.
rotten_tomatoes_movies = pd.read_csv(
    filepath_or_buffer='Resources/rotten_tomatoes_movies.csv',
)
rotten_tomatoes_movies.head(n=2)

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,...,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,...,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
1,m/0878835,Please Give,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,...,Sony Pictures Classics,Certified-Fresh,87.0,142.0,Upright,64.0,11574.0,44,123,19


Rotten Tomatoes Movies Data File Transformation:

1. Checked the data types
2. Kept columns for title, rating and counts only
3. Renamed columns for label consistency between datasets
4. Checked missing values and dropped NA rows

In [3]:
# Relabel the link column as an id and shorten the title column name so the
# labels line up with the other datasets used in this project.
rotten_tomatoes_movies = rotten_tomatoes_movies.rename(
    columns={
        'rotten_tomatoes_link': 'rotten_tomatoes_id',
        'movie_title': 'title',
    }
)

In [4]:
# Dimensions of the frame as (rows, columns).
rotten_tomatoes_movies.shape

(17712, 22)

In [5]:
# Per-column dtypes, to see which fields were parsed as numeric and which as object.
rotten_tomatoes_movies.dtypes

rotten_tomatoes_id                   object
title                                object
movie_info                           object
critics_consensus                    object
content_rating                       object
genres                               object
directors                            object
authors                              object
actors                               object
original_release_date                object
streaming_release_date               object
runtime                             float64
production_company                   object
tomatometer_status                   object
tomatometer_rating                  float64
tomatometer_count                   float64
audience_status                      object
audience_rating                     float64
audience_count                      float64
tomatometer_top_critics_count         int64
tomatometer_fresh_critics_count       int64
tomatometer_rotten_critics_count      int64
dtype: object

In [6]:
# Number of missing values in each column before any cleaning.
rotten_tomatoes_movies.isna().sum()

rotten_tomatoes_id                     0
title                                  0
movie_info                           321
critics_consensus                   8578
content_rating                         0
genres                                19
directors                            194
authors                             1542
actors                               352
original_release_date               1166
streaming_release_date               384
runtime                              314
production_company                   499
tomatometer_status                    44
tomatometer_rating                    44
tomatometer_count                     44
audience_status                      448
audience_rating                      296
audience_count                       297
tomatometer_top_critics_count          0
tomatometer_fresh_critics_count        0
tomatometer_rotten_critics_count       0
dtype: int64

In [7]:
# Drop rows that lack a critic rating or an audience count, then label any
# remaining missing audience status as 'other'.
#
# This is written as a single chained expression: assigning a column onto the
# frame returned by dropna() is the chained-assignment pattern that can raise
# pandas' SettingWithCopyWarning, which would go unnoticed here because
# warnings are globally silenced in the imports cell.
rotten_tomatoes_movies_clean = (
    rotten_tomatoes_movies
    .dropna(subset=['tomatometer_rating', 'audience_count'])
    .assign(audience_status=lambda frame: frame['audience_status'].fillna('other'))
)

In [8]:
# Re-count missing values per column to confirm the rating/count fields are now complete.
rotten_tomatoes_movies_clean.isna().sum()

rotten_tomatoes_id                     0
title                                  0
movie_info                           234
critics_consensus                   8293
content_rating                         0
genres                                19
directors                            178
authors                             1451
actors                               310
original_release_date               1036
streaming_release_date               302
runtime                              238
production_company                   443
tomatometer_status                     0
tomatometer_rating                     0
tomatometer_count                      0
audience_status                        0
audience_rating                        0
audience_count                         0
tomatometer_top_critics_count          0
tomatometer_fresh_critics_count        0
tomatometer_rotten_critics_count       0
dtype: int64

In [9]:
# Keep only the id, title and rating/count fields, and key the result by
# rotten_tomatoes_id. set_index is chained (rather than called with
# inplace=True on the freshly sliced frame) so the cell builds its result in
# one expression and never mutates an intermediate object.
rotten_tomatoes_movies_rating = rotten_tomatoes_movies_clean[[
    'rotten_tomatoes_id',
    'title',
    'tomatometer_status',
    'tomatometer_rating',
    'tomatometer_count',
    'audience_status',
    'audience_rating',
    'audience_count',
]].set_index("rotten_tomatoes_id")
rotten_tomatoes_movies_rating.head(2)

Unnamed: 0_level_0,title,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count
rotten_tomatoes_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
m/0814255,Percy Jackson & the Olympians: The Lightning T...,Rotten,49.0,149.0,Spilled,53.0,254421.0
m/0878835,Please Give,Certified-Fresh,87.0,142.0,Upright,64.0,11574.0


In [10]:
# Build the PostgreSQL engine from environment variables so the database
# password is never stored in the notebook (or in version control).
#
#   PGUSER      database user      (default: postgres)
#   PGPASSWORD  database password  (required)
#   PGHOST      server host        (default: localhost)
#   PGPORT      server port        (default: 5432)
#   PGDATABASE  database name      (default: Netflix_IMDB)
db_password = os.environ.get("PGPASSWORD")
if db_password is None:
    raise RuntimeError(
        "Set the PGPASSWORD environment variable before running this cell."
    )

db_user = os.environ.get("PGUSER", "postgres")
db_host = os.environ.get("PGHOST", "localhost")
db_port = os.environ.get("PGPORT", "5432")
db_name = os.environ.get("PGDATABASE", "Netflix_IMDB")

# quote_plus escapes characters such as '@' or '/' that would otherwise be
# parsed as URL delimiters if they appear in the user name or password.
connection_string = (
    f"{quote_plus(db_user)}:{quote_plus(db_password)}@{db_host}:{db_port}/{db_name}"
)
engine = create_engine(f'postgresql://{connection_string}')

In [11]:
# Write the ratings frame to the database behind `engine`. An existing table
# with the same name is replaced, and the frame's index is stored as a column.
rotten_tomatoes_movies_rating.to_sql(
    name='rotten_tomatoes_movies_rating',
    con=engine,
    if_exists='replace',
    index=True,
)