In [2]:
import numpy as np 
import pandas as pd 

import datetime as dt
#from datetime import datetime


from sqlalchemy import create_engine

import json



### Extract CSVs into DataFrames

In [3]:
# Load the movie metadata CSV into a DataFrame.
movies_metadata_path = "Resources/movies_metadata.csv"
movies_metadata = pd.read_csv(movies_metadata_path)

In [4]:
# Preview the first rows of the raw metadata to sanity-check the load.
movies_metadata.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,10/30/1995,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,12/15/1995,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,12/22/1995,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,12/22/1995,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,2/10/1995,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [5]:
# List the column labels available in the raw metadata.
movies_metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

### Transform movies_metadata DataFrame

In [6]:
# Keep only the metadata columns needed downstream. The .copy() detaches the
# result from movies_metadata so later edits do not touch the raw frame.
movie_cols = [
    "id",
    "imdb_id",
    "genres",
    "budget",
    "title",
    "release_date",
    "revenue",
    "vote_average",
]
movie_transformed = movies_metadata.loc[:, movie_cols].copy()

In [7]:
# Rename the column headers. Every entry maps a label to itself, so the
# resulting frame has the same column names as before.
movie_column_names = {
    "id": "id",
    "imdb_id": "imdb_id",
    "title": "title",
    "genres": "genres",
    "release_date": "release_date",
    "budget": "budget",
    "revenue": "revenue",
    "vote_average": "vote_average",
}
movie_transformed = movie_transformed.rename(columns=movie_column_names)

In [8]:
# Series.astype returns a new Series; assign it back so the integer dtype is
# actually stored on the frame instead of being discarded.
movie_transformed["id"] = movie_transformed["id"].astype(int)

# Display the column to confirm the resulting dtype.
movie_transformed["id"]

0           862
1          8844
2         15602
3         31357
4         11862
          ...  
45458    439050
45459    111109
45460     67758
45461    227506
45462    461257
Name: id, Length: 45463, dtype: int64

In [9]:
# Keep only the first row seen for each movie id. The index is left as-is:
# the set_index call below is still disabled.
movie_transformed = movie_transformed.drop_duplicates(subset="id", keep="first")
#movie_transformed.set_index("id", inplace=True)

In [10]:
# Discard every row that has a missing value in any of the selected columns.
movie_transformed = movie_transformed.dropna(axis=0, how="any")

In [11]:
#movie_transformed[['tt', 'imdb_id']] =  movie_transformed['imdb_id'].str.split('tt', expand=True)

In [11]:
# The genres column holds strings written with single quotes, e.g.
# "[{'id': 16, 'name': 'Animation'}]". JSON requires double quotes, so swap
# every single quote for a double quote before parsing.
# regex=False forces a literal replacement; the default value of `regex`
# differs between pandas releases, so it is spelled out here.
# NOTE(review): a value containing an apostrophe would become invalid JSON
# after this replacement -- confirm no genre name contains one.
movie_transformed['genres'] = movie_transformed['genres'].str.replace("'", '"', regex=False)

In [12]:
# Parse each genres string into a Python list of dicts, in row order.
# Iterating the column directly avoids iterrows(), which builds an unused
# Series per row, and avoids a per-row label lookup into the column.
json_list = [json.loads(genres_text) for genres_text in movie_transformed['genres']]

In [13]:
# Swap the string genres column for the parsed lists held in json_list
# (one entry per row, in row order).
movie_transformed = movie_transformed.assign(genres=json_list)

In [14]:
# Sanity check: the row labelled 0 should now hold a list of dicts, so the
# name of its third genre can be read directly.
first_movie_genres = movie_transformed.loc[0, 'genres']
first_movie_genres[2]['name']

'Family'

In [15]:
# Preview the transformed movie frame after cleaning and genre parsing.
movie_transformed.head()

Unnamed: 0,id,imdb_id,genres,budget,title,release_date,revenue,vote_average
0,862,tt0114709,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",30000000,Toy Story,10/30/1995,373554033.0,7.7
1,8844,tt0113497,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",65000000,Jumanji,12/15/1995,262797249.0,6.9
2,15602,tt0113228,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",0,Grumpier Old Men,12/22/1995,0.0,6.5
3,31357,tt0114885,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",16000000,Waiting to Exhale,12/22/1995,81452156.0,6.1
4,11862,tt0113041,"[{'id': 35, 'name': 'Comedy'}]",0,Father of the Bride Part II,2/10/1995,76578911.0,5.7


In [16]:
# Load the ratings CSV into a DataFrame.
ratings_small_path = "Resources/ratings_small.csv"
ratings_small = pd.read_csv(ratings_small_path)

In [17]:
# Preview the first rows of the raw ratings to sanity-check the load.
ratings_small.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [18]:
# Load the links CSV into a DataFrame.
links_small_path = "Resources/links_small.csv"
links_small = pd.read_csv(links_small_path)

In [19]:
# Preview the first rows of the raw links table to sanity-check the load.
links_small.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


### Transform links_small DataFrame

In [20]:
# Select the id columns from links_small; .copy() keeps the raw frame
# isolated from later edits to links_transformed.
links_cols = [
    "movieId",
    "imdbId",
    "tmdbId",
]
links_transformed = links_small.loc[:, links_cols].copy()

In [21]:
# Rename the column headers: imdbId becomes imdb_id, while movieId and
# tmdbId map to themselves and keep their names.
links_column_names = {
    "movieId": "movieId",
    "imdbId": "imdb_id",
    "tmdbId": "tmdbId",
}
links_transformed = links_transformed.rename(columns=links_column_names)

In [22]:
# Preview the links frame after the column rename.
links_transformed.head()

Unnamed: 0,movieId,imdb_id,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [23]:
# DataFrame.dropna returns a new frame and leaves the original untouched, so
# assign the result back; otherwise rows with missing values are never removed.
ratings_small = ratings_small.dropna()

# Display the cleaned ratings.
ratings_small

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


### Transform ratings_small DataFrame

In [24]:
# Preview the ratings again before selecting columns (same call as the
# earlier preview cell).
ratings_small.head()


Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [25]:
# Keep only the movie id and the rating value; .copy() keeps ratings_small
# isolated from later edits to ratings_transformed.
ratings_cols = [
    "movieId",
    "rating",
]
ratings_transformed = ratings_small.loc[:, ratings_cols].copy()

In [26]:
# Rename the column headers. Both entries map a label to itself, so the
# column names are unchanged.
ratings_column_names = {
    "movieId": "movieId",
    "rating": "rating",
}
ratings_transformed = ratings_transformed.rename(columns=ratings_column_names)

In [27]:
# Preview the trimmed ratings frame.
ratings_transformed.head()

Unnamed: 0,movieId,rating
0,31,2.5
1,1029,3.0
2,1061,3.0
3,1129,2.0
4,1172,4.0


### Create database connection

### Load DataFrames into database