# ETL Netflix

In [7]:
import os

import pandas as pd
import sqlalchemy as db
from sqlalchemy import text

In [11]:
# Database engine used to connect to MySQL (the source database).
# The engine stores the connection configuration as a connection URL.
# The URL can be overridden with the NETFLIX_SOURCE_DB_URL environment
# variable so real credentials do not have to live in the notebook; the
# fallback is the original local development URL.
engine = db.create_engine(
    os.environ.get(
        "NETFLIX_SOURCE_DB_URL",
        "mysql://root:root@127.0.0.1:3310/db_movies_netflix_transact",
    )
)

conn = engine.connect()

In [13]:
# SQL that flattens each movie with its participants (person name and role)
# and its genres (stored in the `gender` table) through inner joins.
# Because every join is an INNER JOIN, a movie without a participant or
# without a genre row is not returned, and a movie yields one row per
# participant/genre combination.
query = '''
SELECT 
    movie.movieID as movieID, movie.movieTitle as title, movie.releaseDate as releaseDate, 
    gender.name as gender , person.name as participantName, participant.participantRole as roleparticipant 
FROM movie 
INNER JOIN participant 
    ON movie.movieID=participant.movieID
INNER JOIN person
    ON person.personID = participant.personID
INNER JOIN movie_gender 
    ON movie.movieID = movie_gender.movieID
INNER JOIN gender 
    ON movie_gender.genderID = gender.genderID
'''

In [None]:
# Fetch the movie rows through the open connection and cast the
# movieID column to an integer in the same expression.
movies_data = pd.read_sql(query, con=conn).astype({'movieID': 'int'})

movies_data

Unnamed: 0,movieID,title,releaseDate,gender,participantName,roleparticipant
0,80192187,Triple Frontier,2019-04-12,Action,Joseph Chavez Pineda,Actor
1,80210920,The Mother,2023-01-05,Drama,Maria Alejandra Navarro,Actor
2,81157374,Run,2021-05-21,Adventure,aria Lopez Gutierrez,Director


In [17]:
# Second source: the awards CSV file.
# Cast movieID to an integer and rename the "Aware" column to "Awards".
movies_awards = (
    pd.read_csv('./data/Awards_movie.csv')
    .astype({'movieID': 'int'})
    .rename(columns={"Aware": "Awards"})
)

movies_awards

Unnamed: 0,movieID,IdAward,Awards
0,80210920,0,Oscar
1,81157374,1,Grammy
2,80192187,2,Oscar


In [18]:
# Cruzar las movies con Awards
movie_data = pd.merge(movies_data, 
                      movies_awards,
                      left_on= 'movieID',
                      right_on= 'movieID'
)

movie_data

Unnamed: 0,movieID,title,releaseDate,gender,participantName,roleparticipant,IdAward,Awards
0,80192187,Triple Frontier,2019-04-12,Action,Joseph Chavez Pineda,Actor,2,Oscar
1,80210920,The Mother,2023-01-05,Drama,Maria Alejandra Navarro,Actor,0,Oscar
2,81157374,Run,2021-05-21,Adventure,aria Lopez Gutierrez,Director,1,Grammy


In [19]:
# Database engine used to connect to the MySQL data warehouse (dw_netflix).
# The engine stores the connection configuration as a connection URL.
# The URL can be overridden with the NETFLIX_DW_DB_URL environment variable
# so real credentials do not have to live in the notebook; the fallback is
# the original local development URL.
engine_dw = db.create_engine(
    os.environ.get(
        "NETFLIX_DW_DB_URL",
        "mysql://root:root@127.0.0.1:3310/dw_netflix",
    )
)

conn_dw = engine_dw.connect()

In [24]:
# Rename the release-date and award columns ahead of the dimMovie load.
movie_data = movie_data.rename(
    columns={
        "releaseDate": "releaseMovie",
        "Awards": "AwardMovie",
    }
)

In [26]:
# Discard the award identifier column; only the award name is kept.
movie_data = movie_data.drop('IdAward', axis=1)
movie_data

Unnamed: 0,movieID,title,releaseMovie,gender,participantName,roleparticipant,AwardMovie
0,80192187,Triple Frontier,2019-04-12,Action,Joseph Chavez Pineda,Actor,Oscar
1,80210920,The Mother,2023-01-05,Drama,Maria Alejandra Navarro,Actor,Oscar
2,81157374,Run,2021-05-21,Adventure,aria Lopez Gutierrez,Director,Grammy


In [27]:
movie_data.to_sql('dimMovie', conn, if_exists='append', index=False)

3

In [28]:
moview_load = pd.read_sql('dimMovie', con=conn)
moview_load

Unnamed: 0,movieID,title,releaseMovie,gender,participantName,roleparticipant,AwardMovie
0,80192187,Triple Frontier,2019-04-12,Action,Joseph Chavez Pineda,Actor,Oscar
1,80210920,The Mother,2023-01-05,Drama,Maria Alejandra Navarro,Actor,Oscar
2,81157374,Run,2021-05-21,Adventure,aria Lopez Gutierrez,Director,Grammy
