In [1]:
import pandas as pd
import numpy as np
import ast

# Load the cleaned top-movies dataset; every table built below is derived from it.
df = pd.read_csv(filepath_or_buffer='./data/top_movies_cleaned.csv')

Create a DataFrame for the table in SQL with the cast id, name and gender:

In [3]:
# Load the two raw person/gender lookups (cast file first, director file second).
df_act, df_dir = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/raw/API_cast_gender.csv', './data/raw/API_dir_gender.csv')
)

In [3]:
# Recode the Spanish gender labels ('Hombre' / 'Mujer') as single-letter codes ('M' / 'F').
# Any other value in the column is left as it is.
df_act['person_gender'] = df_act['person_gender'].replace(
    to_replace={'Hombre': 'M', 'Mujer': 'F'},
)

In [4]:
# Swap the generic person_* column names for actor-specific ones.
df_act = df_act.rename(
    columns={
        'person_id': 'actor_id',
        'person_name': 'actor_name',
        'person_gender': 'actor_gender',
    }
)

In [5]:
# Write the actors table to disk, leaving out the pandas index.
df_act.to_csv(path_or_buf='./csv/actors.csv', index=False)


Create a DataFrame for the table in SQL with the director id, name and gender:

In [4]:
# Recode the Spanish gender labels ('Hombre' / 'Mujer') as single-letter codes ('M' / 'F').
# Any other value in the column is left as it is.
df_dir['person_gender'] = df_dir['person_gender'].replace(
    to_replace={'Hombre': 'M', 'Mujer': 'F'},
)

In [5]:
# Swap the generic person_* column names for director-specific ones.
df_dir = df_dir.rename(
    columns={
        'person_id': 'director_id',
        'person_name': 'director_name',
        'person_gender': 'director_gender',
    }
)

In [6]:
# Any gender cell holding the literal text 'Lasse Hallström' is overwritten with 'M'.
# NOTE(review): presumably this patches a malformed row in the raw file where the name
# landed in the gender column — confirm against the source data.
df_dir['director_gender'] = df_dir['director_gender'].replace(
    to_replace='Lasse Hallström',
    value='M',
)

In [7]:
# Directors with a missing gender value get the placeholder code 'N'.
df_dir['director_gender'] = df_dir['director_gender'].fillna(value='N')

In [8]:

# Force director_id to a numeric dtype; entries that cannot be parsed become NaN.
df_dir['director_id'] = pd.to_numeric(arg=df_dir['director_id'], errors='coerce')

In [9]:
# Write the directors table to disk, leaving out the pandas index.
df_dir.to_csv(path_or_buf='./csv/directors.csv', index=False)

Create the DataFrame for the genres table, with the genre id and genre name:

In [30]:
# String cells are parsed with ast.literal_eval (presumably list literals read back from
# the CSV — verify); cells that are not strings pass through unchanged.
df['genre'] = df['genre'].apply(
    lambda cell: cell if not isinstance(cell, str) else ast.literal_eval(cell)
)

In [31]:
# Flatten the per-movie genre collections into one long list.
all_genres = []
for movie_genres in df['genre']:
    all_genres.extend(movie_genres)

# Keep each genre once, in order of first appearance.
unique_genres = pd.unique(np.array(all_genres))

# One row per distinct genre.
genres_df = pd.DataFrame(data=unique_genres, columns=['genre_name'])

In [32]:
# Number the genres 1..N in their current row order to serve as genre_id.
genres_df = genres_df.assign(genre_id=range(1, len(genres_df) + 1))

In [34]:
# Write the genres table to disk, leaving out the pandas index.
genres_df.to_csv(path_or_buf='./csv/genres.csv', index=False)

Create the DataFrame for the 'movies' table with the basic information for each movie: 'movie_id', 'title', 'rating', 'meta_score', 'year', 'revenue', 'director_id'

In [35]:
# Take an independent copy of the movie-level columns so later edits to movies_df
# never write through to df.
movies_df = df.loc[
    :, ['title', 'rating', 'meta_score', 'year', 'revenue', 'director']
].copy()

In [36]:
# movie_id is the frame's index shifted up by one.
movies_df = movies_df.assign(movie_id=movies_df.index + 1)

In [37]:
# Translate the director's name to director_id.
# The lookup is first reduced to one row per director_name: a left merge emits one output
# row for every matching lookup row, so a name that appears more than once in df_dir
# would otherwise duplicate the movie row (and with it the movie_id).
movies_df = movies_df.merge(
    df_dir[['director_id', 'director_name']].drop_duplicates(subset='director_name'),
    left_on='director',
    right_on='director_name',
    how='left',
)

In [38]:
# Keep only the columns that go into the movies table, in their final order.
movies_df = movies_df.loc[
    :, ['movie_id', 'title', 'rating', 'meta_score', 'year', 'revenue', 'director_id']
]

In [40]:
# Write the movies table to disk, leaving out the pandas index.
movies_df.to_csv(path_or_buf='./csv/movies.csv', index=False)

Create the DataFrame for the cast table with 'movie_id' and 'actor_id':

In [41]:
# String cells are parsed with ast.literal_eval (presumably list literals read back from
# the CSV — verify); cells that are not strings pass through unchanged.
df['cast'] = df['cast'].apply(
    lambda cell: cell if not isinstance(cell, str) else ast.literal_eval(cell)
)

In [42]:
# Attach movie_id straight from df's index (movie_id was defined as index + 1 when
# movies_df was built from df). Joining back on 'title' would pair each row with every
# movie sharing that title, so one film's cast could be filed under another film's id.
cast_df = df.assign(movie_id=df.index + 1)

In [43]:
# One row per (movie, cast entry): expand each cast collection and name the result actor_name.
cast_df = (
    cast_df.loc[:, ['movie_id', 'cast']]
    .explode(column='cast')
    .rename(columns={'cast': 'actor_name'})
)

In [44]:
# Bring in the actor columns by matching on actor_name; names with no match keep NaN there.
cast_df = pd.merge(cast_df, df_act, how='left', on='actor_name')

In [45]:
# The cast table only needs the two id columns.
cast_df = cast_df.loc[:, ['movie_id', 'actor_id']]

In [48]:
# Keep the first occurrence of every (movie_id, actor_id) pair and discard repeats.
cast_df = cast_df[~cast_df.duplicated(subset=['movie_id', 'actor_id'])]

In [49]:
# Write the cast table to disk, leaving out the pandas index.
cast_df.to_csv(path_or_buf='./csv/cast.csv', index=False)

Create a DataFrame which relates 'movie_id' to 'genre_id':

In [50]:
# Make sure the genre column holds parsed objects; cells that are already non-strings
# pass through unchanged, so running this again is harmless.
df['genre'] = df['genre'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Attach movie_id straight from df's index (movie_id was defined as index + 1 when
# movies_df was built from df). Joining back on 'title' would pair each row with every
# movie sharing that title, so one film's genres could be filed under another film's id.
movies_genres_df = df.assign(movie_id=df.index + 1)

# One row per (movie, genre), then swap the genre name for its genre_id.
movies_genres_df = (
    movies_genres_df[['movie_id', 'genre']]
    .explode('genre')
    .rename(columns={'genre': 'genre_name'})
    .merge(genres_df, how='left', on='genre_name')
)

# The link table only needs the two id columns.
movies_genres_df = movies_genres_df[['movie_id', 'genre_id']]

In [51]:
# Keep the first occurrence of every (movie_id, genre_id) pair and discard repeats.
movies_genres_df = movies_genres_df[
    ~movies_genres_df.duplicated(subset=['movie_id', 'genre_id'])
]

In [52]:
# Write the movie-genre link table to disk, leaving out the pandas index.
movies_genres_df.to_csv(path_or_buf='./csv/movies_genres.csv', index=False)

# TO SQL

In [34]:
import pandas as pd
from sqlalchemy import create_engine
import pymysql

In [35]:
from urllib.parse import quote_plus

from dotenv import load_dotenv
import os

# Read variables from a local .env file into the process environment.
load_dotenv()

password = os.getenv('password')
if password is None:
    # Without this check the text "None" would be sent as the password and the failure
    # would only surface later as an opaque authentication error.
    raise RuntimeError("Environment variable 'password' is not set; add it to your .env file.")

# Connection parameters
bd = "top_movies"
# The password is URL-escaped so characters such as '@', ':' or '/' cannot be mistaken
# for URL delimiters when the connection string is parsed.
connection_string = f'mysql+pymysql://root:{quote_plus(password)}@localhost/{bd}'
engine = create_engine(connection_string)

In [61]:
# Map each target table name to the DataFrame that fills it. Insertion order is the
# order in which the tables are written.
dataframes = dict(
    actors=df_act,
    directors=df_dir,
    genres=genres_df,
    movies=movies_df,
    cast=cast_df,
    movies_genres=movies_genres_df,
)

In [62]:
# Send each DataFrame to the database, appending to the table of the same name.
# The loop variable is deliberately not called `df`: that name holds the source movie
# DataFrame used throughout the notebook, and rebinding it here would make every earlier
# cell operate on the wrong frame if it were run again.
for table_name, table_df in dataframes.items():
    table_df.to_sql(table_name, con=engine, if_exists='append', index=False)
    print(f"Datos insertados en la tabla {table_name}")

Datos insertados en la tabla cast
Datos insertados en la tabla movies_genres
