## Ingestion del archivo "movie_genre.json"

#### Paso 0 - Actualización de variables, funciones y validaciones

In [0]:
# Declare the "p_environment" text widget (empty default) and read its current value.
# NOTE(review): presumably the label of the environment this run targets — confirm with the caller.
dbutils.widgets.text("p_environment", "")
v_environment = dbutils.widgets.get("p_environment")

In [0]:

# Declare the "p_file_date" text widget (default "2024-12-16") and read its current value.
# The value is used further down as the dated sub-folder of the source path and as the
# content of the "file_date" column.
dbutils.widgets.text("p_file_date", "2024-12-16")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Validate that the file this notebook is about to process exists.
# The check must target "movie_genre.json" (the file read in Paso 1), not "movie.csv".
valida_arquivo(bronze_folder_path, v_file_date, "movie_genre.json")

#### Paso 1 - Leer el archivo JSON usando "DataFrameReader" de Spark

In [0]:
#%fs
#ls /mnt/moviehistoryadilmor/bronze

In [0]:
#display(spark.read.option("header", True).json(f"{bronze_folder_path}/movie_genre.json"))

In [0]:
# Schema for movie_genre.json, kept in movie_genre_schema and passed to .schema() when reading
movie_genre_schema = "genreId INT, movieId INT"

In [0]:
# Load the movie_genre JSON for the requested file date, applying the explicit schema
movie_genre_df = (
    spark.read
    .schema(movie_genre_schema)
    .json(f"{bronze_folder_path}/{v_file_date}/movie_genre.json")
)

In [0]:
#display(movie_genre_df)

#### Paso 2 - Cambiar el nombre de las columnas y añadir "ingestion_date", "enviroment" y "file_date"

In [0]:
from pyspark.sql.functions import current_timestamp, lit

# Rename the source columns to snake_case and stamp every row with run metadata.
# add_ingestion_date is defined outside this notebook (pulled in via %run); presumably
# it adds the "ingestion_date" column selected in the next cell — confirm there.
# The "enviroment" value follows the p_environment widget; when the widget is empty
# the label falls back to "production", which was the value hard-coded before.
movies_genre_temp_df = (
    add_ingestion_date(movie_genre_df)
    .withColumnRenamed("movieId", "movie_id")
    .withColumnRenamed("genreId", "genre_id")
    .withColumn("enviroment", lit(v_environment or "production"))
    .withColumn("file_date", lit(v_file_date))
)

In [0]:
# Put the output columns in their final order
movies_genre_final_df = movies_genre_temp_df.select(
    "movie_id",
    "genre_id",
    "ingestion_date",
    "enviroment",
    "file_date",
)

In [0]:
#display(movies_genre_final_df)

#### Paso 3 - Escribir datos en el datalake mediante merge en Delta Lake (por "file_date")

In [0]:
#Elimina partição se ela já existir
#drop_partition_if_exists(movies_genre_final_df, "movie_silver", "movie_genre", "file_date")

In [0]:
# Option 1 - write to storage only
#movies_genre_final_df.write.mode("overwrite") \
#                     .partitionBy("movie_id") \
#                     .parquet(f"{silver_folder_path}/movie_genre/")

# Option 2 - without partition due to a limitation of the write API version
#movies_genre_final_df.write.mode("append") \
#                     .partitionBy("file_date") \
#                     .format("parquet") \
#                     .saveAsTable("movie_silver.movie_genre")

# Active path: if the table does not exist it is created with the data,
# otherwise the data is merged into the existing table
merge_delta_lake(
    movies_genre_final_df,
    "movie_silver",
    "movie_genre",
    silver_folder_path,
    "tgt.movie_id = src.movie_id AND tgt.genre_id = src.genre_id",
    "file_date",
)

In [0]:
#%fs
#ls /mnt/moviehistoryadilmor/silver/movie_genre

In [0]:
#display(spark.read.parquet(f"{silver_folder_path}/movie_genre"))
#spark.sql("select * from movie_silver.movie_genre").display()

# Sanity check after the write: show the row count per file_date in movie_silver.movie_genre.
# The SQL text is one string literal continued with backslashes, so its inner whitespace
# is part of the query string.
display(spark.sql("SELECT file_date, \
                          COUNT(1) \
                   FROM movie_silver.movie_genre \
                   GROUP BY file_date"
                   )
        )

In [0]:
# End the notebook run, returning the string "Success" as its exit value
dbutils.notebook.exit("Success")