### Carga de la tabla results_movie_genre_language en la capa Gold

#### Paso 0 - Actualizando las variables globales y las funciones comunes.

In [0]:
# Notebook parameter: the file date to process, exposed as the text widget
# "p_file_date" (default "2024-12-30").
dbutils.widgets.text("p_file_date", "2024-12-30")
# v_file_date is used later in this notebook to filter the Silver tables by
# file_date and as the value of the created_date column.
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

%md
#### Paso 1 - Leyendo todas las tablas necesarias.

In [0]:
# Load the Silver "movies" Delta table and keep only the rows for this run:
# movies released in 2000 or later that belong to the file date being processed.
# The file date is compared as a column literal instead of being interpolated
# into a SQL string, so a quote or other special character in the widget value
# cannot change the filter expression.
movies_silver_df = spark.read.format("delta").load(f"{silver_folder_path}/movies")

movies_20XX_df = (
    movies_silver_df
    .filter(movies_silver_df["year_release_date"] >= 2000)
    .filter(movies_silver_df["file_date"] == v_file_date)
)

In [0]:
# Row count of the filtered movies DataFrame, shown as the cell output.
# Note: count() is a Spark action, so it executes the read and filters above.
movies_20XX_df.count()

In [0]:
# Load the Silver "movies_languages" Delta table, restricted to the file date
# being processed.
# The file date is compared as a column literal instead of being interpolated
# into a SQL string, so a quote or other special character in the widget value
# cannot change the filter expression.
movies_languages_silver_df = spark.read.format("delta").load(f"{silver_folder_path}/movies_languages")

movies_lang_df = movies_languages_silver_df.filter(
    movies_languages_silver_df["file_date"] == v_file_date
)

In [0]:
# Languages table from the Silver layer, stored in Delta format.
# It is read in full: no file_date filter is applied here.
lang_df = spark.read.load(f"{silver_folder_path}/languages", format="delta")

In [0]:
# Load the Silver "movie_genre" Delta table, restricted to the file date being
# processed.
# The file date is compared as a column literal instead of being interpolated
# into a SQL string, so a quote or other special character in the widget value
# cannot change the filter expression.
movie_genre_silver_df = spark.read.format("delta").load(f"{silver_folder_path}/movie_genre")

movie_genre_df = movie_genre_silver_df.filter(
    movie_genre_silver_df["file_date"] == v_file_date
)

In [0]:
# Genre table from the Silver layer, stored in Delta format.
# It is read in full: no file_date filter is applied here.
genre_df = spark.read.load(f"{silver_folder_path}/genre", format="delta")

#### Paso 2 - Generando el DataFrame con todos los requisitos

- Realiza un **inner join** entre todas las tablas  
- Selecciona todos los campos necesarios  
- Elimina todas las filas duplicadas  
- Ordena por **fecha de lanzamiento** en orden descendente  
- Agrega la columna **created_date** con la fecha del archivo procesado (`v_file_date`)

In [0]:
# Build the result set by inner-joining the filtered movies with their genres
# and languages, keeping only the columns needed downstream, removing duplicate
# rows and ordering by release date, newest first.
results_movie_genre_lang_df = (
    movies_20XX_df
    # movies -> movie/genre bridge -> genre
    .join(movie_genre_df,
          on=movies_20XX_df["movie_id"] == movie_genre_df["movie_id"],
          how="inner")
    .join(genre_df,
          on=movie_genre_df["genre_id"] == genre_df["genre_id"],
          how="inner")
    # movies -> movie/language bridge -> language
    .join(movies_lang_df,
          on=movies_20XX_df["movie_id"] == movies_lang_df["movie_id"],
          how="inner")
    .join(lang_df,
          on=movies_lang_df["language_id"] == lang_df["language_id"],
          how="inner")
    # Columns are taken from explicit source DataFrames because movie_id,
    # genre_id and language_id each exist on both sides of a join.
    .select(
        movies_20XX_df["movie_id"],
        movies_lang_df["language_id"],
        movie_genre_df["genre_id"],
        movies_20XX_df["title"],
        movies_20XX_df["duration_time"],
        movies_20XX_df["release_date"],
        movies_20XX_df["vote_average"],
        lang_df["language_name"],
        genre_df["genre_name"],
    )
    .distinct()
    .orderBy(movies_20XX_df["release_date"].desc())
)

In [0]:
from pyspark.sql.functions import lit

# Stamp every row with the file date being processed. created_date carries the
# widget value v_file_date as a literal (the add_ingestion_date helper is not
# used here); the same column name is passed to merge_delta_lake when writing.
results_movie_genre_lang_final_df = (
    results_movie_genre_lang_df
    .withColumn("created_date", lit(v_file_date))
)

#### Paso 3 - Generando el archivo final en la carpeta Gold

In [0]:
#drop_partition_if_exists(results_movie_genre_lang_final_df, "movie_gold", "results_movie_genre_language", "created_date")

In [0]:
# Write the result to the Gold layer (movie_gold / results_movie_genre_language).
# Original author's note: if the table does not exist it is created with the
# data; otherwise the data is merged into the existing table.
# merge_delta_lake is not defined in this notebook; presumably it is loaded by
# the %run of ../includes/common_functions (TODO confirm).
merge_delta_lake(
    results_movie_genre_lang_final_df,
    "movie_gold",
    "results_movie_genre_language",
    gold_folder_path,
    # Match condition: target and source rows are paired on
    # movie_id + language_id + genre_id.
    "tgt.movie_id = src.movie_id"
    " AND tgt.language_id = src.language_id"
    " AND tgt.genre_id = src.genre_id",
    "created_date",
)

In [0]:
# Check the load: number of rows per created_date in the Gold table.
# The backslashes sit inside the string literal, so the SQL is one single-line
# string and the leading spaces of the continued lines are part of its text.
spark.sql("SELECT created_date, count(*) \
           FROM movie_gold.results_movie_genre_language \
           GROUP BY created_date").display() 
