### Carga de la tabla result_group_movie_genre en la capa Gold

#### Paso 0 - Actualizando las variables globales y las funciones comunes.

In [0]:
# Notebook parameter: the file date to process. The widget is registered with
# a default value and then read back into v_file_date, which the later cells
# use to filter the Silver tables and to stamp created_date.
file_date_widget = "p_file_date"
dbutils.widgets.text(file_date_widget, "2024-12-30")
v_file_date = dbutils.widgets.get(file_date_widget)

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

%md
#### Paso 1 - Leyendo todas las tablas necesarias.

In [0]:
# Read the Silver "movies" Delta table, keeping only releases from 2010
# onwards that belong to the file date being processed.
movies_silver_path = f"{silver_folder_path}/movies"

movies_df = (
    spark.read.format("delta")
    .load(movies_silver_path)
    .filter("year_release_date >= 2010")
    .filter(f"file_date = '{v_file_date}'")
)

In [0]:
# Read the Silver "movie_genre" Delta table (the movie <-> genre link used by
# the join below), restricted to the file date being processed.
movie_genre_silver_path = f"{silver_folder_path}/movie_genre"

movie_genre_df = (
    spark.read.format("delta")
    .load(movie_genre_silver_path)
    .filter(f"file_date = '{v_file_date}'")
)

In [0]:
# Read the Silver "genre" Delta table in full; no file_date filter is applied
# to this table.
genre_reader = spark.read.format("delta")
genre_df = genre_reader.load(f"{silver_folder_path}/genre")

### Paso 2 - Generando el DataFrame con todos los requisitos

- Realiza un **inner join** entre todas las tablas  
- Selecciona todos los campos necesarios 
- Elimina todas las filas duplicadas 

In [0]:
# Inner-join movies -> movie_genre -> genre so that every row is one
# (movie, genre) pair carrying the movie's year, budget and revenue.
movie_genre_joined_df = (
    movies_df
    .join(movie_genre_df, movies_df.movie_id == movie_genre_df.movie_id, "inner")
    .join(genre_df, movie_genre_df.genre_id == genre_df.genre_id, "inner")
)

# De-duplicate per movie, not per value combination: movie_id is kept in the
# projection while distinct() runs, so two different movies that share the
# same year, genre, budget and revenue are NOT collapsed into a single row
# (which would undercount the totals summed downstream). movie_id is dropped
# afterwards so the resulting schema stays
# (year_release_date, budget, revenue, genre_name).
results_movie_genre_df = (
    movie_genre_joined_df
    .select(
        movies_df.movie_id,
        movies_df.year_release_date,
        movies_df.budget,
        movies_df.revenue,
        genre_df.genre_name,
    )
    .distinct()
    .drop("movie_id")
)

In [0]:
from pyspark.sql.functions import sum

In [0]:
# Aggregate expressions. NOTE: `sum` here is pyspark.sql.functions.sum
# (imported in the cell above), not the Python builtin.
budget_total = sum("budget").alias("total_budget")
revenue_total = sum("revenue").alias("total_revenue")

# Total budget and revenue per (release year, genre), sorted by the same keys.
results_group_movie_genre_df = (
    results_movie_genre_df
    .groupBy("year_release_date", "genre_name")
    .agg(budget_total, revenue_total)
    .orderBy("year_release_date", "genre_name")
)

In [0]:
#display(results_group_movie_genre_df)

### Paso 3 - Generando el ranking por year_release_date y genre_name

In [0]:
from pyspark.sql.functions import dense_rank, rank, desc, lit
from pyspark.sql.window import Window

In [0]:
# Window over each release year, ordered by total budget and then total
# revenue, both descending.
movie_rank = (
    Window
    .partitionBy("year_release_date")
    .orderBy(desc("total_budget"), desc("total_revenue"))
)

# Add the per-year rank (rank() leaves gaps after ties) and stamp every row
# with the file date being processed as created_date.
results_group_movie_genre_final_df = (
    results_group_movie_genre_df
    .withColumn("rank", rank().over(movie_rank))
    .withColumn("created_date", lit(v_file_date))
)

### Paso 4 - Generando el archivo final en la carpeta Gold

In [0]:
#drop_partition_if_exists(results_group_movie_genre_final_df, "movie_gold", "results_group_movie_genre", "created_date")

In [0]:
# Persist the result into movie_gold.results_group_movie_genre through the
# shared merge_delta_lake helper (not defined in this notebook; expected from
# ../includes/common_functions).
# Per the original note on this call: if the table does not exist it is
# created with this data, otherwise the data is merged into the existing table.
# Rows are matched on (genre_name, year_release_date).
# NOTE(review): the last argument is presumably the partition column - confirm
# against the helper's signature.
genre_merge_condition = "tgt.genre_name = src.genre_name AND tgt.year_release_date = src.year_release_date"

merge_delta_lake(
    results_group_movie_genre_final_df,
    "movie_gold",
    "results_group_movie_genre",
    gold_folder_path,
    genre_merge_condition,
    "created_date",
)

In [0]:
# Sanity check: number of rows in the Gold table per created_date.
# The query literal is split with backslash continuations inside the string,
# so the leading whitespace of the continued lines is part of the SQL text.
rows_per_created_date_query = "SELECT created_date, count(*) \
           FROM movie_gold.results_group_movie_genre \
           GROUP BY created_date"

spark.sql(rows_per_created_date_query).display()