## Ingesta del archivo "movie.csv"

#### Paso 0 - Actualización de variables, funciones y validaciones

In [0]:
# Declare the "p_environment" text widget (empty default) so a caller can pass
# the target environment in as a notebook parameter.
dbutils.widgets.text("p_environment", "")
# Read the widget's current value; it is stamped on every row as the "env" column further down.
v_environment = dbutils.widgets.get("p_environment")

In [0]:
# Declare the "p_file_date" text widget; "2024-12-30" is the default shown
# when no other value has been supplied.
dbutils.widgets.text("p_file_date", "2024-12-30")
# Read the widget's current value; it selects the dated sub-folder that is read
# below and is stored on every row as the "file_date" column.
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Pre-flight check on the input file before any Spark work starts.
# NOTE(review): valida_arquivo and bronze_folder_path are not defined in this
# notebook (presumably they come from the %run includes above). The helper
# presumably verifies that <bronze_folder_path>/<v_file_date>/movie.csv exists
# and stops the run otherwise — confirm against its definition.
valida_arquivo(bronze_folder_path, v_file_date, "movie.csv")

#### Paso 1 - Leer el archivo usando "DataFrameReader" de Spark

In [0]:
from pyspark.sql.types import *

In [0]:
# Explicit schema for movie.csv, assembled field by field with StructType.add().
# Each entry is (column name, Spark type, nullable); the order follows the
# column order expected in the file.
# NOTE(review): movieId is the only column declared non-nullable. Spark may
# relax nullability for file-based sources, so confirm before treating this
# flag as an enforced constraint.
movie_schema = (
    StructType()
    .add("movieId", IntegerType(), False)
    .add("title", StringType(), True)
    .add("budget", DoubleType(), True)
    .add("homepage", StringType(), True)
    .add("overview", StringType(), True)
    .add("popularity", DoubleType(), True)
    .add("yearReleaseDate", IntegerType(), True)
    .add("releaseDate", DateType(), True)
    .add("revenue", DoubleType(), True)
    .add("durationTime", IntegerType(), True)
    .add("movieStatus", StringType(), True)
    .add("tagline", StringType(), True)
    .add("voteAverage", DoubleType(), True)
    .add("voteCount", IntegerType(), True)
)

In [0]:
# Read movie.csv from the dated sub-folder of the bronze path. The first line
# of the file is treated as a header and the columns are typed by movie_schema.
movie_df = spark.read.csv(
    f"{bronze_folder_path}/{v_file_date}/movie.csv",
    schema=movie_schema,
    header=True,
)

#### Paso 2 - Seleccionar sólo las columnas "requeridas"

In [0]:
#1ª forma
#movies_selected_df = movie_df.select("movieId", "title", "budget", "popularity", "yearReleaseDate", "releaseDate", "revenue", "durationTime", "voteAverage", "voteCount")

In [0]:
#2ª forma
#movies_selected_df = movie_df.select(movie_df.movieId, movie_df.title, movie_df.budget, movie_df.popularity, movie_df.yearReleaseDate, movie_df.releaseDate, movie_df.revenue, movie_df.durationTime, movie_df.voteAverage, movie_df.voteCount)

In [0]:
#3ª forma
#movies_selected_df = movie_df.select(
#                                movie_df["movieId"], 
#                                movie_df["title"], 
#                                movie_df["budget"], 
#                                movie_df["popularity"], 
#                                movie_df["yearReleaseDate"], 
#                                movie_df["releaseDate"], 
#                                movie_df["revenue"], 
#                                movie_df["durationTime"], 
#                                movie_df["voteAverage"], 
#                                movie_df["voteCount"])

In [0]:
# Option 4 (the one actually used): build the projection from col() objects.
from pyspark.sql.functions import col

# Keep only the columns needed downstream; homepage, overview, movieStatus and
# tagline from the schema are intentionally left out of the projection.
movies_selected_df = movie_df.select(
    *[
        col(column_name)
        for column_name in (
            "movieId",
            "title",
            "budget",
            "popularity",
            "yearReleaseDate",
            "releaseDate",
            "revenue",
            "durationTime",
            "voteAverage",
            "voteCount",
        )
    ]
)

#### Paso 3 - Cambiar el nombre de las columnas según lo "requerido"

In [0]:
# Option 1 (the one actually used): rename only the camelCase columns to
# snake_case, one withColumnRenamed call per column. Columns that are not
# listed here (title, budget, popularity, revenue) keep their names.
movies_renamed_df = (
    movies_selected_df
    .withColumnRenamed("movieId", "movie_id")
    .withColumnRenamed("yearReleaseDate", "year_release_date")
    .withColumnRenamed("releaseDate", "release_date")
    .withColumnRenamed("durationTime", "duration_time")
    .withColumnRenamed("voteAverage", "vote_average")
    .withColumnRenamed("voteCount", "vote_count")
)

In [0]:
# Alterando somente os nomes das colunas necessárias
# 2ª Forma
#movies_renamed_df = movies_selected_df \
#                    .withColumnsRenamed({"movieId": "movie_id", \
#                                        "yearReleaseDate": "year_release_date", \
#                                        "releaseDate": "release_date", \
#                                        "durationTime": "duration_time", \
#                                        "voteAverage": "vote_average", \
#                                        "voteCount": "vote_count"})

#### Paso 4 - Agregar las columnas "ingestion_date", "env" y "file_date" al DataFrame

In [0]:
from pyspark.sql.functions import lit

In [0]:
# Option 1 (the one actually used): build the final DataFrame.
# NOTE(review): add_ingestion_date is a helper that is not defined in this
# notebook; presumably it appends the "ingestion_date" column — confirm against
# its definition.
# "env" and "file_date" are constant columns filled from the notebook
# parameters v_environment and v_file_date.
movies_final_df = (
    add_ingestion_date(movies_renamed_df)
    .withColumn("env", lit(v_environment))
    .withColumn("file_date", lit(v_file_date))
)

In [0]:
# 2ª Forma
#movies_final_df = movies_renamed_df.withColumns({"ingestion_date": current_timestamp(), \
#                                                 "env": lit("production")})

#### Paso 5 - Escribir datos en el datalake en formato Delta

In [0]:
# Use collect() to bring the distinct file_date values back into memory, then drop the matching partition if the table and the partition already exist
#for item_list in movies_final_df.select("file_date").distinct().collect():
#    
#    if (spark._jsparkSession.catalog().tableExists("movie_silver.movies")):
#    
#        #print(f"ALTER TABLE movie_silver.movies DROP IF EXISTS PARTITION (file_date='{item_list.file_date}')")
#        spark.sql(f"ALTER TABLE movie_silver.movies DROP IF EXISTS PARTITION (file_date='{item_list.file_date}')")

In [0]:
#Elimina partição de uma tabela parquet se ela já existir
#drop_partition_if_exists(movies_final_df, "movie_silver", "movies", "file_date")

In [0]:
# Earlier alternative: write the final DataFrame straight to the destination path
#movies_final_df.write.mode("overwrite").partitionBy("year_release_date").parquet(f"{silver_folder_path}/movies/")

# Earlier alternative: write the final DataFrame to a Databricks-managed table in the movie_silver schema
#movies_final_df.write.mode("append").format("parquet").saveAsTable("movie_silver.movies")

# Earlier alternative: same managed parquet table, partitioned by file_date
#movies_final_df.write.mode("append").partitionBy("file_date").format("parquet").saveAsTable("movie_silver.movies")

# Current approach: according to the original note, the helper creates the
# table with this data when it does not exist yet, and otherwise merges the
# data into the existing table.
# NOTE(review): merge_delta_lake is not defined in this notebook. The argument
# roles appear to be (DataFrame, schema name, table name, base folder, merge
# condition, partition column) — confirm against the helper's signature.
merge_delta_lake(
    movies_final_df,
    "movie_silver",
    "movies",
    silver_folder_path,
    "tgt.movie_id = src.movie_id",
    "file_date",
)

In [0]:
# Post-load check: show the number of rows per (file_date, env) pair in
# movie_silver.movies.
# Earlier alternative, reading the parquet folder directly:
#display(spark.read.parquet(f"{silver_folder_path}/movies/"))
# The query below is ONE string literal; the trailing backslashes are line
# continuations inside the string, so the indentation of the continued lines
# is part of the SQL text that gets sent to Spark.
display(spark.sql("SELECT file_date, \
                          env, \
                          COUNT(1) \
                   FROM movie_silver.movies \
                   GROUP BY file_date, env"
                   )
       )

In [0]:
# End the notebook run and hand "Success" back as its exit value, so that a
# calling notebook or job can check the outcome.
dbutils.notebook.exit("Success")