#### **Ingesta del archivo "movie.csv"**

In [0]:
# Notebook parameter "environment": declared as a text widget with an empty
# default, then read back so later cells can use the supplied value.
dbutils.widgets.text(name="environment", defaultValue="")
var_environment = dbutils.widgets.get("environment")

In [0]:
# Notebook parameter "file_date": text widget defaulting to "2024-12-30".
# The value is used below both as a folder name in the input path and as the
# value of the file_date column.
dbutils.widgets.text(name="file_date", defaultValue="2024-12-30")
var_file_date = dbutils.widgets.get("file_date")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

##### Librerías

In [0]:
from pyspark.sql.types import IntegerType, StringType, StructField, StructType,DateType,DoubleType
from pyspark.sql.functions import current_timestamp,lit

##### Esquema

In [0]:
# Explicit schema applied when reading movie.csv.
# Each entry is (column name, Spark type, nullable); the order of the entries
# is the order of the fields in the resulting StructType.
# NOTE(review): movieId is declared non-nullable — confirm the CSV reader
# actually enforces this, since file sources may relax nullability.
df_movie_schema = StructType([
    StructField(field_name, field_type, is_nullable)
    for field_name, field_type, is_nullable in [
        ('movieId', IntegerType(), False),
        ('title', StringType(), True),
        ('budget', DoubleType(), True),
        ('homePage', StringType(), True),
        ('overview', StringType(), True),
        ('popularity', DoubleType(), True),
        ('yearReleaseDate', IntegerType(), True),
        ('releaseDate', DateType(), True),
        ('revenue', DoubleType(), True),
        ('durationTime', IntegerType(), True),
        ('movieStatus', StringType(), True),
        ('tagline', StringType(), True),
        ('voteAverage', DoubleType(), True),
        ('voteCount', IntegerType(), True),
    ]
])

##### Leer fichero CSV

In [0]:
# Load movie.csv from the bronze folder for the requested file_date.
# The first line of the file is treated as a header and the explicit schema
# defined above is applied.
df_movie = (
    spark.read
    .format("csv")
    .option("header", "true")
    .schema(df_movie_schema)
    .load(f"{bronze_folder_path}/{var_file_date}/movie.csv")
)

##### Seleccionar columnas

In [0]:
# Keep only the columns carried forward; homePage, overview, movieStatus and
# tagline are left out.
df_movie_selected = df_movie.select(
    "movieId",
    "title",
    "budget",
    "popularity",
    "yearReleaseDate",
    "releaseDate",
    "revenue",
    "durationTime",
    "voteAverage",
    "voteCount",
)

##### Renombrar columnas

In [0]:
# Rename the camelCase source columns to snake_case (old name -> new name).
# Columns not listed here (title, budget, popularity, revenue) keep their names.
df_movie_renamed = df_movie_selected.withColumnsRenamed(
    {
        "movieId": "movie_id",
        "yearReleaseDate": "year_release_date",
        "releaseDate": "release_date",
        "durationTime": "duration_time",
        "voteAverage": "vote_average",
        "voteCount": "vote_count",
    }
)

##### Añadir columnas

In [0]:
# Pass the frame through add_ingestion_date (helper not defined in this
# notebook; presumably adds an ingestion timestamp column — verify), then tag
# every row with the widget values as literal columns.
df_movies_final = (
    add_ingestion_date(df_movie_renamed)
    .withColumn("environment", lit(var_environment))
    .withColumn("file_date", lit(var_file_date))
)

##### Escribir datos en Silver

In [0]:
# Disabled write path, kept for reference: presumably overwrote the file_date
# partition of movie_silver.movies instead of merging into it (verify against
# the helper's definition).
# NOTE(review): the merge cell that follows is the active write; delete this
# cell once the overwrite approach is definitely retired.
#overwrite_partition(df_movies_final,"movie_silver","movies","file_date")

In [0]:
# Match rule for the merge: a source row corresponds to a target row when both
# movie_id and file_date are equal.
merge_condition = (
    "target.movie_id = source.movie_id "
    "AND target.file_date = source.file_date"
)

# Hand the final frame to merge_delta_lake (helper not defined in this
# notebook; presumably upserts into movie_silver.movies under
# silver_folder_path, partitioned by file_date — verify).
merge_delta_lake(
    df_movies_final,
    "movie_silver",
    "movies",
    silver_folder_path,
    merge_condition,
    "file_date",
)

In [0]:
%sql
-- Sanity check after the write: row count per file_date in movie_silver.movies.
SELECT file_date, COUNT(1)
FROM movie_silver.movies
GROUP BY file_date

In [0]:
# Stop the notebook here and report "Success" as its exit value.
dbutils.notebook.exit("Success")