In [0]:
import dlt
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import col, count

# Explicit schema applied to the source CSV files; every column is nullable.
# "Unnamed: 0" is renamed to "row_id" by the bronze table definition.
schema = (
    StructType()
    .add("Unnamed: 0", StringType(), True)
    .add("Title", StringType(), True)
    .add("Year", IntegerType(), True)
    .add("imdbID", StringType(), True)
    .add("Type", StringType(), True)
    .add("Poster", StringType(), True)
)

@dlt.table(
    comment="Table bronze des films pokemon"
)
def movies_bronze():
    """Bronze table: streaming read of the CSV files with the explicit schema.

    Returns:
        A streaming DataFrame with the columns declared in ``schema``, where
        the "Unnamed: 0" column is renamed to "row_id".
    """
    # Configure the streaming CSV reader (first line of each file is a header).
    csv_reader = (
        spark.readStream
        .format("csv")
        .option("header", "true")
        .schema(schema)
    )
    raw_stream = csv_reader.load("gs://de-01-data-ingestion/csv/")
    # Give the unnamed leading column an explicit name.
    return raw_stream.withColumnRenamed("Unnamed: 0", "row_id")

In [0]:
@dlt.view(
    comment="Table silver des films pokemon après nettoyage"
)
@dlt.expect_or_drop("Valid_Year", "Year IS NOT NULL")
@dlt.expect_or_drop("Valid_Title", "Title IS NOT NULL")
def movies_silver():
    """Silver view: de-duplicated stream of ``movies_bronze``.

    The two expectations above drop records whose ``Year`` or ``Title`` is NULL.

    Returns:
        A streaming DataFrame read from ``movies_bronze`` with duplicate rows
        removed.
    """
    bronze_stream = dlt.read_stream("movies_bronze")
    # NOTE(review): dropDuplicates() with no column list compares every column
    # (row_id included), and no watermark is set on this stream, so the
    # de-duplication state is presumably never evicted. Confirm this is intended.
    deduplicated = bronze_stream.dropDuplicates()
    return deduplicated


In [0]:
@dlt.table(
    comment="Table gold des films pokemon par année"
)
def movies_by_year_gold():
    """Gold table: number of rows in ``movies_silver`` per ``Year``.

    Returns:
        A DataFrame with one row per ``Year`` and a ``Total`` column holding
        the row count for that year.
    """
    # NOTE(review): this aggregates a streaming read of the silver view;
    # confirm a streaming aggregation (rather than a batch dlt.read) is intended.
    silver_stream = dlt.read_stream("movies_silver")
    per_year = silver_stream.groupBy("Year")
    return per_year.agg(count("*").alias("Total"))