In [1]:
!pip install delta-spark==2.4.0



In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, lower, trim, udf, regexp_extract
from pyspark.sql.types import DoubleType, IntegerType, StringType
from delta import *
import unicodedata

# Hive warehouse location on HDFS
warehouse_location = "hdfs://hdfs-nn:9000/warehouse"

# Session settings, applied to the builder in the order listed.
session_settings = [
    # ---- Hive settings ----
    ("spark.sql.warehouse.dir", warehouse_location),
    ("hive.metastore.uris", "thrift://hive-metastore:9083"),
    ("spark.sql.catalogImplementation", "hive"),
    ("hive.metastore.warehouse.dir", warehouse_location),
    # ---- Delta Lake extensions ----
    ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
    ("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"),
    # ---- Delta package compatible with Spark 3.4.1 (Scala 2.12) ----
    ("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0"),
]

# Build the Spark session with Hive + Delta Lake support
builder = SparkSession.builder.appName("Silver_BooksIntoMovies_Treatment")
for setting_key, setting_value in session_settings:
    builder = builder.config(setting_key, setting_value)

spark = builder.enableHiveSupport().getOrCreate()

print("Spark iniciado com sucesso — versão:", spark.version)


Spark iniciado com sucesso — versão: 3.4.1


In [3]:
from pyspark.sql.types import StructType, StructField

# Location of the raw (bronze) CSV file on HDFS
bronze_path = "hdfs://hdfs-nn:9000/datasets/bronze/Books_into_Movies_Atualizado_1920_2025.csv"

# Simple, explicit schema: every column is read as a nullable string
bronze_columns = ["Author", "Movie Title", "Movie Release Date", "Book Title"]
schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in bronze_columns]
)

# Read the CSV with the explicit schema; the first line of the file is
# treated as a header row.
books_bronze = spark.read.csv(bronze_path, schema=schema, header=True)

print("Bronze Books Into Movies carregado:")
books_bronze.show(10, truncate=False)


Bronze Books Into Movies carregado:
+-----------------+---------------------+------------------+-----------------------+
|Author           |Movie Title          |Movie Release Date|Book Title             |
+-----------------+---------------------+------------------+-----------------------+
|Doug Stanton     |12 Strong            |1/19/2018         |null                   |
|W. Bruce Cameron |A Dog's Purpose      |1/27/2017         |null                   |
|Madeleine L'Engle|A Wrinkle in Time    |3/9/2018          |null                   |
|Jennifer Niven   |All the Bright Places|null              |null                   |
|Jeff Vandermeer  |Annihilation         |2/23/2018         |null                   |
|Ruta Sepetys     |Ashes in the Snow    |null              |Between Shades of Gray |
|Lauren Oliver    |Before I Fall        |3/3/2017          |null                   |
|Ann Patchett     |Bel Canto            |null              |null                   |
|Melanie Joosten  |Berlin Syn

In [4]:
# DDL for the silver layer database; IF NOT EXISTS keeps the statement safe
# to run again when the database is already there.
create_silver_db_sql = """
    CREATE DATABASE IF NOT EXISTS silver
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/silver.db'
"""
spark.sql(create_silver_db_sql)

print("Base de dados 'silver' pronta.")


Base de dados 'silver' pronta.


In [5]:
# Group-agreed logic for stripping accents
def remove_accents(text):
    """Return ``text`` with accent marks removed.

    The string is decomposed with Unicode NFKD normalization and every
    combining character produced by the decomposition is dropped.

    Args:
        text: The string to clean, or ``None``.

    Returns:
        The string without combining marks, or ``None`` when ``text`` is
        ``None``.
    """
    if text is None:
        return None
    decomposed = unicodedata.normalize('NFKD', text)
    # A combining class of 0 means the character is not a combining mark.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return ''.join(base_chars)

# Wrap remove_accents as a Spark UDF that returns a string, so it can be
# applied to DataFrame columns.
remove_accents_udf = udf(remove_accents, StringType())


In [6]:
# Accepted release-year window; rows whose extracted year falls outside it
# (or that have no year at all) are discarded.
MIN_MOVIE_YEAR = 1900
MAX_MOVIE_YEAR = 2030


def _normalize_text(column_name):
    """Build the normalized-text expression for one bronze column.

    The value is trimmed, lower-cased, stripped of accents through
    ``remove_accents_udf`` and then has every character other than ``a-z``,
    ``0-9`` and space removed. A NULL input stays NULL.

    NOTE(review): removing punctuation that sits between spaces leaves two
    consecutive spaces (e.g. "a - b" -> "a  b"). Collapse whitespace here only
    if the other silver tables adopt the same rule, otherwise keys diverge.

    Args:
        column_name: Name of the source column in the bronze DataFrame.

    Returns:
        A Spark Column expression holding the normalized text.
    """
    trimmed_lower = lower(trim(col(column_name)))
    without_accents = remove_accents_udf(trimmed_lower)
    return regexp_replace(without_accents, r"[^a-z0-9 ]", "")


books_silver = (
    books_bronze

    # Drop records missing a critical field (Book Title is allowed to be NULL)
    .na.drop(subset=["Author", "Movie Title"])

    # Project straight to the final columns: the same normalization is applied
    # to the three text fields, and the year is the first run of four digits
    # found in the release date, cast to an integer.
    .select(
        _normalize_text("Author").alias("author"),
        _normalize_text("Book Title").alias("book_title"),
        _normalize_text("Movie Title").alias("movie_title"),
        regexp_extract(col("Movie Release Date"), r"(\d{4})", 1)
        .cast(IntegerType())
        .alias("movie_year"),
    )

    # Keep only rows with a year present and inside the accepted window
    .filter(
        col("movie_year").isNotNull()
        & col("movie_year").between(MIN_MOVIE_YEAR, MAX_MOVIE_YEAR)
    )

    # Remove true duplicates on the normalized key
    .dropDuplicates(["author", "book_title", "movie_title", "movie_year"])
)

print("Silver Books Into Movies transformado:")
books_silver.show(20, truncate=False)


Silver Books Into Movies transformado:
+---------------------------------------------------------------+------------------------------------------------------------------------------------+------------------------------------------------------------+----------+
|author                                                         |book_title                                                                          |movie_title                                                 |movie_year|
+---------------------------------------------------------------+------------------------------------------------------------------------------------+------------------------------------------------------------+----------+
|el james                                                       |null                                                                                |fifty shades darker                                         |2017      |
|el james                                                       |null

In [7]:
# Persist the silver DataFrame as a Delta table registered in the metastore
# as silver.adaptations, replacing any existing data and schema. Keyword
# arguments other than format/mode are forwarded as writer options.
books_silver.write.saveAsTable(
    "silver.adaptations",
    format="delta",
    mode="overwrite",
    overwriteSchema=True,
    path="hdfs://hdfs-nn:9000/warehouse/silver.db/adaptations",
)

print("Tabela silver.adaptations gravada com sucesso!")


Tabela silver.adaptations gravada com sucesso!


In [8]:
# Quick check of the final column names of the silver DataFrame.
print(books_silver.columns)

['author', 'book_title', 'movie_title', 'movie_year']


In [9]:
# Stop the Spark session now that the notebook's work is done.
spark.stop()