In [54]:
!pip install delta-spark==2.4.0



In [55]:
import unicodedata

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, lower, trim, udf
from pyspark.sql.types import (
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)
from delta import *

# Hive warehouse location on HDFS
warehouse_location = "hdfs://hdfs-nn:9000/warehouse"

# Session settings, grouped by concern; they are applied to the builder below.
_hive_settings = {
    "spark.sql.warehouse.dir": warehouse_location,
    "hive.metastore.uris": "thrift://hive-metastore:9083",
    "spark.sql.catalogImplementation": "hive",
    "hive.metastore.warehouse.dir": warehouse_location,
}
_delta_settings = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    # Delta package matching Spark 3.4.1 (Scala 2.12)
    "spark.jars.packages": "io.delta:delta-core_2.12:2.4.0",
}

# Create the Spark session with Hive + Delta Lake support
_builder = SparkSession.builder.appName("Silver_BoxOffice_Treatment")
for _conf_key, _conf_value in {**_hive_settings, **_delta_settings}.items():
    _builder = _builder.config(_conf_key, _conf_value)

spark = _builder.enableHiveSupport().getOrCreate()
print("Spark iniciado com sucesso — versão:", spark.version)


Spark iniciado com sucesso — versão: 3.4.1


In [56]:
# Location of the raw (bronze) box-office CSV on HDFS
bronze_path = "hdfs://hdfs-nn:9000/bronze/boxoffice_data_2024.csv"

# Simple, explicit schema. `Gross` is read as a string because the raw values
# carry a "$" prefix and thousands separators; it is parsed into a number in
# the silver transformation.
schema = StructType([
    StructField("Year", IntegerType(), True),
    StructField("Title", StringType(), True),
    StructField("Gross", StringType(), True)
])

# Read the CSV with a header row, applying the schema above instead of
# inferring column types from the data.
boxoffice_bronze = (
    spark.read
    .option("header", True)
    .schema(schema)
    .csv(bronze_path)
)

print("Bronze carregado:")
boxoffice_bronze.show(10, truncate=False)


Bronze carregado:
+----+------------------------------------+------------+
|Year|Title                               |Gross       |
+----+------------------------------------+------------+
|1984|Beverly Hills Cop                   |$234,760,478|
|1984|Ghostbusters                        |$229,376,332|
|1984|Indiana Jones and the Temple of Doom|$179,876,727|
|1984|Gremlins                            |$148,171,538|
|1984|The Karate Kid                      |$90,817,155 |
|1984|Police Academy                      |$81,198,894 |
|1984|Footloose                           |$80,038,626 |
|1984|Romancing the Stone                 |$76,572,547 |
|1984|Star Trek III: The Search for Spock |$76,471,046 |
|1984|Splash                              |$69,821,334 |
+----+------------------------------------+------------+
only showing top 10 rows



In [57]:
# Make sure the `silver` database exists at its HDFS location; the statement
# does nothing when the database is already there (IF NOT EXISTS).
create_silver_db_sql = """
    CREATE DATABASE IF NOT EXISTS silver
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/silver.db'
"""
spark.sql(create_silver_db_sql)

print("Base de dados 'silver' pronta.")


Base de dados 'silver' pronta.


In [58]:
# Group's logic for removing accents
def remove_accents(text):
    """Return ``text`` with Unicode combining marks (accents) removed.

    The string is decomposed with NFKD normalisation and every character
    that ``unicodedata.combining`` reports as a combining mark is dropped.
    Characters without a decomposition are returned unchanged.

    Args:
        text: String to clean, or ``None``.

    Returns:
        The accent-free string, or ``None`` when ``text`` is ``None``.
    """
    if text is None:
        return None

    decomposed = unicodedata.normalize('NFKD', text)
    kept_chars = []
    for ch in decomposed:
        if unicodedata.combining(ch):
            # Combining mark (e.g. the acute accent split off from "e") - skip it.
            continue
        kept_chars.append(ch)
    return ''.join(kept_chars)

# Wrap the Python helper as a Spark UDF that returns a string column.
remove_accents_udf = udf(f=remove_accents, returnType=StringType())


In [59]:
# Build the silver (cleaned) box-office frame from the bronze frame.
# Final columns: year, title (normalised), gross (double), decade.
boxoffice_silver = (
    boxoffice_bronze

    # 1 - Drop rows that are missing any of the required fields
    .na.drop(subset=["Year", "Title", "Gross"])

    # 2 - Clean gross -> strip "$", commas and whitespace
    .withColumn("gross_clean", regexp_replace(col("Gross"), r"[$,\s]", ""))

    # 3 - Convert gross to a number
    .withColumn("gross", col("gross_clean").cast(DoubleType()))

    # 4 - Normalise title -> lower-case + no accents + no special characters
    .withColumn("title_lower", lower(trim(col("Title"))))
    .withColumn("title_noacc", remove_accents_udf(col("title_lower")))
    .withColumn("title_norm", regexp_replace(col("title_noacc"), r"[^a-z0-9 ]", ""))

    # 5 - Normalise year
    .withColumn("year", col("Year").cast(IntegerType()))

    # 6 - Derive the decade (needed for question 8): 1987 -> 1980
    .withColumn("decade", (col("year") / 10).cast(IntegerType()) * 10)

    # 7 - Filter invalid values BEFORE de-duplicating. dropDuplicates keeps an
    #     arbitrary row per key, so filtering afterwards could discard a title
    #     entirely when the surviving duplicate happened to be the invalid one.
    .filter(col("gross") > 0)          # gross must be strictly positive (null gross is dropped too)
    .filter(col("year").between(1984, 2024))  # dataset covers these years

    # 8 - Remove duplicates (same normalised title within the same year).
    #     NOTE(review): when duplicates carry different gross values, which
    #     one is kept is not specified - confirm whether a rule is needed.
    .dropDuplicates(["year", "title_norm"])

    # 9 - Select the final columns (helper columns are left behind here)
    .select(
        "year",
        col("title_norm").alias("title"),
        "gross",
        "decade",
    )
)

print("Silver transformado:")
boxoffice_silver.show(20, truncate=False)


Silver transformado:
+----+----------------------------+-----------+------+
|year|title                       |gross      |decade|
+----+----------------------------+-----------+------+
|1984|the flamingo kid            |2.3859382E7|1980  |
|1984|body double                 |8802128.0  |1980  |
|1987|cross my heart              |1025762.0  |1980  |
|1988|caddyshack ii               |1.1798302E7|1980  |
|1989|romero                      |1316495.0  |1980  |
|1990|night of the living dead    |5835247.0  |1990  |
|1990|welcome home roxy carmichael|3989297.0  |1990  |
|1990|hidden agenda               |1030938.0  |1990  |
|1991|pure luck                   |2.2641969E7|1990  |
|1992|tous les matins du monde    |3089497.0  |1990  |
|1992|shadows and fog             |2735731.0  |1990  |
|1992|casablanca 1992 rerelease   |1719913.0  |1990  |
|1994|double dragon               |2341309.0  |1990  |
|1994|where the rivers flow north |595505.0   |1990  |
|1995|leaving las vegas           |3.2029928

In [60]:
# Write the silver frame in Delta format to the path below and register it as
# table `silver.boxoffice`, replacing any existing data and schema.
silver_table_path = "hdfs://hdfs-nn:9000/warehouse/silver.db/boxoffice"

silver_writer = boxoffice_silver.write.format("delta").mode("overwrite")
silver_writer = silver_writer.option("overwriteSchema", True)
silver_writer = silver_writer.option("path", silver_table_path)
silver_writer.saveAsTable("silver.boxoffice")

print("Tabela silver.boxoffice gravada com sucesso!")


Tabela silver.boxoffice gravada com sucesso!


In [61]:
# Show the column names of the final silver frame.
silver_columns = boxoffice_silver.columns
print(silver_columns)

['year', 'title', 'gross', 'decade']


In [62]:
# Stop the Spark session; this is the last step of the notebook.
spark.stop()