In [17]:

# Instalar Delta
!pip install delta-spark==2.4.0



In [18]:
# Imports + Spark Session (Hive + Delta)
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, lower, trim, regexp_replace, udf, lit,
    countDistinct, collect_set, size, when, avg
)
from pyspark.sql.types import StringType, IntegerType, DoubleType
from pyspark.sql.window import Window
from delta import *
import unicodedata

warehouse_location = "hdfs://hdfs-nn:9000/warehouse"

# Hive settings: warehouse directory, metastore endpoint and catalog type.
hive_settings = {
    "spark.sql.warehouse.dir": warehouse_location,
    "hive.metastore.uris": "thrift://hive-metastore:9083",
    "spark.sql.catalogImplementation": "hive",
    "hive.metastore.warehouse.dir": warehouse_location,
}

# Delta Lake settings: SQL extension, catalog implementation and the
# delta-core package (built for Scala 2.12, chosen for Spark 3.4.1).
delta_settings = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.jars.packages": "io.delta:delta-core_2.12:2.4.0",
}

# Apply every setting to the builder in the same order as listed above.
session_builder = SparkSession.builder.appName("Gold_filmes_avaliados")
for conf_key, conf_value in {**hive_settings, **delta_settings}.items():
    session_builder = session_builder.config(conf_key, conf_value)

spark = session_builder.enableHiveSupport().getOrCreate()

print("Spark iniciado com sucesso — versão:", spark.version)


Spark iniciado com sucesso — versão: 3.4.1


In [19]:
# Create the GOLD database (if it does not exist yet) and make it current.
# The location is built from warehouse_location (defined in the session cell)
# so the warehouse root is written down in one place only; the resulting SQL
# is the same as with the previously hard-coded path.
spark.sql(f"""
    CREATE DATABASE IF NOT EXISTS gold
    LOCATION '{warehouse_location}/gold.db'
""")
spark.sql("USE gold")
print("Base de dados 'gold' pronta.")


Base de dados 'gold' pronta.


In [20]:
# Helper behind the accent-stripping UDF (same logic as the Silver layer)
def remove_accents(text):
    """Return ``text`` without combining marks; ``None`` is passed through.

    The string is decomposed with Unicode NFKD normalization and every
    character that ``unicodedata.combining`` reports as a combining mark
    is dropped, so "é" becomes "e".
    """
    if text is None:
        return None
    decomposed = unicodedata.normalize('NFKD', text)
    # combining() returns 0 for characters that are not combining marks.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return ''.join(base_chars)

# Wrap remove_accents as a Spark UDF that returns a string column.
remove_accents_udf = udf(f=remove_accents, returnType=StringType())

def normalize_title_col(c):
    """
    Build a normalized-title column expression from column ``c``.

    Meant to be compatible with the normalization of silver.boxoffice.title
    and silver.adaptations.movie_title:
    - trim, then lowercase
    - strip accents
    - drop special characters (only a-z, 0-9 and space are kept)
    """
    trimmed_lowercase = lower(trim(c))
    without_accents = remove_accents_udf(trimmed_lowercase)
    return regexp_replace(without_accents, r"[^a-z0-9 ]", "")


In [21]:
# Read the Silver tables that feed this Gold table
rating_movies = spark.table("silver.rating_movies")
films_awards = spark.table("silver.films_awards")

# Report the row count of each input; every count() runs a Spark job.
print("Silver loaded:")
rating_movies_total = rating_movies.count()
print(" - rating_movies:", rating_movies_total)
films_awards_total = films_awards.count()
print(" - films_awards:", films_awards_total)

Silver loaded:
 - rating_movies: 5471
 - films_awards: 5077


In [22]:
# 2) Make the key columns consistent (title_norm + year)
# Ratings side: release_year becomes the integer "year" key and the numeric
# rating/count columns are cast to explicit types.
rating_columns = [
    col("title_norm"),
    col("release_year").cast("int").alias("year"),
    col("tomatometer_rating").cast("double").alias("tomatometer_rating"),
    col("tomatometer_count").cast("int").alias("tomatometer_count"),
    col("audience_rating").cast("double").alias("audience_rating"),
    col("audience_count").cast("int").alias("audience_count"),
    col("tomatometer_status"),
    col("critics_consensus"),
]

# Keep a single row per (title_norm, year).
# NOTE(review): which of several duplicate rows survives is not controlled here.
rating_keyed = rating_movies.select(*rating_columns).dropDuplicates(["title_norm", "year"])
# Awards side: "film" is exposed as title_norm and "oscar_year" as the integer
# "year" so both frames share the same join keys.
# NOTE(review): presumably silver.films_awards.film is already normalized the
# same way as rating_movies.title_norm — confirm, otherwise the join misses rows.
awards_columns = [
    col("film").alias("title_norm"),
    col("oscar_year").cast("int").alias("year"),
    col("won_any_oscar").cast("boolean").alias("won_any_oscar"),
    col("oscar_wins").cast("int").alias("oscar_wins"),
    col("oscar_nominations").cast("int").alias("oscar_nominations"),
    # Passed through unchanged; drop this entry if the Silver table lacks the column.
    col("oscar_categories"),
]

# Keep a single row per (title_norm, year).
# NOTE(review): which of several duplicate rows survives is not controlled here.
awards_keyed = films_awards.select(*awards_columns).dropDuplicates(["title_norm", "year"])

In [23]:
# 3) Build the GOLD table (ratings are the base, Oscars are left-joined onto them)
films_avaliados = rating_keyed.join(awards_keyed, on=["title_norm", "year"], how="left")

# Dashboard defaults: films with no awards match get False / 0 instead of null.
oscar_defaults = {
    "won_any_oscar": False,
    "oscar_wins": 0,
    "oscar_nominations": 0,
}
for oscar_column, default_value in oscar_defaults.items():
    films_avaliados = films_avaliados.withColumn(
        oscar_column,
        when(col(oscar_column).isNull(), lit(default_value)).otherwise(col(oscar_column)),
    )

In [24]:
# Publish the join key under the column name "title" in the Gold table.
films_avaliados = films_avaliados.withColumnRenamed(existing="title_norm", new="title")

In [25]:
# Show the first 20 rows without truncating cell values, then the row count.
print("GOLD films_avaliados preview:")
films_avaliados.show(n=20, truncate=False)
films_avaliados_total = films_avaliados.count()
print("Registos:", films_avaliados_total)

GOLD films_avaliados preview:
+-----------------------------------------+----+------------------+-----------------+---------------+--------------+------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+----------+-----------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|title                                    |year|tomatometer_rating|tomatometer_count|audience_rating|audience_count|tomatometer_status|critics_consensus                                                                                                                                                                        |won_any_oscar|oscar_wins|oscar_nominations|oscar_categories                                         

In [26]:
# 4) Persist the Gold table as Delta
# Drop any existing catalog entry first; the write below registers the table again.
spark.sql("DROP TABLE IF EXISTS gold.films_avaliados")

# Table path built from warehouse_location (defined in the session cell) so the
# warehouse root is written down in one place only; the resulting path is the
# same as the previously hard-coded one.
films_avaliados_path = f"{warehouse_location}/gold.db/films_avaliados"

(
    films_avaliados.write
    .format("delta")
    .mode("overwrite")
    # Let the overwrite replace the stored schema as well as the data.
    .option("overwriteSchema", True)
    .option("path", films_avaliados_path)
    .saveAsTable("gold.films_avaliados")
)

print("Tabela gold.films_avaliados gravada com sucesso!")

Tabela gold.films_avaliados gravada com sucesso!


In [27]:
# Compare Oscar winners with the rest: average critic and audience ratings,
# average wins and nominations, and the number of films in each group.
oscar_summary_sql = """
SELECT
  won_any_oscar,
  AVG(tomatometer_rating) AS avg_critic_rating,
  AVG(audience_rating)    AS avg_audience_rating,
  AVG(oscar_wins)         AS avg_oscar_wins,
  AVG(oscar_nominations)  AS avg_oscar_nominations,
  COUNT(*)                AS total_films
FROM gold.films_avaliados
GROUP BY won_any_oscar
ORDER BY won_any_oscar DESC
"""
spark.sql(oscar_summary_sql).show()



+-------------+------------------+-------------------+------------------+---------------------+-----------+
|won_any_oscar| avg_critic_rating|avg_audience_rating|    avg_oscar_wins|avg_oscar_nominations|total_films|
+-------------+------------------+-------------------+------------------+---------------------+-----------+
|         true| 85.70866141732283|  82.24015748031496|2.1102362204724407|    5.464566929133858|        254|
|        false|59.377995016292886| 61.034310906651335|               0.0|   0.1562200498370711|       5217|
+-------------+------------------+-------------------+------------------+---------------------+-----------+



In [32]:
from pyspark.sql.functions import concat_ws, col

# Read the Gold table back from the catalog.
df = spark.table("gold.films_avaliados")

# Join the oscar_categories values into one ", "-separated string.
# NOTE(review): presumably oscar_categories is an array column that the CSV
# writer cannot store as-is — confirm against the Silver schema.
categories_as_text = concat_ws(", ", col("oscar_categories"))
df_export = df.withColumn("oscar_categories", categories_as_text)

# coalesce(1) makes Spark write a single part file into the output directory.
(
    df_export.coalesce(1)
    .write
    .mode("overwrite")
    .option("header", "true")
    .csv("/tmp/filmes_avaliados_csv")
)



In [33]:
# List the directory the CSV export wrote to, to check the part file is there.
!ls /tmp/filmes_avaliados_csv

part-00000-6c087299-1309-4641-b2d9-03798b59dfc2-c000.csv  _SUCCESS


In [34]:
# Copy the exported part file to a stable file name in the Jupyter home directory.
# The glob is needed because Spark generates the part file's name; the export
# used coalesce(1), so a single part-00000 file is expected to match.
!cp /tmp/filmes_avaliados_csv/part-00000-*.csv /home/jovyan/filmes_avaliados.csv


In [35]:
# Confirm the copied CSV exists and show its size and timestamp.
!ls -lah /home/jovyan/filmes_avaliados.csv


-rw-r--r-- 1 jovyan users 1.2M Dec 26 11:54 /home/jovyan/filmes_avaliados.csv


In [29]:
# List the databases registered in the catalog, without truncating names.
databases_df = spark.sql("SHOW DATABASES")
databases_df.show(truncate=False)


+---------+
|namespace|
+---------+
|default  |
|demo     |
|gold     |
|silver   |
+---------+



In [30]:
# List the tables registered in the gold database, without truncating names.
gold_tables_df = spark.sql("SHOW TABLES IN gold")
gold_tables_df.show(truncate=False)


+---------+---------------+-----------+
|namespace|tableName      |isTemporary|
+---------+---------------+-----------+
|gold     |films_avaliados|false      |
+---------+---------------+-----------+



In [None]:
# Shut down the Spark session created at the top of this notebook.
spark.stop()