In [1]:
!pip install delta-spark==2.4.0



In [2]:
import pandas as pd
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from delta import *
from pyspark.sql.functions import col, when, sum, lit, trim
from pyspark.sql.functions import col, when, substring, trim, upper, lower

from pyspark.sql.types import IntegerType

In [3]:
# 1. Spark session configuration
# Warehouse root on HDFS, passed to Spark as spark.sql.warehouse.dir.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# The builder chain is wrapped in parentheses so no line depends on a
# trailing backslash continuation.
# NOTE(review): configure_spark_with_delta_pip presumably sets
# spark.jars.packages itself — confirm whether the explicit setting is needed.
builder = (
    SparkSession.builder
    .appName("Projeto - Script Silver")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()
print("Sessão Spark (Silver v3.1) iniciada.")

Sessão Spark (Silver v3.1) iniciada.


In [4]:
# 2. Create the Silver database
print("A criar/usar a base de dados 'silver'...")

# Create the database at its HDFS location only when it does not exist yet.
create_silver_db = "CREATE DATABASE IF NOT EXISTS silver LOCATION 'hdfs://hdfs-nn:9000/warehouse/silver.db/'"
spark.sql(create_silver_db)

# Make "silver" the current database for the statements that follow.
spark.sql("USE silver")

A criar/usar a base de dados 'silver'...


DataFrame[]

In [5]:
# Drop the old silver.films_awards table, if it exists, before it is rebuilt
# further down in this notebook.
spark.sql("DROP TABLE IF EXISTS silver.films_awards")

DataFrame[]

In [6]:
# Load the two Silver source tables as DataFrames, in this order:
# silver.full_data -> df_full, silver.the_oscar_award -> df_oscar.
df_full, df_oscar = map(spark.table, ("silver.full_data", "silver.the_oscar_award"))

In [7]:
# Add "year_int", an integer cast of "year"; it is the year key used later
# when deduplicating and joining against the Oscar aggregates.
df_full = df_full.withColumn("year_int", df_full["year"].cast(IntegerType()))

In [8]:
# Replace a NULL "winner" with False so the win count below never has to
# deal with missing flags.
winner_or_false = F.coalesce(df_oscar["winner"], F.lit(False))
df_oscar_clean = df_oscar.withColumn("winner", winner_or_false)

In [9]:
# 3) Aggregate the Oscars by (film, year_film)
#    -> one row per film + year with aggregated metrics
# ------------------------------------------------------------
# Total number of nominations: every row in the group counts once.
nominations_total = F.count("*").alias("oscar_nominations")

# Total number of wins: rows whose "winner" flag is True contribute 1, others 0.
is_win = F.col("winner") == True
wins_total = F.sum(F.when(is_win, 1).otherwise(0)).alias("oscar_wins")

# Distinct set of categories the film appeared in.
categories_seen = F.collect_set("canon_category").alias("oscar_categories")

df_oscar_agg = (
    df_oscar_clean
    .groupBy("film", "year_film")
    .agg(nominations_total, wins_total, categories_seen)
)

print("Linhas em df_oscar_agg:", df_oscar_agg.count())

Linhas em df_oscar_agg: 5325


In [10]:
# Keep a single row per (film, year_int) so the film-level join below cannot
# multiply rows.
# NOTE(review): which of the duplicate rows survives is not controlled here —
# confirm that is acceptable for the row-level columns carried forward.
df_full_unique = df_full.drop_duplicates(subset=["film", "year_int"])

In [11]:
# 5) LEFT JOIN on film + year (film level)
#    -> keeps every film from the full data set and attaches the Oscar info
# ------------------------------------------------------------
same_film = df_full_unique["film"] == df_oscar_agg["film"]
same_year = df_full_unique["year_int"] == df_oscar_agg["year_film"]

df_joined = df_full_unique.join(df_oscar_agg, on=same_film & same_year, how="left")

print("Linhas em df_joined:", df_joined.count())


Linhas em df_joined: 5168


In [12]:
# 6) Select the final columns for the Silver films_awards table
#    - every column of full_data (already deduplicated)
#    - the aggregated Oscar metrics
# ------------------------------------------------------------
oscar_metric_columns = ["oscar_nominations", "oscar_wins", "oscar_categories"]

df_silver = df_joined.select(
    df_full_unique["*"],
    *oscar_metric_columns,
    # The aggregate's year key is exposed under a clearer name.
    F.col("year_film").alias("oscar_year"),
)

In [13]:
# 7.1) Remove rows that have no 'film'
film_is_present = F.col("film").isNotNull()
df_silver_final = df_silver.where(film_is_present)


In [14]:
# 7.2) Normalise the numeric / control Oscar columns:
#      a NULL left behind by the left join becomes 0.
for metric_name in ("oscar_nominations", "oscar_wins", "oscar_year"):
    df_silver_final = df_silver_final.withColumn(
        metric_name,
        F.coalesce(F.col(metric_name), F.lit(0)),
    )

In [15]:
# 7.3) Normalise the Oscar categories array: NULL becomes an empty array<string>.
empty_string_array = F.array().cast("array<string>")
categories_or_empty = F.coalesce(F.col("oscar_categories"), empty_string_array)

df_silver_final = df_silver_final.withColumn("oscar_categories", categories_or_empty)

In [16]:
# 7.4) winner (from full_data): NULL -> False
# Only applied when the column is actually present in the frame.
if "winner" in df_silver_final.columns:
    winner_flag = F.col("winner")
    winner_without_nulls = F.when(winner_flag.isNull(), F.lit(False)).otherwise(winner_flag)
    df_silver_final = df_silver_final.withColumn("winner", winner_without_nulls)

In [17]:
# 7.5) Convenience flag: did the film win at least one Oscar?
has_at_least_one_win = F.col("oscar_wins") > 0
df_silver_final = df_silver_final.withColumn("won_any_oscar", has_at_least_one_win)


In [18]:
# Drop the original "year" column; its integer cast "year_int" stays in the frame.
redundant_column = "year"
df_silver_final = df_silver_final.drop(redundant_column)

In [19]:
# 7.6) Columns that are not of interest
# Extend this set to drop further columns.
unwanted_columns = {"nominees", "name"}

# Only columns actually present are listed, in frame order.
cols_to_drop = [name for name in df_silver_final.columns if name in unwanted_columns]

if cols_to_drop:
    df_silver_final = df_silver_final.drop(*cols_to_drop)

print("Linhas em df_silver_final:", df_silver_final.count())
df_silver_final.show(10, truncate=False)

Linhas em df_silver_final: 5077
+--------+----------+-----------------------------+----------------------------------------------------+-------------------------+------+--------+-----------------+----------+--------------------------------------------------------+----------+-------------+
|ceremony|class     |canonical_category           |category                                            |film                     |winner|year_int|oscar_nominations|oscar_wins|oscar_categories                                        |oscar_year|won_any_oscar|
+--------+----------+-----------------------------+----------------------------------------------------+-------------------------+------+--------+-----------------+----------+--------------------------------------------------------+----------+-------------+
|8       |Production|SOUND RECORDING              |SOUND RECORDING                                     |$1,000 a Minute          |false |1935    |1                |0         |[SOUND RECORDING]  

In [21]:
from pyspark.sql.functions import lower, col

# Convert the column names to lowercase.
lowercase_names = [name.lower() for name in df_silver_final.columns]
df_silver_final = df_silver_final.toDF(*lowercase_names)

# Convert string values to lowercase; columns of any other type pass through
# unchanged. The name -> dtype lookup is built once for the whole projection.
dtype_by_name = dict(df_silver_final.dtypes)
projected_columns = []
for name in df_silver_final.columns:
    if dtype_by_name[name] == "string":
        projected_columns.append(lower(col(name)).alias(name))
    else:
        projected_columns.append(col(name))

df_silver_final = df_silver_final.select(*projected_columns)

df_silver_final.show(5)


+--------+----------+--------------------+--------------------+--------------------+------+--------+-----------------+----------+--------------------+----------+-------------+
|ceremony|     class|  canonical_category|            category|                film|winner|year_int|oscar_nominations|oscar_wins|    oscar_categories|oscar_year|won_any_oscar|
+--------+----------+--------------------+--------------------+--------------------+------+--------+-----------------+----------+--------------------+----------+-------------+
|       8|production|     sound recording|     sound recording|     $1,000 a minute| false|    1935|                1|         0|   [SOUND RECORDING]|      1935|        false|
|      59|     title|international fea...|foreign language ...|                '38'| false|    1986|                1|         0|[INTERNATIONAL FE...|      1986|        false|
|      59|   writing|writing (original...|writing (screenpl...|  'crocodile' dundee| false|    1986|                1|  

In [22]:
from pyspark.sql.functions import regexp_replace, trim

# Remove every single (') or double (") quote from the film title, then trim
# the surrounding whitespace.
quote_chars = r"[\'\"]"
film_without_quotes = regexp_replace("film", quote_chars, "")

df_silver_final = df_silver_final.withColumn("film", trim(film_without_quotes))


In [24]:
from pyspark.sql.functions import expr

# Lowercase each element of the oscar_categories array; the string-column
# lowercasing above left it untouched, as its earlier output shows.
lowercase_each_category = expr("transform(oscar_categories, x -> lower(x))")

df_silver_final = df_silver_final.withColumn("oscar_categories", lowercase_each_category)


In [25]:
from pyspark.sql.functions import array_join

# Preview only: show each film next to its categories joined into one
# comma-separated string. df_silver_final itself is not modified.
categories_as_text = array_join("oscar_categories", ", ").alias("oscar_categories_str")

df_silver_final.select("film", categories_as_text).show(10, truncate=False)


+---------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|film                             |oscar_categories_str                                                                                                                                                                                                    |
+---------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|task force                       |documentary (short subject)                                                                                                                                                                                   

In [26]:
from delta.tables import DeltaTable

films_awards_path = "hdfs://hdfs-nn:9000/warehouse/silver.db/films_awards"

# CREATE TABLE ... USING DELTA LOCATION without a column list takes its schema
# from the Delta log already stored at the location. On a first run (or after
# the directory was cleaned) there is no log yet and the statement fails, so
# the location is seeded with the current data before the table is registered.
# When a Delta table is already there, nothing is written at this point.
if not DeltaTable.isDeltaTable(spark, films_awards_path):
    (
        df_silver_final.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .save(films_awards_path)
    )

# Register the external Delta table in the metastore (no-op when it exists).
spark.sql(f"""
CREATE TABLE IF NOT EXISTS silver.films_awards
USING DELTA
LOCATION '{films_awards_path}'
""")


DataFrame[]

In [27]:
# Overwrite the Delta table files at the films_awards location with the final
# frame, letting the stored schema be replaced by the frame's schema.
films_awards_location = "hdfs://hdfs-nn:9000/warehouse/silver.db/films_awards"

df_silver_final.write.save(
    films_awards_location,
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)


In [28]:
# Final verification: read the table back through the catalog, then report
# its row count and show a sample.
df_check = spark.table("silver.films_awards")

row_count = df_check.count()
print("Registos na tabela silver.films_awards:", row_count)

df_check.show(n=10, truncate=False)

Registos na tabela silver.films_awards: 5077
+--------+----------+-----------------------------+----------------------------------------------------+-------------------------+------+--------+-----------------+----------+--------------------------------------------------------+----------+-------------+
|ceremony|class     |canonical_category           |category                                            |film                     |winner|year_int|oscar_nominations|oscar_wins|oscar_categories                                        |oscar_year|won_any_oscar|
+--------+----------+-----------------------------+----------------------------------------------------+-------------------------+------+--------+-----------------+----------+--------------------------------------------------------+----------+-------------+
|8       |production|sound recording              |sound recording                                     |$1,000 a minute          |false |1935    |1                |0         |[sound

In [29]:
from pyspark.sql.functions import col, when, trim, lit
import pyspark.sql.functions as F

# Denominator for every percentage in the report.
total_count = df_silver_final.count()


def _count_rows_where(condition):
    """Return an aggregate that counts the rows for which `condition` holds."""
    return F.sum(when(condition, 1).otherwise(0))


def _percent_bad(c_name, c_type):
    """Build the "percent_nulo_<column>" aggregate for one column.

    NULL values always count as bad; for string columns, values that are
    empty after trimming count as bad too. The result is a percentage of
    `total_count`, rounded to two decimals.
    """
    # 1. Count nulls
    total_bad = _count_rows_where(col(c_name).isNull())

    # 2. Count empty strings (string columns only)
    if c_type == 'string':
        total_bad = total_bad + _count_rows_where(trim(col(c_name)) == "")

    # 3. Percentage, rounded with F.round
    return F.round(
        (total_bad / lit(total_count) * 100), 2
    ).alias(f"percent_nulo_{c_name}")


expressions = [_percent_bad(name, dtype) for name, dtype in df_silver_final.dtypes]

# Run all the aggregates in a single pass over the frame.
df_report = df_silver_final.agg(*expressions)
df_report.show(truncate=False)


+---------------------+------------------+-------------------------------+---------------------+-----------------+-------------------+---------------------+------------------------------+-----------------------+-----------------------------+-----------------------+--------------------------+
|percent_nulo_ceremony|percent_nulo_class|percent_nulo_canonical_category|percent_nulo_category|percent_nulo_film|percent_nulo_winner|percent_nulo_year_int|percent_nulo_oscar_nominations|percent_nulo_oscar_wins|percent_nulo_oscar_categories|percent_nulo_oscar_year|percent_nulo_won_any_oscar|
+---------------------+------------------+-------------------------------+---------------------+-----------------+-------------------+---------------------+------------------------------+-----------------------+-----------------------------+-----------------------+--------------------------+
|0.0                  |0.0               |0.0                            |0.0                  |0.0              |0.0    

In [30]:
# Stop the Spark session now that the table has been written and verified.
spark.stop()
print("Sessão Spark terminada.")

Sessão Spark terminada.
