In [1]:
!pip install delta-spark==2.4.0



In [2]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import (
    LongType, StringType, StructField, StructType, 
    BooleanType, ArrayType, IntegerType, DoubleType
)
from pyspark.sql.functions import col

In [3]:
# 1. Spark session configuration
# The session uses an HDFS warehouse directory, a remote Hive metastore and
# the Delta Lake SQL extension / catalog.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# The chain is parenthesised instead of using backslash continuations: the
# original ended with a dangling `\` that only parsed because a blank line
# followed it.
builder = (
    SparkSession.builder
    .appName("Projeto - Carga de Dados Oscar (CORRIGIDO)")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# configure_spark_with_delta_pip comes from the `delta` package; it wraps the
# builder before the session is created.
# NOTE(review): it presumably sets the Delta Maven coordinates itself, which
# would make the explicit spark.jars.packages entry above redundant - confirm.
spark = configure_spark_with_delta_pip(builder).getOrCreate()
print("Sessão Spark iniciada.")

Sessão Spark iniciada.


In [4]:
# === 3) Make sure the 'silver' database exists in the warehouse ===
# CREATE ... IF NOT EXISTS makes this cell safe to re-run.
create_silver_db_sql = """
CREATE DATABASE IF NOT EXISTS silver
LOCATION 'hdfs://hdfs-nn:9000/warehouse/silver.db/'
"""
spark.sql(create_silver_db_sql)

DataFrame[]

In [5]:
# === 4) Read the Bronze file ===
hdfs_path = "hdfs://hdfs-nn:9000/datasets/bronze/full_data/full_data.csv"

# Explicit schema, declared as (column name, Spark type) pairs.
# Every column is nullable.
customSchema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("Ceremony", IntegerType()),
        ("Year", StringType()),
        ("Class", StringType()),
        ("CanonicalCategory", StringType()),
        ("Category", StringType()),
        ("NomId", StringType()),
        ("Film", StringType()),
        ("FilmId", StringType()),
        ("Name", StringType()),
        ("Nominees", StringType()),
        ("NomineeIds", StringType()),
        ("Winner", BooleanType()),
        ("Detail", StringType()),
        ("Note", StringType()),
        ("Citation", StringType()),
        ("MultifilmNomination", BooleanType()),
    )
])

print(f"A ler ficheiro: {hdfs_path}")
# The file is TAB-separated with a header row; the schema is supplied
# explicitly rather than inferred.
oscar_df = (
    spark.read
         .schema(customSchema)
         .option("header", "true")
         .option("delimiter", "\t")
         .csv(hdfs_path)
)

print("Schema lido:")
oscar_df.printSchema()


A ler ficheiro: hdfs://hdfs-nn:9000/datasets/bronze/full_data/full_data.csv
Schema lido:
root
 |-- Ceremony: integer (nullable = true)
 |-- Year: string (nullable = true)
 |-- Class: string (nullable = true)
 |-- CanonicalCategory: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- NomId: string (nullable = true)
 |-- Film: string (nullable = true)
 |-- FilmId: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Nominees: string (nullable = true)
 |-- NomineeIds: string (nullable = true)
 |-- Winner: boolean (nullable = true)
 |-- Detail: string (nullable = true)
 |-- Note: string (nullable = true)
 |-- Citation: string (nullable = true)
 |-- MultifilmNomination: boolean (nullable = true)



In [6]:
# === 5) Minimal cleaning / validation ===
# Keep only the rows whose Year is exactly four digits (YYYY);
# any other value is filtered out.
print("A filtrar linhas com 'Year' inválido...")
year_is_four_digits = col("Year").rlike(r"^[0-9]{4}$")
oscar_df = oscar_df.filter(year_is_four_digits)


A filtrar linhas com 'Year' inválido...


In [7]:
import unicodedata
# Strip accents and lower-case a column name
def normalize_col(colname):
    """Return ``colname`` lower-cased with its combining marks removed.

    ``None`` is passed through unchanged.
    """
    if colname is None:
        return None
    # NFKD splits an accented character into its base character plus
    # combining marks; dropping the marks leaves the unaccented text.
    decomposed = unicodedata.normalize('NFKD', colname.lower())
    kept_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
    return ''.join(kept_chars)


In [8]:
# Apply the naming convention to every column (toDF renames positionally)
oscar_df = oscar_df.toDF(*map(normalize_col, oscar_df.columns))

print("Schema depois da normalização dos nomes:")
oscar_df.printSchema()



Schema depois da normalização dos nomes:
root
 |-- ceremony: integer (nullable = true)
 |-- year: string (nullable = true)
 |-- class: string (nullable = true)
 |-- canonicalcategory: string (nullable = true)
 |-- category: string (nullable = true)
 |-- nomid: string (nullable = true)
 |-- film: string (nullable = true)
 |-- filmid: string (nullable = true)
 |-- name: string (nullable = true)
 |-- nominees: string (nullable = true)
 |-- nomineeids: string (nullable = true)
 |-- winner: boolean (nullable = true)
 |-- detail: string (nullable = true)
 |-- note: string (nullable = true)
 |-- citation: string (nullable = true)
 |-- multifilmnomination: boolean (nullable = true)



In [9]:
# Remove the columns that are not needed
oscar_df = oscar_df.drop(*[
    "note", "citation", "multifilmnomination", "filmid",
    "nomid", "nomineeids", "detail",
])


In [10]:
from pyspark.sql import functions as F
import unicodedata

def normalize_colname(col):
    """Lower-case a column name and strip its accents.

    Args:
        col: Column name as a string, or ``None``.

    Returns:
        The lower-cased name with combining marks removed, or ``None`` when
        ``col`` is ``None``.
    """
    # Pass None through so this helper agrees with normalize_col, which does
    # the same instead of raising AttributeError on `.lower()`.
    if col is None:
        return None
    # NOTE: the parameter name shadows pyspark's `col` inside this function only.
    lowered = col.lower()
    # NFKD decomposition separates base characters from their combining
    # marks, so filtering out the marks removes the accents.
    return ''.join(
        ch for ch in unicodedata.normalize('NFKD', lowered)
        if not unicodedata.combining(ch)
    )

# Re-apply the normaliser to every column name. The names were already
# lower-cased and accent-stripped in an earlier cell, so this should leave
# them unchanged.
oscar_df = oscar_df.toDF(*map(normalize_colname, oscar_df.columns))

In [11]:
# Lower-casing collapsed "CanonicalCategory" into one word; rename it to snake_case.
oscar_df = oscar_df.withColumnRenamed("canonicalcategory", "canonical_category")


In [12]:
# Final column order; `year` goes last (it is the partition column used when
# the table is written).
ordered_columns = [
    "ceremony", "class", "canonical_category", "category",
    "film", "name", "nominees", "winner",
    "year",
]
oscar_df = oscar_df.select(*ordered_columns)
oscar_df.show(10, truncate=False)



+--------+----------+-------------------------+-------------+------------------------------+--------------------------------+--------------------------------+------+----+
|ceremony|class     |canonical_category       |category     |film                          |name                            |nominees                        |winner|year|
+--------+----------+-------------------------+-------------+------------------------------+--------------------------------+--------------------------------+------+----+
|7       |Acting    |ACTOR IN A LEADING ROLE  |ACTOR        |It Happened One Night         |Clark Gable                     |Clark Gable                     |true  |1934|
|7       |Acting    |ACTOR IN A LEADING ROLE  |ACTOR        |The Affairs of Cellini        |Frank Morgan                    |Frank Morgan                    |null  |1934|
|7       |Acting    |ACTOR IN A LEADING ROLE  |ACTOR        |The Thin Man                  |William Powell                  |William Powell      

In [13]:
# === 7) Write as a DELTA TABLE
print("A escrever tabela 'silver.full_data' (Delta, partitionBy Year)...")
# Partition by the normalised lower-case column name. The DataFrame column is
# `year`; the previous "Year" only resolved because Spark's column resolution
# is case-insensitive by default (spark.sql.caseSensitive=false).
(oscar_df.select(ordered_columns)
        .write
        .format("delta")
        .mode("overwrite")                  # replace the existing table contents
        .partitionBy("year")
        .option("overwriteSchema", "true")  # let the overwrite replace the table schema too
        .saveAsTable("silver.full_data"))

A escrever tabela 'silver.full_data' (Delta, partitionBy Year)...


In [14]:
# 8) Quick sanity checks on the table that was just written.
# Each entry pairs the heading to print with the action that displays the
# result; the actions run in order, right after their heading.
checks = [
    ("Tabelas em 'silver':",
     lambda: spark.sql("SHOW TABLES IN silver").show(truncate=False)),
    ("Contagem de registos:",
     lambda: spark.sql("SELECT COUNT(*) AS total FROM silver.full_data").show()),
    ("Amostra:",
     lambda: spark.table("silver.full_data").show(10, truncate=False)),
    ("Histórico Delta:",
     lambda: spark.sql("DESCRIBE HISTORY silver.full_data").show(truncate=False)),
]
for heading, run_check in checks:
    print(heading)
    run_check()

Tabelas em 'silver':
+---------+-----------------+-----------+
|namespace|tableName        |isTemporary|
+---------+-----------------+-----------+
|silver   |adaptations      |false      |
|silver   |adaptations_films|false      |
|silver   |books            |false      |
|silver   |full_data        |false      |
|silver   |the_oscar_award  |false      |
|silver   |tv_shows         |false      |
+---------+-----------------+-----------+

Contagem de registos:
+-----+
|total|
+-----+
|11719|
+-----+

Amostra:
+--------+------+--------------------------+--------------------------+------------------------+------------------+------------------+------+----+
|ceremony|class |canonical_category        |category                  |film                    |name              |nominees          |winner|year|
+--------+------+--------------------------+--------------------------+------------------------+------------------+------------------+------+----+
|14      |Acting|ACTOR IN A LEADING ROLE   |A

In [15]:
# Stop the Spark session now that the load is finished.
spark.stop()
print("Sessão Spark terminada.")

Sessão Spark terminada.
