In [1]:
!pip install delta-spark==2.4.0



In [2]:
from pyspark.sql import SparkSession
from delta import *
from pyspark.sql.types import (
    LongType, StringType, StructField, StructType, 
    BooleanType, ArrayType, IntegerType, DoubleType
)
from pyspark.sql.functions import col

In [3]:
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# Session settings, applied to the builder in the order listed: warehouse
# directory, Hive metastore endpoint, and the Delta SQL extension/catalog.
# NOTE(review): configure_spark_with_delta_pip presumably also sets
# "spark.jars.packages" itself -- confirm whether the explicit entry is redundant.
spark_settings = [
    ("spark.sql.warehouse.dir", warehouse_location),
    ("hive.metastore.uris", "thrift://hive-metastore:9083"),
    ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
    ("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"),
    ("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0"),
]

builder = SparkSession.builder.appName("Projeto - Carga de Dados Oscar (Simples)")
for setting_key, setting_value in spark_settings:
    builder = builder.config(setting_key, setting_value)
builder = builder.enableHiveSupport()

# Let the Delta helper finish configuring the builder, then start (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()
print("Sessão Spark iniciada.")

Sessão Spark iniciada.


In [4]:
# 2) Read the CSV from HDFS
hdfs_path = "hdfs://hdfs-nn:9000/datasets/bronze/the_oscar_award/the_oscar_award.csv"

# Explicit schema as (column name, Spark type) pairs; every field is nullable.
schema_fields = [
    ("year_film", IntegerType()),
    ("year_ceremony", IntegerType()),
    ("ceremony", IntegerType()),
    ("category", StringType()),
    ("canon_category", StringType()),
    ("name", StringType()),
    ("film", StringType()),
    ("winner", BooleanType()),
]
customSchema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in schema_fields]
)

print(f"A ler CSV de: {hdfs_path}")
# Comma-delimited file with a header row, parsed with the schema above.
csv_reader = spark.read.options(delimiter=",", header="true").schema(customSchema)
oscar_df = csv_reader.csv(hdfs_path)

# Show the resulting schema and the first five rows without truncating values.
print("Schema e dados lidos:")
oscar_df.printSchema()
oscar_df.show(5, truncate=False)

A ler CSV de: hdfs://hdfs-nn:9000/datasets/bronze/the_oscar_award/the_oscar_award.csv
Schema e dados lidos:
root
 |-- year_film: integer (nullable = true)
 |-- year_ceremony: integer (nullable = true)
 |-- ceremony: integer (nullable = true)
 |-- category: string (nullable = true)
 |-- canon_category: string (nullable = true)
 |-- name: string (nullable = true)
 |-- film: string (nullable = true)
 |-- winner: boolean (nullable = true)

+---------+-------------+--------+--------+-------------------------+-------------------+----------------------+------+
|year_film|year_ceremony|ceremony|category|canon_category           |name               |film                  |winner|
+---------+-------------+--------+--------+-------------------------+-------------------+----------------------+------+
|1927     |1928         |1       |ACTOR   |ACTOR IN A LEADING ROLE  |Richard Barthelmess|The Noose             |false |
|1927     |1928         |1       |ACTOR   |ACTOR IN A LEADING ROLE  |Richard Bar

In [5]:
# Remove the year_ceremony, ceremony and name columns from the DataFrame.
columns_to_drop = ("year_ceremony", "ceremony", "name")
oscar_df = oscar_df.drop(*columns_to_drop)


In [6]:
# 3) Create/ensure the 'silver' DB and write as a Delta TABLE
print("A reordenar colunas e a preparar para a escrita...")

# Column order used when selecting from the DataFrame before the write.
ordered_columns = ["canon_category", "film", "category", "winner", "year_film"]

A reordenar colunas e a preparar para a escrita...


In [7]:
# Remove any previously registered silver.the_oscar_award table (no-op if absent).
drop_table_statement = "DROP TABLE IF EXISTS silver.the_oscar_award"
spark.sql(drop_table_statement)


DataFrame[]

In [8]:
# Make sure the 'silver' database exists at its HDFS location.
spark.sql("""
  CREATE DATABASE IF NOT EXISTS silver
  LOCATION 'hdfs://hdfs-nn:9000/warehouse/silver.db'
""")

# Write AND REGISTER as a Delta table (allows SELECT on silver.the_oscar_award).
# Columns are selected in `ordered_columns` order; the table is partitioned
# by year_film and replaces any existing contents.
silver_writer = oscar_df.select(ordered_columns).write
silver_writer = silver_writer.mode("overwrite").format("delta")
silver_writer = silver_writer.partitionBy("year_film")
silver_writer.saveAsTable("silver.the_oscar_award")

print("Dados escritos com sucesso para a tabela silver.the_oscar_award")



Dados escritos com sucesso para a tabela silver.the_oscar_award


In [9]:
# 4) Checks
# Preview up to ten rows of the table that was just written.
print("SELECT * da tabela 'silver.the_oscar_award':")
sample_rows = spark.sql("SELECT * FROM silver.the_oscar_award LIMIT 10")
sample_rows.show()

# Count rows per film year, most recent year first.
print("Contagem de nomeações por ano (GROUP BY):")
nominations_per_year = spark.sql("""
    SELECT year_film, COUNT(*) AS total_nominations
    FROM silver.the_oscar_award
    GROUP BY year_film
    ORDER BY year_film DESC
""")
nominations_per_year.show()

SELECT * da tabela 'silver.the_oscar_award':
+--------------------+-----------------+--------------------+------+---------+
|      canon_category|             film|            category|winner|year_film|
+--------------------+-----------------+--------------------+------+---------+
|ACTOR IN A LEADIN...|             Reds|ACTOR IN A LEADIN...| false|     1981|
|ACTOR IN A LEADIN...|   On Golden Pond|ACTOR IN A LEADIN...|  true|     1981|
|ACTOR IN A LEADIN...|    Atlantic City|ACTOR IN A LEADIN...| false|     1981|
|ACTOR IN A LEADIN...|           Arthur|ACTOR IN A LEADIN...| false|     1981|
|ACTOR IN A LEADIN...|Absence of Malice|ACTOR IN A LEADIN...| false|     1981|
|ACTOR IN A SUPPOR...|Only When I Laugh|ACTOR IN A SUPPOR...| false|     1981|
|ACTOR IN A SUPPOR...|           Arthur|ACTOR IN A SUPPOR...|  true|     1981|
|ACTOR IN A SUPPOR...| Chariots of Fire|ACTOR IN A SUPPOR...| false|     1981|
|ACTOR IN A SUPPOR...|             Reds|ACTOR IN A SUPPOR...| false|     1981|
|ACTOR 

In [10]:
# Stop the Spark session created earlier in the notebook, then confirm on stdout.
spark.stop()
print("Sessão Spark terminada.")

Sessão Spark terminada.
