In [1]:
!pip install delta-spark==2.4.0



In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, lower, trim, udf, regexp_extract
from pyspark.sql.types import DoubleType, IntegerType, StringType
from delta import *
import unicodedata

# HDFS location of the Hive warehouse
warehouse_location = "hdfs://hdfs-nn:9000/warehouse"

# Session settings grouped by concern; each entry becomes one builder.config() call.
spark_settings = {
    # ---- Hive settings ----
    "spark.sql.warehouse.dir": warehouse_location,
    "hive.metastore.uris": "thrift://hive-metastore:9083",
    "spark.sql.catalogImplementation": "hive",
    "hive.metastore.warehouse.dir": warehouse_location,
    # ---- Delta Lake extensions ----
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    # ---- Delta package matching Spark 3.4.1 (Scala 2.12) ----
    "spark.jars.packages": "io.delta:delta-core_2.12:2.4.0",
}

# Build the Spark session with Hive + Delta Lake support
session_builder = SparkSession.builder.appName("Silver_actorfilms_Treatment")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.enableHiveSupport().getOrCreate()

print("Spark iniciado com sucesso — versão:", spark.version)


Spark iniciado com sucesso — versão: 3.4.1


In [4]:
from pyspark.sql.types import StructType, StructField

# Location of the raw (bronze) CSV file on HDFS
bronze_path = "hdfs://hdfs-nn:9000/demo/bronze/actorfilms.csv"

# Simple, explicit schema: every column is loaded as a nullable string,
# so nothing is type-inferred at read time.
bronze_columns = ["Actor", "ActorID", "Film", "Year", "Votes", "Rating", "FilmID"]
schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in bronze_columns]
)

# The first line of the file is treated as a header row.
bronze_reader = spark.read.option("header", True).schema(schema)
actorfilms_bronze = bronze_reader.csv(bronze_path)

print("Bronze actorfilms carregado:")
actorfilms_bronze.show(10, truncate=False)


Bronze actorfilms carregado:
+------------+---------+---------------------------+----+-----+------+---------+
|Actor       |ActorID  |Film                       |Year|Votes|Rating|FilmID   |
+------------+---------+---------------------------+----+-----+------+---------+
|Fred Astaire|nm0000001|Ghost Story                |1981|7731 |6.3   |tt0082449|
|Fred Astaire|nm0000001|The Purple Taxi            |1977|533  |6.6   |tt0076851|
|Fred Astaire|nm0000001|The Amazing Dobermans      |1976|369  |5.3   |tt0074130|
|Fred Astaire|nm0000001|The Towering Inferno       |1974|39888|7     |tt0072308|
|Fred Astaire|nm0000001|Midas Run                  |1969|123  |4.8   |tt0064664|
|Fred Astaire|nm0000001|Finian's Rainbow           |1968|3377 |6.2   |tt0062974|
|Fred Astaire|nm0000001|The Notorious Landlady     |1962|1887 |6.8   |tt0056289|
|Fred Astaire|nm0000001|The Pleasure of His Company|1961|679  |6.9   |tt0055307|
|Fred Astaire|nm0000001|On the Beach               |1959|12066|7.2   |tt0053137|

In [5]:
# Create the `silver` database (no-op when it already exists).
# Its location is derived from `warehouse_location`, the warehouse root used to
# configure the Spark session, so the two paths cannot drift apart.
silver_db_location = f"{warehouse_location}/silver.db"

spark.sql(f"""
    CREATE DATABASE IF NOT EXISTS silver
    LOCATION '{silver_db_location}'
""")

print("Base de dados 'silver' pronta.")


Base de dados 'silver' pronta.


In [6]:
# Team logic for stripping accents
def remove_accents(text):
    """Return `text` with combining marks (accents) removed.

    The string is NFKD-normalised, which splits accented characters into a
    base character followed by combining marks, and every combining mark is
    then discarded. Characters with no decomposition are left untouched.

    Args:
        text: Input string, or None.

    Returns:
        The accent-free string, or None when `text` is None.
    """
    if text is None:
        return None
    decomposed = unicodedata.normalize('NFKD', text)
    # combining() is 0 for base characters and non-zero for combining marks.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return ''.join(base_chars)

# Wrap the pure-Python helper as a Spark UDF that returns a string column.
remove_accents_udf = udf(f=remove_accents, returnType=StringType())


In [7]:
def _normalised_text(source_column):
    """Build the normalisation expression for a text column.

    The value is trimmed, lower-cased, passed through `remove_accents_udf`,
    and finally stripped of every character outside `[a-z0-9 ]`.
    """
    lowered = lower(trim(col(source_column)))
    unaccented = remove_accents_udf(lowered)
    return regexp_replace(unaccented, r"[^a-z0-9 ]", "")


# 1 - Actor and Film are mandatory: discard rows where either one is null
required_rows = actorfilms_bronze.na.drop(subset=["Actor", "Film"])

# 2/3 - Normalised actor name and film title
with_names = (
    required_rows
    .withColumn("actor_norm", _normalised_text("Actor"))
    .withColumn("film_norm", _normalised_text("Film"))
)

# 4 - Year: take the first run of four digits and cast it to integer
year_digits = regexp_extract(col("Year"), r"(\d{4})", 1)
# 5 - Votes: keep digits only, then cast; Rating: cast straight to double
votes_digits = regexp_replace(col("Votes"), r"[^0-9]", "")
with_numbers = (
    with_names
    .withColumn("year", year_digits.cast(IntegerType()))
    .withColumn("votes_clean", votes_digits.cast(IntegerType()))
    .withColumn("rating_clean", col("Rating").cast(DoubleType()))
)

# 6 - Keep only rows with a parsed year inside the accepted range
valid_years = with_numbers.where(
    col("year").isNotNull() & col("year").between(1900, 2035)
)

# 7 - Remove duplicates on the normalised (actor, title, year) key
# NOTE(review): the key is built from normalised text, not from ActorID/FilmID,
# so distinct titles that normalise to the same string in the same year share
# one key — confirm this is intended.
deduplicated = valid_years.dropDuplicates(["actor_norm", "film_norm", "year"])

# 8 - Final, standardised column set
actorfilms_silver = deduplicated.select(
    col("actor_norm").alias("actor"),
    col("film_norm").alias("title"),
    col("year"),
    col("votes_clean").alias("votes"),
    col("rating_clean").alias("rating"),
    col("FilmID").alias("film_id"),
    col("ActorID").alias("actor_id"),
)

print("Silver actorfilms transformado:")
actorfilms_silver.show(20, truncate=False)


Silver actorfilms transformado:
+-------------+--------------------------+----+------+------+----------+---------+
|actor        |title                     |year|votes |rating|film_id   |actor_id |
+-------------+--------------------------+----+------+------+----------+---------+
|50 cent      |escape plan               |2013|228881|6.7   |tt1211956 |nm1265067|
|50 cent      |escape plan 2 hades       |2018|28730 |3.9   |tt6513656 |nm1265067|
|50 cent      |escape plan the extractors|2019|11894 |4.4   |tt6772804 |nm1265067|
|50 cent      |home of the brave         |2006|10500 |5.6   |tt0763840 |nm1265067|
|a martinez   |the take                  |1974|223   |5.5   |tt0072249 |nm0553436|
|aaliyah      |queen of the damned       |2002|52650 |5.3   |tt0238546 |nm0004691|
|aamir khan   |aatank hi aatank          |1995|770   |5.2   |tt0172089 |nm0451148|
|aamir khan   |baazi                     |1995|2213  |6.5   |tt0121989 |nm0451148|
|aamir khan   |dangal                    |2016|159974|8

In [8]:
# Storage path of the Delta table backing silver.actorfilms
silver_table_path = "hdfs://hdfs-nn:9000/warehouse/silver.db/actorfilms"

# Overwrite both the data and the schema of the table on every run.
delta_writer = actorfilms_silver.write.format("delta").mode("overwrite")
delta_writer = delta_writer.option("overwriteSchema", True)
delta_writer = delta_writer.option("path", silver_table_path)
delta_writer.saveAsTable("silver.actorfilms")

print("Tabela silver.actorfilms gravada com sucesso!")


Tabela silver.actorfilms gravada com sucesso!


In [12]:
# Spark's min/max are imported under aliases so they do not shadow Python's
# built-in min()/max() for every later cell run in the same kernel.
from pyspark.sql.functions import avg, col, count, countDistinct, desc
from pyspark.sql.functions import max as spark_max, min as spark_min

df = spark.table("silver.actorfilms")

# --- Numeric column checks ---
# Only the expected columns that actually exist in the table are summarised.
numeric_cols = ["year", "votes", "rating"]
numeric_present = [c for c in numeric_cols if c in df.columns]

if numeric_present:
    # One row holding min / max / avg / non-null count for each numeric column
    numeric_summary = df.select(
        *[spark_min(col(c)).alias(f"{c}_min") for c in numeric_present],
        *[spark_max(col(c)).alias(f"{c}_max") for c in numeric_present],
        *[avg(col(c)).alias(f"{c}_avg") for c in numeric_present],
        *[count(col(c)).alias(f"{c}_count") for c in numeric_present]
    )
    print("Resumo colunas numéricas")
    numeric_summary.show()
else:
    print("Sem colunas numéricas presentes entre:", numeric_cols)

# --- Categorical column checks ---
# Lists both old and new column names; only the ones that exist are used
categorical_cols = ["actor", "title", "actor_id", "film_id"]
categorical_present = [c for c in categorical_cols if c in df.columns]

if categorical_present:
    # One row holding non-null count and distinct count for each categorical column
    cat_summary = df.select(
        *[count(col(c)).alias(f"{c}_count") for c in categorical_present],
        *[countDistinct(col(c)).alias(f"{c}_distinct") for c in categorical_present]
    )
    print("Resumo colunas categóricas (colunas usadas:", categorical_present, ")")
    cat_summary.show(truncate=False)
else:
    print("Nenhuma das colunas categóricas existe na tabela. Colunas disponíveis:", df.columns)

# --- Top 5 most frequent values per categorical column ---
print("Top 5 valores mais frequentes por coluna categórica")
for c in categorical_present:
    print(f"--- Coluna: {c} ---")
    df.groupBy(col(c)).count().orderBy(desc("count")).show(5, truncate=False)


Resumo colunas numéricas
+--------+---------+----------+--------+---------+----------+------------------+-----------------+-----------------+----------+-----------+------------+
|year_min|votes_min|rating_min|year_max|votes_max|rating_max|          year_avg|        votes_avg|       rating_avg|year_count|votes_count|rating_count|
+--------+---------+----------+--------+---------+----------+------------------+-----------------+-----------------+----------+-----------+------------+
|    1914|      100|       1.0|    2021|  2371548|       9.3|1997.9535675196748|50419.35416167197|5.975763798405216|    191870|     191870|      191870|
+--------+---------+----------+--------+---------+----------+------------------+-----------------+-----------------+----------+-----------+------------+

Resumo colunas categóricas (colunas usadas: ['actor', 'title', 'actor_id', 'film_id'] )
+-----------+-----------+--------------+-------------+--------------+--------------+-----------------+----------------+
|

In [13]:
# Preview the stored table through Spark SQL (default show(): first 20 rows).
silver_preview = spark.sql("""
    SElECT * from silver.actorfilms
          """)
silver_preview.show()

+-------------+--------------------+----+------+------+---------+---------+
|        actor|               title|year| votes|rating|  film_id| actor_id|
+-------------+--------------------+----+------+------+---------+---------+
|      50 cent|caught in the cro...|2010|  1155|   4.4|tt1449379|nm1265067|
|      50 cent|          last vegas|2013|128857|   6.6|tt1204975|nm1265067|
|   a martinez|     change of habit|1969|  2127|   6.1|tt0065537|nm0553436|
|   a martinez|     ordinary sinner|2001|   196|   5.2|tt0259054|nm0553436|
|   a martinez|       whats cooking|2000|  2523|   6.9|tt0197096|nm0553436|
|   aamir khan|               earth|1998|  7458|   7.7|tt0150433|nm0451148|
|   aamir khan|                holi|1985|   554|   7.5|tt0087417|nm0451148|
|   aamir khan|                ishq|1997|  9958|   6.9|tt0133024|nm0451148|
|   aamir khan|     jawani zindabad|1990|   649|   5.7|tt0337650|nm0451148|
|   aamir khan|lagaan once upon ...|2001|105883|   8.1|tt0169102|nm0451148|
|   aamir kh

In [10]:
# Reload the table and list its column names to confirm the final schema.
df = spark.table("silver.actorfilms")
silver_column_names = df.columns
print(silver_column_names)


['actor', 'title', 'year', 'votes', 'rating', 'film_id', 'actor_id']
