In [1]:
!pip install delta-spark==2.4.0



In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, lower, trim, udf
from pyspark.sql.types import DoubleType, IntegerType, StringType, DateType
from delta import *
from pyspark.sql import functions as F
from functools import reduce

import unicodedata

# Hive warehouse location on HDFS
warehouse_location = "hdfs://hdfs-nn:9000/warehouse"

# Session settings, applied to the builder in the order they are listed.
spark_conf = {
    # ---- Hive settings ----
    "spark.sql.warehouse.dir": warehouse_location,
    "hive.metastore.uris": "thrift://hive-metastore:9083",
    "spark.sql.catalogImplementation": "hive",
    "hive.metastore.warehouse.dir": warehouse_location,
    # ---- Delta Lake extensions ----
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    # ---- Delta package compatible with Spark 3.4.1 (Scala 2.12) ----
    "spark.jars.packages": "io.delta:delta-core_2.12:2.4.0",
}

# Create the Spark session with Hive + Delta Lake support
session_builder = SparkSession.builder.appName("Silver_Rotten_Tomatoes_Movies_Treatment")
for conf_key, conf_value in spark_conf.items():
    session_builder = session_builder.config(conf_key, conf_value)

spark = session_builder.enableHiveSupport().getOrCreate()
print("Spark iniciado com sucesso — versão:", spark.version)


Spark iniciado com sucesso — versão: 3.4.1


In [3]:
# HDFS location of the bronze-layer Rotten Tomatoes movies CSV.
hdfs_path = "hdfs://hdfs-nn:9000/datasets/bronze/Rotten_Tomatoes_Movies.csv"

In [4]:
# HDFS location of the bronze-layer Rotten Tomatoes movies CSV.
bronze_path = "hdfs://hdfs-nn:9000/datasets/bronze/Rotten_Tomatoes_Movies.csv"

# Simple, explicit schema
from pyspark.sql.types import StructType, StructField

customSchema = StructType([
    StructField("movie_title", StringType(), True),
    StructField("movie_info", StringType(), True),
    StructField("critics_consensus", StringType(), True),
    StructField("rating", StringType(), True),
    StructField("genre", StringType(), True),
    StructField("directors", StringType(), True),
    StructField("writers", StringType(), True),
    StructField("cast", StringType(), True),
    StructField("in_theaters_date", DateType(), True),
    StructField("on_streaming_date", DateType(), True),
    StructField("runtime_in_minutes", IntegerType(), True),
    StructField("studio_name", StringType(), True),
    StructField("tomatometer_status", StringType(), True),
    StructField("tomatometer_rating", IntegerType(), True),
    StructField("tomatometer_count", IntegerType(), True),
    StructField("audience_rating", IntegerType(), True),
    StructField("audience_count", IntegerType(), True),
])

# Read from the path defined in this cell, so the cell does not depend on a
# variable that only exists if another cell was executed first.
movies_df = (
    spark.read
    .option("header", "true")
    .option("delimiter", ",")
    .schema(customSchema)
    .csv(bronze_path)
)

print("Dados lidos da Bronze:")
movies_df.show(5, truncate=False)

Dados lidos da Bronze:
+--------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [5]:
# Create the 'silver' database at an explicit HDFS location; the statement is
# a no-op when the database already exists (IF NOT EXISTS).
spark.sql("""
    CREATE DATABASE IF NOT EXISTS silver
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/silver.db'
""")

print("Base de dados 'silver' pronta.")


Base de dados 'silver' pronta.


In [6]:
# Team logic for stripping accents
def remove_accents(text):
    """Return ``text`` with combining marks (accents) removed.

    The string is decomposed with Unicode NFKD normalisation and every
    character whose combining class is non-zero is discarded. ``None`` is
    passed through unchanged.
    """
    if text is None:
        return None

    decomposed = unicodedata.normalize('NFKD', text)
    base_characters = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return ''.join(base_characters)

# Wrap remove_accents as a Spark UDF declared to return a string.
remove_accents_udf = udf(remove_accents, StringType())


In [7]:
def reset_silver_table(table_name: str):
    """Drop a table from the ``silver`` database and delete its files on HDFS.

    The table is removed with ``DROP TABLE IF EXISTS`` and the directory
    ``/warehouse/silver.db/<table_name>`` is removed with
    ``hdfs dfs -rm -r -f``. A message describing the outcome is printed.

    Args:
        table_name: Name of the table inside the ``silver`` database. It is
            interpolated as-is into the SQL statement and the HDFS path, so
            it must be a trusted identifier.
    """
    # Imported locally because the notebook's import cell does not provide a
    # module for running external commands.
    import subprocess

    spark.sql(f"DROP TABLE IF EXISTS silver.{table_name}")

    table_dir = f"/warehouse/silver.db/{table_name}"
    # Argument list (no shell) so the table name is never parsed by a shell.
    command = ["hdfs", "dfs", "-rm", "-r", "-f", table_dir]
    try:
        exit_code = subprocess.run(command, check=False).returncode
        failure = None if exit_code == 0 else f"código de saída {exit_code}"
    except OSError as exc:
        # The hdfs client could not be started at all.
        failure = str(exc)

    if failure is not None:
        # Do not claim success when the files were not removed.
        print(f"Aviso: falha ao remover {table_dir} do HDFS ({failure})")
        return

    print(f"Tabela silver.{table_name} limpa com sucesso!")

In [8]:
# One predicate per column: true when the value is NULL or blank after trimming.
conditions = [
    F.col(column_name).isNull() | (F.trim(F.col(column_name)) == "")
    for column_name in movies_df.columns
]

# Keep only the rows where no column is missing, then remove exact duplicates.
any_missing = reduce(lambda left, right: left | right, conditions)
df_clean = movies_df.filter(~any_missing).dropDuplicates()

print(f"Linhas após limpeza: {df_clean.count()}")

Linhas após limpeza: 5553


In [9]:
def _build_movies_silver(source_df):
    """Derive the silver-layer movie ratings frame from ``source_df``.

    Steps, in order: lower-case the text columns, add ``title_norm`` (title
    without characters other than letters, digits and whitespace, trimmed)
    and ``release_year`` (year of ``in_theaters_date``), replace negative
    ratings with NULL, keep one row per ``title_norm``, and project the
    output columns.
    """
    lowercase_columns = (
        "movie_title",
        "writers",
        "tomatometer_status",
        "critics_consensus",
        "cast",
    )
    rating_columns = ("tomatometer_rating", "audience_rating")
    discarded_columns = (
        "in_theaters_date",
        "on_streaming_date",
        "runtime_in_minutes",
        "studio_name",
        "genre",
        "directors",
        "movie_info",
    )
    output_columns = (
        "movie_title",
        "writers",
        "release_year",
        "rating",
        "tomatometer_status",
        "tomatometer_rating",
        "tomatometer_count",
        "audience_rating",
        "audience_count",
        "critics_consensus",
        "cast",
    )

    result = source_df
    for name in lowercase_columns:
        result = result.withColumn(name, F.lower(F.col(name)))

    # title_norm is built from the already lower-cased movie_title.
    title_without_symbols = F.regexp_replace(F.col("movie_title"), r"[^\p{L}\p{N}\s]", "")
    result = result.withColumn("title_norm", F.lower(F.trim(title_without_symbols)))
    result = result.withColumn("release_year", F.year(F.col("in_theaters_date")))

    for name in rating_columns:
        rating = F.col(name)
        result = result.withColumn(name, F.when(rating < 0, None).otherwise(rating))

    # title_norm only serves as the de-duplication key; it is not selected below.
    result = result.dropDuplicates(["title_norm"])
    result = result.drop(*discarded_columns)
    return result.select(*output_columns)


movies_silver_df = _build_movies_silver(df_clean)




In [10]:
# Overwrite the Delta table silver.rating_movies (data and schema) at its
# explicit HDFS path.
(
    movies_silver_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .option("path", "hdfs://hdfs-nn:9000/warehouse/silver.db/rating_movies")
    .saveAsTable("silver.rating_movies")
)

print("Tabela 'silver.rating_movies' gravada com sucesso!")


Tabela 'silver.rating_movies' gravada com sucesso!


In [11]:
# Read the table back through SQL and display its first rows as a sanity
# check on the write.
spark.sql("""
SELECT * 
FROM silver.rating_movies
""").show()

+--------------------+--------------------+------------+------+------------------+------------------+-----------------+---------------+--------------+--------------------+--------------------+
|         movie_title|             writers|release_year|rating|tomatometer_status|tomatometer_rating|tomatometer_count|audience_rating|audience_count|   critics_consensus|                cast|
+--------------------+--------------------+------------+------+------------------+------------------+-----------------+---------------+--------------+--------------------+--------------------+
|            10 years|        jamie linden|        2012| PG-13|             fresh|                60|               57|             40|         11767|a sweet ensemble ...|channing tatum, j...|
|      102 dalmatians|kristen buckley, ...|        2000|     G|            rotten|                31|               90|             32|        406503|this sequel to th...|glenn close, géra...|
|             13 sins|daniel stamm,

In [12]:
# Stop the Spark session now that the notebook's work is finished.
spark.stop()