In [1]:
# Instalar delta-spark
!pip install delta-spark==2.4.0



In [2]:
# Start the Spark session (Hive metastore + Delta Lake support)
from pyspark.sql import SparkSession

warehouse_location = "hdfs://hdfs-nn:9000/warehouse"

# Session settings, applied to the builder in the order listed.
spark_settings = {
    "spark.sql.warehouse.dir": warehouse_location,
    "hive.metastore.uris": "thrift://hive-metastore:9083",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.jars.packages": "io.delta:delta-core_2.12:2.4.0",
}

session_builder = SparkSession.builder.appName("Silver_Audience_Reviews")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Reuses an already-running session if there is one, otherwise creates it.
spark = session_builder.enableHiveSupport().getOrCreate()

# Bare expression so the notebook renders the session summary.
spark

In [3]:
# Explicit schema and read of the bronze CSV
from pyspark.sql.types import StructType, StructField, StringType, FloatType

bronze_path = "hdfs://hdfs-nn:9000/demo/bronze/audience_reviews.csv"

# Three nullable columns; rating is parsed as a float, the others as strings.
audience_schema = (
    StructType()
        .add("show", StringType(), True)
        .add("rating", FloatType(), True)
        .add("review", StringType(), True)
)

# The first line of the file is a header; the schema is enforced instead of inferred.
df_bronze = spark.read.csv(bronze_path, schema=audience_schema, header=True)

print("----- Bronze audience_reviews (amostra) -----")
df_bronze.show(n=10, truncate=False)
df_bronze.printSchema()

----- Bronze audience_reviews (amostra) -----
+----+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|show|rating|review                                                                                                                                                                                                                                                                                                  |
+----+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Test|0.0   |Test rev

In [4]:
# Create the silver database if it does not exist yet
silver_db_location = f"{warehouse_location}/silver.db"

spark.sql(f"""
    CREATE DATABASE IF NOT EXISTS silver
    LOCATION '{silver_db_location}'
""")

# List the databases known to the metastore as a quick confirmation.
databases = spark.sql("SHOW DATABASES")
databases.show(truncate=False)

+---------+
|namespace|
+---------+
|default  |
|gold     |
|silver   |
+---------+



In [5]:
# Transformations for the Silver layer
from pyspark.sql.functions import col, lower, when
from pyspark.sql.window import Window  # not needed by this cell any more; kept in case other cells rely on it
from pyspark.sql import functions as F

# 1) Remove the placeholder test record by matching its content.
#    Numbering rows with a global Window.orderBy("show", "rating", "review") and
#    dropping rn == 1 removes whichever row happens to sort first (NULLs and any
#    show name that sorts before "Test"), not necessarily the test record, and it
#    forces the whole dataset through a single partition.
#    The bronze sample shows the test record as show="Test" with a review that
#    starts with "Test".
#    coalesce(..., False) keeps rows whose show/review is NULL instead of letting
#    the NULL comparison result drop them in the filter.
is_test_row = F.coalesce(
    (col("show") == "Test") & col("review").startswith("Test"),
    F.lit(False),
)
df_no_test = df_bronze.filter(~is_test_row)

# 2) Classify the numeric rating as positive / negative (lowercase labels).
#    There is deliberately no .otherwise(): a missing or unparseable rating stays
#    NULL instead of being silently labelled "negative", so the null check on the
#    Silver table can actually detect it.
rating_value = col("rating").cast("double")
rating_label = (
    when(rating_value >= 2.5, "positive")
    .when(rating_value < 2.5, "negative")
)

# 3) Apply the label, lowercase show and review, then
# 4) drop rows that became identical after the normalisation.
df_clean = (
    df_no_test
        .withColumn("rating", rating_label)
        .withColumn("show", lower(col("show")))
        .withColumn("review", lower(col("review")))
        .dropDuplicates()
)

print("----- Silver audience_reviews (após transformações) -----")
df_clean.show(10, truncate=False)
df_clean.printSchema()

----- Silver audience_reviews (após transformações) -----
+--------+--------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [6]:
# Persist the Silver DataFrame as a Delta table registered in the Hive metastore
silver_table_path = f"{warehouse_location}/silver.db/audience_reviews"

# Overwrites both the data and the schema of any existing table at this path.
df_clean.write.saveAsTable(
    "silver.audience_reviews",
    format="delta",
    mode="overwrite",
    overwriteSchema=True,
    path=silver_table_path,
)

print("Tabela silver.audience_reviews carregada com sucesso.")

Tabela silver.audience_reviews carregada com sucesso.


In [7]:
# Read the Silver table back through the metastore for the verification steps
df_silver = spark.read.table("silver.audience_reviews")

print("----- Silver audience_reviews (lida do Hive) -----")
df_silver.show(n=10, truncate=False)
df_silver.printSchema()

silver_row_count = df_silver.count()
print("Total de linhas na Silver:", silver_row_count)

----- Silver audience_reviews (lida do Hive) -----
+--------+--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [8]:
# Null-value check: one output column per table column, holding its NULL count
from pyspark.sql import functions as F

print("----- Nulos por coluna (silver.audience_reviews) -----")

null_count_exprs = []
for column_name in df_silver.columns:
    # isNull() cast to int is 1 for NULL and 0 otherwise, so the sum is the count.
    null_flag = F.col(column_name).isNull().cast("int")
    null_count_exprs.append(F.sum(null_flag).alias(column_name))

df_nulls = df_silver.select(null_count_exprs)
df_nulls.show()

----- Nulos por coluna (silver.audience_reviews) -----
+----+------+------+
|show|rating|review|
+----+------+------+
|   0|     0|     0|
+----+------+------+



In [9]:
# Duplicate check: compare the total row count with the count of distinct full rows
print("----- Duplicados (linha inteira) -----")

total_linhas = df_silver.count()
linhas_unicas = df_silver.distinct().count()
duplicados = total_linhas - linhas_unicas

# Labels are padded so the printed values line up.
for report_label, report_value in (
    ("Total de linhas:        ", total_linhas),
    ("Total de linhas únicas: ", linhas_unicas),
    ("Duplicados:             ", duplicados),
):
    print(report_label, report_value)

----- Duplicados (linha inteira) -----
Total de linhas:         65421
Total de linhas únicas:  65421
Duplicados:              0


In [11]:
# Final showcase: a 20-row sample of the Silver table followed by its row count
print("----- Demonstração final da Silver audience_reviews (20 linhas) -----")
df_silver.show(n=20, truncate=False)

final_row_count = df_silver.count()
print("Total de linhas finais na Silver audience_reviews:", final_row_count)

----- Demonstração final da Silver audience_reviews (20 linhas) -----
+--------+--------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------