In [1]:
# Instalar delta-spark

!pip install delta-spark==2.4.0



In [2]:
from pyspark.sql import SparkSession

warehouse_location = "hdfs://hdfs-nn:9000/warehouse"

# Session settings: warehouse directory, Hive metastore endpoint, and the
# Delta Lake SQL extension / catalog / package coordinates.
session_conf = {
    "spark.sql.warehouse.dir": warehouse_location,
    "hive.metastore.uris": "thrift://hive-metastore:9083",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.jars.packages": "io.delta:delta-core_2.12:2.4.0",
}

# Apply every setting to the builder before the session is created.
builder = SparkSession.builder.appName("Silver_Critic_Reviews")
for conf_key, conf_value in session_conf.items():
    builder = builder.config(conf_key, conf_value)

# Hive support is enabled so tables can be registered in the metastore;
# getOrCreate() reuses an already-running session if there is one.
spark = builder.enableHiveSupport().getOrCreate()

# Last expression of the cell: displays the session summary.
spark

In [3]:
# Define the schema and read the bronze CSV
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

bronze_path = "hdfs://hdfs-nn:9000/demo/bronze/critic_reviews.csv"

# Explicit schema (every column nullable) so Spark does not infer the types.
critic_schema = (
    StructType()
        .add(StructField("show", StringType(), True))
        .add(StructField("sentiment", IntegerType(), True))
        .add(StructField("review", StringType(), True))
)

# header=True: the first line of the file is consumed as column names.
df_bronze = spark.read.csv(bronze_path, schema=critic_schema, header=True)

# Quick look at a sample of rows and at the applied schema.
print("----- Bronze critic_reviews (amostra) -----")
df_bronze.show(n=10, truncate=False)
df_bronze.printSchema()

----- Bronze critic_reviews (amostra) -----
+----------------+---------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|show            |sentiment|review                                                                                                                                                                                                                                                  |
+----------------+---------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Mare of Easttown|1        |I’m not sure I have engaged in appointment viewing in years! It was nice to be giddy on a Sund

In [4]:
# Create the silver database if it does not exist yet
create_silver_db_sql = """
    CREATE DATABASE IF NOT EXISTS silver
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/silver.db'
"""
spark.sql(create_silver_db_sql)

# List the databases known to the catalog to confirm `silver` is present.
databases_df = spark.sql("SHOW DATABASES")
databases_df.show()

+---------+
|namespace|
+---------+
|  default|
|     gold|
|   silver|
+---------+



In [5]:
# Silver-layer transformations (critic_reviews)
from pyspark.sql.functions import col, lower
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# 1) Tag every row with an id that follows the order in which the bronze file
#    is read. A row_number() over a window sorted by (show, sentiment, review)
#    would instead number rows by sorted content, so "row 1" would be the
#    smallest row in sort order rather than the first line of the file, and the
#    un-partitioned window would move all rows into a single partition.
#    NOTE(review): assumes the bronze CSV is a single file, so that id 0 is the
#    first data line after the header - confirm if the source becomes multi-file.
df_with_rn = df_bronze.withColumn("rn", F.monotonically_increasing_id())

# 2) Drop the first data row of the file (described as a test row).
#    monotonically_increasing_id() starts at 0 in the first partition.
df_no_test = df_with_rn.filter(col("rn") > 0).drop("rn")

# 3) Lowercase the text columns only. `sentiment` is an IntegerType column:
#    passing it through lower() would implicitly cast it to string and change
#    the schema of the silver table, so it is left untouched.
df_clean = (
    df_no_test
        .withColumn("show", lower(col("show")))
        .withColumn("review", lower(col("review")))
)

# 4) Remove fully duplicated rows (done after lowercasing, so rows that
#    differed only by letter case are collapsed as well).
df_clean = df_clean.dropDuplicates()

print("----- Silver critic_reviews (após transformações) -----")
df_clean.show(10, truncate=False)
df_clean.printSchema()

----- Silver critic_reviews (após transformações) -----
+--------+---------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|show    |sentiment|review                                                                                                                                                                                                                              |
+--------+---------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|11.22.63|0        |11.22.63 reaches some thoughtful moving conclusions but oh what coulda been with a more engaged star. if only there were a time machine to fix that mistake.                  

In [6]:
# Load the silver table as Delta and register it in Hive
silver_table_path = "hdfs://hdfs-nn:9000/warehouse/silver.db/critic_reviews"

# Overwrite any previous contents; overwriteSchema is set so the write is
# allowed to replace the existing table schema as well.
silver_writer = df_clean.write.format("delta").mode("overwrite")
silver_writer = silver_writer.option("overwriteSchema", True)
silver_writer = silver_writer.option("path", silver_table_path)

# Writes the data to the path above and registers it as silver.critic_reviews.
silver_writer.saveAsTable("silver.critic_reviews")

print("Tabela silver.critic_reviews carregada com sucesso.")

Tabela silver.critic_reviews carregada com sucesso.


In [7]:
# Read the loaded table back to run the verification checks
df_silver = spark.read.table("silver.critic_reviews")

print("----- Silver critic_reviews (lida do Hive) -----")
df_silver.show(n=10, truncate=False)
df_silver.printSchema()

# Row count of the persisted table.
silver_row_count = df_silver.count()
print("Total de linhas na Silver:", silver_row_count)

----- Silver critic_reviews (lida do Hive) -----
+--------+---------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|show    |sentiment|review                                                                                                                                                                                                                              |
+--------+---------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|11.22.63|0        |11.22.63 reaches some thoughtful moving conclusions but oh what coulda been with a more engaged star. if only there were a time machine to fix that mistake.                         

In [8]:
# Null-value check
from pyspark.sql import functions as F

print("----- Nulos por coluna (silver.critic_reviews) -----")

# One aggregate per column: the isNull flag is cast to 0/1 and summed,
# giving the number of nulls in that column under the column's own name.
null_count_exprs = []
for column_name in df_silver.columns:
    is_null_flag = F.col(column_name).isNull().cast("int")
    null_count_exprs.append(F.sum(is_null_flag).alias(column_name))

df_nulls = df_silver.select(null_count_exprs)
df_nulls.show()

----- Nulos por coluna (silver.critic_reviews) -----
+----+---------+------+
|show|sentiment|review|
+----+---------+------+
|   0|        0|     1|
+----+---------+------+



In [9]:
# Duplicate-row check (whole-row comparison)
print("----- Duplicados (linha inteira) -----")

# Duplicates = total rows minus the number of distinct rows.
total_linhas = df_silver.count()
linhas_unicas = df_silver.distinct().count()
duplicados = total_linhas - linhas_unicas

# Labels are padded so the values line up in the printed report.
report_lines = (
    ("Total de linhas:        ", total_linhas),
    ("Total de linhas únicas: ", linhas_unicas),
    ("Duplicados:             ", duplicados),
)
for label, value in report_lines:
    print(label, value)

----- Duplicados (linha inteira) -----
Total de linhas:         14790
Total de linhas únicas:  14790
Duplicados:              0


In [10]:
# Final demonstration: a 20-row sample and the final row count
final_banner = "----- Demonstração final da Silver critic_reviews (20 linhas) -----"
print(final_banner)
df_silver.show(n=20, truncate=False)

final_row_count = df_silver.count()
print("Total de linhas finais na Silver critic_reviews:", final_row_count)

----- Demonstração final da Silver critic_reviews (20 linhas) -----
+--------+---------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|show    |sentiment|review                                                                                                                                                                                                                              |
+--------+---------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|11.22.63|0        |11.22.63 reaches some thoughtful moving conclusions but oh what coulda been with a more engaged star. if only there were a time machine to fix that mistake.      