In [1]:
!pip install delta-spark==2.4.0



In [2]:
import matplotlib.pyplot as plt
import pandas as pd
from delta import *
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, when, sum, lit, trim
from pyspark.sql.functions import col, when, substring, trim, upper, lower
from pyspark.sql.types import IntegerType

In [3]:
# Spark session configuration.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# Build the session definition as one parenthesised chain: warehouse
# directory, Hive metastore endpoint, Delta SQL extension and catalog, and the
# Delta package coordinates, with Hive support switched on.
builder = (
    SparkSession.builder
    .appName("Junção Silver")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# configure_spark_with_delta_pip is presumably provided by `from delta import *`;
# it receives the builder and the resulting object creates (or reuses) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()
print("Sessão Spark (Silver v3.1) iniciada.")

Sessão Spark (Silver v3.1) iniciada.


In [4]:
# Base warehouse location on HDFS (same value used when configuring the session).
warehouse_location = "hdfs://hdfs-nn:9000/warehouse"

# Directories of the two Silver review datasets inside silver.db.
audience_path = f"{warehouse_location}/silver.db/audience_reviews"
critic_path   = f"{warehouse_location}/silver.db/critic_reviews"

# Both datasets are read directly from their Delta paths rather than by table name.
print("A carregar a Silver audience_reviews a partir do caminho Delta...")
df_audience = spark.read.format("delta").load(audience_path)

print("A carregar a Silver critic_reviews a partir do caminho Delta...")
df_critic = spark.read.format("delta").load(critic_path)

# Show the schema of each loaded DataFrame: audience first, then critic.
print("Schemas carregados:")
df_audience.printSchema()
df_critic.printSchema()

A carregar a Silver audience_reviews a partir do caminho Delta...
A carregar a Silver critic_reviews a partir do caminho Delta...
Schemas carregados:
root
 |-- show: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- review: string (nullable = true)

root
 |-- show: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- review: string (nullable = true)



In [5]:
# Prepare audience_reviews for the final union.
# Keeps `show` and `review`, exposes the audience `rating` column under the
# shared name `sentiment`, and tags every row with its source.
df_audience_sel = (
    df_audience
        .select(
            F.col("show"),
            F.col("review"),
            # rating is expected to already hold Positive/Negative labels
            # (not re-validated in this cell).
            F.col("rating").alias("sentiment"),
        )
        .withColumn("review_type", F.lit("audience"))
)

print("Schema audience_reviews (selecionado):")
df_audience_sel.printSchema()
df_audience_sel.show(5, truncate=False)

Schema audience_reviews (selecionado):
root
 |-- show: string (nullable = true)
 |-- review: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- review_type: string (nullable = false)

+--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [6]:
# Prepare critic_reviews for the final union.
# The critic data already carries a `sentiment` column, so the columns are
# selected as they are and only the constant source tag is appended.
df_critic_sel = df_critic.select(
    "show",
    "review",
    "sentiment",   # expected to already hold Positive/Negative labels
    F.lit("critic").alias("review_type"),
)

print("Schema critic_reviews (selecionado):")
df_critic_sel.printSchema()
df_critic_sel.show(n=5, truncate=False)

Schema critic_reviews (selecionado):
root
 |-- show: string (nullable = true)
 |-- review: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- review_type: string (nullable = false)

+--------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+-----------+
|show                |review                                                                                                                                                                                                                                                    |sentiment|review_type|
+--------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [7]:
# Final union of the two prepared tables.
# Columns are matched by name, not by position; missing columns are not
# tolerated (the default, spelled out here), so both inputs must share a schema.
df_rating_shows = df_audience_sel.unionByName(
    df_critic_sel,
    allowMissingColumns=False,
)

print("Schema da tabela unificada:")
df_rating_shows.printSchema()

print("Amostra de linhas:")
df_rating_shows.show(n=20, truncate=False)

Schema da tabela unificada:
root
 |-- show: string (nullable = true)
 |-- review: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- review_type: string (nullable = false)

Amostra de linhas:
+--------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [9]:
# Final checks on the unified table.

# Count each DataFrame once and reuse the numbers for reporting and validation.
audience_total = df_audience_sel.count()
critic_total = df_critic_sel.count()
union_total = df_rating_shows.count()

print("Total audience:", audience_total)
print("Total critic:", critic_total)
print("Total após união:", union_total)

# The union must neither drop nor duplicate rows. Failing here stops a
# "Run All" before the table is written in the next cell.
assert union_total == audience_total + critic_total, (
    f"Union row count {union_total} differs from "
    f"audience ({audience_total}) + critic ({critic_total})"
)

print("Distribuição por tipo de review:")
df_rating_shows.groupBy("review_type").count().show()

print("Distribuição de sentimentos:")
df_rating_shows.groupBy("sentiment").count().show()

# One aggregate per column: `when` yields a non-null value only for rows where
# the column is NULL, and `count` counts non-null values, giving the NULL total.
print("Nulos por coluna:")
null_count_exprs = [
    F.count(F.when(F.col(c).isNull(), c)).alias(c)
    for c in df_rating_shows.columns
]
df_rating_shows.select(null_count_exprs).show()

print("Amostra de 50 linhas:")
df_rating_shows.show(50, truncate=False)

Total audience: 65421
Total critic: 14790
Total após união: 80211
Distribuição por tipo de review:
+-----------+-----+
|review_type|count|
+-----------+-----+
|   audience|65421|
|     critic|14790|
+-----------+-----+

Distribuição de sentimentos:
+---------+-----+
|sentiment|count|
+---------+-----+
| Positive|59045|
| Negative|21166|
+---------+-----+

Nulos por coluna:
+----+------+---------+-----------+
|show|review|sentiment|review_type|
+----+------+---------+-----------+
|   0|     0|        0|          0|
+----+------+---------+-----------+

Amostra de 50 linhas:
+--------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [10]:
# Save the unified table into the silver database as a Delta table.
# "overwrite" mode replaces whatever the table held before.
(
    df_rating_shows.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("silver.rating_shows")
)

print("Tabela 'silver.rating_shows' gravada com sucesso!")

Tabela 'silver.rating_shows' gravada com sucesso!


In [11]:
# Stop the Spark session created earlier in this notebook.
spark.stop()