In [1]:
#Instalar delta-spark 
!pip install delta-spark==2.4.0



In [1]:
#Imports + sessão Spark configurada (warehouse + hive + delta)

from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
from pyspark.sql.functions import col, trim, lower

# Spark session configuration.
# Root directory of the Spark SQL warehouse on HDFS.
warehouse_location = "hdfs://hdfs-nn:9000/warehouse"

builder = (
    SparkSession.builder
    .appName("Fase Gold - series_avaliadas")
    .config("spark.sql.warehouse.dir", warehouse_location)
    # Thrift endpoint of the Hive metastore used for table metadata.
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    # Delta Lake SQL extension and catalog implementation.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .enableHiveSupport()
)

# No explicit "spark.jars.packages" entry: configure_spark_with_delta_pip sets
# that key itself (to the delta-core artifact matching the installed
# delta-spark package), so a manual value on the builder would be overwritten.
spark = configure_spark_with_delta_pip(builder).getOrCreate()
print("Sessão Spark (Gold) iniciada com sucesso.")

Sessão Spark (Gold) iniciada com sucesso.


In [3]:
# Make sure the 'gold' database exists (created at an explicit HDFS location
# if it is missing), then make it the current database.
print("A criar/usar a base de dados 'gold'...")

create_gold_db_sql = "CREATE DATABASE IF NOT EXISTS gold LOCATION 'hdfs://hdfs-nn:9000/warehouse/gold.db/'"
spark.sql(create_gold_db_sql)

spark.sql("USE gold")

A criar/usar a base de dados 'gold'...


DataFrame[]

In [4]:
# Load the two Silver tables (tv_shows and rating_shows) and preview them.
df_tv_shows = spark.table("silver.tv_shows")
df_rating_shows = spark.table("silver.rating_shows")

# Column names of each table, tv_shows first.
for label, frame in (
    ("tv_shows colunas:", df_tv_shows),
    ("rating_shows colunas:", df_rating_shows),
):
    print(label, frame.columns)

# First five rows of each table, without truncating long cell values.
for heading, frame in (
    ("\nAmostra tv_shows:", df_tv_shows),
    ("\nAmostra rating_shows:", df_rating_shows),
):
    print(heading)
    frame.show(5, truncate=False)

tv_shows colunas: ['id', 'series_name', 'first_air_date', 'popularity', 'vote_average', 'vote_count']
rating_shows colunas: ['show', 'review', 'sentiment', 'review_type']

Amostra tv_shows:
+------+--------------------+--------------+----------+------------+----------+
|id    |series_name         |first_air_date|popularity|vote_average|vote_count|
+------+--------------------+--------------+----------+------------+----------+
|96246 |#LikeMe             |2019-01-13    |10.066    |9.7         |3         |
|2211  |'Allo 'Allo!        |1984-09-07    |28.808    |7.8         |162       |
|1568  |'Til Death          |2006-09-07    |9.945     |6.6         |44        |
|203417|'Til Jail Do Us Part|2022-09-15    |116.136   |7.6         |5         |
|8864  |.hack               |2002-04-04    |16.699    |6.8         |21        |
+------+--------------------+--------------+----------+------------+----------+
only showing top 5 rows


Amostra rating_shows:
+--------+--------------------------------

In [7]:
from pyspark.sql.functions import col, trim, lower

# Normalise the join keys: strip surrounding whitespace and lowercase, so the
# equality join below ignores case and padding differences.
# NOTE(review): the normalised value replaces the original column, so the
# lowercased title is what flows downstream - confirm this is intended.
tv_key = lower(trim(col("series_name")))
rt_key = lower(trim(col("show")))
df_tv_shows_clean = df_tv_shows.withColumn("series_name", tv_key)
df_rating_shows_clean = df_rating_shows.withColumn("show", rt_key)

# Alias both sides so each key can be referenced unambiguously by qualifier.
tv = df_tv_shows_clean.alias("tv")
rt = df_rating_shows_clean.alias("rt")
same_title = col("tv.series_name") == col("rt.show")

# Inner join: only shows that have at least one matching review row are kept.
df_series_avaliadas_tmp = tv.join(rt, on=same_title, how="inner")

joined_rows = df_series_avaliadas_tmp.count()
print("Linhas após junção:", joined_rows)
df_series_avaliadas_tmp.show(5, truncate=False)

Linhas após junção: 76164
+-----+-----------+--------------+----------+------------+----------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [9]:
#Fundir as colunas show e series_name
from pyspark.sql.functions import coalesce

# Collapse the two title columns into one: take tv.series_name, falling back
# to rt.show when it is null, store it as "series_name", then drop rt.show.
merged_title = coalesce(col("tv.series_name"), col("rt.show"))
with_merged_title = df_series_avaliadas_tmp.withColumn("series_name", merged_title)
df_series_avaliadas = with_merged_title.drop(col("rt.show"))

In [10]:
# Persist the result as the Delta table gold.series_avaliadas, replacing any
# previous contents of that table.
df_series_avaliadas.write.saveAsTable(
    "gold.series_avaliadas",
    format="delta",
    mode="overwrite",
)

print("Tabela 'gold.series_avaliadas' gravada com sucesso!")

Tabela 'gold.series_avaliadas' gravada com sucesso!


In [15]:
# Sanity check: list the tables registered in the gold database, then read the
# new table back and show its row count and first 20 rows.
gold_tables = spark.sql("SHOW TABLES IN gold")
gold_tables.show(truncate=False)

df_check = spark.table("gold.series_avaliadas")
final_count = df_check.count()
print("Contagem final na gold.series_avaliadas:", final_count)
df_check.show(20, truncate=False)

+---------+----------------+-----------+
|namespace|tableName       |isTemporary|
+---------+----------------+-----------+
|gold     |series_adaptadas|false      |
|gold     |series_avaliadas|false      |
+---------+----------------+-----------+

Contagem final na gold.series_avaliadas: 76164
+-----+-----------+--------------+----------+------------+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [4]:
from pyspark.sql.functions import concat_ws, col

# Export the Gold table as a single CSV file with a header row.
df = spark.table("gold.series_avaliadas")

# The explicit file:// scheme pins the output to the local filesystem, so the
# export does not depend on the cluster's fs.defaultFS setting (a scheme-less
# path would go to HDFS if that were the default filesystem).
(
    df.coalesce(1)  # one partition -> one part-*.csv file in the output directory
    .write
    .mode("overwrite")
    .option("header", "true")
    .csv("file:///tmp/series_avaliadas_csv")
)

In [5]:
# List the local CSV export directory to see the files it contains.
!ls /tmp/series_avaliadas_csv

part-00000-bbb9d184-5d29-4835-88ad-a6a53cd47102-c000.csv  _SUCCESS


In [6]:
!cp /tmp/series_avaliadas_csv/part-00000-bbb9d184-5d29-4835-88ad-a6a53cd47102-c000.csv /home/jovyan/series_avaliadas.csv
!ls -lah /home/jovyan/series_avaliadas.csv

-rw-r--r-- 1 jovyan users 26M Dec 28 11:15 /home/jovyan/series_avaliadas.csv
