In [13]:
# Cell 1: Install dependencies if needed
!pip install delta-spark==2.4.0



In [14]:
# Cell 2: Import necessary modules and start Spark Session
from pyspark.sql import SparkSession
from delta import *

# Caminho do warehouse Hive no HDFS
warehouse_location = "hdfs://hdfs-nn:9000/warehouse"

# Criação da sessão Spark com suporte a Hive + Delta Lake
spark = (
    SparkSession.builder
    .appName("Gold_Adaptacoes_Filmes")
    # ---- configurações Hive ----
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.catalogImplementation", "hive")
    .config("hive.metastore.warehouse.dir", warehouse_location)
    # ---- extensões Delta Lake ----
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # ---- pacote Delta compatível com Spark 3.4.1 (Scala 2.12) ----
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
    .getOrCreate()
)

print("Spark iniciado com sucesso — versão:", spark.version)

Spark iniciado com sucesso — versão: 3.4.1


In [15]:
# Cell 3: Create gold database if not exists
spark.sql("""
    CREATE DATABASE IF NOT EXISTS gold
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/gold.db'
""")

print("Base de dados 'gold' pronta.")

Base de dados 'gold' pronta.


In [16]:
# Cell 4: Load silver tables
adaptations_df = spark.table("silver.adaptations")
books_df = spark.table("silver.books")
rating_movies_df = spark.table("silver.rating_movies")

print("Tabelas silver carregadas com sucesso.")

Tabelas silver carregadas com sucesso.


In [17]:
spark.table("silver.books").printSchema()
spark.table("silver.adaptations").printSchema()
spark.table("silver.rating_movies").printSchema()

root
 |-- book_id: integer (nullable = true)
 |-- book_title: string (nullable = true)
 |-- author: string (nullable = true)
 |-- book_rating: double (nullable = true)
 |-- ratings_count: integer (nullable = true)
 |-- text_reviews_count: integer (nullable = true)
 |-- publication_year: integer (nullable = true)
 |-- publisher: string (nullable = true)

root
 |-- author: string (nullable = true)
 |-- author_norm: string (nullable = true)
 |-- book_title: string (nullable = true)
 |-- book_title_norm: string (nullable = true)
 |-- movie_title: string (nullable = true)
 |-- movie_title_norm: string (nullable = true)
 |-- movie_year: integer (nullable = true)

root
 |-- movie_title: string (nullable = true)
 |-- writers: string (nullable = true)
 |-- release_year: integer (nullable = true)
 |-- rating: string (nullable = true)
 |-- tomatometer_status: string (nullable = true)
 |-- tomatometer_rating: integer (nullable = true)
 |-- tomatometer_count: integer (nullable = true)
 |-- audience

In [18]:
# Cell 5: Join adaptations with books
# Join on book_title_norm (adaptations) == book_title (books) and author_norm (adaptations) == author (books)
from pyspark.sql.functions import lower, trim, regexp_replace
from pyspark.sql.functions import col, lower, trim, regexp_replace
def norm(col):
    return regexp_replace(lower(trim(col)), r"[^a-z0-9 ]", "")


books_df = spark.table("silver.books")
adaptations_df = spark.table("silver.adaptations")

joined_books_adaptations = (
    spark.table("silver.adaptations").alias("a")
    .join(
        spark.table("silver.books").alias("b"),
        (norm(col("a.book_title")) == norm(col("b.book_title"))) &
        (norm(col("a.author")) == norm(col("b.author"))),
        "left"
    )
    .select(
        col("a.author"),
        col("a.book_title"),
        col("a.movie_title"),
        col("a.movie_year"),
        col("b.book_rating"),
        col("b.ratings_count"),
        col("b.text_reviews_count"),
        col("b.publication_year"),
        col("b.publisher")
    )
)

print("Join entre adaptations e books concluído.")


Join entre adaptations e books concluído.


In [19]:
from pyspark.sql.functions import col, lower, trim, regexp_replace

def norm(c):
    return regexp_replace(lower(trim(c)), r"[^a-z0-9 ]", "")

gold_adaptacoes_filmes_df = (
    joined_books_adaptations.alias("j")
    .join(
        spark.table("silver.rating_movies").alias("r"),
        (norm(col("j.movie_title")) == norm(col("r.movie_title"))) &
        (col("j.movie_year") == col("r.release_year")),
        "left"   
    )
    .select(
        col("j.author"),
        col("j.book_title"),
        col("j.movie_title"),
        col("j.movie_year"),
        col("j.book_rating"),
        col("j.ratings_count"),
        col("j.text_reviews_count"),
        col("j.publication_year"),
        col("j.publisher"),
        col("r.writers"),
        col("r.rating"),
        col("r.tomatometer_status"),
        col("r.tomatometer_rating"),
        col("r.tomatometer_count"),
        col("r.audience_rating"),
        col("r.audience_count"),
        col("r.critics_consensus"),
        col("r.cast")
    )
)

print("Join completo para gold_adaptacoes_filmes concluído.")


Join completo para gold_adaptacoes_filmes concluído.


In [20]:
# Cell 7: Write to gold.adaptacoes_filmes as Delta table
(
    gold_adaptacoes_filmes_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", True)
    .option("path", "hdfs://hdfs-nn:9000/warehouse/gold.db/adaptacoes_filmes")
    .saveAsTable("gold.adaptacoes_filmes")
)

print("Tabela gold.adaptacoes_filmes gravada com sucesso!")

Tabela gold.adaptacoes_filmes gravada com sucesso!


In [21]:
# Cell 8: Verify the gold table
spark.sql("""
    SELECT * FROM gold.adaptacoes_filmes LIMIT 5
""").show(truncate=False)

+-----------------+-------------------------+-----------------------------+----------+-----------+-------------+------------------+----------------+---------+----------+------+------------------+------------------+-----------------+---------------+--------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [22]:
gold_adaptacoes_filmes_df.printSchema()
gold_adaptacoes_filmes_df.show(5, truncate=False)


root
 |-- author: string (nullable = true)
 |-- book_title: string (nullable = true)
 |-- movie_title: string (nullable = true)
 |-- movie_year: integer (nullable = true)
 |-- book_rating: double (nullable = true)
 |-- ratings_count: integer (nullable = true)
 |-- text_reviews_count: integer (nullable = true)
 |-- publication_year: integer (nullable = true)
 |-- publisher: string (nullable = true)
 |-- writers: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- tomatometer_status: string (nullable = true)
 |-- tomatometer_rating: integer (nullable = true)
 |-- tomatometer_count: integer (nullable = true)
 |-- audience_rating: integer (nullable = true)
 |-- audience_count: integer (nullable = true)
 |-- critics_consensus: string (nullable = true)
 |-- cast: string (nullable = true)

+-----------------+-------------------------+-----------------------------+----------+-----------+-------------+------------------+----------------+---------+----------+------+--------------

In [32]:
# Q6 As obras literárias com melhores avaliações no Goodreads correspondem a filmes com classificações mais altas no Rotten Tomatoes?
spark.sql("""
SELECT
  CASE
    WHEN book_rating >= 4 THEN 'Livros muito bem avaliados (>=4)'
    WHEN book_rating >= 3 THEN 'Livros bem avaliados (3–3.9)'
    ELSE 'Livros menos avaliados (<3)'
  END AS faixa_goodreads,
  COUNT(DISTINCT movie_title) AS n_filmes,
  AVG(tomatometer_rating) AS avg_tomatometer_rating
FROM gold.adaptacoes_filmes
WHERE book_rating IS NOT NULL
  AND tomatometer_rating IS NOT NULL
GROUP BY
  CASE
    WHEN book_rating >= 4 THEN 'Livros muito bem avaliados (>=4)'
    WHEN book_rating >= 3 THEN 'Livros bem avaliados (3–3.9)'
    ELSE 'Livros menos avaliados (<3)'
  END
ORDER BY faixa_goodreads
""").show()


+--------------------+--------+----------------------+
|     faixa_goodreads|n_filmes|avg_tomatometer_rating|
+--------------------+--------+----------------------+
|Livros bem avalia...|       1|                  25.0|
|Livros muito bem ...|       3|                  90.5|
+--------------------+--------+----------------------+



In [33]:
spark.sql("SELECT COUNT(*) FROM gold.adaptacoes_filmes").show()


+--------+
|count(1)|
+--------+
|     166|
+--------+



In [12]:
# Cell 9: Stop Spark session
spark.stop()

In [23]:
from pyspark.sql.functions import concat_ws, col
from pyspark.sql.types import ArrayType

# Carregar a tabela Gold
df = spark.table("gold.adaptacoes_filmes")

# Identificar automaticamente colunas do tipo array
array_cols = [
    f.name for f in df.schema.fields
    if isinstance(f.dataType, ArrayType)
]

print("Colunas array encontradas:", array_cols)

# Converter todas as colunas array para string (CSV-safe)
df_export = df
for c in array_cols:
    df_export = df_export.withColumn(c, concat_ws(", ", col(c)))

# Exportar para CSV (1 ficheiro)
df_export.coalesce(1) \
    .write \
    .mode("overwrite") \
    .option("header", "true") \
    .csv("/tmp/adaptacoes_filmes_csv")


Colunas array encontradas: []


In [24]:
!ls /tmp/adaptacoes_filmes_csv

part-00000-0fa851e8-5c8f-4a54-b98c-4e96c472e06e-c000.csv  _SUCCESS


In [25]:
!cp /tmp/adaptacoes_filmes_csv/part-00000-0fa851e8-5c8f-4a54-b98c-4e96c472e06e-c000.csv /home/jovyan/adaptacoes_filmes.csv

In [26]:
!ls -lah /home/jovyan/adaptacoes_filmes.csv

-rw-r--r-- 1 jovyan users 49K Dec 29 18:47 /home/jovyan/adaptacoes_filmes.csv
