In [1]:
!pip install delta-spark==2.4.0



In [1]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

# HDFS directory used as the Spark SQL warehouse (spark.sql.warehouse.dir).
warehouse_location = "hdfs://hdfs-nn:9000/warehouse"

# Delta-enabled session builder with Hive support and an external metastore.
# "spark.jars.packages" is intentionally NOT set here:
# configure_spark_with_delta_pip() sets that key itself (to the delta-core
# artifact matching the installed delta-spark version), so any value placed on
# the builder beforehand is silently overwritten.
builder = (
    SparkSession.builder
    .appName("Fase Gold - desempenho_livros")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .enableHiveSupport()
)

# Adds the Delta Lake JAR coordinates to the builder, then starts (or reuses) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()
print("Sessão Spark (Gold) iniciada com sucesso.")


Sessão Spark (Gold) iniciada com sucesso.


In [2]:
# Create the `gold` database if it does not exist yet and make it the current
# database for this session. The location is derived from `warehouse_location`
# (set when the Spark session is configured) so the HDFS URI is written in one
# place only; the resulting SQL text is unchanged.
print("A criar/usar a base de dados 'gold'...")
spark.sql(f"CREATE DATABASE IF NOT EXISTS gold LOCATION '{warehouse_location}/gold.db/'")
spark.sql("USE gold")


A criar/usar a base de dados 'gold'...


DataFrame[]

In [4]:
# Load both silver inputs and take a quick look at their columns and first rows.
df_books = spark.table("silver.books")
df_adap = spark.table("silver.adaptations")

# Column lists first, for both tables ...
for caption, frame in (("books colunas:", df_books), ("adaptations colunas:", df_adap)):
    print(caption, frame.columns)

# ... then a five-row, untruncated sample of each.
for caption, frame in (("\nAmostra books:", df_books), ("\nAmostra adaptations:", df_adap)):
    print(caption)
    frame.show(5, truncate=False)


books colunas: ['book_id', 'book_title', 'author', 'book_rating', 'ratings_count', 'text_reviews_count', 'publication_year', 'publisher']
adaptations colunas: ['author', 'book_title', 'movie_title', 'movie_year']

Amostra books:
+-------+---------------------------------------------------------+--------------------------+-----------+-------------+------------------+----------------+---------------+
|book_id|book_title                                               |author                    |book_rating|ratings_count|text_reviews_count|publication_year|publisher      |
+-------+---------------------------------------------------------+--------------------------+-----------+-------------+------------------+----------------+---------------+
|1      |harry potter and the halfblood prince harry potter  6    |J.K. Rowling/Mary GrandPré|4.57       |2095690      |27591             |2006            |Scholastic Inc.|
|2      |harry potter and the order of the phoenix harry potter  5|J.K. Rowling

In [5]:
from pyspark.sql.functions import (
    col, trim, lower, countDistinct,
    min as Fmin, max as Fmax,
    collect_set, size, first
)


# 0) Source DataFrames (silver layer)
df_books, df_adap = map(spark.table, ("silver.books", "silver.adaptations"))

# 1) Join columns: both tables expose the title under the same column name
bk_col = ad_col = "book_title"

print("Coluna usada em books:", bk_col)
print("Coluna usada em adaptations:", ad_col)

# 2) Normalise the join keys and attach table aliases

def _with_join_keys(frame, alias, title_col):
    """Alias `frame` and add trimmed, lower-cased `book_key` / `author_key` columns.

    `book_key` is built from `title_col`; `author_key` is built from the
    `author` column.
    """
    return (
        frame.alias(alias)
        .withColumn("book_key", lower(trim(col(title_col))))
        .withColumn("author_key", lower(trim(col("author"))))
    )


b = _with_join_keys(df_books, "b", bk_col)
a = _with_join_keys(df_adap, "a", ad_col)

# 3) Join (by title + author)
# Left join: every book row is kept; movie columns are null when nothing matches.
# NOTE(review): the preview shows no movie match for the Harry Potter rows, and
# book authors can carry co-author lists (e.g. "J.K. Rowling/Mary GrandPré").
# Exact equality on author_key may therefore miss adaptations that list a
# single author — TODO confirm against silver.adaptations.
join_keys = ["book_key", "author_key"]
df_join = b.join(a, on=join_keys, how="left")

print("Linhas após junção:", df_join.count())

preview_cols = ["b.book_id", "b.book_title", "b.author", "a.movie_title", "a.movie_year"]
df_join.select(*preview_cols).show(10, truncate=False)

# 4) Adaptation aggregations (applied per book in the next step)
movie_title = col("a.movie_title")
movie_year = col("a.movie_year")
distinct_titles = collect_set(movie_title)

aggs = [
    countDistinct(movie_title).alias("num_adaptacoes"),   # distinct movie titles
    size(distinct_titles).alias("num_filmes_distintos"),  # size of the distinct-title set
    distinct_titles.alias("filmes"),                      # the distinct titles themselves
    Fmin(movie_year).alias("primeiro_ano_filme"),         # earliest movie year
    Fmax(movie_year).alias("ultimo_ano_filme"),           # latest movie year
]

# 5) GOLD: desempenho_livros
# One row per (book_key, book_id): the book's descriptive columns are carried
# through with first(..., ignorenulls=True), followed by the adaptation metrics.
book_attrs = [
    "book_title",
    "author",
    "publisher",
    "publication_year",
    "book_rating",
    "ratings_count",
    "text_reviews_count",
]
book_aggs = [
    first(col(f"b.{attr}"), ignorenulls=True).alias(attr)
    for attr in book_attrs
]

grouped_books = df_join.groupBy("book_key", col("b.book_id").alias("book_id"))
df_desempenho_livros = grouped_books.agg(*book_aggs, *aggs)

print("Preview df_desempenho_livros:")
df_desempenho_livros.show(20, truncate=False)


Coluna usada em books: book_title
Coluna usada em adaptations: book_title
Linhas após junção: 11124
+-------+-----------------------------------------------------------------------------------------------------+--------------------------+-----------+----------+
|book_id|book_title                                                                                           |author                    |movie_title|movie_year|
+-------+-----------------------------------------------------------------------------------------------------+--------------------------+-----------+----------+
|1      |harry potter and the halfblood prince harry potter  6                                                |J.K. Rowling/Mary GrandPré|null       |null      |
|2      |harry potter and the order of the phoenix harry potter  5                                            |J.K. Rowling/Mary GrandPré|null       |null      |
|4      |harry potter and the chamber of secrets harry potter  2                          

In [6]:
# Persist the aggregated result as a Delta table, replacing any previous contents.
gold_writer = df_desempenho_livros.write.format("delta").mode("overwrite")
gold_writer.saveAsTable("gold.desempenho_livros")

print("Tabela 'gold.desempenho_livros' gravada com sucesso!")


Tabela 'gold.desempenho_livros' gravada com sucesso!


In [8]:
from pyspark.sql.functions import concat_ws, col

# Export the gold table as a single CSV file with a header row.
df = spark.table("gold.desempenho_livros")

# `filmes` is an array column, which the CSV writer cannot serialise directly,
# so it is flattened into one " | "-separated string.
df_export = df.withColumn("filmes", concat_ws(" | ", col("filmes")))

# coalesce(1) yields a single part file. The explicit file:// scheme pins the
# output to the local filesystem, where the following shell cells read it,
# instead of relying on whatever fs.defaultFS happens to be configured.
(
    df_export.coalesce(1)
    .write
    .mode("overwrite")
    .option("header", "true")
    .csv("file:///tmp/desempenho_livros_csv")
)


In [11]:
# Inspect the column names and types of the persisted gold table.
gold_desempenho = spark.table("gold.desempenho_livros")
gold_desempenho.printSchema()


root
 |-- book_key: string (nullable = true)
 |-- book_id: integer (nullable = true)
 |-- book_title: string (nullable = true)
 |-- author: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- publication_year: integer (nullable = true)
 |-- book_rating: double (nullable = true)
 |-- ratings_count: integer (nullable = true)
 |-- text_reviews_count: integer (nullable = true)
 |-- num_adaptacoes: long (nullable = true)
 |-- num_filmes_distintos: integer (nullable = true)
 |-- filmes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- primeiro_ano_filme: integer (nullable = true)
 |-- ultimo_ano_filme: integer (nullable = true)



In [9]:
# List the contents of the local directory the CSV export was written to.
!ls /tmp/desempenho_livros_csv


part-00000-0f733d77-2c0d-42d8-ba57-838ae6c39217-c000.csv  _SUCCESS


In [10]:
!cp /tmp/desempenho_livros_csv/part-00000-0f733d77-2c0d-42d8-ba57-838ae6c39217-c000.csv /home/jovyan/desempenho_livros.csv
!ls -lah /home/jovyan/desempenho_livros.csv


-rw-r--r-- 1 jovyan users 1.6M Dec 28 12:53 /home/jovyan/desempenho_livros.csv
