In [1]:
!pip install delta-spark==2.4.0



In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType, StructField,
    StringType, IntegerType, DoubleType
)
from pyspark.sql.functions import (
    col, regexp_extract, lower, trim, regexp_replace,
    udf, min, max, avg, count, countDistinct, desc
)
import unicodedata
from pyspark.sql.functions import when

In [3]:
# Spark session: Hive metastore catalog plus the Delta Lake extension.
warehouse_location = "hdfs://hdfs-nn:9000/warehouse"

# Settings are applied to the builder in exactly this order.
session_settings = [
    ("spark.sql.warehouse.dir", warehouse_location),
    ("hive.metastore.uris", "thrift://hive-metastore:9083"),
    ("spark.sql.catalogImplementation", "hive"),
    ("hive.metastore.warehouse.dir", warehouse_location),
    ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
    ("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"),
    ("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0"),
]

builder = SparkSession.builder.appName("Silver_goodreads_books")
for setting_key, setting_value in session_settings:
    builder = builder.config(setting_key, setting_value)

spark = builder.enableHiveSupport().getOrCreate()

In [4]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Explicit schema for the bronze CSV: (column name, Spark type) pairs.
# Every field is declared nullable.
bronze_column_types = [
    ("bookID", IntegerType()),
    ("title", StringType()),
    ("authors", StringType()),
    ("average_rating", DoubleType()),
    ("isbn", StringType()),
    ("isbn13", StringType()),
    ("language_code", StringType()),
    ("num_pages", IntegerType()),
    ("ratings_count", IntegerType()),
    ("text_reviews_count", IntegerType()),
    ("publication_date", StringType()),
    ("publisher", StringType()),
]

schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in bronze_column_types
])

bronze_path = "hdfs://hdfs-nn:9000/datasets/bronze/books.csv"

# Read the CSV with a header row, applying the schema above instead of
# letting Spark infer types.
bronze_reader = spark.read.schema(schema).option("header", True)
books_bronze = bronze_reader.csv(bronze_path)

print("Bronze Goodreads-books corrigido:")
books_bronze.show(5, truncate=False)

Bronze Goodreads-books corrigido:
+------+------------------------------------------------------------+--------------------------+--------------+----------+-------------+-------------+---------+-------------+------------------+----------------+---------------+
|bookID|title                                                       |authors                   |average_rating|isbn      |isbn13       |language_code|num_pages|ratings_count|text_reviews_count|publication_date|publisher      |
+------+------------------------------------------------------------+--------------------------+--------------+----------+-------------+-------------+---------+-------------+------------------+----------------+---------------+
|1     |Harry Potter and the Half-Blood Prince (Harry Potter  #6)   |J.K. Rowling/Mary GrandPré|4.57          |0439785960|9780439785969|eng          |652      |2095690      |27591             |9/16/2006       |Scholastic Inc.|
|2     |Harry Potter and the Order of the Phoenix (Harry P

In [5]:
def remove_accents(text):
    """Strip combining marks (accents) from a string.

    The input is decomposed with Unicode NFKD normalization and every
    combining character is then dropped.

    Args:
        text: The string to process, or None.

    Returns:
        The string without combining characters, or None when ``text`` is None.
    """
    if text is not None:
        decomposed = unicodedata.normalize('NFKD', text)
        # unicodedata.combining() returns 0 for non-combining characters.
        base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
        return ''.join(base_chars)
    return None

# Wrap remove_accents as a Spark UDF that returns a string column.
remove_accents_udf = udf(remove_accents, returnType=StringType())

In [6]:
from pyspark.sql.functions import (
    col, regexp_extract, lower, trim, regexp_replace,
    when
)
from pyspark.sql.types import IntegerType, DoubleType

# Build the silver books table from the bronze DataFrame in one pass.

# Accepted range for an extracted publication year; values outside it are
# stored as NULL.
MIN_PUBLICATION_YEAR = 1000
MAX_PUBLICATION_YEAR = 2025

# First run of four digits found in publication_date, cast to int.
# Defined once so the range check and the stored value share one expression.
extracted_year = regexp_extract(col("publication_date"), r"(\d{4})", 1).cast("int")

books_silver = (
    books_bronze

    # Drop rows missing essential fields. bookID is included because it
    # becomes the book_id key of the silver table and must not be NULL.
    .na.drop(subset=["bookID", "title", "authors", "average_rating"])

    # Remove duplicates by bookID
    .dropDuplicates(["bookID"])

    # Cast numeric columns (in case they still arrive as strings)
    .withColumn("book_rating",        col("average_rating").cast(DoubleType()))
    .withColumn("ratings_count",      col("ratings_count").cast(IntegerType()))
    .withColumn("text_reviews_count", col("text_reviews_count").cast(IntegerType()))

    # Keep the year only when it falls inside the accepted range.
    # when() without otherwise() yields NULL for rows that do not match.
    .withColumn(
        "publication_year",
        when(
            extracted_year.between(MIN_PUBLICATION_YEAR, MAX_PUBLICATION_YEAR),
            extracted_year,
        ),
    )

    # Normalize the title: lowercase + trim, strip accents, then remove every
    # character that is not a lowercase letter, a digit or a space.
    .withColumn("book_title_clean",  lower(trim(col("title"))))
    .withColumn("book_title_noacc",  remove_accents_udf(col("book_title_clean")))
    .withColumn("book_title_norm",   regexp_replace(col("book_title_noacc"), r"[^a-z0-9 ]", ""))

    # Select the final columns (note: isbn is no longer included here)
    .select(
        col("bookID").cast(IntegerType()).alias("book_id"),
        col("book_title_norm").alias("book_title"),
        col("authors").alias("author"),
        "book_rating",
        "ratings_count",
        "text_reviews_count",
        "publication_year",
        "publisher",
    )
)

print("Pré-visualização tabela Silver (books):")
books_silver.show(10, truncate=False)

Pré-visualização tabela Silver (books):
+-------+-----------------------------------------------------------------------------------------------------+--------------------------+-----------+-------------+------------------+----------------+---------------+
|book_id|book_title                                                                                           |author                    |book_rating|ratings_count|text_reviews_count|publication_year|publisher      |
+-------+-----------------------------------------------------------------------------------------------------+--------------------------+-----------+-------------+------------------+----------------+---------------+
|1      |harry potter and the halfblood prince harry potter  6                                                |J.K. Rowling/Mary GrandPré|4.57       |2095690      |27591             |2006            |Scholastic Inc.|
|2      |harry potter and the order of the phoenix harry potter  5                          

In [7]:
# Make sure the silver database exists at its HDFS location.
create_silver_db_sql = """
CREATE DATABASE IF NOT EXISTS silver
LOCATION 'hdfs://hdfs-nn:9000/warehouse/silver.db'
"""
spark.sql(create_silver_db_sql)

# Write the silver table in Delta format, overwriting both the data and the
# schema, at an explicit path and registered as silver.books.
silver_books_writer = books_silver.write.format("delta").mode("overwrite")
silver_books_writer = silver_books_writer.option("overwriteSchema", True)
silver_books_writer = silver_books_writer.option(
    "path", "hdfs://hdfs-nn:9000/warehouse/silver.db/books"
)
silver_books_writer.saveAsTable("silver.books")

print("Tabela silver.books gravada com sucesso!")

Tabela silver.books gravada com sucesso!


In [8]:
from pyspark.sql.functions import col, count, countDistinct, min, max, avg

df = spark.table("silver.books")
available_columns = df.columns

# --- Numeric columns (matching the final select of the silver table) ---
numeric_cols = ["book_rating", "ratings_count", "text_reviews_count", "publication_year"]
numeric_present = [name for name in numeric_cols if name in available_columns]

if not numeric_present:
    print("Nenhuma das colunas numéricas está presente:", numeric_cols)
    print("Colunas disponíveis no df:", df.columns)
else:
    # min / max / avg / count are the pyspark.sql.functions aggregates imported
    # by this notebook. Output order: all *_min, then *_max, *_avg, *_count.
    numeric_aggregates = [
        aggregate(col(name)).alias(f"{name}_{suffix}")
        for aggregate, suffix in ((min, "min"), (max, "max"), (avg, "avg"), (count, "count"))
        for name in numeric_present
    ]
    numeric_summary = df.select(*numeric_aggregates)
    print("Resumo das colunas numéricas (usadas):", numeric_present)
    numeric_summary.show(truncate=False)

# --- Categorical columns (adjusted to the final schema) ---
categorical_cols = ["book_title", "author", "book_id", "publisher"]
categorical_present = [name for name in categorical_cols if name in available_columns]

if not categorical_present:
    print("Nenhuma das colunas categóricas está presente:", categorical_cols)
    print("Colunas disponíveis no df:", df.columns)
else:
    # Output order: all *_count columns, then all *_distinct columns.
    categorical_aggregates = [
        aggregate(col(name)).alias(f"{name}_{suffix}")
        for aggregate, suffix in ((count, "count"), (countDistinct, "distinct"))
        for name in categorical_present
    ]
    cat_summary = df.select(*categorical_aggregates)
    print("Resumo das colunas categóricas (usadas):", categorical_present)
    cat_summary.show(truncate=False)


Resumo das colunas numéricas (usadas): ['book_rating', 'ratings_count', 'text_reviews_count', 'publication_year']
+---------------+-----------------+----------------------+--------------------+---------------+-----------------+----------------------+--------------------+-----------------+------------------+----------------------+--------------------+-----------------+-------------------+------------------------+----------------------+
|book_rating_min|ratings_count_min|text_reviews_count_min|publication_year_min|book_rating_max|ratings_count_max|text_reviews_count_max|publication_year_max|book_rating_avg  |ratings_count_avg |text_reviews_count_avg|publication_year_avg|book_rating_count|ratings_count_count|text_reviews_count_count|publication_year_count|
+---------------+-----------------+----------------------+--------------------+---------------+-----------------+----------------------+--------------------+-----------------+------------------+----------------------+-------------------

In [9]:
# --- Top 5 most frequent values per categorical column ---
# Iterates categorical_present (the columns confirmed to exist in df) rather
# than categorical_cols, so a column absent from the table is skipped instead
# of making groupBy fail.
print("===== Top 5 valores mais frequentes por coluna categórica =====")
for c in categorical_present:
    print(f"--- Coluna: {c} ---")
    df.groupBy(c).count().orderBy(desc("count")).show(5, truncate=False)

===== Top 5 valores mais frequentes por coluna categórica =====
--- Coluna: book_title ---
+----------------------+-----+
|book_title            |count|
+----------------------+-----+
|the brothers karamazov|9    |
|the iliad             |9    |
|anna karenina         |8    |
|the odyssey           |8    |
|gullivers travels     |8    |
+----------------------+-----+
only showing top 5 rows

--- Coluna: author ---
+----------------+-----+
|author          |count|
+----------------+-----+
|Stephen King    |40   |
|P.G. Wodehouse  |40   |
|Rumiko Takahashi|39   |
|Orson Scott Card|35   |
|Agatha Christie |33   |
+----------------+-----+
only showing top 5 rows

--- Coluna: book_id ---
+-------+-----+
|book_id|count|
+-------+-----+
|463    |1    |
|1342   |1    |
|1591   |1    |
|1645   |1    |
|1959   |1    |
+-------+-----+
only showing top 5 rows

--- Coluna: publisher ---
+----------------+-----+
|publisher       |count|
+----------------+-----+
|Vintage         |318  |
|Penguin Book

In [10]:
# Preview the stored silver table through Spark SQL.
preview_query = """
    SElECT * from silver.books
          """
spark.sql(preview_query).show()

+-------+--------------------+--------------------+-----------+-------------+------------------+----------------+--------------------+
|book_id|          book_title|              author|book_rating|ratings_count|text_reviews_count|publication_year|           publisher|
+-------+--------------------+--------------------+-----------+-------------+------------------+----------------+--------------------+
|      1|harry potter and ...|J.K. Rowling/Mary...|       4.57|      2095690|             27591|            2006|     Scholastic Inc.|
|      2|harry potter and ...|J.K. Rowling/Mary...|       4.49|      2153167|             29221|            2004|     Scholastic Inc.|
|      4|harry potter and ...|        J.K. Rowling|       4.42|         6333|               244|            2003|          Scholastic|
|      5|harry potter and ...|J.K. Rowling/Mary...|       4.56|      2339585|             36325|            2004|     Scholastic Inc.|
|      8|harry potter boxe...|J.K. Rowling/Mary...|    

In [11]:
# Reload the silver table and print its column names.
df = spark.table("silver.books")
column_names = df.columns
print(column_names)

['book_id', 'book_title', 'author', 'book_rating', 'ratings_count', 'text_reviews_count', 'publication_year', 'publisher']


In [12]:
# Stop the Spark session now that the notebook is done with it.
spark.stop()