In [1]:

import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType, DoubleType, StringType, ArrayType
from pyspark.sql.window import Window

In [2]:
from delta import *
from pyspark.sql import SparkSession

warehouse_location = "hdfs://hdfs-nn:9000/warehouse"

builder = (
    SparkSession.builder
    .appName("Projeto - Silver TV Shows")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()

print(" Sessão Spark iniciada.")


 Sessão Spark iniciada.


In [3]:
bronze_tv_path = "hdfs://hdfs-nn:9000/datasets/bronze/Popular_Tv_shows.csv"  

from pyspark.sql.types import (
    LongType, StringType, StructField, StructType, 
    BooleanType, ArrayType, IntegerType, DoubleType
)
tvSchema = StructType([
    StructField("_c0", IntegerType(), True),
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("first_air_date", StringType(), True),
    StructField("origin_country", StringType(), True),   
    StructField("original_language", StringType(), True),
    StructField("overview", StringType(), True),
    StructField("popularity", StringType(), True),
    StructField("vote_average", StringType(), True),
    StructField("vote_count", StringType(), True),
])
df = (
    spark.read
        .option("header", True)
        .option("multiLine", True)
        .option("quote", "\"")
        .option("escape", "\"")
        .csv("hdfs://hdfs-nn:9000/datasets/bronze/Popular_Tv_shows.csv")
)


df.printSchema()
df.show(5, truncate=False)
df.toPandas()

root
 |-- _c0: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- first_air_date: string (nullable = true)
 |-- origin_country: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- vote_average: string (nullable = true)
 |-- vote_count: string (nullable = true)

+---+------+--------------+--------------+--------------+-----------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+------------+----------+
|_c0|id    |name          |first_air_date|origin_country|original_language|overview       

Unnamed: 0,_c0,id,name,first_air_date,origin_country,original_language,overview,popularity,vote_average,vote_count
0,0,119051,Wednesday,2022-11-23,['US'],en,"Wednesday Addams is sent to Nevermore Academy,...",3430.916,8.7,4629
1,1,36361,Ulice,2005-09-05,['CZ'],cs,Ulice is a Czech soap opera produced and broad...,2763.759,2.1,8
2,2,115646,Lisa,2021-01-25,['BE'],nl,,1872.799,6.5,36
3,3,204095,Mar do Sertão,2022-08-22,['BR'],pt,,1524.384,4.6,19
4,4,197189,Cara e Coragem,2022-05-30,['BR'],pt,,1499.511,5.8,42
...,...,...,...,...,...,...,...,...,...,...
9975,9975,122530,Panic,2021-05-27,['US'],en,"In the forgotten town of Carp, Texas, Panic is...",7.433,7.3,201
9976,9976,121860,Kakegurui Twin,2021-03-26,['JP'],ja,It's a year before Yumeko transferred to Hyakk...,6.641,8.3,7
9977,9977,3278,Numberjacks,2006-10-16,['GB'],en,Numberjacks is a children's British television...,7.775,6.0,4
9978,9978,41336,Schnell ermittelt,2009-04-21,['AT'],de,,10.748,7.0,2


In [5]:
from pyspark.sql.functions import col

# 1) Remover colunas que não interessam às questões analíticas
df_tv = df.drop(
    "_c0",
    "overview",
    "original_language",
    "origin_country"
    
)

In [6]:
# 2) Converter tipos numéricos
df_tv = (
    df_tv
        .withColumn("popularity", col("popularity").cast("double"))
        .withColumn("vote_average", col("vote_average").cast("double"))
        .withColumn("vote_count", col("vote_count").cast("int"))
)


In [7]:
# 3)  renomear name → series_name
df_tv = df_tv.withColumnRenamed("name", "series_name")


In [8]:
# 4) Remover duplicados por nome de série
df_tv = df_tv.dropDuplicates(["series_name"])

In [9]:

df_tv.printSchema()
df_tv.show(10, truncate=False)

root
 |-- id: string (nullable = true)
 |-- series_name: string (nullable = true)
 |-- first_air_date: string (nullable = true)
 |-- popularity: double (nullable = true)
 |-- vote_average: double (nullable = true)
 |-- vote_count: integer (nullable = true)

+------+--------------------+--------------+----------+------------+----------+
|id    |series_name         |first_air_date|popularity|vote_average|vote_count|
+------+--------------------+--------------+----------+------------+----------+
|96246 |#LikeMe             |2019-01-13    |10.066    |9.7         |3         |
|2211  |'Allo 'Allo!        |1984-09-07    |28.808    |7.8         |162       |
|1568  |'Til Death          |2006-09-07    |9.945     |6.6         |44        |
|203417|'Til Jail Do Us Part|2022-09-15    |116.136   |7.6         |5         |
|8864  |.hack               |2002-04-04    |16.699    |6.8         |21        |
|21855 |07-Ghost            |2009-04-07    |6.863     |6.7         |15        |
|98736 |1 For All     

In [10]:
spark.sql("""
CREATE DATABASE IF NOT EXISTS silver
LOCATION 'hdfs://hdfs-nn:9000/warehouse/silver.db/'
""")

spark.sql("DROP TABLE IF EXISTS silver.tv_shows")

(df_tv.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("silver.tv_shows")
)

print("Tabela silver.tv_shows gravada.")


Tabela silver.tv_shows gravada.


In [11]:
spark.sql("SHOW TABLES IN silver").show(truncate=False)

spark.sql("""
  SELECT series_name, COUNT(*) AS n
  FROM silver.tv_shows
  GROUP BY series_name
  HAVING n > 1
  ORDER BY n DESC
""").show(20, truncate=False)

spark.sql("""
  SELECT
    COUNT(*) AS total_series,
    MIN(popularity) AS min_popularity,
    MAX(popularity) AS max_popularity,
    AVG(popularity) AS avg_popularity,
    MIN(vote_average) AS min_vote_average,
    MAX(vote_average) AS max_vote_average,
    AVG(vote_average) AS avg_vote_average,
    MIN(vote_count) AS min_vote_count,
    MAX(vote_count) AS max_vote_count,
    AVG(vote_count) AS avg_vote_count
  FROM silver.tv_shows
""").show(truncate=False)


+---------+-----------------+-----------+
|namespace|tableName        |isTemporary|
+---------+-----------------+-----------+
|silver   |adaptations      |false      |
|silver   |adaptations_films|false      |
|silver   |books            |false      |
|silver   |films_awards     |false      |
|silver   |full_data        |false      |
|silver   |the_oscar_award  |false      |
|silver   |tv_shows         |false      |
+---------+-----------------+-----------+

+-----------+---+
|series_name|n  |
+-----------+---+
+-----------+---+

+------------+--------------+--------------+------------------+----------------+----------------+-----------------+--------------+--------------+------------------+
|total_series|min_popularity|max_popularity|avg_popularity    |min_vote_average|max_vote_average|avg_vote_average |min_vote_count|max_vote_count|avg_vote_count    |
+------------+--------------+--------------+------------------+----------------+----------------+-----------------+--------------+----

In [12]:
spark.table("silver.tv_shows").printSchema()


root
 |-- id: string (nullable = true)
 |-- series_name: string (nullable = true)
 |-- first_air_date: string (nullable = true)
 |-- popularity: double (nullable = true)
 |-- vote_average: double (nullable = true)
 |-- vote_count: integer (nullable = true)



In [13]:
spark.sql("""
SELECT MIN(popularity), MAX(popularity),
       MIN(vote_average), MAX(vote_average),
       MIN(vote_count), MAX(vote_count)
FROM silver.tv_shows
""").show()

+---------------+---------------+-----------------+-----------------+---------------+---------------+
|min(popularity)|max(popularity)|min(vote_average)|max(vote_average)|min(vote_count)|max(vote_count)|
+---------------+---------------+-----------------+-----------------+---------------+---------------+
|          4.613|       3430.916|              0.0|             10.0|              0|          20156|
+---------------+---------------+-----------------+-----------------+---------------+---------------+



In [14]:
spark.stop()