## EDA - Exploratory Data Analysis

In [2]:
# Sanity check: confirm the datalake paths this notebook reads from exist
import os

print("Verificando diretórios:")

# (label to print, path to probe) pairs, reported in this order
path_checks = (
    ("/app existe:", "/app"),
    ("/app/datalake existe:", "/app/datalake"),
    ("/app/datalake/silver existe:", "/app/datalake/silver"),
    ("/app/datalake/silver/treino existe:", "/app/datalake/silver/treino"),
    ("/app/datalake/silver/itens existe:", "/app/datalake/silver/itens"),
)
for check_label, checked_path in path_checks:
    print(check_label, os.path.exists(checked_path))

# List the silver layer only when it is present, so a missing directory does not raise
silver_dir = "/app/datalake/silver"
if os.path.exists(silver_dir):
    print("\nConteúdo de /app/datalake/silver:")
    print(os.listdir(silver_dir))

Verificando diretórios:
/app existe: True
/app/datalake existe: True
/app/datalake/silver existe: True
/app/datalake/silver/treino existe: True
/app/datalake/silver/itens existe: True

Conteúdo de /app/datalake/silver:
['itens', 'treino']


In [2]:
# Create (or reuse) the Spark session shared by the cells below
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Session settings, applied to the builder in the order listed here.
spark_settings = {
    # memory and cores
    "spark.driver.memory": "8g",
    "spark.executor.memory": "8g",
    "spark.executor.cores": "4",
    # partitioning
    "spark.default.parallelism": "12",
    "spark.sql.shuffle.partitions": "12",
    # memory manager fractions
    "spark.memory.fraction": "0.8",
    "spark.memory.storageFraction": "0.3",
    "spark.sql.files.maxPartitionBytes": "128m",
    # adaptive query execution
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.adaptive.coalescePartitions.enabled": "true",
    "spark.sql.adaptive.skewJoin.enabled": "true",
    # executor allocation, shuffle service and scheduling
    "spark.dynamicAllocation.enabled": "true",
    "spark.shuffle.service.enabled": "true",
    "spark.locality.wait": "0s",
}

session_builder = SparkSession.builder.appName("Silver Data Exploration")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()



In [3]:
# Load the silver "treino" parquet data from the mapped volume path.
# Schema merging is switched off and the result is coalesced to 2 partitions.
treino = (
    spark.read
    .option("mergeSchema", "false")
    .parquet("/app/datalake/silver/treino")
    .coalesce(2)
)

In [4]:
# Load the silver "itens" parquet data from the mapped volume path.
# Schema merging is switched off and the result is coalesced to 2 partitions.
itens = (
    spark.read
    .option("mergeSchema", "false")
    .parquet("/app/datalake/silver/itens")
    .coalesce(2)
)

In [5]:
# Expose both DataFrames to Spark SQL under fixed temporary view names
for view_name, view_frame in (("tab_treino", treino), ("tab_itens", itens)):
    view_frame.createOrReplaceTempView(view_name)


In [6]:

# Row counts first (each count() triggers a Spark job) ...
for count_label, counted_frame in (
    ("Número de registros em treino:", treino),
    ("Número de registros em itens:", itens),
):
    print(count_label, counted_frame.count())

# ... then the schema of each dataset, in the same order
for schema_title, described_frame in (
    ("\nSchema dos dados de treino:", treino),
    ("\nSchema dos dados de itens:", itens),
):
    print(schema_title)
    described_frame.printSchema()

Número de registros em treino: 8123951
Número de registros em itens: 255603

Schema dos dados de treino:
root
 |-- userId: string (nullable = true)
 |-- userType: string (nullable = true)
 |-- history: string (nullable = true)
 |-- timestampHistory: string (nullable = true)
 |-- numberOfClicksHistory: integer (nullable = true)
 |-- timeOnPageHistory: integer (nullable = true)
 |-- scrollPercentageHistory: float (nullable = true)
 |-- pageVisitsCountHistory: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)


Schema dos dados de itens:
root
 |-- page: string (nullable = true)
 |-- issued: timestamp (nullable = true)
 |-- modified: timestamp (nullable = true)
 |-- title: string (nullable = true)
 |-- body: string (nullable = true)
 |-- caption: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)



In [7]:
# Earliest and latest timestampHistory values in the tab_treino view.
# The query text is kept in its own variable so it can be inspected separately.
timestamp_range_query = """
    SELECT min(timestampHistory) as min_timestamp, 
           max(timestampHistory) as max_timestamp
    FROM tab_treino
"""
results = spark.sql(timestamp_range_query)

# truncate=False prints the full column values instead of shortening them
results.show(truncate=False)

+-------------------+-------------------+
|min_timestamp      |max_timestamp      |
+-------------------+-------------------+
|2022-07-01 03:00:00|2022-08-15 02:59:47|
+-------------------+-------------------+



In [8]:
# Row count per userType value
print("\nDistribuição de userType:")
user_type_counts = treino.groupBy("userType").count()
user_type_counts.show()

# Summary statistics (count, mean, stddev, min, max) for timeOnPageHistory
print("\nEstatísticas de timeOnPageHistory:")
time_on_page_stats = treino.describe("timeOnPageHistory")
time_on_page_stats.show()


Distribuição de userType:
+----------+-------+
|  userType|  count|
+----------+-------+
|    Logged|3676649|
|Non-Logged|4447302|
+----------+-------+


Estatísticas de timeOnPageHistory:
+-------+-----------------+
|summary|timeOnPageHistory|
+-------+-----------------+
|  count|          8123951|
|   mean|88768.68892008334|
| stddev|113932.1803580999|
|    min|             5000|
|    max|         46033049|
+-------+-----------------+

