In [1]:
import pandas as pd
import numpy as np
from pyspark.sql import functions as F


In [2]:
# Start Spark session
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.builder \
    .appName("Silver Data Exploration") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "8g") \
    .config("spark.executor.cores", "4") \
    .config("spark.default.parallelism", "12") \
    .config("spark.sql.shuffle.partitions", "12") \
    .config("spark.memory.fraction", "0.8") \
    .config("spark.memory.storageFraction", "0.3") \
    .config("spark.sql.files.maxPartitionBytes", "128m") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .config("spark.sql.adaptive.skewJoin.enabled", "true") \
    .config("spark.dynamicAllocation.enabled", "true") \
    .config("spark.shuffle.service.enabled", "true") \
    .config("spark.locality.wait", "0s") \
    .getOrCreate()



In [3]:
treino = spark.read \
    .option("mergeSchema", "false") \
    .parquet("/app/datalake/silver/treino/") \
    .coalesce(2)

treino.show(10,truncate=False)

+------------------------------------+----------------------------------------------------------------+----------+----------------+---------------------+-----------------+-----------------------+----------------------+----+---------+---------------------------+-----------------+----------------------------+------------------+---------------------+-----------------+---------------------+--------------------+----+-----+---+
|history                             |userId                                                          |userType  |timestampHistory|numberOfClicksHistory|timeOnPageHistory|scrollPercentageHistory|pageVisitsCountHistory|hour|dayofweek|time_since_last_interaction|first_interaction|time_since_first_interaction|interaction_score |recency_weight       |avg_time_on_page |time_weight          |adjusted_score      |year|month|day|
+------------------------------------+----------------------------------------------------------------+----------+----------------+-----------------

In [None]:
treino.describe().show()


In [4]:
treino_normalized = spark.read \
    .option("mergeSchema", "false") \
    .parquet("/app/datalake/silver/treino_normalized/") \
    .coalesce(2)

treino_normalized.show(10,truncate=False)

+------------------------------------+----------------------------------------------------------------+----------------+---------------------+------------------+-----------------------+----------------------+-------------------+------------------+---------------------------+---------------------+----------------------------+------------------+---------------------+------------------+--------------------+--------------------+--------+----+-----+---+
|history                             |userId                                                          |timestampHistory|numberOfClicksHistory|timeOnPageHistory |scrollPercentageHistory|pageVisitsCountHistory|hour               |dayofweek         |time_since_last_interaction|first_interaction    |time_since_first_interaction|interaction_score |recency_weight       |avg_time_on_page  |time_weight         |adjusted_score      |userType|year|month|day|
+------------------------------------+--------------------------------------------------------

In [5]:
itens = spark.read \
    .option("mergeSchema", "false") \
    .parquet("/app/datalake/silver/itens/") \
    .coalesce(2)

itens.show(10,truncate=False)

+------------------------------------+-------------------+-------------------+------------------------------------------------------------------------------------+--------------------+-------------------+----+-----+---+
|page                                |issued             |modified           |title                                                                               |days_since_published|days_since_modified|year|month|day|
+------------------------------------+-------------------+-------------------+------------------------------------------------------------------------------------+--------------------+-------------------+----+-----+---+
|78698773-9812-446d-be1f-2e70be61e1be|2021-01-01 03:23:31|2021-01-01 03:23:44|Prefeito e vereadores de Tucumã tomam posse; veja lista de eleitos                  |1511                |1511               |2021|1    |1  |
|017964ac-b943-4513-8a8b-6741a38eee00|2021-01-01 04:50:51|2021-01-01 04:50:54|Prefeito e vereadores de São Caetano tomam

In [None]:
from pyspark.sql.functions import min, max

# Supondo que o nome da sua feature seja "sua_feature"
itens.agg(min("days_since_published").alias("min"), max("days_since_published").alias("max")).show()


In [None]:
itens.describe().show()


In [6]:
itens_normalized = spark.read \
    .option("mergeSchema", "false") \
    .parquet("/app/datalake/silver/itens_normalized/year=2022/month=7/day=1") \
    .coalesce(2)

itens_normalized.show(10,truncate=False)

+------------------------------------+-------------------+-------------------+---------------------------------------------------------------------------------------------------------+--------------------+-------------------+
|page                                |issued             |modified           |title                                                                                                    |days_since_published|days_since_modified|
+------------------------------------+-------------------+-------------------+---------------------------------------------------------------------------------------------------------+--------------------+-------------------+
|6b8534ed-d39e-46ed-af1c-d69b414099db|2022-07-01 20:15:35|2022-07-01 22:49:54|É #FAKE que testes de Covid já vêm com resultado positivo e negativo                                     |0.03258522301718055 |0.2547178186567539 |
|71eb8655-9ccc-4b08-911e-fa244834fd54|2022-07-01 21:51:46|2022-07-01 21:51:47|Poupatempo faz mul

In [8]:
# Read data (note the correct path for mapped volume)
itens_embeddings = spark.read \
    .option("mergeSchema", "false") \
    .parquet("/app/datalake/silver/itens_embeddings/year=2022/month=7/day=1") \
    .coalesce(2)

itens_embeddings.show(10,truncate=False)

+------------------------------------+-------------------+-------------------+---------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|page                                |issued             |modified           |title                                                                                                    |words                                                                       

In [9]:
matrices_interactions = spark.read \
    .option("mergeSchema", "false") \
    .parquet("datalake/gold/matrices/interactions/year=2022/month=7/day=1") \
    .coalesce(2)

matrices_interactions.show(10,truncate=False)

+--------+------------------------------------+----+---------+-----+-----------------+--------+
|user_idx|page                                |hour|dayofweek|month|interaction_score|item_idx|
+--------+------------------------------------+----+---------+-----+-----------------+--------+
|194362.0|c0a176b2-622f-42a4-8e5d-2a08032f8e08|12  |6        |7    |0.0              |191811.0|
|194362.0|97ac6c4c-5b3d-4e36-9816-0791226573e5|12  |6        |7    |0.0              |150583.0|
|194362.0|ece7f38e-1744-4885-83d9-2ffb89e40b11|14  |6        |7    |0.0              |235993.0|
|318111.0|5fc43290-06bd-49fe-8d23-b8cb0f892548|11  |6        |7    |0.0              |95132.0 |
|6513.0  |cfd8b7ef-29ec-446d-84b6-dfecf017e12a|11  |6        |7    |0.0              |207056.0|
|6513.0  |9442c690-21b3-42bc-9e5a-f880de0fa0a8|12  |6        |7    |0.0              |147186.0|
|6513.0  |a22fa1cd-f11e-426e-bd09-57781b82f6e7|13  |6        |7    |0.0              |161124.0|
|6513.0  |1f2b9c2f-a2d2-4192-b009-09065d

In [10]:
matrices_item_features = spark.read \
    .option("mergeSchema", "false") \
    .parquet("datalake/gold/matrices/item_features") \
    .coalesce(2)

matrices_item_features.show(10,truncate=False)

+--------+-----------+---------------+
|item_idx|feature_idx|embedding_value|
+--------+-----------+---------------+
|7.0     |0          |2.60304        |
|7.0     |1          |0.0            |
|7.0     |2          |0.0            |
|7.0     |3          |0.0            |
|7.0     |4          |0.0            |
|7.0     |5          |0.0            |
|7.0     |6          |0.0            |
|7.0     |7          |0.0            |
|7.0     |8          |0.0            |
|7.0     |9          |0.0            |
+--------+-----------+---------------+
only showing top 10 rows



In [11]:
from pyspark.sql.window import Window
from pyspark.sql.functions import explode, row_number

# Ler e explodir os arrays aninhados
page_indexer = spark.read \
    .parquet("datalake/gold/matrices/page_indexer/data") \
    .select(explode("labelsArray").alias("labels")) \
    .select(explode("labels").alias("page")) \
    .withColumn("item_idx", row_number().over(Window.orderBy("page")) - 1)

# Mostrar o mapeamento
page_indexer.show(10, truncate=False)

+------------------------------------+--------+
|page                                |item_idx|
+------------------------------------+--------+
|000031b7-5c82-4e5c-bc37-6acba5e5027d|0       |
|0000352b-5a88-4a69-8f30-120da7169573|1       |
|00004767-ce1e-436a-85c0-68ee9f456a61|2       |
|00004fb2-18c3-4b13-94f5-d334c3d3259a|3       |
|0000d552-fb56-4fc1-9f7d-17fac18e6937|4       |
|0001eba9-55f4-477f-9de3-dc93b8b95574|5       |
|00021df2-031a-4bd6-9568-1b203530d8fa|6       |
|00023dfe-6554-4de8-8dfd-f6ede169808e|7       |
|000278b4-cc13-404e-9a1d-8bc1f7bf456e|8       |
|00030c8c-7fe8-4c44-aa6d-2c6a899c3ce5|9       |
+------------------------------------+--------+
only showing top 10 rows



In [12]:
from pyspark.sql.window import Window
from pyspark.sql.functions import explode, row_number

# Ler e explodir os arrays aninhados
user_indexer = spark.read \
    .parquet("datalake/gold/matrices/user_indexer/data") \
    .select(explode("labelsArray").alias("labels")) \
    .select(explode("labels").alias("page")) \
    .withColumn("item_idx", row_number().over(Window.orderBy("page")) - 1)

# Mostrar o mapeamento
user_indexer.show(10, truncate=False)

+----------------------------------------------------------------+--------+
|page                                                            |item_idx|
+----------------------------------------------------------------+--------+
|000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951668074de9c6a0faf15|0       |
|00004868f064a8147619ca4d75eac9ccabfbe1169840e67008a7c1350206807b|1       |
|00007a4e5949a3dba7c977503c53e0873643fe17d0802aca678ad62d0b4d94f8|2       |
|000087b05ccb95dec5d55e968764285c5403747fc35da2127f13bfd3879cd9e9|3       |
|00011b1ced626112372206634e0e9b5ccb432da916e83f2bad98dd2f3daafcd9|4       |
|00012cd490cea9fe043146807def18c830fa0d0b791772ed6a70ee03618df548|5       |
|00014a8ad79d212f0410c3dd3260e37a093054b9d9a6a80b04843fd3ec9ee39b|6       |
|00019abf778947398b46310c3947cc0260f30e79683ae9e244e3af0a5a29aa0f|7       |
|0001b40676c18a37bf25f0b1921ad12513c434cb57db18847fed8c8c743df905|8       |
|0001b9972aec5ac47e28fb0a0747d598f4c4058c2dbd3f8fbc6e23bced2fcc19|9       |
+-----------