In [5]:
import pandas as pd
import numpy as np
from pyspark.sql import functions as F


In [6]:
# Start Spark session
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Spark settings for this exploration session, applied in the order listed.
SPARK_SETTINGS = {
    "spark.driver.memory": "8g",
    "spark.executor.memory": "8g",
    "spark.executor.cores": "4",
    "spark.default.parallelism": "12",
    "spark.sql.shuffle.partitions": "12",
    "spark.memory.fraction": "0.8",
    "spark.memory.storageFraction": "0.3",
    "spark.sql.files.maxPartitionBytes": "128m",
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.adaptive.coalescePartitions.enabled": "true",
    "spark.sql.adaptive.skewJoin.enabled": "true",
    "spark.dynamicAllocation.enabled": "true",
    "spark.shuffle.service.enabled": "true",
    "spark.locality.wait": "0s",
}

# Apply each setting to the builder one at a time; this is the same sequence
# of .config() calls as a single chained expression.
session_builder = SparkSession.builder.appName("Silver Data Exploration")
for setting_name, setting_value in SPARK_SETTINGS.items():
    session_builder = session_builder.config(setting_name, setting_value)

# `spark` is the session handle used by every read in the cells below.
spark = session_builder.getOrCreate()



In [3]:
# Read the `treino` dataset from the silver layer. Parquet schema merging is
# explicitly turned off and the result is coalesced to at most 2 partitions.
treino = (
    spark.read
    .option("mergeSchema", "false")
    .parquet("/app/datalake/silver/treino/")
    .coalesce(2)
)

# Preview the first 10 rows without truncating long cell values.
treino.show(n=10, truncate=False)

+------------------------------------+----------------------------------------------------------------+----------+----------------+---------------------+-----------------+-----------------------+----------------------+----+---------+---------------------------+-----------------+----------------------------+------------------+---------------------+-----------------+---------------------+--------------------+----+-----+---+
|history                             |userId                                                          |userType  |timestampHistory|numberOfClicksHistory|timeOnPageHistory|scrollPercentageHistory|pageVisitsCountHistory|hour|dayofweek|time_since_last_interaction|first_interaction|time_since_first_interaction|interaction_score |recency_weight       |avg_time_on_page |time_weight          |adjusted_score      |year|month|day|
+------------------------------------+----------------------------------------------------------------+----------+----------------+-----------------

In [4]:
# Read the `treino_normalized` dataset from the silver layer. Parquet schema
# merging is explicitly turned off and the result is coalesced to at most
# 2 partitions.
treino_normalized = (
    spark.read
    .option("mergeSchema", "false")
    .parquet("/app/datalake/silver/treino_normalized/")
    .coalesce(2)
)

# Preview the first 10 rows without truncating long cell values.
treino_normalized.show(n=10, truncate=False)

+------------------------------------+----------------------------------------------------------------+----------------+---------------------+------------------+-----------------------+----------------------+-------------------+------------------+---------------------------+---------------------+----------------------------+------------------+---------------------+------------------+--------------------+--------------------+--------+----+-----+---+
|history                             |userId                                                          |timestampHistory|numberOfClicksHistory|timeOnPageHistory |scrollPercentageHistory|pageVisitsCountHistory|hour               |dayofweek         |time_since_last_interaction|first_interaction    |time_since_first_interaction|interaction_score |recency_weight       |avg_time_on_page  |time_weight         |adjusted_score      |userType|year|month|day|
+------------------------------------+--------------------------------------------------------

In [5]:
# Read the `itens` dataset from the silver layer. Parquet schema merging is
# explicitly turned off and the result is coalesced to at most 2 partitions.
itens = (
    spark.read
    .option("mergeSchema", "false")
    .parquet("/app/datalake/silver/itens/")
    .coalesce(2)
)

# Preview the first 10 rows without truncating long cell values.
itens.show(n=10, truncate=False)

+------------------------------------+-------------------+-------------------+------------------------------------------------------------------------------------+--------------------+-------------------+----+-----+---+
|page                                |issued             |modified           |title                                                                               |days_since_published|days_since_modified|year|month|day|
+------------------------------------+-------------------+-------------------+------------------------------------------------------------------------------------+--------------------+-------------------+----+-----+---+
|78698773-9812-446d-be1f-2e70be61e1be|2021-01-01 03:23:31|2021-01-01 03:23:44|Prefeito e vereadores de Tucumã tomam posse; veja lista de eleitos                  |1512                |1512               |2021|1    |1  |
|017964ac-b943-4513-8a8b-6741a38eee00|2021-01-01 04:50:51|2021-01-01 04:50:54|Prefeito e vereadores de São Caetano tomam

In [6]:
# Read a single day (2022-07-01) of `itens_normalized` from the silver layer.
# NOTE: the path points straight at the year=/month=/day= leaf directory, and
# the recorded output for this cell has no year/month/day columns (unlike the
# `itens` read, which starts from the dataset root).
itens_normalized = (
    spark.read
    .option("mergeSchema", "false")
    .parquet("/app/datalake/silver/itens_normalized/year=2022/month=7/day=1")
    .coalesce(2)
)

# Preview the first 10 rows without truncating long cell values.
itens_normalized.show(n=10, truncate=False)

+------------------------------------+-------------------+-------------------+---------------------------------------------------------------------------------------------------------+--------------------+-------------------+
|page                                |issued             |modified           |title                                                                                                    |days_since_published|days_since_modified|
+------------------------------------+-------------------+-------------------+---------------------------------------------------------------------------------------------------------+--------------------+-------------------+
|6b8534ed-d39e-46ed-af1c-d69b414099db|2022-07-01 20:15:35|2022-07-01 22:49:54|É #FAKE que testes de Covid já vêm com resultado positivo e negativo                                     |0.03256947532797093 |0.25457282950750026|
|71eb8655-9ccc-4b08-911e-fa244834fd54|2022-07-01 21:51:46|2022-07-01 21:51:47|Poupatempo faz mul

In [8]:
# Read a single day (2022-07-01) of `itens_embeddings` from the silver layer.
# The path points straight at the year=/month=/day= leaf directory; schema
# merging is off and the result is coalesced to at most 2 partitions.
itens_embeddings = (
    spark.read
    .option("mergeSchema", "false")
    .parquet("/app/datalake/silver/itens_embeddings/year=2022/month=7/day=1")
    .coalesce(2)
)

# Preview the first 10 rows without truncating long cell values.
itens_embeddings.show(n=10, truncate=False)

+------------------------------------+-------------------+-------------------+---------------------------------------------------------------------------------------------------------+--------------------+-------------------+---------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|page                                |issued            

In [None]:
import numpy as np

# Load the .npz archive. The path now uses the same /app/datalake root as
# every other read in this notebook; the previous "/gold/..." path lacked
# that prefix and did not match where the same file is loaded from below.
data = np.load("/app/datalake/gold/lightfm_interactions/interaction_matrix.npz")

# List the array names stored inside the archive.
print(data.files)


In [13]:
from scipy.sparse import load_npz

# Load the sparse interaction matrix.
interaction_matrix = load_npz("/app/datalake/gold/lightfm_interactions/interaction_matrix.npz")

# Show the matrix dimensions.
print("Dimensões da Matriz:", interaction_matrix.shape)

# Inspect only the top-left 10x10 corner. Slice BEFORE densifying so that just
# 100 values are materialized: calling .todense() on the whole matrix first
# requests a dense array of the full shape (577942 x 255603 in the recorded
# output), which can exhaust memory. load_npz returns the matrix in whatever
# sparse format it was saved in, and not every format supports slicing
# (e.g. COO), so convert to CSR before indexing.
print(interaction_matrix.tocsr()[:10, :10].todense())


Dimensões da Matriz: (577942, 255603)
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [9]:
# Gold-layer item features file, displayed with Spark's default row count (20)
# and untruncated cell values.
item_features_path = "/app/datalake/gold/lightfm_item_features/item_features.parquet"
df = spark.read.parquet(item_features_path)
df.show(n=20, truncate=False)


+------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|page                                |features                                                                                                                                                                                                                                                                                                                           |
+------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [10]:
# Gold-layer user features file, displayed with Spark's default row count (20)
# and untruncated cell values. NOTE: this rebinds `df`, replacing whatever
# DataFrame the name previously referred to.
user_features_path = "/app/datalake/gold/lightfm_user_features/user_features.parquet"
df = spark.read.parquet(user_features_path)
df.show(n=20, truncate=False)


+----------------------------------------------------------------+--------+------------------+---------------------+------------------+------------------+---------------------------+---------------------+----------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------+
|userId                                                          |userType|avg_time_on_page  |numberOfClicksHistory|timeOnPageHistory |interaction_score |time_since_last_interaction|recency_weight       |features_vec                                                                                                                |user_features                                                                                                               |
+----------------------------------------------------------------+--------+---------------