In [3]:
import pandas as pd
import numpy as np
from pyspark.sql import functions as F


In [4]:
# Start Spark session
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Build (or reuse, via getOrCreate) the Spark session used by this notebook.
# The chain is wrapped in parentheses so each setting sits on its own line
# without backslash continuations.
spark = (
    SparkSession.builder
    .appName("Silver Data Exploration")
    # Driver / executor sizing
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .config("spark.executor.cores", "4")
    # Parallelism for RDD operations and SQL shuffles
    .config("spark.default.parallelism", "12")
    .config("spark.sql.shuffle.partitions", "12")
    # Unified memory manager tuning
    .config("spark.memory.fraction", "0.8")
    .config("spark.memory.storageFraction", "0.3")
    # Maximum bytes packed into one partition when reading files
    .config("spark.sql.files.maxPartitionBytes", "128m")
    # Adaptive query execution switches
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .config("spark.sql.adaptive.skewJoin.enabled", "true")
    # Dynamic allocation together with the external shuffle service flag
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.shuffle.service.enabled", "true")
    # Do not wait for data-local scheduling slots
    .config("spark.locality.wait", "0s")
    .getOrCreate()
)



In [5]:
# Read the Parquet files under the silver "treino" path with schema merging
# turned off, then reduce the result to at most two partitions.
treino = (
    spark.read
    .option("mergeSchema", "false")
    .parquet("/app/datalake/silver/treino/")
    .coalesce(2)
)

# Preview the first 10 rows without truncating long cell values.
treino.show(n=10, truncate=False)

+------------------------------------+----------------------------------------------------------------+----------+----------------+---------------------+-----------------+-----------------------+----------------------+----+---------+---------------------------+-----------------+----------------------------+------------------+---------------------+-----------------+---------------------+--------------------+----+-----+---+
|history                             |userId                                                          |userType  |timestampHistory|numberOfClicksHistory|timeOnPageHistory|scrollPercentageHistory|pageVisitsCountHistory|hour|dayofweek|time_since_last_interaction|first_interaction|time_since_first_interaction|interaction_score |recency_weight       |avg_time_on_page |time_weight          |adjusted_score      |year|month|day|
+------------------------------------+----------------------------------------------------------------+----------+----------------+-----------------

In [6]:
# Read the Parquet files under the silver "treino_normalized" path with
# schema merging turned off, then reduce to at most two partitions.
treino_normalized = (
    spark.read
    .option("mergeSchema", "false")
    .parquet("/app/datalake/silver/treino_normalized/")
    .coalesce(2)
)

# Preview the first 10 rows without truncating long cell values.
treino_normalized.show(n=10, truncate=False)

+------------------------------------+----------------------------------------------------------------+----------------+---------------------+------------------+-----------------------+----------------------+------------------+------------------+---------------------------+--------------------+----------------------------+------------------+---------------------+------------------+---------------------+---------------------+--------+----+-----+---+
|history                             |userId                                                          |timestampHistory|numberOfClicksHistory|timeOnPageHistory |scrollPercentageHistory|pageVisitsCountHistory|hour              |dayofweek         |time_since_last_interaction|first_interaction   |time_since_first_interaction|interaction_score |recency_weight       |avg_time_on_page  |time_weight          |adjusted_score       |userType|year|month|day|
+------------------------------------+--------------------------------------------------------

In [7]:
# Read the Parquet files under the silver "itens" root path with schema
# merging turned off, then reduce to at most two partitions.
# NOTE(review): the captured run of this cell ended in a KeyboardInterrupt
# while waiting on Spark; the cause is not visible here — confirm whether
# reading the whole root is too slow before relying on this cell.
itens = (
    spark.read
    .option("mergeSchema", "false")
    .parquet("/app/datalake/silver/itens/")
    .coalesce(2)
)

# Preview the first 10 rows without truncating long cell values.
itens.show(n=10, truncate=False)

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [8]:
# Read a single day's directory (year=2022/month=7/day=1) of the silver
# "itens_normalized" dataset with schema merging turned off, then reduce to
# at most two partitions.
itens_normalized = (
    spark.read
    .option("mergeSchema", "false")
    .parquet("/app/datalake/silver/itens_normalized/year=2022/month=7/day=1")
    .coalesce(2)
)

# Preview the first 10 rows without truncating long cell values.
itens_normalized.show(n=10, truncate=False)

+------------------------------------+-------------------+-------------------+---------------------------------------------------------------------------------------------------------+--------------------+-------------------+
|page                                |issued             |modified           |title                                                                                                    |days_since_published|days_since_modified|
+------------------------------------+-------------------+-------------------+---------------------------------------------------------------------------------------------------------+--------------------+-------------------+
|6b8534ed-d39e-46ed-af1c-d69b414099db|2022-07-01 20:15:35|2022-07-01 22:49:54|É #FAKE que testes de Covid já vêm com resultado positivo e negativo                                     |0.03258522301718055 |0.2547178186567539 |
|71eb8655-9ccc-4b08-911e-fa244834fd54|2022-07-01 21:51:46|2022-07-01 21:51:47|Poupatempo faz mul

In [9]:
# Read a single day's directory (year=2022/month=7/day=1) of the gold
# "tfidf_features" dataset with schema merging turned off, then reduce to
# at most two partitions.
tfidf_features = (
    spark.read
    .option("mergeSchema", "false")
    .parquet("/app/datalake/gold/tfidf_features/year=2022/month=7/day=1")
    .coalesce(2)
)

# Preview the first 10 rows without truncating long cell values.
tfidf_features.show(n=10, truncate=False)

+------------------------------------+-------------------+-------------------+---------------------------------------------------------------------------------------------------------+--------------------+-------------------+------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [11]:
# Read a single day's directory (year=2022/month=7/day=1) of the silver
# "itens_embeddings" dataset with schema merging turned off, then reduce to
# at most two partitions.
itens_embeddings = (
    spark.read
    .option("mergeSchema", "false")
    .parquet("/app/datalake/silver/itens_embeddings/year=2022/month=7/day=1")
    .coalesce(2)
)

# Preview the first 10 rows without truncating long cell values.
itens_embeddings.show(n=10, truncate=False)

+------------------------------------+-------------------+-------------------+---------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|page                                |issued             |modified           |title                                                                                                    |words                                                                       