In [2]:
import pandas as pd
import numpy as np
from pyspark.sql import functions as F


In [3]:
# Start Spark session
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.builder \
    .appName("Silver Data Exploration") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "8g") \
    .config("spark.executor.cores", "4") \
    .config("spark.default.parallelism", "12") \
    .config("spark.sql.shuffle.partitions", "12") \
    .config("spark.memory.fraction", "0.8") \
    .config("spark.memory.storageFraction", "0.3") \
    .config("spark.sql.files.maxPartitionBytes", "128m") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .config("spark.sql.adaptive.skewJoin.enabled", "true") \
    .config("spark.dynamicAllocation.enabled", "true") \
    .config("spark.shuffle.service.enabled", "true") \
    .config("spark.locality.wait", "0s") \
    .getOrCreate()



Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/19 19:02:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/02/19 19:02:44 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [6]:
treino = spark.read \
    .option("mergeSchema", "false") \
    .parquet("/app/datalake/silver/treino/year=2022/month=7/day=1") \
    .coalesce(2)

In [9]:
treino.show(10,truncate=False)

+----------------------------------------------------------------+----------+------------------------------------+----------------+---------------------+-----------------+-----------------------+----------------------+
|userId                                                          |userType  |history                             |timestampHistory|numberOfClicksHistory|timeOnPageHistory|scrollPercentageHistory|pageVisitsCountHistory|
+----------------------------------------------------------------+----------+------------------------------------+----------------+---------------------+-----------------+-----------------------+----------------------+
|c58c8345159e60366e064b469dcdadb00a220f6ab06d8cce78bfb78698ccce72|Non-Logged|0077f0d7-1d66-4e4c-b6e4-cc56f67306b6|1.656692666909E9|19                   |38435            |41.16                  |1                     |
|c58c8345159e60366e064b469dcdadb00a220f6ab06d8cce78bfb78698ccce72|Non-Logged|efef5cd6-fbf5-469c-bf43-8e0d30f64134|1.65671753

In [10]:
treino_normalized = spark.read \
    .option("mergeSchema", "false") \
    .parquet("/app/datalake/silver/treino_normalized/year=2022/month=7/day=1") \
    .coalesce(2)

treino_normalized.show(10,truncate=False)

+----------------------------------------------------------------+----------+------------------------------------+----------------+---------------------+------------------+-----------------------+----------------------+
|userId                                                          |userType  |history                             |timestampHistory|numberOfClicksHistory|timeOnPageHistory |scrollPercentageHistory|pageVisitsCountHistory|
+----------------------------------------------------------------+----------+------------------------------------+----------------+---------------------+------------------+-----------------------+----------------------+
|c58c8345159e60366e064b469dcdadb00a220f6ab06d8cce78bfb78698ccce72|Non-Logged|0077f0d7-1d66-4e4c-b6e4-cc56f67306b6|1.656692666909E9|0.0013047658288696609|10.556749799397158|7.817197746341996E-5   |0.0                   |
|c58c8345159e60366e064b469dcdadb00a220f6ab06d8cce78bfb78698ccce72|Non-Logged|efef5cd6-fbf5-469c-bf43-8e0d30f64134|1.6567

In [11]:
itens = spark.read \
    .option("mergeSchema", "false") \
    .parquet("/app/datalake/silver/itens/year=2022/month=7/day=1") \
    .coalesce(2)

itens.show(10,truncate=False)

+------------------------------------+-------------------+-------------------+---------------------------------------------------------------------------------------------------------+
|page                                |issued             |modified           |title                                                                                                    |
+------------------------------------+-------------------+-------------------+---------------------------------------------------------------------------------------------------------+
|6b8534ed-d39e-46ed-af1c-d69b414099db|2022-07-01 20:15:35|2022-07-01 22:49:54|É #FAKE que testes de Covid já vêm com resultado positivo e negativo                                     |
|71eb8655-9ccc-4b08-911e-fa244834fd54|2022-07-01 21:51:46|2022-07-01 21:51:47|Poupatempo faz multirão para regularizar CNH de motoristas que exercem atividade remunerada              |
|ceff793d-719b-4a22-8607-71544206ba19|2022-07-01 10:40:11|2022-07-01 10:40:

In [13]:
itens_normalized = spark.read \
    .option("mergeSchema", "false") \
    .parquet("/app/datalake/silver/itens_normalized/year=2022/month=7/day=1") \
    .coalesce(2)

itens_normalized.show(10,truncate=False)

+------------------------------------+-------------------+-------------------+---------------------------------------------------------------------------------------------------------+
|page                                |issued             |modified           |title                                                                                                    |
+------------------------------------+-------------------+-------------------+---------------------------------------------------------------------------------------------------------+
|6b8534ed-d39e-46ed-af1c-d69b414099db|2022-07-01 20:15:35|2022-07-01 22:49:54|É #FAKE que testes de Covid já vêm com resultado positivo e negativo                                     |
|71eb8655-9ccc-4b08-911e-fa244834fd54|2022-07-01 21:51:46|2022-07-01 21:51:47|Poupatempo faz multirão para regularizar CNH de motoristas que exercem atividade remunerada              |
|ceff793d-719b-4a22-8607-71544206ba19|2022-07-01 10:40:11|2022-07-01 10:40:

In [14]:
# Read data (note the correct path for mapped volume)
itens_embeddings = spark.read \
    .option("mergeSchema", "false") \
    .parquet("/app/datalake/silver/itens_embeddings/year=2022/month=7/day=1") \
    .coalesce(2)

itens_embeddings.show(10,truncate=False)

+------------------------------------+-------------------+-------------------+---------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|page                                |issued             |modified           |title                                                                                                    |words                                                                       

In [5]:
matrices_interactions = spark.read \
    .option("mergeSchema", "false") \
    .parquet("datalake/gold/matrices/interactions/year=2022/month=7/day=1") \
    .coalesce(2)

matrices_interactions.show(10,truncate=False)

25/02/19 19:03:08 WARN DAGScheduler: Broadcasting large task binary with size 46.5 MiB
                                                                                

+--------+------------------------------------+----+-----+---+-----------------+--------+
|user_idx|page                                |year|month|day|interaction_score|item_idx|
+--------+------------------------------------+----+-----+---+-----------------+--------+
|14098.0 |0077f0d7-1d66-4e4c-b6e4-cc56f67306b6|2022|7    |1  |4.223114801100763|448.0   |
|14098.0 |efef5cd6-fbf5-469c-bf43-8e0d30f64134|2022|7    |1  |4.447017848065036|239021.0|
|36389.0 |82e7b237-e7ea-4cf5-b44c-55385e2cdbb0|2022|7    |1  |4.516456318872515|129939.0|
|36389.0 |109adb08-8ff3-408b-b828-e16b51788202|2022|7    |1  |3.684207681792833|16412.0 |
|112387.0|426e9777-7ad8-4b0f-98f4-48e8ee96ace5|2022|7    |1  |4.252949808003409|65809.0 |
|63369.0 |352e341c-bbd1-4bd9-9732-fb917cb37803|2022|7    |1  |3.780033387296842|52780.0 |
|319570.0|8c67956e-c8a7-4ee1-8f09-aa9ee4d27279|2022|7    |1  |3.68418067293674 |139412.0|
|72371.0 |5add4d01-a7d7-4819-a657-e677b24ece4d|2022|7    |1  |4.76333562766699 |90206.0 |
|72371.0 |

In [7]:
matrices_item_features = spark.read \
    .option("mergeSchema", "false") \
    .parquet("datalake/gold/matrices/item_features") \
    .coalesce(2)

matrices_item_features.show(10,truncate=False)

25/02/19 19:06:44 WARN DAGScheduler: Broadcasting large task binary with size 9.6 MiB


+--------+-----------+---------------+
|item_idx|feature_idx|embedding_value|
+--------+-----------+---------------+
|7.0     |0          |2.60304        |
|7.0     |1          |0.0            |
|7.0     |2          |0.0            |
|7.0     |3          |0.0            |
|7.0     |4          |0.0            |
|7.0     |5          |0.0            |
|7.0     |6          |0.0            |
|7.0     |7          |0.0            |
|7.0     |8          |0.0            |
|7.0     |9          |0.0            |
+--------+-----------+---------------+
only showing top 10 rows



In [18]:
from pyspark.sql.window import Window
from pyspark.sql.functions import explode, row_number

# Ler e explodir os arrays aninhados
page_indexer = spark.read \
    .parquet("datalake/gold/matrices/page_indexer/data") \
    .select(explode("labelsArray").alias("labels")) \
    .select(explode("labels").alias("page")) \
    .withColumn("item_idx", row_number().over(Window.orderBy("page")) - 1)

# Mostrar o mapeamento
page_indexer.show(10, truncate=False)

25/02/19 19:18:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/02/19 19:18:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/02/19 19:18:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.

+------------------------------------+--------+
|page                                |item_idx|
+------------------------------------+--------+
|000031b7-5c82-4e5c-bc37-6acba5e5027d|0       |
|0000352b-5a88-4a69-8f30-120da7169573|1       |
|00004767-ce1e-436a-85c0-68ee9f456a61|2       |
|00004fb2-18c3-4b13-94f5-d334c3d3259a|3       |
|0000d552-fb56-4fc1-9f7d-17fac18e6937|4       |
|0001eba9-55f4-477f-9de3-dc93b8b95574|5       |
|00021df2-031a-4bd6-9568-1b203530d8fa|6       |
|00023dfe-6554-4de8-8dfd-f6ede169808e|7       |
|000278b4-cc13-404e-9a1d-8bc1f7bf456e|8       |
|00030c8c-7fe8-4c44-aa6d-2c6a899c3ce5|9       |
+------------------------------------+--------+
only showing top 10 rows



                                                                                

In [19]:
from pyspark.sql.window import Window
from pyspark.sql.functions import explode, row_number

# Ler e explodir os arrays aninhados
user_indexer = spark.read \
    .parquet("datalake/gold/matrices/user_indexer/data") \
    .select(explode("labelsArray").alias("labels")) \
    .select(explode("labels").alias("page")) \
    .withColumn("item_idx", row_number().over(Window.orderBy("page")) - 1)

# Mostrar o mapeamento
user_indexer.show(10, truncate=False)

25/02/19 19:19:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/02/19 19:19:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/02/19 19:19:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.

+----------------------------------------------------------------+--------+
|page                                                            |item_idx|
+----------------------------------------------------------------+--------+
|000044b36375e7f1a66a9476affc2ddc83c2ec6dd18951668074de9c6a0faf15|0       |
|00004868f064a8147619ca4d75eac9ccabfbe1169840e67008a7c1350206807b|1       |
|00007a4e5949a3dba7c977503c53e0873643fe17d0802aca678ad62d0b4d94f8|2       |
|000087b05ccb95dec5d55e968764285c5403747fc35da2127f13bfd3879cd9e9|3       |
|00011b1ced626112372206634e0e9b5ccb432da916e83f2bad98dd2f3daafcd9|4       |
|00012cd490cea9fe043146807def18c830fa0d0b791772ed6a70ee03618df548|5       |
|00014a8ad79d212f0410c3dd3260e37a093054b9d9a6a80b04843fd3ec9ee39b|6       |
|00019abf778947398b46310c3947cc0260f30e79683ae9e244e3af0a5a29aa0f|7       |
|0001b40676c18a37bf25f0b1921ad12513c434cb57db18847fed8c8c743df905|8       |
|0001b9972aec5ac47e28fb0a0747d598f4c4058c2dbd3f8fbc6e23bced2fcc19|9       |
+-----------

                                                                                