## ETL_source_to_bronze

### Download file

In [4]:
import gdown
import os

# Folder that receives the raw download (bronze layer).
path = "/datalake/bronze"

# Create the folder; exist_ok=True keeps re-runs from failing when it already exists.
os.makedirs(path, exist_ok=True)

# Actually verify the folder is there. A bare `os.path.exists(path)` in the
# middle of a cell discards its result and therefore checks nothing.
if not os.path.isdir(path):
    raise NotADirectoryError(f"Bronze folder was not created: {path}")

# Google Drive file ID (taken from the share link).
file_id = "13rvnyK5PJADJQgYe-VbdXb7PpLPj7lPr"
# Direct-download URL built from the file ID.
download_url = f"https://drive.google.com/uc?id={file_id}"

# Local target for the archive, derived from `path` so the two cannot drift apart.
output_file = f"{path}/challenge-webmedia-e-globo-2023.zip"

# Download the archive. Some gdown versions report failure by returning None
# instead of raising, so the result is checked explicitly.
downloaded_file = gdown.download(download_url, output_file, quiet=False)
if downloaded_file is None:
    raise RuntimeError(f"Download failed: {download_url}")

# Last expression of the cell: shows the path of the downloaded archive.
downloaded_file


Downloading...
From (original): https://drive.google.com/uc?id=13rvnyK5PJADJQgYe-VbdXb7PpLPj7lPr
From (redirected): https://drive.google.com/uc?id=13rvnyK5PJADJQgYe-VbdXb7PpLPj7lPr&confirm=t&uuid=89e7610f-0206-4eb1-a033-39a99731d05f
To: /datalake/bronze/challenge-webmedia-e-globo-2023.zip
100%|██████████| 672M/672M [00:55<00:00, 12.2MB/s] 


'/datalake/bronze/challenge-webmedia-e-globo-2023.zip'

### Unzip

In [1]:
import zipfile
import os

def unzip_file(file_path, extract_to):
    """Extract a zip archive and then delete the archive file.

    Args:
        file_path: Path of the .zip file to extract.
        extract_to: Directory that receives the extracted members.

    Returns:
        None. Progress messages are printed to stdout.

    Raises:
        Whatever ``zipfile.ZipFile`` / ``extractall`` raise, e.g. when the
        archive is missing or is not a valid zip file.
    """
    archive = zipfile.ZipFile(file_path, 'r')
    try:
        archive.extractall(extract_to)
        print(f"Arquivo descompactado em: {extract_to}")
    finally:
        # Always release the file handle, even if extraction fails.
        archive.close()

    # Guard clause: nothing to delete if the archive is no longer on disk.
    if not os.path.exists(file_path):
        print("O arquivo .zip não foi encontrado para exclusão.")
        return

    # Remove the archive now that its contents have been extracted.
    os.remove(file_path)
    print(f"Arquivo .zip deletado: {file_path}")

# Directory that receives the extracted files, and the archive to unpack.
# unzip_file deletes the archive after extraction, so this cell needs the
# archive to be present on disk each time it runs.
extract_to = "/datalake/bronze"
destination = "/datalake/bronze/challenge-webmedia-e-globo-2023.zip"

unzip_file(file_path=destination, extract_to=extract_to)


Arquivo descompactado em: /datalake/bronze
Arquivo .zip deletado: /datalake/bronze/challenge-webmedia-e-globo-2023.zip


## START SPARK SESSION

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, split, arrays_zip, from_unixtime, year, month, dayofmonth, to_timestamp, regexp_replace

# Build the Spark session used by the cells below; getOrCreate() returns an
# already-running session if there is one instead of starting a new one.
session_builder = SparkSession.builder.appName("Spark Init")
spark = session_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/06 13:32:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## ETL_bronze_to_silver

### Treino

In [2]:
# Bronze folder holding the training CSV files.
file_path = "/datalake/bronze/files/treino/"

# Read every CSV in the folder, using the first line of each file as header.
df = spark.read.option("header", "true").csv(file_path)

# Columns whose string values are split into arrays below.
cols_to_split = [
    "history",
    "timestampHistory",
    "numberOfClicksHistory",
    "timeOnPageHistory",
    "scrollPercentageHistory",
    "pageVisitsCountHistory",
]

# Split each of those columns on a comma followed by optional whitespace.
for col_name in cols_to_split:
    df = df.withColumn(col_name, split(col(col_name), ",\\s*"))

# Combine the arrays position by position into one array of structs, then
# emit one output row per struct.
df = df.withColumn("zipped", arrays_zip(*map(col, cols_to_split)))
df_exploded = df.withColumn("exploded", explode(col("zipped")))

# from_unixtime works in seconds, hence the division by 1000
# (presumably the source stores epoch milliseconds — TODO confirm).
epoch_seconds = col("exploded.timestampHistory").cast("long") / 1000

# Flatten the struct back into typed, top-level columns.
# NOTE(review): pageVisitsCountHistory is the only count-like field that is
# not cast and therefore stays a string — confirm this is intended.
df_normalized = df_exploded.select(
    col("userId"),
    col("userType"),
    col("exploded.history").alias("history"),
    from_unixtime(epoch_seconds, "yyyy-MM-dd HH:mm:ss").alias("timestampHistory"),
    col("exploded.numberOfClicksHistory").cast("int").alias("numberOfClicksHistory"),
    col("exploded.timeOnPageHistory").cast("int").alias("timeOnPageHistory"),
    col("exploded.scrollPercentageHistory").cast("float").alias("scrollPercentageHistory"),
    col("exploded.pageVisitsCountHistory").alias("pageVisitsCountHistory"),
)

# Derive the partition columns from the formatted timestamp.
event_time = col("timestampHistory")
df_partitioned = (
    df_normalized
    .withColumn("year", year(event_time))
    .withColumn("month", month(event_time))
    .withColumn("day", dayofmonth(event_time))
)

# Write the silver dataset as snappy-compressed parquet, partitioned by
# year/month/day; "overwrite" replaces whatever is already at the target.
output_path = "/datalake/silver/treino/"
(
    df_partitioned.write
    .mode("overwrite")
    .option("compression", "snappy")
    .partitionBy("year", "month", "day")
    .parquet(output_path)
)

print("Arquivos parquet salvos com sucesso!")


25/02/06 13:32:30 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/02/06 13:32:30 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
25/02/06 13:32:30 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
25/02/06 13:32:30 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
25/02/06 13:32:30 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
25/02/06 13:32:30 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
25/02/06 13:32:30 WARN MemoryManager: Total allocation exceeds 95.

Arquivos parquet salvos com sucesso!


### Itens

In [3]:
# Bronze folder holding the item CSV files.
file_path = "/datalake/bronze/itens/itens/"

# Read the CSVs with a header row and an inferred schema. Quote and escape are
# both the double-quote character, and multiLine allows quoted fields to span
# several lines.
df = (
    spark.read
    .option("header", "true")
    .option("quote", "\"")
    .option("escape", "\"")
    .option("multiLine", "true")
    .option("inferSchema", "true")
    .csv(file_path)
)

# Strip the "+00:00" offset suffix and parse both date columns as timestamps.
for ts_col in ("issued", "modified"):
    df = df.withColumn(
        ts_col,
        to_timestamp(
            regexp_replace(col(ts_col), r"\+00:00", ""),
            "yyyy-MM-dd HH:mm:ss",
        ),
    )

# The url column is not carried over to the silver layer.
df = df.drop("url")

# Partition columns derived from the publication timestamp.
df = (
    df
    .withColumn("year", year(col("issued")))
    .withColumn("month", month(col("issued")))
    .withColumn("day", dayofmonth(col("issued")))
)

# Absolute path, consistent with the treino output under /datalake/silver/.
# The previous value had no leading slash, so it was resolved relative to the
# working directory and the data did not land in the datalake.
output_path = "/datalake/silver/itens/"

# Write as snappy-compressed parquet partitioned by year/month/day;
# "overwrite" replaces whatever is already at the target.
(
    df.write
    .mode("overwrite")
    .option("compression", "snappy")
    .partitionBy("year", "month", "day")
    .parquet(output_path)
)

print("Arquivos parquet salvos com sucesso!")


                                                                                

Arquivos parquet salvos com sucesso!
