In [1]:
import pandas as pd
import numpy as np
from pyspark.sql import functions as F


In [2]:
# Start Spark session
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Spark options for this exploration session, kept in one mapping so they can
# be read and tuned in a single place. Insertion order is preserved, so the
# options are applied to the builder in exactly the order listed here.
_SPARK_OPTIONS = {
    # memory sizing
    "spark.driver.memory": "8g",
    "spark.executor.memory": "8g",
    "spark.executor.cores": "4",
    # parallelism / partitioning
    "spark.default.parallelism": "12",
    "spark.sql.shuffle.partitions": "12",
    "spark.memory.fraction": "0.8",
    "spark.memory.storageFraction": "0.3",
    "spark.sql.files.maxPartitionBytes": "128m",
    # adaptive query execution
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.adaptive.coalescePartitions.enabled": "true",
    "spark.sql.adaptive.skewJoin.enabled": "true",
    # executor allocation, shuffle service and scheduling locality
    "spark.dynamicAllocation.enabled": "true",
    "spark.shuffle.service.enabled": "true",
    "spark.locality.wait": "0s",
}

# Grab the builder once and keep chaining on that same object.
_session_builder = SparkSession.builder.appName("Silver Data Exploration")
for _option_name, _option_value in _SPARK_OPTIONS.items():
    _session_builder = _session_builder.config(_option_name, _option_value)

# Reuses an already-active session if one exists, otherwise creates a new one.
spark = _session_builder.getOrCreate()



In [3]:
# Read the bronze-layer Bitcoin parquet data (schema merging switched off)
# and reduce the resulting DataFrame to two partitions.
btc = (
    spark.read
    .option("mergeSchema", "false")
    .parquet("/app/data/bronze/bitcoin_data/")
    .coalesce(2)
)

# Preview the first 20 rows without truncating column values.
btc.show(n=20, truncate=False)

+-------------+-----------------+---------------------+---------------------+----+-----+---+
|timestamp    |price            |market_cap           |total_volume         |year|month|day|
+-------------+-----------------+---------------------+---------------------+----+-----+---+
|1740357412101|96251.1123703611 |1.9095710802652363E12|1.6453226993530008E10|2025|2    |24 |
|1740357760631|96245.09823181506|1.9089532008439167E12|1.3525717406974714E10|2025|2    |24 |
|1740357938192|96282.1989697277 |1.9071389106648289E12|1.803493721779075E10 |2025|2    |24 |
|1740358275443|96424.77057523708|1.9111210329085317E12|1.6097959691337078E10|2025|2    |24 |
|1740358604231|96343.42122341768|1.9132558807630022E12|1.825768305993376E10 |2025|2    |24 |
|1740358855298|96235.4969392334 |1.9090429675544155E12|1.8418267627855816E10|2025|2    |24 |
|1740359171707|96190.70236140117|1.9086584323616287E12|1.813230325904156E10 |2025|2    |24 |
|1740359492323|96222.3149837647 |1.9107777567746768E12|1.8355858179359

In [4]:
# Read the silver-layer Bitcoin parquet data (schema merging switched off)
# and reduce the resulting DataFrame to two partitions.
btc_silver = (
    spark.read
    .option("mergeSchema", "false")
    .parquet("/app/data/silver/bitcoin_data/")
    .coalesce(2)
)

# Preview the first 20 rows without truncating column values.
btc_silver.show(n=20, truncate=False)

+-----------------------+-----------------+---------------------+---------------------+----+-----+---+
|timestamp              |price            |market_cap           |total_volume         |year|month|day|
+-----------------------+-----------------+---------------------+---------------------+----+-----+---+
|2025-02-24 12:06:09.83 |95663.52496489698|1.900201828466259E12 |2.5287731482991985E10|2025|2    |24 |
|2025-02-24 12:10:40.724|95781.87096232019|1.900201828466259E12 |2.5486610468911724E10|2025|2    |24 |
|2025-02-24 12:16:30.714|95788.51792803427|1.8994686458468071E12|2.5275817937201412E10|2025|2    |24 |
|2025-02-24 12:20:22.331|95798.7215720659 |1.8994686458468071E12|2.286497219334802E10 |2025|2    |24 |
|2025-02-24 12:26:19.032|95871.91372658548|1.9001365614921519E12|2.5501428657382412E10|2025|2    |24 |
|2025-02-24 12:31:49.921|95878.80290919964|1.9005927553409973E12|2.521415910298648E10 |2025|2    |24 |
|2025-02-24 12:35:53.753|95866.85890549126|1.9005927553409973E12|2.258923