In [1]:
import pandas as pd
import numpy as np
from pyspark.sql import functions as F


In [2]:
# Create (or reuse) the Spark session that the read cells below rely on.
from pyspark.sql import SparkSession
# NOTE(review): this wildcard import rebinds Python builtins such as
# sum/min/max/round/abs to their Spark column counterparts; the `F` alias
# imported in the first cell is the safer way to reach them. Kept as-is in
# case later cells rely on the bare names.
from pyspark.sql.functions import *

spark = (
    SparkSession.builder
    .appName("Silver Data Exploration")
    # Driver / executor sizing.
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .config("spark.executor.cores", "4")
    # Partition counts for RDD operations and SQL shuffles.
    .config("spark.default.parallelism", "12")
    .config("spark.sql.shuffle.partitions", "12")
    # Unified memory manager fractions.
    .config("spark.memory.fraction", "0.8")
    .config("spark.memory.storageFraction", "0.3")
    # Upper bound on bytes packed into one partition when reading files.
    .config("spark.sql.files.maxPartitionBytes", "128m")
    # Adaptive query execution, including partition coalescing and skew-join handling.
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .config("spark.sql.adaptive.skewJoin.enabled", "true")
    # Dynamic executor allocation together with the external shuffle service.
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.shuffle.service.enabled", "true")
    # Do not wait for a data-local slot before scheduling a task.
    .config("spark.locality.wait", "0s")
    .getOrCreate()
)



In [3]:
# Load the bronze-layer Bitcoin parquet dataset with schema merging disabled,
# reduce it to at most two partitions, and preview up to 20 untruncated rows.
btc = (
    spark.read
    .option("mergeSchema", "false")
    .parquet("/app/data/bronze/bitcoin_data/")
    .coalesce(2)
)

btc.show(n=20, truncate=False)

+-------------+-----------------+---------------------+---------------------+----+-----+---+
|timestamp    |price            |market_cap           |total_volume         |year|month|day|
+-------------+-----------------+---------------------+---------------------+----+-----+---+
|1740445222876|91837.13011638554|1.8242628913402627E12|5.121322734400521E10 |2025|2    |25 |
|1740445569956|91735.29798186751|1.820919862713355E12 |4.7559140045575554E10|2025|2    |25 |
+-------------+-----------------+---------------------+---------------------+----+-----+---+



In [4]:
# Row count of the bronze DataFrame loaded above; `count()` is a Spark action,
# so evaluating this cell runs a job rather than just extending the plan.
btc.count()

2

In [5]:
# Load the silver-layer Bitcoin parquet dataset with schema merging disabled,
# reduce it to at most two partitions, and preview up to 20 untruncated rows.
btc_silver = (
    spark.read
    .option("mergeSchema", "false")
    .parquet("/app/data/silver/bitcoin_data/")
    .coalesce(2)
)

btc_silver.show(n=20, truncate=False)

+-------------+--------+-------------+------------+--------+----+-----+---+
|timestamp    |price   |market_cap   |total_volume|hour    |year|month|day|
+-------------+--------+-------------+------------+--------+----+-----+---+
|1740445222876|91837.13|1.82426285E12|5.1213226E10|01:00:22|2025|2    |25 |
|1740445569956|91735.3 |1.82091986E12|4.7559139E10|01:06:09|2025|2    |25 |
+-------------+--------+-------------+------------+--------+----+-----+---+



In [7]:
# Print the column names, types and nullability of the silver DataFrame.
# NOTE(review): price / market_cap / total_volume come back as 32-bit `float`
# here, while the bronze preview prints the same fields with more significant
# digits (e.g. market_cap 1.8242628913402627E12 vs 1.82426285E12) — confirm
# that this precision loss in the silver layer is intended.
btc_silver.printSchema()

root
 |-- timestamp: long (nullable = true)
 |-- price: float (nullable = true)
 |-- market_cap: float (nullable = true)
 |-- total_volume: float (nullable = true)
 |-- hour: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)

