In [1]:
import pandas as pd
# Do not truncate long cell values when pandas frames are displayed
# (max_colwidth=None removes the column-width limit).
pd.set_option('display.max_colwidth', None)

In [2]:
import numpy as np
# Re-create the `np.bool` attribute as an alias of `np.bool_`.
# NOTE(review): presumably a compatibility shim for a dependency that still
# references the `np.bool` alias missing from the installed NumPy — confirm it
# is still required.
np.bool = np.bool_

In [3]:
# Current notebook name: the last path component of `__session__` without the
# ".ipynb" extension.
# NOTE(review): `__session__` is not defined in this notebook; presumably it is
# injected by the Jupyter kernel and holds the notebook path — confirm.
# The basename is extracted BEFORE stripping the extension so that the slice
# index always refers to the string being sliced (a directory such as
# ".ipynb_checkpoints" in the path would otherwise shift the index).
notebook_name = __session__[__session__.rfind('/')+1:].replace('.ipynb','')

In [4]:
# HDFS base paths used by the rest of the notebook.
# NOTE(review): the lakehouse path ends with "/" while the warehouse path does
# not; cells that build f"{hdfs_lakehouse_base_path}/..." therefore produce a
# double slash ("lakehouse//...").
hdfs_lakehouse_base_path = 'hdfs://localhost:9000/lakehouse/'
hdfs_warehouse_base_path = 'hdfs://localhost:9000/warehouse'

In [5]:
import os

# Maven coordinates handed to spark-submit through `--packages`.
dependencies = [
    "org.apache.spark:spark-avro_2.12:3.5.0",
    "io.delta:delta-iceberg_2.12:3.0.0",
]
packages_csv = ",".join(dependencies)

# Both variables must be set before the SparkSession is created further down.
os.environ.update({
    'PYSPARK_SUBMIT_ARGS': f"--packages {packages_csv} pyspark-shell",
    'PYARROW_IGNORE_TIMEZONE': 'true',
})

In [6]:
from pyspark.sql.session import SparkSession

# Session settings: quiet logging, the HDFS warehouse location, and the Delta
# Lake SQL extension / catalog implementation.
spark_conf = {
    "spark.log.level": "ERROR",
    "spark.sql.warehouse.dir": hdfs_warehouse_base_path,
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

# The application is named after the notebook; each setting is applied in turn.
session_builder = SparkSession.builder.appName(notebook_name)
for conf_key, conf_value in spark_conf.items():
    session_builder = session_builder.config(conf_key, conf_value)

# Hive support is enabled so that schemas/tables are registered in the metastore.
spark = session_builder.enableHiveSupport().getOrCreate()

25/03/18 00:33:34 WARN Utils: Your hostname, osbdet resolves to a loopback address: 127.0.0.1; using 10.0.2.15 instead (on interface enp0s1)
25/03/18 00:33:34 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /home/osbdet/.ivy2/cache
The jars for the packages stored in: /home/osbdet/.ivy2/jars
org.apache.spark#spark-avro_2.12 added as a dependency
io.delta#delta-iceberg_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-01509170-bfee-46a7-b47f-5ed63154b85f;1.0
	confs: [default]


:: loading settings :: url = jar:file:/home/osbdet/.jupyter_venv/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.spark#spark-avro_2.12;3.5.0 in central
	found org.tukaani#xz;1.9 in central
	found io.delta#delta-iceberg_2.12;3.0.0 in central
	found io.delta#delta-spark_2.12;3.0.0 in central
	found io.delta#delta-storage;3.0.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.scala-lang.modules#scala-collection-compat_2.12;2.1.1 in central
	found com.github.ben-manes.caffeine#caffeine;2.9.3 in central
	found org.checkerframework#checker-qual;3.19.0 in central
	found com.google.errorprone#error_prone_annotations;2.10.0 in central
:: resolution report :: resolve 191ms :: artifacts dl 6ms
	:: modules in use:
	com.github.ben-manes.caffeine#caffeine;2.9.3 from central in [default]
	com.google.errorprone#error_prone_annotations;2.10.0 from central in [default]
	io.delta#delta-iceberg_2.12;3.0.0 from central in [default]
	io.delta#delta-spark_2.12;3.0.0 from central in [default]
	io.delta#delta-storage;3.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 f

In [None]:
# Root of the bronze-layer data for this project. The base path is stripped of
# any trailing "/" first so the resulting URI never contains "//".
base_path = f"{hdfs_lakehouse_base_path.rstrip('/')}/bronze/trump_btc"

# Sub-folders of the bronze root, one per source dataset.
folder_names = ["BTC", "Trump", "fear_greed_index"]

# Maps folder name -> Spark DataFrame read from that folder.
dataframes = {}

for folder_name in folder_names:
    # These are HDFS URIs, not local filesystem paths, so they are joined with
    # a plain "/" instead of the OS-dependent os.path.join.
    file_path = f"{base_path}/{folder_name}"

    # Read the CSV data with a header row; column types are inferred by Spark.
    dataframes[folder_name] = (spark.read
                                        .option("header", "true")
                                        .option("inferSchema", "true")
                                        .csv(file_path))
    print(f"Loaded DataFrame for {folder_name}_raw")

# Convenience handles for the individual raw DataFrames.
btc_df = dataframes["BTC"]
trump_df = dataframes["Trump"]
FG_df = dataframes["fear_greed_index"]

In [None]:
# Preview the raw BTC frame.
btc_df.show()

In [None]:
# Preview the first 5 rows of the raw Trump frame as a pandas DataFrame.
# The limit is applied in Spark so that only those rows are collected to the
# driver, instead of converting the whole dataset and then taking head().
trump_df.limit(5).toPandas()

In [None]:
# Preview the raw fear & greed index frame.
FG_df.show()

In [None]:
# Remove the "BTC_Closing" column from the fear & greed frame.
FG_df = FG_df.drop("BTC_Closing")

In [None]:
from pyspark.sql import functions as F
# NOTE(review): this import rebinds `sum` to the Spark column function, hiding
# Python's built-in sum() for every later cell; neither `col` nor `sum` is used
# in this cell.
from pyspark.sql.functions import col, sum

# Keep only the rows that have a Value_Classification, then preview the result.
FG_df=FG_df.filter(F.col("Value_Classification").isNotNull())
FG_df.show()

In [None]:
# Sanity check: list any rows whose Value_Classification is still null.
FG_df.filter(F.col("Value_Classification").isNull()).show()

In [None]:
from pyspark.sql.functions import col, from_unixtime, date_format

# Standardise the BTC frame in three steps: rename "Timestamp" to "date",
# render the epoch value as a "yyyy-MM-dd HH:mm:ss" string, and discard rows
# with missing values.
# The epoch column is passed to from_unixtime() unscaled (i.e. it is treated as
# seconds); a millisecond source would have to be divided by 1000 first.
btc_renamed = btc_df.withColumnRenamed("Timestamp", "date")
btc_dated = btc_renamed.withColumn(
    "date",
    date_format(from_unixtime(col("date")), "yyyy-MM-dd HH:mm:ss"),
)
btc_df_std = btc_dated.dropna()  # drop every row that contains a null



In [None]:
# Preview the first 5 rows of the standardised BTC frame.
btc_df_std.show(5)

In [None]:
# Inspect the last 5 rows of the standardised BTC frame (tail() collects them
# to the driver and returns them as a list of Row objects).
btc_df_std.tail(5)

In [None]:
# Start from a clean slate: drop the trump_btc schema and every table in it.
# IF EXISTS keeps this cell from failing on a fresh metastore where the schema
# has not been created yet (e.g. the first run of the notebook).
spark.sql("DROP SCHEMA IF EXISTS trump_btc CASCADE")

In [None]:
# (Re)create the schema that holds the silver tables written below.
spark.sql("CREATE SCHEMA IF NOT EXISTS trump_btc")

In [None]:
# Save the standardised BTC data to the silver layer: written in Delta format
# to the explicit HDFS path and registered as table trump_btc.BTC, replacing
# any existing contents.
(btc_df_std.write
            .format("delta")
            .mode("overwrite")
            .option("path",f"{hdfs_lakehouse_base_path}/silver/trump_btc/BTC/")
            .saveAsTable("trump_btc.BTC")
)


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_unixtime, date_format,regexp_replace
from pyspark.sql.types import StringType, BooleanType, DoubleType, LongType, TimestampType
import re

# Standardise the Trump posts:
#   1. cast "date" to a 64-bit integer,
#   2. divide it by 1000 before from_unixtime() (which expects seconds) and
#      render it as "yyyy-MM-dd HH:mm:ss",
#   3. strip anything that looks like an HTML tag from "text".
# The chain starts from the raw frame in `dataframes["Trump"]` rather than from
# `trump_df` itself, so re-running this cell yields the same result instead of
# casting an already formatted date string to bigint.
trump_df = (
    dataframes["Trump"]
    .withColumn("date", col("date").cast("bigInt"))
    .withColumn("date", date_format(from_unixtime(col("date") / 1000), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("text", regexp_replace(col("text"), "<[^>]+>", ""))
)


In [None]:
# Expose the transformed Trump frame under the "_std" name used by the write
# cell below, then preview it.
trump_df_std=trump_df
trump_df_std.show()

In [None]:
# Save the standardised Trump data to the silver layer: written in Delta format
# to the explicit HDFS path and registered as table trump_btc.trump, replacing
# any existing contents.
(trump_df_std.write
            .format("delta")
            .mode("overwrite")
            .option("path",f"{hdfs_lakehouse_base_path}/silver/trump_btc/trump/")
            .saveAsTable("trump_btc.trump")
)


In [None]:
# Save the filtered fear & greed index data to the silver layer: written in
# Delta format to the explicit HDFS path and registered as table
# trump_btc.FG_df, replacing any existing contents.
(FG_df.write
            .format("delta")
            .mode("overwrite")
            .option("path",f"{hdfs_lakehouse_base_path}/silver/trump_btc/fear_greed_index/")
            .saveAsTable("trump_btc.FG_df")
)

In [None]:
%%sparksql
-- Spot-check the silver trump table; there is no ORDER BY, so which 10 rows
-- are returned is not guaranteed.
SELECT *
FROM trump_btc.trump
LIMIT 10