In [87]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

In [88]:
import numpy as np
np.bool = np.bool_

In [89]:
#current notebook name
notebook_name = __session__.replace('.ipynb','')[__session__.rfind('/')+1:] 

In [90]:
# HDFS base paths
hdfs_lakehouse_base_path = 'hdfs://localhost:9000/lakehouse/'
hdfs_warehouse_base_path = 'hdfs://localhost:9000/warehouse'

In [91]:
import os
dependencies = ["org.apache.spark:spark-avro_2.12:3.5.0",
                "io.delta:delta-iceberg_2.12:3.0.0"]
os.environ['PYSPARK_SUBMIT_ARGS']= f"--packages {','.join(dependencies)} pyspark-shell"
os.environ['PYARROW_IGNORE_TIMEZONE'] = 'true'

In [92]:
from pyspark.sql.session import SparkSession

spark = (SparkSession.builder
    .appName(notebook_name)
    .config("spark.log.level","ERROR")
    .config("spark.sql.warehouse.dir",hdfs_warehouse_base_path)
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .enableHiveSupport()
    .getOrCreate()
)

In [93]:
base_path = f"{hdfs_lakehouse_base_path}/bronze/trump_btc"

# List of folder names (from the structure provided)
folder_names = ["BTC", "Trump", "fear_greed_index"]

# Dictionary to store DataFrames
dataframes = {}

# Loop through each folder and read the CSV file inside it
for folder_name in folder_names:
    # Construct the file path
    file_path = os.path.join(base_path, folder_name)
    
    # Read the CSV file into a DataFrame
    dataframes[folder_name] = (spark.read
                                        .option("header", "true")
                                        .option("inferSchema", "true")
                                        .csv(file_path))
    print(f"Loaded DataFrame for {folder_name}_raw")

# Example: Accessing one of the DataFrames
btc_df = dataframes["BTC"]
trump_df=dataframes["Trump"]
FG_df=dataframes["fear_greed_index"]



Loaded DataFrame for BTC_raw
Loaded DataFrame for Trump_raw
Loaded DataFrame for fear_greed_index_raw


                                                                                

In [94]:
btc_df.show()

+------------+----+----+----+-----+------+
|   Timestamp|Open|High| Low|Close|Volume|
+------------+----+----+----+-----+------+
|1.32541206E9|4.58|4.58|4.58| 4.58|   0.0|
|1.32541212E9|4.58|4.58|4.58| 4.58|   0.0|
|1.32541218E9|4.58|4.58|4.58| 4.58|   0.0|
|1.32541224E9|4.58|4.58|4.58| 4.58|   0.0|
| 1.3254123E9|4.58|4.58|4.58| 4.58|   0.0|
|1.32541236E9|4.58|4.58|4.58| 4.58|   0.0|
|1.32541242E9|4.58|4.58|4.58| 4.58|   0.0|
|1.32541248E9|4.58|4.58|4.58| 4.58|   0.0|
|1.32541254E9|4.58|4.58|4.58| 4.58|   0.0|
| 1.3254126E9|4.58|4.58|4.58| 4.58|   0.0|
|1.32541266E9|4.58|4.58|4.58| 4.58|   0.0|
|1.32541272E9|4.58|4.58|4.58| 4.58|   0.0|
|1.32541278E9|4.58|4.58|4.58| 4.58|   0.0|
|1.32541284E9|4.58|4.58|4.58| 4.58|   0.0|
| 1.3254129E9|4.58|4.58|4.58| 4.58|   0.0|
|1.32541296E9|4.58|4.58|4.58| 4.58|   0.0|
|1.32541302E9|4.58|4.58|4.58| 4.58|   0.0|
|1.32541308E9|4.58|4.58|4.58| 4.58|   0.0|
|1.32541314E9|4.58|4.58|4.58| 4.58|   0.0|
| 1.3254132E9|4.58|4.58|4.58| 4.58|   0.0|
+----------

In [95]:
trump_df.toPandas().head()

Unnamed: 0,date,favorites,id,isRetweet,retweets,text
0,1730689722303,0,113422481640840196,True,0,"""<p>RT <span class=""""h-card""""><a href=""""https://truthsocial.com/@TeamTrump"""" class=""""u-url mention"""">@<span>TeamTrump</span></a></span>PRESIDENT TRUMP: I’d like to begin by asking a question. Are you better off now than you were 4 years ago?</p><p>With your vote on Tuesday"
1,1730676754118,12350,113421631757789522,False,3124,"""<p>KAMALA BROKE IT, BUT I WILL FIX IT! With your VOTE, inflation will END. The border will be SAFE &amp; SECURE. We will have PEACE across the globe. WITH YOUR VOTE, AMERICA WILL ENTER A NEW GOLDEN AGE! Vote TRUMP to MAKE AMERICA GREAT AGAIN! <a href=""""https://links.truthsocial.com/link/112559845980056031"""" rel=""""nofollow noopener noreferrer"""" target=""""_blank""""><span class=""""invisible"""">https://</span><span class="""""""">swampthevoteusa.com/</span><span class=""""invisible""""></span></a></p>"""
2,1730657728247,0,113420384878459920,True,0,"""<p>RT <span class=""""h-card""""><a href=""""https://truthsocial.com/@TeamTrump"""" class=""""u-url mention"""">@<span>TeamTrump</span></a></span>PRESIDENT TRUMP: IN CONCLUSION"
3,1730611542332,17960,113417358038328332,False,5386,"<p>If Kamala wins, you are 3 days away from the start of a 1929-style economic depression. If I win, you are 3 days away from the best jobs, the biggest paychecks, and the brightest economic future the world has ever seen.</p><p>Kamala’s inflation nightmare has cost the typical American family $30,000 dollars in higher prices—and now, she wants to impose the largest tax hike in American history, and raise your taxes by $3,000 dollars a year. </p><p>I will massively cut taxes for workers and small businesses—and we will have NO TAX ON TIPS, NO TAX ON OVERTIME, and NO TAX ON SOCIAL SECURITY benefits!</p>"
4,1730509334162,8165,113410659723667193,False,1919,"<p>Patrick Morrisey, of West Virginia, has been a wonderful Attorney General, and will now be an incredible Governor! He will fight tirelessly to Grow the Economy, Stop Inflation, Secure our Border, Restore American Energy DOMINANCE, Strengthen our Military/Vets, and Protect our always under siege Second Amendment. Patrick will work closely with me to enact our America First Agenda, and is a FANTASTIC person to replace my friend and future U.S. Senator, Jim Justice. Patrick Morrisey is my friend, and has my Complete and Total Endorsement - HE WILL NOT LET YOU DOWN!</p>"


In [96]:
FG_df.show()

+----------+-----+--------------------+----------------+-----------+
|      Date|Value|Value_Classification|     BTC_Closing| BTC_Volume|
+----------+-----+--------------------+----------------+-----------+
|2018-02-01| 30.0|                Fear| 9170.5400390625| 9959400448|
|2018-02-02| 15.0|        Extreme Fear|         8830.75|12726899712|
|2018-02-03| 40.0|                Fear|   9174.91015625| 7263790080|
|2018-02-04| 24.0|        Extreme Fear|  8277.009765625| 7073549824|
|2018-02-05| 11.0|        Extreme Fear|6955.27001953125| 9285289984|
|2018-02-06|  8.0|        Extreme Fear|          7754.0|13999800320|
|2018-02-07| 36.0|                Fear| 7621.2998046875| 9169280000|
|2018-02-08| 30.0|                Fear|   8265.58984375| 9346750464|
|2018-02-09| 44.0|                Fear|   8736.98046875| 6784820224|
|2018-02-10| 54.0|             Neutral|  8621.900390625| 7780960256|
|2018-02-11| 31.0|                Fear|8129.97021484375| 6122189824|
|2018-02-12| 42.0|                

In [97]:
FG_df = FG_df.drop("BTC_Closing")

In [98]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col, sum

FG_df=FG_df.filter(F.col("Value_Classification").isNotNull())
FG_df.show()

+----------+-----+--------------------+-----------+
|      Date|Value|Value_Classification| BTC_Volume|
+----------+-----+--------------------+-----------+
|2018-02-01| 30.0|                Fear| 9959400448|
|2018-02-02| 15.0|        Extreme Fear|12726899712|
|2018-02-03| 40.0|                Fear| 7263790080|
|2018-02-04| 24.0|        Extreme Fear| 7073549824|
|2018-02-05| 11.0|        Extreme Fear| 9285289984|
|2018-02-06|  8.0|        Extreme Fear|13999800320|
|2018-02-07| 36.0|                Fear| 9169280000|
|2018-02-08| 30.0|                Fear| 9346750464|
|2018-02-09| 44.0|                Fear| 6784820224|
|2018-02-10| 54.0|             Neutral| 7780960256|
|2018-02-11| 31.0|                Fear| 6122189824|
|2018-02-12| 42.0|                Fear| 6256439808|
|2018-02-13| 35.0|                Fear| 5696719872|
|2018-02-14| 55.0|               Greed| 7909819904|
|2018-02-15| 71.0|               Greed| 9062540288|
|2018-02-16| 67.0|               Greed| 7296159744|
|2018-02-17|

In [101]:
FG_df.filter(F.col("Value_Classification").isNull()).show()

+----+-----+--------------------+----------+
|Date|Value|Value_Classification|BTC_Volume|
+----+-----+--------------------+----------+
+----+-----+--------------------+----------+



In [86]:
from pyspark.sql.functions import col, from_unixtime, date_format

# If the timestamps are in milliseconds, divide by 1000
btc_df_std = btc_df.withColumnRenamed("Timestamp", "date") \
                   .withColumn("date", date_format(from_unixtime(col("date")), "yyyy-MM-dd HH:mm:ss")) \
                   .dropna()  # Remove missing values



In [75]:
btc_df_std.show(5)

+-------------------+----+----+----+-----+------+
|               date|Open|High| Low|Close|Volume|
+-------------------+----+----+----+-----+------+
|2012-01-01 11:01:00|4.58|4.58|4.58| 4.58|   0.0|
|2012-01-01 11:02:00|4.58|4.58|4.58| 4.58|   0.0|
|2012-01-01 11:03:00|4.58|4.58|4.58| 4.58|   0.0|
|2012-01-01 11:04:00|4.58|4.58|4.58| 4.58|   0.0|
|2012-01-01 11:05:00|4.58|4.58|4.58| 4.58|   0.0|
+-------------------+----+----+----+-----+------+
only showing top 5 rows



In [76]:
btc_df_std.tail(5)

                                                                                

[Row(date='2025-03-01 00:56:00', Open=84386.0, High=84386.0, Low=84373.0, Close=84386.0, Volume=0.12721259),
 Row(date='2025-03-01 00:57:00', Open=84387.0, High=84387.0, Low=84348.0, Close=84348.0, Volume=0.03039574),
 Row(date='2025-03-01 00:58:00', Open=84347.0, High=84372.0, Low=84347.0, Close=84372.0, Volume=0.17704815),
 Row(date='2025-03-01 00:59:00', Open=84369.0, High=84369.0, Low=84316.0, Close=84321.0, Volume=0.07598197),
 Row(date='2025-03-01 01:00:00', Open=84316.0, High=84325.0, Low=84288.0, Close=84288.0, Volume=0.07470067)]

In [77]:
#spark.sql("DROP SCHEMA trump_btc CASCADE")

In [78]:
spark.sql("CREATE SCHEMA IF NOT EXISTS trump_btc")

DataFrame[]

In [79]:
#saving btc data it in silver
(btc_df_std.write
            .format("delta")
            .mode("overwrite")
            .option("path",f"{hdfs_lakehouse_base_path}/silver/trump_btc/BTC/")
            .saveAsTable("trump_btc.BTC")
)


                                                                                

In [80]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_unixtime, date_format,regexp_replace
from pyspark.sql.types import StringType, BooleanType, DoubleType, LongType, TimestampType
import re

trump_df = trump_df.withColumn("date", col("date").cast("bigInt"))
trump_df = trump_df.withColumn("date", date_format(from_unixtime(col("date") / 1000), "yyyy-MM-dd HH:mm:ss"))
trump_df = trump_df.withColumn("text", regexp_replace(col("text"), "<[^>]+>", ""))


In [81]:
trump_df_std=trump_df
trump_df_std.show()

+-------------------+---------+------------------+---------+--------+--------------------+
|               date|favorites|                id|isRetweet|retweets|                text|
+-------------------+---------+------------------+---------+--------+--------------------+
|2024-11-04 04:08:42|        0|113422481640840196|     true|       0|"RT @TeamTrumpPRE...|
|2024-11-04 00:32:34|    12350|113421631757789522|    false|    3124|"KAMALA BROKE IT,...|
|2024-11-03 19:15:28|        0|113420384878459920|     true|       0|"RT @TeamTrumpPRE...|
|2024-11-03 06:25:42|    17960|113417358038328332|    false|    5386|If Kamala wins, y...|
|2024-11-02 02:02:14|     8165|113410659723667193|    false|    1919|Patrick Morrisey,...|
|2024-10-31 18:28:48|    16204|113403214486581264|    false|    4930|Bill Clinton just...|
|2024-10-31 17:26:14|     7439|113402968449781376|    false|    1988|Congressman Marc ...|
|2024-10-31 17:16:04|     8601|113402928440845044|    false|    2282|"I would like to ...|

In [82]:
#saving btc data it in silver
(trump_df_std.write
            .format("delta")
            .mode("overwrite")
            .option("path",f"{hdfs_lakehouse_base_path}/silver/trump_btc/trump/")
            .saveAsTable("trump_btc.trump")
)


In [104]:
#saving btc data it in silver
(FG_df.write
            .format("delta")
            .mode("overwrite")
            .option("path",f"{hdfs_lakehouse_base_path}/silver/trump_btc/fear_greed_index/")
            .saveAsTable("trump_btc.FG_df")
)

In [None]:
%%sparksql
SELECT *
FROM trump_btc.trump
LIMIT 10