In [1]:
# Init spark session to read data from parquet files
from pyspark.sql import SparkSession
from pyspark.sql.types import DateType, IntegerType
from pyspark.sql import functions as F
from stocksx.configs.spark_config import SparkConfig
from stocksx.data_pipeline.sub_modules.spark_manager import SparkManager
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os
import sys
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F

spark_config = SparkConfig(iceberg_enabled=True, iceberg_namespace = "raw_data", 
                           iceberg_warehouse="data/warehouse/iceberg")
spark_manager = SparkManager(spark_config)
spark_manager.verify_configuration()
spark = spark_manager.session


=== Verifying Configuration ===
Testing Spark basic functionality...
Using package root: e:\projects\stocksx_price_and_news_influences
Iceberg warehouse: file:///e:/projects/stocksx_price_and_news_influences/data/warehouse/iceberg
Hive warehouse: file:///e:/projects/stocksx_price_and_news_influences/data/warehouse/hive
Found PostgreSQL JDBC driver at: e:\projects\stocksx_price_and_news_influences\stocksx\libs\postgresql-42.7.5.jar
Created Spark session with:
- Iceberg enabled: True
- Hive metastore: True
- Warehouse dir: file:///e:/projects/stocksx_price_and_news_influences/data/warehouse/iceberg
- Hive config dir: e:\projects\stocksx_price_and_news_influences\stocksx\configs
Basic Spark functionality: OK (1+1=2)

Initializing Hive metastore schema...
Initializing Hive metastore schema...
Hive metastore schema already exists.

Testing Iceberg functionality...
Iceberg functionality: OK

Testing Hive metastore...
Database result schema:
root
 |-- namespace: string (nullable = false)

Fo

In [3]:
# List all available catalogs
spark.sql("SHOW CATALOGS").show()

# List all namespaces in the local catalog
spark.sql("SHOW NAMESPACES IN spark_catalog").show()

# List all tables in the specified namespace
spark.sql("SHOW TABLES IN raw_data").show()

+-------------+
|      catalog|
+-------------+
|spark_catalog|
+-------------+

+---------+
|namespace|
+---------+
|  default|
| raw_data|
+---------+

+---------+------------+-----------+
|namespace|   tableName|isTemporary|
+---------+------------+-----------+
| raw_data|stock_prices|      false|
+---------+------------+-----------+



In [4]:
current_directory = os.getcwd()

home_directory = os.path.dirname(os.path.dirname(current_directory))

metadata_path = os.path.join(home_directory, "data", "metadata", "stock_updates_metadata", "metadata.csv")

In [5]:
metadata_df = pd.read_csv(metadata_path)

In [6]:
technology_symbols = metadata_df[metadata_df['sector'] == 'Technology']['symbol'].tolist()
len(technology_symbols)

758

In [7]:
stocks_df = spark.read.table("raw_data.stock_prices")

In [8]:
from pyspark.sql.functions import broadcast

# Create empty Spark DataFrame to hold the results
technology_stocks_df = spark.createDataFrame([], stocks_df.schema)

for i in range(0, len(technology_symbols), 100):
    symbols_batch = technology_symbols[i:i + 100]
    print(f"Processing batch: {symbols_batch}")
    
    # Create a DataFrame from the list of symbols
    symbols_batch_df = spark.createDataFrame(symbols_batch, "string").toDF("symbol")
    
    technology_stocks_df = stocks_df.join(broadcast(symbols_batch_df), "symbol")

technology_stocks_df.cache()
    

Processing batch: ['AAOI', 'AAPL', 'ACIW', 'ACLS', 'ACMR', 'ACN', 'ADBE', 'ADEA', 'ADI', 'ADP', 'ADSK', 'ADTN', 'AEHR', 'AEVA', 'AEYE', 'AFRM', 'AGMH', 'AGYS', 'AI', 'AIFF', 'AIOT', 'AIP', 'AIRG', 'AISP', 'AIXI', 'AKAM', 'ALAB', 'ALAR', 'ALGM', 'ALIT', 'ALKT', 'ALLT', 'ALNT', 'ALOT', 'ALRM', 'ALTS', 'AMAT', 'AMBA', 'AMD', 'AMKR', 'AMOD', 'AMPG', 'AMPGW', 'AMPL', 'AMST', 'ANET', 'ANSS', 'AOSL', 'APCX', 'APH', 'API', 'APLD', 'APP', 'APPF', 'APPN', 'APPS', 'ARBB', 'ARBE', 'ARM', 'ARQQ', 'ARQQW', 'ARRY', 'ARW', 'ASAN', 'ASGN', 'ASML', 'ASNS', 'ASTC', 'ASTI', 'ASTS', 'ASUR', 'ASX', 'ASYS', 'ATCH', 'ATEN', 'ATGL', 'ATOM', 'AUDC', 'AUID', 'AUR', 'AUUD', 'AVDX', 'AVGO', 'AVNW', 'AVPT', 'AVT', 'AWRE', 'AXIL', 'AXTI', 'AZ', 'BAND', 'BASE', 'BB', 'BBAI', 'BDC', 'BEEM', 'BELFA', 'BELFB', 'BHE', 'BIGC']
Processing batch: ['BILL', 'BKKT', 'BKTI', 'BL', 'BLBX', 'BLIN', 'BLKB', 'BLND', 'BLZE', 'BMI', 'BMR', 'BNAI', 'BNZI', 'BOSC', 'BOX', 'BOXL', 'BR', 'BRZE', 'BSY', 'BTCM', 'BTCT', 'BTDR', 'BZAI', 'CA

DataFrame[symbol: string, trade_date: date, open: double, high: double, low: double, close: double, volume: bigint]