In [None]:
# Imports and environment setup for the Spark session (PySpark is pointed at this kernel's Python interpreter)
from pyspark.sql import SparkSession
from pyspark.sql.types import DateType, IntegerType
from pyspark.sql import functions as F
from stocksx.configs.spark_config import SparkConfig
from stocksx.data_pipeline.sub_modules.spark_manager import SparkManager
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os
import sys
# Make PySpark launch its worker and driver Python processes with the same
# interpreter that runs this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F

# Iceberg-enabled configuration: tables live in the "raw_data" namespace and the
# warehouse path is given relative to the project.
spark_config = SparkConfig(
    iceberg_enabled=True,
    iceberg_namespace="raw_data",
    iceberg_warehouse="data/warehouse/iceberg",
)

# Build the manager, run its configuration self-check (prints a report), and
# expose the session it manages under the conventional name `spark`.
spark_manager = SparkManager(spark_config)
spark_manager.verify_configuration()
spark = spark_manager.session


=== Verifying Configuration ===
Testing Spark basic functionality...
Using package root: e:\projects\stocksx_price_and_news_influences
Iceberg warehouse: file:///e:/projects/stocksx_price_and_news_influences/data/warehouse/iceberg
Hive warehouse: file:///e:/projects/stocksx_price_and_news_influences/data/warehouse/hive
Found PostgreSQL JDBC driver at: e:\projects\stocksx_price_and_news_influences\stocksx\libs\postgresql-42.7.5.jar
Created Spark session with:
- Iceberg enabled: True
- Hive metastore: True
- Warehouse dir: file:///e:/projects/stocksx_price_and_news_influences/data/warehouse/iceberg
- Hive config dir: e:\projects\stocksx_price_and_news_influences\stocksx\configs
Basic Spark functionality: OK (1+1=2)

Initializing Hive metastore schema...
Initializing Hive metastore schema...
Hive metastore schema already exists.

Testing Iceberg functionality...
Iceberg functionality: OK

Testing Hive metastore...
Database result schema:
root
 |-- namespace: string (nullable = false)

Fo

In [None]:
# Catalogs registered with this Spark session
catalogs_df = spark.sql("SHOW CATALOGS")
catalogs_df.show()

# Namespaces (databases) inside the default `spark_catalog`
namespaces_df = spark.sql("SHOW NAMESPACES IN spark_catalog")
namespaces_df.show()

+-------------+
|      catalog|
+-------------+
|spark_catalog|
+-------------+

+---------+
|namespace|
+---------+
|  default|
| raw_data|
+---------+



In [None]:
# Stock prices come from the local Iceberg warehouse, addressed as
# catalog.namespace.table.
spark_df = spark.read.format("iceberg").table("spark_catalog.raw_data.stock_prices")

# Per-symbol metadata is kept in a CSV next to the warehouse.
metadata = pd.read_csv("../data/metadata/stock_updates_metadata/metadata.csv")

# A sector counts as missing when it is null or an empty string; collect the
# symbols of those rows so their sectors can be looked up.
sector_is_missing = metadata["sector"].isnull() | (metadata["sector"] == "")
missing_sector = metadata.loc[sector_is_missing, "symbol"].tolist()

In [None]:
from stocksx.utils.sector_fetcher import fetch_sectors_spark

# Look up sectors for the symbols collected in `missing_sector`, handing the
# helper the session owned by `spark_manager`.
# NOTE(review): the result is used downstream as a table with "symbol" and
# "sector" columns — confirm against fetch_sectors_spark.
sector_pd = fetch_sectors_spark(spark_manager.session, missing_sector)

In [None]:
# Sanity check: (rows, columns) of the fetched sector table
sector_pd.shape

(5961, 2)

In [None]:
# Replace the missing sectors in `metadata` with the fetched ones, matching rows
# by symbol rather than by position. Assigning a bare `.values` array is only
# correct when `sector_pd` has exactly one row per matching metadata row, in the
# same order; otherwise sectors land on the wrong symbols or pandas raises a
# length-mismatch error.
sector_by_symbol = (
    sector_pd
    .drop_duplicates(subset="symbol", keep="last")  # map() needs a unique index
    .set_index("symbol")["sector"]
)
fetched_sector = metadata["symbol"].map(sector_by_symbol)

# Only overwrite where the lookup actually produced a sector, so symbols that
# could not be resolved keep whatever value they already had.
has_fetched_sector = fetched_sector.notna()
metadata.loc[has_fetched_sector, "sector"] = fetched_sector[has_fetched_sector]

In [None]:
# Persist the metadata, now including the filled-in sectors, without the pandas
# index column. NOTE: this overwrites the same CSV the metadata was loaded from.
metadata.to_csv("../data/metadata/stock_updates_metadata/metadata.csv", index=False)