In [1]:
# Init spark session to read data from parquet files
from pyspark.sql import SparkSession
from pyspark.sql.types import DateType, IntegerType
from pyspark.sql import functions as F
from stocksx.configs.spark_config import SparkConfig
from stocksx.data_pipeline.sub_modules.spark_manager import SparkManager


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F

# Build the Spark session through the project's SparkManager, with Iceberg
# support switched on and pointed at the "raw_data" namespace.
spark_config = SparkConfig(
    iceberg_enabled=True,
    iceberg_namespace="raw_data",
    # NOTE(review): relative path - presumably resolved against the kernel's
    # working directory; confirm against SparkConfig/SparkManager.
    iceberg_warehouse="stocksx/data/iceberg_warehouse",
)
spark_manager = SparkManager(spark_config)
spark = spark_manager.session  # session handle used by the cells below

In [4]:
# Inspect what the session can see: first every available catalog, then the
# namespaces registered under spark_catalog.
for statement in ("SHOW CATALOGS", "SHOW NAMESPACES IN spark_catalog"):
    spark.sql(statement).show()

+-------------+
|      catalog|
+-------------+
|spark_catalog|
+-------------+

+---------+
|namespace|
+---------+
|    local|
| raw_data|
+---------+



In [8]:
# Make sure the raw_data namespace exists under spark_catalog. The
# IF NOT EXISTS clause keeps this cell safe to re-run; any failure is
# reported rather than raised so the notebook keeps going.
create_namespace_sql = "CREATE NAMESPACE IF NOT EXISTS spark_catalog.raw_data"
try:
    spark.sql(create_namespace_sql)
    print("Successfully created spark_catalog.raw_data namespace")
except Exception as err:
    print(f"Error with spark_catalog: {err}")

Successfully created spark_catalog.raw_data namespace


In [None]:
# Sanity check for a few symbols in the iceberg table:
#   1. load the table and keep only the symbols of interest,
#   2. report the row count and per-column null counts,
#   3. show a few rows per symbol,
#   4. print basic price statistics when a 'price' column is present.

from pyspark.sql.functions import col

symbols_to_check = ["AAPL", "MSFT", "GOOGL"]

# Load data from the local Iceberg warehouse
# Using the catalog.database.table format
sample_df = spark.read.format("iceberg").table("spark_catalog.raw_data.stock_prices")

# Alternatively, the table can be loaded by path with
# spark.read.format("iceberg").load(<path to the table inside the warehouse>).

# Filter for specific symbols
sample_df = sample_df.filter(col("symbol").isin(symbols_to_check))

# Display basic info about the data
print(f"Total records for symbols {symbols_to_check}: {sample_df.count()}")

# Check for missing values in important columns.
# All null counts are computed in a single aggregation (one Spark job)
# instead of running a separate filter+count job per column.
print("\nMissing values in key columns:")
null_counts_row = sample_df.select(
    [F.count(F.when(col(column).isNull(), 1)).alias(column) for column in sample_df.columns]
).first()
for column in sample_df.columns:
    null_count = null_counts_row[column]
    if null_count > 0:
        print(f"- {column}: {null_count} nulls")

# Show sample data for each symbol
print("\nSample data for each symbol:")
for symbol in symbols_to_check:
    print(f"\n--- {symbol} ---")
    sample_df.filter(col("symbol") == symbol).show(5, truncate=False)

# Basic statistics for numeric columns (assuming 'price' column exists)
if "price" in sample_df.columns:
    print("\nPrice statistics by symbol:")
    # Use the F-qualified Spark aggregates: bare min/max would resolve to the
    # Python builtins (operating on the string "price"), and avg/count are not
    # imported anywhere in this notebook.
    sample_df.groupBy("symbol").agg(
        F.min("price").alias("min_price"),
        F.max("price").alias("max_price"),
        F.avg("price").alias("avg_price"),
        F.count("price").alias("price_count")
    ).show()

AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `spark_catalog`.`local`.`raw_data`.`stock_prices` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.;
'UnresolvedRelation [spark_catalog, local, raw_data, stock_prices], [], false
