# Market Data Snowpark ML Workflow

### Pre-Run Activity
- Tap the three-dot icon at the top left and enable the necessary 'External Access' integrations for the notebook.
- Update 'market_config.json' with the required settings before starting the run.

## Runtime Config Set-Up

In [None]:
# Path of the JSON file holding the run configuration; it is passed to get_config().
GLOBAL_CONFIG_PATH = "market_config.json"

### Importing Libraries

In [None]:
from snowflake.snowpark.functions import col
from snowflake.ml.modeling.xgboost import XGBRegressor
from snowflake.ml.modeling.metrics import mean_absolute_error, mean_squared_error
import pandas as pd
from snowflake.snowpark import DataFrame as SnowparkDF

In [None]:
import json
def get_config(CONFIG_PATH):
    """Load a JSON configuration file and return its parsed content.

    Args:
        CONFIG_PATH: Path of the JSON file to read.

    Returns:
        The deserialized JSON document (whatever ``json.load`` produces for
        the file's top-level value).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; pin the encoding so decoding does not
    # depend on the platform's locale default.
    with open(CONFIG_PATH, encoding="utf-8") as f:
        return json.load(f)

def get_config_snowflake():
    """Return the ``"snowflake"`` section of the file at GLOBAL_CONFIG_PATH.

    The configuration file is re-read on every call.

    Raises:
        KeyError: If the configuration has no ``"snowflake"`` entry.
    """
    return get_config(GLOBAL_CONFIG_PATH)["snowflake"]
    
# Full configuration, kept around for interactive debugging.
config = get_config(GLOBAL_CONFIG_PATH)  # DEBUG
# Snowflake connection section used to open the session.
config_snowflake = get_config_snowflake()
# Echo the settings as the cell output, but mask the password so the
# credential is not written into the notebook's saved output.
{key: ("********" if key == "password" else value) for key, value in config_snowflake.items()}

In [None]:
# from snowflake.snowpark.session import Session
from snowflake.snowpark import Session
from snowflake.snowpark.functions import col

# Connection settings come from the "snowflake" section of the config file;
# only the session schema is fixed here (ANALYTICS).
connection_params = {
    "user": config_snowflake["user"],
    "password": config_snowflake["password"],
    "account": config_snowflake["account"],
    # Optional settings, currently disabled:
    # "authenticator": "externalbrowser",
    # "role": "ACCOUNTADMIN",
    "warehouse": config_snowflake["warehouse"],
    "database": config_snowflake["database"],
    "schema": "ANALYTICS",
}

# Open the Snowpark session used by every cell below.
session = Session.builder.configs(connection_params).create()


## Main Code

In [None]:
# features_df = session.table("STAGING.VW_MODEL_READY_FEATURES") \
#     .filter(col("next_day_close").is_not_null()) \
#     .to_pandas()

# features_df.head(5)

In [None]:
# # Select features and target
# X = features_df.drop(columns=["NEXT_DAY_CLOSE", "NEXT_DAY_RETURN", "PRICE_DIRECTION", "FEATURE_ID", "DATE", "SYMBOL", "ENTITY_NAME"])
# y = features_df["NEXT_DAY_CLOSE"]

# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)


In [None]:
# train_df = pd.concat([X_train.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)
# train_snowpark = session.create_dataframe(train_df)

# feature_cols = X_train.columns.tolist()
# target_col = "NEXT_DAY_CLOSE"

In [None]:
# Version label used both when registering the model and when loading it back from the registry.
GLOBAL_VERSION_NAME = "v11"

In [None]:
from snowflake.ml.modeling.xgboost import XGBRegressor
from snowflake.ml.model import task, type_hints
from snowflake.ml.registry import Registry
from snowflake.snowpark.functions import col
import pandas as pd

# 1️⃣ Load data: pull every row that has a label out of the feature view
features_table = session.table("STAGING.VW_MODEL_READY_FEATURES")
# Optional single-symbol filter: .filter(col("SYMBOL") == "ORCL")
df = features_table.filter(col("NEXT_DAY_CLOSE").is_not_null()).to_pandas()

# 2️⃣ Define columns
# Active feature set. Other candidate columns that are currently disabled:
#   returns / price movement : DAILY_RETURN, LOG_RETURN, PRICE_RANGE_PCT
#   moving averages          : MA_5, MA_10, MA_20, MA_50, EMA_12, EMA_26
#   momentum                 : RSI_14, MACD, MACD_SIGNAL, MACD_HISTOGRAM
#   volatility               : VOLATILITY_10D, VOLATILITY_20D, ATR_14
#   volume                   : VOLUME_MA_5, VOLUME_MA_20, VOLUME_RATIO
#   lag / rolling            : CLOSE_LAG_3, CLOSE_LAG_5, ROLLING_MAX_20, ROLLING_MIN_20,
#                              DISTANCE_FROM_HIGH_20, DISTANCE_FROM_LOW_20
input_cols = [
    "OPEN",
    "HIGH",
    "LOW",
    "CLOSE",
    "VOLUME",
    "CLOSE_LAG_1",
    "CLOSE_LAG_2",
    "VOLATILITY_5D",
]
label_cols = "NEXT_DAY_CLOSE"
output_cols = "PREDICTED_CLOSE"

# 3️⃣ Train model on the full labelled data set
xgb_params = {
    "n_estimators": 200,
    "learning_rate": 0.05,
    "max_depth": 6,
    "random_state": 42,
}
model = XGBRegressor(
    input_cols=input_cols,
    label_cols=label_cols,
    output_cols=output_cols,
    drop_input_cols=True,
    **xgb_params,
)
model.fit(df)

# 4️⃣ Register the fitted model under GLOBAL_VERSION_NAME
reg = Registry(session)

mv = reg.log_model(
    model,
    model_name="XGBRegressor_PricePredictor",
    version_name=GLOBAL_VERSION_NAME,
    comment="Daily close price predictor with explainability",
    conda_dependencies=["snowflake-ml-python", "xgboost"],
    # The feature columns of the training frame are supplied as sample input.
    sample_input_data=df[input_cols],
    task=task.Task.TABULAR_REGRESSION,
    options={"relax_version": False},
)


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Split your data into train/test if not already
from sklearn.model_selection import train_test_split

# 6️⃣ Hold-out evaluation for the overfitting check
#
# `model` was fitted on every row of `df`, so scoring it on any slice of `df`
# only measures in-sample error. To get a real test-set estimate, a separate
# model with the same hyper-parameters is fitted on the training split only;
# the registered `model` is left untouched.
X = df[input_cols]
y = df[label_cols]

# NOTE(review): this is a random shuffle split; if rows are time-ordered
# market data, a chronological split may be more appropriate -- confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Keep these hyper-parameters in sync with the model that gets registered.
eval_model = XGBRegressor(
    input_cols=input_cols,
    label_cols=label_cols,
    output_cols=output_cols,
    drop_input_cols=True,
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    random_state=42,
)
# Fit on the training rows only (same column layout as `df`).
eval_model.fit(df.loc[X_train.index])

# Out-of-sample metrics
preds = eval_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
mae = mean_absolute_error(y_test, preds)
r2 = r2_score(y_test, preds)

print("Test Set Performance:")
print(f"RMSE: {rmse:.3f}, MAE: {mae:.3f}, R2: {r2:.3f}")

# In-sample metrics of the same model; a large gap to the test metrics
# indicates overfitting.
train_preds = eval_model.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
train_r2 = r2_score(y_train, train_preds)

print("Train Set Performance:")
print(f"RMSE: {train_rmse:.3f}, R2: {train_r2:.3f}")


In [None]:
from snowflake.ml.registry import Registry

# Initialize registry
reg = Registry(session)

# Get model by name
model_ref = reg.get_model("XGBRegressor_PricePredictor")

# Select the version named by GLOBAL_VERSION_NAME
model_version = model_ref.version(GLOBAL_VERSION_NAME)

# Load the actual model object; this rebinds `model` to the copy stored in the registry
model = model_version.load()


In [None]:
# Select every row of the feature view that has a label (kept as a Snowpark DataFrame).
# Optional single-symbol filter: .filter(col("SYMBOL") == "ORCL")
df = (
    session.table("STAGING.VW_MODEL_READY_FEATURES")
    .filter(col("NEXT_DAY_CLOSE").is_not_null())
)

# Generate predictions with `model`
pred_df = model.predict(df)

# Persist the predictions, replacing any existing ANALYTICS.PREDICTED_PRICES table
pred_df.write.save_as_table("ANALYTICS.PREDICTED_PRICES", mode="overwrite")
