In [None]:
# 1) Clear every user-defined name from the interactive namespace
#    (-f: do it without asking for confirmation).
%reset -f
# 2) Run a garbage-collection pass right after the reset.
import gc
gc.collect()

# 3) Import the project modules, then reload each one so that edits made to
#    their source are picked up without restarting the kernel.
import importlib
from libs import trades, plots, params, models
# NOTE(review): the reloads run in the order written. If trades or plots copy
# values out of params at import time they would be reloaded against the old
# params here — confirm whether params should be reloaded first.
importlib.reload(trades)
importlib.reload(plots)
importlib.reload(params)
importlib.reload(models)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import mutual_info_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso

import shap
import time
from tqdm.auto import tqdm
from IPython.display import clear_output, display

In [None]:
# Feature column names and target column name, both read from libs.params
features_cols, label_col = params.features_cols, params.label_col

In [None]:
# Location of the prepared CSV for the configured ticker
path = params.save_path / f"{params.ticker}_ready.csv"
# Show every column when a frame is displayed (no horizontal truncation)
pd.set_option('display.max_columns', None)

# First CSV column becomes the index and is parsed as dates
df_raw = pd.read_csv(path, index_col=0, parse_dates=True)

# Build the modelling frame, then drop the quote columns.
# drop() returns a new frame, so whatever object feature_engineering handed
# back (possibly df_raw itself — not visible from here) is never mutated.
df = models.feature_engineering(df_raw, features_cols, label_col).drop(columns=['bid', 'ask'])
df

In [None]:
# 0) Master table of importances: one entry per feature, one slot per method.
#    Every slot starts out as None.
features_importances = {
    feat: dict.fromkeys(("corr", "mi", "perm", "shap", "lasso"))
    for feat in features_cols
}

# Feature matrix and target used by the model-based methods
X, y = df[features_cols], df[label_col]

In [None]:
# 1) Utility: update the dict
def update_feature_importances(fi_dict, importance_type, values: pd.Series):
    """
    Write one method's scores into the master importance dict, in place.

    fi_dict         : {feature name -> {importance type -> score}}
    importance_type : key to write under, e.g. "corr", "mi", "perm", "shap", "lasso"
    values          : pd.Series of scores indexed by feature name

    Features present in `values` but absent from `fi_dict` are skipped.
    Returns None.
    """
    known_scores = (
        (name, score) for name, score in values.items() if name in fi_dict
    )
    for name, score in known_scores:
        fi_dict[name][importance_type] = score


In [None]:
def live_feature_importance(
    df: pd.DataFrame,
    features: list[str],
    label: str,
    method: str,               # display name, e.g. "Corr", "Lasso", "Mutual Info", "SHAP"
    compute_fn,                # callable(feature_name) -> importance value
    threshold: float = None
):
    """
    Score features one at a time and redraw a sorted bar chart after each score.

    - df        : accepted for call-site symmetry; not read by this function
                  (compute_fn is responsible for reaching the data it needs)
    - features  : feature names, passed to compute_fn in this order
    - label     : target column name, used only in the chart title
    - method    : text shown in the progress bar and the chart title; when it
                  starts with "corr" (any letter case) the threshold line is
                  also drawn at -threshold
    - compute_fn: callable taking a feature name and returning its score
    - threshold : optional x position of a dashed vertical reference line

    Returns a pd.Series of all scores indexed by feature name, sorted ascending.
    """
    # A correlation is signed, so its cut-off applies on both sides of zero.
    # Matched case-insensitively so "corr" and "Corr" behave the same.
    mirror_threshold = str(method).strip().lower().startswith("corr")

    # accumulator: feature name -> score
    scores = {}

    for f in tqdm(features, desc=f"{method}"):
        scores[f] = compute_fn(f)

        # replace the previous chart with one that includes the new score
        clear_output(wait=True)
        series = pd.Series(scores).sort_values()
        fig, ax = plt.subplots(figsize=(6, max(3, len(series) * 0.25)))
        try:
            sns.barplot(x=series.values, y=series.index, palette="vlag", ax=ax)
            if threshold is not None:
                ax.axvline(threshold, color="gray", linestyle="--")
                if mirror_threshold:
                    ax.axvline(-threshold, color="gray", linestyle="--")
            ax.set_title(f"{method} Importance (partial) for {label}")
            ax.set_xlabel("Importance")
            fig.tight_layout()
            display(fig)
        finally:
            # always release this figure, even if drawing or display failed,
            # so figures do not pile up across iterations
            plt.close(fig)

        # tiny pause so the UI can breathe
        time.sleep(0.02)

    return pd.Series(scores).sort_values()

In [None]:
# ────────────────────────────────────────────────────────────────────
# 1) Correlation live
# ────────────────────────────────────────────────────────────────────
# A near-zero correlation does not prove a feature is useless (the effect may
# be nonlinear), but anything with |ρ| < 0.05 is a low-hanging candidate for removal.

def corr_fn(feat):
    """Correlation between one feature column and the label column of df."""
    target = df[label_col]
    return df[feat].corr(target)

corr_scores = live_feature_importance(
    df=df,
    features=features_cols,
    label=label_col,
    method="Corr",
    compute_fn=corr_fn,
    threshold=0.05,
)
# record the scores under the "corr" key of the master dict
update_feature_importances(features_importances, "corr", corr_scores)

In [None]:
# ────────────────────────────────────────────────────────────────────
# 2) Lasso live (coordinate descent)
# ────────────────────────────────────────────────────────────────────

# Z-score the features; centre (but do not rescale) the target.
# A constant column has zero standard deviation: dividing by it would turn the
# whole column into NaN, which the Lasso fit below cannot accept. Such columns
# are divided by 1 instead, so they stay at zero after centring.
Xz = (X - X.mean()) / X.std().replace(0, 1.0)
yz = y - y.mean()
# NOTE(review): yz keeps the target's original scale, so alpha=0.01 below is
# expressed in target units — confirm that this is intended.

# One single-feature Lasso per column: a fast proxy, not a joint fit.
def lasso_fn(feat):
    """Absolute coefficient of a Lasso fit of the centred target on `feat` alone."""
    model = Lasso(alpha=0.01, max_iter=10000)
    model.fit(Xz[[feat]], yz)
    return abs(model.coef_[0])

lasso_scores = live_feature_importance(
    df,
    features_cols,
    label_col,
    method="Lasso",
    compute_fn=lasso_fn,
    threshold=0.0
)
update_feature_importances(features_importances, "lasso", lasso_scores)


In [None]:
# ────────────────────────────────────────────────────────────────────
# 3) Mutual Information live
# ────────────────────────────────────────────────────────────────────
def mi_fn(feat):
    """Mutual information between one feature (missing values set to 0) and the label."""
    # the estimator wants a 2-D feature matrix, hence the one-column frame
    feature_frame = df[[feat]].fillna(0)
    estimates = mutual_info_regression(
        feature_frame,
        df[label_col],
        discrete_features=False,
        random_state=0,
    )
    return estimates[0]

mi_scores = live_feature_importance(
    df=df,
    features=features_cols,
    label=label_col,
    method="Mutual Info",
    compute_fn=mi_fn,
    threshold=0.0,
)
update_feature_importances(features_importances, "mi", mi_scores)

In [None]:
# ────────────────────────────────────────────────────────────────────
# Incremental RF fit with warm_start + tqdm
# (the fitted forest is reused by the Permutation Importance and SHAP cells)
# ────────────────────────────────────────────────────────────────────

TARGET_TREES = 25

# Begin with a forest that has no trees; warm_start=True lets each later
# fit() call add trees to the ones already grown.
rf = RandomForestRegressor(
    n_estimators=0,
    warm_start=True,
    max_depth=4,
    random_state=0,
    n_jobs=-1,
)

# Raise the tree count by one per step so the progress bar advances per tree
for n_trees in tqdm(range(1, TARGET_TREES + 1), desc="Training RF trees"):
    rf.set_params(n_estimators=n_trees)
    rf.fit(X, y)



In [None]:
# ────────────────────────────────────────────────────────────────────
# 4) Permutation Importance live (RF)
# ────────────────────────────────────────────────────────────────────

# 1) All importances are computed up front: 15 shuffles per feature, scored
#    by negative RMSE, on the same X, y the forest was fitted on.
#    (n_jobs is left at its default; n_jobs=-1 is a possible speed-up.)
perm_imp = permutation_importance(
    rf,
    X,
    y,
    scoring="neg_root_mean_squared_error",
    n_repeats=15,
    random_state=0,
)

# 2) Mean importance per feature, indexed by feature name
perm_scores_mean = pd.Series(data=perm_imp.importances_mean, index=features_cols)

# 3) The "live" plot only looks values up; nothing is recomputed per feature
def perm_fn(feat):
    """Return the precomputed mean permutation importance of `feat`."""
    return perm_scores_mean.loc[feat]

# 4) Stream the bars through the shared plotting helper
perm_scores_live = live_feature_importance(
    df=df,
    features=features_cols,
    label=label_col,
    method="Permute RF",
    compute_fn=perm_fn,
    threshold=0.0,
)

# 5) Store the results under the "perm" key of the master dict
update_feature_importances(features_importances, "perm", perm_scores_mean)


In [None]:
# ────────────────────────────────────────────────────────────────────
# 5) SHAP live (sample of rows)
# ────────────────────────────────────────────────────────────────────

# 1) Work on a seeded sample of rows to keep the explainer fast
X_sample = shap.sample(X, 500, random_state=0)

# 2) SHAP values for every sampled row, computed in one call on the fitted forest
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_sample)

# 3) Global score per feature = mean absolute SHAP value over the sampled rows
shap_means = np.mean(np.abs(shap_values), axis=0)
shap_scores = pd.Series(data=shap_means, index=features_cols)

# 4) The "live" plot only looks values up; nothing is recomputed per feature
def shap_fn(feat):
    """Return the precomputed mean |SHAP| value of `feat`."""
    return shap_scores.loc[feat]

# 5) Stream the bars through the shared plotting helper (no threshold line)
shap_scores_live = live_feature_importance(
    df=df,
    features=features_cols,
    label=label_col,
    method="SHAP",
    compute_fn=shap_fn,
    threshold=None,
)

# 6) Store the results under the "shap" key of the master dict
update_feature_importances(features_importances, "shap", shap_scores)


# SHAP gives a global ranking; a beeswarm view would additionally show how each
# feature pushes predictions up or down (no beeswarm is drawn in this cell).

In [None]:
# Combined view: one row per feature, one column per importance method
fi_df = pd.DataFrame.from_dict(data=features_importances, orient="index")
fi_df