# Gradient Boosting Trees

## Notebook's Environment

In [None]:
INSTALL_DEPS = False
if INSTALL_DEPS:
  %pip install matplotlib==3.8.3
  %pip installnumpy==1.26.4
  %pip installpandas==2.2.1
  %pip installpandas_market_calendars==4.4.0
  %pip installpytz==2024.1
  %pip installscipy==1.12.0
  %pip installta==0.11.0
  %pip installyfinance==0.2.37

!python --version

## Cloud Environment Setup

In [None]:
import os
import sys
import warnings

# Silence all library warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

# Resolve DATA_PATH / MODEL_PATH for Colab, Kaggle or a local checkout (checked in that order).
IN_KAGGLE = IN_COLAB = False
try:
    # https://www.tensorflow.org/install/pip#windows-wsl2
    import google.colab
    from google.colab import drive
except ImportError:
    # google.colab is not importable, so this is not a Colab runtime.
    # Only ImportError is caught: a failed or interrupted Drive mount must
    # surface instead of silently falling back to the local paths.
    IN_COLAB = False
else:
    drive.mount("/content/drive")
    DATA_PATH = "/content/drive/MyDrive/EDT dataset"
    MODEL_PATH = "/content/drive/MyDrive/models"
    IN_COLAB = True
    print("Colab!")
if "KAGGLE_KERNEL_RUN_TYPE" in os.environ and not IN_COLAB:
    print("Running in Kaggle...")
    # List every input file so the available datasets show up in the cell output.
    for dirname, _, filenames in os.walk("/kaggle/input"):
        for filename in filenames:
            print(os.path.join(dirname, filename))
    MODEL_PATH = "./models"
    DATA_PATH = "/kaggle/input/"
    IN_KAGGLE = True
    print("Kaggle!")
elif not IN_COLAB:
    IN_KAGGLE = False
    MODEL_PATH = "./models"
    DATA_PATH = "./data/"
    print("running localhost!")

# Instruments

In [None]:
# Wildcard import of the project's shared constants. Names used later in this
# notebook but never defined in it (StockFeat, MARKET_FUTS, AGRI_FUTS,
# UNIVERSE_COLS, LEANHOG_FUT, ...) presumably come from here — TODO confirm and
# consider importing them explicitly.
from constants import *

# Echo the target future and the bar interval as the cell output.
TARGET_FUT, INTERVAL

## Data Load

In [None]:
import pandas as pd
import numpy as np

# Build the CSV path from the environment-specific DATA_PATH and the bar INTERVAL.
# NOTE(review): DATA_PATH ends with "/" in the Kaggle and local branches, so the
# printed path can contain a doubled separator.
filename = f"{DATA_PATH}{os.sep}futures_{INTERVAL}.csv"
print(filename)
# The "Date" column becomes a parsed datetime index.
futs_df = pd.read_csv(filename, index_col="Date", parse_dates=True)

print(futs_df.shape)
futs_df.head(2)

In [None]:
import matplotlib.pyplot as plt

# Single-panel line chart of the target future's closing price over the whole sample.
close_col = f'{TARGET_FUT}_Close'
fig, ax = plt.subplots(figsize=(12, 4))

ax.plot(futs_df[close_col], label=f'{TARGET_FUT} Close', alpha=0.7)
ax.set_title(f'{TARGET_FUT} Price')
ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.legend()
ax.grid(True)

fig.tight_layout()
plt.show()

# Prepare the Data

In [None]:
from signals import dynamic_support_resistance, kalman_backtest, kf_bollinger_band_backtest, tsmom_backtest
from quant_equations import get_ou, modulate_std
from tqdm import tqdm

from sklearn.preprocessing import StandardScaler

# Column groups used as model features. The trailing comment on each line lists
# a wider alternative set of column names that is currently not used.
KF_COLS = ['SD','Z1', 'Z2', 'Filtered_X', 'KG_X', 'KG_Z1', 'KG_Z2'] # ['Z1', 'Z2', 'Filtered_X', 'Uncertainty', 'Residuals', 'KG_X', 'KG_Z1', 'KG_Z2']
BB_COLS = ['MA', 'U','L'] # ['SB','SS','SBS','SSB', 'Unreal_Ret', 'MA','SD', 'U','L', '%B', 'X']
SR_COLS = ["Support", "Resistance"] # ["PP", "S1", "R1", "S2", "R2", "Support", "Resistance"]
MOM_COLS = ["TSMOM", "CONTRA"]
# One "<future>_<field>" column name for every market future and every field in StockFeat.list.
MARKET_COLS = [f"{fut}_{col}" for col in StockFeat.list for fut in MARKET_FUTS]
# We scale RAW column, the rest are percentages or log values.
COLS_TO_SCALE = StockFeat.list + BB_COLS + SR_COLS

def augment_ts(df, target_close, target_high, target_low, target_volume, interval, cols_to_scale=COLS_TO_SCALE, scaler=None):
    """Append the signal columns to one future's frame and optionally standardise raw columns.

    The two values returned by ``get_ou`` drive the look-back window (absolute
    value of the first) and, through ``modulate_std``, the standard-deviation
    factor handed to the signal helpers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least the ``StockFeat.list`` columns.
    target_close, target_high, target_low, target_volume : str
        Column names forwarded to the signal helpers.
    interval
        Bar interval forwarded to the back-test helpers.
    cols_to_scale : list or None
        Columns passed through a ``StandardScaler``; ``None`` skips scaling.
    scaler : StandardScaler or None
        Already-fitted scaler to reuse; when ``None`` a new one is fitted on ``df``.

    Returns
    -------
    (pandas.DataFrame, scaler)
        The augmented frame and the scaler that was used (or the ``scaler``
        argument untouched when no scaling happened). When scaling is applied
        the frame carries a fresh positional index instead of ``df``'s index.
    """
    ou_first, ou_second = get_ou(df, target_close)
    lookback = abs(ou_first)
    std_factor = modulate_std(ou_second)

    momentum_df, _ = tsmom_backtest(df, target_close, interval, int(lookback*2), contra_lookback=lookback//2, std_threshold=std_factor)
    bands_df, _ = kf_bollinger_band_backtest(df[target_close], df[target_volume], interval, std_factor=std_factor)
    levels_df, _, _ = dynamic_support_resistance(df, target_close, target_high, target_low, initial_window_size=lookback)
    # The Kalman back-test runs on the gap-filled %B series from the Bollinger step.
    kalman_df, _ = kalman_backtest(bands_df["%B"].bfill().ffill(), df[target_volume], df[target_close], period=interval)

    pieces = [df[StockFeat.list], levels_df, kalman_df, bands_df, momentum_df]
    combined = pd.concat(pieces, axis=1).bfill().ffill()
    # Several helpers emit columns with the same name; the first occurrence wins.
    combined = combined.loc[:, ~combined.columns.duplicated(keep="first")]
    if cols_to_scale is None:
        return combined, scaler

    # Scale the raw values, and concat with the signals.
    if scaler is None:
        scaler = StandardScaler()
        scaled_values = scaler.fit_transform(combined[cols_to_scale])
    else:
        scaled_values = scaler.transform(combined[cols_to_scale])

    scaled_part = pd.DataFrame(scaled_values, columns=cols_to_scale)
    unscaled_part = combined.drop(columns=cols_to_scale).reset_index(drop=True)
    combined = pd.concat([scaled_part, unscaled_part], axis=1)
    combined = combined.loc[:, ~combined.columns.duplicated(keep="first")]

    return combined, scaler

def process_exog(futures, futs_df, universe_cols=UNIVERSE_COLS):
    """Collect the columns of every exogenous (market) future into one wide frame.

    Parameters
    ----------
    futures : iterable of str
        Symbols whose ``"<symbol>_<field>"`` columns are extracted.
    futs_df : pandas.DataFrame
        Wide frame holding the columns of all futures.
    universe_cols : set-like
        Registry that is updated IN PLACE with every extracted column name
        (the default is the shared ``UNIVERSE_COLS`` object).

    Returns
    -------
    pandas.DataFrame
        The selected columns side by side, indexed like ``futs_df``; an empty
        frame with that index when ``futures`` is empty.
    """
    import re

    futs_exog_ts = []
    for f in tqdm(futures, desc="process_exog"):
        # Anchor and escape the symbol: it must match only as a column-name
        # prefix, and regex metacharacters in a symbol must be taken literally.
        fut_df = futs_df.filter(regex=f"^{re.escape(f)}_")

        universe_cols.update(fut_df.columns.tolist())

        futs_exog_ts.append(fut_df)

    if not futs_exog_ts:
        # pd.concat raises on an empty list; return an empty, aligned frame instead.
        return pd.DataFrame(index=futs_df.index)

    return pd.concat(futs_exog_ts, axis=1)

def process_futures(futures, futs_df, futs_exog_df, train_size, interval, cols_to_scale=COLS_TO_SCALE):
    """Build augmented train and validation frames for every target future.

    For each symbol the ``"<symbol>_"`` prefix is stripped from its columns,
    the exogenous frame is appended, and the first ``train_size`` rows and the
    remaining rows are augmented separately. The scaler fitted on the training
    rows is reused for the validation rows.

    Parameters
    ----------
    futures : iterable of str
        Target symbols.
    futs_df : pandas.DataFrame
        Wide frame holding the columns of all futures.
    futs_exog_df : pandas.DataFrame
        Exogenous columns appended to every target frame.
    train_size : int
        Number of leading rows used for training.
    interval
        Bar interval forwarded to ``augment_ts``.
    cols_to_scale : list or None
        Forwarded to ``augment_ts``.

    Returns
    -------
    (list, list, list)
        Training frames, validation frames and fitted scalers, all in the
        order of ``futures`` and with positional (reset) indexes.
    """
    import re

    training_ts = []
    val_ts = []
    scalers = []
    for f in tqdm(futures, desc="process_futures"):
        prefix = f"{f}_"
        # Match the symbol only as a literal column-name prefix.
        fut_df = futs_df.filter(regex=f"^{re.escape(prefix)}")
        # Strip the leading prefix only; a later repeat of the text is kept.
        fut_df.columns = [col[len(prefix):] for col in fut_df.columns]
        fut_df = pd.concat([fut_df, futs_exog_df], axis=1)

        train_df, scaler = augment_ts(fut_df.iloc[:train_size], StockFeat.CLOSE, StockFeat.HIGH, StockFeat.LOW, StockFeat.VOLUME, interval, cols_to_scale=cols_to_scale)
        test_df, _ = augment_ts(fut_df.iloc[train_size:], StockFeat.CLOSE, StockFeat.HIGH, StockFeat.LOW, StockFeat.VOLUME, interval, cols_to_scale=cols_to_scale, scaler=scaler)
        training_ts.append(train_df.reset_index(drop=True))
        val_ts.append(test_df.reset_index(drop=True))
        scalers.append(scaler) # we use these later in the validation.

    return training_ts, val_ts, scalers

In [None]:
# NOTE: despite its name, TEST_SPLIT is the TRAINING fraction — the first 70% of
# the rows form the training window and the remainder is used for validation.
TEST_SPLIT = 0.7
TRAIN_SIZE = int(len(futs_df) * TEST_SPLIT)

# Exogenous market columns first, then one augmented train/validation frame per agricultural future.
futs_exog_df = process_exog(MARKET_FUTS, futs_df)
train_agri_ts, val_agri_ts, scalers = process_futures(AGRI_FUTS, futs_df, futs_exog_df, TRAIN_SIZE, INTERVAL)
# Stacking the lists of dataframes into single dataframes
# (rows of the different futures are appended one after another with a fresh
# positional index; rows that still contain NaN are dropped).
train_ts_df = pd.concat([df.reset_index(drop=True) for df in train_agri_ts], axis=0, ignore_index=True).dropna()
test_ts_df = pd.concat([df.reset_index(drop=True) for df in val_agri_ts], axis=0, ignore_index=True).dropna()

train_ts_df.tail(5)

## Feature Engineering

In [None]:
META_LABEL = "mr_label"

def aug_metalabel_mr(df, metalabel = META_LABEL):
    """Label every bar of a profitable trade with 1 (post-hoc meta-labelling).

    The frame is scanned in row order. A trade starts on the first row whose
    ``Position`` is non-zero while no trade is being tracked, and ends on the
    next row whose ``Closed`` is non-zero. If the closing row's ``Ret`` is
    positive, every row from the opening row through the closing row
    (inclusive) gets label 1; all other rows keep 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Closed``, ``Ret`` and ``Position`` columns.
    metalabel : str
        Name of the label column to create.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the label column added; the input is not modified.
    """
    df = df.copy()
    df[metalabel] = 0
    position = 0
    start_index = None
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Posthoc Metalabeling"):
        if row['Closed'] != 0:
            # Position closed, work backwards
            profitable = int(row['Ret'] > 0.)
            if start_index is not None and profitable:
                # Label-based slice: both end points are included.
                df.loc[start_index:idx, metalabel] = profitable
            position = 0
            start_index = None
        if row['Position'] != 0 and position == 0:
            # New position opened
            position = row['Position']
            start_index = idx

    return df

# Attach the meta-label to both splits. The names are rebound to the labelled
# copies, so re-running this cell simply recomputes the same label column.
train_ts_df = aug_metalabel_mr(train_ts_df)
test_ts_df = aug_metalabel_mr(test_ts_df)

# Show the training rows that were labelled positive.
train_ts_df[train_ts_df[META_LABEL] > 0][["Ret", META_LABEL]]

# Data Clean

In [None]:
# Absolute pairwise correlation above which a feature is considered redundant.
CORR_THRESHOLD = 0.90
# NOTE(review): the flag reads as inverted — columns are dropped below when
# DROP_COL is False (`not DROP_COL`). The VIF cell uses the same convention, so
# change both together if the flag is ever renamed or flipped.
DROP_COL =False

# Every candidate feature name: raw fields, market columns and the signal groups.
ALL_COLS = np.concatenate([StockFeat.list, np.array(MARKET_COLS), np.array(KF_COLS), np.array(BB_COLS), np.array(SR_COLS), np.array(MOM_COLS)])

y_train = train_ts_df[META_LABEL]
X_train = train_ts_df[ALL_COLS]

y_test = test_ts_df[META_LABEL]
X_test = test_ts_df[ALL_COLS]

# Keep untouched copies of the feature matrices from before any column is removed.
X_train_preclean = X_train.copy()
X_test_preclean = X_test.copy()

# Only the strict upper triangle is inspected, so for each highly correlated
# pair the column that appears later in the frame is the one marked for removal.
corr_matrix = X_train.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > CORR_THRESHOLD)]
to_drop = sorted(to_drop, key=lambda x: (upper.columns.get_loc(x), -upper[x].max()))
print(f"These are highly corr: {to_drop}")

# `to_drop` is always a list here, so the `is not None` test never fails; the
# thresholds are derived from the training split only and applied to both splits.
if to_drop is not None and not DROP_COL:
    X_train = X_train.drop(columns=to_drop)
    X_test = X_test.drop(columns=to_drop)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

VIF_THRESHOLD = 5

def calculate_vif(X):
    """Return the variance inflation factor of every column of ``X``.

    ``add_constant`` is applied to ``X`` first, and the resulting design
    matrix — including any column it added — is what gets scored.

    Returns a DataFrame with a ``feature`` column (design-matrix column names)
    and a ``VIF`` column, one row per design-matrix column.
    """
    design = add_constant(X)
    scores = []
    for col_idx in range(design.shape[1]):
        scores.append(variance_inflation_factor(design.values, col_idx))
    return pd.DataFrame({"feature": design.columns, "VIF": scores})

vif_data = calculate_vif(X_train)
vif_data = vif_data.sort_values(by="VIF", ascending=False)

# Rows whose VIF is infinite or NaN are discarded here, so those features end
# up in neither the "acceptable" list nor the printed multi-collinear list.
vif_data = vif_data.replace([np.inf, -np.inf], np.nan).dropna()
# Features below the threshold, in alphabetical order.
acceptable_vif = vif_data[vif_data["VIF"] < VIF_THRESHOLD].sort_values(by="feature")
selected_features = acceptable_vif["feature"].tolist()
# 'const' is the intercept column of the design matrix, not a real feature.
if 'const' in selected_features:
    selected_features.remove('const')

# NOTE(review): same inverted-looking flag as in the correlation cell — the
# selection is applied when DROP_COL is False.
if not DROP_COL:
    X_train = X_train[selected_features]
    X_test = X_test[selected_features]

print(f"Multi-Colinear: {vif_data[vif_data['VIF'] >= VIF_THRESHOLD]['feature'].values}")

# Column set the first classifier is trained on; reused by the validation cells.
CLEAN_FEATURES = X_train.columns

# GBC

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

def param_search(X=None, y=None):
    """Grid-search a GradientBoostingClassifier and return the best estimator.

    Parameters
    ----------
    X, y : optional
        Training features and labels. When omitted, the notebook-level
        ``X_train`` / ``y_train`` are used, so ``param_search()`` behaves as
        before; pass a reduced or resampled matrix to search on that instead.

    Returns
    -------
    The ``best_estimator_`` of a 3-fold ``GridSearchCV`` scored on precision.
    """
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    # Result of an earlier run (its n_estimators=100 is not part of the grid below):
    # Best parameters found: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
    model = GradientBoostingClassifier(random_state=42)
    param_grid = {
        'n_estimators': [50, 150, 300],
        'learning_rate': [0.001, 0.01, 0.1],
        'max_depth': [3, 7, 9],
    }

    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='precision', verbose=1, n_jobs=-1)
    grid_search.fit(X, y)
    print(f"Best parameters found: {grid_search.best_params_}")
    print(f"Best precision score: {grid_search.best_score_}")

    best_model = grid_search.best_estimator_
    return best_model

# True: run the (slow) grid search; False: fit once with the hard-coded
# hyper-parameters in the else branch.
PARAM_SEARCH = True
if PARAM_SEARCH:
    gbc = param_search()
else:
    gbc = GradientBoostingClassifier(random_state=42, learning_rate=0.01, max_depth=3, n_estimators=300)
    gbc.fit(X_train, y_train)
# Display the fitted estimator as the cell output.
gbc

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, roc_curve, auc, precision_recall_curve, fbeta_score, precision_score, recall_score, RocCurveDisplay, PrecisionRecallDisplay

def print_classification_metrics(X_test, y_test, best_model, beta=0.):
    """Print a classification report, draw ROC and precision-recall curves, and print summary scores.

    Parameters
    ----------
    X_test, y_test
        Hold-out features and true binary labels.
    best_model
        Fitted classifier exposing ``predict`` and ``predict_proba``.
    beta : float
        ``beta`` of the weighted F-beta score that is printed. The default of
        0 keeps the value this function has always reported; use 1.0 for a
        true F1 score.
    """
    # Predictions and probabilities
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]

    print(classification_report(y_test, y_pred))

    _, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 7))

    # ROC and precision-recall curves from the positive-class probability
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    curve_precision, curve_recall, _ = precision_recall_curve(y_test, y_pred_proba)
    RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name="GBC").plot(ax=ax1)
    ax1.set_title('Receiver Operating Characteristic (ROC)')
    PrecisionRecallDisplay(precision=curve_precision, recall=curve_recall).plot(ax=ax2)
    ax2.set_title('Precision-Recall Curve')

    plt.show()

    # Point metrics at the default decision threshold of `predict`.
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    fbeta = fbeta_score(y_test, y_pred, average='weighted', beta=beta)
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    # The label states the beta actually used; it used to claim "F1" although beta was 0.
    print(f'F-beta Score (beta={beta}, weighted): {fbeta:.4f}')

print_classification_metrics(X_test, y_test, gbc)

# Feature Selection

In [None]:
# Impurity-based importances of the fitted classifier, one value per training column.
feature_importance = gbc.feature_importances_
features = X_train.columns if isinstance(X_train, pd.DataFrame) else [f'Feature {i}' for i in range(X_train.shape[1])]

# Create a DataFrame for feature importance
gbc_feat_df = pd.DataFrame({'Feature': features, 'Importance': feature_importance})
gbc_feat_df = gbc_feat_df.sort_values(by="Importance", ascending=False)
# Only features with importance above 0.1 are kept — both for the chart and for
# any later cell that reads gbc_feat_df.
gbc_feat_df = gbc_feat_df[gbc_feat_df["Importance"] > 0.1]
plt.figure(figsize=(10, 8))
plt.barh(gbc_feat_df["Feature"], gbc_feat_df["Importance"], align='center')
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Feature Importance for Gradient Boosting Classifier')
plt.show()

## SHAP

The summary plot provides a global view of feature importance across all predictions. The SHAP value represents the impact of each feature on the model's output. 

Positive SHAP values push the prediction towards one class (mean reversion, in this notebook), while negative SHAP values push it towards the other class (random walk).

The magnitude of the SHAP value indicates the strength of the impact. Larger absolute values mean a stronger impact on the prediction.

In [None]:
import shap

# Upper bound on how many features each ranking contributes to the final selection.
MAX_FEATURES_COUNT = 16

# Explain the fitted classifier, using the training matrix as background data.
explainer = shap.Explainer(gbc, X_train)
shap_values = explainer(X_train)

shap.summary_plot(shap_values, X_train)
# The SHAP values shown belong to the first TRAINING row, so the feature values
# displayed next to them must come from that same row.
shap.decision_plot(explainer.expected_value, shap_values.values[0], X_train.iloc[0])

# Global importance = mean absolute SHAP value per feature.
shap_importance = np.abs(shap_values.values).mean(axis=0)
shap_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': shap_importance
})

shap_importance_df = shap_importance_df.sort_values(by='Importance', ascending=False)
# Discard features whose mean absolute contribution is at or below 0.001.
shap_importance_df = shap_importance_df[shap_importance_df['Importance'] > 0.001]
top_shap_features = shap_importance_df.head(MAX_FEATURES_COUNT)
print(top_shap_features)

fig, ax = plt.subplots(figsize=(10, 6))
top_shap_features.plot(kind='barh', x='Feature', y='Importance', legend=False, ax=ax)
ax.set_title(f"SHAP Values")
ax.set_xlabel("Mean Absolute SHAP Value")
ax.set_ylabel("Feature")
plt.tight_layout()
plt.show()

## PCA

In [None]:
from sklearn.decomposition import PCA

# Share of total variance the retained principal components must explain.
MAX_VARIANCE = 0.99

# First pass: full PCA, only to read the explained-variance curve.
pca = PCA()
xdata = pca.fit_transform(X_train)

cum_var_exp = np.cumsum(pca.explained_variance_ratio_)
# Index of the first component at which the cumulative ratio reaches the threshold, plus one.
num_components = np.argmax(cum_var_exp >= MAX_VARIANCE) + 1
print(f"Max components for {MAX_VARIANCE*100}% variance: {num_components} out of {X_train.shape[1]}")

# Second pass: keep only that many components.
pca = PCA(num_components)
xdata = pca.fit_transform(X_train)
eigenvectors = pca.components_

# Rank features by their summed absolute loading over the retained components
# and keep as many features as there are components.
top_features = np.abs(eigenvectors).sum(axis=0).argsort()[::-1]
# NOTE: this rebinds `selected_features`, which the VIF cell also assigns.
selected_features = X_train.columns[top_features[:num_components]]
# After the transpose: one row per selected feature, one column per component.
loadings_df = pd.DataFrame(eigenvectors[:, top_features[:num_components]], columns=selected_features).T

summed_loadings = np.sum(np.abs(eigenvectors[:, top_features[:num_components]]), axis=0)
summed_loadings_df = pd.DataFrame(summed_loadings, index=selected_features, columns=["Sum"]).sort_values(by="Sum", ascending=False)

fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(18, 10), gridspec_kw={'height_ratios': [1, 2]})

summed_loadings_df.plot(kind="bar", legend=False, ax=axes[0])
axes[0].set_title("Summed Loadings Across Top Principal Components")
axes[0].set_ylabel("Summed Loadings")
axes[0].set_xlabel("Features")
axes[0].tick_params(axis='x', labelrotation=45)

loadings_df.plot(kind="bar", legend=False, ax=axes[1])
axes[1].set_title(f"Loadings for Top Principal Components")
axes[1].set_ylabel("Loadings")
axes[1].set_xlabel("Features")
axes[1].tick_params(axis='x', labelrotation=45)

loadings_df

In [None]:
import seaborn as sns
from scipy.stats import spearmanr
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

# Information coefficient: Spearman rank correlation of each feature with the label.
ic = {}
for column in X_train.columns:
    corr, p_val = spearmanr(y_train, X_train[column])
    ic[column] = [corr, p_val]

ic_df = pd.DataFrame(ic, index=["IC", "p-value"]).T

# The target is a 0/1 class label, so the classification estimator is the
# appropriate one; the seed makes the estimate repeatable between runs.
mi = mutual_info_classif(X=X_train, y=y_train, random_state=42)
mi_series = pd.Series(mi, index=X_train.columns)
# One row per feature, in X_train column order (NOT sorted by score).
metrics = pd.concat(
    [
        mi_series.to_frame("Mutual Information"),
        ic_df["IC"].to_frame("Information Coefficient"),
    ],
    axis=1,
)


# `num_components` is the number of retained principal components from the PCA cell.
top_mi_features = metrics.sort_values(by="Mutual Information", ascending=False).head(num_components)
top_ic_features = metrics.sort_values(by="Information Coefficient", ascending=False).head(num_components)

fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(18, 8))

top_mi_features.plot.bar(ax=axes[0], rot=45)
axes[0].set_xlabel("Features")
axes[0].set_ylabel("Scores")
axes[0].set_title("Top Features by Mutual Information")
axes[0].tick_params(axis='x', labelrotation=45)
sns.despine(ax=axes[0])

top_ic_features.plot.bar(ax=axes[1], rot=45)
axes[1].set_xlabel("Features")
axes[1].set_ylabel("Scores")
axes[1].set_title("Top Features by Information Coefficient")
axes[1].tick_params(axis='x', labelrotation=45)
sns.despine(ax=axes[1])

plt.tight_layout()
plt.show()

In [None]:
# Top features according to each ranking method.
features_pca = summed_loadings_df.head(MAX_FEATURES_COUNT).index.tolist()
# `metrics` is stored in column order, so it must be ranked before taking the
# head: best by mutual information plus best by absolute information
# coefficient (a strongly negative IC is as informative as a positive one).
top_mi = metrics["Mutual Information"].nlargest(MAX_FEATURES_COUNT).index.tolist()
top_ic = metrics["Information Coefficient"].abs().nlargest(MAX_FEATURES_COUNT).index.tolist()
features_miic = list(dict.fromkeys(top_mi + top_ic))  # ordered, de-duplicated union
features_shap  = (shap_importance_df.head(MAX_FEATURES_COUNT)['Feature'].tolist())
features_gbc = (gbc_feat_df.head(MAX_FEATURES_COUNT)['Feature'].tolist())

print(F"Top {MAX_FEATURES_COUNT} PCA Loadings: {features_pca}")
print(F"Top {MAX_FEATURES_COUNT} MI/IC: {features_miic}")
print(F"Top {MAX_FEATURES_COUNT} SHAP: {features_shap}")
print(F"Top {MAX_FEATURES_COUNT} GBC: {features_gbc}")

# Union of all rankings, sorted so the column order (and therefore the fitted
# model) is identical on every run instead of depending on set iteration order.
DIMREDUC_FEATURES = sorted(set(features_pca) | set(features_miic) | set(features_shap) | set(features_gbc))

print(f"Selected {len(DIMREDUC_FEATURES)} features: {DIMREDUC_FEATURES}")

## Dimension Reduction

In [None]:
from sklearn.base import clone

# Restrict both splits to the selected feature subset.
X_train_dimreduc = X_train[DIMREDUC_FEATURES]
X_test_dimreduc = X_test[DIMREDUC_FEATURES]
if PARAM_SEARCH:
    # param_search() called without arguments fits on the full X_train, so its
    # estimator cannot be scored on the reduced columns. Take the tuned
    # hyper-parameters and re-fit an unfitted clone on the reduced matrix.
    gbc_dimreduc = clone(param_search()).fit(X_train_dimreduc, y_train)
else:
    gbc_dimreduc = GradientBoostingClassifier(random_state=42, learning_rate=0.01, max_depth=3, n_estimators=300)
    gbc_dimreduc.fit(X_train_dimreduc, y_train)
print_classification_metrics(X_test_dimreduc, y_test, gbc_dimreduc)

# Rebalancing Dataset

## SMOTE

In [None]:
from imblearn.over_sampling import SMOTE, SMOTEN
from sklearn.base import clone

# Over-sample the minority class of the TRAINING split only. SMOTE interpolates
# between neighbours and suits continuous features; SMOTEN is the variant for
# purely nominal (categorical) features. The seed keeps the synthetic rows
# reproducible.
X_train_resampled, y_train_resampled = SMOTE(random_state=42).fit_resample(X_train, y_train)
if PARAM_SEARCH:
    # param_search() called without arguments fits on the original X_train;
    # re-fit an unfitted clone with the tuned hyper-parameters on the resampled
    # data so the rebalancing actually reaches the model.
    gbc_smote = clone(param_search()).fit(X_train_resampled, y_train_resampled)
else:
    gbc_smote = GradientBoostingClassifier(random_state=42, learning_rate=0.01, max_depth=3, n_estimators=300)
    gbc_smote.fit(X_train_resampled, y_train_resampled)
# Evaluation is done on the untouched (not resampled) test split.
print_classification_metrics(X_test, y_test, gbc_smote)

## Balanced Bagging

In [None]:
from imblearn.ensemble import BalancedBaggingClassifier

def bagging_param_search(best_model):
    """Tune the ensemble size of a BalancedBaggingClassifier built around ``best_model``.

    Runs a 3-fold, precision-scored grid search over ``n_estimators`` on the
    notebook-level ``X_train`` / ``y_train``, prints the winning parameters and
    score, and returns the best bagging estimator.
    """
    search_space = {
        'n_estimators': [50, 150, 300],
    }
    bagger = BalancedBaggingClassifier(
        estimator=best_model,
        sampling_strategy='auto',
        replacement=False,
        random_state=42,
        n_estimators=50,
        n_jobs=-1,
    )

    searcher = GridSearchCV(estimator=bagger, param_grid=search_space, cv=3, scoring='precision', verbose=1, n_jobs=-1)
    searcher.fit(X_train, y_train)
    print(f"Best parameters found: {searcher.best_params_}")
    print(f"Best precision score: {searcher.best_score_}")

    return searcher.best_estimator_

# Wrap the gradient-boosting model in a balanced bagging ensemble: either tune
# the ensemble size, or fit a fixed ensemble of 150 estimators.
if PARAM_SEARCH:
    bbc = bagging_param_search(gbc)
else:
    bbc = BalancedBaggingClassifier(estimator=gbc,
                                    sampling_strategy='auto',
                                    replacement=False,
                                    random_state=42,
                                    n_estimators=150,
                                    n_jobs=-1)
    bbc.fit(X_train, y_train)
print_classification_metrics(X_test, y_test, bbc)

# Validation

In [None]:
SELECTED_MODEL = gbc

In [None]:
def backtest_gbc(df, gbc_model, model_features=DIMREDUC_FEATURES, stop_loss=0.1):
    """Replay the entry/exit signals in ``df``, gated by the classifier's meta-label.

    Per row, in order:

    1. An open position is closed when its exit signal fires (``SBS == -1``
       for a long, ``SSB == 1`` for a short) or when price has moved
       ``stop_loss`` against the entry. An exit signal with no open position
       is cleared.
    2. With no open position, an entry signal (``SB == 1`` long, ``SS == -1``
       short) opens a trade only if the model predicts a truthy label;
       otherwise the signal is cleared and ``SCancelled`` is set to 1.
    3. While a position is open the unrealised return is recorded, and the
       position is closed at once if the model's label for that row is falsy.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``Close``, ``SB``, ``SS``, ``SBS``, ``SSB`` and every column in
        ``model_features``. It is MODIFIED IN PLACE (result columns are added,
        cancelled signals are zeroed) and also returned.
    gbc_model
        Fitted classifier exposing ``predict``.
    model_features
        Columns passed to ``gbc_model.predict``; must match the columns the
        model was fitted on.
    stop_loss : float
        Adverse move, as a fraction of the entry price, that forces an exit.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``Ret``, ``Closed``, ``Position``, ``Unreal_Ret``,
        ``cRets``, ``SCancelled`` and the meta-label column filled in.
    """
    position = 0
    entry = None  # entry price; assigned when a trade opens and only read while position != 0

    df[META_LABEL] = gbc_model.predict(df[model_features])
    # Return columns start as floats so fractional returns can be written
    # without an implicit dtype change.
    df['Ret'] = 0.0
    df['Closed'] = 0
    df['Position'] = 0
    df['Unreal_Ret'] = 0.0
    df['cRets'] = 0.0
    df['SCancelled'] = 0

    for i, row in tqdm(df.iterrows(), desc="gbc_backtest"):
        if ((row['SBS'] == -1 and position == 1) or \
            (row['SSB'] == 1 and position == -1) or \
            (position == 1 and row['Close'] <= entry * (1 - stop_loss)) or \
            (position == -1 and row['Close'] >= entry * (1 + stop_loss))):

            if position == 1:
                df.loc[i, 'Ret'] = (row['Close'] - entry) / entry
                df.loc[i, 'Closed'] = 1
            else:
                df.loc[i, 'Ret'] = (entry - row['Close']) / entry
                df.loc[i, 'Closed'] = -1
            position = 0
        elif ((row['SBS'] == 1 and position == 0) or (row['SSB'] == -1 and position == 0)):
            # it was cancelled.
            df.loc[i, 'SBS'] = df.loc[i, 'SSB'] = 0

        if ((row['SB'] == 1 and position == 0) or (row['SS'] == -1 and position == 0)):
            if row[META_LABEL] :
                entry = row['Close']
                position = 1 if row['SB'] == 1 else -1
            else:
                df.loc[i, 'SB'] = df.loc[i, 'SS'] = 0
                df.loc[i, 'SCancelled'] = 1 # use LIME to understand

        df.loc[i, 'Position'] = position
        if position != 0:
            # Unrealized for continuous returns tracking; the sign follows the
            # trade direction, exactly like the realised 'Ret' formulas.
            if position == 1:
                df.loc[i, 'Unreal_Ret'] = (row['Close'] - entry) / entry
            else:
                df.loc[i, 'Unreal_Ret'] = (entry - row['Close']) / entry

            if not row[META_LABEL] :
                # The model no longer supports the trade: close it on this row.
                # NOTE(review): the exit flags written here (+1) differ from the
                # values the exit test above looks for (SBS == -1 / SSB == 1 for
                # long / short) — confirm the intended encoding.
                if position == 1:
                    df.loc[i, 'Ret'] = (row['Close'] - entry) / entry
                    df.loc[i, 'Closed'] = 1
                    df.loc[i, 'SBS'] = 1
                else:
                    df.loc[i, 'Ret'] = (entry - row['Close']) / entry
                    df.loc[i, 'Closed'] = -1
                    df.loc[i, 'SSB'] = 1
                position = 0

    # Compound the per-trade returns into a cumulative return series.
    df['cRets'] = (1 + df['Ret']).cumprod() - 1
    return df

import re

# Rebuild the un-stacked series of a single future (LEANHOG_FUT) for the
# out-of-sample window and run the model-gated back-test on it.
lean_prefix = f"{LEANHOG_FUT}_"
# Match the symbol only as a literal column-name prefix, then strip that prefix.
fut_df = futs_df.filter(regex=f"^{re.escape(lean_prefix)}")
fut_df.columns = [col[len(lean_prefix):] for col in fut_df.columns]
fut_df = pd.concat([fut_df, futs_exog_df], axis=1)
fut_df = fut_df.loc[:, ~fut_df.columns.duplicated(keep="first")]

# Scale the test window with the scaler fitted on this future's TRAINING window
# (`scalers` is ordered like AGRI_FUTS), so the features are on the scale the
# model was trained on. Fitting a new scaler on the test rows would both leak
# test statistics and shift the feature distribution. If the future is not one
# of AGRI_FUTS there is no training scaler, and a fresh one is fitted as before.
agri_symbols = list(AGRI_FUTS)
train_scaler = scalers[agri_symbols.index(LEANHOG_FUT)] if LEANHOG_FUT in agri_symbols else None
single_test_df, scaler = augment_ts(fut_df.iloc[TRAIN_SIZE:], "Close", "High", "Low", "Volume", INTERVAL, scaler=train_scaler)
single_test_df = single_test_df.loc[:, ~single_test_df.columns.duplicated(keep="first")]
print(single_test_df.shape)

# The selected model was fitted on CLEAN_FEATURES, so those columns are passed explicitly.
bt_df = backtest_gbc(single_test_df, SELECTED_MODEL, model_features=CLEAN_FEATURES)

In [None]:
def plot(bt_df):
    """Draw the back-test result as four stacked panels.

    Panels, top to bottom: close price with entry/exit markers; the
    ``Filtered_X`` series with the same markers and an uncertainty band;
    cumulative returns (``cRets``); and the model's per-row meta-label.

    ``bt_df`` must contain ``Close``, ``SB``, ``SS``, ``Closed``,
    ``Filtered_X``, ``Uncertainty``, ``cRets`` and the ``META_LABEL`` column.
    """
    fig, axs = plt.subplots(4, gridspec_kw={'height_ratios': [4, 4, 1, 1]}, figsize=(18, 8))

    # Rows where an entry signal survived (SB > 0 long, SS < 0 short) and rows
    # where a long (Closed > 0) or short (Closed < 0) trade was closed.
    buy_signals = bt_df[bt_df['SB'] > 0]
    sell_signals = bt_df[bt_df['SS'] < 0]
    long_closed = bt_df[bt_df['Closed'] > 0]
    short_closed = bt_df[bt_df['Closed'] < 0]

    # NOTE(review): the labels use AGRI_FUTS[0] (and TARGET_FUT further down)
    # rather than the symbol the frame was built from — confirm they agree.
    axs[0].plot(bt_df['Close'].index, bt_df['Close'], label=f'{AGRI_FUTS[0]} Price', color='blue')
    axs[0].scatter(buy_signals.index, buy_signals[f'Close'], color='green', marker='^', label='Buy', alpha =0.7)
    axs[0].scatter(sell_signals.index, sell_signals[f'Close'], color='red', marker='v', label='Sell', alpha =0.7)
    axs[0].scatter(long_closed.index, long_closed[f'Close'], color='green', marker='x', label='Buy Close', alpha =0.5)
    axs[0].scatter(short_closed.index, short_closed[f'Close'], color='red', marker='o', label='Sell Close', alpha =0.5)
    axs[0].set_title(f'{AGRI_FUTS[0]} Close Prices')
    axs[0].grid(True)
    axs[0].legend()

    axs[1].plot(bt_df.index, bt_df['Filtered_X'], label='Spread', alpha=0.7, linestyle='--')
    axs[1].scatter(buy_signals.index, buy_signals[f'Filtered_X'], color='green', marker='^', label='Buy', alpha =0.7)
    axs[1].scatter(sell_signals.index, sell_signals[f'Filtered_X'], color='red', marker='v', label='Sell', alpha =0.7)
    axs[1].scatter(long_closed.index, long_closed[f'Filtered_X'], color='green', marker='x', label='Buy Close', alpha =0.5)
    axs[1].scatter(short_closed.index, short_closed[f'Filtered_X'], color='red', marker='o', label='Sell Close', alpha =0.5)
    # Shaded band of +/- Uncertainty around the filtered series.
    axs[1].fill_between(bt_df.index,
                        bt_df['Filtered_X'] - bt_df['Uncertainty'],
                        bt_df['Filtered_X'] + bt_df['Uncertainty'],
                        label='Uncertainty', color="gray", alpha=0.5)
    # Horizontal reference lines at 1, 0.5 and 0.
    axs[1].axhline(y=1., color='black', alpha=0.7)
    axs[1].axhline(y=0.5, color='black', alpha=0.7)
    axs[1].axhline(y=0., color='black', alpha=0.7)
    axs[1].set_title(f'{TARGET_FUT} Actual vs Kalman Filtered Spread')
    axs[1].legend()
    axs[1].grid(True)

    axs[2].plot(bt_df.index, bt_df['cRets'], label='Returns')
    axs[2].set_title(f'Cummulative Returns')
    axs[2].grid(True)
    axs[2].legend()

    axs[3].scatter(bt_df.index, bt_df[META_LABEL], label='Signal')
    axs[3].set_title(f'Model Signal')
    axs[3].grid(True)
    axs[3].legend()

    plt.tight_layout()
    plt.show()

plot(bt_df)

## Why Decisions?

In [None]:
# Inspect a fixed 20-row window (positions 460-479) of the back-tested frame.
# The "SCancelled" column exists because backtest_gbc writes its result columns
# into the frame it is given, and single_test_df was that frame.
samples = single_test_df.iloc[460:480]
# Rows in the window whose entry signal was vetoed by the model, restricted to the model's features.
sample_cancelled = samples[samples["SCancelled"] == 1][CLEAN_FEATURES]
samples = samples[CLEAN_FEATURES]

print(sample_cancelled.shape)
sample_cancelled

# LIME Explanations

In [None]:
from lime.lime_tabular import LimeTabularExplainer

# Explain why the model vetoed the first cancelled signal in the sampled
# window. The explainer's reference data is that same window of rows.
explainer = LimeTabularExplainer(samples.values, feature_names=samples.columns, class_names=['RW', 'MR'], discretize_continuous=True)
if sample_cancelled.empty:
    # Indexing row 0 of an empty selection would raise IndexError.
    print("No cancelled signal in the selected window; nothing to explain.")
else:
    exp = explainer.explain_instance(sample_cancelled.values[0], SELECTED_MODEL.predict_proba, num_features=len(single_test_df.columns))
    exp.show_in_notebook(show_table=True)


### Other Future