### Installing required libraries

In [1]:
#pip install hmmlearn

In [2]:
#pip install statsmodels

In [3]:
#pip install xgboost

In [4]:
#pip install lightgbm

In [5]:
#pip install seaborn

In [6]:
#pip install gdown

In [51]:
"""
Importing all the needed libraries for the project:
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from hmmlearn import hmm
from hmmlearn.hmm import GaussianHMM
import statsmodels.api as sm1
from scipy.stats import linregress
import matplotlib.dates as mdates
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model   import LinearRegression, Lasso, Ridge
from sklearn.model_selection import TimeSeriesSplit,ParameterGrid
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tqdm import tqdm
from sklearn.model_selection  import PredefinedSplit, GridSearchCV
from sklearn.base import clone
import warnings
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import gdown
warnings.filterwarnings('ignore')





In [8]:
# Define Google Drive file IDs
prices_id        = "1drMM8_c-43cOmGjirp4-CRz1RBwgFK_V"
prices_train_id  = "1RmrpUfiqfqOp5Jb5attauKcltLEx1zKh"
prices_test_id   = "1ONursxl7CKdAlf0t_Slmy2_w_Bzj2QbB"

# Download and load prices
print("Loading prices...")
gdown.download(f"https://drive.google.com/uc?id={prices_id}", "prices.csv", quiet=False)
prices = pd.read_csv("prices.csv", parse_dates=True, index_col=0)
print("✓ Loaded prices")

# Download and load training set
print("Loading prices_train...")
gdown.download(f"https://drive.google.com/uc?id={prices_train_id}", "prices_train.csv", quiet=False)
prices_train = pd.read_csv("prices_train.csv", parse_dates=True, index_col=0)
print("✓ Loaded prices_train")

# Download and load test set
print("Loading prices_test...")
gdown.download(f"https://drive.google.com/uc?id={prices_test_id}", "prices_test.csv", quiet=False)
prices_test = pd.read_csv("prices_test.csv", parse_dates=True, index_col=0)
print("✓ Loaded prices_test")


Loading prices...


Downloading...
From: https://drive.google.com/uc?id=1drMM8_c-43cOmGjirp4-CRz1RBwgFK_V
To: /Users/goutham/Desktop/systematic_trading/prices.csv
100%|██████████████████████████████████████| 34.8M/34.8M [00:01<00:00, 18.3MB/s]


✓ Loaded prices
Loading prices_train...


Downloading...
From: https://drive.google.com/uc?id=1RmrpUfiqfqOp5Jb5attauKcltLEx1zKh
To: /Users/goutham/Desktop/systematic_trading/prices_train.csv
100%|██████████████████████████████████████| 26.1M/26.1M [00:01<00:00, 22.3MB/s]


✓ Loaded prices_train
Loading prices_test...


Downloading...
From: https://drive.google.com/uc?id=1ONursxl7CKdAlf0t_Slmy2_w_Bzj2QbB
To: /Users/goutham/Desktop/systematic_trading/prices_test.csv
100%|██████████████████████████████████████| 8.66M/8.66M [00:00<00:00, 23.3MB/s]


✓ Loaded prices_test


In [9]:
prices_train_meta = prices_train.copy()# for meta labelling later on

In [10]:
prices_train.drop(['coin','regime','ret_regime_1','cumret_regime_1','ret_regime_2','cumret_regime_0', 'ret_regime_1', 'cumret_regime_1', 'ret_regime_2',
       'cumret_regime_2','t1','tVal','windowSize','day'],axis=1,inplace=True)
prices_test.drop(['coin','regime','ret_regime_1','cumret_regime_1','ret_regime_2','cumret_regime_0', 'ret_regime_1', 'cumret_regime_1', 'ret_regime_2',
       'cumret_regime_2','t1','tVal','windowSize','day'],axis=1,inplace=True)

### Normalising data

In [11]:


def preprocess_data(df):
    """
    Returns a ColumnTransformer that:
      - Log-transforms and then z-scores MVRV, NVT
      - Takes the log-difference of n_unique_addresses, then z-scores
      - Log1p‐transforms exchange_volume, then z-scores
      - Scales (z-scores) all the remaining continuous features
      - Leaves binary features untouched (“passthrough”)

    Usage:
      preprocessor = preprocess_data(df)
      X_transformed = preprocessor.fit_transform(df)
    """

    # --- 1) Identify all columns of interest ---
    # (a) special columns that need custom transforms
    special_log_cols     = ['MVRV', 'nvt']
    special_logdiff_cols = ['n_unique_addresses']
    special_log1p_cols   = ['exchange_volume']

    # (b) columns that were previously in “continuous_cols”
    returns_vol_cols = [
        c for c in df.columns
        if c.startswith('return') or c.startswith('log_return') or c.startswith('volatility')
    ]
    ema_cols = [c for c in df.columns if c in ['ema21', 'ema35', 'ema80', 'ema250']]
    sma_cols = [c for c in df.columns if c in ['sma50', 'sma200', 'sma20']]
    other_cont = [
        'rsi14','bb_percent_b','bb_bandwidth','macd_line','macd_hist',
        'adx14','obv','bb_mid','bb_upper','bb_lower','bb_percent_b','bb_bandwidth',
        'plus_di14','minus_di14','%K','%D','obv_sma20'
    ]

    # (c) combine & remove the “special” ones
    all_continuous = sma_cols + returns_vol_cols + ema_cols + other_cont + ['volume', 'vwap']
    # Now remove special-handled columns (they’ll be processed in their own pipelines)
    rest_continuous = [
        c for c in all_continuous
        if c not in special_log_cols + special_logdiff_cols + special_log1p_cols
    ]

    # (d) binary columns get “passthrough” (unscaled)
    binary_cols = [
        c for c in df.columns
        if c.endswith('cross')
        or c.endswith('above')
        or c.endswith('below')
        or c.startswith('is')
    ]

    # --- 2) Build small helper functions for the custom transforms ---

    # 2a) Log‐transform (with a tiny offset if you worry about zeros):
    log_transform = FunctionTransformer(np.log1p, validate=True)
    # You could also use np.log if you know there are no zeros or negatives:
    # log_transform = FunctionTransformer(lambda x: np.log(x), validate=True)

    # 2b) Log‐difference (i.e. first‐difference of the log)
    def logdiff_column(X):
        """
        X will be a 2D array of shape (n_samples, 1).
        We take np.log on that column, then do a forward difference,
        and fill the first row's diff with 0. Finally reshape back to (n_samples, 1).
        """
        col = X.astype(float).ravel()           # shape (n_samples,)
        logged = np.log1p(col)                  # or np.log(col) if no zeros
        diffed = np.diff(logged, prepend=logged[0])  # shape (n_samples,)
        return diffed.reshape(-1, 1)             # back to 2D

    logdiff_transform = FunctionTransformer(logdiff_column, validate=True)

    # 2c) Log1p‐transform (log(1+x)) for exchange_volume
    log1p_transform = FunctionTransformer(np.log1p, validate=True)

    # --- 3) Build pipelines for each group ---

    # 3a) MVRV & NVT:  log → z-score
    mvrv_pipeline = Pipeline([
        ('log',    log_transform),        # apply np.log1p
        ('scale',  StandardScaler()),     # then z-score
    ])

    nvt_pipeline = Pipeline([
        ('log',    log_transform),
        ('scale',  StandardScaler()),
    ])

    # 3b) n_unique_addresses: log‐difference → z-score
    nua_pipeline = Pipeline([
        ('logdiff', logdiff_transform),
        ('scale',   StandardScaler()),
    ])

    # 3c) exchange_volume: log1p → z-score
    exch_pipeline = Pipeline([
        ('log1p',  log1p_transform),
        ('scale',  StandardScaler()),
    ])

    # 3d) “Everything else” continuous: just z-score
    rest_pipeline = Pipeline([
        ('scale', StandardScaler())
    ])

    # --- 4) Assemble ColumnTransformer ---

    preprocessor = ColumnTransformer([
        # log→z for MVRV
        ('mvrv_log',     mvrv_pipeline,     ['MVRV']),
        # log→z for NVT
        ('nvt_log',      nvt_pipeline,      ['nvt']),
        # logdiff→z for n_unique_addresses
        ('nua_logdiff',  nua_pipeline,      ['n_unique_addresses']),
        # log1p→z for exchange_volume
        ('exch_log1p',   exch_pipeline,     ['exchange_volume']),
        # all the other continuous features get plain z-score
        ('rest_cont_z',  rest_pipeline,     rest_continuous),
        # binary features pass through unchanged
        ('passthrough_bin', 'passthrough',  binary_cols),
    ], remainder='drop')  # Drop any columns not explicitly listed above.

    return preprocessor


In [12]:
preprocessor = preprocess_data(prices_train)

### Building a baseline primary model (Randomforest, no hyperparams tuned)

In [13]:
pipe = Pipeline([
    ('prep', preprocessor),
    ('clf', RandomForestClassifier(n_estimators=200,
                                   min_samples_leaf=5,
                                   random_state=42,
                                   n_jobs=-1))
])

In [14]:
X_prices_train = prices_train.drop(['bin'],axis=1)
y_prices_train = prices_train['bin']
X_prices_test = prices_test.drop(['bin'],axis=1)
y_prices_test = prices_test['bin']

In [15]:
pipe.fit(X_prices_train,y_prices_train)

In [52]:
rf_base_pred = pipe.predict(X_prices_test)
print('Accuracy Score for Base RF Model:')
print(accuracy_score(rf_base_pred,y_prices_test))
print("Classification matrix:")
print(classification_report(rf_base_pred,y_prices_test, digits=4))
print("Confusion Matrix")
print(confusion_matrix(rf_base_pred,y_prices_test))

Accuracy Score for Base RF Model:
0.5362954414310445
Classification matrix:
              precision    recall  f1-score   support

         0.0     0.0000    0.0000    0.0000         0
         1.0     1.0000    0.5363    0.6982      8665

    accuracy                         0.5363      8665
   macro avg     0.5000    0.2681    0.3491      8665
weighted avg     1.0000    0.5363    0.6982      8665

Confusion Matrix
[[   0    0]
 [4018 4647]]


## We run modified CV on RF, XGB and LGBM

In [17]:
years = X_prices_train.index.year
validation_years = [2018, 2019, 2020]

#### RF CV parameters and function for modified CV

In [18]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth':    [None, 5, 10, 20],
    'min_samples_leaf': [1, 5, 10],
    'max_features':     ['sqrt', 0.3, 0.5]
}
grid = list(ParameterGrid(param_grid))

In [19]:
def score_hyperparams_rf(params, X, y, years, validation_years):
    
    # We'll store each year’s OOF predictions here, keyed by year
    oof_preds_by_year = {}
    
    for val_year in validation_years:
        # Define train index: all rows whose `year != val_year`
        train_mask = (years != val_year)
        val_mask   = (years == val_year)
        
        X_tr = X.loc[train_mask]
        y_tr = y.loc[train_mask]
        X_val = X.loc[val_mask]
        # Note: y_val is not used for training, only for scoring later
        
        # 1) Fit RF on the two “other” years
        rf = RandomForestClassifier(
            n_estimators       = params['n_estimators'],
            max_depth          = params['max_depth'],
            min_samples_leaf   = params['min_samples_leaf'],
            max_features       = params['max_features'],
            random_state       = 42,
            n_jobs             = -1
        )
        rf.fit(X_tr, y_tr)
        
        # 2) Predict on val_year
        preds_val = rf.predict(X_val)  # shape = (#rows in that year,)
        
        # 3) Store them
        oof_preds_by_year[val_year] = pd.Series(
            preds_val,
            index = X_val.index
        )
    
    # 4) Concatenate the three year‐by‐year predictions in chronological order
    #    (so the final “full_oof_pred” has the same index & ordering as X)
    pred_list = []
    for y_year in validation_years:
        pred_list.append(oof_preds_by_year[y_year])
    full_oof_pred = pd.concat(pred_list).sort_index()
    
    # 5) Compute accuracy on those concatenated predictions
    true_labels = y.loc[full_oof_pred.index]  # same index
    score = accuracy_score(true_labels, full_oof_pred)
    return score, full_oof_pred.values


#### Running CV on RF

In [20]:
best_score = -np.inf
best_params = None
best_oof   = None

for params in grid:
    score, oof_preds = score_hyperparams_rf(
        params,
        X_prices_train,
        y_prices_train,
        years,
        validation_years
    )
    print(f"Params {params} → combined‐2018‐20 accuracy = {score:.4f}")
    
    if score > best_score:
        best_score  = score
        best_params = params
        best_oof    = oof_preds.copy()

print("\nBest hyperparameters:", best_params)
print("Best 2018–2020 OOF accuracy:", best_score)

Params {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 100} → combined‐2018‐20 accuracy = 0.4726
Params {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 200} → combined‐2018‐20 accuracy = 0.4703
Params {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 300} → combined‐2018‐20 accuracy = 0.4772
Params {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'n_estimators': 100} → combined‐2018‐20 accuracy = 0.4857
Params {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'n_estimators': 200} → combined‐2018‐20 accuracy = 0.4730
Params {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'n_estimators': 300} → combined‐2018‐20 accuracy = 0.4699
Params {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'n_estimators': 100} → combined‐2018‐20 accuracy = 0.4726
Params {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 10, 

In [21]:
final_rf = RandomForestClassifier(
    n_estimators     = best_params['n_estimators'],
    max_depth        = best_params['max_depth'],
    min_samples_leaf = best_params['min_samples_leaf'],
    max_features     = best_params['max_features'],
    random_state     = 42,
    n_jobs           = -1
)
final_rf.fit(X_prices_train,y_prices_train)

In [22]:
rf_final_pred = final_rf.predict(X_prices_test)
acc = accuracy_score(rf_final_pred,y_prices_test)
print('Accuracy Score for Hyperparam optimised RF Model:')
print(acc)
print("Classification matrix:")
print(classification_report(rf_final_pred,y_prices_test, digits=4))
print("Confusion Matrix")
print(confusion_matrix(rf_final_pred,y_prices_test))

Accuracy Score for Hyperparam optimised RF Model:
0.5362954414310445
Classification matrix:
              precision    recall  f1-score   support

        -1.0     0.0000    0.0000    0.0000         0
         1.0     1.0000    0.5363    0.6982      8665

    accuracy                         0.5363      8665
   macro avg     0.5000    0.2681    0.3491      8665
weighted avg     1.0000    0.5363    0.6982      8665



#### Code for XGB and LGBM CV

In [23]:
# XGB and LGBM cannot handle -1 value, so we replace -1 with 0
mapping = {-1:0,1:1}
prices_train['bin'] = prices_train['bin'].map(mapping)
prices_test['bin'] = prices_test['bin'].map(mapping)
X_prices_train = prices_train.drop(['bin'],axis=1)
y_prices_train = prices_train['bin']
X_prices_test = prices_test.drop(['bin'],axis=1)
y_prices_test = prices_test['bin']

In [24]:
xgb_param_grid = {
    'n_estimators':      [50, 100],
    'max_depth':         [3, 6],
    'learning_rate':     [0.05, 0.1],
    'subsample':         [0.7, 1.0],
    'colsample_bytree':  [0.7, 1.0]
}

lgbm_param_grid = {
    'n_estimators':    [50, 100],
    'max_depth':       [-1, 5, 10],       # -1 means “no max depth” in LightGBM
    'learning_rate':   [0.05, 0.1],
    'num_leaves':      [31, 63],
    'subsample':       [0.7, 1.0]
}

# Convert to list of dicts:
xgb_grid  = list(ParameterGrid(xgb_param_grid))
lgbm_grid = list(ParameterGrid(lgbm_param_grid))

In [25]:
def score_xgb(params, X, y, years, val_years):
    
    oof_by_year = {}
    
    # Pre‐filter numeric columns in case there’s a stray date or string column
    numeric_cols = X.select_dtypes(include='number').columns
    
    for year in val_years:
        train_mask = (years != year)
        val_mask   = (years == year)
        
        X_tr  = X.loc[train_mask, numeric_cols]
        y_tr  = y.loc[train_mask]
        X_val = X.loc[val_mask,   numeric_cols]
        
        # Instantiate XGB with these params + mandatory multiclass/binary settings
        xgb = XGBClassifier(
            objective       = 'binary:logistic' if len(np.unique(y)) == 2 else 'multi:softprob',
            num_class       = len(np.unique(y)) if len(np.unique(y)) > 2 else None,
            use_label_encoder=False,
            eval_metric     = 'logloss',
            random_state    = 42,
            verbosity       = 0,
            
            # now unpack the grid search params:
            n_estimators    = params['n_estimators'],
            max_depth       = params['max_depth'],
            learning_rate   = params['learning_rate'],
            subsample       = params['subsample'],
            colsample_bytree= params['colsample_bytree']
        )
        
        xgb.fit(X_tr, y_tr)
        preds_val = xgb.predict(X_val)
        oof_by_year[year] = pd.Series(preds_val, index=X_val.index)
    
    # Concatenate in chronological order
    all_preds = pd.concat([oof_by_year[y] for y in val_years]).sort_index()
    true_lbl  = y.loc[all_preds.index]
    score     = accuracy_score(true_lbl, all_preds)
    return score, all_preds.values


In [26]:
def score_xgb(params, X, y, years, val_years):
    
    oof_by_year = {}
    
    # Pre‐filter numeric columns in case there’s a stray date or string column
    numeric_cols = X.select_dtypes(include='number').columns
    
    for year in val_years:
        train_mask = (years != year)
        val_mask   = (years == year)
        
        X_tr  = X.loc[train_mask, numeric_cols]
        y_tr  = y.loc[train_mask]
        X_val = X.loc[val_mask,   numeric_cols]
        
        # Instantiate XGB with these params + mandatory multiclass/binary settings
        xgb = XGBClassifier(
            objective       = 'binary:logistic' if len(np.unique(y)) == 2 else 'multi:softprob',
            num_class       = len(np.unique(y)) if len(np.unique(y)) > 2 else None,
            use_label_encoder=False,
            eval_metric     = 'logloss',
            random_state    = 42,
            verbosity       = 0,
            
            # now unpack the grid search params:
            n_estimators    = params['n_estimators'],
            max_depth       = params['max_depth'],
            learning_rate   = params['learning_rate'],
            subsample       = params['subsample'],
            colsample_bytree= params['colsample_bytree']
        )
        
        xgb.fit(X_tr, y_tr)
        preds_val = xgb.predict(X_val)
        oof_by_year[year] = pd.Series(preds_val, index=X_val.index)
    
    # Concatenate in chronological order
    all_preds = pd.concat([oof_by_year[y] for y in val_years]).sort_index()
    true_lbl  = y.loc[all_preds.index]
    score     = accuracy_score(true_lbl, all_preds)
    return score, all_preds.values


#### Running code for XGB

In [27]:
best_xgb_score = -np.inf
best_xgb_params = None
best_xgb_oof    = None

print("=== Tuning XGBoost ===")
for params in xgb_grid:
    sc, oof = score_xgb(params,
                         X_prices_train,
                         y_prices_train,
                         years,
                         validation_years)
    print(f"XGB params {params} → OOF accuracy = {sc:.4f}")
    if sc > best_xgb_score:
        best_xgb_score  = sc
        best_xgb_params = params.copy()
        best_xgb_oof    = oof.copy()

print("\nBest XGB params:", best_xgb_params)
print("Best XGB 2018–20 accuracy:", best_xgb_score)

=== Tuning XGBoost ===
XGB params {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.7} → OOF accuracy = 0.4484
XGB params {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 50, 'subsample': 1.0} → OOF accuracy = 0.4453
XGB params {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.7} → OOF accuracy = 0.4573
XGB params {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0} → OOF accuracy = 0.4562
XGB params {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 50, 'subsample': 0.7} → OOF accuracy = 0.4516
XGB params {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 50, 'subsample': 1.0} → OOF accuracy = 0.4538
XGB params {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 100, 'subsample': 0.7} → OOF accuracy = 0.4556
XGB para

In [28]:
def score_lgbm(params, X, y, years, val_years):
    
    oof_by_year = {}
    numeric_cols = X.select_dtypes(include='number').columns
    
    for year in val_years:
        train_mask = (years != year)
        val_mask   = (years == year)
        
        X_tr  = X.loc[train_mask, numeric_cols]
        y_tr  = y.loc[train_mask]
        X_val = X.loc[val_mask,   numeric_cols]
        
        lgbm = LGBMClassifier(
            objective     = 'binary' if len(np.unique(y)) == 2 else 'multiclass',
            num_class     = len(np.unique(y)) if len(np.unique(y)) > 2 else None,
            random_state  = 42,
            verbosity     = -1,
            
            # unpack the hyperparams
            n_estimators  = params['n_estimators'],
            max_depth     = params['max_depth'],
            learning_rate = params['learning_rate'],
            num_leaves    = params['num_leaves'],
            subsample     = params['subsample']
        )
        
        lgbm.fit(X_tr, y_tr)
        preds_val = lgbm.predict(X_val)
        oof_by_year[year] = pd.Series(preds_val, index=X_val.index)
    
    all_preds = pd.concat([oof_by_year[y] for y in val_years]).sort_index()
    true_lbl  = y.loc[all_preds.index]
    score     = accuracy_score(true_lbl, all_preds)
    return score, all_preds.values

#### Running code for LGBM

In [29]:
best_lgbm_score = -np.inf
best_lgbm_params = None
best_lgbm_oof    = None

print("\n=== Tuning LightGBM ===")
for params in lgbm_grid:
    sc, oof = score_lgbm(params,
                          X_prices_train,
                          y_prices_train,
                          years,
                          validation_years)
    print(f"LGBM params {params} → OOF accuracy = {sc:.4f}")
    if sc > best_lgbm_score:
        best_lgbm_score  = sc
        best_lgbm_params = params.copy()
        best_lgbm_oof    = oof.copy()

print("\nBest LGBM params:", best_lgbm_params)
print("Best LGBM 2018–20 accuracy:", best_lgbm_score)


=== Tuning LightGBM ===
LGBM params {'learning_rate': 0.05, 'max_depth': -1, 'n_estimators': 50, 'num_leaves': 31, 'subsample': 0.7} → OOF accuracy = 0.4599
LGBM params {'learning_rate': 0.05, 'max_depth': -1, 'n_estimators': 50, 'num_leaves': 31, 'subsample': 1.0} → OOF accuracy = 0.4599
LGBM params {'learning_rate': 0.05, 'max_depth': -1, 'n_estimators': 50, 'num_leaves': 63, 'subsample': 0.7} → OOF accuracy = 0.4417
LGBM params {'learning_rate': 0.05, 'max_depth': -1, 'n_estimators': 50, 'num_leaves': 63, 'subsample': 1.0} → OOF accuracy = 0.4417
LGBM params {'learning_rate': 0.05, 'max_depth': -1, 'n_estimators': 100, 'num_leaves': 31, 'subsample': 0.7} → OOF accuracy = 0.4456
LGBM params {'learning_rate': 0.05, 'max_depth': -1, 'n_estimators': 100, 'num_leaves': 31, 'subsample': 1.0} → OOF accuracy = 0.4456
LGBM params {'learning_rate': 0.05, 'max_depth': -1, 'n_estimators': 100, 'num_leaves': 63, 'subsample': 0.7} → OOF accuracy = 0.4449
LGBM params {'learning_rate': 0.05, 'max_

In [30]:
final_xgb = XGBClassifier(
    objective       = 'binary:logistic' if len(np.unique(y_prices_train)) == 2 else 'multi:softprob',
    num_class       = len(np.unique(y_prices_train)) if len(np.unique(y_prices_train)) > 2 else None,
    use_label_encoder=False,
    eval_metric     = 'logloss',
    random_state    = 42,
    verbosity       = 0,
    
    **best_xgb_params
)
final_xgb.fit(X_prices_train.select_dtypes(include='number'), y_prices_train)

# Final LightGBM
final_lgbm = LGBMClassifier(
    objective     = 'binary' if len(np.unique(y_prices_train)) == 2 else 'multiclass',
    num_class     = len(np.unique(y_prices_train)) if len(np.unique(y_prices_train)) > 2 else None,
    random_state  = 42,
    verbosity     = -1,
    
    **best_lgbm_params
)
final_lgbm.fit(X_prices_train.select_dtypes(include='number'), y_prices_train)


In [31]:
final_xgb.fit(X_prices_train,y_prices_train)
final_lgbm.fit(X_prices_train,y_prices_train)

In [53]:
xgb_final_predict = final_xgb.predict(X_prices_test)
lgbm_final_predict = final_lgbm.predict(X_prices_test)
acc_xgb = accuracy_score(xgb_final_predict,y_prices_test)
acc_lgbm = accuracy_score(lgbm_final_predict,y_prices_test)
print("XGB Final accuracy score:")
print(acc_xgb)
print("Classification matrix for XGB:")
print(classification_report(xgb_final_predict,y_prices_test, digits=4))
print("Confusion Matrix")
print(confusion_matrix(xgb_final_predict,y_prices_test))
print("LGBM Final Accuracy score:")
print(acc_lgbm)
print("Classification matrix for LGBM:")
print(classification_report(lgbm_final_predict,y_prices_test, digits=4))
print("Confusion Matrix")
print(confusion_matrix(lgbm_final_predict,y_prices_test))

XGB Final accuracy score:
0.5362954414310445
Classification matrix for XGB:
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         0
           1     1.0000    0.5363    0.6982      8665

    accuracy                         0.5363      8665
   macro avg     0.5000    0.2681    0.3491      8665
weighted avg     1.0000    0.5363    0.6982      8665

Confusion Matrix
[[   0    0]
 [4018 4647]]
LGBM Final Accuracy score:
0.5361800346220427
Classification matrix for LGBM:
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         1
           1     0.9998    0.5362    0.6981      8664

    accuracy                         0.5362      8665
   macro avg     0.4999    0.2681    0.3490      8665
weighted avg     0.9997    0.5362    0.6980      8665

Confusion Matrix
[[   0    1]
 [4018 4646]]


In [33]:
best_pred_prob_xgb = final_xgb.predict_proba(X_prices_train)
#best_pred_prob_lgbm = final_lgbm.predict_proba(X_prices_train) I do not think we include probabilities predicted by primary models we do not use
#best_pred_prob_rf = final_rf.predict_proba(X_prices_train)
best_predictions = final_xgb.predict(X_prices_train)

### Tripple Barrier method (Meta labelling)

### We re-download the data for adding meta labels since we dropped certain features before

In [34]:
prices_train_meta.drop(['coin','regime','ret_regime_1','cumret_regime_1','ret_regime_2','cumret_regime_0', 'ret_regime_1', 'cumret_regime_1', 'ret_regime_2',
       'cumret_regime_2','tVal','day'],axis=1,inplace=True)
#prices_test.drop(['coin','regime','ret_regime_1','cumret_regime_1','ret_regime_2','cumret_regime_0', 'ret_regime_1', 'cumret_regime_1', 'ret_regime_2',
 #      'cumret_regime_2','tVal','day'],axis=1,inplace=True)

In [35]:
prices_train_meta['best_pred_prob_xgb_0'] = best_pred_prob_xgb[:,0]
prices_train_meta['best_pred_prob_xgb_1'] = best_pred_prob_xgb[:,1]
prices_train_meta['best_predictions'] = best_predictions

### Function to add meta-label

In [36]:
def triple_barrier_labeling(df,
                            price_col: str = 'close',
                            dir_col: str   = 'best_predictions',
                            horizon_col: str = 'windowSize',
                            pt: float = 0.02,
                            sl: float = 0.02):
    
    out = pd.DataFrame(index=df.index, columns=['t_out','barrier','y','m'])
    prices = df[price_col].values.flatten()
    directions = df[dir_col].fillna(0).values.flatten()
    horizons  = df[horizon_col].fillna(0).astype(int).values.flatten()
    N = len(df)

    for idx, (p0, d, h) in enumerate(zip(prices, directions, horizons)):
        if d == 0:
            out.iloc[idx] = [pd.NaT, None, 0, 0]
            continue

        # compute barrier levels
        profit_bar = p0 * (1 + d * pt)
        stop_bar   = p0 * (1 - d * sl)

        # endpoint index
        end_idx = min(idx + h, N - 1)

        y = 0
        t_hit = df.index[end_idx]
        barrier_hit = 'time'

        # scan forward
        for j in range(idx, end_idx + 1):
            pj = prices[j]
            if (d == 1 and pj >= profit_bar) or (d == -1 and pj <= profit_bar):
                y = 1
                t_hit = df.index[j]
                barrier_hit = 'profit'
                break
            if (d == 1 and pj <= stop_bar) or (d == -1 and pj >= stop_bar):
                y = -1
                t_hit = df.index[j]
                barrier_hit = 'stop'
                break

        m = int(y == d)
        out.iloc[idx] = [t_hit, barrier_hit, y, m]

    return out

# Integrate into pipeline
def add_meta_labels(df,
                    price_col='close',
                    dir_col='best_predictions',
                    horizon_col='windowSize',
                    pt=0.02, sl=0.02):
    """
    Joins triple-barrier labels and meta-labels (m) onto df.
    """
    labels = triple_barrier_labeling(df, price_col, dir_col, horizon_col, pt, sl)
    return df.join(labels)

In [37]:
prices_train_meta = add_meta_labels(prices_train_meta,price_col='close',
                    dir_col='best_predictions',
                    horizon_col='windowSize',
                    pt=0.02, sl=0.02)

#### Remove everyhting to do with trend scanning

In [41]:
prices_train_meta.drop(['t_out','barrier','y','windowSize','bin','t1'],axis=1,inplace=True)
#prices_test.drop(['coin','regime','ret_regime_1','cumret_regime_1','ret_regime_2','cumret_regime_0', 'ret_regime_1', 'cumret_regime_1', 'ret_regime_2',
 #      'cumret_regime_2','tVal','day'],axis=1,inplace=True)

In [42]:
prices_train_meta.columns

Index(['open', 'high', 'low', 'close', 'volume', 'MVRV', 'n_unique_addresses',
       'exchange_volume', 'nvt', 'log_close', 'log_vol', 'log_return',
       'return24', 'return30', 'return120', 'log_return30', 'log_return120',
       'volatility15', 'volatility200', 'rv_24h', 'ema21', 'ema35', 'ema80',
       'ema250', 'vwap', 'vwap_ratio', 'rsi14', 'macd_line', 'macd_signal',
       'macd_hist', 'bb_mid', 'bb_upper', 'bb_lower', 'bb_percent_b',
       'bb_bandwidth', 'adx14', 'plus_di14', 'minus_di14', 'obv', 'obv_ratio',
       'sma50', 'sma200', 'sma20', '%K', '%D', 'ema21_ema80_golden_cross',
       'ema21_ema80_death_cross', 'sma50_sma200_golden_cross',
       'sma50_sma200_death_cross', 'macd_golden_cross', 'macd_death_cross',
       'di14_golden_cross', 'di14_death_cross', 'price_sma20_golden_cross',
       'price_sma20_death_cross', 'sto_golden_cross', 'sto_death_cross',
       'price_vwap_golden_cross', 'price_vwap_death_cross', 'bb_cross_above',
       'bb_cross_below', 'rsi7

In [43]:
X_prices_train_meta = prices_train_meta.drop(['m'],axis=1)
y_prices_train_meta = prices_train_meta['m'].astype(int)

### Meta Model

In [44]:
tscv = TimeSeriesSplit(n_splits=3)

#### RF Meta model with hyperparam ptimisation

In [45]:
rf_param_grid = {
    'n_estimators':    [50, 100, 200],
    'max_depth':       [None, 5, 10],
    'min_samples_leaf': [1, 5]
}

rf_base = RandomForestClassifier(random_state=42, n_jobs=-1)

rf_search = GridSearchCV(
    estimator=rf_base,
    param_grid=rf_param_grid,
    cv=tscv,
    scoring='accuracy',   # or 'f1' if your classes are imbalanced
    n_jobs=-1,
    verbose=1
)

# Fit on the full meta‐training set (2018–2020), but internally CV will respect time order:
rf_search.fit(X_prices_train_meta, y_prices_train_meta)

print("→ RF best params:", rf_search.best_params_)
print("→ RF best CV accuracy:", rf_search.best_score_)
best_rf_meta = rf_search.best_estimator_

Fitting 3 folds for each of 18 candidates, totalling 54 fits
→ RF best params: {'max_depth': 5, 'min_samples_leaf': 1, 'n_estimators': 100}
→ RF best CV accuracy: 0.7718631178707224


#### XGB Meta model

In [46]:
xgb_param_grid = {
    'n_estimators':   [50, 100, 200],
    'max_depth':      [3, 6, 9],
    'learning_rate':  [0.01, 0.1],
    'subsample':      [0.7, 1.0],
    'colsample_bytree': [0.7, 1.0]
}

xgb_base = XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    verbosity=0
)

xgb_search = GridSearchCV(
    estimator=xgb_base,
    param_grid=xgb_param_grid,
    cv=tscv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

xgb_search.fit(X_prices_train_meta, y_prices_train_meta)

print("→ XGB best params:", xgb_search.best_params_)
print("→ XGB best CV accuracy:", xgb_search.best_score_)
best_xgb_meta = xgb_search.best_estimator_

Fitting 3 folds for each of 72 candidates, totalling 216 fits
→ XGB best params: {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1.0}
→ XGB best CV accuracy: 0.7641064638783271


#### LGBM Meta Model

In [47]:
lgbm_param_grid = {
    'n_estimators':  [50, 100, 200],
    'max_depth':     [-1, 5, 10],
    'learning_rate': [0.01, 0.1],
    'num_leaves':    [31, 63],
    'subsample':     [0.7, 1.0]
}

lgbm_base = LGBMClassifier(
    objective='binary',
    random_state=42,
    n_jobs=-1,
    verbosity=-1
)

lgbm_search = GridSearchCV(
    estimator=lgbm_base,
    param_grid=lgbm_param_grid,
    cv=tscv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

lgbm_search.fit(X_prices_train_meta, y_prices_train_meta)

print("→ LGBM best params:", lgbm_search.best_params_)
print("→ LGBM best CV accuracy:", lgbm_search.best_score_)
best_lgbm_meta = lgbm_search.best_estimator_

Fitting 3 folds for each of 72 candidates, totalling 216 fits
→ LGBM best params: {'learning_rate': 0.01, 'max_depth': -1, 'n_estimators': 100, 'num_leaves': 31, 'subsample': 0.7}
→ LGBM best CV accuracy: 0.7605069708491762


#### Validate on these 3 models to find best accuracy on holdout set

In [54]:
# 3.1 Create a chronological train/val split (e.g. first 80% train, last 20% test)
n_total = len(X_prices_train_meta)
split_index = int(n_total * 0.8)

X_m_train = X_prices_train_meta.iloc[:split_index]
y_m_train = y_prices_train_meta.iloc[:split_index]
X_m_test  = X_prices_train_meta.iloc[split_index:]
y_m_test  = y_prices_train_meta.iloc[split_index:]

# 3.2 Fit each best‐estimator on the first 80% (if not already refit):
best_rf_meta.fit(X_m_train, y_m_train)
best_xgb_meta.fit(X_m_train, y_m_train)
best_lgbm_meta.fit(X_m_train, y_m_train)

# 3.3 Predict on the final 20% and report
for name, model in [
    ("RF-meta", best_rf_meta),
    ("XGB-meta", best_xgb_meta),
    ("LGBM-meta", best_lgbm_meta)
]:
    yhat = model.predict(X_m_test)
    print(f"\n=== {name} on final 20% hold‐out ===")
    print("Accuracy:", accuracy_score(y_m_test, yhat))
    print('Classification Matrix:')
    print(classification_report(y_m_test, yhat, digits=4))
    print("Confusion Matrix:")
    print(confusion_matrix(y_m_test, yhat))



=== RF-meta on final 20% hold‐out ===
Accuracy: 0.7036685040866756
Classification Matrix:
              precision    recall  f1-score   support

           0     0.9979    0.3739    0.5440      2487
           1     0.6403    0.9993    0.7805      2774

    accuracy                         0.7037      5261
   macro avg     0.8191    0.6866    0.6623      5261
weighted avg     0.8093    0.7037    0.6687      5261

Confusion Matrix:
[[ 930 1557]
 [   2 2772]]

=== XGB-meta on final 20% hold‐out ===
Accuracy: 0.7032883482227713
Classification Matrix:
              precision    recall  f1-score   support

           0     1.0000    0.3723    0.5426      2487
           1     0.6399    1.0000    0.7804      2774

    accuracy                         0.7033      5261
   macro avg     0.8200    0.6862    0.6615      5261
weighted avg     0.8101    0.7033    0.6680      5261

Confusion Matrix:
[[ 926 1561]
 [   0 2774]]

=== LGBM-meta on final 20% hold‐out ===
Accuracy: 0.6892225812583159
Cla

In [55]:
n = len(X_prices_train_meta)
split_at = int(n * 0.8)

X_train = X_prices_train_meta.iloc[:split_at]
X_val   = X_prices_train_meta.iloc[split_at:]
y_train = y_prices_train_meta.iloc[:split_at]
y_val   = y_prices_train_meta.iloc[split_at:]

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled   = scaler.transform(X_val)

# 4) Fit LinearRegression, Lasso, and Ridge (all treat y_train as continuous {0,1})
#    (a) Ordinary Least Squares
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

#    (b) Lasso (L1‐penalty); choose alpha via a small grid or stick to a default (e.g. 0.01)
lasso = Lasso(alpha=0.01, random_state=42, max_iter=5000)
lasso.fit(X_train_scaled, y_train)

#    (c) Ridge (L2‐penalty); choose alpha similarly (e.g. 1.0)
ridge = Ridge(alpha=1.0, random_state=42, max_iter=5000)
ridge.fit(X_train_scaled, y_train)

# 5) Predict continuous “scores” on the validation set
lr_scores    = lr.predict(X_val_scaled)
lasso_scores = lasso.predict(X_val_scaled)
ridge_scores = ridge.predict(X_val_scaled)

# 6) Threshold at 0.5 to get discrete 0/1 labels
lr_pred    = (lr_scores    >= 0.5).astype(int)
lasso_pred = (lasso_scores >= 0.5).astype(int)
ridge_pred = (ridge_scores >= 0.5).astype(int)

# 7) Evaluate each as a classifier
print("\n=== Linear Regression Meta‐Model ===")
print("Accuracy:", accuracy_score(y_val, lr_pred))
print('Classification Matrix:')
print(classification_report(y_val, lr_pred, digits=4))
print("Confusion Matrix:")
print(confusion_matrix(y_val, lr_pred))

print("\n=== Lasso Meta‐Model (alpha=0.01) ===")
print("Accuracy:", accuracy_score(y_val, lasso_pred))
print('Classification Matrix:')
print(classification_report(y_val, lasso_pred, digits=4))
print("Confusion Matrix:")
print(confusion_matrix(y_val, lasso_pred))

print("\n=== Ridge Meta‐Model (alpha=1.0) ===")
print("Accuracy:", accuracy_score(y_val, ridge_pred))
print('Classification Matrix:')
print(classification_report(y_val, ridge_pred, digits=4))
print("Confusion Matrix:")
print(confusion_matrix(y_val, ridge_pred))



=== Linear Regression Meta‐Model ===
Accuracy: 0.6550085535069379
Classification Matrix:
              precision    recall  f1-score   support

           0     0.6673    0.5388    0.5962      2487
           1     0.6474    0.7592    0.6989      2774

    accuracy                         0.6550      5261
   macro avg     0.6574    0.6490    0.6475      5261
weighted avg     0.6568    0.6550    0.6503      5261

Confusion Matrix:
[[1340 1147]
 [ 668 2106]]

=== Lasso Meta‐Model (alpha=0.01) ===
Accuracy: 0.7021478806310587
Classification Matrix:
              precision    recall  f1-score   support

           0     1.0000    0.3699    0.5401      2487
           1     0.6390    1.0000    0.7798      2774

    accuracy                         0.7021      5261
   macro avg     0.8195    0.6850    0.6599      5261
weighted avg     0.8097    0.7021    0.6665      5261

Confusion Matrix:
[[ 920 1567]
 [   0 2774]]

=== Ridge Meta‐Model (alpha=1.0) ===
Accuracy: 0.6426534879300513
Classifi

### Backtesting on 2021