In [1]:
#OPTIONAL: uncomment to install packages if necessary

# %pip install pandas
# %pip install scikit-learn
# %pip install ta
# %pip install xgboost
# %pip install catboost

# Preprocessing

In [2]:
import pandas as pd

df_raw = pd.read_csv("crypto.csv")

df_raw.head()

Unnamed: 0,OpenDt,CRVUSDT-open,CRVUSDT-high,CRVUSDT-low,CRVUSDT-close,CRVUSDT-volume,ZILUSDT-open,ZILUSDT-high,ZILUSDT-low,ZILUSDT-close,...,EGLDUSDT-open,EGLDUSDT-high,EGLDUSDT-low,EGLDUSDT-close,EGLDUSDT-volume,LTCUSDT-open,LTCUSDT-high,LTCUSDT-low,LTCUSDT-close,LTCUSDT-volume
0,2020-09-04 06:00:00,2.962,2.975,2.8,2.814,233206.032,0.01617,0.0163,0.01588,0.01592,...,17.524,17.563,17.0,17.027,13078.587,48.77,48.77,48.2,48.21,23686.98714
1,2020-09-04 07:00:00,2.812,2.933,2.797,2.919,422883.332,0.01594,0.01599,0.01501,0.01591,...,17.029,17.9,16.71,17.129,24945.703,48.2,48.93,47.37,48.83,46079.69323
2,2020-09-04 08:00:00,2.919,3.237,2.915,3.171,599623.833,0.01592,0.01654,0.01588,0.01635,...,17.133,17.344,17.102,17.277,7945.829,48.78,49.92,48.75,49.67,73098.31532
3,2020-09-04 09:00:00,3.171,3.294,3.1,3.282,392916.735,0.01638,0.01689,0.01621,0.01684,...,17.277,17.277,16.918,17.121,15481.387,49.68,51.54,49.34,51.45,73369.6731
4,2020-09-04 10:00:00,3.272,3.278,3.15,3.157,374373.881,0.01683,0.01717,0.01672,0.01703,...,17.133,17.75,17.1,17.342,19422.335,51.42,52.38,51.0,51.42,81565.19476


Initially the data is in wide format, with each instance sharing a single timestamp and coins having 5 columns dedicated for OHLCV. Additionally, columns are named in the format of {COIN_NAME}USDT-{METRIC}. USDT stands for the USD Tether coin, which was used as a baseline for this data collection. By Tether's definition, it should always equal 1 USD, so for readability we will be removing that from the naming scheme.

In [3]:
# do not run this more than once without resetting kernel
def rename_cols(name):
    divider = name.index('-')
    return name[divider+1:] + "-" + name[:divider-4]

rename_map = {col: rename_cols(col) for col in df_raw.columns[1:]}

rename_map["OpenDt"] = "datetime"

df_raw.rename(columns=rename_map, inplace=True)

df_raw["datetime"] = pd.to_datetime(df_raw["datetime"])

df_raw.head()

Unnamed: 0,datetime,open-CRV,high-CRV,low-CRV,close-CRV,volume-CRV,open-ZIL,high-ZIL,low-ZIL,close-ZIL,...,open-EGLD,high-EGLD,low-EGLD,close-EGLD,volume-EGLD,open-LTC,high-LTC,low-LTC,close-LTC,volume-LTC
0,2020-09-04 06:00:00,2.962,2.975,2.8,2.814,233206.032,0.01617,0.0163,0.01588,0.01592,...,17.524,17.563,17.0,17.027,13078.587,48.77,48.77,48.2,48.21,23686.98714
1,2020-09-04 07:00:00,2.812,2.933,2.797,2.919,422883.332,0.01594,0.01599,0.01501,0.01591,...,17.029,17.9,16.71,17.129,24945.703,48.2,48.93,47.37,48.83,46079.69323
2,2020-09-04 08:00:00,2.919,3.237,2.915,3.171,599623.833,0.01592,0.01654,0.01588,0.01635,...,17.133,17.344,17.102,17.277,7945.829,48.78,49.92,48.75,49.67,73098.31532
3,2020-09-04 09:00:00,3.171,3.294,3.1,3.282,392916.735,0.01638,0.01689,0.01621,0.01684,...,17.277,17.277,16.918,17.121,15481.387,49.68,51.54,49.34,51.45,73369.6731
4,2020-09-04 10:00:00,3.272,3.278,3.15,3.157,374373.881,0.01683,0.01717,0.01672,0.01703,...,17.133,17.75,17.1,17.342,19422.335,51.42,52.38,51.0,51.42,81565.19476


In [4]:
df_raw.count()

df_raw.isnull().values.any()

np.False_

Now that the timestamps have been confirmed to be in the proper format and that there are no missing values, the data will now be converted into the "Long" format

In [5]:
df = (
    pd.wide_to_long(
        df_raw,
        stubnames=["open", "high", "low", "close", "volume"],
        i="datetime",
        j="symbol",
        sep="-",
        suffix=".+",
    )
    .reset_index()
)

df = df.sort_values(['symbol','datetime']).set_index(['datetime'])

df

Unnamed: 0_level_0,symbol,open,high,low,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-09-04 06:00:00,ADA,0.09716,0.09766,0.09502,0.09502,22918282.0
2020-09-04 07:00:00,ADA,0.09506,0.09712,0.09215,0.09685,66824964.9
2020-09-04 08:00:00,ADA,0.09682,0.10012,0.09680,0.09944,44941018.0
2020-09-04 09:00:00,ADA,0.09945,0.10237,0.09886,0.10219,30595680.1
2020-09-04 10:00:00,ADA,0.10218,0.10392,0.10150,0.10227,44413004.8
...,...,...,...,...,...,...
2024-02-19 22:00:00,ZRX,0.36550,0.36740,0.36480,0.36640,316212.0
2024-02-19 23:00:00,ZRX,0.36680,0.36720,0.36330,0.36540,206017.0
2024-02-20 00:00:00,ZRX,0.36540,0.36780,0.36090,0.36240,543901.0
2024-02-20 01:00:00,ZRX,0.36210,0.36570,0.35730,0.36250,457941.0


In our initial testing, we found the 1 hour windows to be too sensitive to noise in pricing and volume. As such, we converted our dataset to use the same 4 hour windows from the original paper. Additionally, we can now index our dataset by time and symbol.

In [6]:
def ohlcv_4h(g):
    o = g['open'].resample('4H').first()
    h = g['high'].resample('4H').max()
    l = g['low' ].resample('4H').min()
    c = g['close'].resample('4H').last()
    v = g['volume'].resample('4H').sum()
    out = pd.DataFrame({'open':o,'high':h,'low':l,'close':c,'volume':v})
    return out.dropna()  # drop partial bars at edges if any


df = (
    df.groupby('symbol')
      .resample('4H')
      .agg(open=('open','first'),
           high=('high','max'),
           low =('low' ,'min'),
           close=('close','last'),
           volume=('volume','sum'))
      .dropna(subset=['open','high','low','close'])
      .reset_index()
      .set_index(['symbol','datetime'])
      .sort_index()
)

  .resample('4H')
  .agg(open=('open','first'),


## Feature Engineering
Next, we reconstruct the same financial metrics used in the paper.

In [7]:
# Feature Engineering
from ta.momentum import RSIIndicator, StochasticOscillator, ROCIndicator
from ta.trend import EMAIndicator, MACD, CCIIndicator
from ta.volatility import BollingerBands
import numpy as np

c, h, l, v = df['close'], df['high'], df['low'], df['volume']

# ---- Params for hourly (4h params × 4)
# ROC_WINS   = [4, 12, 168]
# EMA_PAIRS  = [(336, 672)]
# RSI_WINS   = [24, 56, 104]
# CCI_WINS   = [40, 80]
# BB_WINS    = [40, 80]
# STO_KS     = [24, 56]
# VOL_WIN    = 80
# VOL_MA_VOL = 168
# VOL_ROC_W  = [168, 336]

# 4h params
ROC_WINS   = [1, 3, 42]
EMA_PAIRS  = [(84, 168)]
RSI_WINS   = [8, 14, 26]
CCI_WINS   = [10, 20]
BB_WINS    = [10, 20]
STO_KS     = [8, 14]
VOL_WIN    = 20
VOL_MA_VOL = 42
VOL_ROC_W  = [42, 84]

fe = pd.DataFrame(index=df.index)

# ROC
for n in ROC_WINS:
    fe[f'roc_{n}'] = ROCIndicator(close=c, window=n).roc()

# EMA cross (diff & ratio)
for s, L in EMA_PAIRS:
    ema_s = EMAIndicator(close=c, window=s).ema_indicator()
    ema_L = EMAIndicator(close=c, window=L).ema_indicator()
    fe[f'ema_diff_{s}_{L}']  = ema_s - ema_L
    fe[f'ema_ratio_{s}_{L}'] = ema_s / ema_L - 1.0

# RSI + lag2
for n in RSI_WINS:
    r = RSIIndicator(close=c, window=n).rsi()
    fe[f'rsi_{n}'] = r
    fe[f'rsi_{n}_lag2'] = r.shift(2)

# MACD histogram + lag2 (12,26,9)
macd_hist = MACD(close=c, window_fast=12, window_slow=26, window_sign=9).macd_diff()
fe['macd_hist'] = macd_hist
fe['macd_hist_lag2'] = macd_hist.shift(2)

# CCI
for n in CCI_WINS:
    fe[f'cci_{n}'] = CCIIndicator(high=h, low=l, close=c, window=n).cci()

# Bollinger %b and bandwidth
for n in BB_WINS:
    bb = BollingerBands(close=c, window=n, window_dev=2)
    fe[f'bb_pctb_{n}'] = bb.bollinger_pband()
    fe[f'bb_bw_{n}']   = bb.bollinger_wband()

# Stochastic (%K, %D, slowD) + histogram
for K in STO_KS:
    stoch = StochasticOscillator(high=h, low=l, close=c, window=K, smooth_window=3)
    fast_k = stoch.stoch()
    fast_d = stoch.stoch_signal()
    slow_d = fast_d.rolling(3, min_periods=3).mean()
    fe[f'stoch_fastk_{K}'] = fast_k
    fe[f'stoch_fastd_{K}'] = fast_d
    fe[f'stoch_slowd_{K}'] = slow_d
    fe[f'stoch_hist_{K}']  = fast_k - slow_d

# Price volatility: rolling std of log returns
logret = np.log(c / c.shift(1))
fe[f'volatility_{VOL_WIN}'] = logret.rolling(VOL_WIN, min_periods=VOL_WIN).std()

# Volume features: net volume mean & volume change
    
fe[f'net_volume_{VOL_MA_VOL}'] = v.rolling(VOL_MA_VOL, min_periods=VOL_MA_VOL).mean()

def volume_log_change(v: pd.Series, n: int, eps: float = 1e-9) -> pd.Series:
    return np.log1p(v + eps) - np.log1p(v.shift(n) + eps)
    
for n in VOL_ROC_W:
    fe[f'vol_change_{n}'] = volume_log_change(v, n)

In [8]:
fe

Unnamed: 0_level_0,Unnamed: 1_level_0,roc_1,roc_3,roc_42,ema_diff_84_168,ema_ratio_84_168,rsi_8,rsi_8_lag2,rsi_14,rsi_14_lag2,rsi_26,...,stoch_slowd_8,stoch_hist_8,stoch_fastk_14,stoch_fastd_14,stoch_slowd_14,stoch_hist_14,volatility_20,net_volume_42,vol_change_42,vol_change_84
symbol,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
ADA,2020-09-04 04:00:00,,,,,,,,,,,...,,,,,,,,,,
ADA,2020-09-04 08:00:00,4.615385,,,,,,,,,,...,,,,,,,,,,
ADA,2020-09-04 12:00:00,-2.951046,,,,,,,,,,...,,,,,,,,,,
ADA,2020-09-04 16:00:00,2.491610,4.057821,,,,,,,,,...,,,,,,,,,,
ADA,2020-09-04 20:00:00,0.773963,0.236873,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZRX,2024-02-19 08:00:00,-0.444198,1.156559,13.986014,0.002362,0.007121,68.734054,73.275552,65.428551,67.597706,62.579404,...,91.360602,-9.445709,90.000000,94.086466,93.383357,-3.383357,0.015094,1.424501e+06,0.935967,1.189673
ZRX,2024-02-19 12:00:00,0.920245,0.555710,12.566096,0.002659,0.008008,72.939046,73.584988,67.923018,67.802387,64.082794,...,88.882896,-11.168610,89.682540,91.462807,93.159361,-3.476821,0.015170,1.457138e+06,0.928950,1.679996
ZRX,2024-02-19 16:00:00,0.165792,0.638534,11.813695,0.002952,0.008881,73.674737,68.734054,68.369890,65.428551,64.353594,...,84.833113,-3.690256,91.269841,90.317460,91.955578,-0.685737,0.014650,1.451243e+06,-0.156499,0.943886
ZRX,2024-02-19 20:00:00,0.800000,1.896263,11.538462,0.003268,0.009819,77.111905,72.939046,70.508436,67.923018,65.655219,...,82.144012,5.586049,94.505495,91.819292,91.199853,3.305641,0.012986,1.444928e+06,-0.268800,1.058533


In [9]:
# Up/Down/Neutral Labels

# hourly threshold (paper used +/-1% for 4h, +/-1.5% daily). using something in range 0.3%-0.5% since we have hourly data
# LABEL_THRESHOLD_1H = 0.004

LABEL_THRESHOLD = 0.01

rp = np.log(df['close'].shift(-1) / df['close'])
y = pd.Series(0, index=df.index)         # neutral
y[rp >  LABEL_THRESHOLD] =  1            # up
y[rp < -LABEL_THRESHOLD] = -1            # down

# Keep only up/down
mask = y != 0
y = y[mask]

# make 0-1 scale instead of -1/1 (needed for roc_auc unless we use multiclass)
y = (y > 0).astype(int)

# Align & drop NaNs (due to warmup windows); last row has NaN label from shift(-1)
Xy = fe.join(y.rename('label')).dropna()
X = Xy.drop(columns='label')
y = Xy['label'].astype(int)

In [10]:
Xy

Unnamed: 0_level_0,Unnamed: 1_level_0,roc_1,roc_3,roc_42,ema_diff_84_168,ema_ratio_84_168,rsi_8,rsi_8_lag2,rsi_14,rsi_14_lag2,rsi_26,...,stoch_hist_8,stoch_fastk_14,stoch_fastd_14,stoch_slowd_14,stoch_hist_14,volatility_20,net_volume_42,vol_change_42,vol_change_84,label
symbol,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
ADA,2020-10-02 00:00:00,2.152183,1.018761,16.711339,0.000941,0.009978,50.235670,40.838184,52.049412,47.360220,54.162627,...,14.454404,55.312810,41.873552,41.109329,14.203481,0.020287,6.842137e+07,-0.816817,0.304010,0.0
ADA,2020-10-02 04:00:00,-5.351972,-3.472505,7.764893,0.000925,0.009802,33.867797,40.297030,41.303154,47.009730,47.441350,...,-30.479500,5.192308,31.621534,35.797269,-30.604961,0.022555,6.897688e+07,0.280668,1.231947,0.0
ADA,2020-10-02 08:00:00,-4.546893,-7.711138,-3.384944,0.000858,0.009100,26.063729,50.235670,35.038659,52.049412,42.980949,...,-26.366129,5.407210,21.970776,31.821954,-26.414744,0.024406,6.711281e+07,-0.608640,1.143144,1.0
ADA,2020-10-02 20:00:00,0.097014,2.630416,-4.198906,0.000745,0.007911,35.701838,34.007220,40.493815,39.551616,45.925395,...,13.280489,21.295060,20.182466,14.929393,6.365667,0.024384,6.111935e+07,-0.419296,0.224387,1.0
ADA,2020-10-03 00:00:00,1.949171,2.401298,-0.671493,0.000733,0.007780,43.773862,35.297694,44.823943,40.277410,48.194731,...,22.381521,33.377837,25.122385,20.063789,13.314048,0.024823,5.880832e+07,-1.561550,-0.225693,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZRX,2024-02-17 04:00:00,2.204586,2.961208,8.082064,-0.000380,-0.001156,63.367862,58.480175,62.980162,60.280149,61.066020,...,32.559292,80.733945,63.347648,55.913484,24.820461,0.011717,1.268665e+06,0.803390,0.744264,0.0
ZRX,2024-02-17 08:00:00,-1.955709,-0.670163,7.167557,-0.000229,-0.000696,51.079337,52.291051,54.664398,56.148065,55.992328,...,-7.906912,49.541284,58.868502,57.607508,-8.066223,0.012806,1.272674e+06,0.134544,1.077079,0.0
ZRX,2024-02-17 12:00:00,-2.640070,-2.439741,4.011282,-0.000189,-0.000573,39.494440,63.367862,46.006107,62.980162,50.245971,...,-34.594613,16.317992,48.864407,57.026852,-40.708860,0.014146,1.299579e+06,0.802451,1.182672,1.0
ZRX,2024-02-17 16:00:00,1.687255,-2.933563,5.666875,-0.000084,-0.000255,47.897592,51.079337,51.186878,54.664398,53.344435,...,-1.857481,39.748954,35.202743,47.645217,-7.896263,0.014374,1.317763e+06,0.925085,1.400766,1.0


In [11]:
fe.count()

roc_1               371615
roc_3               371613
roc_42              371574
ema_diff_84_168     371449
ema_ratio_84_168    371449
rsi_8               371609
rsi_8_lag2          371607
rsi_14              371603
rsi_14_lag2         371601
rsi_26              371591
rsi_26_lag2         371589
macd_hist           371583
macd_hist_lag2      371581
cci_10              371607
cci_20              371597
bb_pctb_10          371607
bb_bw_10            371607
bb_pctb_20          322428
bb_bw_20            371597
stoch_fastk_8       371609
stoch_fastd_8       371607
stoch_slowd_8       371605
stoch_hist_8        371605
stoch_fastk_14      371603
stoch_fastd_14      371601
stoch_slowd_14      371599
stoch_hist_14       371599
volatility_20       371596
net_volume_42       371575
vol_change_42       371574
vol_change_84       371532
dtype: int64

Most columns have the same value, though there are slight discrepancies due to some of the metrics used having a lookback period. Next, the data is normalized. Some of the metrics already exist in a normalized scale, and as such they will be skipped. We used logarithms before scaling some metrics to avoid issues when dividing by small numbers (ex: change volume), which produced near-infinite or infinite values.

In [12]:
# Normalize data if not already normalized

from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import TimeSeriesSplit

num_cols = X.select_dtypes(np.number).columns.tolist()

# already bound 0-100 or 0-1
bounded_cols   = [c for c in num_cols if c.startswith(('rsi_', 'stoch_', 'bb_pctb_'))]
# log1p then get z-score
loggy_cols     = [c for c in num_cols if c.startswith(('net_volume_', 'vol_change_', 'bb_bw_', 'volatility_', 'ema_ratio_'))]
# z-score
the_rest       = sorted(list(set(num_cols) - set(bounded_cols) - set(loggy_cols)))

def signed_log1p(df):
    Z = df.copy()
    for c in Z.columns:
        x = Z[c].values
        Z[c] = np.sign(x) * np.log1p(np.abs(x))
    return Z

pre = ColumnTransformer(
    transformers=[
        ("bounded_passthrough", "passthrough", bounded_cols),
        ("log_then_standardize", Pipeline([
            ("log",       FunctionTransformer(signed_log1p, validate=False)),
            ("standard",  StandardScaler())
        ]), loggy_cols),
        ("standardize_rest", StandardScaler(), the_rest)
    ],
    remainder="drop"
)


Due to dropping the 'neutral' observations in the price change, some coins did not meet our observation minimum requirements. We set a minimum number of splits to target, ensuring that the coins we are training the model had an adequate amount of examples to analyze.

In [13]:
# Train-Test Split and Model Training

used_coins = []
skipped_coins = []

DESIRED_SPLITS = 5

for sym, g in X.groupby('symbol', sort=False):

    n = len(g)
    max_splits = (n - 200) // 540   # ensure ~200 training rows for lookback window
    actual_splits = min(8, max_splits)

    # use coins that have a reasonable amount of data
    if actual_splits < DESIRED_SPLITS:
        skipped_coins.append(sym)
        continue

    used_coins.append(sym)

In [14]:
print("Used:", used_coins)
print("\n")
print("Skipped:", skipped_coins)

Used: ['ADA', 'ALGO', 'ATOM', 'BAL', 'BAND', 'BAT', 'BCH', 'COMP', 'CRV', 'DASH', 'DOT', 'EGLD', 'EOS', 'ETC', 'ICX', 'IOTA', 'KAVA', 'KNC', 'LINK', 'LTC', 'MKR', 'NEO', 'OMG', 'ONT', 'QTUM', 'RLC', 'RUNE', 'SNX', 'SOL', 'SUSHI', 'SXP', 'THETA', 'TRB', 'WAVES', 'XMR', 'XTZ', 'YFI', 'ZEC', 'ZRX']


Skipped: ['BNB', 'BTC', 'DOGE', 'ETH', 'IOST', 'TRX', 'VET', 'XLM', 'XRP', 'ZIL']


## Instantiating the models

We used the models and hyperparameters given by the paper as a baseline for comparison against our ensemble model

In [15]:
from itertools import product
from sklearn.model_selection import ParameterGrid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

def sigma_to_gamma(s):
    return 1.0 / (2.0 * s * s)

def make_models():
    param_grid = {
    "KNN": {"n_neighbors": [4, 6, 8, 12, 16]},
    "SVM": {
        "C": [2, 8, 16, 32, 64], 
        "probability": [True], 
        "gamma":[sigma_to_gamma(0.03)],
        "kernel": ["rbf"]
    },
    "RandomForest": {
        "n_estimators": [500, 1500], 
        "max_features": [2, 3, 4]
    },
    "XGBoost": {
        "n_estimators": [500, 1500, 2500],
        "learning_rate": [0.01, 0.05],
        # Fixed parameters:
        "max_depth": [3],
        "colsample_bytree": [0.75],
        "subsample": [0.75],
    },
    "CatBoost": {
        "iterations": [500, 1500, 2500],
        "learning_rate": [0.01, 0.05],
        "l2_leaf_reg": [0.01, 0.1],
        # Fixed parameters:
        "depth": [3],
        "border_count": [128],
        "rsm": [0.75],
        "verbose": [0]
    }
}

    models = {}

    base_models = {
        "KNN": KNeighborsClassifier,
        "SVM": SVC,
        "RandomForest": RandomForestClassifier,
        "XGBoost": XGBClassifier,
        "CatBoost": CatBoostClassifier,
    }
    
    for name, grid in param_grid.items():
        # Separate fixed from varying parameters
        varying_params = {k: v for k, v in grid.items() if len(v) > 1}
        fixed_params = {k: v[0] for k, v in grid.items() if len(v) == 1}
    
        for combo in product(*grid.values()):
            params = dict(zip(grid.keys(), combo))
            model_params = {**params}  # use all params when initializing
    
            # Create readable name: only include varying parameters
            name_parts = [f"{k}={v}" for k, v in params.items() if k in varying_params]
            model_id = f"{name}_" + "_".join(name_parts) if name_parts else name
    
            # Instantiate model
            models[model_id] = base_models[name](**model_params)

    # Check example output
    for key in list(models.keys())[:8]:
        print(key)
    
    return models

models = make_models()
print(f"Total models: {len(models)}")


KNN_n_neighbors=4
KNN_n_neighbors=6
KNN_n_neighbors=8
KNN_n_neighbors=12
KNN_n_neighbors=16
SVM_C=2
SVM_C=8
SVM_C=16
Total models: 34


After running each model, we use the `roi_sim` function to simulate the return on investment if we trade according to each model's prediciton. We can use these results to compare against the paper's baseline financial models of **Buy and Hold** and **Simple Trading Strategy**. These results are displayed alongside the typical `AUC` and `ACC` scores for models.

In [16]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from ta.momentum import RSIIndicator
import numpy as np

results = []

results_roi = []

def to_signal(p_up, buy_thr=0.50, sell_thr=0.45):
    signal = np.zeros_like(p_up, dtype=int)
    signal[p_up >= buy_thr] = 1
    signal[p_up <  sell_thr] = -1
    return signal

def roi_sim(test_df):
    commission = 0.001  # 0.1% per side

    equity = 1000.0
    position = 0        # +1 if long, 0 if flat
    roi_series = []
    
    for sig, r in zip(test_df['signal'], test_df['ret']):
        # update position
        if sig == 1 and position == 0:   # buy
            equity *= (1 - commission)
            position = 1
        elif sig == -1 and position == 1: # sell
            equity *= (1 - commission)
            position = 0
    
        # apply return if holding
        if position == 1:
            equity *= np.exp(r)
    
        roi_series.append(equity)
    
    test_df['equity'] = roi_series
    ROI_model = (equity - 1000) / 1000
    # print(f"ROI (model) = {ROI_model*100:.1f}%")

    return ROI_model


for sym, g in Xy.groupby('symbol', sort=False):

    closes_g = df.loc[g.index, 'close']                 
    closes_g = closes_g.sort_index(level='datetime')
    ret_next_full = np.log(closes_g).diff().shift(-1)

    if sym in skipped_coins:
        continue
    
    print(g.columns)
    g = g.sort_index(level='datetime')
    X_sym = g.drop(columns=['label'])
    y_sym = g['label']

    tscv = TimeSeriesSplit(n_splits=DESIRED_SPLITS, test_size=540)
    print(f"\n=== {sym} ===")

    for fold, (train_idx, test_idx) in enumerate(tscv.split(X_sym)):
        X_train, X_test = X_sym.iloc[train_idx], X_sym.iloc[test_idx]
        y_train, y_test = y_sym.iloc[train_idx], y_sym.iloc[test_idx]

        test_index = g.iloc[test_idx].index                  # MultiIndex (symbol, datetime)

        ret_next = ret_next_full.loc[test_index]

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        for name, model in models.items():
            model.fit(X_train, y_train)

            # Predict probabilities (for buy/hold/sell logic)
            proba = model.predict_proba(X_test)[:, 1]
            preds = (proba >= 0.5).astype(int)

            acc = accuracy_score(y_test, preds)
            auc = roc_auc_score(y_test, proba)

            results.append({
                "symbol": sym,
                "fold": fold + 1,
                "model": name,
                "accuracy": acc,
                "auc": auc
            })

            test_df = pd.DataFrame(
                {'proba': proba, 'pred': preds},
                index=test_index
            )
            test_df['ret'] = ret_next.values 

            test_df['signal'] = to_signal(test_df['proba'])

            roi = roi_sim(test_df)

            results_roi.append({
                "symbol": sym,
                "fold": fold + 1,
                "model": name,
                "roi": roi
            })

            print(f"{name:40s} Fold {fold+1}: acc={acc:.3f}, auc={auc:.3f}")


        # baseline B&H

        closes = df.loc[test_index, 'close']

        p0 = closes.iloc[0]
        p1 = closes.iloc[-1]
        ROI_BH = (p1 - p0) / p0

        results_roi.append({
                "symbol": sym,
                "fold": fold + 1,
                "model": "B&H",
                "roi": ROI_BH
            })
        
        # print(f"ROI (Buy & Hold) = {ROI_BH*100:.1f}%")

        # baseline STS
        
        rsi = RSIIndicator(closes, window=14).rsi()
        
        def rsi_sts(rsi, overbought=70, oversold=30):
            signal = np.zeros_like(rsi, dtype=int)
            signal[rsi < oversold]  = 1   # buy
            signal[rsi > overbought] = -1 # sell
            return signal

        sts_df = pd.DataFrame({
            'ret'  : ret_next.values
        })
        
        sts_df['signal'] = rsi_sts(rsi)

        ROI_STS = roi_sim(sts_df)

        results_roi.append({
            "symbol": sym,
            "fold": fold + 1,
            "model": "STS",
            "roi": ROI_STS
        })

        # print(f"ROI (STS) = {ROI_STS*100:.1f}%")
        

Index(['roc_1', 'roc_3', 'roc_42', 'ema_diff_84_168', 'ema_ratio_84_168',
       'rsi_8', 'rsi_8_lag2', 'rsi_14', 'rsi_14_lag2', 'rsi_26', 'rsi_26_lag2',
       'macd_hist', 'macd_hist_lag2', 'cci_10', 'cci_20', 'bb_pctb_10',
       'bb_bw_10', 'bb_pctb_20', 'bb_bw_20', 'stoch_fastk_8', 'stoch_fastd_8',
       'stoch_slowd_8', 'stoch_hist_8', 'stoch_fastk_14', 'stoch_fastd_14',
       'stoch_slowd_14', 'stoch_hist_14', 'volatility_20', 'net_volume_42',
       'vol_change_42', 'vol_change_84', 'label'],
      dtype='object')

=== ADA ===
KNN_n_neighbors=4    Fold 1: acc=0.487, auc=0.502
KNN_n_neighbors=6    Fold 1: acc=0.476, auc=0.486
KNN_n_neighbors=8    Fold 1: acc=0.487, auc=0.484
KNN_n_neighbors=12   Fold 1: acc=0.478, auc=0.497
KNN_n_neighbors=16   Fold 1: acc=0.481, auc=0.493
SVM_C=2              Fold 1: acc=0.493, auc=0.500
SVM_C=8              Fold 1: acc=0.493, auc=0.500
SVM_C=16             Fold 1: acc=0.493, auc=0.500
SVM_C=32             Fold 1: acc=0.493, auc=0.500
SVM_C=6

In [17]:
# Average by model
df_results = pd.DataFrame(results)

model_avgs = (
    df_results.groupby('model')[['accuracy', 'auc']]
    .mean()
    .sort_values('auc', ascending=False)
)
print(model_avgs.round(3))

                                                    accuracy    auc
model                                                              
CatBoost_iterations=500_learning_rate=0.01_l2_l...     0.521  0.535
CatBoost_iterations=500_learning_rate=0.01_l2_l...     0.522  0.534
CatBoost_iterations=1500_learning_rate=0.01_l2_...     0.519  0.530
CatBoost_iterations=1500_learning_rate=0.01_l2_...     0.519  0.529
XGBoost_n_estimators=500_learning_rate=0.01            0.518  0.529
CatBoost_iterations=2500_learning_rate=0.01_l2_...     0.517  0.527
CatBoost_iterations=2500_learning_rate=0.01_l2_...     0.518  0.527
CatBoost_iterations=500_learning_rate=0.05_l2_l...     0.517  0.526
RandomForest_n_estimators=500_max_features=3           0.517  0.525
XGBoost_n_estimators=1500_learning_rate=0.01           0.515  0.525
RandomForest_n_estimators=1500_max_features=3          0.517  0.525
CatBoost_iterations=500_learning_rate=0.05_l2_l...     0.516  0.525
RandomForest_n_estimators=1500_max_features=4   

In [30]:
df_results["model_base"] = df_results["model"].str.split("_", n=1).str[0]

model_coin_avgs = (
    df_results
    .groupby(["model_base", "symbol"], as_index=False)[["accuracy", "auc"]]
    .mean()
)

pivot_acc = (
    model_coin_avgs
    .pivot(index="model_base", columns="symbol", values="accuracy")
    .round(3)
)

pivot_acc

# print(model_avgs.round(3))

symbol,ADA,ALGO,ATOM,BAL,BAND,BAT,BCH,COMP,CRV,DASH,...,SUSHI,SXP,THETA,TRB,WAVES,XMR,XTZ,YFI,ZEC,ZRX
model_base,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CatBoost,0.517,0.53,0.524,0.517,0.534,0.504,0.511,0.518,0.521,0.501,...,0.524,0.522,0.511,0.497,0.521,0.521,0.514,0.522,0.512,0.514
KNN,0.502,0.512,0.509,0.51,0.503,0.505,0.504,0.504,0.5,0.505,...,0.506,0.504,0.507,0.496,0.505,0.534,0.506,0.502,0.505,0.507
RandomForest,0.514,0.523,0.523,0.514,0.524,0.51,0.509,0.513,0.515,0.508,...,0.52,0.525,0.509,0.505,0.537,0.525,0.514,0.521,0.503,0.513
SVM,0.49,0.509,0.5,0.497,0.496,0.507,0.499,0.499,0.493,0.504,...,0.508,0.496,0.505,0.506,0.483,0.518,0.506,0.492,0.508,0.502
XGBoost,0.517,0.526,0.519,0.522,0.525,0.507,0.512,0.513,0.523,0.508,...,0.52,0.521,0.515,0.494,0.516,0.515,0.516,0.527,0.499,0.515


In [32]:
df_results["model_base"] = df_results["model"].str.split("_", n=1).str[0]

model_avgs = (
    df_results
    .groupby("model_base")[["accuracy", "auc"]]
    .mean()
)

print(model_avgs.round(3))

              accuracy    auc
model_base                   
CatBoost         0.517  0.526
KNN              0.506  0.509
RandomForest     0.517  0.525
SVM              0.501  0.500
XGBoost          0.515  0.523


In [20]:
# Look at ROI results by model by coin
df_results_roi = pd.DataFrame(results_roi)

model_roi_avgs = (
    df_results_roi.groupby(['model'])['roi']
    .mean()
    .sort_values(ascending=False)
)
print(model_roi_avgs.round(3))

model
STS                                                             0.003
RandomForest_n_estimators=500_max_features=2                   -0.030
B&H                                                            -0.034
XGBoost_n_estimators=1500_learning_rate=0.01                   -0.036
CatBoost_iterations=500_learning_rate=0.05_l2_leaf_reg=0.1     -0.046
CatBoost_iterations=1500_learning_rate=0.01_l2_leaf_reg=0.01   -0.057
CatBoost_iterations=500_learning_rate=0.05_l2_leaf_reg=0.01    -0.058
XGBoost_n_estimators=500_learning_rate=0.01                    -0.065
CatBoost_iterations=500_learning_rate=0.01_l2_leaf_reg=0.01    -0.066
RandomForest_n_estimators=500_max_features=3                   -0.068
CatBoost_iterations=1500_learning_rate=0.05_l2_leaf_reg=0.1    -0.069
CatBoost_iterations=500_learning_rate=0.01_l2_leaf_reg=0.1     -0.071
RandomForest_n_estimators=1500_max_features=3                  -0.072
RandomForest_n_estimators=1500_max_features=2                  -0.073
RandomForest_n

In [33]:
roi_table = (
    df_results_roi
    .pivot_table(index='model_base', columns='symbol', values='roi', aggfunc='mean')
    .round(3)           # round to 3 decimals
    .sort_index(axis=0) # sort models alphabetically (optional)
)
roi_table["avg_roi"] = roi_table.mean(axis=1)

from IPython.display import display
display(roi_table.style
    .background_gradient(
        cmap='RdYlGn', 
        axis=None, 
        vmin=-roi_table.abs().max().max(),
        vmax= roi_table.abs().max().max()))

KeyError: 'model_base'