In [1]:
import os
import optuna
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import time
import datetime
import pickle
import ta 
import gc
from tqdm import tqdm

TRAIN_CSV = r'C:\Users\e0817820\Desktop\tokka\data\raw\train.csv'
TRAIN_COMBINE_CSV = r'C:\Users\e0817820\Desktop\tokka\data\raw\train_combine.csv'
TEST_CSV = r'C:\Users\e0817820\Desktop\tokka\data\raw\test.csv'
ADDITIONAL_TRAIN_CSV = r'C:\Users\e0817820\Desktop\tokka\data\raw\add_train.csv'
MODELS_DIR = r'C:\Users\e0817820\Desktop\tokka\models'

  from .autonotebook import tqdm as notebook_tqdm
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def get_time_series_cross_val_splits(df, cv = 6, embargo = 3750):
    all_train_timestamps = df['timestamp'].unique()
    len_split = len(all_train_timestamps) // cv
    test_splits = [all_train_timestamps[i * len_split : (i + 1) * len_split] for i in range(cv)]
    # fix the last test split to have all the last timestamps, in case the number of timestamps wasn't divisible by cv
    rem = len(all_train_timestamps) - len_split*cv
    if rem>0:
        test_splits[-1] = np.append(test_splits[-1], all_train_timestamps[-rem:])

    train_splits = []
    for test_split in test_splits:
        test_split_max = int(np.max(test_split))
        test_split_min = int(np.min(test_split))
        # get all of the timestamps that aren't in the test split
        train_split_not_embargoed = [e for e in all_train_timestamps if not (test_split_min <= int(e) <= test_split_max)]
        embargo_sec = 60*1000*embargo #in milliseconds
        train_split = [e for e in train_split_not_embargoed if
                       abs(int(e) - test_split_max) > embargo_sec and abs(int(e) - test_split_min) > embargo_sec]
        train_splits.append(train_split)
        print(f"Train split: {datetime.datetime.utcfromtimestamp(int(train_split[0]/1000)).strftime('%Y-%m-%d %H:%M:%S')} - {datetime.datetime.utcfromtimestamp(int(train_split[-1]/1000)).strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"Test split: {datetime.datetime.utcfromtimestamp(int(test_split[0]/1000)).strftime('%Y-%m-%d %H:%M:%S')} - {datetime.datetime.utcfromtimestamp(int(test_split[-1]/1000)).strftime('%Y-%m-%d %H:%M:%S')}")
        print(len(train_split), len(test_split))
    train_test_zip = zip(train_splits, test_splits)
    return train_test_zip

In [3]:
def correlation(a, train_data):
    b = train_data.get_label()
    
    a = np.ravel(a)
    b = np.ravel(b)

    len_data = len(a)
    mean_a = np.mean(a)
    mean_b = np.mean(b)
    var_a = np.var(a, ddof=0)  
    var_b = np.var(b, ddof=0)  
    
    cov = np.sum((a - mean_a) * (b - mean_b)) / len_data
    corr = cov / np.sqrt(var_a * var_b)

    return 'corr', corr, True

# For CV score calculation
def corr_score(preds, valid_data):
    valid = valid_data.get_label()
    return 'corr', correlation(preds, valid_data)[1], True  # True for maximization
  
def hyperparams_tuning(df, cv_splits, target, n_trials=100):
    """
    Perform hyperparameter tuning for LightGBM using Optuna.
    """

    def objective(trial):
        param = {
            'early_stopping_rounds': 50,
            'objective': 'regression',
            'metric': 'rmse',
            'verbosity': -1,
            'boosting_type': 'gbdt', 
            'max_depth': trial.suggest_int('max_depth', 5, 6),
            'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.01, 1.0),
            'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.01, 1.0),
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 150, 10),
            'num_leaves': trial.suggest_int('num_leaves', 100, 256),
            'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
            'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
            'max_bin': trial.suggest_int('max_bin', 250, 480),
            'bagging_freq': 1,
            'seed':20,
            'feature_fraction_seed': 20,
            'bagging_fraction_seed': 20,
            'data_random_seed': 20,
            'extra_trees': True,
            'extra_seed': 20,
            'zero_as_missing': True,
            "first_metric_only": True
        }
        corr_list = []
        df_target = df[target]
        df_feature = df.drop(columns=[target, 'timestamp'])
        for train_split, test_split in cv_splits:
            gc.collect()
            train_split_index = df['timestamp'].isin(train_split)
            test_split_index = df['timestamp'].isin(test_split)
            
            X_train, y_train = df_feature.loc[train_split_index], df_target.loc[train_split_index].values
            X_test, y_test = df_feature.loc[test_split_index], df_target.loc[test_split_index].values
            
            lgb_train = lgb.Dataset(X_train, y_train)
            lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
            
            gbm = lgb.train(param, lgb_train, valid_sets=[lgb_train, lgb_eval], feval=correlation, num_boost_round=5000, valid_names=['train', 'valid'])
            
            preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

            corr = corr_score(preds, lgb_eval)
            corr_list.append(corr)

        corrs = [x[1] for x in corr_list]
        avg_corr = np.mean(corrs)
        return avg_corr

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)

    return study.best_params

In [4]:
list_cryptocurrencies = ['DOGE', 'LINK', 'XRP', 'BTC', 'ETH']
best_params_dict = {}

In [5]:
for symbol in list_cryptocurrencies:
    print(symbol)
    target = f'log_return_{symbol}'
    df = pd.read_csv(rf"C:\Users\e0817820\Desktop\tokka\data\processed\train_{symbol}.csv")
    cv_splits = list(get_time_series_cross_val_splits(df, cv = 6, embargo = 3750))
    best_params = hyperparams_tuning(df, cv_splits, target, n_trials=100)
    best_params_dict[symbol] = best_params
    print(f"Best params of {symbol}:")
    print(best_params)
    gc.collect()
print(best_params_dict)

DOGE
Train split: 2021-11-08 16:28:00 - 2023-08-07 04:40:00
Test split: 2021-07-01 00:00:00 - 2021-11-06 01:57:00
916493 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2021-11-06 01:58:00 - 2022-03-13 21:25:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-03-13 21:26:00 - 2022-07-19 16:53:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-07-19 16:54:00 - 2022-11-24 12:21:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-11-24 12:22:00 - 2023-04-01 09:09:00
912743 184048


[I 2024-02-16 20:54:10,778] A new study created in memory with name: no-name-a9555c1d-088e-4596-9204-e849d3371c00


Train split: 2021-07-01 00:00:00 - 2023-03-29 18:39:00
Test split: 2023-04-01 09:10:00 - 2023-08-07 04:40:00
916490 184051


  'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.01, 1.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.01, 1.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 150, 10),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
[I 2024-02-16 20:55:47,180] Trial 0 finished with value: 0.04395713907087311 and parameters: {'max_depth': 5, 'lambda_l1': 0.034993430207757206, 'lambda_l2': 0.7472388391444793, 'learning_rate': 0.06230783463376143, 'min_data_in_leaf': 140, 'num_leaves': 122, 'feature_fraction': 0.4488323435294409, 'bagging_fraction': 0.4740103446587288, 'max_bin': 301}. Best is trial 0 with value: 0.04395713907087311.
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.01, 1.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.01, 1.0),
  'learning_rate': trial.suggest_logu

Best params of DOGE:
{'max_depth': 5, 'lambda_l1': 0.07812018043517426, 'lambda_l2': 0.030273115819440092, 'learning_rate': 0.08306620265032358, 'min_data_in_leaf': 90, 'num_leaves': 128, 'feature_fraction': 0.643750094314933, 'bagging_fraction': 0.7630037180728901, 'max_bin': 434}
LINK
Train split: 2021-11-08 16:28:00 - 2023-08-07 04:40:00
Test split: 2021-07-01 00:00:00 - 2021-11-06 01:57:00
916493 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2021-11-06 01:58:00 - 2022-03-13 21:25:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-03-13 21:26:00 - 2022-07-19 16:53:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-07-19 16:54:00 - 2022-11-24 12:21:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-11-24 12:22:00 - 2023-04-01 09:09:00
912743 184048


[I 2024-02-16 23:51:06,938] A new study created in memory with name: no-name-254e841a-869e-40a7-854e-df79e13435f8


Train split: 2021-07-01 00:00:00 - 2023-03-29 18:39:00
Test split: 2023-04-01 09:10:00 - 2023-08-07 04:40:00
916490 184051


  'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.01, 1.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.01, 1.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 150, 10),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
[I 2024-02-16 23:52:10,192] Trial 0 finished with value: 0.01976787888999154 and parameters: {'max_depth': 6, 'lambda_l1': 0.12837375105101972, 'lambda_l2': 0.8728927222479248, 'learning_rate': 0.038753872122220416, 'min_data_in_leaf': 120, 'num_leaves': 183, 'feature_fraction': 0.6826714579651523, 'bagging_fraction': 0.9974365112811806, 'max_bin': 332}. Best is trial 0 with value: 0.01976787888999154.
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.01, 1.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.01, 1.0),
  'learning_rate': trial.suggest_logu

Best params of LINK:
{'max_depth': 5, 'lambda_l1': 0.1150885317549707, 'lambda_l2': 0.08146400204826076, 'learning_rate': 0.03791649407078434, 'min_data_in_leaf': 80, 'num_leaves': 208, 'feature_fraction': 0.750233760839524, 'bagging_fraction': 0.45303376818950125, 'max_bin': 399}
XRP
Train split: 2021-11-08 16:28:00 - 2023-08-07 04:40:00
Test split: 2021-07-01 00:00:00 - 2021-11-06 01:57:00
916493 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2021-11-06 01:58:00 - 2022-03-13 21:25:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-03-13 21:26:00 - 2022-07-19 16:53:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-07-19 16:54:00 - 2022-11-24 12:21:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-11-24 12:22:00 - 2023-04-01 09:09:00
912743 184048


[I 2024-02-17 01:43:05,576] A new study created in memory with name: no-name-37697a29-e302-47af-9c7b-f7d8f2e973eb


Train split: 2021-07-01 00:00:00 - 2023-03-29 18:39:00
Test split: 2023-04-01 09:10:00 - 2023-08-07 04:40:00
916490 184051


  'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.01, 1.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.01, 1.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 150, 10),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
[I 2024-02-17 01:46:04,458] Trial 0 finished with value: 0.03547688248351036 and parameters: {'max_depth': 5, 'lambda_l1': 0.030201657855787392, 'lambda_l2': 0.05100666201689193, 'learning_rate': 0.012082467523127471, 'min_data_in_leaf': 90, 'num_leaves': 226, 'feature_fraction': 0.6800678558445384, 'bagging_fraction': 0.6747622514279188, 'max_bin': 266}. Best is trial 0 with value: 0.03547688248351036.
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.01, 1.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.01, 1.0),
  'learning_rate': trial.suggest_log

Best params of XRP:
{'max_depth': 6, 'lambda_l1': 0.10571060935748575, 'lambda_l2': 0.8057717531168188, 'learning_rate': 0.06482308832837806, 'min_data_in_leaf': 60, 'num_leaves': 255, 'feature_fraction': 0.8806959152115921, 'bagging_fraction': 0.6109425733605472, 'max_bin': 398}
BTC
Train split: 2021-11-08 16:28:00 - 2023-08-07 04:40:00
Test split: 2021-07-01 00:00:00 - 2021-11-06 01:57:00
916493 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2021-11-06 01:58:00 - 2022-03-13 21:25:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-03-13 21:26:00 - 2022-07-19 16:53:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-07-19 16:54:00 - 2022-11-24 12:21:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-11-24 12:22:00 - 2023-04-01 09:09:00
912743 184048


[I 2024-02-17 05:07:44,395] A new study created in memory with name: no-name-49029331-402e-498a-8185-03f62f5c0cf1


Train split: 2021-07-01 00:00:00 - 2023-03-29 18:39:00
Test split: 2023-04-01 09:10:00 - 2023-08-07 04:40:00
916490 184051


  'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.01, 1.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.01, 1.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 150, 10),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
[I 2024-02-17 05:08:40,565] Trial 0 finished with value: 0.022297574624455325 and parameters: {'max_depth': 6, 'lambda_l1': 0.5054320953742202, 'lambda_l2': 0.09557747499084557, 'learning_rate': 0.08060543307712899, 'min_data_in_leaf': 120, 'num_leaves': 230, 'feature_fraction': 0.5481551040780215, 'bagging_fraction': 0.5659172297057853, 'max_bin': 251}. Best is trial 0 with value: 0.022297574624455325.
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.01, 1.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.01, 1.0),
  'learning_rate': trial.suggest_log

Best params of BTC:
{'max_depth': 6, 'lambda_l1': 0.04555386041688721, 'lambda_l2': 0.01649304174882811, 'learning_rate': 0.06038777256487618, 'min_data_in_leaf': 50, 'num_leaves': 225, 'feature_fraction': 0.8942882838732303, 'bagging_fraction': 0.9660591641398384, 'max_bin': 289}
ETH
Train split: 2021-11-08 16:28:00 - 2023-08-07 04:40:00
Test split: 2021-07-01 00:00:00 - 2021-11-06 01:57:00
916493 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2021-11-06 01:58:00 - 2022-03-13 21:25:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-03-13 21:26:00 - 2022-07-19 16:53:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-07-19 16:54:00 - 2022-11-24 12:21:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-11-24 12:22:00 - 2023-04-01 09:09:00
912743 184048


[I 2024-02-17 07:39:11,126] A new study created in memory with name: no-name-c30f0779-153e-4137-a82d-ed866ac3ed2f


Train split: 2021-07-01 00:00:00 - 2023-03-29 18:39:00
Test split: 2023-04-01 09:10:00 - 2023-08-07 04:40:00
916490 184051


  'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.01, 1.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.01, 1.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 150, 10),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
[I 2024-02-17 07:41:03,776] Trial 0 finished with value: 0.01813733601448893 and parameters: {'max_depth': 5, 'lambda_l1': 0.15124221346512906, 'lambda_l2': 0.0867191198531979, 'learning_rate': 0.012709528241347169, 'min_data_in_leaf': 70, 'num_leaves': 166, 'feature_fraction': 0.6551424279952127, 'bagging_fraction': 0.6492498512329897, 'max_bin': 433}. Best is trial 0 with value: 0.01813733601448893.
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.01, 1.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.01, 1.0),
  'learning_rate': trial.suggest_logun

Best params of ETH:
{'max_depth': 6, 'lambda_l1': 0.02472249902241368, 'lambda_l2': 0.028143205949098634, 'learning_rate': 0.03390620284735132, 'min_data_in_leaf': 60, 'num_leaves': 121, 'feature_fraction': 0.9453740331948475, 'bagging_fraction': 0.8478246243307246, 'max_bin': 414}
{'DOGE': {'max_depth': 5, 'lambda_l1': 0.07812018043517426, 'lambda_l2': 0.030273115819440092, 'learning_rate': 0.08306620265032358, 'min_data_in_leaf': 90, 'num_leaves': 128, 'feature_fraction': 0.643750094314933, 'bagging_fraction': 0.7630037180728901, 'max_bin': 434}, 'LINK': {'max_depth': 5, 'lambda_l1': 0.1150885317549707, 'lambda_l2': 0.08146400204826076, 'learning_rate': 0.03791649407078434, 'min_data_in_leaf': 80, 'num_leaves': 208, 'feature_fraction': 0.750233760839524, 'bagging_fraction': 0.45303376818950125, 'max_bin': 399}, 'XRP': {'max_depth': 6, 'lambda_l1': 0.10571060935748575, 'lambda_l2': 0.8057717531168188, 'learning_rate': 0.06482308832837806, 'min_data_in_leaf': 60, 'num_leaves': 255, 'fe