In [1]:
import os
import optuna
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import time
import datetime
import pickle
import ta 
import gc
from tqdm import tqdm

# Root of the project tree. The default is the location used when this notebook was
# written; set the TOKKA_DIR environment variable to run it from another checkout
# without editing every path below.
PROJECT_DIR = os.environ.get('TOKKA_DIR', r'C:\Users\e0817820\Desktop\tokka')
# Directory holding the raw CSV inputs.
RAW_DATA_DIR = os.path.join(PROJECT_DIR, 'data', 'raw')

TRAIN_CSV = os.path.join(RAW_DATA_DIR, 'train.csv')
TRAIN_COMBINE_CSV = os.path.join(RAW_DATA_DIR, 'train_combine.csv')
TEST_CSV = os.path.join(RAW_DATA_DIR, 'test.csv')
ADDITIONAL_TRAIN_CSV = os.path.join(RAW_DATA_DIR, 'add_train.csv')
MODELS_DIR = os.path.join(PROJECT_DIR, 'models')

  from .autonotebook import tqdm as notebook_tqdm
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def get_time_series_cross_val_splits(df, cv = 6, embargo = 3750):
    """Build contiguous, embargoed train/test timestamp folds.

    The unique values of ``df['timestamp']`` (in order of first appearance) are cut
    into ``cv`` equally sized consecutive test blocks; any leftover timestamps are
    appended to the last block. For every test block the training set is made of all
    timestamps that lie outside the block's [min, max] range and are more than
    ``embargo`` minutes away from both ends of that range.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'timestamp'`` column. The values are treated as epoch
        milliseconds: the embargo is converted to milliseconds and the log lines
        divide by 1000 before formatting.
    cv : int
        Number of folds (>= 1).
    embargo : int or float
        Gap, in minutes, kept between every training timestamp and the test block.

    Returns
    -------
    zip
        Single-use iterator of ``(train_split, test_split)`` pairs. ``train_split``
        is a list of timestamps, ``test_split`` an array of timestamps. Wrap it in
        ``list(...)`` if the folds must be iterated more than once.

    Raises
    ------
    ValueError
        If ``cv`` is smaller than 1 or there are fewer unique timestamps than folds.
    """
    if cv < 1:
        raise ValueError(f"cv must be at least 1, got {cv}")

    all_train_timestamps = df['timestamp'].unique()
    len_split = len(all_train_timestamps) // cv
    if len_split == 0:
        raise ValueError(
            f"need at least {cv} unique timestamps to build {cv} folds, "
            f"got {len(all_train_timestamps)}"
        )

    test_splits = [all_train_timestamps[i * len_split : (i + 1) * len_split] for i in range(cv)]
    # fix the last test split to have all the last timestamps, in case the number of timestamps wasn't divisible by cv
    rem = len(all_train_timestamps) - len_split*cv
    if rem>0:
        test_splits[-1] = np.append(test_splits[-1], all_train_timestamps[-rem:])

    def _format_ms(ts_ms):
        # Same text as the deprecated datetime.utcfromtimestamp(...) produced.
        moment = datetime.datetime.fromtimestamp(int(ts_ms/1000), tz=datetime.timezone.utc)
        return moment.strftime('%Y-%m-%d %H:%M:%S')

    embargo_sec = 60*1000*embargo #in milliseconds
    # Integer view of the timestamps, computed once, so every fold is a vectorised
    # mask instead of two Python-level passes over all timestamps.
    timestamps_int = np.asarray(all_train_timestamps).astype(np.int64)

    train_splits = []
    for test_split in test_splits:
        test_split_max = int(np.max(test_split))
        test_split_min = int(np.min(test_split))
        # get all of the timestamps that aren't in the test split
        outside_test = (timestamps_int < test_split_min) | (timestamps_int > test_split_max)
        # ...and that are further than the embargo from both ends of the test split
        clear_of_embargo = (
            (np.abs(timestamps_int - test_split_max) > embargo_sec)
            & (np.abs(timestamps_int - test_split_min) > embargo_sec)
        )
        train_split = list(all_train_timestamps[outside_test & clear_of_embargo])
        train_splits.append(train_split)
        if train_split:
            print(f"Train split: {_format_ms(train_split[0])} - {_format_ms(train_split[-1])}")
        else:
            # The embargo (or a single fold) can swallow every training timestamp;
            # report it instead of failing on train_split[0].
            print("Train split: <empty>")
        print(f"Test split: {_format_ms(test_split[0])} - {_format_ms(test_split[-1])}")
        print(len(train_split), len(test_split))
    train_test_zip = zip(train_splits, test_splits)
    return train_test_zip

In [3]:
def correlation(a, train_data):
    """Pearson correlation between predictions and dataset labels.

    Has the shape of a LightGBM custom evaluation function (it is passed as
    ``feval`` to ``lgb.train`` in this notebook).

    Parameters
    ----------
    a : array-like
        Predicted values; flattened before use.
    train_data : object exposing ``get_label()``
        Source of the true values (e.g. a ``lgb.Dataset``); flattened before use.

    Returns
    -------
    tuple
        ``('corr', value, True)`` - metric name, metric value and a flag saying
        that higher is better. ``value`` is 0.0 when the correlation is undefined
        (no samples, or either vector is constant) instead of NaN.
    """
    b = train_data.get_label()

    a = np.ravel(a)
    b = np.ravel(b)

    len_data = len(a)
    if len_data == 0:
        # Nothing to correlate; avoid the 0/0 below.
        return 'corr', 0.0, True

    mean_a = np.mean(a)
    mean_b = np.mean(b)
    var_a = np.var(a, ddof=0)
    var_b = np.var(b, ddof=0)

    denom = np.sqrt(var_a * var_b)
    if denom == 0:
        # A constant vector has no defined correlation. Returning 0.0 keeps NaN
        # out of the metric log and out of the averaged tuning objective.
        return 'corr', 0.0, True

    cov = np.sum((a - mean_a) * (b - mean_b)) / len_data
    corr = cov / denom

    return 'corr', corr, True

# For CV score calculation
def corr_score(preds, valid_data):
    """Score predictions against the labels held by ``valid_data``.

    Thin wrapper around ``correlation`` used for the cross-validation score.

    Parameters
    ----------
    preds : array-like
        Predicted values.
    valid_data : object exposing ``get_label()``
        Dataset whose labels the predictions are compared with.

    Returns
    -------
    tuple
        ``('corr', value, True)``, where ``value`` is the second element of the
        tuple returned by ``correlation``.
    """
    # The labels are read inside correlation(); no need to fetch them here as well.
    return 'corr', correlation(preds, valid_data)[1], True  # True for maximization
  
def hyperparams_tuning(df, cv_splits, target, n_trials=100):
    """
    Perform hyperparameter tuning for LightGBM using Optuna.

    Every trial trains one model per fold and is scored by the mean of the
    per-fold ``corr_score`` values on the held-out part; the study maximises
    that mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame. Must contain ``target`` and a ``'timestamp'`` column;
        every other column is used as a feature.
    cv_splits : iterable of (train_timestamps, test_timestamps)
        Folds expressed as collections of timestamps. Any iterable is accepted,
        including a one-shot iterator such as a ``zip``.
    target : str
        Name of the label column in ``df``.
    n_trials : int
        Number of Optuna trials to run.

    Returns
    -------
    dict
        ``study.best_params`` - the values of the parameters suggested through
        ``trial`` for the best trial (fixed parameters are not included).

    Raises
    ------
    ValueError
        If ``cv_splits`` yields no folds.
    """
    # Materialise the folds once: a one-shot iterator would be exhausted after the
    # first trial and every later trial would average an empty list.
    cv_splits = list(cv_splits)
    if not cv_splits:
        raise ValueError("cv_splits must contain at least one (train, test) fold")

    # Everything below is identical for all trials, so it is computed a single time
    # instead of once per trial.
    df_target = df[target]
    df_feature = df.drop(columns=[target, 'timestamp'])
    timestamps = df['timestamp']
    fold_masks = [
        (timestamps.isin(train_split), timestamps.isin(test_split))
        for train_split, test_split in cv_splits
    ]

    def objective(trial):
        param = {
            'early_stopping_rounds': 50,
            'objective': 'regression',
            'metric': 'rmse',
            'verbosity': -1,
            'boosting_type': 'gbdt',
            'max_depth': trial.suggest_int('max_depth', 5, 6),
            # suggest_float(..., log=True) / suggest_float(...) replace the deprecated
            # suggest_loguniform / suggest_uniform; the sampled distributions are the same.
            'lambda_l1': trial.suggest_float('lambda_l1', 0.01, 1.0, log=True),
            'lambda_l2': trial.suggest_float('lambda_l2', 0.01, 1.0, log=True),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
            # step must be passed by keyword in current Optuna releases
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 150, step=10),
            # NOTE(review): with max_depth <= 6 a tree presumably cannot exceed
            # 2**6 = 64 leaves, so this range may never be the binding limit - confirm.
            'num_leaves': trial.suggest_int('num_leaves', 100, 256),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
            'max_bin': trial.suggest_int('max_bin', 250, 480),
            'bagging_freq': 1,
            'seed':20,
            'feature_fraction_seed': 20,
            'bagging_fraction_seed': 20,
            'data_random_seed': 20,
            'extra_trees': True,
            'extra_seed': 20,
            'zero_as_missing': True,
            # NOTE(review): early stopping presumably follows 'rmse' (the first
            # metric) rather than the custom 'corr' metric - confirm this is intended.
            "first_metric_only": True
        }
        fold_corrs = []
        for train_split_index, test_split_index in fold_masks:
            gc.collect()

            X_train, y_train = df_feature.loc[train_split_index], df_target.loc[train_split_index].values
            X_test, y_test = df_feature.loc[test_split_index], df_target.loc[test_split_index].values

            # The datasets are rebuilt per trial because max_bin is part of the search space.
            lgb_train = lgb.Dataset(X_train, y_train)
            lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

            gbm = lgb.train(param, lgb_train, valid_sets=[lgb_train, lgb_eval], feval=correlation, num_boost_round=5000, valid_names=['train', 'valid'])

            preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)

            # corr_score returns (name, value, higher_is_better); keep the value only
            fold_corrs.append(corr_score(preds, lgb_eval)[1])

        avg_corr = np.mean(fold_corrs)
        return avg_corr

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)

    return study.best_params

In [4]:
# Symbols to tune, in processing order.
list_cryptocurrencies = 'DOGE LINK XRP BTC ETH'.split()
# symbol -> best hyper-parameters found for it (filled in by the tuning loop)
best_params_dict = dict()

In [5]:
# Tune every symbol independently: load its processed training frame, build the
# embargoed time-series folds, run the Optuna search and store the winning parameters.
for symbol in list_cryptocurrencies:
    print(symbol)
    # Label column of this symbol inside its training frame.
    target = f'log_return_{symbol}'
    # NOTE(review): absolute, machine-specific path - consider deriving it from a
    # configurable base directory.
    df = pd.read_csv(rf"C:\Users\e0817820\Desktop\tokka\data\processed\train_{symbol}2.csv")
    # The splitter returns a single-use zip; list() lets every trial re-iterate the folds.
    cv_splits = list(get_time_series_cross_val_splits(df, cv = 6, embargo = 3750))
    best_params = hyperparams_tuning(df, cv_splits, target, n_trials=100)
    best_params_dict[symbol] = best_params
    print(f"Best params of {symbol}:")
    print(best_params)
    # Ask the garbage collector to run before the next symbol's frame is loaded.
    gc.collect()
# NOTE(review): the results only live in this dict and in the printed output -
# consider persisting them, since the logged timestamps show the studies starting hours apart.
print(best_params_dict)

DOGE
Train split: 2021-11-08 16:28:00 - 2023-08-07 04:40:00
Test split: 2021-07-01 00:00:00 - 2021-11-06 01:57:00
916493 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2021-11-06 01:58:00 - 2022-03-13 21:25:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-03-13 21:26:00 - 2022-07-19 16:53:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-07-19 16:54:00 - 2022-11-24 12:21:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-11-24 12:22:00 - 2023-04-01 09:09:00
912743 184048


[I 2024-02-18 01:33:07,917] A new study created in memory with name: no-name-1b5b391d-e843-4a0a-afe3-d3422e9e0ef1


Train split: 2021-07-01 00:00:00 - 2023-03-29 18:39:00
Test split: 2023-04-01 09:10:00 - 2023-08-07 04:40:00
916490 184051


  'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.01, 1.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.01, 1.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 150, 10),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
[I 2024-02-18 01:35:05,230] Trial 0 finished with value: 0.03676749035457103 and parameters: {'max_depth': 6, 'lambda_l1': 0.02596920037815595, 'lambda_l2': 0.08292873355283197, 'learning_rate': 0.022053270394328976, 'min_data_in_leaf': 70, 'num_leaves': 253, 'feature_fraction': 0.71010436850572, 'bagging_fraction': 0.46138260416481924, 'max_bin': 467}. Best is trial 0 with value: 0.03676749035457103.
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.01, 1.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.01, 1.0),
  'learning_rate': trial.suggest_logun

Best params of DOGE:
{'max_depth': 6, 'lambda_l1': 0.012613191472571958, 'lambda_l2': 0.5199641675050437, 'learning_rate': 0.030385845577366895, 'min_data_in_leaf': 100, 'num_leaves': 105, 'feature_fraction': 0.9749668485159031, 'bagging_fraction': 0.8210404751797926, 'max_bin': 350}
LINK
Train split: 2021-11-08 16:28:00 - 2023-08-07 04:40:00
Test split: 2021-07-01 00:00:00 - 2021-11-06 01:57:00
916493 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2021-11-06 01:58:00 - 2022-03-13 21:25:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-03-13 21:26:00 - 2022-07-19 16:53:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-07-19 16:54:00 - 2022-11-24 12:21:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-11-24 12:22:00 - 2023-04-01 09:09:00
912743 184048


[I 2024-02-18 04:52:28,659] A new study created in memory with name: no-name-3775624b-e2e0-4294-8e4d-12bd1c43ed08


Train split: 2021-07-01 00:00:00 - 2023-03-29 18:39:00
Test split: 2023-04-01 09:10:00 - 2023-08-07 04:40:00
916490 184051


  'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.01, 1.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.01, 1.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 150, 10),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
[I 2024-02-18 04:53:32,608] Trial 0 finished with value: 0.021696246726877016 and parameters: {'max_depth': 6, 'lambda_l1': 0.5502551356008288, 'lambda_l2': 0.026107521186424267, 'learning_rate': 0.048085969187922774, 'min_data_in_leaf': 90, 'num_leaves': 173, 'feature_fraction': 0.5666327219905238, 'bagging_fraction': 0.5118288141422571, 'max_bin': 268}. Best is trial 0 with value: 0.021696246726877016.
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.01, 1.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.01, 1.0),
  'learning_rate': trial.suggest_lo

Best params of LINK:
{'max_depth': 6, 'lambda_l1': 0.5173987630980457, 'lambda_l2': 0.027381207445936542, 'learning_rate': 0.09084149023580701, 'min_data_in_leaf': 60, 'num_leaves': 128, 'feature_fraction': 0.4463586286294867, 'bagging_fraction': 0.46593708758235797, 'max_bin': 275}
XRP
Train split: 2021-11-08 16:28:00 - 2023-08-07 04:40:00
Test split: 2021-07-01 00:00:00 - 2021-11-06 01:57:00
916493 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2021-11-06 01:58:00 - 2022-03-13 21:25:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-03-13 21:26:00 - 2022-07-19 16:53:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-07-19 16:54:00 - 2022-11-24 12:21:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-11-24 12:22:00 - 2023-04-01 09:09:00
912743 184048


[I 2024-02-18 06:33:54,069] A new study created in memory with name: no-name-a43ec763-672e-41fa-9ac2-e008c8f0d0ab


Train split: 2021-07-01 00:00:00 - 2023-03-29 18:39:00
Test split: 2023-04-01 09:10:00 - 2023-08-07 04:40:00
916490 184051


  'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.01, 1.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.01, 1.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 150, 10),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
[I 2024-02-18 06:39:10,285] Trial 0 finished with value: 0.04521549693194604 and parameters: {'max_depth': 5, 'lambda_l1': 0.028455883268395897, 'lambda_l2': 0.6340798637358669, 'learning_rate': 0.01308744284218797, 'min_data_in_leaf': 120, 'num_leaves': 253, 'feature_fraction': 0.4045087291313826, 'bagging_fraction': 0.4578720618347876, 'max_bin': 385}. Best is trial 0 with value: 0.04521549693194604.
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.01, 1.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.01, 1.0),
  'learning_rate': trial.suggest_logu

Best params of XRP:
{'max_depth': 5, 'lambda_l1': 0.011928788862769902, 'lambda_l2': 0.025413437854423886, 'learning_rate': 0.03268535974483644, 'min_data_in_leaf': 140, 'num_leaves': 190, 'feature_fraction': 0.9996309429080054, 'bagging_fraction': 0.8410302292534415, 'max_bin': 434}
BTC
Train split: 2021-11-08 16:28:00 - 2023-08-07 04:40:00
Test split: 2021-07-01 00:00:00 - 2021-11-06 01:57:00
916493 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2021-11-06 01:58:00 - 2022-03-13 21:25:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-03-13 21:26:00 - 2022-07-19 16:53:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-07-19 16:54:00 - 2022-11-24 12:21:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-11-24 12:22:00 - 2023-04-01 09:09:00
912743 184048


[I 2024-02-18 10:58:49,512] A new study created in memory with name: no-name-2fe07540-f381-491e-96fb-9ea1179387a2
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.01, 1.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.01, 1.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 150, 10),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),


Train split: 2021-07-01 00:00:00 - 2023-03-29 18:39:00
Test split: 2023-04-01 09:10:00 - 2023-08-07 04:40:00
916490 184051


[I 2024-02-18 10:59:46,819] Trial 0 finished with value: 0.02448660768041101 and parameters: {'max_depth': 6, 'lambda_l1': 0.01627223227412983, 'lambda_l2': 0.4170466240548365, 'learning_rate': 0.054314209217090306, 'min_data_in_leaf': 70, 'num_leaves': 236, 'feature_fraction': 0.8315205720680633, 'bagging_fraction': 0.6161964233410313, 'max_bin': 457}. Best is trial 0 with value: 0.02448660768041101.
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.01, 1.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.01, 1.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 150, 10),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
[I 2024-02-18 11:01:54,958] Trial 1 finished with value: 0.023619774449177334 and parameters: {'max_depth': 5, 'lambda_l1': 0.6071876003395216, 'lambda_l2': 0.135506769

Best params of BTC:
{'max_depth': 5, 'lambda_l1': 0.012460404348229388, 'lambda_l2': 0.1345586212298735, 'learning_rate': 0.0866960465496276, 'min_data_in_leaf': 90, 'num_leaves': 112, 'feature_fraction': 0.743858615404928, 'bagging_fraction': 0.9994332560062438, 'max_bin': 302}
ETH
Train split: 2021-11-08 16:28:00 - 2023-08-07 04:40:00
Test split: 2021-07-01 00:00:00 - 2021-11-06 01:57:00
916493 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2021-11-06 01:58:00 - 2022-03-13 21:25:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-03-13 21:26:00 - 2022-07-19 16:53:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-07-19 16:54:00 - 2022-11-24 12:21:00
912743 184048
Train split: 2021-07-01 00:00:00 - 2023-08-07 04:40:00
Test split: 2022-11-24 12:22:00 - 2023-04-01 09:09:00
912743 184048


[I 2024-02-18 13:32:44,246] A new study created in memory with name: no-name-d1aa1911-13ec-4245-a5d1-d373beafdc29


Train split: 2021-07-01 00:00:00 - 2023-03-29 18:39:00
Test split: 2023-04-01 09:10:00 - 2023-08-07 04:40:00
916490 184051


  'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.01, 1.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.01, 1.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 150, 10),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
[I 2024-02-18 13:33:32,259] Trial 0 finished with value: 0.02115300940492258 and parameters: {'max_depth': 6, 'lambda_l1': 0.015050029476676567, 'lambda_l2': 0.6268260747048223, 'learning_rate': 0.09454691389703833, 'min_data_in_leaf': 140, 'num_leaves': 198, 'feature_fraction': 0.919345700054067, 'bagging_fraction': 0.7000026797733043, 'max_bin': 313}. Best is trial 0 with value: 0.02115300940492258.
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.01, 1.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.01, 1.0),
  'learning_rate': trial.suggest_logun

Best params of ETH:
{'max_depth': 5, 'lambda_l1': 0.036297743869784425, 'lambda_l2': 0.15349395522004292, 'learning_rate': 0.06504524963427039, 'min_data_in_leaf': 90, 'num_leaves': 195, 'feature_fraction': 0.6936797014619965, 'bagging_fraction': 0.6288167346652295, 'max_bin': 354}
{'DOGE': {'max_depth': 6, 'lambda_l1': 0.012613191472571958, 'lambda_l2': 0.5199641675050437, 'learning_rate': 0.030385845577366895, 'min_data_in_leaf': 100, 'num_leaves': 105, 'feature_fraction': 0.9749668485159031, 'bagging_fraction': 0.8210404751797926, 'max_bin': 350}, 'LINK': {'max_depth': 6, 'lambda_l1': 0.5173987630980457, 'lambda_l2': 0.027381207445936542, 'learning_rate': 0.09084149023580701, 'min_data_in_leaf': 60, 'num_leaves': 128, 'feature_fraction': 0.4463586286294867, 'bagging_fraction': 0.46593708758235797, 'max_bin': 275}, 'XRP': {'max_depth': 5, 'lambda_l1': 0.011928788862769902, 'lambda_l2': 0.025413437854423886, 'learning_rate': 0.03268535974483644, 'min_data_in_leaf': 140, 'num_leaves': 