In [1]:
import lightgbm as lgbm
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, RandomizedSearchCV
import pandas as pd
import import_ipynb
from config import *
from sklearn.metrics import accuracy_score
import optuna
from sklearn.preprocessing import LabelEncoder
from scipy.stats import uniform, randint

importing Jupyter notebook from config.ipynb


In [2]:
# Module-level, time-ordered splitter (2 folds) shared by train_model and train_model_wrong.
tss = TimeSeriesSplit(n_splits=2)

In [3]:
def merge_currencies(dictionary):
    """Merge all per-currency dataframes into one dataframe for model training.

    Each frame is tagged with a ``currency`` column holding its key in
    ``dictionary``; the tagged frames are stacked and the result is sorted
    by index.

    Parameters
    ----------
    dictionary : mapping of currency key -> pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        All rows of all frames plus the ``currency`` column, sorted by index.
        An empty ``DataFrame`` when ``dictionary`` has no entries.

    Notes
    -----
    The input frames are not modified: the ``currency`` column is added to
    copies (``DataFrame.assign``), so calling this function has no side
    effects on the caller's data.
    """
    # Tag copies instead of writing the column into the caller's frames.
    tagged_frames = [
        dictionary[currency].assign(currency=currency)
        for currency in dictionary
    ]

    # pd.concat raises on an empty list; an empty mapping yields an empty frame.
    if not tagged_frames:
        return pd.DataFrame()

    # Concatenate once rather than growing a frame inside the loop, which
    # re-copies every previously merged row on each iteration.
    return pd.concat(tagged_frames).sort_index()

In [4]:
def train_model(df, label_name):
    """Fit a default LightGBM classifier on the last fold of the shared splitter.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``label_name`` and every column in ``train_feature_list``.
    label_name : str
        Name of the target column; it is cast to ``int``.

    Returns
    -------
    dict
        Keys ``X_train``, ``X_test``, ``y_train``, ``y_test``, ``model``,
        ``y_pred_test`` and ``y_pred_train``. The two prediction entries are
        fresh DataFrames, so they carry a default index rather than the index
        of the corresponding feature frame.
    """
    # define features and target variable
    target = df[label_name].astype('int')
    features = df[train_feature_list]

    # tokenize currencies before training
    # (valid_currencies comes from config; presumably a name -> token mapping - confirm)
    features = features.replace({"currency": valid_currencies})

    # Only the final fold yielded by the module-level splitter is used.
    train_index, test_index = list(tss.split(features))[-1]
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]

    clf = lgbm.LGBMClassifier()
    clf.fit(X_train, y_train)
    test_predictions = clf.predict(X_test)
    train_predictions = clf.predict(X_train)

    return {
        # Train dataset features
        "X_train": X_train,
        # Test dataset features
        "X_test": X_test,
        # Train dataset true labels
        "y_train": y_train,
        # Test dataset true labels
        "y_test": y_test,
        # Model itself
        "model": clf,
        # Test dataset predicted labels
        "y_pred_test": pd.DataFrame(test_predictions),
        # Train dataset predicted labels (to check for overfitting)
        "y_pred_train": pd.DataFrame(train_predictions),
    }

In [5]:
def apply_training(df):
    """Run ``train_model`` once per labeling method.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every label column listed below, plus whatever
        ``train_model`` itself requires.

    Returns
    -------
    dict
        Labeling-method name -> the dict returned by ``train_model`` for the
        matching label column, in the order listed below.
    """
    # (result key, label column in df) - models are trained in this order.
    method_to_label_column = (
        ("excess_over_mean", "excess_over_mean"),
        ("excess_over_median", "excess_over_median"),
        ("fixed_time_horizon", "fth_label"),
        ("triple_barrier", "tbm_label"),
        ("trend_scanning", "trend"),
        ("tail_sets", "tail_sets"),
        ("matrix_flag", "matrix_flag"),
    )
    return {
        method: train_model(df, label_column)
        for method, label_column in method_to_label_column
    }

In [6]:
def train_model_with_parameter_tuning(df, label_name):
    """Tune a LightGBM classifier with randomized search over time-series folds.

    For each of 5 ``TimeSeriesSplit`` folds a ``RandomizedSearchCV`` (10
    candidates, 3 inner CV folds) is run on that fold's training window. The
    candidate with the highest inner CV score across all folds is kept, refit
    on the training window of the last usable fold, and evaluated on that
    fold's test window.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``label_name`` and every column in ``train_feature_list``.
    label_name : str
        Name of the target column; it is cast to ``int``.

    Returns
    -------
    dict
        Keys ``X_train``, ``X_test``, ``y_train``, ``y_test``, ``model``,
        ``y_pred_test`` and ``best_params``. The dict is empty when no fold
        produced a usable model (every fold was skipped, or every search
        score was NaN).
    """
    model_dict = {}

    # Define features and target variable
    y = df[label_name].astype('int')
    X = df[train_feature_list]

    # Tokenize currencies before training
    # (valid_currencies comes from config; presumably a name -> token mapping - confirm)
    X = X.replace({"currency": valid_currencies})

    # Distinct labels over the whole dataset; used to size the multiclass objective
    unique_labels = y.unique()

    # Outer time-ordered folds (local splitter; the module-level one is not used here)
    outer_cv = TimeSeriesSplit(n_splits=5)

    # Define hyperparameter search space
    param_dist = {
        'boosting_type': ['gbdt', 'dart'],
        'num_leaves': randint(31, 51),
        'learning_rate': uniform(0.01, 0.09),
        'n_estimators': randint(100, 300),
        'max_depth': randint(-1, 10)
    }

    # -inf rather than 0 so that a best score of exactly 0 still yields a model
    # instead of silently returning an empty dict.
    best_score = float("-inf")
    best_clf = None
    best_params = None
    # (X_train, X_test, y_train, y_test) of the most recent fold that passed
    # validation; the final refit/evaluation uses this fold explicitly instead
    # of whatever variables the last loop iteration happened to leave behind.
    last_valid_fold = None

    for train_index, test_index in outer_cv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Keep only test rows whose label occurs in this fold's training
        # window. (The training window itself needs no filtering: every
        # training label is trivially present in the training window.)
        label_seen_in_train = y_test.isin(y_train.unique())
        X_test_filtered = X_test[label_seen_in_train]
        y_test_filtered = y_test[label_seen_in_train]

        # Skip folds that leave nothing to train on or to evaluate against
        if X_train.empty or X_test_filtered.empty:
            continue

        last_valid_fold = (X_train, X_test_filtered, y_train, y_test_filtered)

        # Initialize the classifier
        clf = LGBMClassifier(objective='multiclass', num_class=len(unique_labels))

        # Perform hyperparameter tuning limited to 10 runs
        # NOTE(review): cv=3 is scikit-learn's default (stratified) k-fold,
        # which does not respect time ordering inside the training window -
        # confirm this is intended for time-series data.
        random_search = RandomizedSearchCV(
            estimator=clf,
            param_distributions=param_dist,
            n_iter=10,
            cv=3,
            n_jobs=-1,
            verbose=2,
            random_state=42,
        )
        random_search.fit(X_train, y_train)

        # Keep the best candidate seen across all outer folds
        if random_search.best_score_ > best_score:
            best_score = random_search.best_score_
            best_clf = random_search.best_estimator_
            best_params = random_search.best_params_

    if best_clf is not None:
        # best_clf is only set after a fold passed validation, so
        # last_valid_fold is guaranteed to be populated here.
        X_train_final, X_test_final, y_train_final, y_test_final = last_valid_fold

        # Refit the winning configuration on the last usable training window
        best_clf.fit(X_train_final, y_train_final)

        # Make predictions on the matching test window
        y_pred_test = best_clf.predict(X_test_final)

        # Train dataset features
        model_dict["X_train"] = X_train_final
        # Test dataset features
        model_dict["X_test"] = X_test_final
        # Train dataset true labels
        model_dict["y_train"] = y_train_final
        # Test dataset true labels
        model_dict["y_test"] = y_test_final
        # Best model after hyperparameter tuning
        model_dict["model"] = best_clf
        # Test dataset predicted labels
        model_dict["y_pred_test"] = pd.DataFrame(y_pred_test)
        # Best parameters
        model_dict["best_params"] = best_params

    return model_dict

In [7]:
def train_model_wrong(df, label_name):
    """Fit a default LightGBM classifier on the concatenation of ALL folds.

    Kept as the deliberately flawed counterpart of ``train_model`` (hence the
    name): the train parts of every fold are stacked into one training set and
    the test parts into one test set. With ``TimeSeriesSplit`` later training
    windows contain earlier ones, so rows are repeated in the training set and
    rows of an earlier test window also appear in the training set.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``label_name`` and every column in ``train_feature_list``.
    label_name : str
        Name of the target column; it is cast to ``int``.

    Returns
    -------
    dict
        Keys ``X_train``, ``X_test``, ``y_train``, ``y_test``, ``model``,
        ``y_pred_test`` and ``y_pred_train``.
    """
    # define features and target variable
    target = df[label_name].astype('int')
    features = df[train_feature_list]

    # tokenize currencies before training
    features = features.replace({"currency": valid_currencies})

    # Materialise every (train_index, test_index) pair of the shared splitter
    folds = list(tss.split(features))

    # Concatenate the per-fold slices to create final training and testing sets
    X_train_final = pd.concat([features.iloc[train_idx, :] for train_idx, _ in folds])
    X_test_final = pd.concat([features.iloc[test_idx, :] for _, test_idx in folds])
    y_train_final = pd.concat([target.iloc[train_idx] for train_idx, _ in folds])
    y_test_final = pd.concat([target.iloc[test_idx] for _, test_idx in folds])

    clf = lgbm.LGBMClassifier()
    clf.fit(X_train_final, y_train_final)
    test_predictions = clf.predict(X_test_final)
    train_predictions = clf.predict(X_train_final)

    return {
        # Train dataset features
        "X_train": X_train_final,
        # Test dataset features
        "X_test": X_test_final,
        # Train dataset true labels
        "y_train": y_train_final,
        # Test dataset true labels
        "y_test": y_test_final,
        # Model itself
        "model": clf,
        # Test dataset predicted labels
        "y_pred_test": pd.DataFrame(test_predictions),
        # Train dataset predicted labels (to check for overfitting)
        "y_pred_train": pd.DataFrame(train_predictions),
    }