In [1]:
import lightgbm as lgbm
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, RandomizedSearchCV, ParameterSampler
import pandas as pd
import import_ipynb
from config import *
from sklearn.metrics import accuracy_score
import optuna
from sklearn.preprocessing import LabelEncoder
from scipy.stats import uniform, randint
import warnings
import numpy as np

importing Jupyter notebook from config.ipynb


In [2]:
# Module-level time-series splitter with 2 splits, used so that rows keep their
# chronological order when the data is divided into train/test parts.
# NOTE(review): each training function in the cells below creates its own local
# `tss`, so this global does not appear to be used in the cells shown here —
# confirm before removing.
tss = TimeSeriesSplit(n_splits = 2)

In [3]:
def merge_currencies(dictionary):
    """
    Merge several per-currency DataFrames into one DataFrame sorted by index.

    Every frame is tagged with a 'currency' column holding its dictionary key.
    The tag is added to a copy (``DataFrame.assign``), so the DataFrames stored
    in ``dictionary`` are not modified, and all frames are concatenated with a
    single ``pd.concat`` call instead of re-copying the growing result on every
    loop iteration.

    Parameters:
    dictionary (dict): A dictionary where keys are currency names and values are DataFrames containing data for each currency.

    Returns:
    DataFrame: A single merged DataFrame containing data from all currencies with an additional 'currency' column,
               sorted by index. An empty DataFrame is returned when `dictionary` is empty.
    """

    # Tag a copy of each frame with its source currency (inputs stay untouched)
    tagged_frames = [
        frame.assign(currency=currency)
        for currency, frame in dictionary.items()
    ]

    # pd.concat raises ValueError on an empty list, so handle "nothing to merge" explicitly
    if not tagged_frames:
        return pd.DataFrame()

    # Concatenate once, then sort the merged DataFrame by index
    return pd.concat(tagged_frames).sort_index()

In [4]:
def train_model(df, label_name, train_feature_list, valid_currencies):
    """
    Fit a default LightGBM classifier on the last fold of a 2-split TimeSeriesSplit.

    The splitter is iterated to completion and only the final (train, test)
    index pair is used for fitting and prediction.

    NOTE: a later cell in this notebook defines another function with the same
    name; when the notebook is run top to bottom, that later definition
    replaces this one.

    Parameters:
    df (DataFrame): The DataFrame containing the features and the target variable.
    label_name (str): Name of the column in `df` used as the target; it is cast to int.
    train_feature_list (list): Names of the feature columns used for training.
    valid_currencies (dict): Mapping applied to the values of the 'currency' feature column
                             (currency name -> integer token).

    Returns:
    dict: Keys "X_train", "X_test", "y_train", "y_test", "model", "y_pred_test" and
          "y_pred_train". The two prediction entries are DataFrames built from the
          predicted label arrays.
    """

    # Target as integers; features with the 'currency' column tokenized
    target = df[label_name].astype('int')
    features = df[train_feature_list].replace({"currency": valid_currencies})

    # Exhaust the time-series splitter and keep only its last index pair
    splitter = TimeSeriesSplit(n_splits=2)
    *_, (train_index, test_index) = splitter.split(features)

    X_train, X_test = features.iloc[train_index, :], features.iloc[test_index, :]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]

    # Default-parameter LightGBM classifier
    clf = lgbm.LGBMClassifier()
    clf.fit(X_train, y_train)

    # Predictions on both parts; the train predictions allow an overfitting check
    y_pred_test = clf.predict(X_test)
    y_pred_train = clf.predict(X_train)

    return {
        "X_train": X_train,                            # Train dataset features
        "X_test": X_test,                              # Test dataset features
        "y_train": y_train,                            # Train dataset true labels
        "y_test": y_test,                              # Test dataset true labels
        "model": clf,                                  # Trained model
        "y_pred_test": pd.DataFrame(y_pred_test),      # Test dataset predicted labels
        "y_pred_train": pd.DataFrame(y_pred_train),    # Train dataset predicted labels
    }

In [5]:
def apply_training(df):
    """
    Run `train_model` once per labeling method and collect the results.

    `train_feature_list` and `valid_currencies` are read from the notebook's
    global namespace (they are not defined in the cells shown here; presumably
    they come from `from config import *` — verify).

    Parameters:
    df (DataFrame): The DataFrame containing the features and one label column per labeling method.

    Returns:
    dict: Maps each labeling method name to the dictionary returned by `train_model`
          for the corresponding label column.
    """

    # Labeling method name -> name of its label column in `df`
    label_columns = {
        "excess_over_mean": "excess_over_mean",
        "excess_over_median": "excess_over_median",
        "fixed_time_horizon": "fth_label",
        "triple_barrier": "tbm_label",
        "trend_scanning": "trend",
        "tail_sets": "tail_sets",
        "matrix_flag": "matrix_flag",
        "next_period": "next_period",
    }

    # Train one model per labeling method, in the order listed above
    return {
        method: train_model(df, column, train_feature_list, valid_currencies)
        for method, column in label_columns.items()
    }

In [6]:
def train_model_with_parameter_tuning(df, label_name, train_feature_list, valid_currencies):
    """
    Train a LightGBM classifier with randomized hyperparameter search on time-ordered data.

    For every fold of a 5-split TimeSeriesSplit, a 10-candidate RandomizedSearchCV is run on
    the fold's training part. The search with the highest `best_score_` across folds provides
    the winning estimator and parameters. That estimator is then refit on the training part of
    the LAST fold and used to predict the last fold's train and test parts.

    The inner cross-validation of the search is a TimeSeriesSplit as well, so candidates are
    never scored on rows that precede their training rows (the previous integer `cv=3` used a
    non-chronological K-fold split).

    Parameters:
    df (DataFrame): The DataFrame containing the features and target variable.
    label_name (str): Name of the column in `df` used as the target; it is cast to int.
    train_feature_list (list): Names of the feature columns used for training.
    valid_currencies (dict): Mapping applied to the values of the 'currency' feature column
                             (currency name -> integer token).

    Returns:
    dict: Keys "X_train", "X_test", "y_train", "y_test", "model", "y_pred_test",
          "y_pred_train" and "best_params". The dictionary is empty when no fold produced
          a usable search result.
    """

    model_dict = {}

    # Define features and target variable
    y = df[label_name].astype('int')
    X = df[train_feature_list]

    # Tokenize currencies before training
    X = X.replace({"currency": valid_currencies})

    # Distinct labels of the whole dataset; their count is passed on as `num_class`
    unique_labels = y.unique()

    # Outer chronological folds, and a chronological inner CV for the search itself
    tss = TimeSeriesSplit(n_splits=5)
    inner_cv = TimeSeriesSplit(n_splits=3)

    # Define hyperparameter search space
    param_dist = {
        'boosting_type': ['gbdt', 'dart'],
        'num_leaves': randint(31, 51),
        'learning_rate': uniform(0.01, 0.09),
        'n_estimators': randint(100, 300),
        'max_depth': randint(-1, 10)
    }

    # -inf (not 0) so that any finite search score can be selected
    best_score = float("-inf")
    best_clf = None
    best_params = None
    last_fold = None

    for train_index, test_index in tss.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Keep only test rows whose label also occurs in this fold's training part
        label_seen_in_train = y_test.isin(y_train.unique())
        X_test, y_test = X_test[label_seen_in_train], y_test[label_seen_in_train]

        # Remember the most recent fold explicitly; it is used for the final fit below
        last_fold = (X_train, X_test, y_train, y_test)

        # Nothing to search on or to evaluate against in this fold
        if X_train.empty or X_test.empty:
            continue

        # Initialize the classifier
        clf = LGBMClassifier(objective='multiclass', num_class=len(unique_labels))

        # Perform hyperparameter tuning limited to 10 runs
        random_search = RandomizedSearchCV(
            estimator=clf,
            param_distributions=param_dist,
            n_iter=10,
            cv=inner_cv,
            n_jobs=-1,
            verbose=2,
            random_state=42,
        )
        random_search.fit(X_train, y_train)

        # Keep the best search seen so far
        if random_search.best_score_ > best_score:
            best_score = random_search.best_score_
            best_clf = random_search.best_estimator_
            best_params = random_search.best_params_

    if best_clf is not None:
        X_train, X_test, y_train, y_test = last_fold

        # Refit the winning estimator on the training part of the last fold
        best_clf.fit(X_train, y_train)

        # Predict the labels for the test and train datasets
        y_pred_test = best_clf.predict(X_test)
        y_pred_train = best_clf.predict(X_train)

        # Store the results in the model dictionary
        model_dict["X_train"] = X_train  # Train dataset features
        model_dict["X_test"] = X_test  # Test dataset features
        model_dict["y_train"] = y_train  # Train dataset true labels
        model_dict["y_test"] = y_test  # Test dataset true labels
        model_dict["model"] = best_clf  # Best model after hyperparameter tuning
        model_dict["y_pred_test"] = pd.DataFrame(y_pred_test)  # Test dataset predicted labels
        model_dict["y_pred_train"] = pd.DataFrame(y_pred_train)  # Train dataset predicted labels (to check for overfitting)
        model_dict["best_params"] = best_params  # Best parameters

    return model_dict

In [7]:
def train_model(df, label_name, train_feature_list, valid_currencies):
    """
    Train a LightGBM classifier with a small random hyperparameter search over time-series folds.

    Ten parameter combinations are sampled from a fixed grid. Each combination is scored by its
    mean accuracy over the folds of a 2-split TimeSeriesSplit. A fresh classifier with the
    best-scoring parameters is then fit on the training part of the last fold and used to
    predict the last fold's train and test parts.

    Python warnings are suppressed only while this function runs; the caller's warning filters
    are restored on exit.

    NOTE: this function has the same name as one defined in an earlier cell of this notebook
    and replaces it when the notebook is run top to bottom.
    NOTE(review): the last fold's test part also takes part in scoring the candidates, so the
    returned test predictions are not made on data unseen during model selection.

    Parameters:
    df (DataFrame): The DataFrame containing the features and the target variable.
    label_name (str): Name of the column in `df` used as the target; it is cast to int.
    train_feature_list (list): Names of the feature columns used for training.
    valid_currencies (dict): Mapping applied to the values of the 'currency' feature column
                             (currency name -> integer token).

    Returns:
    dict: Keys "X_train", "X_test", "y_train", "y_test", "model", "y_pred_test",
          "y_pred_train" and "best_params".
    """

    model_dict = {}

    # Define features and target variable
    y = df[label_name].astype('int')
    X = df[train_feature_list]

    # Tokenize currencies before training
    X = X.replace({"currency": valid_currencies})

    # Materialize the chronological folds once; they are reused for every candidate
    # and for the final fit
    tss = TimeSeriesSplit(n_splits=2)
    folds = list(tss.split(X))

    # Define the parameter grid for random search
    param_grid = {
        'learning_rate': [0.01, 0.05, 0.1],
        'num_leaves': [31, 50, 100],
        'n_estimators': [100, 200, 500]
    }

    # Generate parameter combinations
    param_list = list(ParameterSampler(param_grid, n_iter=10, random_state=42))

    best_score = -np.inf
    best_params = None

    # Scope the warning suppression to this call instead of changing the global filters
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        for params in param_list:
            fold_scores = []

            for train_index, test_index in folds:
                clf = lgbm.LGBMClassifier(**params)
                clf.fit(X.iloc[train_index, :], y.iloc[train_index])

                fold_predictions = clf.predict(X.iloc[test_index, :])
                fold_scores.append(accuracy_score(y.iloc[test_index], fold_predictions))

            mean_score = np.mean(fold_scores)

            if mean_score > best_score:
                best_score = mean_score
                best_params = params

        # Use the best parameters to train the final model on the last fold
        train_index, test_index = folds[-1]
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        best_model = lgbm.LGBMClassifier(**best_params)
        best_model.fit(X_train, y_train)

        # Predict the labels for the test and train datasets
        y_pred_test = best_model.predict(X_test)
        y_pred_train = best_model.predict(X_train)

    # Store the results in the model dictionary
    model_dict["X_train"] = X_train  # Train dataset features
    model_dict["X_test"] = X_test  # Test dataset features
    model_dict["y_train"] = y_train  # Train dataset true labels
    model_dict["y_test"] = y_test  # Test dataset true labels
    model_dict["model"] = best_model  # Trained model
    model_dict["y_pred_test"] = pd.DataFrame(y_pred_test)  # Test dataset predicted labels
    model_dict["y_pred_train"] = pd.DataFrame(y_pred_train)  # Train dataset predicted labels (to check for overfitting)
    model_dict["best_params"] = best_params  # Best parameters found by the search

    return model_dict