In [1]:
from sklearn.metrics import accuracy_score
import import_ipynb
from config import *
import pandas as pd
import empyrical
import numpy as np
from sklearn.metrics import roc_curve, precision_recall_curve, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import os
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score

importing Jupyter notebook from config.ipynb


In [2]:
def print_overall_accuracy(dictionary):
    """
    Print the model's score on the training split, then on the test split.

    Parameters:
    dictionary (dict): Must provide "model" (an object exposing ``score(X, y)``) together with
                       the "X_train", "y_train", "X_test" and "y_test" entries passed to it.

    Returns:
    None: Both scores are written to stdout, formatted with four decimal places.
    """
    # One entry per split: (output template, features key, labels key).
    report_plan = (
        ('Training set score: {:.4f}', "X_train", "y_train"),
        ('Test set score: {:.4f}', "X_test", "y_test"),
    )

    for template, features_key, labels_key in report_plan:
        split_score = dictionary["model"].score(dictionary[features_key], dictionary[labels_key])
        print(template.format(split_score))

In [3]:
def evaluate_overall_accuracy(dictionary):
    """
    Collect the train/test scores of every labeling method into one table.

    Parameters:
    dictionary (dict): Maps a labeling method name to the per-method dictionary that
                       ``get_overall_accuracy`` consumes (model, train/test data, true labels).

    Returns:
    pd.DataFrame: One row per labeling method with the columns 'Labeling Method',
                  'Training Set Score' and 'Test Set Score'.
    """
    score_rows = []

    for method_name, method_data in dictionary.items():
        # The helper returns the pair (train score, test score) for this method.
        train_score, test_score = get_overall_accuracy(method_data)
        score_rows.append({
            'Labeling Method': method_name,
            'Training Set Score': train_score,
            'Test Set Score': test_score
        })

    return pd.DataFrame(score_rows)

In [4]:
def get_overall_accuracy(dictionary):
    """
    Score the stored model on its training split and on its test split.

    Parameters:
    dictionary (dict): Must provide "model" (an object exposing ``score(X, y)``) together with
                       the "X_train", "y_train", "X_test" and "y_test" entries passed to it.

    Returns:
    tuple: ``(train_score, test_score)`` exactly as returned by ``model.score``.
    """
    fitted_model = dictionary["model"]

    # Training split is scored first, then the test split.
    return (
        fitted_model.score(dictionary["X_train"], dictionary["y_train"]),
        fitted_model.score(dictionary["X_test"], dictionary["y_test"]),
    )

In [5]:
def evaluate_currency_accuracy(dictionary):
    """
    Compute classification metrics per labeling method, currency and data split.

    For every (labeling method, currency) pair two rows are produced, first 'Train' and then
    'Test', each holding accuracy, weighted precision/recall/F1 (``zero_division=0``) and
    balanced accuracy.

    Parameters:
    dictionary (dict): Maps labeling method name -> currency -> split ("train" / "test") -> an
                       object indexable by "true label" and "predicted label".

    Returns:
    pd.DataFrame: Columns 'Labeling Method', 'Currency', 'Data Type', 'Accuracy', 'Precision',
                  'Recall', 'F1 Score' and 'Balanced Accuracy'. Method names found in the
                  display-name mapping are replaced by their readable form; others are kept as is.
    """

    # Internal identifiers -> names used for display
    display_names = {
        'excess_over_mean': 'Excess over Mean',
        'excess_over_median': 'Excess over Median',
        'fixed_time_horizon': 'Fixed Time Horizon',
        'triple_barrier': 'Triple Barrier',
        'tail_sets': 'Tail Sets',
        'matrix_flag': 'Matrix Flag',
        'trend_scanning': 'Trend Scanning',
        'buy_and_hold': 'Buy and Hold',
        "next_period": 'Next Period Labeling'
    }

    def _split_metrics(y_true, y_pred):
        # Metric columns of one result row, in the order they appear in the table.
        return {
            'Accuracy': accuracy_score(y_true, y_pred),
            'Precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
            'Recall': recall_score(y_true, y_pred, average='weighted', zero_division=0),
            'F1 Score': f1_score(y_true, y_pred, average='weighted', zero_division=0),
            'Balanced Accuracy': balanced_accuracy_score(y_true, y_pred)
        }

    metric_rows = []

    for method_key, per_currency in dictionary.items():
        shown_method = display_names.get(method_key, method_key)

        for currency, splits in per_currency.items():
            # Fetch all four label collections before computing anything.
            train_true = splits["train"]["true label"]
            train_pred = splits["train"]["predicted label"]
            test_true = splits["test"]["true label"]
            test_pred = splits["test"]["predicted label"]

            train_metrics = _split_metrics(train_true, train_pred)
            test_metrics = _split_metrics(test_true, test_pred)

            # Rows are only recorded once both splits were evaluated successfully.
            for split_label, split_metrics in (('Train', train_metrics), ('Test', test_metrics)):
                metric_rows.append({
                    'Labeling Method': shown_method,
                    'Currency': currency,
                    'Data Type': split_label,
                    **split_metrics
                })

    return pd.DataFrame(metric_rows)

In [6]:
def split_training_results_in_currencies(dictionary):
    """
    Regroup each labeling method's train/test results into one pair of frames per currency.

    Parameters:
    dictionary (dict): Maps labeling method name to a dictionary holding "X_train" and "X_test"
                       (frames with a "currency" column) plus "y_train", "y_pred_train",
                       "y_test" and "y_pred_test" (objects exposing ``.values``).

    Returns:
    dict: ``result[labeling_method][currency]["train" | "test"]`` is the subset of the
          corresponding feature frame for that currency, extended with the columns
          "true label" and "predicted label".

    Notes:
    Relies on the module-level mapping ``valid_currencies``, which is not defined in this cell
    (presumably supplied by ``from config import *`` - confirm). Its keys are used as the
    currency names in the result; its values are what gets replaced in the "currency" column.
    """

    # Inverted mapping: turns the stored currency codes back into the keys of valid_currencies.
    code_to_name = {code: name for name, code in valid_currencies.items()}

    def _labelled_frame(features, true_labels, predicted_labels):
        # Work on a copy so the caller's feature frame is left untouched.
        frame = features.copy()
        frame["true label"] = true_labels.values
        frame["predicted label"] = predicted_labels.values
        return frame.replace({"currency": code_to_name})

    per_method = {}

    for method_name, method_data in dictionary.items():
        train_frame = _labelled_frame(
            method_data["X_train"], method_data["y_train"], method_data["y_pred_train"]
        )
        test_frame = _labelled_frame(
            method_data["X_test"], method_data["y_test"], method_data["y_pred_test"]
        )

        # Every key of valid_currencies gets an entry, whether or not any row matches it.
        per_method[method_name] = {
            currency: {
                "train": train_frame.loc[train_frame['currency'] == currency],
                "test": test_frame.loc[test_frame['currency'] == currency],
            }
            for currency in valid_currencies
        }

    return per_method

In [7]:
def plot_confusion_matrix(y_true, y_pred, classes, labeling_method, save_path="visualizations/ml_metrics"):
    """
    Plot the confusion matrix of the given labels as a heatmap and save it as a PNG file.

    Parameters:
    y_true (array-like): Array of true labels.
    y_pred (array-like): Array of predicted labels.
    classes (list): Class labels; used both as the ``labels`` argument of the confusion matrix
                    and as the tick labels of the heatmap axes.
    labeling_method (str): Name of the labeling method to include in the saved plot's filename.
    save_path (str, optional): Directory where the plot will be saved; created if missing.
                               Default is "visualizations/ml_metrics".

    Returns:
    None: The plot is written to ``<save_path>/confusion_matrix_<labeling_method>.png``.
    """

    # exist_ok=True replaces the separate existence check, so a directory created by someone
    # else between "check" and "create" no longer makes this call fail.
    os.makedirs(save_path, exist_ok=True)

    # Calculate the confusion matrix
    cm = confusion_matrix(y_true, y_pred, labels=classes)

    # Keep a handle on the figure so exactly this figure can be saved and closed.
    fig = plt.figure(figsize=(10, 10))
    try:
        # NOTE: sns.set changes seaborn/matplotlib settings globally, not just for this figure.
        sns.set(font_scale=1.2)
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=classes, yticklabels=classes)

        # Add titles and labels
        plt.title('Confusion Matrix - Actuals and Predicted')
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.xticks(rotation=45)

        # Save the plot
        save_name = os.path.join(save_path, f"confusion_matrix_{labeling_method}.png")
        fig.savefig(save_name)
    finally:
        # Always release the figure, even when plotting or saving raised.
        plt.close(fig)

In [8]:
def evaluate_and_visualize_model(model_dict, labeling_method, save_path="visualizations/ml_metrics"):
    """
    Save a confusion-matrix plot for the test split and print its classification report.

    Parameters:
    model_dict (dict): Must provide "y_test" and "y_pred_test", both exposing ``to_numpy()``.
    labeling_method (str): The name of the labeling method to include in the saved plot's filename.
    save_path (str, optional): Directory where the confusion matrix plot will be saved.
                               Default is "visualizations/ml_metrics".

    Returns:
    None: The plot is saved through ``plot_confusion_matrix`` and the report is printed.
    """

    true_labels = model_dict["y_test"]
    predicted_labels = model_dict["y_pred_test"]

    true_array = true_labels.to_numpy()
    predicted_array = predicted_labels.to_numpy()

    # Flatten anything that is not already one-dimensional.
    true_array = true_array.ravel() if true_array.ndim > 1 else true_array
    predicted_array = predicted_array.ravel() if predicted_array.ndim > 1 else predicted_array

    # Sorted set of every label occurring in either array.
    classes = np.union1d(true_array, predicted_array)

    plot_confusion_matrix(true_array, predicted_array, classes, labeling_method, save_path)

    print("Classification Report - Test Data:")
    print(classification_report(true_array, predicted_array))

In [9]:
def evaluate_and_visualize_all_approaches(dictionary):
    """
    Build one table with the per-class classification report of every labeling method.

    Each entry of ``dictionary`` is passed to ``evaluate_model_performance`` together with the
    readable name of its labeling method; the resulting frames are stacked in iteration order.

    Parameters:
    dictionary (dict): A dictionary where keys are labeling method names and values are dictionaries containing
                       the test dataset and predicted labels.

    Returns:
    pd.DataFrame: DataFrame containing the classification report metrics for all labeling methods,
                  with a fresh 0..n-1 index. An empty DataFrame is returned for an empty input.
    """

    # Mapping of internal names to real names
    labeling_method_names = {
        'excess_over_mean': 'Excess over Mean',
        'excess_over_median': 'Excess over Median',
        'fixed_time_horizon': 'Fixed Time Horizon',
        'triple_barrier': 'Triple Barrier',
        'tail_sets': 'Tail Sets',
        'matrix_flag': 'Matrix Flag',
        'trend_scanning': 'Trend Scanning',
        'buy_and_hold': 'Buy and Hold',
        "next_period": 'Next Period Labeling'
    }

    # Collect every report first and concatenate once; concatenating inside the loop would
    # re-copy all previously accumulated rows on each iteration.
    reports = [
        evaluate_model_performance(model_dict, labeling_method_names.get(labeling_method, labeling_method))
        for labeling_method, model_dict in dictionary.items()
    ]

    # pd.concat raises on an empty list, so the empty-input result is produced explicitly.
    if not reports:
        return pd.DataFrame()

    return pd.concat(reports, ignore_index=True)

In [10]:
def evaluate_labeling_techniques(dictionary):
    """
    Re-predict with each labeling method's model and summarise its accuracy figures.

    Parameters:
    dictionary (dict): Maps labeling method name to a dictionary providing 'model' (exposing
                       ``predict``), 'X_train', 'y_train', 'X_test' and 'y_test'.

    Returns:
    pd.DataFrame: One row per labeling method with the columns 'Labeling Method',
                  'Train Accuracy', 'Test Accuracy' and 'Balanced Accuracy' (the latter is
                  computed on the test split). Method names found in the display-name mapping
                  are replaced by their readable form; others are kept as is.
    """

    # Internal identifiers -> names used for display
    display_names = {
        'excess_over_mean': 'Excess over Mean',
        'excess_over_median': 'Excess over Median',
        'fixed_time_horizon': 'Fixed Time Horizon',
        'triple_barrier': 'Triple Barrier',
        'tail_sets': 'Tail Sets',
        'matrix_flag': 'Matrix Flag',
        'trend_scanning': 'Trend Scanning',
        'buy_and_hold': 'Buy and Hold',
        "next_period": 'Next Period Labeling'
    }

    summary_rows = []

    for method_key in dictionary:
        entry = dictionary[method_key]
        estimator, train_features, train_labels, test_features, test_labels = (
            entry[field] for field in ('model', 'X_train', 'y_train', 'X_test', 'y_test')
        )

        # Predictions are generated for both splits: training first, then test.
        train_predictions = estimator.predict(train_features)
        test_predictions = estimator.predict(test_features)

        train_acc = accuracy_score(train_labels, train_predictions)
        test_acc = accuracy_score(test_labels, test_predictions)
        test_balanced_acc = balanced_accuracy_score(test_labels, test_predictions)

        summary_rows.append({
            'Labeling Method': display_names.get(method_key, method_key),
            'Train Accuracy': train_acc,
            'Test Accuracy': test_acc,
            'Balanced Accuracy': test_balanced_acc
        })

    return pd.DataFrame(summary_rows)

In [11]:
def evaluate_model_performance(model_dict, labeling_method):
    """
    Produce the test-split classification report of one model as a DataFrame.

    Parameters:
    model_dict (dict): Must provide "y_test" and "y_pred_test", both exposing ``to_numpy()``.
    labeling_method (str): Name stored alongside every row of the resulting report.

    Returns:
    pd.DataFrame: The frame built by ``classification_report_to_df`` from the text report.
    """
    true_labels = model_dict["y_test"]
    predicted_labels = model_dict["y_pred_test"]

    # Flatten both label collections to one-dimensional arrays before scoring.
    flat_true = true_labels.to_numpy().ravel()
    flat_predicted = predicted_labels.to_numpy().ravel()

    # The report is generated as text and parsed back into tabular form.
    text_report = classification_report(flat_true, flat_predicted)
    return classification_report_to_df(text_report, labeling_method)

In [12]:
def classification_report_to_df(report, method):
    """
    Convert a text classification report into a DataFrame of per-class metrics.

    Every line is split on whitespace. A line counts as a class row when it has at least
    five fields and its last four fields parse as precision, recall, f1-score (floats) and
    support (int). Everything in front of those four fields is the class label, so labels
    that contain spaces are kept instead of being dropped. Blank lines, the column header,
    the accuracy row and the averaged summary rows are skipped, as are lines whose trailing
    fields are not numeric.

    Parameters:
    report (str): The classification report as a string.
    method (str): The labeling method name, copied into the 'Method' column of every row.

    Returns:
    pd.DataFrame: One row per class with the columns 'Method', 'Class', 'Precision', 'Recall',
                  'F1-Score' and 'Support'. An empty DataFrame when no class row is found.
    """
    # Labels of the aggregate rows that must not be reported as classes.
    summary_labels = {'accuracy', 'macro avg', 'weighted avg', 'micro avg', 'samples avg'}

    report_data = []

    for line in report.split('\n'):
        tokens = line.split()

        # A class row needs a label plus four metric fields.
        if len(tokens) < 5:
            continue

        # Read the metrics from the right so that the label may span several tokens.
        label = ' '.join(tokens[:-4])
        if label in summary_labels:
            continue

        try:
            precision, recall, f1_value = (float(token) for token in tokens[-4:-1])
            support = int(tokens[-1])
        except ValueError:
            # Trailing fields are not numeric: not a metrics row.
            continue

        report_data.append({
            'Method': method,
            'Class': label,
            'Precision': precision,
            'Recall': recall,
            'F1-Score': f1_value,
            'Support': support
        })

    return pd.DataFrame(report_data)