In [1]:
import pandas as pd
import statistics
import import_ipynb
from config import *
from data_preparation import *
from data_exploration import *
from labeling import *
from model_training import *
from label_exploration import *
from evaluate_ml_metrics import *
from evaluate_financial_metrics import *
from make_visualizations import *
import pickle
import numpy as np
from feature_engineering import *
from utilities import *

importing Jupyter notebook from config.ipynb
importing Jupyter notebook from data_preparation.ipynb
importing Jupyter notebook from data_exploration.ipynb
importing Jupyter notebook from labeling.ipynb
importing Jupyter notebook from model_training.ipynb
importing Jupyter notebook from label_exploration.ipynb
importing Jupyter notebook from evaluate_ml_metrics.ipynb
importing Jupyter notebook from evaluate_financial_metrics.ipynb
importing Jupyter notebook from make_visualizations.ipynb
importing Jupyter notebook from feature_engineering.ipynb
importing Jupyter notebook from utilities.ipynb


In [2]:
def _apply_to_both(step, daily_dict, weekly_dict, *args):
    """
    Run one pipeline step on the daily dictionary first and on the weekly dictionary second.

    Any extra positional arguments are forwarded unchanged to both calls.
    Returns the pair (daily result, weekly result).
    """
    daily_result = step(daily_dict, *args)
    weekly_result = step(weekly_dict, *args)
    return daily_result, weekly_result


def _load_market_dictionaries():
    """Load the pickled daily and weekly currency dictionaries (daily first, then weekly)."""
    daily_dict = load_dictionary("dump_dictionaries/daily_dict.pkl")
    weekly_dict = load_dictionary("dump_dictionaries/weekly_dict.pkl")
    return daily_dict, weekly_dict


def _save_market_dictionaries(daily_dict, weekly_dict):
    """Pickle the daily and weekly currency dictionaries to their shared dump paths."""
    save_dictionary("dump_dictionaries/daily_dict.pkl", daily_dict)
    save_dictionary("dump_dictionaries/weekly_dict.pkl", weekly_dict)


def _load_training_results():
    """Load the four pickles written by the training stage, in the order they were saved."""
    trained_currencies_daily = load_dictionary("dump_dictionaries/trained_currencies_daily.pkl")
    trained_currencies_weekly = load_dictionary("dump_dictionaries/trained_currencies_weekly.pkl")
    currencies_results_daily = load_dictionary("dump_dictionaries/currencies_results_daily.pkl")
    currencies_results_weekly = load_dictionary("dump_dictionaries/currencies_results_weekly.pkl")
    return (
        trained_currencies_daily,
        trained_currencies_weekly,
        currencies_results_daily,
        currencies_results_weekly,
    )


def _run_data_preparation(visualize_price_development):
    """
    Read, clean and align the raw daily/weekly data, then pickle both dictionaries.

    Returns the pair (daily_dict, weekly_dict).
    """
    # Dictionaries keyed by coin name; each value is the dataframe of that coin
    daily_dict = read_dir_of_csv({}, directory_daily)
    weekly_dict = read_dir_of_csv({}, directory_weekly)

    # Drop rows where no data for prices available (crypto currency is too young)
    daily_dict, weekly_dict = _apply_to_both(clean_dfs_prices, daily_dict, weekly_dict)

    # Convert timestamps to datetime
    daily_dict, weekly_dict = _apply_to_both(convert_timestamps, daily_dict, weekly_dict)

    # Assign time horizon for each data point according to the window that was defined in the config
    daily_dict, weekly_dict = _apply_to_both(assign_horizons, daily_dict, weekly_dict, delta)

    # Align data tails according to the configured time window
    daily_dict, weekly_dict = _apply_to_both(allign_all_tails, daily_dict, weekly_dict, window_in_days)

    print(calculate_outlier_profits(daily_dict, weekly_dict))

    if visualize_price_development:
        visualize_price_development_for_timeframe(daily_dict["Ethereum"], 30, "visualizations/price_development_ethereum.png")

    _save_market_dictionaries(daily_dict, weekly_dict)
    return daily_dict, weekly_dict


def _run_labeling(daily_dict, weekly_dict):
    """
    Apply every labeling technique to both dictionaries, then pickle them.

    Returns the pair (daily_dict, weekly_dict) after labeling.
    """
    # Label the currencies according to the triple barrier method
    daily_dict, weekly_dict = _apply_to_both(apply_tbm, daily_dict, weekly_dict, volatility_delta)

    # Label the currencies according to the fixed horizon method
    daily_dict, weekly_dict = _apply_to_both(apply_fixed_time_horizon, daily_dict, weekly_dict, threshold)

    # Assign relative returns within the given horizon
    daily_dict, weekly_dict = _apply_to_both(assign_relative_returns, daily_dict, weekly_dict)

    # Calculate the market means and medians on a daily and weekly basis
    daily_mean_returns, daily_median_returns = calculate_mean_median_market_return(daily_dict)
    weekly_mean_returns, weekly_median_returns = calculate_mean_median_market_return(weekly_dict)

    # Apply excess over mean and median labeling
    daily_dict = assign_excess_over_mean_median_label(daily_dict, daily_mean_returns, daily_median_returns)
    weekly_dict = assign_excess_over_mean_median_label(weekly_dict, weekly_mean_returns, weekly_median_returns)

    # Tail sets, trend scanning, matrix flags and next-period labeling all take only the dictionary
    remaining_steps = (
        assign_tail_sets,
        assign_trend_scanning,
        assign_matrix_flags,
        apply_next_period_return_labeling,
    )
    for step in remaining_steps:
        daily_dict, weekly_dict = _apply_to_both(step, daily_dict, weekly_dict)

    _save_market_dictionaries(daily_dict, weekly_dict)
    return daily_dict, weekly_dict


def _run_label_exploration(daily_dict, weekly_dict):
    """
    Export and plot the label distributions and the label overlaps.

    Returns the pair (label_distribution_daily, label_distribution_weekly).
    """
    # Explore how labels are distributed
    label_distribution_daily = check_label_distribution(daily_dict)
    label_distribution_weekly = check_label_distribution(weekly_dict)
    label_distribution_daily.to_excel("raw_results/label_distribution_daily.xlsx")
    label_distribution_weekly.to_excel("raw_results/label_distribution_weekly.xlsx")
    plot_label_distribution(label_distribution_daily, "visualizations/label_distribution_daily")
    plot_label_distribution(label_distribution_weekly, "visualizations/label_distribution_weekly")

    explore_and_visualize_overlap(daily_dict, label_columns, "visualizations/label_overlap", "daily")
    explore_and_visualize_overlap(weekly_dict, label_columns, "visualizations/label_overlap", "weekly")

    # Merge all currencies into one dataframe so the overlap is also explored for the whole market
    market_daily, market_weekly = _apply_to_both(merge_currencies, daily_dict, weekly_dict)

    daily_overlaps_df = explore_label_overlap(market_daily, label_columns)
    weekly_overlaps_df = explore_label_overlap(market_weekly, label_columns)

    visualize_label_overlap(daily_overlaps_df, "market", "visualizations/label_overlap", "daily")
    visualize_label_overlap(weekly_overlaps_df, "market", "visualizations/label_overlap", "weekly")

    return label_distribution_daily, label_distribution_weekly


def _run_model_training(daily_dict, weekly_dict, feature_engineering):
    """
    Optionally add engineered features, train on the merged market data and pickle the results.

    Returns (trained_currencies_daily, trained_currencies_weekly,
    currencies_results_daily, currencies_results_weekly).
    """
    if feature_engineering:
        # Function-scope import: binds the config list to a local name so it can be extended here.
        from config import train_feature_list
        daily_dict, weekly_dict = _apply_to_both(add_features, daily_dict, weekly_dict)
        # Extend the list in place (as the original `+=` did) but skip names that are already
        # present, so calling main() repeatedly in one kernel does not duplicate features.
        # NOTE(review): assumes train_feature_list is a list shared with the training module - confirm.
        missing_features = [name for name in engineering_features if name not in train_feature_list]
        train_feature_list.extend(missing_features)

    # Merge all currencies into one dataframe so that the model can be trained on the crypto market instead of particular currencies
    crypto_market_daily, crypto_market_weekly = _apply_to_both(merge_currencies, daily_dict, weekly_dict)

    # Train the classifier based on the previously assigned labels
    trained_currencies_daily, trained_currencies_weekly = _apply_to_both(
        apply_training, crypto_market_daily, crypto_market_weekly
    )

    # Separate the training results based on currencies
    currencies_results_daily, currencies_results_weekly = _apply_to_both(
        split_training_results_in_currencies, trained_currencies_daily, trained_currencies_weekly
    )

    save_dictionary("dump_dictionaries/trained_currencies_daily.pkl", trained_currencies_daily)
    save_dictionary("dump_dictionaries/trained_currencies_weekly.pkl", trained_currencies_weekly)
    save_dictionary("dump_dictionaries/currencies_results_daily.pkl", currencies_results_daily)
    save_dictionary("dump_dictionaries/currencies_results_weekly.pkl", currencies_results_weekly)

    return (
        trained_currencies_daily,
        trained_currencies_weekly,
        currencies_results_daily,
        currencies_results_weekly,
    )


def _run_ml_evaluation(trained_currencies_daily, trained_currencies_weekly):
    """
    Compute the ML metrics for both datasets and export them to Excel.

    Returns (key_ml_metrics_daily, key_ml_metrics_weekly, metrics_per_class_daily).
    """
    # Evaluate overall model accuracies (across all currencies)
    train_test_accuracy_daily = evaluate_overall_accuracy(trained_currencies_daily)
    train_test_accuracy_weekly = evaluate_overall_accuracy(trained_currencies_weekly)
    train_test_accuracy_daily.to_excel("raw_results/train_test_accuracy_daily.xlsx", index=False)
    train_test_accuracy_weekly.to_excel("raw_results/train_test_accuracy_weekly.xlsx", index=False)

    key_ml_metrics_daily = evaluate_labeling_techniques(trained_currencies_daily)
    key_ml_metrics_weekly = evaluate_labeling_techniques(trained_currencies_weekly)
    key_ml_metrics_daily.to_excel("raw_results/key_ml_metrics_daily.xlsx", index=False)
    key_ml_metrics_weekly.to_excel("raw_results/key_ml_metrics_weekly.xlsx", index=False)

    metrics_per_class_daily = evaluate_and_visualize_all_approaches(trained_currencies_daily)
    metrics_per_class_weekly = evaluate_and_visualize_all_approaches(trained_currencies_weekly)
    # NOTE(review): the file names below start with a space ("raw_results/ metrics_...").
    # This looks like a typo but is kept unchanged because other code may read these paths.
    metrics_per_class_daily.to_excel("raw_results/ metrics_per_class_daily.xlsx", index=False)
    metrics_per_class_weekly.to_excel("raw_results/ metrics_per_class_weekly.xlsx", index=False)

    return key_ml_metrics_daily, key_ml_metrics_weekly, metrics_per_class_daily


def _run_financial_evaluation(currencies_results_daily, currencies_results_weekly):
    """
    Evaluate the strategy financially, export the tables and create the plots.

    Returns (market_results_daily, market_results_weekly,
    trading_metrics_daily, trading_metrics_weekly, periods).
    """
    periods = evaluate_different_holding_periods(currencies_results_daily)
    periods.to_excel("raw_results/periods.xlsx", index=False)

    currencies_results_daily, evaluation_df_daily, market_results_daily = evaluate_strategy_financially(currencies_results_daily, periods="daily")
    currencies_results_weekly, evaluation_df_weekly, market_results_weekly = evaluate_strategy_financially(currencies_results_weekly, periods="weekly")

    evaluation_df_daily.to_excel("raw_results/currencies_daily.xlsx", index=False)
    evaluation_df_weekly.to_excel("raw_results/currencies_weekly.xlsx", index=False)
    market_results_daily.to_excel("raw_results/market_daily.xlsx", index=False)
    market_results_weekly.to_excel("raw_results/market_weekly.xlsx", index=False)

    visualize_overall_returns(market_results_daily, 'visualizations/market_returns_daily.png')
    visualize_overall_returns(market_results_weekly, 'visualizations/market_returns_weekly.png')
    visualize_overall_returns_vertical(market_results_daily, market_results_weekly, 'visualizations/market_returns_comparison.png')

    visualize_risk_metrics_radar_plot(market_results_daily, 'visualizations/risk_radar_daily.png')
    visualize_risk_metrics_radar_plot(market_results_weekly, 'visualizations/risk_radar_weekly.png')

    # NOTE(review): 0.006 is passed unchanged to all four calls below; its meaning is defined
    # by the callees - TODO move it into the config under a descriptive name.
    plot_distribution_combined(currencies_results_daily, 'visualizations/return_distribution_daily.png', 0.006)
    plot_distribution_combined(currencies_results_weekly, 'visualizations/return_distribution_weekly.png', 0.006)

    trading_metrics_daily = evaluate_trades_overview(currencies_results_daily, 0.006)
    trading_metrics_weekly = evaluate_trades_overview(currencies_results_weekly, 0.006)
    trading_metrics_daily.to_excel("raw_results/trading_metrics_daily.xlsx", index=False)
    trading_metrics_weekly.to_excel("raw_results/trading_metrics_weekly.xlsx", index=False)

    return (
        market_results_daily,
        market_results_weekly,
        trading_metrics_daily,
        trading_metrics_weekly,
        periods,
    )


def _print_latex_report(label_distributions, ml_metrics, financial_metrics):
    """
    Print the LaTeX tables for every result group that was computed in this run.

    Each argument is either the tuple returned by the matching stage helper or None when
    that stage did not run; a missing group is reported with a one-line notice instead of
    failing on an unassigned variable.
    """
    separator = "-" * 100

    # Label distribution
    if label_distributions is None:
        print("Skipping label distribution tables: run with explore_label_distribution=True to compute them.")
    else:
        label_distribution_daily, label_distribution_weekly = label_distributions
        print("Label distribution daily")
        basic_latex_table(label_distribution_daily)
        print(separator)
        print("Label distribution weekly")
        basic_latex_table(label_distribution_weekly)
        print(separator)

    # ML metrics
    if ml_metrics is None:
        print("Skipping ML metric tables: run with evaluate_ml_metrics=True to compute them.")
    else:
        key_ml_metrics_daily, key_ml_metrics_weekly, metrics_per_class_daily = ml_metrics
        print("Key ML metrics")
        print_key_ml_to_latex(key_ml_metrics_daily, key_ml_metrics_weekly)
        print(separator)
        print("Metrics per class daily")
        print_classification_report_to_latex(metrics_per_class_daily)
        print(separator)

    # Financial metrics, trade metrics and holding periods
    if financial_metrics is None:
        print("Skipping financial tables: run with evaluate_financial_metrics=True to compute them.")
    else:
        (
            market_results_daily,
            market_results_weekly,
            trading_metrics_daily,
            trading_metrics_weekly,
            periods,
        ) = financial_metrics
        print("Overall market daily")
        print_financial_overview_to_latex(market_results_daily)
        print(separator)
        print("Overall market weekly")
        print_financial_overview_to_latex(market_results_weekly)
        print(separator)

        print("Trade metrics")
        print_trade_metrics_to_latex(trading_metrics_daily, trading_metrics_weekly)
        print(separator)

        print("Holding periods")
        print_holding_periods_to_latex(periods)
        print(separator)


def main(
    *,
    prepare_data: bool = False,
    visualize_price_development: bool = False,
    label_data: bool = False,
    visualize_labels: bool = False,
    feature_engineering: bool = False,
    train_model: bool = False,
    explore_label_distribution: bool = False,
    evaluate_ml_metrics: bool = False,
    evaluate_financial_metrics: bool = False,
    print_latex: bool = False,
) -> None:
    """
    Main function to execute the script. It controls the workflow of data preparation, labeling,
    visualization, training, and evaluation based on the run time configuration flags.

    All flags are keyword-only and default to False, so a plain ``main()`` call runs no stage.
    Stages run in this order: data preparation, labeling, label exploration, label
    visualization, training, ML evaluation, financial evaluation, LaTeX printing. A stage whose
    predecessor did not run in this call loads that predecessor's pickles from
    ``dump_dictionaries/`` instead.

    Parameters:
        prepare_data: read, clean and align the raw CSV data and pickle the dictionaries.
        visualize_price_development: plot the Ethereum price development; only evaluated
            while prepare_data is enabled.
        label_data: apply all labeling techniques and pickle the dictionaries.
        visualize_labels: plot the trend labels for Ethereum.
        feature_engineering: add engineered features before training; only evaluated while
            train_model is enabled.
        train_model: train on the merged market data and pickle the training results.
        explore_label_distribution: export and plot label distributions and overlaps.
        evaluate_ml_metrics: compute and export the ML metrics.
        evaluate_financial_metrics: compute, export and plot the financial metrics.
        print_latex: print LaTeX tables for the result groups computed in this call.

    Raises:
        ValueError: if prepare_data is enabled without label_data while a stage that reads
            the labeled dictionaries (explore_label_distribution, visualize_labels,
            train_model) is enabled.

    Note:
        Data preparation and labeling write to the same pickle files
        (``dump_dictionaries/daily_dict.pkl`` and ``weekly_dict.pkl``), so running
        prepare_data replaces previously labeled dictionaries on disk.
    """
    # Fail fast: preparation rewrites the shared pickles, so these stages would otherwise
    # reload data that has not passed through the labeling stage.
    label_consumers = [
        flag_name
        for flag_name, enabled in (
            ("explore_label_distribution", explore_label_distribution),
            ("visualize_labels", visualize_labels),
            ("train_model", train_model),
        )
        if enabled
    ]
    if prepare_data and not label_data and label_consumers:
        raise ValueError(
            "prepare_data=True rewrites the dictionary pickles, so "
            + ", ".join(label_consumers)
            + " would run on data that has not passed through the labeling stage. "
            "Enable label_data in the same run."
        )

    daily_dict = weekly_dict = None
    trained_currencies_daily = trained_currencies_weekly = None
    currencies_results_daily = currencies_results_weekly = None

    # Result groups consumed by the LaTeX report; None means "not computed in this run"
    label_distributions = None
    ml_metrics = None
    financial_metrics = None

    ### Data Preparation ###
    if prepare_data:
        daily_dict, weekly_dict = _run_data_preparation(visualize_price_development)

    ### Labeling ###
    if label_data:
        if not prepare_data:
            daily_dict, weekly_dict = _load_market_dictionaries()
        daily_dict, weekly_dict = _run_labeling(daily_dict, weekly_dict)

    ### Label exploration ###
    if explore_label_distribution:
        if not label_data:
            daily_dict, weekly_dict = _load_market_dictionaries()
        label_distributions = _run_label_exploration(daily_dict, weekly_dict)

    ### Label visualization ###
    if visualize_labels:
        if not label_data:
            daily_dict, weekly_dict = _load_market_dictionaries()
        plot_trend_labels(daily_dict["Ethereum"], label_columns)

    ### Training ###
    if train_model:
        if not label_data:
            daily_dict, weekly_dict = _load_market_dictionaries()
        (
            trained_currencies_daily,
            trained_currencies_weekly,
            currencies_results_daily,
            currencies_results_weekly,
        ) = _run_model_training(daily_dict, weekly_dict, feature_engineering)

    ### ML evaluation ###
    if evaluate_ml_metrics:
        if not train_model:
            (
                trained_currencies_daily,
                trained_currencies_weekly,
                currencies_results_daily,
                currencies_results_weekly,
            ) = _load_training_results()
        ml_metrics = _run_ml_evaluation(trained_currencies_daily, trained_currencies_weekly)

    ### Financial evaluation ###
    if evaluate_financial_metrics:
        if not train_model:
            (
                trained_currencies_daily,
                trained_currencies_weekly,
                currencies_results_daily,
                currencies_results_weekly,
            ) = _load_training_results()
        financial_metrics = _run_financial_evaluation(currencies_results_daily, currencies_results_weekly)

    ### LaTeX output ###
    if print_latex:
        _print_latex_report(label_distributions, ml_metrics, financial_metrics)
        

In [3]:
# Run the pipeline only when this cell executes as the main program,
# not when its code is imported from elsewhere.
if __name__ == "__main__":
    main()

\begin{tabular}{llrrr}
\toprule
Dataset & Currency & Maximum return & Ratio outliers & Number \\
\midrule
daily & Chainlink & 10904.224638 & 0.565899 & 1 \\
daily & Curve & 6819.571994 & 1.544180 & 2 \\
daily & Ribbon & 3494.980942 & 3.449933 & 2 \\
daily & Uniswap & 5160.906512 & 1.959536 & 1 \\
daily & Dydx & 3982.986133 & 0.000000 & 0 \\
daily & Aave & 19360.705588 & 39.160540 & 5 \\
daily & Lido & 6744.442980 & 2.791410 & 3 \\
daily & Polygon & 8472.409607 & 4.061551 & 6 \\
daily & Ethereum & 9032.506701 & 0.000000 & 0 \\
daily & Bitcoin & 6697.803771 & 0.000000 & 0 \\
daily & SHIB & 5692.750422 & 25.776327 & 12 \\
daily & Pepe & 1906.388714 & 28.228434 & 6 \\
weekly & Chainlink & 4415.763053 & 14.137024 & 9 \\
weekly & Curve & 2441.246690 & 14.479397 & 5 \\
weekly & Ribbon & 1560.872268 & 19.578688 & 5 \\
weekly & Uniswap & 1775.650117 & 9.620316 & 3 \\
weekly & Dydx & 1542.777563 & 25.153485 & 6 \\
weekly & Aave & 11472.280371 & 65.394976 & 11 \\
weekly & Lido & 2537.034894 & 29.

KeyError: 'excess_over_mean'