In [9]:
import optuna
import pandas as pd
import yfinance as yf
from typing import List, Dict, Tuple
import numpy as np
from statsmodels.tsa.vector_ar.vecm import coint_johansen
from statsmodels.tsa.stattools import adfuller
import os
from statsmodels.tsa.api import AutoReg
from scipy.stats import pearsonr
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import statsmodels.api as sm
import matplotlib.pyplot as plt
import psycopg2
import time
from datetime import timedelta,datetime
from helper import *
import pandas as pd
import yfinance as yf
import numpy as np
import random
import os

import psycopg2
import time
from datetime import timedelta,datetime
from forming_combination.data_handler import DataHandler
from forming_combination.combination_formation import Combination_Formations
from signal_generation.signal_generator import process_results_df
from backtesting.port_mana import PortfolioManager
from utils.helper import plot_asset_balance, generate_periods_df,run_backtest_for_periods
from utils.calculate_metrics import calculate_shapre_and_mdd
from data.get_data import *
from optuna import *
from optuna.pruners import MedianPruner
# Set up logging to see Optuna's internal messages
optuna.logging.set_verbosity(optuna.logging.INFO)
pd.set_option('future.no_silent_downcasting', True)
import warnings

# Suppress the specific Optuna warning
warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    module="optuna.distributions"
)
# Set random seeds for reproducibility
SEED = 42  # You can choose any integer
random.seed(SEED)
np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)  # For hash-based operations

In [2]:
vn30_stocks = pd.read_csv('optimization_data/vn30_stocks.csv', index_col=0, parse_dates=True)
# Load data once at the start
etf_list = ['FUEVFVND', 'FUESSVFL', 'E1VFVN30', 'FUEVN100']
start_date = '2021-06-01'
end_date = '2025-01-01'
periods_df = generate_periods_df(
        vn30_stocks,
        start_date,
        end_date,
        window=80,
    )

In [3]:
periods_df

Unnamed: 0,stocks_list,start_date,end_date
0,"[ACB, BID, BVH, CTG, FPT, GAS, GVR, HDB, HPG, ...",2021-06-01,2022-07-31
1,"[ACB, BID, BVH, CTG, FPT, GAS, GVR, HDB, HPG, ...",2022-05-13,2023-02-05
2,"[ACB, BCM, BID, BVH, CTG, FPT, GAS, GVR, HDB, ...",2022-11-18,2023-08-06
3,"[ACB, BCM, BID, BVH, CTG, FPT, GAS, GVR, HDB, ...",2023-05-19,2025-01-01


In [4]:
def objective(trial):
    # Suggest parameters to tune
    estimation_window = trial.suggest_int("estimation_window", 40, 80, step=10)
    min_trading_days_fraction = trial.suggest_float(
        "min_trading_days", 1 / 3, 2 / 3, step=1 / 6
    )
    min_trading_days = int(min_trading_days_fraction * estimation_window)  # Convert fraction to days
    max_clusters = trial.suggest_int("max_clusters", 6, 12, step=3)
    top_stocks = trial.suggest_int("top_stocks", 6, 12, step=3)
    correlation_threshold = trial.suggest_float(
        "correlation_threshold", 0.4, 0.8, step=0.2
    )
    residual_threshold = trial.suggest_float(
        "residual_threshold", 0.15, 0.45, step=0.15
    )
    ou_window = estimation_window  # Typo fixed: ou_winodw -> ou_window
    tier = trial.suggest_int("tier", 1, 7, step=1)
    first_allocation = trial.suggest_float("first_allocation", 0.3, 0.6, step=0.1)
    adding_allocation = trial.suggest_float("adding_allocation", 0.1, 0.3, step=0.1)
    # Run backtest with suggested parameters
    combined_returns_df, _, _ = run_backtest_for_periods(
        periods_df=periods_df,
        futures="VN30F1M",
        etf_list=etf_list,
        etf_included=False,
        estimation_window=estimation_window,
        min_trading_days=min_trading_days_fraction * estimation_window,  # Convert fraction to days
        max_clusters=max_clusters,
        top_stocks=top_stocks,
        correlation_threshold=correlation_threshold,
        residual_threshold=residual_threshold,
        tier=tier,
        first_allocation=first_allocation,
        adding_allocation=adding_allocation,
        use_existing_data=True
    )

    # Split into train and test sets
    train_set = combined_returns_df[combined_returns_df.index < "2024-01-01"]
    test_set = combined_returns_df[combined_returns_df.index >= "2024-01-01"]

    # Calculate metrics for train set (optimization targets)
    sharpe_train,max_drawdown_train = calculate_shapre_and_mdd(train_set, risk_free_rate=0.05)

    # Calculate metrics for test set (for reporting, not optimization)
    sharpe_test, max_drawdown_test = calculate_shapre_and_mdd(test_set, risk_free_rate=0.05)

    # Store test set metrics for later analysis
    trial.set_user_attr("sharpe_test", sharpe_test)
    trial.set_user_attr("max_drawdown_test", max_drawdown_test)

    # Return tuple for multi-objective optimization (maximize Sharpe, minimize drawdown)
    return sharpe_train, max_drawdown_train



In [None]:
# Create an Optuna study with multi-objective optimization
study = optuna.create_study(
    directions=["maximize", "minimize"],  # Maximize Sharpe, minimize drawdown
    sampler=optuna.samplers.TPESampler(seed=SEED),
    study_name="vn30_arbitrage_tuning",
    load_if_exists=True,
    pruner=MedianPruner()
)

# Optimize with a moderate number of trials
study.optimize(objective, n_trials=4)



[I 2025-03-31 12:23:27,676] A new study created in memory with name: vn30_arbitrage_tuning
[I 2025-03-31 12:37:26,098] Trial 0 finished with values: [-2.7083470730186483, 0.20484396764566193] and parameters: {'estimation_window': 40, 'min_trading_days': 0.3333333333333333, 'max_clusters': 6, 'top_stocks': 6, 'correlation_threshold': 0.4, 'residual_threshold': 0.15, 'tier': 4, 'first_allocation': 0.4, 'adding_allocation': 0.3}.
[I 2025-03-31 12:47:37,497] Trial 1 finished with values: [-2.4600370783057066, 0.2640421360153262] and parameters: {'estimation_window': 60, 'min_trading_days': 0.3333333333333333, 'max_clusters': 12, 'top_stocks': 6, 'correlation_threshold': 0.6000000000000001, 'residual_threshold': 0.44999999999999996, 'tier': 3, 'first_allocation': 0.4, 'adding_allocation': 0.2}.
[I 2025-03-31 12:54:06,247] Trial 2 finished with values: [-1.4529276131732094, 0.37177258809714353] and parameters: {'estimation_window': 80, 'min_trading_days': 0.3333333333333333, 'max_clusters': 

In [6]:
# Print the best results
best_trial = study.best_trials[0]  # Get the best trial (Pareto optimal)
best_params = best_trial.params
best_sharpe_train = best_trial.values[0]
best_drawdown_train = best_trial.values[1]
best_sharpe_test = best_trial.user_attrs["sharpe_test"]
best_drawdown_test = best_trial.user_attrs["max_drawdown_test"]

print(f"Best Train Sharpe Ratio: {best_sharpe_train}")
print(f"Best Train Max Drawdown: {best_drawdown_train}")
print(f"Test Sharpe Ratio: {best_sharpe_test}")
print(f"Test Max Drawdown: {best_drawdown_test}")
print(f"Best Parameters: {best_params}")
# Save all trials to CSV with custom column names matching variable names
trials_df = study.trials_dataframe()

# Rename columns to match your variable names
column_mapping = {
    'values_0': 'best_sharpe_train',
    'values_1': 'best_drawdown_train',
    'user_attrs_sharpe_test': 'best_sharpe_test',
    'user_attrs_max_drawdown_test': 'best_drawdown_test',
}

# Rename the columns in the DataFrame
trials_df.rename(columns=column_mapping, inplace=True)

# Keep only relevant columns (optional, remove if you want all columns)
relevant_columns = ['number', 'best_sharpe_train', 'best_drawdown_train', 
                   'best_sharpe_test', 'best_drawdown_test'] + [f'params_{p}' for p in best_params.keys()]
trials_df = trials_df[relevant_columns]

# Save to CSV
trials_df.to_csv("optuna_trials.csv", index=False)

Best Train Sharpe Ratio: -2.4600370783057066
Best Train Max Drawdown: 0.2640421360153262
Test Sharpe Ratio: -1.4327260486139113
Test Max Drawdown: 0.07125794130377028
Best Parameters: {'estimation_window': 60, 'min_trading_days': 0.3333333333333333, 'max_clusters': 12, 'top_stocks': 6, 'correlation_threshold': 0.6000000000000001, 'residual_threshold': 0.44999999999999996, 'tier': 3, 'first_allocation': 0.4, 'adding_allocation': 0.2}


In [7]:
# Visualize parameter importances (for Sharpe only, as it's multi-objective)
optuna.visualization.plot_param_importances(study, target=lambda t: t.values[0]).show()

