# Fine Tuning Experiments

## Dependencies Set-up

In [None]:
# --- Import Required Libraries ---
import os
import sys
import pandas as pd

# --- Make the project root importable ---
# The notebook is assumed to be launched from a sub-directory of the project,
# so the root is taken to be the parent of the current working directory.
_parent_of_cwd = os.path.join(os.getcwd(), '..')
ROOT_PATH = os.path.abspath(_parent_of_cwd)

# Put the root first on the import path, but only once, so re-running this
# cell does not keep adding duplicates.
if ROOT_PATH not in sys.path:
    sys.path.insert(0, ROOT_PATH)

# --- Import Custom Libraries ---
from pop.util.print_results import (
    print_statistics,
    plot_fitness_diversity,
    compare_best_fitness_diversity,
    plot_time_per_run,
    plot_fitness_distribution,
    plot_fitness_vs_time
)
from pop.util.fine_tuning import (
    select_configs,
    get_results_path,
    generate_base_filename,
    fine_tune_algorithms_parallel, 
    run_selected_configs_parallel
)

## Parameter Definition

In [None]:
# --- Repetitions ---
# How many times each configuration is repeated so that the results are
# statistically meaningful.
num_runs = 5

# --- Reproducibility ---
seed = 0

# --- Data selection (set explicitly for every experiment) ---
dataset = "../dataset"
start_date = "2015-01-01"
end_date = "2020-01-01"
num_companies = 200
correlation_level = 'high'  # one of 'low', 'medium', 'high', or None

# --- Financial assumptions ---
risk_free_rate = 0.042

## Hyperparameter Configuration

In [None]:
# Search grid for the Genetic Algorithm (GA).
# Each key maps to the list of candidate values to try for that hyperparameter.
ga_param_grid = {
    "pop_size": [200],                    # single value: not varied in this sweep
    "mutation_rate": [0.01, 0.05, 0.1],
    "gaussian_stdev": [0.01, 0.05, 0.1],  # presumably the mutation noise std-dev; TODO confirm
    "num_elites": [1, 2, 5],
    "max_generations": [50, 100],
}

# Search grid for Particle Swarm Optimization (PSO), same layout as above.
pso_param_grid = {
    "swarm_size": [200],                  # single value: not varied in this sweep
    "max_iterations": [50, 100],
    "w": [0.4, 0.7, 0.9],                 # presumably the inertia weight; TODO confirm
    "c1": [1.0, 1.5, 2.0],                # presumably the cognitive coefficient; TODO confirm
    "c2": [1.0, 1.5, 2.0],                # presumably the social coefficient; TODO confirm
}

# Names of the available repair methods.
REPAIR_METHODS = [
    "normalize",
    "shrink",
]

## Fine-Tuning Run

- Experimentation with different hyperparameters

In [None]:
# Parallel fine-tuning for all configs in the GA and PSO grids.
# The run count is taken from `num_runs` (Parameter Definition cell) instead of
# a literal, so changing it there actually affects this sweep.
# NOTE: the output is only consumed by the next cell when `fetch_from_files`
# is False; otherwise results are read back from CSV files.
fine_tuning_results = fine_tune_algorithms_parallel(
    num_runs=num_runs,
    dataset=dataset,
    num_companies=num_companies,
    risk_free_rate=risk_free_rate,
    start_date=start_date,
    end_date=end_date,
    correlation_level=correlation_level,
    seed=seed,
    ga_param_grid=ga_param_grid,
    pso_param_grid=pso_param_grid,
    REPAIR_METHODS=REPAIR_METHODS,
)

In [None]:
# Source of the fine-tuning results:
#   True  -> read the per-algorithm CSV files saved by an earlier run
#   False -> use the in-memory `fine_tuning_results` from the fine-tuning cell
fetch_from_files = True

if not fetch_from_files:
    # In-memory results: pick the best, median, and worst config per algorithm.
    ga_best, ga_median, ga_worst = select_configs(pd.DataFrame(fine_tuning_results), "GA")
    pso_best, pso_median, pso_worst = select_configs(pd.DataFrame(fine_tuning_results), "PSO")
else:
    # Saved results: GA and PSO live in separate files under the
    # directory for the current correlation level.
    ft_results_path_ga = f"../experiments/results/{correlation_level}_correlation/ga/fine_tuning_results.csv"
    ft_results_path_pso = f"../experiments/results/{correlation_level}_correlation/pso/fine_tuning_results.csv"

    fine_tuning_results_df_ga = pd.read_csv(ft_results_path_ga)
    fine_tuning_results_df_pso = pd.read_csv(ft_results_path_pso)

    ga_best, ga_median, ga_worst = select_configs(fine_tuning_results_df_ga, "GA")
    pso_best, pso_median, pso_worst = select_configs(fine_tuning_results_df_pso, "PSO")

# Gather the six chosen configurations, GA first and then PSO.
selected_configs = [
    ga_best, ga_median, ga_worst,
    pso_best, pso_median, pso_worst,
]

- Re-run the best, median, and worst configurations for GA and PSO

In [None]:
# Experiment settings taken unchanged from the Parameter Definition cell.
final_run_kwargs = {
    "dataset": dataset,
    "num_companies": num_companies,
    "risk_free_rate": risk_free_rate,
    "start_date": start_date,
    "end_date": end_date,
    "correlation_level": correlation_level,
    "seed": seed,
}

# Re-run the selected configurations in parallel, 31 runs each.
final_results_df = run_selected_configs_parallel(
    selected_configs=selected_configs,
    num_runs=31,
    **final_run_kwargs,
)

In [None]:
# Save the results of the final re-runs to a CSV file.
# get_results_path receives the file name and the correlation level and returns
# the destination path. NOTE(review): the directory layout is decided by that
# helper; confirm where the file ends up.
result_path = get_results_path("final_fine_tuning_results.csv", correlation_level=correlation_level)
final_results_df.to_csv(result_path, index=False)  # index=False: do not write the row index
print(f"✅ Final fine-tuning results saved to '{result_path}'")

In [None]:
# Reload the saved final results from disk, so this cell also works without
# re-running the experiments, then print statistics for each metric.
final_results_path = get_results_path("final_fine_tuning_results.csv", correlation_level=correlation_level)
final_results_df = pd.read_csv(final_results_path)

for metric in ("sharpe_ratio", "annual_return", "runtime"):
    print_statistics(final_results_df, metric)

## Visualization

### Fitness and Diversity Evolution (GA and PSO)

- Plot the evolution of fitness and diversity over generations for each algorithm
- Separate plots for GA and PSO
- Display best, median, and worst configurations in the same plot for comparison

In [None]:
# Plot fitness and diversity for every (algorithm, configuration quality)
# pair: GA first, then PSO, each in the order best, median, worst.
# The path helper is passed as a callable; it is not called here.
for algo, quality in [
    (a, q)
    for a in ("ga", "pso")
    for q in ("best", "median", "worst")
]:
    plot_fitness_diversity(get_results_path, algo, quality, correlation_level)

### Comparison of Best Fitness and Diversity between GA and PSO

- Compare the best fitness and diversity evolution between GA and PSO
- Helps in understanding which algorithm converges faster and maintains diversity

In [None]:
# Run the GA vs. PSO comparison. The path helper is passed as a callable
# together with the correlation level.
# NOTE(review): presumably the function locates and loads the saved results
# itself through that helper; confirm.
compare_best_fitness_diversity(get_results_path, correlation_level)

### Final Fitness Distribution

- A box plot or violin plot showing the distribution of final fitness values for each algorithm
- Visualize the variability in performance

In [None]:
# Plot the distribution of final fitness values from the final re-run results
# (per the section notes: a box plot or violin plot).
plot_fitness_distribution(final_results_df)

### Time taken per run

- A bar plot or box plot to show the time taken for each configuration
- Compare time taken between GA and PSO

In [None]:
# Plot the time taken per run from the final re-run results, to compare
# GA and PSO (per the section notes: a bar plot or box plot).
plot_time_per_run(final_results_df)

### Fitness vs. Time Tradeoff

- A scatter plot comparing the mean fitness vs. mean time taken
- Each point represents a different configuration

In [None]:
# Plot the fitness vs. time trade-off from the final re-run results
# (per the section notes: a scatter plot with one point per configuration).
plot_fitness_vs_time(final_results_df)