In [17]:
import os
import shutil
from pathlib import Path

import pandas as pd

# When True, the generated tables are additionally copied into THESIS_ROOT/tables;
# set to False to only write to the local data/ folder.
EXPORT_TO_THESIS = True

# Location of the thesis checkout. The default is the original author's path;
# set the THESIS_ROOT environment variable to point elsewhere without editing
# the notebook. An empty variable falls back to the default.
THESIS_ROOT = Path(
    os.environ.get('THESIS_ROOT') or '/Users/stefan/Workspace/bauers-ma/thesis/'
).expanduser().resolve()

TABLES_DIR = THESIS_ROOT / 'tables'

# Local output folder for replication package (created relative to the working directory)
OUT_DIR = Path('.') / 'data'
OUT_DIR.mkdir(exist_ok=True)


# Expected input CSV locations (relative to this notebook folder)
FITNESS_CSV = Path('../model_training/fitness/grid_search_results.csv')
TESTOUTCOME_CSV = Path('../model_training/testoutcome/grid_search_results.csv')

# Echo the resolved configuration so a wrong working directory is visible immediately
print(f'Thesis root: {THESIS_ROOT}')
print(f'Output dir: {OUT_DIR.resolve()}')
print(f'Fitness CSV: {FITNESS_CSV} (exists={FITNESS_CSV.exists()})')
print(f'Test outcome CSV: {TESTOUTCOME_CSV} (exists={TESTOUTCOME_CSV.exists()})')

Thesis root: /Users/stefan/Workspace/bauers-ma/thesis
Output dir: /Users/stefan/Workspace/bauers-ma-replication-package/results/data
Fitness CSV: ../model_training/fitness/grid_search_results.csv (exists=True)
Test outcome CSV: ../model_training/testoutcome/grid_search_results.csv (exists=True)


In [18]:
# Load grid search results
# Fail early with a message naming exactly the files that are missing, so a
# wrong working directory is easy to diagnose.
missing_inputs = [path for path in (FITNESS_CSV, TESTOUTCOME_CSV) if not path.exists()]
if missing_inputs:
    missing_listing = ''.join(f"\n  - {path.resolve()}" for path in missing_inputs)
    raise FileNotFoundError(
        'Missing input CSVs. Expected:'
        f"{missing_listing}"
        "\n\nRun this notebook from its own folder, so that '../model_training/' "
        "resolves to the directory that contains the 'fitness/' and 'testoutcome/' folders."
    )

fitness_results = pd.read_csv(FITNESS_CSV)
testoutcome_results = pd.read_csv(TESTOUTCOME_CSV)

# One row per evaluated hyperparameter configuration
print(f"Fitness model configurations: {len(fitness_results)}")
print(f"Test outcome model configurations: {len(testoutcome_results)}")

Fitness model configurations: 160
Test outcome model configurations: 40


In [19]:
# Keep the (up to) 10 configurations with the highest mean_test_score
# (R² for regression), best first
fitness_top10 = fitness_results.nlargest(10, 'mean_test_score').copy()

# Create a clean table with only relevant columns.
# Ranks are derived from the number of rows actually selected, so a grid with
# fewer than 10 configurations still yields a valid (shorter) table instead of
# a length-mismatch error.
fitness_table = pd.DataFrame({
    'Rank': range(1, len(fitness_top10) + 1),
    'Hidden Layers': fitness_top10['param_hidden_layer_sizes'].values,
    'Activation': fitness_top10['param_activation'].values,
    'Alpha': fitness_top10['param_alpha'].values,
    'Learning Rate': fitness_top10['param_learning_rate'].values,
    'Solver': fitness_top10['param_solver'].values,
    'CV R²': fitness_top10['mean_test_score'].round(4).values
})

print("Top 10 Fitness Model Configurations:")
print(fitness_table.to_string(index=False))

Top 10 Fitness Model Configurations:
 Rank Hidden Layers Activation   Alpha Learning Rate Solver  CV R²
    1     (128, 64)       tanh 0.00010      adaptive    sgd 0.6426
    2     (128, 64)       tanh 0.00001      adaptive    sgd 0.6419
    3     (128, 64)       tanh 0.00100      adaptive    sgd 0.6418
    4     (128, 64)       tanh 0.00001      constant   adam 0.6406
    5     (128, 64)       tanh 0.00001      adaptive   adam 0.6406
    6     (128, 64)       tanh 0.00100      constant   adam 0.6392
    7     (128, 64)       tanh 0.00100      adaptive   adam 0.6392
    8     (64, 128)       tanh 0.00010      constant   adam 0.6387
    9     (64, 128)       tanh 0.00010      adaptive   adam 0.6387
   10     (128, 64)       tanh 0.00010      constant   adam 0.6386


In [20]:
# Keep the (up to) 10 configurations with the highest mean_test_score
# (F1 micro for classification), best first
testoutcome_top10 = testoutcome_results.nlargest(10, 'mean_test_score').copy()

# Create a clean table with only relevant columns.
# Ranks are derived from the number of rows actually selected, so a grid with
# fewer than 10 configurations still yields a valid (shorter) table instead of
# a length-mismatch error.
testoutcome_table = pd.DataFrame({
    'Rank': range(1, len(testoutcome_top10) + 1),
    'Hidden Layers': testoutcome_top10['param_hidden_layer_sizes'].values,
    'Activation': testoutcome_top10['param_activation'].values,
    'Alpha': testoutcome_top10['param_alpha'].values,
    'Learning Rate': testoutcome_top10['param_learning_rate'].values,
    'CV F1 (Micro)': testoutcome_top10['mean_test_score'].round(4).values
})

print("Top 10 Test Outcome Model Configurations:")
print(testoutcome_table.to_string(index=False))

Top 10 Test Outcome Model Configurations:
 Rank Hidden Layers Activation  Alpha Learning Rate  CV F1 (Micro)
    1     (64, 128)       tanh 0.0010      constant         0.7817
    2     (64, 128)       tanh 0.0010      adaptive         0.7817
    3     (64, 128)       relu 0.0010      constant         0.7815
    4     (64, 128)       relu 0.0010      adaptive         0.7815
    5     (64, 128)       relu 0.0001      constant         0.7801
    6     (64, 128)       relu 0.0001      adaptive         0.7801
    7     (64, 128)       tanh 0.0001      constant         0.7797
    8     (64, 128)       tanh 0.0001      adaptive         0.7797
    9     (128, 64)       relu 0.0010      constant         0.7784
   10     (128, 64)       relu 0.0010      adaptive         0.7784


In [21]:
def format_alpha_latex(alpha):
    """Format an alpha value as LaTeX scientific notation.

    Any finite, non-zero number is rendered as ``$m \\times 10^{e}$`` with the
    mantissa trimmed of trailing zeros, e.g. ``0.0001`` becomes
    ``$1 \\times 10^{-4}$`` and ``0.0005`` becomes ``$5 \\times 10^{-4}$``.
    The mantissa is rounded to six decimal places, so values that differ from
    a power of ten only by floating-point noise are formatted the same way.

    Args:
        alpha: The value to format, typically a float.

    Returns:
        The LaTeX math string, or ``str(alpha)`` unchanged for strings,
        booleans, zero, NaN, infinities and anything not convertible to float.
    """
    # Strings and booleans are passed through rather than coerced to numbers
    if isinstance(alpha, (str, bool)):
        return str(alpha)
    try:
        value = float(alpha)
    except (TypeError, ValueError):
        return str(alpha)
    # Zero, NaN (value != value) and infinities have no mantissa/exponent form
    if value == 0 or value != value or abs(value) == float("inf"):
        return str(alpha)

    mantissa_text, exponent_text = f"{value:.6e}".split("e")
    mantissa = mantissa_text.rstrip("0").rstrip(".")
    exponent = int(exponent_text)
    return rf"${mantissa} \times 10^{{{exponent}}}$"

def _render_top_configs_table(df, model_name, label, column_spec, header_cells, plain_columns, score_column):
    """Render a ranked configuration table as a LaTeX ``table`` environment.

    Shared implementation behind the two public ``create_latex_table_*``
    functions, which differ only in caption, label and column set.

    Args:
        df: Table to render; must contain an 'Alpha' column plus every column
            named in ``plain_columns`` and ``score_column``. Not modified.
        model_name: Text inserted into the caption before the word "Model".
        label: LaTeX label for the table.
        column_spec: Column specification passed to ``tabular``.
        header_cells: Header texts, each wrapped in ``\\textbf``.
        plain_columns: Columns written as-is (via ``str``), in output order.
        score_column: Final column, written with four decimal places.

    Returns:
        The LaTeX source as a single string without a trailing newline.
    """
    df_latex = df.copy()
    df_latex['Alpha'] = df_latex['Alpha'].apply(format_alpha_latex)

    header_row = " & ".join(rf"\textbf{{{cell}}}" for cell in header_cells)
    lines = [
        r"\begin{table}",
        "\t" + r"\centering",
        # The caption reports the number of rows actually rendered
        "\t" + rf"\caption{{Top {len(df_latex)} Hyperparameter Configurations for {model_name} Model}}",
        "\t" + rf"\label{{{label}}}",
        "\t" + r"\small",
        "\t" + rf"\begin{{tabular}}{{{column_spec}}}",
        "\t\t" + r"\toprule",
        "\t\t" + header_row + r" \\",
        "\t\t" + r"\midrule",
    ]

    # Records keep each column's own type, so integer ranks stay integers
    for record in df_latex.to_dict('records'):
        cells = [str(record[column]) for column in plain_columns]
        cells.append(f"{record[score_column]:.4f}")
        lines.append("\t\t" + " & ".join(cells) + r" \\")

    lines += [
        "\t\t" + r"\bottomrule",
        "\t" + r"\end{tabular}",
        r"\end{table}",
    ]
    return "\n".join(lines)


def create_latex_table_fitness(df):
    """Create LaTeX table for fitness model configurations.

    Args:
        df: Table with the columns 'Rank', 'Hidden Layers', 'Activation',
            'Alpha', 'Learning Rate', 'Solver' and 'CV R²'.

    Returns:
        LaTeX source of a seven-column table labelled
        ``tab:fitness-top-configs``.
    """
    return _render_top_configs_table(
        df,
        model_name="Fitness Prediction",
        label="tab:fitness-top-configs",
        column_spec="cllcccc",
        header_cells=["Rank", "Layers", "Activation", "Alpha", "LR", "Solver", "CV R²"],
        plain_columns=["Rank", "Hidden Layers", "Activation", "Alpha", "Learning Rate", "Solver"],
        score_column="CV R²",
    )


def create_latex_table_testoutcome(df):
    """Create LaTeX table for test outcome model configurations.

    Args:
        df: Table with the columns 'Rank', 'Hidden Layers', 'Activation',
            'Alpha', 'Learning Rate' and 'CV F1 (Micro)'.

    Returns:
        LaTeX source of a six-column table labelled
        ``tab:testoutcome-top-configs``.
    """
    return _render_top_configs_table(
        df,
        model_name="Test Outcome Prediction",
        label="tab:testoutcome-top-configs",
        column_spec="cllccc",
        header_cells=["Rank", "Layers", "Activation", "Alpha", "LR", "CV F1 (Micro)"],
        plain_columns=["Rank", "Hidden Layers", "Activation", "Alpha", "Learning Rate"],
        score_column="CV F1 (Micro)",
    )

In [22]:
# Generate LaTeX tables
fitness_latex = create_latex_table_fitness(fitness_table)
testoutcome_latex = create_latex_table_testoutcome(testoutcome_table)

# Save to local replication output folder
out_fitness = OUT_DIR / 'fitness_top10_hyperparameters.tex'
out_testoutcome = OUT_DIR / 'testoutcome_top10_hyperparameters.tex'

# Write as UTF-8 explicitly: the table text contains non-ASCII characters ('R²')
# and the platform default encoding is not UTF-8 everywhere.
out_fitness.write_text(fitness_latex, encoding='utf-8')
out_testoutcome.write_text(testoutcome_latex, encoding='utf-8')

print('LaTeX tables saved in data/:')
print(f'  - {out_fitness.name}')
print(f'  - {out_testoutcome.name}')

if EXPORT_TO_THESIS:
    if TABLES_DIR.is_dir():
        shutil.copy2(out_fitness, TABLES_DIR / out_fitness.name)
        shutil.copy2(out_testoutcome, TABLES_DIR / out_testoutcome.name)
        print('Copied to thesis tables/:')
        print(f'  - {out_fitness.name}')
        print(f'  - {out_testoutcome.name}')
    else:
        # The thesis checkout is optional (e.g. absent on a replication machine);
        # the local copies above are already written, so report and continue.
        print(f'Skipped copy to thesis tables/: {TABLES_DIR} does not exist')

LaTeX tables saved in data/:
  - fitness_top10_hyperparameters.tex
  - testoutcome_top10_hyperparameters.tex
Copied to thesis tables/:
  - fitness_top10_hyperparameters.tex
  - testoutcome_top10_hyperparameters.tex


In [23]:
# Write both top-10 tables as CSV files into the local replication output folder
out_fitness_csv = OUT_DIR / 'fitness_top10_hyperparameters.csv'
out_testoutcome_csv = OUT_DIR / 'testoutcome_top10_hyperparameters.csv'

_csv_exports = (
    (fitness_table, out_fitness_csv),
    (testoutcome_table, out_testoutcome_csv),
)

# Index is omitted; the 'Rank' column already numbers the rows
for _table, _csv_path in _csv_exports:
    _table.to_csv(_csv_path, index=False)

print('CSV files saved in data/:')
for _, _csv_path in _csv_exports:
    print(f'  - {_csv_path.name}')

CSV files saved in data/:
  - fitness_top10_hyperparameters.csv
  - testoutcome_top10_hyperparameters.csv


In [24]:
def _print_pattern_summary(title, table, count_columns, score_label, score_column):
    """Print one summary section: title, rule, value counts per column, score range.

    Args:
        title: Heading line of the section.
        table: Table whose columns are summarised.
        count_columns: (description, column name) pairs; for each, the value
            counts of that column are printed as a dict.
        score_label: Text printed before the min/max of the score column.
        score_column: Name of the score column.
    """
    print(title)
    print("=" * 80)
    for description, column in count_columns:
        print(f"{description}: {table[column].value_counts().to_dict()}")
    scores = table[score_column]
    print(f"{score_label}: {scores.min():.4f} - {scores.max():.4f}")


# Columns summarised for both models; the fitness table additionally has a solver column
_shared_count_columns = [
    ("Activation functions", "Activation"),
    ("Hidden layer architectures", "Hidden Layers"),
    ("Learning rate schedules", "Learning Rate"),
    ("Alpha values", "Alpha"),
]

_print_pattern_summary(
    "FITNESS MODEL - Pattern Analysis",
    fitness_table,
    _shared_count_columns + [("Solvers", "Solver")],
    "R² score range",
    "CV R²",
)

# Blank line plus rule separating the two sections
print("\n" + "=" * 80)
_print_pattern_summary(
    "TEST OUTCOME MODEL - Pattern Analysis",
    testoutcome_table,
    _shared_count_columns,
    "F1 score range",
    "CV F1 (Micro)",
)

FITNESS MODEL - Pattern Analysis
Activation functions: {'tanh': 10}
Hidden layer architectures: {'(128, 64)': 8, '(64, 128)': 2}
Learning rate schedules: {'adaptive': 6, 'constant': 4}
Alpha values: {0.0001: 4, 1e-05: 3, 0.001: 3}
Solvers: {'adam': 7, 'sgd': 3}
R² score range: 0.6386 - 0.6426

TEST OUTCOME MODEL - Pattern Analysis
Activation functions: {'relu': 6, 'tanh': 4}
Hidden layer architectures: {'(64, 128)': 8, '(128, 64)': 2}
Learning rate schedules: {'constant': 5, 'adaptive': 5}
Alpha values: {0.001: 6, 0.0001: 4}
F1 score range: 0.7784 - 0.7817
