In [10]:
import sys
# Add the parent directory to the import search path.
# NOTE(review): presumably so the project-local `utils` package imported
# further down can be resolved from this notebook's folder — confirm.
sys.path.append('..')

In [11]:
import warnings
import multiprocessing
# Silence ResourceWarning inside the current kernel process.
warnings.filterwarnings("ignore", category=ResourceWarning)

# Also suppress multiprocessing warnings: PYTHONWARNINGS is read when a Python
# interpreter starts, so this setting applies to interpreters launched after
# this point (e.g. spawned worker processes), not to the running kernel itself.
import sys
import os
os.environ['PYTHONWARNINGS'] = 'ignore::ResourceWarning'

# Data Loading

In [12]:
from pathlib import Path
import pandas as pd

def load_datasets(data_root: str | Path = "data",
                  tasks: tuple[str, ...] = ("binary", "multiclass"),
                  splits: tuple[str, ...] = ("train", "val", "test")) -> dict:

    data_root = Path(data_root)
    datasets  = {}

    for task in tasks:
        task_dir     = data_root / task
        task_dict    = {}

        for split in splits:
            split_dict = {}
            for kind in ("X", "y"):
                file_path = task_dir / f"{kind}_{split}.pkl"
                split_dict[kind] = pd.read_pickle(file_path)
            task_dict[split] = split_dict

        datasets[task] = task_dict

    return datasets

In [13]:
from typing import Tuple, Literal
import pandas as pd

def load_split(
    preprocessing_type: Literal["cleaned_only", "full_process"],
    sampling_method: Literal["undersampled", "oversampled"],
    classification_type: Literal["binary", "multiclass"],
    data_root: str = "../data",
) -> Tuple[
    Tuple[pd.DataFrame, pd.Series],  # train: (X_train, y_train)
    Tuple[pd.DataFrame, pd.Series],  # val: (X_val, y_val)
    Tuple[pd.DataFrame, pd.Series]   # test: (X_test, y_test)
]:
    """
    Load the train/val/test splits for one preprocessing/sampling/task combination.

    Files are read through ``load_datasets`` from
    ``<data_root>/<preprocessing_type>/<sampling_method>/<classification_type>/``.

    Args:
        preprocessing_type: must be "cleaned_only" or "full_process"
        sampling_method: must be "undersampled" or "oversampled"
        classification_type: must be "binary" or "multiclass"
        data_root: base directory of the data tree (defaults to "../data",
            the location that was previously hard-coded)

    Returns:
        Tuple of (train, val, test) splits, where each split is (X, y)
        - train: (X_train, y_train)
        - val: (X_val, y_val)
        - test: (X_test, y_test)
    """
    # Only request the task that is actually returned, so the pickles of the
    # other task are neither read nor required to exist.
    dataset = load_datasets(
        f"{data_root}/{preprocessing_type}/{sampling_method}",
        tasks=(classification_type,),
    )[classification_type]

    return tuple(
        (dataset[split]["X"], dataset[split]["y"])
        for split in ("train", "val", "test")
    )

# Experiments

In [14]:
def combine_text(X):
    """Join resume and job-description text into one string per row.

    Both columns are cast to ``str`` and concatenated with the literal
    separator ``" [SEP] "``. The input is not modified.

    Returns:
        The ``.values`` of the combined string Series (one entry per row).
    """
    frame = X.copy()

    resume = frame["resume_text"].astype(str)
    job = frame["job_description_text"].astype(str)

    return (resume + " [SEP] " + job).values

In [15]:
# (train, val, test) tuples of (X, y) for the binary task, read from the
# cleaned-only, undersampled data; reused by every experiment below.
splits = load_split(preprocessing_type="cleaned_only", sampling_method="undersampled", classification_type="binary")

In [16]:
# Shared seed: passed as `random_state` to the estimators built in the
# pipeline factories and to the optimisation runs below.
SEED = 42

# Experiment 1: Base Parameters

In [17]:
from utils import ExperimentManager, Experiment

# Experiment manager for this notebook, created with the run directory and the
# two class labels. (Plain string: the path has no placeholders to format.)
manager = ExperimentManager("../runs/ensemble/optimization/bernoulli/", ["Fit", "Not Fit"])

In [18]:
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,StackingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


def compute_cosine_similarity(X):
    """
    Compute the TF-IDF cosine similarity between resume_text and
    job_description_text for each row.

    Args:
        X: DataFrame containing the columns 'resume_text' and
           'job_description_text', or a 2D ndarray whose two columns are in
           that order.

    Returns:
        2D NumPy array of shape (n_samples, 1) holding one similarity per row.
        An empty input yields an empty (0, 1) array.

    Raises:
        ValueError: if X is neither a DataFrame nor an ndarray, or the
            expected columns are missing.

    Note:
        A fresh TfidfVectorizer is fitted on the texts of X on every call, so
        the vocabulary and IDF weights depend on whichever batch is passed in
        (training data at fit time, the scored data at predict time).
    """

    # Defensive: ensure X is a DataFrame with expected columns
    if isinstance(X, np.ndarray):
        # If it's already an ndarray, we must know column order
        X = pd.DataFrame(X, columns=["resume_text", "job_description_text"])
    elif not isinstance(X, pd.DataFrame):
        raise ValueError("Input X must be a DataFrame or 2D ndarray.")

    if "resume_text" not in X.columns or "job_description_text" not in X.columns:
        raise ValueError("Expected columns 'resume_text' and 'job_description_text' not found.")

    # Nothing to vectorise: fitting on an empty corpus would raise.
    if len(X) == 0:
        return np.empty((0, 1), dtype=float)

    resume_texts = X["resume_text"].astype(str).tolist()
    job_texts = X["job_description_text"].astype(str).tolist()

    # Fit one shared vocabulary over both text columns
    vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
    vectorizer.fit(resume_texts + job_texts)

    # Transform each column once instead of once per row.
    resume_vecs = vectorizer.transform(resume_texts)
    job_vecs = vectorizer.transform(job_texts)

    # TfidfVectorizer L2-normalises every row by default (norm='l2'), so the
    # row-wise dot product is the cosine similarity; all-zero rows give 0.
    row_scores = resume_vecs.multiply(job_vecs).sum(axis=1)

    return np.asarray(row_scores, dtype=float).reshape(-1, 1)

def pipeline_factory(params):
    """Build the baseline stacking pipeline with fixed settings.

    ``params`` is accepted so the factory matches the signature used by the
    experiment runner, but it is not read: every component is created with
    the values hard-coded here.

    Returns:
        Pipeline with a 'features' FeatureUnion (TF-IDF + chi2 selection on
        the joined text, plus the cosine-similarity column) followed by the
        stacking classifier 'clf'.
    """
    # Base learners stacked under an EBM meta-learner
    base_learners = [
        ('lr', LogisticRegression(random_state=SEED)),
        ('nb', BernoulliNB()),
        ('rf', RandomForestClassifier(random_state=SEED)),
    ]
    meta_learner = ExplainableBoostingClassifier(random_state=SEED)
    stack = StackingClassifier(
        estimators=base_learners,
        final_estimator=meta_learner,
        cv=5,
        n_jobs=1,
    )

    # TF-IDF features from the joined text, reduced to the 100 best by chi2
    text_branch = Pipeline([
        ("join", FunctionTransformer(combine_text, validate=False)),
        ('tfidf', TfidfVectorizer()),
        ('selector', SelectKBest(chi2, k=100)),
    ])

    # Single cosine-similarity column (no scaling step is applied)
    similarity_branch = Pipeline([
        ('extract', FunctionTransformer(compute_cosine_similarity, validate=False)),
    ])

    feature_union = FeatureUnion([
        ('tfidf_features', text_branch),
        ('cosine_sim', similarity_branch),
    ])

    return Pipeline([
        ('features', feature_union),
        ('clf', stack),
    ])


# Baseline run: no param_space is supplied, so the factory's fixed settings
# are used as-is. (Plain strings: nothing is interpolated into them.)
experiment = Experiment(
    name="Baseline optimization EBM chi2 stack",
    description="No hyperparameter tuning yet but parameters are changed from defaults",
    pipeline_factory=pipeline_factory
)

manager.run_experiment(experiment, splits=splits)



=== Running Experiment: Baseline optimization EBM chi2 stack ===

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,714
   Classes: 2
   Overall Accuracy: 0.6814

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.6801
   Micro F1:     0.6814
   Weighted F1:  0.6801

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.6845  R: 0.6814
   Micro    - P: 0.6814  R: 0.6814
   Weighted - P: 0.6845  R: 0.6814

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Fit                  0.6608     0.7456     0.7007        857
   Not Fit              0.7082     0.6173     0.6596        857
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.6845     0.6814     0.6801       1714
   weighted avg         0.6845     0.6814     0.6801       1714

🔢 CONFUSION MATRIX
   Rows: True Labels, Columns: Predicted Labels
   Predicted →
   True ↓        Fit  Not Fi

<utils.ExperimentManger.Experiment at 0x7fccd213d490>

# Experiment 2: Hyperparameter Optimization

## Conservative Parameter Space

In [19]:
def conservative_ensemble_param_space(trial):
    """
    Small search space for the stacking ensemble.

    Each tuned value is drawn from ``trial`` via ``suggest_categorical`` or
    ``suggest_float`` under the same name as its key in the returned dict.
    Seeds and ``n_jobs`` settings are appended as fixed entries.

    Args:
        trial: object exposing ``suggest_categorical`` and ``suggest_float``.

    Returns:
        dict mapping prefixed parameter names to the sampled/fixed values.
    """
    space = {}

    def choose(name, options):
        # Categorical draw stored under its own name.
        space[name] = trial.suggest_categorical(name, options)

    def sample_float(name, low, high, **options):
        # Float draw stored under its own name.
        space[name] = trial.suggest_float(name, low, high, **options)

    # TF-IDF vectoriser
    choose('tfidf__ngram_range', [(1, 1), (1, 2)])
    choose('tfidf__max_features', [5000, 10000, 15000])
    choose('tfidf__sublinear_tf', [True, False])

    # Feature selection
    choose('selector__k', [100, 500, 1000, 2000])

    # Logistic regression base learner
    sample_float('clf__estimators__lr__C', 0.1, 10.0, log=True)
    choose('clf__estimators__lr__class_weight', [None, 'balanced'])

    # Random forest base learner
    choose('clf__estimators__rf__n_estimators', [100, 200])
    choose('clf__estimators__rf__max_depth', [None, 10, 20])

    # Naive Bayes base learner
    sample_float('clf__estimators__nb__alpha', 0.5, 2.0)

    # EBM final estimator
    sample_float('clf__final_estimator__learning_rate', 0.01, 0.05)
    choose('clf__final_estimator__interactions', [0, 3])

    # Entries that are not tuned
    space.update({
        'clf__estimators__lr__random_state': 42,
        'clf__estimators__rf__random_state': 42,
        'clf__final_estimator__random_state': 42,
        'clf__final_estimator__n_jobs': 1,
        'clf__estimators__rf__n_jobs': 1,
        'clf__n_jobs': 1,
    })

    return space

## L1 Regularization Space

In [20]:
def l1_regularization_param_space(trial):
    """
    Search space whose logistic-regression base learner is fixed to an L1
    penalty, tuned over the 'liblinear' and 'saga' solvers.

    Each tuned value is drawn from ``trial`` under the same name as its key
    in the returned dict; the remaining entries are fixed values.

    Args:
        trial: object exposing ``suggest_categorical`` and ``suggest_float``.

    Returns:
        dict mapping prefixed parameter names to the sampled/fixed values.
    """
    space = {}

    def choose(name, options):
        # Categorical draw stored under its own name.
        space[name] = trial.suggest_categorical(name, options)

    def sample_float(name, low, high, **options):
        # Float draw stored under its own name.
        space[name] = trial.suggest_float(name, low, high, **options)

    # ---- TF-IDF: restricted vocabulary ----
    choose('tfidf__max_features', [1000, 2000, 3000, 5000])
    choose('tfidf__ngram_range', [(1, 1), (1, 2)])
    choose('tfidf__min_df', [3, 5, 10, 0.01, 0.02])
    choose('tfidf__max_df', [0.7, 0.8, 0.85])
    space['tfidf__sublinear_tf'] = True
    space['tfidf__use_idf'] = True
    space['tfidf__stop_words'] = 'english'

    # ---- Feature selection ----
    choose('selector__k', [50, 100, 200, 300, 500])

    # ---- Logistic regression: L1 penalty only ----
    space['clf__estimators__lr__penalty'] = 'l1'
    choose('clf__estimators__lr__solver', ['liblinear', 'saga'])
    sample_float('clf__estimators__lr__C', 0.001, 1.0, log=True)
    space['clf__estimators__lr__max_iter'] = 1000
    space['clf__estimators__lr__class_weight'] = 'balanced'

    # ---- Random forest: shallow, large leaves ----
    choose('clf__estimators__rf__n_estimators', [50, 100, 150])
    choose('clf__estimators__rf__max_depth', [3, 5, 7, 10])
    choose('clf__estimators__rf__min_samples_split', [10, 20, 50])
    choose('clf__estimators__rf__min_samples_leaf', [5, 10, 20])
    choose('clf__estimators__rf__max_features', ['sqrt', 'log2'])
    space['clf__estimators__rf__class_weight'] = 'balanced'
    space['clf__estimators__rf__bootstrap'] = True

    # ---- Naive Bayes ----
    sample_float('clf__estimators__nb__alpha', 0.5, 5.0)

    # ---- EBM final estimator ----
    sample_float('clf__final_estimator__learning_rate', 0.001, 0.02, log=True)
    choose('clf__final_estimator__max_rounds', [500, 1000, 2000])
    choose('clf__final_estimator__early_stopping_rounds', [25, 50, 100])
    space['clf__final_estimator__validation_size'] = 0.2
    choose('clf__final_estimator__interactions', [0, 1, 2])
    choose('clf__final_estimator__max_bins', [32, 64, 128])

    # ---- Stacking ----
    choose('clf__cv', [5, 7, 10])

    return space

In [21]:
def pipeline_factory(params):
    """
    Build the stacking pipeline from a flat dict of prefixed hyperparameters.

    Keys are routed to components by their prefix (the prefix is stripped
    before the value is passed to the constructor):

        tfidf__*                  -> TfidfVectorizer
        selector__*               -> SelectKBest(chi2, ...)  (k=100 if none given)
        clf__estimators__lr__*    -> LogisticRegression
        clf__estimators__rf__*    -> RandomForestClassifier
        clf__estimators__nb__*    -> BernoulliNB
        clf__final_estimator__*   -> ExplainableBoostingClassifier
        clf__<name>               -> StackingClassifier (single-level names only)

    Args:
        params: dict of prefixed parameter names to values; ``None`` or an
            empty dict yields the defaults set here (SEED, n_jobs=1, k=100).

    Returns:
        Pipeline with a 'features' FeatureUnion (TF-IDF + chi2 selection on
        the joined text, plus the cosine-similarity column) followed by the
        stacking classifier 'clf'.
    """
    params = params or {}

    def with_prefix(prefix, defaults=None):
        """Return `defaults` overlaid with every entry of `params` under `prefix`."""
        selected = dict(defaults or {})
        for key, value in params.items():
            if key.startswith(prefix):
                # Slice rather than str.replace so that only the leading
                # prefix is removed, never a later occurrence in the name.
                selected[key[len(prefix):]] = value
        return selected

    # ========== EXTRACT PARAMETERS FOR EACH COMPONENT ==========

    tfidf_params = with_prefix('tfidf__')
    selector_params = with_prefix('selector__')
    lr_params = with_prefix('clf__estimators__lr__', {'random_state': SEED})
    rf_params = with_prefix('clf__estimators__rf__', {'random_state': SEED, 'n_jobs': 1})
    nb_params = with_prefix('clf__estimators__nb__')
    ebm_params = with_prefix('clf__final_estimator__', {'random_state': SEED, 'n_jobs': 1})

    # Stacking-level options are the 'clf__' keys with no further nesting
    # (e.g. 'clf__cv'); nested estimator keys are handled above.
    stacking_params = {'n_jobs': 1}
    stacking_params.update(
        (name, value)
        for name, value in with_prefix('clf__').items()
        if '__' not in name
    )

    # ========== CREATE COMPONENTS WITH PARAMETERS ==========

    # Create TF-IDF vectorizer
    tfidf = TfidfVectorizer(**tfidf_params)

    # Create feature selector
    if not selector_params:
        selector_params['k'] = 100  # Default value
    selector = SelectKBest(chi2, **selector_params)

    # Create base estimators
    # NOTE(review): scikit-learn estimators normally validate hyperparameters
    # in fit(), not in the constructor, so this fallback may never trigger —
    # confirm before relying on it.
    try:
        clf_lr = LogisticRegression(**lr_params)
    except ValueError as e:
        print(f"LogReg parameter error: {e}")
        # Fallback to safe parameters
        clf_lr = LogisticRegression(random_state=SEED, C=1.0, penalty='l2', solver='lbfgs')

    clf_rf = RandomForestClassifier(**rf_params)
    clf_nb = BernoulliNB(**nb_params)

    # Create EBM meta-learner
    # Best-effort: on any constructor error the tuned EBM settings are
    # dropped and a default EBM is used, with only the printed message as a
    # trace.
    try:
        ebm = ExplainableBoostingClassifier(**ebm_params)
    except Exception as e:
        print(f"EBM parameter error: {e}")
        # Fallback to safe parameters
        ebm = ExplainableBoostingClassifier(random_state=SEED, n_jobs=1)

    # Create stacking classifier
    stacking_clf = StackingClassifier(
        estimators=[
            ('lr', clf_lr),
            ('nb', clf_nb),
            ('rf', clf_rf)
        ],
        final_estimator=ebm,
        **stacking_params
    )

    return Pipeline([
        ('features', FeatureUnion([
            # TF-IDF features
            ('tfidf_features', Pipeline([
                ("join", FunctionTransformer(combine_text, validate=False)),
                ('tfidf', tfidf),
                ('selector', selector)
            ])),

            # Cosine similarity feature (single column, no scaling step)
            ('cosine_sim', Pipeline([
                ('extract', FunctionTransformer(compute_cosine_similarity, validate=False))
            ]))
        ])),
        ('clf', stacking_clf)
    ])

# Settings forwarded as keyword arguments to manager.run_experiment.
optuna_kwargs = {
    "n_trials": 30,         # number of optimisation trials
    "cv_folds": 20,         # NOTE(review): the run log reports "Custom Val Split" — confirm this value is actually used
    "scoring": "accuracy",
    "random_state": SEED,
    "optimise": True
}

experiment = Experiment(
    name="L1 Regularization optimization EBM chi2 stack",
    description="L1 focused hyperparameter tuning",
    pipeline_factory=pipeline_factory,
    param_space=l1_regularization_param_space
)

manager.run_experiment(experiment, splits=splits, **optuna_kwargs)


=== Running Experiment: L1 Regularization optimization EBM chi2 stack ===


Hyperparameter Optimization (Custom Val Split):   0%|                                         | 0/30 [00:00<?, ?trial/s][I 2025-07-15 23:20:05,668] A new study created in memory with name: no-name-6609a21d-2ba6-4d7f-bf5f-2223068da0fb
Hyperparameter Optimization (Custom Val Split):   3%| | 1/30 [00:24<11:36, 24.02s/trial, Train: 0.6986 | Val: 0.6763 | B[I 2025-07-15 23:20:29,694] Trial 0 finished with value: 0.6763440860215054 and parameters: {'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1), 'tfidf__min_df': 0.02, 'tfidf__max_df': 0.85, 'selector__k': 100, 'clf__estimators__lr__solver': 'liblinear', 'clf__estimators__lr__C': 0.013648799370370042, 'clf__estimators__rf__n_estimators': 50, 'clf__estimators__rf__max_depth': 5, 'clf__estimators__rf__min_samples_split': 50, 'clf__estimators__rf__min_samples_leaf': 10, 'clf__estimators__rf__max_features': 'sqrt', 'clf__estimators__nb__alpha': 3.582439613132824, 'clf__final_estimator__learning_rate': 0.003641374501882251, 'clf__final_


🎯 Optimization completed using Custom Val Split!
   Best score: 0.7161
   Total trials: 30
🔧 Training final model with best parameters...
✅ Training complete!
📊 Logging optimization summary...
✅ Optimization summary logged!

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,714
   Classes: 2
   Overall Accuracy: 0.6663

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.6661
   Micro F1:     0.6663
   Weighted F1:  0.6661

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.6666  R: 0.6663
   Micro    - P: 0.6663  R: 0.6663
   Weighted - P: 0.6666  R: 0.6663

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Fit                  0.6596     0.6873     0.6731        857
   Not Fit              0.6736     0.6453     0.6591        857
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.6666     0.6663     0.6661       1714
   weighted avg 

<utils.ExperimentManger.Experiment at 0x7fccd20819a0>

## L2 Parameter Space

In [22]:
def l2_regularization_param_space(trial):
    """
    Search space whose logistic-regression base learner is fixed to an L2
    penalty, tuned over the 'lbfgs' and 'saga' solvers.

    Each tuned value is drawn from ``trial`` under the same name as its key
    in the returned dict; the remaining entries are fixed values.

    Args:
        trial: object exposing ``suggest_categorical`` and ``suggest_float``.

    Returns:
        dict mapping prefixed parameter names to the sampled/fixed values.
    """
    space = {}

    def choose(name, options):
        # Categorical draw stored under its own name.
        space[name] = trial.suggest_categorical(name, options)

    def sample_float(name, low, high, **options):
        # Float draw stored under its own name.
        space[name] = trial.suggest_float(name, low, high, **options)

    # ---- TF-IDF: restricted vocabulary ----
    choose('tfidf__max_features', [1000, 2000, 3000, 5000])
    choose('tfidf__ngram_range', [(1, 1), (1, 2)])
    choose('tfidf__min_df', [3, 5, 10, 0.01, 0.02])
    choose('tfidf__max_df', [0.7, 0.8, 0.85])
    space['tfidf__sublinear_tf'] = True
    space['tfidf__use_idf'] = True
    space['tfidf__stop_words'] = 'english'

    # ---- Feature selection ----
    choose('selector__k', [50, 100, 200, 300, 500])

    # ---- Logistic regression: L2 penalty only ----
    space['clf__estimators__lr__penalty'] = 'l2'
    choose('clf__estimators__lr__solver', ['lbfgs', 'saga'])
    sample_float('clf__estimators__lr__C', 0.001, 1.0, log=True)
    space['clf__estimators__lr__max_iter'] = 1000
    space['clf__estimators__lr__class_weight'] = 'balanced'

    # ---- Random forest: shallow, large leaves ----
    choose('clf__estimators__rf__n_estimators', [50, 100, 150])
    choose('clf__estimators__rf__max_depth', [3, 5, 7, 10])
    choose('clf__estimators__rf__min_samples_split', [10, 20, 50])
    choose('clf__estimators__rf__min_samples_leaf', [5, 10, 20])
    choose('clf__estimators__rf__max_features', ['sqrt', 'log2'])
    space['clf__estimators__rf__class_weight'] = 'balanced'
    space['clf__estimators__rf__bootstrap'] = True

    # ---- Naive Bayes ----
    sample_float('clf__estimators__nb__alpha', 0.5, 5.0)

    # ---- EBM final estimator ----
    sample_float('clf__final_estimator__learning_rate', 0.001, 0.02, log=True)
    choose('clf__final_estimator__max_rounds', [500, 1000, 2000])
    choose('clf__final_estimator__early_stopping_rounds', [25, 50, 100])
    space['clf__final_estimator__validation_size'] = 0.2
    choose('clf__final_estimator__interactions', [0, 1, 2])
    choose('clf__final_estimator__max_bins', [32, 64, 128])

    # ---- Stacking ----
    choose('clf__cv', [5, 7, 10])

    return space


In [23]:
def pipeline_factory(params):
    """
    Build the stacking pipeline (with passthrough) from a flat dict of
    prefixed hyperparameters.

    Keys are routed to components by their prefix (the prefix is stripped
    before the value is passed to the constructor):

        tfidf__*                  -> TfidfVectorizer
        selector__*               -> SelectKBest(chi2, ...)  (k=100 if none given)
        clf__estimators__lr__*    -> LogisticRegression
        clf__estimators__rf__*    -> RandomForestClassifier
        clf__estimators__nb__*    -> BernoulliNB
        clf__final_estimator__*   -> ExplainableBoostingClassifier
        clf__<name>               -> StackingClassifier (single-level names only)

    The StackingClassifier is created with ``passthrough=True``.

    Args:
        params: dict of prefixed parameter names to values; ``None`` or an
            empty dict yields the defaults set here (SEED, n_jobs=1, k=100).

    Returns:
        Pipeline with a 'features' FeatureUnion (TF-IDF + chi2 selection on
        the joined text, plus the cosine-similarity column) followed by the
        stacking classifier 'clf'.
    """
    params = params or {}

    def with_prefix(prefix, defaults=None):
        """Return `defaults` overlaid with every entry of `params` under `prefix`."""
        selected = dict(defaults or {})
        for key, value in params.items():
            if key.startswith(prefix):
                # Slice rather than str.replace so that only the leading
                # prefix is removed, never a later occurrence in the name.
                selected[key[len(prefix):]] = value
        return selected

    # ========== EXTRACT PARAMETERS FOR EACH COMPONENT ==========

    tfidf_params = with_prefix('tfidf__')
    selector_params = with_prefix('selector__')
    lr_params = with_prefix('clf__estimators__lr__', {'random_state': SEED})
    rf_params = with_prefix('clf__estimators__rf__', {'random_state': SEED, 'n_jobs': 1})
    nb_params = with_prefix('clf__estimators__nb__')
    ebm_params = with_prefix('clf__final_estimator__', {'random_state': SEED, 'n_jobs': 1})

    # Stacking-level options are the 'clf__' keys with no further nesting
    # (e.g. 'clf__cv'); nested estimator keys are handled above.
    stacking_params = {'n_jobs': 1}
    stacking_params.update(
        (name, value)
        for name, value in with_prefix('clf__').items()
        if '__' not in name
    )

    # ========== CREATE COMPONENTS WITH PARAMETERS ==========

    # Create TF-IDF vectorizer
    tfidf = TfidfVectorizer(**tfidf_params)

    # Create feature selector
    if not selector_params:
        selector_params['k'] = 100  # Default value
    selector = SelectKBest(chi2, **selector_params)

    # Create base estimators
    # NOTE(review): scikit-learn estimators normally validate hyperparameters
    # in fit(), not in the constructor, so this fallback may never trigger —
    # confirm before relying on it.
    try:
        clf_lr = LogisticRegression(**lr_params)
    except ValueError as e:
        print(f"LogReg parameter error: {e}")
        # Fallback to safe parameters
        clf_lr = LogisticRegression(random_state=SEED, C=1.0, penalty='l2', solver='lbfgs')

    clf_rf = RandomForestClassifier(**rf_params)
    clf_nb = BernoulliNB(**nb_params)

    # Create EBM meta-learner
    # Best-effort: on any constructor error the tuned EBM settings are
    # dropped and a default EBM is used, with only the printed message as a
    # trace.
    try:
        ebm = ExplainableBoostingClassifier(**ebm_params)
    except Exception as e:
        print(f"EBM parameter error: {e}")
        # Fallback to safe parameters
        ebm = ExplainableBoostingClassifier(random_state=SEED, n_jobs=1)

    # Create stacking classifier
    stacking_clf = StackingClassifier(
        estimators=[
            ('lr', clf_lr),
            ('nb', clf_nb),
            ('rf', clf_rf)
        ],
        final_estimator=ebm,
        passthrough=True,
        **stacking_params
    )

    return Pipeline([
        ('features', FeatureUnion([
            # TF-IDF features
            ('tfidf_features', Pipeline([
                ("join", FunctionTransformer(combine_text, validate=False)),
                ('tfidf', tfidf),
                ('selector', selector)
            ])),

            # Cosine similarity feature (single column, no scaling step)
            ('cosine_sim', Pipeline([
                ('extract', FunctionTransformer(compute_cosine_similarity, validate=False))
            ]))
        ])),
        ('clf', stacking_clf)
    ])

# Settings forwarded as keyword arguments to manager.run_experiment.
optuna_kwargs = {
    "n_trials": 30,         # number of optimisation trials
    "cv_folds": 20,         # NOTE(review): the run log reports "Custom Val Split" — confirm this value is actually used
    "scoring": "accuracy",
    "random_state": SEED,
    "optimise": True
}

experiment = Experiment(
    name="L2 Regularization optimization EBM chi2 stack",
    description="L2 focused hyperparameter tuning",
    pipeline_factory=pipeline_factory,
    param_space=l2_regularization_param_space
)

manager.run_experiment(experiment, splits=splits, **optuna_kwargs)


=== Running Experiment: L2 Regularization optimization EBM chi2 stack ===


Hyperparameter Optimization (Custom Val Split):   0%|                                         | 0/30 [00:00<?, ?trial/s][I 2025-07-15 23:36:59,018] A new study created in memory with name: no-name-d38b7507-9d5d-4e5b-a2fc-d6f4fc46447f
Hyperparameter Optimization (Custom Val Split):   3%| | 1/30 [00:37<18:07, 37.49s/trial, Train: 0.7030 | Val: 0.6801 | B[I 2025-07-15 23:37:36,508] Trial 0 finished with value: 0.6801075268817204 and parameters: {'tfidf__max_features': 2000, 'tfidf__ngram_range': (1, 1), 'tfidf__min_df': 3, 'tfidf__max_df': 0.85, 'selector__k': 50, 'clf__estimators__lr__solver': 'lbfgs', 'clf__estimators__lr__C': 0.006500772939840752, 'clf__estimators__rf__n_estimators': 50, 'clf__estimators__rf__max_depth': 5, 'clf__estimators__rf__min_samples_split': 20, 'clf__estimators__rf__min_samples_leaf': 10, 'clf__estimators__rf__max_features': 'log2', 'clf__estimators__nb__alpha': 4.555199971262057, 'clf__final_estimator__learning_rate': 0.004320008388957367, 'clf__final_estimato


🎯 Optimization completed using Custom Val Split!
   Best score: 0.7430
   Total trials: 30
🔧 Training final model with best parameters...
✅ Training complete!
📊 Logging optimization summary...
✅ Optimization summary logged!

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,714
   Classes: 2
   Overall Accuracy: 0.6803

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.6790
   Micro F1:     0.6803
   Weighted F1:  0.6790

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.6832  R: 0.6803
   Micro    - P: 0.6803  R: 0.6803
   Weighted - P: 0.6832  R: 0.6803

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Fit                  0.6601     0.7433     0.6992        857
   Not Fit              0.7063     0.6173     0.6588        857
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.6832     0.6803     0.6790       1714
   weighted avg 

<utils.ExperimentManger.Experiment at 0x7fcccf95de80>

## Elasticnet Parameter Space

In [24]:
def elasticnet_regularization_param_space(trial):
    """
    Search space whose logistic-regression base learner is fixed to the
    'elasticnet' penalty with the 'saga' solver; C and l1_ratio are tuned.

    Each tuned value is drawn from ``trial`` under the same name as its key
    in the returned dict; the remaining entries are fixed values.

    Args:
        trial: object exposing ``suggest_categorical`` and ``suggest_float``.

    Returns:
        dict mapping prefixed parameter names to the sampled/fixed values.
    """
    space = {}

    def choose(name, options):
        # Categorical draw stored under its own name.
        space[name] = trial.suggest_categorical(name, options)

    def sample_float(name, low, high, **options):
        # Float draw stored under its own name.
        space[name] = trial.suggest_float(name, low, high, **options)

    # ---- TF-IDF: restricted vocabulary ----
    choose('tfidf__max_features', [1000, 2000, 3000, 5000])
    choose('tfidf__ngram_range', [(1, 1), (1, 2)])
    choose('tfidf__min_df', [3, 5, 10, 0.01, 0.02])
    choose('tfidf__max_df', [0.7, 0.8, 0.85])
    space['tfidf__sublinear_tf'] = True
    space['tfidf__use_idf'] = True
    space['tfidf__stop_words'] = 'english'

    # ---- Feature selection ----
    choose('selector__k', [50, 100, 200, 300, 500])

    # ---- Logistic regression: elasticnet penalty, saga solver ----
    space['clf__estimators__lr__penalty'] = 'elasticnet'
    space['clf__estimators__lr__solver'] = 'saga'
    sample_float('clf__estimators__lr__C', 0.001, 1.0, log=True)
    sample_float('clf__estimators__lr__l1_ratio', 0.1, 0.9)
    space['clf__estimators__lr__max_iter'] = 2000  # higher cap than the L1/L2 spaces
    space['clf__estimators__lr__class_weight'] = 'balanced'

    # ---- Random forest: shallow, large leaves ----
    choose('clf__estimators__rf__n_estimators', [50, 100, 150])
    choose('clf__estimators__rf__max_depth', [3, 5, 7, 10])
    choose('clf__estimators__rf__min_samples_split', [10, 20, 50])
    choose('clf__estimators__rf__min_samples_leaf', [5, 10, 20])
    choose('clf__estimators__rf__max_features', ['sqrt', 'log2'])
    space['clf__estimators__rf__class_weight'] = 'balanced'
    space['clf__estimators__rf__bootstrap'] = True

    # ---- Naive Bayes ----
    sample_float('clf__estimators__nb__alpha', 0.5, 5.0)

    # ---- EBM final estimator ----
    sample_float('clf__final_estimator__learning_rate', 0.001, 0.02, log=True)
    choose('clf__final_estimator__max_rounds', [500, 1000, 2000])
    choose('clf__final_estimator__early_stopping_rounds', [25, 50, 100])
    space['clf__final_estimator__validation_size'] = 0.2
    choose('clf__final_estimator__interactions', [0, 1, 2])
    choose('clf__final_estimator__max_bins', [32, 64, 128])

    # ---- Stacking ----
    choose('clf__cv', [5, 7, 10])

    return space

In [25]:
def _extract_prefixed_params(params, prefix, defaults=None, direct_only=False):
    """
    Collect the entries of ``params`` whose key starts with ``prefix``.

    Args:
        params: flat mapping of prefixed parameter names to values.
        prefix: leading string to match (e.g. 'tfidf__'). Only the leading
            occurrence is stripped from each matching key.
        defaults: optional mapping copied into the result first; matching
            entries from ``params`` override these values.
        direct_only: when True, skip keys that still contain '__' after the
            prefix has been removed (i.e. keys addressed to a nested component).

    Returns:
        A new dict mapping the un-prefixed names to their values.
    """
    extracted = dict(defaults or {})
    for key, value in params.items():
        if not key.startswith(prefix):
            continue
        # Slice instead of str.replace so only the leading prefix is removed.
        name = key[len(prefix):]
        if direct_only and '__' in name:
            continue
        extracted[name] = value
    return extracted


def pipeline_factory(params):
    """
    Build the stacking-ensemble pipeline from a flat, prefixed parameter dict.

    Keys are routed to components by prefix:
        'tfidf__'                 -> TfidfVectorizer
        'selector__'              -> SelectKBest(chi2, ...)
        'clf__estimators__lr__'   -> LogisticRegression
        'clf__estimators__rf__'   -> RandomForestClassifier
        'clf__estimators__nb__'   -> BernoulliNB
        'clf__final_estimator__'  -> ExplainableBoostingClassifier
        'clf__<name>'             -> StackingClassifier (only when <name>
                                     contains no further '__')

    Args:
        params: dict mapping prefixed parameter names to values.

    Returns:
        Pipeline whose 'features' step is a FeatureUnion of the selected
        TF-IDF features and the output of compute_cosine_similarity, followed
        by the 'clf' StackingClassifier.
    """

    # ========== EXTRACT PARAMETERS FOR EACH COMPONENT ==========

    tfidf_params = _extract_prefixed_params(params, 'tfidf__')
    selector_params = _extract_prefixed_params(params, 'selector__')
    lr_params = _extract_prefixed_params(
        params, 'clf__estimators__lr__', defaults={'random_state': SEED}
    )
    rf_params = _extract_prefixed_params(
        params, 'clf__estimators__rf__', defaults={'random_state': SEED, 'n_jobs': 1}
    )
    nb_params = _extract_prefixed_params(params, 'clf__estimators__nb__')
    ebm_params = _extract_prefixed_params(
        params, 'clf__final_estimator__', defaults={'random_state': SEED, 'n_jobs': 1}
    )
    # Only top-level stacking options (e.g. 'clf__cv'); nested estimator keys
    # are handled by the more specific prefixes above.
    stacking_params = _extract_prefixed_params(
        params, 'clf__', defaults={'n_jobs': 1}, direct_only=True
    )

    # ========== CREATE COMPONENTS WITH PARAMETERS ==========

    tfidf = TfidfVectorizer(**tfidf_params)

    # Fall back to k=100 only when no selector parameter was supplied at all.
    if not selector_params:
        selector_params['k'] = 100
    selector = SelectKBest(chi2, **selector_params)

    # Base estimators.
    # NOTE(review): scikit-learn estimators usually validate parameters in
    # fit(), not __init__ -- confirm this guard can actually trigger.
    try:
        clf_lr = LogisticRegression(**lr_params)
    except ValueError as e:
        print(f"LogReg parameter error: {e}")
        # Fallback to safe parameters
        clf_lr = LogisticRegression(random_state=SEED, C=1.0, penalty='l2', solver='lbfgs')

    clf_rf = RandomForestClassifier(**rf_params)
    clf_nb = BernoulliNB(**nb_params)

    # EBM meta-learner; on any construction error the tuned parameters are
    # dropped and a default-configured EBM is used instead.
    try:
        ebm = ExplainableBoostingClassifier(**ebm_params)
    except Exception as e:
        print(f"EBM parameter error: {e}")
        # Fallback to safe parameters
        ebm = ExplainableBoostingClassifier(random_state=SEED, n_jobs=1)

    stacking_clf = StackingClassifier(
        estimators=[
            ('lr', clf_lr),
            ('nb', clf_nb),
            ('rf', clf_rf)
        ],
        final_estimator=ebm,
        **stacking_params
    )

    return Pipeline([
        ('features', FeatureUnion([
            # TF-IDF features: join text, vectorise, keep the chi2-selected terms
            ('tfidf_features', Pipeline([
                ("join", FunctionTransformer(combine_text, validate=False)),
                ('tfidf', tfidf),
                ('selector', selector)
            ])),

            # Cosine similarity feature (passed through as-is; no scaling step here)
            ('cosine_sim', Pipeline([
                ('extract', FunctionTransformer(compute_cosine_similarity, validate=False))
            ]))
        ])),
        ('clf', stacking_clf)
    ])

# Keyword arguments forwarded to manager.run_experiment below.
optuna_kwargs = {
    "n_trials": 30,     # number of optimisation trials
    # NOTE(review): the run log reports "Custom Val Split" -- confirm whether
    # cv_folds is used at all when a validation split is supplied.
    "cv_folds": 20,
    "scoring": "accuracy",
    "random_state": SEED,
    "optimise": True,
}

experiment = Experiment(
    name="ElasticNet Regularization optimization EBM chi2 stack",
    description="ElasticNet focused hyperparameter tuning",
    pipeline_factory=pipeline_factory,
    param_space=elasticnet_regularization_param_space
)

manager.run_experiment(experiment, splits=splits, **optuna_kwargs)


=== Running Experiment: ElasticNet Regularization optimization EBM chi2 stack ===


Hyperparameter Optimization (Custom Val Split):   0%|                                         | 0/30 [00:00<?, ?trial/s][I 2025-07-16 01:41:10,575] A new study created in memory with name: no-name-89991111-256c-48b4-a637-4b9f1dea3e71
Hyperparameter Optimization (Custom Val Split):   3%| | 1/30 [00:23<11:34, 23.93s/trial, Train: 0.7184 | Val: 0.6860 | B[I 2025-07-16 01:41:34,509] Trial 0 finished with value: 0.6860215053763441 and parameters: {'tfidf__max_features': 3000, 'tfidf__ngram_range': (1, 1), 'tfidf__min_df': 10, 'tfidf__max_df': 0.7, 'selector__k': 300, 'clf__estimators__lr__C': 0.38097714291113866, 'clf__estimators__lr__l1_ratio': 0.7531359880622237, 'clf__estimators__rf__n_estimators': 50, 'clf__estimators__rf__max_depth': 10, 'clf__estimators__rf__min_samples_split': 50, 'clf__estimators__rf__min_samples_leaf': 20, 'clf__estimators__rf__max_features': 'sqrt', 'clf__estimators__nb__alpha': 0.5149401937924916, 'clf__final_estimator__learning_rate': 0.011671875351247803, 'clf_


🎯 Optimization completed using Custom Val Split!
   Best score: 0.7016
   Total trials: 30
🔧 Training final model with best parameters...




✅ Training complete!
📊 Logging optimization summary...
✅ Optimization summary logged!

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,714
   Classes: 2
   Overall Accuracy: 0.6307

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.6307
   Micro F1:     0.6307
   Weighted F1:  0.6307

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.6307  R: 0.6307
   Micro    - P: 0.6307  R: 0.6307
   Weighted - P: 0.6307  R: 0.6307

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Fit                  0.6330     0.6219     0.6274        857
   Not Fit              0.6284     0.6394     0.6339        857
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.6307     0.6307     0.6307       1714
   weighted avg         0.6307     0.6307     0.6307       1714

🔢 CONFUSION MATRIX
   Rows: True Labels, Columns: Predicted Labels
   Predicted →
   True 

<utils.ExperimentManger.Experiment at 0x7fccd1fed370>

In [26]:
# Print the comparison table for every experiment registered with the manager.
manager.compare_experiments()

# Named `summary_dir` rather than `dir` so the built-in dir() is not shadowed
# for the rest of the kernel session.
summary_dir = "../experiment_summaries/ensemble_optimization"
filename  = "ensemble_optimization_summaries.csv"

manager.export_experiment_summary(summary_dir, filename)


=== Experiment Comparison (accuracy) ===
Experiment                     Test Score   Status    
-------------------------------------------------------
Baseline optimization EBM chi2 stack 0.6814       ✅ Completed
L1 Regularization optimization EBM chi2 stack 0.6663       ✅ Completed
L2 Regularization optimization EBM chi2 stack 0.6803       ✅ Completed
ElasticNet Regularization optimization EBM chi2 stack 0.6307       ✅ Completed
📊 Experiment summary exported to: ensemble_optimization_summaries.csv


Unnamed: 0,name,status,timestamp,description,test_accuracy,test_macro_precision,test_macro_recall,test_macro_f1,test_micro_precision,test_micro_recall,test_micro_f1,test_weighted_precision,test_weighted_recall,test_weighted_f1,test_num_samples,test_num_classes
0,Baseline optimization EBM chi2 stack,Completed,2025-07-15T23:20:05.591521,No hyperparameter tuning yet but parameters ar...,0.681447,0.684486,0.681447,0.680129,0.681447,0.681447,0.681447,0.684486,0.681447,0.680129,1714,2
1,L1 Regularization optimization EBM chi2 stack,Completed,2025-07-15T23:36:58.982653,L1 focused hyperparameter tuning,0.666278,0.666572,0.666278,0.66613,0.666278,0.666278,0.666278,0.666572,0.666278,0.66613,1714,2
2,L2 Regularization optimization EBM chi2 stack,Completed,2025-07-16T01:41:10.517114,L2 focused hyperparameter tuning,0.68028,0.683189,0.68028,0.679006,0.68028,0.68028,0.68028,0.683189,0.68028,0.679006,1714,2
3,ElasticNet Regularization optimization EBM chi...,Completed,2025-07-16T01:54:39.248084,ElasticNet focused hyperparameter tuning,0.630688,0.630728,0.630688,0.63066,0.630688,0.630688,0.630688,0.630728,0.630688,0.63066,1714,2
