In [4]:
import sys

from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

# NOTE(review): ExplainableBoostingClassifier is used by the EBM experiment
# cell but is not imported anywhere in the visible notebook -- presumably
# `from interpret.glassbox import ExplainableBoostingClassifier`; confirm and
# add it here so the notebook survives Restart & Run All.

# Put the parent directory on the import path (for project-local modules).
sys.path.append('..')

In [5]:
import warnings
import multiprocessing
# Silence ResourceWarning inside this (kernel) process.
warnings.filterwarnings("ignore", category=ResourceWarning)

# Also suppress multiprocessing warnings
# PYTHONWARNINGS is read when an interpreter starts, so setting it here does
# not change the running kernel; it is inherited by Python child processes
# launched afterwards.
import sys
import os
os.environ['PYTHONWARNINGS'] = 'ignore::ResourceWarning'

# Data Loading

In [6]:
from pathlib import Path
import pandas as pd

def load_datasets(data_root: str | Path = "data",
                  tasks: tuple[str, ...] = ("binary", "multiclass"),
                  splits: tuple[str, ...] = ("train", "val", "test")) -> dict:

    data_root = Path(data_root)
    datasets  = {}

    for task in tasks:
        task_dir     = data_root / task
        task_dict    = {}

        for split in splits:
            split_dict = {}
            for kind in ("X", "y"):
                file_path = task_dir / f"{kind}_{split}.pkl"
                split_dict[kind] = pd.read_pickle(file_path)
            task_dict[split] = split_dict

        datasets[task] = task_dict

    return datasets

In [7]:
from typing import Tuple, Literal
import pandas as pd

def load_split(
    preprocessing_type: Literal["cleaned_only", "full_process"],
    sampling_method: Literal["undersampled", "oversampled"],
    classification_type: Literal["binary", "multiclass"],
    data_root: "str | Path" = "../data",
) -> Tuple[
    Tuple[pd.DataFrame, pd.Series],  # train: (X_train, y_train)
    Tuple[pd.DataFrame, pd.Series],  # val: (X_val, y_val)
    Tuple[pd.DataFrame, pd.Series]   # test: (X_test, y_test)
]:
    """
    Load the train/val/test splits for one preprocessing/sampling/task combination.

    Data is read from ``<data_root>/<preprocessing_type>/<sampling_method>/``
    via ``load_datasets``. Only the requested task's files are read, so the
    other task's pickles neither cost load time nor need to exist.

    Args:
        preprocessing_type: must be "cleaned_only" or "full_process"
        sampling_method: must be "undersampled" or "oversampled"
        classification_type: must be "binary" or "multiclass"
        data_root: directory holding the preprocessing sub-directories;
            defaults to "../data"

    Returns:
        Tuple of (train, val, test) splits, where each split is (X, y)
        - train: (X_train, y_train)
        - val: (X_val, y_val)
        - test: (X_test, y_test)

    Raises:
        KeyError: if classification_type is not "binary" or "multiclass".
    """
    if classification_type not in ("binary", "multiclass"):
        # KeyError keeps an unknown task failing the way a missing dict key would.
        raise KeyError(classification_type)

    dataset = load_datasets(
        f"{data_root}/{preprocessing_type}/{sampling_method}",
        tasks=(classification_type,),
    )[classification_type]

    return tuple(
        (dataset[split]["X"], dataset[split]["y"])
        for split in ("train", "val", "test")
    )

# Experiments

In [8]:
def combine_text(X):
    """Join each row's resume and job description into one string.

    Both columns are cast to ``str`` and concatenated as
    ``"<resume_text> [SEP] <job_description_text>"``. The input frame is only
    read, never modified, so no defensive copy is made.

    Args:
        X: Frame with "resume_text" and "job_description_text" columns.

    Returns:
        The ``.values`` of the combined string Series, one entry per row.
    """
    resume = X["resume_text"].astype(str)
    job_description = X["job_description_text"].astype(str)

    return (resume + " [SEP] " + job_description).values

In [9]:
# (train, val, test) for the binary task on cleaned-only, undersampled data;
# each element is an (X, y) pair.
splits = load_split(
    preprocessing_type="cleaned_only",
    sampling_method="undersampled",
    classification_type="binary",
)

In [10]:
# Shared random_state passed to every seeded estimator in the experiments below.
SEED = 42

## Experiment 3: Feature Selection methods

### Chi-Squared (chi2) Configurations

Best-scoring `k` (number of selected features) for each meta-learner, shown as `k @ score`:
- EBM Chi2: 100 @ 67.04%
- Ridge Chi2: 100 @ 67.09%
- LogReg Chi2: 100 @ 66.51%

In [18]:
from utils import ExperimentManager, Experiment

# Manager for all chi2 feature-selection runs; the second argument lists the
# class labels ("Fit" / "Not Fit"). A plain string is used for the path: the
# literal has no placeholders, so an f-string prefix is unnecessary.
chi2_manager = ExperimentManager("../runs/ensembles/chi2/", ["Fit", "Not Fit"])

In [19]:
# Candidate values for SelectKBest's `k` (number of top chi2-scored features
# kept); each experiment loop below runs once per value.
k_sizes = [10, 50, 100, 500, 1000, 5000]

In [20]:
from sklearn.feature_selection import chi2, SelectKBest

# One experiment per candidate k: stacked LR / NB / RF base learners with an
# Explainable Boosting Machine as the meta-learner.
for size in k_sizes:

    # `k=size` freezes the current loop value when the function is defined.
    # A plain closure would read the notebook-global `size` only when the
    # factory is called, so any call made after the loop advanced (or after
    # another cell rebinds `size`) would build a pipeline with the wrong k.
    def pipeline_factory(params, k=size):
        """Build the chi2-selected stacking pipeline with an EBM meta-learner.

        Args:
            params: Not used by this factory.
                NOTE(review): presumably supplied by the experiment runner --
                confirm in utils.
            k: Number of top chi2-scored TF-IDF features kept by SelectKBest.

        Returns:
            Unfitted Pipeline: join text -> TF-IDF -> SelectKBest(chi2) -> stack.
        """
        stacking_clf = StackingClassifier(
            estimators=[
                ('lr', LogisticRegression(random_state=SEED)),
                ('nb', BernoulliNB()),
                ('rf', RandomForestClassifier(random_state=SEED)),
            ],
            final_estimator=ExplainableBoostingClassifier(random_state=SEED),
            cv=5,
            n_jobs=1
        )

        return Pipeline([
            ("join", FunctionTransformer(combine_text, validate=False)),
            ('tfidf', TfidfVectorizer()),  # Convert text to numeric
            ('selector', SelectKBest(chi2, k=k)),
            ('clf', stacking_clf)
        ])

    experiment = Experiment(
        name=f"EBM meta chi2 {size} stack",
        description=f"Stack ensemble classifier with EBM classifier and chi2 {size}",
        pipeline_factory=pipeline_factory
    )

    chi2_manager.run_experiment(experiment, splits=splits)



=== Running Experiment: EBM meta chi2 10 stack ===

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,714
   Classes: 2
   Overall Accuracy: 0.5408

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.5190
   Micro F1:     0.5408
   Weighted F1:  0.5190

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.5499  R: 0.5408
   Micro    - P: 0.5408  R: 0.5408
   Weighted - P: 0.5499  R: 0.5408

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Fit                  0.5286     0.7538     0.6215        857
   Not Fit              0.5711     0.3279     0.4166        857
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.5499     0.5408     0.5190       1714
   weighted avg         0.5499     0.5408     0.5190       1714

🔢 CONFUSION MATRIX
   Rows: True Labels, Columns: Predicted Labels
   Predicted →
   True ↓        Fit

In [21]:
# One experiment per candidate k: stacked LR / NB / RF base learners with a
# RidgeClassifier as the meta-learner.
for size in k_sizes:

    # `k=size` freezes the current loop value when the function is defined.
    # A plain closure would read the notebook-global `size` only when the
    # factory is called, so any call made after the loop advanced (or after
    # another cell rebinds `size`) would build a pipeline with the wrong k.
    def pipeline_factory(params, k=size):
        """Build the chi2-selected stacking pipeline with a ridge meta-learner.

        Args:
            params: Not used by this factory.
                NOTE(review): presumably supplied by the experiment runner --
                confirm in utils.
            k: Number of top chi2-scored TF-IDF features kept by SelectKBest.

        Returns:
            Unfitted Pipeline: join text -> TF-IDF -> SelectKBest(chi2) -> stack.
        """
        stacking_clf = StackingClassifier(
            estimators=[
                ('lr', LogisticRegression(random_state=SEED)),
                ('nb', BernoulliNB()),
                ('rf', RandomForestClassifier(random_state=SEED)),
            ],
            final_estimator=RidgeClassifier(random_state=SEED),
            cv=5,
            n_jobs=1
        )

        return Pipeline([
            ("join", FunctionTransformer(combine_text, validate=False)),
            ('tfidf', TfidfVectorizer()),  # Convert text to numeric
            ('selector', SelectKBest(chi2, k=k)),
            ('clf', stacking_clf)
        ])

    experiment = Experiment(
        name=f"ridge meta chi2 {size} stack",
        description=f"Stack ensemble classifier with ridge classifier and chi2 {size}",
        pipeline_factory=pipeline_factory
    )

    chi2_manager.run_experiment(experiment, splits=splits)


=== Running Experiment: ridge meta chi2 10 stack ===

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,714
   Classes: 2
   Overall Accuracy: 0.5438

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.5293
   Micro F1:     0.5438
   Weighted F1:  0.5293

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.5499  R: 0.5438
   Micro    - P: 0.5438  R: 0.5438
   Weighted - P: 0.5499  R: 0.5438

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Fit                  0.5324     0.7188     0.6117        857
   Not Fit              0.5673     0.3687     0.4470        857
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.5499     0.5438     0.5293       1714
   weighted avg         0.5499     0.5438     0.5293       1714

🔢 CONFUSION MATRIX
   Rows: True Labels, Columns: Predicted Labels
   Predicted →
   True ↓        F

In [22]:
# One experiment per candidate k: stacked LR / NB / RF base learners with a
# LogisticRegression as the meta-learner.
for size in k_sizes:

    # `k=size` freezes the current loop value when the function is defined.
    # A plain closure would read the notebook-global `size` only when the
    # factory is called, so any call made after the loop advanced (or after
    # another cell rebinds `size`) would build a pipeline with the wrong k.
    def pipeline_factory(params, k=size):
        """Build the chi2-selected stacking pipeline with a LogReg meta-learner.

        Args:
            params: Not used by this factory.
                NOTE(review): presumably supplied by the experiment runner --
                confirm in utils.
            k: Number of top chi2-scored TF-IDF features kept by SelectKBest.

        Returns:
            Unfitted Pipeline: join text -> TF-IDF -> SelectKBest(chi2) -> stack.
        """
        stacking_clf = StackingClassifier(
            estimators=[
                ('lr', LogisticRegression(random_state=SEED)),
                ('nb', BernoulliNB()),
                ('rf', RandomForestClassifier(random_state=SEED)),
            ],
            final_estimator=LogisticRegression(random_state=SEED),
            cv=5,
            n_jobs=1
        )

        return Pipeline([
            ("join", FunctionTransformer(combine_text, validate=False)),
            ('tfidf', TfidfVectorizer()),  # Convert text to numeric
            ('selector', SelectKBest(chi2, k=k)),
            ('clf', stacking_clf)
        ])

    experiment = Experiment(
        name=f"LogReg meta chi2 {size} stack",
        description=f"Stack ensemble classifier with LogReg classifier and chi2 {size}",
        pipeline_factory=pipeline_factory
    )

    chi2_manager.run_experiment(experiment, splits=splits)


=== Running Experiment: LogReg meta chi2 10 stack ===

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,714
   Classes: 2
   Overall Accuracy: 0.5414

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.5283
   Micro F1:     0.5414
   Weighted F1:  0.5283

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.5466  R: 0.5414
   Micro    - P: 0.5414  R: 0.5414
   Weighted - P: 0.5466  R: 0.5414

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Fit                  0.5311     0.7083     0.6070        857
   Not Fit              0.5622     0.3746     0.4496        857
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.5466     0.5414     0.5283       1714
   weighted avg         0.5466     0.5414     0.5283       1714

🔢 CONFUSION MATRIX
   Rows: True Labels, Columns: Predicted Labels
   Predicted →
   True ↓        

In [23]:
# Print the side-by-side comparison of every experiment run through this manager.
chi2_manager.compare_experiments()

# Target file name and directory handed to export_experiment_summary below.
filename = "chi2_selector_comparisons.csv"
# NOTE(review): `dir` shadows the built-in dir() for the rest of the kernel
# session; consider renaming (e.g. summary_dir) once it is confirmed that no
# later cell reads this global.
dir = "../experiment_summaries/ensemble"

chi2_manager.export_experiment_summary(dir, filename)
# Close the manager once all chi2 experiments are done; it is not used again
# in this section.
chi2_manager.close()


=== Experiment Comparison (accuracy) ===
Experiment                     Test Score   Status    
-------------------------------------------------------
EBM meta chi2 10 stack         0.5408       ✅ Completed
EBM meta chi2 50 stack         0.5834       ✅ Completed
EBM meta chi2 100 stack        0.6727       ✅ Completed
EBM meta chi2 500 stack        0.6505       ✅ Completed
EBM meta chi2 1000 stack       0.6278       ✅ Completed
EBM meta chi2 5000 stack       0.6523       ✅ Completed
ridge meta chi2 10 stack       0.5438       ✅ Completed
ridge meta chi2 50 stack       0.5840       ✅ Completed
ridge meta chi2 100 stack      0.6704       ✅ Completed
ridge meta chi2 500 stack      0.6499       ✅ Completed
ridge meta chi2 1000 stack     0.6424       ✅ Completed
ridge meta chi2 5000 stack     0.6441       ✅ Completed
LogReg meta chi2 10 stack      0.5414       ✅ Completed
LogReg meta chi2 50 stack      0.5858       ✅ Completed
LogReg meta chi2 100 stack     0.66