In [4]:
import sys
# Add the parent directory to the module search path. '..' is relative to the
# kernel's current working directory. Presumably this is what lets the
# project-local `utils` package imported further down resolve — TODO confirm.
sys.path.append('..')

In [5]:
import warnings
# NOTE(review): `multiprocessing` is imported but not referenced in the visible cells.
import multiprocessing
# Silence ResourceWarning inside the current kernel process.
warnings.filterwarnings("ignore", category=ResourceWarning)

# Also suppress multiprocessing warnings
# PYTHONWARNINGS is read when a Python interpreter starts, so setting it here
# targets interpreters launched after this cell (e.g. spawned worker
# processes), not the already-running kernel.
import sys
import os
os.environ['PYTHONWARNINGS'] = 'ignore::ResourceWarning'

# Data Loading

In [6]:
from pathlib import Path
import pandas as pd

def load_datasets(data_root: str | Path = "data",
                  tasks: tuple[str, ...] = ("binary", "multiclass"),
                  splits: tuple[str, ...] = ("train", "val", "test")) -> dict:

    data_root = Path(data_root)
    datasets  = {}

    for task in tasks:
        task_dir     = data_root / task
        task_dict    = {}

        for split in splits:
            split_dict = {}
            for kind in ("X", "y"):
                file_path = task_dir / f"{kind}_{split}.pkl"
                split_dict[kind] = pd.read_pickle(file_path)
            task_dict[split] = split_dict

        datasets[task] = task_dict

    return datasets

In [7]:
from typing import Tuple, Literal
import pandas as pd

def load_split(
    preprocessing_type: Literal["cleaned_only", "full_process"],
    sampling_method: Literal["undersampled", "oversampled"],
    classification_type: Literal["binary", "multiclass"],
    data_root: str = "../data",
) -> Tuple[
    Tuple[pd.DataFrame, pd.Series],  # train: (X_train, y_train)
    Tuple[pd.DataFrame, pd.Series],  # val: (X_val, y_val)
    Tuple[pd.DataFrame, pd.Series]   # test: (X_test, y_test)
]:
    """
    Load the train/val/test splits for one preprocessing/sampling/task combination.

    Data is read from ``<data_root>/<preprocessing_type>/<sampling_method>``
    via ``load_datasets``. Only the requested ``classification_type`` is loaded,
    so the other task's files are neither read nor required to exist.

    Args:
        preprocessing_type: must be "cleaned_only" or "full_process"
        sampling_method: must be "undersampled" or "oversampled"
        classification_type: must be "binary" or "multiclass"
        data_root: base data directory; defaults to "../data"

    Returns:
        Tuple of (train, val, test) splits, where each split is (X, y)
        - train: (X_train, y_train)
        - val: (X_val, y_val)
        - test: (X_test, y_test)
    """
    dataset = load_datasets(
        f"{data_root}/{preprocessing_type}/{sampling_method}",
        tasks=(classification_type,),
    )[classification_type]

    return tuple(
        (dataset[split]["X"], dataset[split]["y"])
        for split in ("train", "val", "test")
    )

# Experiments

In [8]:
def combine_text(X):
    """Join resume and job-description text into one string per row.

    Args:
        X: Table with ``resume_text`` and ``job_description_text`` columns.
            Both columns are cast to ``str`` before joining.

    Returns:
        The ``.values`` of the combined column: one
        ``"<resume> [SEP] <job description>"`` string per input row.
    """
    # No defensive copy is needed: the column expressions below build new
    # Series and never write back into X.
    resume = X["resume_text"].astype(str)
    job_description = X["job_description_text"].astype(str)

    return (resume + " [SEP] " + job_description).values

In [9]:
# (train, val, test) tuple of (X, y) pairs for the binary task.
splits = load_split(
    preprocessing_type="cleaned_only",
    sampling_method="undersampled",
    classification_type="binary",
)

In [10]:
# Shared random seed, passed as `random_state` to the estimators built below.
SEED = 42

In [8]:
from utils import ExperimentManager, Experiment

# Manager for the voting-vs-stacking comparison; the second argument lists the class labels.
method_manager = ExperimentManager("../runs/ensembles/", ["Fit", "Not Fit"])

## Experiment 1: Bagging vs Stacking

### Experiment Summary

This experiment compares Bagging (implemented here as a hard-voting ensemble) against Stacking. The stacking classifier with a logistic regression meta-learner (63.94% test accuracy) outperforms the hard-voting classifier (61.67%), so stacking will be used for the succeeding experiments.

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

def pipeline_factory(params):
    """Build the hard-voting text-classification pipeline.

    Args:
        params: Not used by this factory; presumably accepted to match the
            call signature the experiment runner uses — TODO confirm.

    Returns:
        A ``Pipeline`` that joins the two text columns with ``combine_text``,
        vectorises them with TF-IDF and classifies with a hard-voting ensemble
        of logistic regression, Bernoulli naive Bayes and a random forest.
    """
    base_estimators = [
        ('lr', LogisticRegression(random_state=SEED)),
        ('nb', BernoulliNB()),
        ('rf', RandomForestClassifier(random_state=SEED)),
    ]

    # Hard voting: the ensemble predicts the majority class label.
    # NOTE(review): no Ridge model is part of this ensemble, so a
    # predict_proba limitation does not look like the reason for "hard" here —
    # confirm whether soft voting was intended.
    ensemble = VotingClassifier(
        estimators=base_estimators,
        voting="hard",
        n_jobs=1,
    )

    steps = [
        ("join", FunctionTransformer(combine_text, validate=False)),
        ('tfidf', TfidfVectorizer()),  # text -> sparse numeric features
        ('clf', ensemble),
    ]
    return Pipeline(steps)

# Register the hard-voting ensemble and evaluate it on the shared splits.
# Plain string literals: there is nothing to interpolate, so no f-prefix.
experiment = Experiment(
    name="HV ensemble classifier",
    description="Hard Voting Ensemble Model Classifier",
    pipeline_factory=pipeline_factory
)

method_manager.run_experiment(experiment, splits=splits)


=== Running Experiment: HV ensemble classifier ===

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,714
   Classes: 2
   Overall Accuracy: 0.6167

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.6158
   Micro F1:     0.6167
   Weighted F1:  0.6158

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.6177  R: 0.6167
   Micro    - P: 0.6167  R: 0.6167
   Weighted - P: 0.6177  R: 0.6167

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Fit                  0.6066     0.6639     0.6340        857
   Not Fit              0.6289     0.5694     0.5977        857
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.6177     0.6167     0.6158       1714
   weighted avg         0.6177     0.6167     0.6158       1714

🔢 CONFUSION MATRIX
   Rows: True Labels, Columns: Predicted Labels
   Predicted →
   True ↓        Fit  Not Fit 
   Fit     

<utils.ExperimentManger.Experiment at 0x7f0d980660f0>

In [10]:
from sklearn.ensemble import StackingClassifier

def pipeline_factory(params):
    """Build the stacking pipeline with a logistic-regression meta-learner.

    Args:
        params: Not used by this factory; presumably accepted to match the
            call signature the experiment runner uses — TODO confirm.

    Returns:
        A ``Pipeline`` that joins the two text columns with ``combine_text``,
        vectorises them with TF-IDF and classifies with a ``StackingClassifier``
        (logistic regression, Bernoulli naive Bayes and random forest as base
        estimators; logistic regression as final estimator; 5-fold CV).
    """
    clf_lr = LogisticRegression(random_state=SEED)
    clf_rf = RandomForestClassifier(random_state=SEED)
    clf_nb = BernoulliNB()

    stacking_clf = StackingClassifier(
        estimators=[
            ('lr', clf_lr),
            ('nb', clf_nb),
            ('rf', clf_rf)
        ],
        # Use the shared SEED rather than a literal so every estimator in the
        # notebook follows the same seed setting.
        final_estimator=LogisticRegression(random_state=SEED),
        cv=5,
        n_jobs=1
    )

    return Pipeline([
        ("join", FunctionTransformer(combine_text, validate=False)),
        ('tfidf', TfidfVectorizer()),  # Convert text to numeric
        ('clf', stacking_clf)
    ])


# Register the logistic-regression-meta stacking ensemble and evaluate it on
# the shared splits. Plain string literals: nothing to interpolate.
experiment = Experiment(
    name="LogReg meta stack",
    description="Stack ensemble classifier with LogisticRegression classifier",
    pipeline_factory=pipeline_factory
)

method_manager.run_experiment(experiment, splits=splits)


=== Running Experiment: LogReg meta stack ===

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,714
   Classes: 2
   Overall Accuracy: 0.6394

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.6394
   Micro F1:     0.6394
   Weighted F1:  0.6394

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.6395  R: 0.6394
   Micro    - P: 0.6394  R: 0.6394
   Weighted - P: 0.6395  R: 0.6394

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Fit                  0.6366     0.6499     0.6432        857
   Not Fit              0.6424     0.6289     0.6356        857
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.6395     0.6394     0.6394       1714
   weighted avg         0.6395     0.6394     0.6394       1714

🔢 CONFUSION MATRIX
   Rows: True Labels, Columns: Predicted Labels
   Predicted →
   True ↓        Fit  Not Fit 
   Fit          

<utils.ExperimentManger.Experiment at 0x7f0c45f4f110>

In [11]:
method_manager.compare_experiments()

# Destination of the exported comparison table. The directory variable is not
# named `dir`, so the `dir()` builtin stays usable in later cells.
filename = "bag_vs_stack.csv"
summary_dir = "../experiment_summaries/ensemble"

method_manager.export_experiment_summary(summary_dir, filename)
method_manager.close()


=== Experiment Comparison (accuracy) ===
Experiment                     Test Score   Status    
-------------------------------------------------------
HV ensemble classifier         0.6167       ✅ Completed
LogReg meta stack              0.6394       ✅ Completed
📊 Experiment summary exported to: bag_vs_stack.csv


## Experiment 2: Meta learner comparisons for stacked ensembles

- [X] Random Forest
- [X] XGBoost
- [X] Explainable Boosting Machine
- [X] RidgeClassifier

### Experiment Summary
Overall, Logistic Regression remains the best meta-learner at 63.94%; however, the Explainable Boosting Machine and the Ridge classifier are close at 63.01% and 63.77% respectively. Thus, future experiments will involve all three.

In [12]:
from utils import ExperimentManager, Experiment

# Separate manager for the meta-learner comparison; the second argument lists the class labels.
meta_learner_manager = ExperimentManager("../runs/ensembles/", ["Fit", "Not Fit"])

### LogReg Classifier

In [13]:
from sklearn.ensemble import StackingClassifier

def pipeline_factory(params):
    """Build the stacking pipeline with a logistic-regression meta-learner.

    Args:
        params: Not used by this factory; presumably accepted to match the
            call signature the experiment runner uses — TODO confirm.

    Returns:
        A ``Pipeline`` that joins the two text columns with ``combine_text``,
        vectorises them with TF-IDF and classifies with a ``StackingClassifier``
        (logistic regression, Bernoulli naive Bayes and random forest as base
        estimators; logistic regression as final estimator; 5-fold CV).
    """
    clf_lr = LogisticRegression(random_state=SEED)
    clf_rf = RandomForestClassifier(random_state=SEED)
    clf_nb = BernoulliNB()

    stacking_clf = StackingClassifier(
        estimators=[
            ('lr', clf_lr),
            ('nb', clf_nb),
            ('rf', clf_rf)
        ],
        # Use the shared SEED rather than a literal so every estimator in the
        # notebook follows the same seed setting.
        final_estimator=LogisticRegression(random_state=SEED),
        cv=5,
        n_jobs=1
    )

    return Pipeline([
        ("join", FunctionTransformer(combine_text, validate=False)),
        ('tfidf', TfidfVectorizer()),  # Convert text to numeric
        ('clf', stacking_clf)
    ])


# Baseline for the meta-learner comparison: logistic regression as final
# estimator. Plain string literals: nothing to interpolate.
experiment = Experiment(
    name="LogReg meta stack",
    description="Stack ensemble classifier with LogisticRegression classifier",
    pipeline_factory=pipeline_factory
)

meta_learner_manager.run_experiment(experiment, splits=splits)


=== Running Experiment: LogReg meta stack ===

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,714
   Classes: 2
   Overall Accuracy: 0.6394

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.6394
   Micro F1:     0.6394
   Weighted F1:  0.6394

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.6395  R: 0.6394
   Micro    - P: 0.6394  R: 0.6394
   Weighted - P: 0.6395  R: 0.6394

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Fit                  0.6366     0.6499     0.6432        857
   Not Fit              0.6424     0.6289     0.6356        857
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.6395     0.6394     0.6394       1714
   weighted avg         0.6395     0.6394     0.6394       1714

🔢 CONFUSION MATRIX
   Rows: True Labels, Columns: Predicted Labels
   Predicted →
   True ↓        Fit  Not Fit 
   Fit          

<utils.ExperimentManger.Experiment at 0x7f0c45c5df70>

### RidgeClassifier

In [14]:
from sklearn.linear_model import RidgeClassifier

def pipeline_factory(params):
    """Build the stacking pipeline with a Ridge-classifier meta-learner.

    Args:
        params: Not used by this factory; presumably accepted to match the
            call signature the experiment runner uses — TODO confirm.

    Returns:
        A ``Pipeline`` that joins the two text columns with ``combine_text``,
        vectorises them with TF-IDF and classifies with a ``StackingClassifier``
        whose final estimator is a ``RidgeClassifier``.
    """
    base_estimators = [
        ('lr', LogisticRegression(random_state=SEED)),
        ('nb', BernoulliNB()),
        ('rf', RandomForestClassifier(random_state=SEED)),
    ]

    # The meta-learner is fitted on 5-fold cross-validated base predictions.
    ensemble = StackingClassifier(
        estimators=base_estimators,
        final_estimator=RidgeClassifier(random_state=SEED),
        cv=5,
        n_jobs=1,
    )

    steps = [
        ("join", FunctionTransformer(combine_text, validate=False)),
        ('tfidf', TfidfVectorizer()),  # text -> sparse numeric features
        ('clf', ensemble),
    ]
    return Pipeline(steps)


# Evaluate the Ridge-meta stacking ensemble on the shared splits.
# Plain string literals: nothing to interpolate.
experiment = Experiment(
    name="Ridge meta stack",
    description="Stack ensemble classifier with Ridge classifier",
    pipeline_factory=pipeline_factory
)

meta_learner_manager.run_experiment(experiment, splits=splits)


=== Running Experiment: Ridge meta stack ===

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,714
   Classes: 2
   Overall Accuracy: 0.6377

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.6376
   Micro F1:     0.6377
   Weighted F1:  0.6376

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.6378  R: 0.6377
   Micro    - P: 0.6377  R: 0.6377
   Weighted - P: 0.6378  R: 0.6377

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Fit                  0.6344     0.6499     0.6421        857
   Not Fit              0.6411     0.6254     0.6332        857
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.6378     0.6377     0.6376       1714
   weighted avg         0.6378     0.6377     0.6376       1714

🔢 CONFUSION MATRIX
   Rows: True Labels, Columns: Predicted Labels
   Predicted →
   True ↓        Fit  Not Fit 
   Fit           

<utils.ExperimentManger.Experiment at 0x7f0c472d03e0>

### Random Forest meta learner

In [15]:
def pipeline_factory(params):
    """Build the stacking pipeline with a random-forest meta-learner.

    Args:
        params: Not used by this factory; presumably accepted to match the
            call signature the experiment runner uses — TODO confirm.

    Returns:
        A ``Pipeline`` that joins the two text columns with ``combine_text``,
        vectorises them with TF-IDF and classifies with a ``StackingClassifier``
        whose final estimator is a ``RandomForestClassifier``.
    """
    clf_lr = LogisticRegression(random_state=SEED)
    clf_rf = RandomForestClassifier(random_state=SEED)
    clf_nb = BernoulliNB()

    stacking_clf = StackingClassifier(
        estimators=[
            ('lr', clf_lr),
            ('nb', clf_nb),
            ('rf', clf_rf)
        ],
        # Use the shared SEED rather than a literal so every estimator in the
        # notebook follows the same seed setting.
        final_estimator=RandomForestClassifier(random_state=SEED),
        cv=5,
        n_jobs=1
    )

    return Pipeline([
        ("join", FunctionTransformer(combine_text, validate=False)),
        ('tfidf', TfidfVectorizer()),  # Convert text to numeric
        ('clf', stacking_clf)
    ])


# Evaluate the random-forest-meta stacking ensemble on the shared splits.
# Plain string literals: nothing to interpolate.
experiment = Experiment(
    name="Random forest meta stack",
    description="Stack ensemble classifier with random forest classifier",
    pipeline_factory=pipeline_factory
)

meta_learner_manager.run_experiment(experiment, splits=splits)


=== Running Experiment: Random forest meta stack ===

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,714
   Classes: 2
   Overall Accuracy: 0.6126

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.6116
   Micro F1:     0.6126
   Weighted F1:  0.6116

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.6137  R: 0.6126
   Micro    - P: 0.6126  R: 0.6126
   Weighted - P: 0.6137  R: 0.6126

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Fit                  0.6023     0.6628     0.6311        857
   Not Fit              0.6252     0.5624     0.5921        857
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.6137     0.6126     0.6116       1714
   weighted avg         0.6137     0.6126     0.6116       1714

🔢 CONFUSION MATRIX
   Rows: True Labels, Columns: Predicted Labels
   Predicted →
   True ↓        Fit  Not Fit 
   Fit   

<utils.ExperimentManger.Experiment at 0x7f0c45f3e420>

### Explainable Boosting Machine

In [16]:
from interpret.glassbox import ExplainableBoostingClassifier

def pipeline_factory(params):
    """Build the stacking pipeline with an Explainable Boosting meta-learner.

    Args:
        params: Not used by this factory; presumably accepted to match the
            call signature the experiment runner uses — TODO confirm.

    Returns:
        A ``Pipeline`` that joins the two text columns with ``combine_text``,
        vectorises them with TF-IDF and classifies with a ``StackingClassifier``
        whose final estimator is an ``ExplainableBoostingClassifier``.
    """
    base_estimators = [
        ('lr', LogisticRegression(random_state=SEED)),
        ('nb', BernoulliNB()),
        ('rf', RandomForestClassifier(random_state=SEED)),
    ]

    # The meta-learner is fitted on 5-fold cross-validated base predictions.
    ensemble = StackingClassifier(
        estimators=base_estimators,
        final_estimator=ExplainableBoostingClassifier(random_state=SEED),
        cv=5,
        n_jobs=1,
    )

    steps = [
        ("join", FunctionTransformer(combine_text, validate=False)),
        ('tfidf', TfidfVectorizer()),  # text -> sparse numeric features
        ('clf', ensemble),
    ]
    return Pipeline(steps)


# Evaluate the EBM-meta stacking ensemble on the shared splits.
# Plain string literals: nothing to interpolate.
experiment = Experiment(
    name="EBM meta stack",
    description="Stack ensemble classifier with EBM classifier",
    pipeline_factory=pipeline_factory
)

meta_learner_manager.run_experiment(experiment, splits=splits)


=== Running Experiment: EBM meta stack ===

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,714
   Classes: 2
   Overall Accuracy: 0.6301

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.6301
   Micro F1:     0.6301
   Weighted F1:  0.6301

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.6301  R: 0.6301
   Micro    - P: 0.6301  R: 0.6301
   Weighted - P: 0.6301  R: 0.6301

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Fit                  0.6286     0.6359     0.6323        857
   Not Fit              0.6316     0.6243     0.6279        857
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.6301     0.6301     0.6301       1714
   weighted avg         0.6301     0.6301     0.6301       1714

🔢 CONFUSION MATRIX
   Rows: True Labels, Columns: Predicted Labels
   Predicted →
   True ↓        Fit  Not Fit 
   Fit           54

<utils.ExperimentManger.Experiment at 0x7f0c472e0bf0>

In [17]:
meta_learner_manager.compare_experiments()

# Destination of the exported comparison table. The directory variable is not
# named `dir`, so the `dir()` builtin stays usable in later cells.
filename = "meta_learner_comparisons.csv"
summary_dir = "../experiment_summaries/ensemble"

meta_learner_manager.export_experiment_summary(summary_dir, filename)
meta_learner_manager.close()


=== Experiment Comparison (accuracy) ===
Experiment                     Test Score   Status    
-------------------------------------------------------
LogReg meta stack              0.6394       ✅ Completed
Ridge meta stack               0.6377       ✅ Completed
Random forest meta stack       0.6126       ✅ Completed
EBM meta stack                 0.6301       ✅ Completed
📊 Experiment summary exported to: meta_learner_comparisons.csv
