In [1]:
import sys
sys.path.append('..')

# Data Loading

In [3]:
from pathlib import Path
import pandas as pd

def load_datasets(data_root: str | Path = "data",
                  tasks: tuple[str, ...] = ("binary", "multiclass"),
                  splits: tuple[str, ...] = ("train", "val", "test")) -> dict:

    data_root = Path(data_root)
    datasets  = {}

    for task in tasks:
        task_dir     = data_root / task
        task_dict    = {}

        for split in splits:
            split_dict = {}
            for kind in ("X", "y"):
                file_path = task_dir / f"{kind}_{split}.pkl"
                split_dict[kind] = pd.read_pickle(file_path)
            task_dict[split] = split_dict

        datasets[task] = task_dict

    return datasets

In [4]:
from typing import Tuple, Literal
import pandas as pd

def load_split(
    preprocessing_type: Literal["cleaned_only", "full_process"],
    sampling_method: Literal["undersampled", "oversampled"],
    classification_type: Literal["binary", "multiclass"]
) -> Tuple[
    Tuple[pd.DataFrame, pd.Series],  # train: (X_train, y_train)
    Tuple[pd.DataFrame, pd.Series],  # val: (X_val, y_val)
    Tuple[pd.DataFrame, pd.Series]   # test: (X_test, y_test)
]:
    """
    Load different types of splits from the data
    
    Args:
        preprocessing_type: must be "cleaned_only" or "full_process"
        sampling_method: must be "undersampled" or "oversampled"
        classification_type: must be "binary" or "multiclass"
    
    Returns:
        Tuple of (train, val, test) splits, where each split is (X, y)
        - train: (X_train, y_train)
        - val: (X_val, y_val)  
        - test: (X_test, y_test)
    """
    dataset = load_datasets(
        f"../data/{preprocessing_type}/{sampling_method}")[classification_type]
    split_names = ["train", "val", "test"]

    return tuple([(lambda split: (dataset[split]["X"], dataset[split]["y"]))(split) for split in split_names])

# Experiments

In [8]:
def combine_text(X):
    X = X.copy()

    combined = X["resume_text"].astype(
        str) + " [SEP] " + X["job_description_text"].astype(str)

    return combined.values

In [None]:
from utils import ExperimentManager, Experiment

baseline_manager = ExperimentManager(f"../runs/ensembles/baselines/", ["Fit", "Not Fit"])

In [7]:
splits = load_split(preprocessing_type="cleaned_only", sampling_method="undersampled", classification_type="binary")

In [10]:
SEED = 42

## Experiment 1: Bagging vs Stacking

### Experiment Summary

This experiment was done to check the potential of Bagging vs Stacking. Stacking classifier with logistic regression meta learner (64.12%) outperforms bagging classifier (60.44%). Stack method will be used for succeeding experiemnts.

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

def pipeline_factory(params):

    clf_lr = LogisticRegression(random_state=SEED)
    clf_rf = RandomForestClassifier(random_state=SEED)
    clf_nb = BernoulliNB()

    voting_clf = VotingClassifier(
        estimators=[
            ('lr', clf_lr),
            ('nb', clf_nb),
            ('rf', clf_rf)
        ],
        voting="soft",
        n_jobs=-1
    )

    return Pipeline([
        ("join", FunctionTransformer(combine_text, validate=False)),
        ('tfidf', TfidfVectorizer()),  # Convert text to numeric
        ('clf', voting_clf)  
    ])

experiment = Experiment(
    name=f"SV ensemble classifier",
    description=f"Soft Voting Ensemble Model Classifier",
    pipeline_factory=pipeline_factory
)

baseline_manager.run_experiment(experiment, splits=splits)


=== Running Experiment: SV ensemble classifier ===

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,714
   Classes: 2
   Overall Accuracy: 0.6044

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.6022
   Micro F1:     0.6044
   Weighted F1:  0.6022

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.6068  R: 0.6044
   Micro    - P: 0.6044  R: 0.6044
   Weighted - P: 0.6068  R: 0.6044

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Fit                  0.5909     0.6791     0.6319        857
   Not Fit              0.6228     0.5298     0.5725        857
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.6068     0.6044     0.6022       1714
   weighted avg         0.6068     0.6044     0.6022       1714

🔢 CONFUSION MATRIX
   Rows: True Labels, Columns: Predicted Labels
   Predicted →
   True ↓        Fit  Not Fit 
   Fit     

<utils.ExperimentManger.Experiment at 0x303800c20>

In [16]:
from sklearn.ensemble import StackingClassifier

def pipeline_factory(params):

    clf_lr = LogisticRegression(random_state=SEED)
    clf_rf = RandomForestClassifier(random_state=SEED)
    clf_nb = BernoulliNB()

    stacking_clf = StackingClassifier(
        estimators=[
            ('lr', clf_lr),
            ('nb', clf_nb),
            ('rf', clf_rf)
        ],
        final_estimator=LogisticRegression(random_state=42),
        cv=5,
        n_jobs=-1
    )

    return Pipeline([
        ("join", FunctionTransformer(combine_text, validate=False)),
        ('tfidf', TfidfVectorizer()),  # Convert text to numeric
        ('clf', stacking_clf)
    ])


experiment = Experiment(
    name=f"LogReg meta stack",
    description=f"Stack ensemble classifier with LogisticRegression classifier",
    pipeline_factory=pipeline_factory
)

baseline_manager.run_experiment(experiment, splits=splits)


=== Running Experiment: LogReg meta stack ===

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,714
   Classes: 2
   Overall Accuracy: 0.6412

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.6412
   Micro F1:     0.6412
   Weighted F1:  0.6412

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.6412  R: 0.6412
   Micro    - P: 0.6412  R: 0.6412
   Weighted - P: 0.6412  R: 0.6412

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Fit                  0.6391     0.6488     0.6439        857
   Not Fit              0.6434     0.6336     0.6384        857
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.6412     0.6412     0.6412       1714
   weighted avg         0.6412     0.6412     0.6412       1714

🔢 CONFUSION MATRIX
   Rows: True Labels, Columns: Predicted Labels
   Predicted →
   True ↓        Fit  Not Fit 
   Fit          

<utils.ExperimentManger.Experiment at 0x303efbe00>

## Experiment 2: Meta learner comparisons for stacked ensembles

- [ ] Random Forest
- [ ] XGBoost
- [ ] GradientBoostClassifier
- [ ] Explainable Boosting Machine (BIG FUCKING MAYBE)

### Random Forest meta learner

In [17]:
def pipeline_factory(params):

    clf_lr = LogisticRegression(random_state=SEED)
    clf_rf = RandomForestClassifier(random_state=SEED)
    clf_nb = BernoulliNB()

    stacking_clf = StackingClassifier(
        estimators=[
            ('lr', clf_lr),
            ('nb', clf_nb),
            ('rf', clf_rf)
        ],
        final_estimator=RandomForestClassifier(random_state=42),
        cv=5,
        n_jobs=-1
    )

    return Pipeline([
        ("join", FunctionTransformer(combine_text, validate=False)),
        ('tfidf', TfidfVectorizer()),  # Convert text to numeric
        ('clf', stacking_clf)
    ])


experiment = Experiment(
    name=f"Random forest meta stack",
    description=f"Stack ensemble classifier with random forest classifier",
    pipeline_factory=pipeline_factory
)

baseline_manager.run_experiment(experiment, splits=splits)


=== Running Experiment: Random forest meta stack ===

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,714
   Classes: 2
   Overall Accuracy: 0.6074

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.6069
   Micro F1:     0.6074
   Weighted F1:  0.6069

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.6079  R: 0.6074
   Micro    - P: 0.6074  R: 0.6074
   Weighted - P: 0.6079  R: 0.6074

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Fit                  0.6004     0.6418     0.6204        857
   Not Fit              0.6153     0.5729     0.5934        857
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.6079     0.6074     0.6069       1714
   weighted avg         0.6079     0.6074     0.6069       1714

🔢 CONFUSION MATRIX
   Rows: True Labels, Columns: Predicted Labels
   Predicted →
   True ↓        Fit  Not Fit 
   Fit   

<utils.ExperimentManger.Experiment at 0x303e79cd0>

### XGBoost Classifier

In [20]:
from xgboost import XGBClassifier

def pipeline_factory(params):

    clf_lr = LogisticRegression(random_state=SEED)
    clf_rf = RandomForestClassifier(random_state=SEED)
    clf_nb = BernoulliNB()

    stacking_clf = StackingClassifier(
        estimators=[
            ('lr', clf_lr),
            ('nb', clf_nb),
            ('rf', clf_rf)
        ],
        final_estimator=XGBClassifier(use_label_encoder = False, metric = "logloss"),
        cv=5,
        n_jobs=-1
    )

    return Pipeline([
        ("join", FunctionTransformer(combine_text, validate=False)),
        ('tfidf', TfidfVectorizer()),  # Convert text to numeric
        ('clf', stacking_clf)
    ])


experiment = Experiment(
    name=f"XGBoost meta stack",
    description=f"Stack ensemble classifier with XGBoost classifier",
    pipeline_factory=pipeline_factory
)

baseline_manager.run_experiment(experiment, splits=splits)


=== Running Experiment: XGBoost meta stack ===


Parameters: { "metric", "use_label_encoder" } are not used.




🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,714
   Classes: 2
   Overall Accuracy: 0.5887

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.5886
   Micro F1:     0.5887
   Weighted F1:  0.5886

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.5887  R: 0.5887
   Micro    - P: 0.5887  R: 0.5887
   Weighted - P: 0.5887  R: 0.5887

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Fit                  0.5905     0.5788     0.5846        857
   Not Fit              0.5870     0.5986     0.5927        857
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.5887     0.5887     0.5886       1714
   weighted avg         0.5887     0.5887     0.5886       1714

🔢 CONFUSION MATRIX
   Rows: True Labels, Columns: Predicted Labels
   Predicted →
   True ↓        Fit  Not Fit 
   Fit           496      361 
   Not Fit       344      513 



<utils.ExperimentManger.Experiment at 0x3062ce300>