In [1]:
import sys
sys.path.append('..')

# Data Loading

In [2]:
from pathlib import Path
import pandas as pd

def load_datasets(data_root: str | Path = "data",
                  tasks: tuple[str, ...] = ("binary", "multiclass"),
                  splits: tuple[str, ...] = ("train", "val", "test")) -> dict:

    data_root = Path(data_root)
    datasets  = {}

    for task in tasks:
        task_dir     = data_root / task
        task_dict    = {}

        for split in splits:
            split_dict = {}
            for kind in ("X", "y"):
                file_path = task_dir / f"{kind}_{split}.pkl"
                split_dict[kind] = pd.read_pickle(file_path)
            task_dict[split] = split_dict

        datasets[task] = task_dict

    return datasets


# ── usage ────────────────────────────────────────────────────────
# Load every task/split once; later cells read from this module-level dict.
datasets = load_datasets("../data")

# quick sanity-check: one line per (task, split) with the X shape and y length.
# The loop variables deliberately avoid the name `splits`: that name is used at
# module level for the (train, val, test) tuple handed to `run_experiment`, and
# re-running this cell must not overwrite it with a dict.
for task_name, task_splits in datasets.items():
    for split_name, split_data in task_splits.items():
        print(f"{task_name:<10} {split_name:<5}  X shape = {split_data['X'].shape},  y len = {len(split_data['y'])}")

binary     train  X shape = (4336, 2),  y len = 4336
binary     val    X shape = (1860, 2),  y len = 1860
binary     test   X shape = (1714, 2),  y len = 1714
multiclass train  X shape = (3237, 2),  y len = 3237
multiclass val    X shape = (1389, 2),  y len = 1389
multiclass test   X shape = (1332, 2),  y len = 1332


# Experiments

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# Model family label; interpolated into the run directory path and the experiment name/description.
MODEL_TYPE = "Soft Voting Ensemble"
# Seed passed as `random_state` to the LogisticRegression and RandomForestClassifier estimators.
SEED = 42

In [9]:
def get_datasets(kind: str):
    """Unpack the train/val/test features and targets for one task.

    Args:
        kind: Key into the module-level ``datasets`` dict (e.g. ``"binary"``).

    Returns:
        The 6-tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)``.

    Raises:
        KeyError: If ``kind`` or any expected split / ``"X"`` / ``"y"`` key is missing.
    """
    by_split = datasets[kind]

    # Outer loop over splits, inner loop over X/y gives the documented order.
    return tuple(
        by_split[split_name][part]
        for split_name in ("train", "val", "test")
        for part in ("X", "y")
    )

def combine_text(X):
    """Join each row's resume and job description into a single string.

    Args:
        X: DataFrame-like object exposing ``resume_text`` and
            ``job_description_text`` columns. It is only read, never modified.

    Returns:
        1-D NumPy array with one ``"<resume> [SEP] <job description>"`` string
        per row of ``X``.
    """
    # `astype(str)` and `+` each allocate a new Series, so the caller's frame is
    # never mutated and no defensive copy of the text columns is required.
    resume = X["resume_text"].astype(str)
    job_description = X["job_description_text"].astype(str)

    combined = resume + " [SEP] " + job_description

    # `to_numpy()` always yields an ndarray, independent of the string dtype backend.
    return combined.to_numpy()

## Binary Classification

In [14]:
from utils import ExperimentManager, Experiment

# Task key: selects the entry of `datasets` and is interpolated into run paths and experiment names.
CLASSIFICATION_TYPE = "binary"

# First argument is presumably the output directory for this run and the second the
# class display names -- TODO confirm against utils.ExperimentManager.
manager = ExperimentManager(f"../runs/{CLASSIFICATION_TYPE}/{MODEL_TYPE}", ["Fit", "Not Fit"])
X_train, y_train, X_val, y_val, X_test, y_test = get_datasets(CLASSIFICATION_TYPE)

# (X, y) pairs in train / val / test order, as passed to `manager.run_experiment`.
splits = ((X_train, y_train), (X_val, y_val), (X_test, y_test))

### Baseline with default scikit-learn parameters and no optimization, using a soft voting classifier

In [15]:
# Base estimators with library defaults; the seed is fixed where the estimator accepts one.
clf_lr = LogisticRegression(random_state=SEED)
clf_rf = RandomForestClassifier(random_state=SEED)
clf_nb = BernoulliNB()

# Soft-voting ensemble over the three base estimators.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', clf_lr),
        ('nb', clf_nb),
        ('rf', clf_rf)
    ],
    voting="soft",
    n_jobs=-1
)

# Pipeline: join the two text columns -> TF-IDF features -> voting ensemble
pipe = Pipeline([
    ("join", FunctionTransformer(combine_text, validate=False)),
    ('tfidf', TfidfVectorizer()),  # Convert text to numeric
    ('clf', voting_clf)
])

def pipeline_factory(params, *, _pipeline=pipe):
    """Return the static baseline pipeline; ``params`` is accepted but ignored.

    The pipeline is bound through the keyword-only default at definition time
    instead of being looked up as the global ``pipe`` on every call. Another
    cell rebinds ``pipe`` later, and with a call-time lookup this factory would
    start returning that other cell's pipeline.
    """
    return _pipeline

experiment = Experiment(
    name=f"Baseline {CLASSIFICATION_TYPE} {MODEL_TYPE}",
    description=f"{CLASSIFICATION_TYPE} {MODEL_TYPE} with TF-IDF and no hyperparameter tuning.",
    pipeline_factory=pipeline_factory
)

manager.run_experiment(experiment, splits=splits)


=== Running Experiment: Baseline binary Soft Voting Ensemble ===

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,714
   Classes: 2
   Overall Accuracy: 0.6044

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.6022
   Micro F1:     0.6044
   Weighted F1:  0.6022

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.6068  R: 0.6044
   Micro    - P: 0.6044  R: 0.6044
   Weighted - P: 0.6068  R: 0.6044

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Fit                  0.5909     0.6791     0.6319        857
   Not Fit              0.6228     0.5298     0.5725        857
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.6068     0.6044     0.6022       1714
   weighted avg         0.6068     0.6044     0.6022       1714

🔢 CONFUSION MATRIX
   Rows: True Labels, Columns: Predicted Labels
   Predicted →
   True ↓        Fit  Not Fi

<utils.ExperimentManger.Experiment at 0x7fde95fbfe90>

## Multiclass Classification

In [16]:
from utils import ExperimentManager, Experiment

# Task key: selects the entry of `datasets` and is interpolated into run paths and experiment names.
CLASSIFICATION_TYPE = "multiclass"

# First argument is presumably the output directory for this run and the second the
# class display names -- TODO confirm against utils.ExperimentManager.
# NOTE(review): the printed report lists the classes as Good Fit / Not Fit / Potential Fit,
# which differs from the order given here -- confirm the manager does not rely on list order.
manager = ExperimentManager(f"../runs/{CLASSIFICATION_TYPE}/{MODEL_TYPE}", ["Good Fit", "Potential Fit", "Not Fit"])
X_train, y_train, X_val, y_val, X_test, y_test = get_datasets(CLASSIFICATION_TYPE)

# (X, y) pairs in train / val / test order, as passed to `manager.run_experiment`.
splits = ((X_train, y_train), (X_val, y_val), (X_test, y_test))

In [None]:
# Base estimators with library defaults; the seed is fixed where the estimator accepts one.
clf_lr = LogisticRegression(random_state=SEED)
clf_rf = RandomForestClassifier(random_state=SEED)
clf_nb = BernoulliNB()

# Soft-voting ensemble over the three base estimators.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', clf_lr),
        ('nb', clf_nb),
        ('rf', clf_rf)
    ],
    voting="soft",
    n_jobs=-1
)

# Pipeline: join the two text columns -> TF-IDF features -> voting ensemble
pipe = Pipeline([
    ("join", FunctionTransformer(combine_text, validate=False)),
    ('tfidf', TfidfVectorizer()),  # Convert text to numeric
    ('clf', voting_clf)
])

def pipeline_factory(params, *, _pipeline=pipe):
    """Return the static baseline pipeline; ``params`` is accepted but ignored.

    The pipeline is bound through the keyword-only default at definition time
    instead of being looked up as the global ``pipe`` on every call, so a later
    rebinding of ``pipe`` in the notebook cannot change what this factory
    returns.
    """
    return _pipeline

experiment = Experiment(
    name=f"Baseline {CLASSIFICATION_TYPE} {MODEL_TYPE}",
    description=f"{CLASSIFICATION_TYPE} {MODEL_TYPE} with TF-IDF and no hyperparameter tuning.",
    pipeline_factory=pipeline_factory
)

manager.run_experiment(experiment, splits=splits)


=== Running Experiment: Baseline multiclass Soft Voting Ensemble ===

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,332
   Classes: 3
   Overall Accuracy: 0.4520

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.4519
   Micro F1:     0.4520
   Weighted F1:  0.4519

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.4523  R: 0.4520
   Micro    - P: 0.4520  R: 0.4520
   Weighted - P: 0.4523  R: 0.4520

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Good Fit             0.4404     0.4662     0.4530        444
   Not Fit              0.4372     0.4234     0.4302        444
   Potential Fit        0.4792     0.4662     0.4726        444
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.4523     0.4520     0.4519       1332
   weighted avg         0.4523     0.4520     0.4519       1332

🔢 CONFUSION MATRIX
   Rows: True Labels, C

<utils.ExperimentManger.Experiment at 0x7fde954ca7e0>

Exception ignored in: <function ResourceTracker.__del__ at 0x7f0b69d7a980>
Traceback (most recent call last):
  File "/home/maveron/.conda/envs/304/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/home/maveron/.conda/envs/304/lib/python3.12/multiprocessing/resource_tracker.py", line 86, in _stop
  File "/home/maveron/.conda/envs/304/lib/python3.12/multiprocessing/resource_tracker.py", line 111, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x7f02a737e980>
Traceback (most recent call last):
  File "/home/maveron/.conda/envs/304/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/home/maveron/.conda/envs/304/lib/python3.12/multiprocessing/resource_tracker.py", line 86, in _stop
  File "/home/maveron/.conda/envs/304/lib/python3.12/multiprocessing/resource_tracker.py", line 111, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exceptio