In [1]:
import sys
# Put the parent directory on the import path -- presumably so the
# project-local `utils` module imported in later cells resolves; confirm.
sys.path.append('..')

# Data Loading

In [2]:
from pathlib import Path
import pandas as pd

def load_datasets(data_root: str | Path = "data",
                  tasks: tuple[str, ...] = ("binary", "multiclass"),
                  splits: tuple[str, ...] = ("train", "val", "test")) -> dict:

    data_root = Path(data_root)
    datasets  = {}

    for task in tasks:
        task_dir     = data_root / task
        task_dict    = {}

        for split in splits:
            split_dict = {}
            for kind in ("X", "y"):
                file_path = task_dir / f"{kind}_{split}.pkl"
                split_dict[kind] = pd.read_pickle(file_path)
            task_dict[split] = split_dict

        datasets[task] = task_dict

    return datasets


# ── usage ────────────────────────────────────────────────────────
datasets = load_datasets("../data")

# Quick sanity check: report the size of every split and fail fast if the
# features and labels of a split are not the same length.
for task, splits in datasets.items():
    for split, obj in splits.items():
        print(f"{task:<10} {split:<5}  X shape = {obj['X'].shape},  y len = {len(obj['y'])}")
        assert len(obj["X"]) == len(obj["y"]), (
            f"{task}/{split}: X has {len(obj['X'])} rows but y has {len(obj['y'])} labels"
        )

binary     train  X shape = (4336, 2),  y len = 4336
binary     val    X shape = (1860, 2),  y len = 1860
binary     test   X shape = (1714, 2),  y len = 1714
multiclass train  X shape = (3237, 2),  y len = 3237
multiclass val    X shape = (1389, 2),  y len = 1389
multiclass test   X shape = (1332, 2),  y len = 1332


# Experiments

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Label for the model family; used below in the run-directory path and in
# the experiment name/description.
MODEL_TYPE = "Soft Voting Ensemble"
# Passed as `random_state` to the estimators that accept one.
SEED = 42

In [5]:
def get_datasets(kind: str):
    """Unpack one task's splits from the module-level ``datasets`` mapping.

    Parameters
    ----------
    kind : str
        Key into ``datasets`` selecting the task.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
    """
    task_data = datasets[kind]

    unpacked = []
    for split_name in ("train", "val", "test"):
        part = task_data[split_name]
        # Features first, then labels, for every split.
        unpacked.append(part["X"])
        unpacked.append(part["y"])

    return tuple(unpacked)

def combine_text(X):
    """Join each row's resume and job description into one string.

    Parameters
    ----------
    X : DataFrame-like
        Must provide the columns ``"resume_text"`` and
        ``"job_description_text"``.

    Returns
    -------
    array-like
        One ``"<resume> [SEP] <job description>"`` string per input row.

    Notes
    -----
    Both columns are cast with ``astype(str)``, so non-string and missing
    values are stringified rather than dropped. The input is never
    modified, so no defensive copy of the frame is taken.
    """
    resume = X["resume_text"].astype(str)
    job = X["job_description_text"].astype(str)

    # String concatenation builds a new Series; X itself is left untouched.
    return (resume + " [SEP] " + job).values

## Binary Classification

In [6]:
from utils import ExperimentManager, Experiment

# Task for this section: keys into `datasets` and names the run directory.
CLASSIFICATION_TYPE = "binary"

# Manager rooted at ../runs/<task>/<model type> -- presumably where results
# are written; confirm in utils.
manager = ExperimentManager(f"../runs/{CLASSIFICATION_TYPE}/{MODEL_TYPE}")
# Rebinds the module-level split variables to the binary data; the cells
# below read these globals.
X_train, y_train, X_val, y_val, X_test, y_test = get_datasets(CLASSIFICATION_TYPE)

### Baseline with default scikit-learn parameters and no optimization, using a soft voting classifier

In [7]:
# Base estimators with scikit-learn defaults; SEED is passed as random_state
# to the two that accept one.
clf_lr = LogisticRegression(random_state=SEED)
clf_rf = RandomForestClassifier(random_state=SEED)
clf_nb = MultinomialNB()

# Soft voting combines the predicted class probabilities of the three models.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', clf_lr),
        ('nb', clf_nb),
        ('rf', clf_rf)
    ],
    voting="soft",
    n_jobs=-1
)

# Pipeline template: join the two text columns, vectorize, then classify.
pipe = Pipeline([
    ("join", FunctionTransformer(combine_text, validate=False)),
    ('tfidf', TfidfVectorizer()),  # Convert text to numeric
    ('clf', voting_clf)
])

def pipeline_factory(params):
    """Return a fresh, unfitted copy of the baseline pipeline.

    ``params`` is accepted for interface compatibility and ignored, because
    the baseline has no tunable hyperparameters. A clone is returned instead
    of the shared ``pipe`` object so that every call yields an independent
    estimator and fitted state cannot leak between calls.
    """
    # Imported locally so this cell does not depend on edits to other cells.
    from sklearn.base import clone

    return clone(pipe)

# Baseline experiment for the current task. The two held-out splits are
# passed together with the names that label their report sections in the
# cell output ("--- Validation Evaluation ---", "--- Test Evaluation ---").
experiment = Experiment(
    name=f"Baseline {CLASSIFICATION_TYPE} {MODEL_TYPE}",
    description=f"{CLASSIFICATION_TYPE} {MODEL_TYPE} with TF-IDF and no hyperparameter tuning.",
    pipeline_factory=pipeline_factory,
    splits=[(X_val, y_val), (X_test, y_test)],
    split_names=["Validation", "Test"]
)

# Presumably fits on train_data and evaluates every split -- confirm in utils.
# As the cell's last expression, the returned object is echoed in the output.
manager.run_experiment(experiment, train_data=(X_train, y_train))


=== Running Experiment: Baseline binary Soft Voting Ensemble ===

--- Validation Evaluation ---
Accuracy: 0.6887
Macro F1: 0.6874
Micro F1: 0.6887

Confusion Matrix:
[[700 230]
 [349 581]]

Classfication Report:
              precision    recall  f1-score   support

           0     0.6673    0.7527    0.7074       930
           1     0.7164    0.6247    0.6674       930

    accuracy                         0.6887      1860
   macro avg     0.6919    0.6887    0.6874      1860
weighted avg     0.6919    0.6887    0.6874      1860


--- Test Evaluation ---
Accuracy: 0.6237
Macro F1: 0.6234
Micro F1: 0.6237

Confusion Matrix:
[[559 298]
 [347 510]]

Classfication Report:
              precision    recall  f1-score   support

           0     0.6170    0.6523    0.6341       857
           1     0.6312    0.5951    0.6126       857

    accuracy                         0.6237      1714
   macro avg     0.6241    0.6237    0.6234      1714
weighted avg     0.6241    0.6237    0.6234    

<utils.ExperimentManger.Experiment at 0x7f49157485c0>

## Multiclass Classification

In [8]:
from utils import ExperimentManager, Experiment

# Task for this section: keys into `datasets` and names the run directory.
CLASSIFICATION_TYPE = "multiclass"

# Manager rooted at ../runs/<task>/<model type> -- presumably where results
# are written; confirm in utils.
manager = ExperimentManager(f"../runs/{CLASSIFICATION_TYPE}/{MODEL_TYPE}")
# Rebinds the module-level split variables to the multiclass data,
# replacing the binary splits bound earlier.
X_train, y_train, X_val, y_val, X_test, y_test = get_datasets(CLASSIFICATION_TYPE)

In [9]:
# Base estimators with scikit-learn defaults; SEED is passed as random_state
# to the two that accept one.
clf_lr = LogisticRegression(random_state=SEED)
clf_rf = RandomForestClassifier(random_state=SEED)
clf_nb = MultinomialNB()

# Soft voting combines the predicted class probabilities of the three models.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', clf_lr),
        ('nb', clf_nb),
        ('rf', clf_rf)
    ],
    voting="soft",
    n_jobs=-1
)

# Pipeline template: join the two text columns, vectorize, then classify.
pipe = Pipeline([
    ("join", FunctionTransformer(combine_text, validate=False)),
    ('tfidf', TfidfVectorizer()),  # Convert text to numeric
    ('clf', voting_clf)
])

def pipeline_factory(params):
    """Return a fresh, unfitted copy of the baseline pipeline.

    ``params`` is accepted for interface compatibility and ignored, because
    the baseline has no tunable hyperparameters. A clone is returned instead
    of the shared ``pipe`` object so that every call yields an independent
    estimator and fitted state cannot leak between calls.
    """
    # Imported locally so this cell does not depend on edits to other cells.
    from sklearn.base import clone

    return clone(pipe)

# Baseline experiment for the current task. The two held-out splits are
# passed together with the names that label their report sections in the
# cell output ("--- Validation Evaluation ---", "--- Test Evaluation ---").
experiment = Experiment(
    name=f"Baseline {CLASSIFICATION_TYPE} {MODEL_TYPE}",
    description=f"{CLASSIFICATION_TYPE} {MODEL_TYPE} with TF-IDF and no hyperparameter tuning.",
    pipeline_factory=pipeline_factory,
    splits=[(X_val, y_val), (X_test, y_test)],
    split_names=["Validation", "Test"]
)

# Presumably fits on train_data and evaluates every split -- confirm in utils.
# As the cell's last expression, the returned object is echoed in the output.
manager.run_experiment(experiment, train_data=(X_train, y_train))


=== Running Experiment: Baseline multiclass Soft Voting Ensemble ===

--- Validation Evaluation ---
Accuracy: 0.5961
Macro F1: 0.5903
Micro F1: 0.5961

Confusion Matrix:
[[350  51  62]
 [122 217 124]
 [103  99 261]]

Classfication Report:
              precision    recall  f1-score   support

           0     0.6087    0.7559    0.6744       463
           1     0.5913    0.4687    0.5229       463
           2     0.5839    0.5637    0.5736       463

    accuracy                         0.5961      1389
   macro avg     0.5946    0.5961    0.5903      1389
weighted avg     0.5946    0.5961    0.5903      1389


--- Test Evaluation ---
Accuracy: 0.4602
Macro F1: 0.4597
Micro F1: 0.4602

Confusion Matrix:
[[212  88 144]
 [132 220  92]
 [119 144 181]]

Classfication Report:
              precision    recall  f1-score   support

           0     0.4579    0.4775    0.4675       444
           1     0.4867    0.4955    0.4911       444
           2     0.4341    0.4077    0.4204       44

<utils.ExperimentManger.Experiment at 0x7f4915a01bb0>