In [1]:
import sys
# Add the parent directory to the module search path.
# NOTE(review): presumably this is what lets the later `from utils import ...`
# resolve when the notebook runs from its own folder — confirm the project layout.
sys.path.append('..') 

# Data Loading

In [2]:
from pathlib import Path
import pandas as pd

def load_datasets(data_root: str | Path = "data",
                  tasks: tuple[str, ...] = ("binary", "multiclass"),
                  splits: tuple[str, ...] = ("train", "val", "test")) -> dict:

    data_root = Path(data_root)
    datasets  = {}

    for task in tasks:
        task_dir     = data_root / task
        task_dict    = {}

        for split in splits:
            split_dict = {}
            for kind in ("X", "y"):
                file_path = task_dir / f"{kind}_{split}.pkl"
                split_dict[kind] = pd.read_pickle(file_path)
            task_dict[split] = split_dict

        datasets[task] = task_dict

    return datasets


# ── usage ────────────────────────────────────────────────────────
datasets = load_datasets("../data")

# Sanity check: print one summary line per (task, split) pair
for task in datasets:
    for split in datasets[task]:
        obj = datasets[task][split]
        print(f"{task:<10} {split:<5}  X shape = {obj['X'].shape},  y len = {len(obj['y'])}")

binary     train  X shape = (4336, 2),  y len = 4336
binary     val    X shape = (1860, 2),  y len = 1860
binary     test   X shape = (1714, 2),  y len = 1714
multiclass train  X shape = (3237, 2),  y len = 3237
multiclass val    X shape = (1389, 2),  y len = 1389
multiclass test   X shape = (1332, 2),  y len = 1332


# Experiments

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Model family label; interpolated into the run directory path and the
# experiment name/description in the cells below.
MODEL_TYPE = "RandomForest"
# Passed as random_state to RandomForestClassifier in the baseline pipelines.
SEED = 42

In [5]:
def get_datasets(kind: str):
    dataset = datasets[kind]
    
    X_train = dataset["train"]["X"]
    y_train = dataset["train"]["y"]
    
    X_val = dataset["val"]["X"]
    y_val = dataset["val"]["y"]
    
    X_test = dataset["test"]["X"]
    y_test = dataset["test"]["y"]

    return X_train, y_train, X_val, y_val, X_test, y_test

def combine_text(X):
    """Join each row's resume text and job description into one string.

    The two columns are concatenated as
    ``"<resume_text> [SEP] <job_description_text>"``.

    Missing values (``None``/``NaN``) are replaced by an empty string before
    the join; a plain ``astype(str)`` would turn them into the literal tokens
    "None"/"nan", which a downstream vectorizer would treat as real words.

    Args:
        X: DataFrame with ``resume_text`` and ``job_description_text``
            columns. It is only read, never modified, so no copy is made.

    Returns:
        Array with one combined string per input row, in row order.
    """
    resume = X["resume_text"].fillna("").astype(str)
    job_description = X["job_description_text"].fillna("").astype(str)

    return (resume + " [SEP] " + job_description).values


## Binary Classification

In [6]:
from utils import ExperimentManager, Experiment

# Task handled by this section; also the top-level key passed to get_datasets.
CLASSIFICATION_TYPE = "binary"

# The manager is given the path ../runs/<task>/<model>.
# NOTE(review): presumably the directory where run artifacts are stored — confirm in utils.
manager = ExperimentManager(f"../runs/{CLASSIFICATION_TYPE}/{MODEL_TYPE}")
# Unpack the train/val/test features and labels for this task.
X_train, y_train, X_val, y_val, X_test, y_test = get_datasets(CLASSIFICATION_TYPE)

### Baseline with default scikit-learn random forest and no optimization

In [7]:
def pipeline_factory(params):
    """Build a fresh, unfitted baseline pipeline.

    A new ``Pipeline`` is created on every call rather than returning one
    shared module-level instance: fitting mutates the estimator in place, so
    a shared object would be handed back already fitted (and refitted in
    place) whenever the factory is called more than once.

    Args:
        params: Accepted to match the factory signature; ignored because the
            baseline uses library defaults with no tuning.

    Returns:
        Pipeline: combine_text join -> TF-IDF -> random forest seeded with SEED.
    """
    return Pipeline([
        ("join", FunctionTransformer(combine_text, validate=False)),
        ("vec", TfidfVectorizer()),
        ("clf", RandomForestClassifier(random_state=SEED)),
    ])

# Evaluate on the validation and test splits; names label the printed reports.
experiment = Experiment(
    name=f"Baseline {CLASSIFICATION_TYPE} {MODEL_TYPE}",
    description=f"{CLASSIFICATION_TYPE} {MODEL_TYPE} with TF-IDF and no hyperparameter tuning.",
    pipeline_factory=pipeline_factory,
    splits=[(X_val, y_val), (X_test, y_test)],
    split_names=["Validation", "Test"]
)

manager.run_experiment(experiment, train_data=(X_train, y_train))



=== Running Experiment: Baseline binary RandomForest ===

--- Validation Evaluation ---
Accuracy: 0.6919
Macro F1: 0.6914
Micro F1: 0.6919

Confusion Matrix:
[[683 247]
 [326 604]]

Classfication Report:
              precision    recall  f1-score   support

           0     0.6769    0.7344    0.7045       930
           1     0.7098    0.6495    0.6783       930

    accuracy                         0.6919      1860
   macro avg     0.6933    0.6919    0.6914      1860
weighted avg     0.6933    0.6919    0.6914      1860


--- Test Evaluation ---
Accuracy: 0.6424
Macro F1: 0.6419
Micro F1: 0.6424

Confusion Matrix:
[[582 275]
 [338 519]]

Classfication Report:
              precision    recall  f1-score   support

           0     0.6326    0.6791    0.6550       857
           1     0.6537    0.6056    0.6287       857

    accuracy                         0.6424      1714
   macro avg     0.6431    0.6424    0.6419      1714
weighted avg     0.6431    0.6424    0.6419      1714



<utils.ExperimentManger.Experiment at 0x7fc42194f140>

## Multiclass Classification

In [8]:
from utils import ExperimentManager, Experiment

# Task handled by this section; also the top-level key passed to get_datasets.
# Rebinds the name set for the binary section above.
CLASSIFICATION_TYPE = "multiclass"

# The manager is given the path ../runs/<task>/<model>.
# NOTE(review): presumably the directory where run artifacts are stored — confirm in utils.
manager = ExperimentManager(f"../runs/{CLASSIFICATION_TYPE}/{MODEL_TYPE}")
# Unpack the train/val/test features and labels for this task (replaces the binary splits).
X_train, y_train, X_val, y_val, X_test, y_test = get_datasets(CLASSIFICATION_TYPE)

### Baseline with default scikit-learn random forest and no optimization

In [9]:
def pipeline_factory(params):
    """Build a fresh, unfitted baseline pipeline.

    A new ``Pipeline`` is created on every call rather than returning one
    shared module-level instance: fitting mutates the estimator in place, so
    a shared object would be handed back already fitted (and refitted in
    place) whenever the factory is called more than once.

    Args:
        params: Accepted to match the factory signature; ignored because the
            baseline uses library defaults with no tuning.

    Returns:
        Pipeline: combine_text join -> TF-IDF -> random forest seeded with SEED.
    """
    return Pipeline([
        ("join", FunctionTransformer(combine_text, validate=False)),
        ("vec", TfidfVectorizer()),
        ("clf", RandomForestClassifier(random_state=SEED)),
    ])

# Evaluate on the validation and test splits; names label the printed reports.
experiment = Experiment(
    name=f"Baseline {CLASSIFICATION_TYPE} {MODEL_TYPE}",
    description=f"{CLASSIFICATION_TYPE} {MODEL_TYPE} with TF-IDF and no hyperparameter tuning.",
    pipeline_factory=pipeline_factory,
    splits=[(X_val, y_val), (X_test, y_test)],
    split_names=["Validation", "Test"]
)

manager.run_experiment(experiment, train_data=(X_train, y_train))



=== Running Experiment: Baseline multiclass RandomForest ===

--- Validation Evaluation ---
Accuracy: 0.6019
Macro F1: 0.5968
Micro F1: 0.6019

Confusion Matrix:
[[346  49  68]
 [120 220 123]
 [ 88 105 270]]

Classfication Report:
              precision    recall  f1-score   support

           0     0.6245    0.7473    0.6804       463
           1     0.5882    0.4752    0.5257       463
           2     0.5857    0.5832    0.5844       463

    accuracy                         0.6019      1389
   macro avg     0.5995    0.6019    0.5968      1389
weighted avg     0.5995    0.6019    0.5968      1389


--- Test Evaluation ---
Accuracy: 0.4550
Macro F1: 0.4546
Micro F1: 0.4550

Confusion Matrix:
[[183  91 170]
 [103 230 111]
 [ 84 167 193]]

Classfication Report:
              precision    recall  f1-score   support

           0     0.4946    0.4122    0.4496       444
           1     0.4713    0.5180    0.4936       444
           2     0.4072    0.4347    0.4205       444

    a

<utils.ExperimentManger.Experiment at 0x7fc4212d3a40>