In [1]:
import sys
# Put the parent directory on the module search path.
# NOTE(review): presumably this is what lets the later `from utils import ...` cells resolve — confirm.
# The entry is relative, so it is interpreted against the kernel's working directory.
sys.path.append('..')

# Data Loading

In [2]:
from pathlib import Path
import pandas as pd

def load_datasets(data_root: str | Path = "data",
                  tasks: tuple[str, ...] = ("binary", "multiclass"),
                  splits: tuple[str, ...] = ("train", "val", "test")) -> dict:

    data_root = Path(data_root)
    datasets  = {}

    for task in tasks:
        task_dir     = data_root / task
        task_dict    = {}

        for split in splits:
            split_dict = {}
            for kind in ("X", "y"):
                file_path = task_dir / f"{kind}_{split}.pkl"
                split_dict[kind] = pd.read_pickle(file_path)
            task_dict[split] = split_dict

        datasets[task] = task_dict

    return datasets


# ── usage ────────────────────────────────────────────────────────
# Load the default tasks and splits from the data directory one level up.
datasets = load_datasets("../data")

# quick sanity-check: one line per (task, split) with the feature shape and label count
for task_name, task_splits in datasets.items():
    for split_name, frames in task_splits.items():
        features, labels = frames["X"], frames["y"]
        print(f"{task_name:<10} {split_name:<5}  X shape = {features.shape},  y len = {len(labels)}")

binary     train  X shape = (4336, 2),  y len = 4336
binary     val    X shape = (1860, 2),  y len = 1860
binary     test   X shape = (1714, 2),  y len = 1714
multiclass train  X shape = (3237, 2),  y len = 3237
multiclass val    X shape = (1389, 2),  y len = 1389
multiclass test   X shape = (1332, 2),  y len = 1332


# Experiments

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [4]:
# Model family label; used below in the run-directory path and in experiment names/descriptions.
MODEL_TYPE = "NaiveBayes"
# NOTE(review): SEED is not referenced by any cell visible in this section — confirm it is still needed.
SEED = 42

In [5]:
def get_datasets(kind: str):
    """Return the train/val/test objects of one task as a flat 6-tuple.

    Reads the module-level ``datasets`` mapping.

    Args:
        kind: Key of the task inside ``datasets`` (e.g. ``"binary"``).

    Returns:
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``.

    Raises:
        KeyError: If the task, a split, or an ``"X"``/``"y"`` entry is missing.
    """
    task_splits = datasets[kind]

    # Outer loop over splits, inner loop over X/y gives the documented order.
    return tuple(
        task_splits[split_name][part]
        for split_name in ("train", "val", "test")
        for part in ("X", "y")
    )

def combine_text(X):
    """Join each row's resume and job description into a single string.

    Args:
        X: Table-like object (e.g. a pandas DataFrame) with ``resume_text``
            and ``job_description_text`` columns.

    Returns:
        1-D NumPy array with one ``"<resume> [SEP] <job description>"``
        string per row of ``X``.
    """
    # No defensive copy of X: nothing below writes to it, and every
    # expression here builds a new Series.
    resume = X["resume_text"].astype(str)
    job_description = X["job_description_text"].astype(str)

    combined = resume + " [SEP] " + job_description

    # to_numpy() always yields an ndarray, whereas .values may hand back a
    # pandas extension array for string dtypes.
    return combined.to_numpy()


## Binary Classification

In [6]:
from utils import ExperimentManager, Experiment

# Task key: selects the entry of `datasets` to use and appears in the run path and experiment name.
CLASSIFICATION_TYPE = "binary"

# The manager is constructed with a per-task, per-model path under ../runs.
manager = ExperimentManager(f"../runs/{CLASSIFICATION_TYPE}/{MODEL_TYPE}")
# Binds the module-level X_*/y_* names to the binary splits; the cells below read these globals.
X_train, y_train, X_val, y_val, X_test, y_test = get_datasets(CLASSIFICATION_TYPE)

### Baseline with default scikit-learn Naive Bayes and no hyperparameter optimization

In [7]:
# Baseline pipeline: join the two text columns, TF-IDF vectorise, then MultinomialNB (all defaults).
pipe = Pipeline([
    ("join", FunctionTransformer(combine_text, validate=False)),
    ("vec", TfidfVectorizer()),
    ("clf", MultinomialNB())
])

def pipeline_factory(params):
    """Return the baseline pipeline; ``params`` is accepted but ignored."""
    # NOTE(review): this returns the single module-level `pipe` object on every call (the name is
    # looked up at call time), not a fresh pipeline — confirm Experiment does not need independent instances.
    return pipe

# Evaluate on the validation and test splits; split_names supplies a label for each entry of splits.
experiment = Experiment(
    name=f"Baseline {CLASSIFICATION_TYPE} {MODEL_TYPE}",
    description=f"{CLASSIFICATION_TYPE} {MODEL_TYPE} with TF-IDF and no hyperparameter tuning.",
    pipeline_factory=pipeline_factory,
    splits=[(X_val, y_val), (X_test, y_test)],
    split_names=["Validation", "Test"]
)

# Run the experiment with the training split passed as train_data.
manager.run_experiment(experiment, train_data=(X_train, y_train))



=== Running Experiment: Baseline binary NaiveBayes ===

--- Validation Evaluation ---
Accuracy: 0.6527
Macro F1: 0.6491
Micro F1: 0.6527

Confusion Matrix:
[[701 229]
 [417 513]]

Classfication Report:
              precision    recall  f1-score   support

           0     0.6270    0.7538    0.6846       930
           1     0.6914    0.5516    0.6136       930

    accuracy                         0.6527      1860
   macro avg     0.6592    0.6527    0.6491      1860
weighted avg     0.6592    0.6527    0.6491      1860


--- Test Evaluation ---
Accuracy: 0.5951
Macro F1: 0.5930
Micro F1: 0.5951

Confusion Matrix:
[[571 286]
 [408 449]]

Classfication Report:
              precision    recall  f1-score   support

           0     0.5832    0.6663    0.6220       857
           1     0.6109    0.5239    0.5641       857

    accuracy                         0.5951      1714
   macro avg     0.5971    0.5951    0.5930      1714
weighted avg     0.5971    0.5951    0.5930      1714


✅

<utils.ExperimentManger.Experiment at 0x7f41c93bd520>

## Multiclass Classification

In [8]:
from utils import ExperimentManager, Experiment

# Task key: selects the entry of `datasets` to use and appears in the run path and experiment name.
CLASSIFICATION_TYPE = "multiclass"

# Rebinds `manager` to a manager constructed with the multiclass run path under ../runs.
manager = ExperimentManager(f"../runs/{CLASSIFICATION_TYPE}/{MODEL_TYPE}")
# Rebinds the module-level X_*/y_* names to the multiclass splits; the cells below read these globals.
X_train, y_train, X_val, y_val, X_test, y_test = get_datasets(CLASSIFICATION_TYPE)

### Baseline with default scikit-learn Naive Bayes and no hyperparameter optimization

In [9]:

# Baseline pipeline: join the two text columns, TF-IDF vectorise, then MultinomialNB (all defaults).
# NOTE(review): the vectoriser step is named 'tfidf' here but "vec" in the binary cell — confirm
# whether the names should match.
pipe = Pipeline([
    ("join", FunctionTransformer(combine_text, validate=False)),
    ('tfidf', TfidfVectorizer()),  # turn the joined text into TF-IDF features
    ('clf', MultinomialNB())  # multinomial Naive Bayes classifier with default settings
])

def pipeline_factory(params):
    """Return the baseline pipeline; ``params`` is accepted but ignored."""
    # NOTE(review): this returns the single module-level `pipe` object on every call (the name is
    # looked up at call time), not a fresh pipeline — confirm Experiment does not need independent instances.
    return pipe

# Evaluate on the validation and test splits; split_names supplies a label for each entry of splits.
experiment = Experiment(
    name=f"Baseline {CLASSIFICATION_TYPE} {MODEL_TYPE}",
    description=f"{CLASSIFICATION_TYPE} {MODEL_TYPE} with TF-IDF and no hyperparameter tuning.",
    pipeline_factory=pipeline_factory,
    splits=[(X_val, y_val), (X_test, y_test)],
    split_names=["Validation", "Test"]
)

# Run the experiment with the training split passed as train_data.
manager.run_experiment(experiment, train_data=(X_train, y_train))


=== Running Experiment: Baseline multiclass NaiveBayes ===

--- Validation Evaluation ---
Accuracy: 0.5241
Macro F1: 0.5183
Micro F1: 0.5241

Confusion Matrix:
[[310  57  96]
 [136 183 144]
 [126 102 235]]

Classfication Report:
              precision    recall  f1-score   support

           0     0.5420    0.6695    0.5990       463
           1     0.5351    0.3952    0.4547       463
           2     0.4947    0.5076    0.5011       463

    accuracy                         0.5241      1389
   macro avg     0.5239    0.5241    0.5183      1389
weighted avg     0.5239    0.5241    0.5183      1389


--- Test Evaluation ---
Accuracy: 0.4287
Macro F1: 0.4248
Micro F1: 0.4287

Confusion Matrix:
[[242  78 124]
 [155 181 108]
 [167 129 148]]

Classfication Report:
              precision    recall  f1-score   support

           0     0.4291    0.5450    0.4802       444
           1     0.4665    0.4077    0.4351       444
           2     0.3895    0.3333    0.3592       444

    acc

<utils.ExperimentManger.Experiment at 0x7f41c91ebec0>