In [1]:
import sys
sys.path.append('..')

# Data Loading

In [2]:
from pathlib import Path
import pandas as pd

def load_datasets(data_root: str | Path = "data",
                  tasks: tuple[str, ...] = ("binary", "multiclass"),
                  splits: tuple[str, ...] = ("train", "val", "test")) -> dict:

    data_root = Path(data_root)
    datasets  = {}

    for task in tasks:
        task_dir     = data_root / task
        task_dict    = {}

        for split in splits:
            split_dict = {}
            for kind in ("X", "y"):
                file_path = task_dir / f"{kind}_{split}.pkl"
                split_dict[kind] = pd.read_pickle(file_path)
            task_dict[split] = split_dict

        datasets[task] = task_dict

    return datasets


# ── usage ────────────────────────────────────────────────────────
# Load every task/split from the data directory one level above the
# notebook's working directory.
datasets = load_datasets("../data")

# Quick sanity check: print one line per (task, split) with the shape of X and
# the length of y.  The loop names are kept distinct from `splits`, which a
# later cell binds to the (X, y) tuples handed to the experiment manager.
for task_name, task_splits in datasets.items():
    for split_name, pair in task_splits.items():
        print(f"{task_name:<10} {split_name:<5}  X shape = {pair['X'].shape},  y len = {len(pair['y'])}")

binary     train  X shape = (4336, 2),  y len = 4336
binary     val    X shape = (1860, 2),  y len = 1860
binary     test   X shape = (1714, 2),  y len = 1714
multiclass train  X shape = (3237, 2),  y len = 3237
multiclass val    X shape = (1389, 2),  y len = 1389
multiclass test   X shape = (1332, 2),  y len = 1332


# Experiments

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, ComplementNB

In [10]:
# Model-family label; interpolated into the run-directory path and into
# experiment names/descriptions further down.
MODEL_TYPE = "NaiveBayes"
# Random-seed constant.  NOTE(review): no cell visible here references SEED —
# confirm it is actually wired into the experiments.
SEED = 42

In [11]:
def get_datasets(kind: str):
    """Unpack one task's entry of the notebook-level ``datasets`` mapping.

    Parameters
    ----------
    kind : str
        Key into ``datasets`` (the notebook uses ``"binary"`` and
        ``"multiclass"``).

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``.

    Raises
    ------
    KeyError
        If ``kind``, a split name, or ``"X"``/``"y"`` is missing.
    """
    per_split = datasets[kind]

    # Split-major, X-before-y ordering yields exactly the six-tuple above.
    return tuple(
        per_split[split_name][part]
        for split_name in ("train", "val", "test")
        for part in ("X", "y")
    )

def combine_text(X, sep: str = " [SEP] "):
    """Join each row's resume text and job-description text into one string.

    Parameters
    ----------
    X : pandas.DataFrame
        Must provide the columns ``"resume_text"`` and
        ``"job_description_text"``.
    sep : str, default ``" [SEP] "``
        Separator placed between the two texts.  The default reproduces the
        previously hard-coded marker, so existing one-argument calls (such as
        ``FunctionTransformer(combine_text)``) behave as before.

    Returns
    -------
    array-like
        ``.values`` of the concatenated Series: one combined string per row,
        in row order.

    Raises
    ------
    KeyError
        If either required column is missing.

    Notes
    -----
    Both columns are cast with ``astype(str)``, so non-string cells are
    stringified; missing values therefore end up as literal text such as
    ``"nan"`` rather than an empty string.
    """
    # No defensive copy of X is needed: astype() and "+" each build new
    # Series, so the caller's frame is never mutated.
    resume = X["resume_text"].astype(str)
    job = X["job_description_text"].astype(str)

    return (resume + sep + job).values


## Binary Classification

In [12]:
from utils import ExperimentManager, Experiment

# Task selector: used as the key into `datasets` (via get_datasets) and as a
# component of the run-directory path.
CLASSIFICATION_TYPE = "binary"

# The second argument lists the two class labels.
# NOTE(review): presumably these are display names for the reports — confirm
# against utils.ExperimentManager.
manager = ExperimentManager(f"../runs/{CLASSIFICATION_TYPE}/{MODEL_TYPE}", ["Good Fit", "No Fit"])
X_train, y_train, X_val, y_val, X_test, y_test = get_datasets(CLASSIFICATION_TYPE)

# (X, y) pairs in train, val, test order; passed as `splits=` to
# manager.run_experiment in the baseline cell.
splits = ((X_train, y_train), (X_val, y_val), (X_test, y_test))

### Baseline: default scikit-learn Naive Bayes estimators, no hyperparameter tuning

In [14]:
# One default-constructed estimator per Naive Bayes variant under comparison.
NBs = {
    "MultinomialNB": MultinomialNB(),
    "BernoulliNB": BernoulliNB(),
    "ComplementNB": ComplementNB()
}

# Hand-picked TF-IDF settings per variant; keys must match those of `NBs`.
vectorizer_configs = {
    "MultinomialNB": TfidfVectorizer(max_features=10000, min_df=2, max_df=0.95),
    "BernoulliNB": TfidfVectorizer(max_features=8000, min_df=2, binary=True),
    "ComplementNB": TfidfVectorizer(max_features=15000, min_df=2, max_df=0.9)
}


def _static_factory(pipeline):
    """Build a ``pipeline_factory(params)`` callable bound to ``pipeline``.

    Defining the factory directly inside the loop would close over the loop
    variable, so a factory invoked after the loop has advanced would return
    the pipeline of a later iteration.  Binding through this helper gives
    every experiment its own pipeline regardless of when the factory runs.
    """
    def pipeline_factory(params):
        # `params` is ignored: the baseline has no tunable hyperparameters,
        # so the same pre-built pipeline object is always returned.
        return pipeline

    return pipeline_factory


for name, method in NBs.items():
    pipe = Pipeline([
        ("join", FunctionTransformer(combine_text, validate=False)),
        ("vec", vectorizer_configs[name]),  # variant-specific TF-IDF config
        ("clf", method)
    ])

    # Factory bound to this iteration's pipeline.
    pipeline_factory = _static_factory(pipe)

    experiment = Experiment(
        name=f"Baseline {CLASSIFICATION_TYPE} {name}",
        description=f"{CLASSIFICATION_TYPE} {name} with TF-IDF and no hyperparameter tuning.",
        pipeline_factory=pipeline_factory,
    )

    manager.run_experiment(experiment, splits=splits)



=== Running Experiment: Baseline binary MultinomialNB ===

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,714
   Classes: 2
   Overall Accuracy: 0.5904

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.5900
   Micro F1:     0.5904
   Weighted F1:  0.5900

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.5908  R: 0.5904
   Micro    - P: 0.5904  R: 0.5904
   Weighted - P: 0.5908  R: 0.5904

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Good Fit             0.5847     0.6243     0.6038        857
   No Fit               0.5970     0.5566     0.5761        857
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.5908     0.5904     0.5900       1714
   weighted avg         0.5908     0.5904     0.5900       1714

🔢 CONFUSION MATRIX
   Rows: True Labels, Columns: Predicted Labels
   Predicted →
   True ↓   G

### Optimized BernoulliNB

## Multiclass Classification

In [8]:
from utils import ExperimentManager, Experiment

# Switch the task selector to the multiclass problem; this rebinds the same
# notebook-level names that the binary section used.
CLASSIFICATION_TYPE = "multiclass"

# NOTE(review): unlike the binary section, no class-label list is passed to
# ExperimentManager here — confirm that is intended.
manager = ExperimentManager(f"../runs/{CLASSIFICATION_TYPE}/{MODEL_TYPE}")
# Rebinds X_*/y_* to the multiclass data.  The `splits` tuple built in the
# binary section is not rebuilt here and still refers to the binary data.
X_train, y_train, X_val, y_val, X_test, y_test = get_datasets(CLASSIFICATION_TYPE)

### Baseline with default scikit-learn Naive Bayes and no optimization

In [9]:

# Baseline pipeline: join the two text columns, TF-IDF with library defaults,
# then a default MultinomialNB classifier.
pipe = Pipeline([
    ("join", FunctionTransformer(combine_text, validate=False)),
    ('tfidf', TfidfVectorizer()),  # text -> sparse TF-IDF features (default settings)
    ('clf', MultinomialNB())  # Naive Bayes classifier with default parameters
])

def pipeline_factory(params):
    """Return the pre-built baseline pipeline; ``params`` is ignored."""
    # The same `pipe` object is returned on every call (nothing is rebuilt).
    return pipe

# NOTE(review): this cell passes `splits=`/`split_names=` to Experiment and
# `train_data=` to run_experiment, whereas the binary section passes only
# name/description/pipeline_factory to Experiment and `splits=` to
# run_experiment.  Together with the stale execution counts on this cell, that
# suggests it was last run against a different version of `utils` — confirm
# which call signature the current module supports.
experiment = Experiment(
    name=f"Baseline {CLASSIFICATION_TYPE} {MODEL_TYPE}",
    description=f"{CLASSIFICATION_TYPE} {MODEL_TYPE} with TF-IDF and no hyperparameter tuning.",
    pipeline_factory=pipeline_factory,
    splits=[(X_val, y_val), (X_test, y_test)],
    split_names=["Validation", "Test"]
)

# The training pair is supplied separately from the evaluation splits above.
manager.run_experiment(experiment, train_data=(X_train, y_train))


=== Running Experiment: Baseline multiclass NaiveBayes ===

--- Validation Evaluation ---
Accuracy: 0.5241
Macro F1: 0.5183
Micro F1: 0.5241

Detailed Classification Report:
                 precision     recall   f1-score    support

       Good Fit     0.5420     0.6695     0.5990        463
         No Fit     0.4947     0.5076     0.5011        463
  Potential Fit     0.5351     0.3952     0.4547        463

       accuracy                           0.5241       1389
      macro avg     0.5239     0.5241     0.5183       1389
   weighted avg     0.5239     0.5241     0.5183       1389

Confusion Matrix:
[[310  57  96]
 [136 183 144]
 [126 102 235]]

--- Test Evaluation ---
Accuracy: 0.4287
Macro F1: 0.4248
Micro F1: 0.4287

Detailed Classification Report:
                 precision     recall   f1-score    support

       Good Fit     0.4291     0.5450     0.4802        444
         No Fit     0.3895     0.3333     0.3592        444
  Potential Fit     0.4665     0.4077     0.4351

<utils.ExperimentManger.Experiment at 0x7fdb8222a0f0>