In [None]:
from sklearn.metrics import classification_report, balanced_accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from pathlib import Path
import pandas as pd
import numpy as np
import random, os

In [6]:
RNG = 42

# Seed NumPy's legacy global generator and the stdlib ``random`` module with
# the same value, so code drawing from either global generator is repeatable.
for _seed_fn in (np.random.seed, random.seed):
    _seed_fn(RNG)

# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this
# assignment does not re-seed hashing in the already-running kernel; it only
# reaches processes launched afterwards that inherit this environment.
os.environ["PYTHONHASHSEED"] = str(RNG)

In [7]:
def load_datasets(data_root: str | Path = "data",
                  tasks: tuple[str, ...] = ("binary", "multiclass"),
                  splits: tuple[str, ...] = ("train", "val", "test")) -> dict:

    data_root = Path(data_root)
    datasets  = {}

    for task in tasks:
        task_dir     = data_root / task
        task_dict    = {}

        for split in splits:
            split_dict = {}
            for kind in ("X", "y"):
                file_path = task_dir / f"{kind}_{split}.pkl"
                split_dict[kind] = pd.read_pickle(file_path)
            task_dict[split] = split_dict

        datasets[task] = task_dict

    return datasets


# ── usage ────────────────────────────────────────────────────────
datasets = load_datasets("../data")

# Sanity check: one line per (task, split) with the shape of X and the number
# of labels, so a mismatch between features and targets is visible at once.
for task in datasets:
    splits = datasets[task]
    for split in splits:
        obj = splits[split]
        print(f"{task:<10} {split:<5}  X shape = {obj['X'].shape},  y len = {len(obj['y'])}")

binary     train  X shape = (4336, 2),  y len = 4336
binary     val    X shape = (1860, 2),  y len = 1860
binary     test   X shape = (1714, 2),  y len = 1714
multiclass train  X shape = (3237, 2),  y len = 3237
multiclass val    X shape = (1389, 2),  y len = 1389
multiclass test   X shape = (1332, 2),  y len = 1332


In [None]:
# Define a dummy classifier


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.dummy import DummyClassifier

# ── helper to merge the two text columns ───────────────────────
def join_cv_jd(df: pd.DataFrame) -> np.ndarray:
    """Return a 1-D NumPy array of concatenated CV + JD strings.

    The ``resume_text`` and ``job_description_text`` columns are joined
    element-wise with the literal separator ``" [SEP] "``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the columns ``resume_text`` and
        ``job_description_text``.

    Returns
    -------
    np.ndarray
        One combined string per input row, in row order.

    Raises
    ------
    KeyError
        If either of the two columns is missing from ``df``.
    """
    combined = df["resume_text"] + " [SEP] " + df["job_description_text"]
    # ``Series.values`` returns a pandas ExtensionArray (not an ndarray) when
    # the columns use an extension dtype such as "string"; ``to_numpy()``
    # always gives the downstream transformer a real ndarray.
    return combined.to_numpy()

# ── NO-OP vectoriser for the baseline ──────────────────────────
def make_dummy_features(x):
    """Build an all-zero feature matrix with one row per sample.

    The contents of ``x`` are ignored; only ``x.shape[0]`` is read to decide
    how many rows to emit. The result has shape ``(n_samples, 1)``.
    """
    n_samples = x.shape[0]
    return np.zeros(shape=(n_samples, 1))

# Transformer that replaces whatever it receives with the all-zero placeholder
# matrix produced by ``make_dummy_features``.
NOOP_VECTORISER = FunctionTransformer(func=make_dummy_features, validate=False)

# Baseline pipeline: merge the two text columns, swap them for placeholder
# features, then predict with the most-frequent-class dummy classifier.
base_pipe = Pipeline(
    steps=[
        ("join", FunctionTransformer(func=join_cv_jd, validate=False)),
        ("vec", NOOP_VECTORISER),  # placeholder for a real vectoriser
        ("clf", DummyClassifier(strategy="most_frequent")),
    ]
)

In [None]:
# Fit the shared baseline pipeline on each task's training split and report
# its accuracy on that task's test split. ``base_pipe`` is refitted in place
# on every iteration, so afterwards it holds the fit from the last task.
for kind, splits in datasets.items():

    print(f"{'='*5} Baseline for {kind} classification {'='*5}")

    X_train, y_train = (splits["train"][part] for part in ("X", "y"))
    X_test, y_test = (splits["test"][part] for part in ("X", "y"))

    base_pipe.fit(X_train, y_train)
    test_accuracy = base_pipe.score(X_test, y_test)
    print("Test accuracy:", test_accuracy)

binary
{'train': {'X':                                             resume_text  \
711   professional summary senior qa/test engineer w...   
4787  professional summary currently working with ca...   
81    summary highly motivated electrical project en...   
3390  professional profile specialized knowledge of ...   
227   summary seasoned data entry operator with 5 ye...   
...                                                 ...   
2929  summary software developer with overall 3+ yea...   
4897  profile skilled, motivated and result-driven q...   
2045  summary 5+ years experience in development of ...   
2958  career overview expert level user interface fr...   
4414  summary seeking an accountant position that ut...   

                                   job_description_text  
711   why we are excited about you: you are a junior...  
4787  job description job title: salesforce communic...  
81    at lyft, our mission is to improve peoples liv...  
3390  ( candidates without any visa 