In [1]:
import sys, os
sys.path.append('..') 

In [2]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from utils.TextPreprocessor import TextPreprocessor
from typing import List

# Data Loading

In [3]:
# Pull the resume / job-description fit corpus from the Hugging Face Hub
# (dataset id: cnamuangtoun/resume-job-description-fit).
ds = load_dataset("cnamuangtoun/resume-job-description-fit")

# The dataset exposes "train" and "test" splits; convert both to pandas.
test_df = ds["test"].to_pandas()
train_df = ds["train"].to_pandas()

# Hold out 30% of the Hub "train" split as a validation set. Stratifying on
# the label column keeps the class proportions equal in both parts, and the
# fixed random_state makes the split reproducible.
train_df, val_df = train_test_split(
    train_df,
    test_size=0.30,
    random_state=42,
    stratify=train_df["label"],
)

def _apply_label_mapping(dfs: List, label_to_id: dict) -> tuple:
    """Replace the ``label`` column of every frame with its mapped id, in place.

    Args:
        dfs: DataFrames that each carry a ``label`` column.
        label_to_id: Mapping from the original label value to its integer id.

    Returns:
        A tuple holding the same (mutated) frames, in input order.

    Raises:
        ValueError: If any frame contains a non-null label that is not a key of
            ``label_to_id``. Without this check ``Series.map`` would silently
            turn such labels into NaN, e.g. when the function is run a second
            time on frames whose labels are already integers.
    """
    frames = list(dfs)
    known_labels = set(label_to_id)

    # Validate every frame before touching any of them, so a failure never
    # leaves the caller with a half-converted set of splits.
    for position, df in enumerate(frames):
        unknown = set(df["label"].dropna().unique()) - known_labels
        if unknown:
            raise ValueError(
                f"Frame {position} has labels outside the mapping: "
                f"{sorted(map(repr, unknown))}; expected one of "
                f"{sorted(map(repr, known_labels))}"
            )

    for df in frames:
        df["label"] = df["label"].map(label_to_id)

    return tuple(frames)


def map_multiclass(dfs: List):
    """Encode labels as three classes: Good Fit=0, Potential Fit=1, No Fit=2.

    The frames in ``dfs`` are modified in place and returned as a tuple in the
    same order (e.g. ``train, val, test``). Pass copies to keep the originals.

    Raises:
        ValueError: If a frame contains a label other than the three above.
    """
    return _apply_label_mapping(
        dfs, {"Good Fit": 0, "No Fit": 2, "Potential Fit": 1}
    )


def map_binaryclass(dfs: List):
    """Encode labels as two classes: Good Fit / Potential Fit=0, No Fit=1.

    The frames in ``dfs`` are modified in place and returned as a tuple in the
    same order (e.g. ``train, val, test``). Pass copies to keep the originals.

    Raises:
        ValueError: If a frame contains a label other than the three above.
    """
    return _apply_label_mapping(
        dfs, {"Good Fit": 0, "No Fit": 1, "Potential Fit": 0}
    )

import pandas as pd

def show_split_stats(train_df: pd.DataFrame,
                     val_df: pd.DataFrame,
                     test_df: pd.DataFrame,
                     label_col: str = "label"):
    """Print the size of each split and the label distribution of the training split.

    Args:
        train_df: Training split; its ``label_col`` column is summarised.
        val_df: Validation split (only its row count is reported).
        test_df: Test split (only its row count is reported).
        label_col: Name of the label column in ``train_df``.

    Returns:
        None. The report is written to stdout.
    """
    # Class frequencies of the training split; missing labels get their own row.
    label_counts = train_df[label_col].value_counts(dropna=False)
    # Share of each class in percent, rounded to one decimal.
    label_share = label_counts.div(label_counts.sum()).mul(100).round(1)

    print(
        f"Data loaded and split:\n"
        f"  • Training:   {len(train_df):>6} samples\n"
        f"  • Validation: {len(val_df):>6} samples\n"
        f"  • Test:       {len(test_df):>6} samples\n"
        f"\nTraining label distribution:"
    )
    # Counts and percentages side by side, one row per label.
    print(pd.concat([label_counts.rename("count"), label_share.rename("%")], axis=1))

In [4]:
"""
    Comparing multiclass dataset and binary dataset stats
"""

# Build one {"train", "val", "test"} dict per label encoding. Every mapper gets
# fresh copies because the mappers rewrite the "label" column of the frames
# they receive; train_df / val_df / test_df keep their original string labels.
split_names = ["train", "val", "test"]
label_mappers = {
    "binary": map_binaryclass,
    "multiclass": map_multiclass,
}

dfs = {}
for kind, mapper in label_mappers.items():
    mapped_frames = mapper([train_df.copy(), val_df.copy(), test_df.copy()])
    dfs[kind] = dict(zip(split_names, mapped_frames))

# ───────── quick sanity-check ─────────
# Print split sizes and the training label distribution for each encoding.
for kind, splits in dfs.items():
    print(f"\n— {kind.capitalize()} —")
    show_split_stats(*(splits[name] for name in split_names))


— Binary —
Data loaded and split:
  • Training:     4368 samples
  • Validation:   1873 samples
  • Test:         1759 samples

Training label distribution:
       count     %
label             
1       2200  50.4
0       2168  49.6

— Multiclass —
Data loaded and split:
  • Training:     4368 samples
  • Validation:   1873 samples
  • Test:         1759 samples

Training label distribution:
       count     %
label             
2       2200  50.4
1       1089  24.9
0       1079  24.7


In [5]:
# Text-preprocessing switches. Stop-word removal and lemmatization are disabled
# here; only the clean_text step is switched on.
CLEAN_TEXT = True
STOP_WORD_REMOVAL = False
LEMMATIZE = False

# The stop-word / lemmatizer switches are handed to the constructor and passed
# again on every process_dataset call below.
tp = TextPreprocessor(enable_stopwords=STOP_WORD_REMOVAL, enable_lemmatizer=LEMMATIZE)

# Run the preprocessor over every split of both label encodings and store the
# result back under the same key. Only existing keys are reassigned, so the
# dicts do not change size while they are being iterated.
# NOTE(review): this overwrites `dfs` in place, so re-running the cell feeds
# already-processed frames back into process_dataset — confirm that is harmless.
# NOTE(review): the loop variables `kind`, `splits`, `split_name` and `df` stay
# bound after the loop; the SkillNER cell further down reads `df`.
for kind, splits in dfs.items():
    for split_name, df in splits.items():
        dfs[kind][split_name] = tp.process_dataset(df, clean_text=CLEAN_TEXT, remove_stop_words=STOP_WORD_REMOVAL, lemmatize=LEMMATIZE)


In [6]:
from utils import SkillNERPreprocessor

# Preprocess with SkillNER (slow; the cache_dir argument suggests results are
# cached between runs — TODO confirm against SkillNERPreprocessor).
# NOTE(review): the recorded run of this cell ended with
#   TypeError: __call__() got an unexpected keyword argument 'attr'
# right after "loading full_matcher ...". Possibly a version mismatch between
# SkillNER and the installed spaCy — TODO confirm and pin compatible versions.
preprocessor = SkillNERPreprocessor(model="en_core_web_md", cache_dir="../skillner_cache")
# NOTE(review): `df` is not defined in this cell. In the notebook as shown, its
# only binding is the loop variable left over from the text-preprocessing cell
# (the last split that loop visited). Pass the intended frame explicitly.
# NOTE(review): `df_with_skills` is not referenced again in the cells shown.
df_with_skills = preprocessor.preprocess_data(df)

# NOTE(review): this loop is identical to the loop in the text-preprocessing
# cell above, so it applies tp.process_dataset a second time to frames that
# cell already processed. It looks like a copy-paste leftover — confirm and remove.
for kind, splits in dfs.items():
    for split_name, df in splits.items():
        dfs[kind][split_name] = tp.process_dataset(df, clean_text=CLEAN_TEXT, remove_stop_words=STOP_WORD_REMOVAL, lemmatize=LEMMATIZE)

Computing SkillNER features...
Loading spaCy model: en_core_web_md
loading full_matcher ...


TypeError: __call__() got an unexpected keyword argument 'attr'

In [6]:
import json
import os
import pathlib
import pickle
import random
from collections import Counter
from datetime import datetime, timezone

# Export configuration for the balanced splits.
BALANCE_STRATEGY = "undersample"        # "oversample" | "undersample"
SEED             = 42

# Derive the output folder from the strategy ("undersample" -> "undersampled",
# "oversample" -> "oversampled") so that switching BALANCE_STRATEGY can never
# write oversampled data into a folder named "undersampled".
DEST_DIR         = pathlib.Path("../data/full_process") / f"{BALANCE_STRATEGY}d"

# Seeds Python's own `random` module. The pandas sampling in balance_df is
# seeded separately through its random_state argument.
random.seed(SEED)

def balance_df(df, label_col="label", strategy="oversample", seed=None):
    """Return a shuffled copy of ``df`` in which every class has the same row count.

    Args:
        df: DataFrame containing the column ``label_col``.
        label_col: Name of the class-label column.
        strategy: ``"oversample"`` draws every class up to the size of the
            largest class (sampling with replacement); ``"undersample"`` draws
            every class down to the size of the smallest class (without
            replacement).
        seed: Random state used for the per-class sampling and for the final
            shuffle. ``None`` (the default) falls back to the notebook-level
            ``SEED`` constant.

    Returns:
        A new DataFrame that keeps all columns of ``df`` (including
        ``label_col``) and the original index labels of the sampled rows.
        Rows whose label is missing are dropped, as groupby skips NaN keys
        by default. An empty frame is returned when there is nothing to sample.

    Raises:
        ValueError: If ``strategy`` is neither ``"oversample"`` nor
            ``"undersample"``.
    """
    if seed is None:
        seed = SEED  # notebook-wide default

    counts = df[label_col].value_counts()
    if strategy == "oversample":
        target, with_replacement = counts.max(), True
    elif strategy == "undersample":
        target, with_replacement = counts.min(), False
    else:
        raise ValueError("strategy must be 'oversample' or 'undersample'")

    # No labelled rows at all: nothing to balance (and pd.concat would reject
    # an empty list of groups).
    if counts.empty:
        return df.iloc[0:0].copy()

    # Sample each class separately and concatenate, instead of
    # groupby(...).apply(...). apply() operating on the grouping column is
    # deprecated in pandas, and when the grouping column is excluded the
    # result no longer carries `label_col`, which callers rely on.
    # Every class uses the same seed, exactly as before.
    sampled_groups = [
        group.sample(n=int(target), replace=with_replacement, random_state=seed)
        for _, group in df.groupby(label_col)
    ]
    balanced = pd.concat(sampled_groups)

    # Shuffle so the classes are interleaved rather than stored in blocks.
    return balanced.sample(frac=1, random_state=seed)

# Balance every split of every label encoding and write it to
# DEST_DIR/<kind>/ as X_<split>.pkl (features) and y_<split>.pkl (labels),
# together with a manifest.json describing the export.
# NOTE(review): the validation and test splits are resampled too, so metrics
# computed on them will not reflect the original class distribution — confirm
# this is intended.
for kind, splits in dfs.items():
    out_dir = DEST_DIR / kind
    out_dir.mkdir(parents=True, exist_ok=True)

    print(f"\n🗂  Exporting {kind} dataset → {out_dir}  (strategy={BALANCE_STRATEGY})")

    # Track what this run writes so the manifest does not pick up stale
    # *.pkl files left in the folder by an earlier run.
    written_files = []

    for split_name, df in splits.items():
        before = Counter(df["label"])
        df_bal = balance_df(df, label_col="label", strategy=BALANCE_STRATEGY)
        after  = Counter(df_bal["label"])

        print(f"  {split_name:<5}  | before: {dict(before)}  →  after: {dict(after)}")

        X = df_bal.drop(columns=["label"])
        y = df_bal["label"]

        for file_name, payload in ((f"X_{split_name}.pkl", X), (f"y_{split_name}.pkl", y)):
            with open(out_dir / file_name, "wb") as fh:
                pickle.dump(payload, fh, protocol=pickle.HIGHEST_PROTOCOL)
            written_files.append(file_name)

    manifest = {
        # Timezone-aware replacement for the deprecated datetime.utcnow();
        # the "+00:00" suffix is rewritten to "Z" to keep the previous format.
        "created_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
        "balance_strategy": BALANCE_STRATEGY,
        "seed": SEED,
        "files": sorted(written_files),
    }
    with open(out_dir / "manifest.json", "w", encoding="utf-8") as mf:
        json.dump(manifest, mf, indent=2)

print("\n✅ All balanced splits saved to:", DEST_DIR.resolve())



🗂  Exporting binary dataset → ../data/full_process/undersampled/binary  (strategy=undersample)
  train  | before: {1: 2200, 0: 2168}  →  after: {1: 2168, 0: 2168}
  val    | before: {0: 930, 1: 943}  →  after: {0: 930, 1: 930}
  test   | before: {1: 857, 0: 902}  →  after: {1: 857, 0: 857}

🗂  Exporting multiclass dataset → ../data/full_process/undersampled/multiclass  (strategy=undersample)
  train  | before: {2: 2200, 1: 1089, 0: 1079}  →  after: {0: 1079, 2: 1079, 1: 1079}


  .apply(lambda g: g.sample(target, replace=False, random_state=SEED))
  .apply(lambda g: g.sample(target, replace=False, random_state=SEED))
  .apply(lambda g: g.sample(target, replace=False, random_state=SEED))
  "created_at": datetime.utcnow().isoformat() + "Z",
  .apply(lambda g: g.sample(target, replace=False, random_state=SEED))
  .apply(lambda g: g.sample(target, replace=False, random_state=SEED))


  val    | before: {0: 463, 2: 943, 1: 467}  →  after: {1: 463, 0: 463, 2: 463}
  test   | before: {2: 857, 1: 444, 0: 458}  →  after: {2: 444, 0: 444, 1: 444}

✅ All balanced splits saved to: /Users/mave/Documents/UniFiles/Resume-Screener/data/full_process/undersampled


  .apply(lambda g: g.sample(target, replace=False, random_state=SEED))
  "created_at": datetime.utcnow().isoformat() + "Z",
