In [1]:
import sys
sys.path.append('..')

# Data Loading

In [2]:
from pathlib import Path
import pandas as pd

def load_datasets(data_root: str | Path = "data",
                  tasks: tuple[str, ...] = ("binary", "multiclass"),
                  splits: tuple[str, ...] = ("train", "val", "test")) -> dict:

    data_root = Path(data_root)
    datasets  = {}

    for task in tasks:
        task_dir     = data_root / task
        task_dict    = {}

        for split in splits:
            split_dict = {}
            for kind in ("X", "y"):
                file_path = task_dir / f"{kind}_{split}.pkl"
                split_dict[kind] = pd.read_pickle(file_path)
            task_dict[split] = split_dict

        datasets[task] = task_dict

    return datasets


# ── usage ────────────────────────────────────────────────────────
datasets = load_datasets("../data")

# quick sanity-check
for task, splits in datasets.items():
    for split, obj in splits.items():
        print(f"{task:<10} {split:<5}  X shape = {obj['X'].shape},  y len = {len(obj['y'])}")

binary     train  X shape = (4336, 2),  y len = 4336
binary     val    X shape = (1860, 2),  y len = 1860
binary     test   X shape = (1714, 2),  y len = 1714
multiclass train  X shape = (3237, 2),  y len = 3237
multiclass val    X shape = (1389, 2),  y len = 1389
multiclass test   X shape = (1332, 2),  y len = 1332


# Experiments

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Short model label; interpolated into the run directory and the experiment
# names/descriptions further down.
MODEL_TYPE = "LogReg"
# Seed used for LogisticRegression(random_state=...) and the Optuna settings.
SEED = 42

In [5]:
def get_datasets(kind: str):
    """Unpack one task's splits from the module-level ``datasets`` dict.

    Args:
        kind: Key into ``datasets`` (the notebook uses "binary" and "multiclass").

    Returns:
        The 6-tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
    """
    task_splits = datasets[kind]

    # Flattened in (train, val, test) x (X, y) order, which is the return order.
    return tuple(
        task_splits[split_name][part]
        for split_name in ("train", "val", "test")
        for part in ("X", "y")
    )


def combine_text(X):
    """Join each row's resume and job description into a single string.

    Args:
        X: DataFrame with ``resume_text`` and ``job_description_text`` columns.
            It is only read, never modified.

    Returns:
        One string per row of the form ``"<resume> [SEP] <job description>"``,
        as returned by ``Series.values``.
    """
    # Blank out missing values before stringifying: astype(str) alone turns
    # NaN/None into the literal tokens "nan"/"None", which the downstream
    # TF-IDF step would then count as real words.
    resume = X["resume_text"].fillna("").astype(str)
    job_description = X["job_description_text"].fillna("").astype(str)

    return (resume + " [SEP] " + job_description).values

## Binary Classification

In [6]:
# `utils` is a project-local module; presumably it is importable because of the
# sys.path.append('..') in the first cell -- confirm if the notebook is moved.
from utils import ExperimentManager, Experiment

# Task key: selects the dataset via get_datasets() and is interpolated into the
# run path and experiment names. Rebound to "multiclass" later in the notebook.
CLASSIFICATION_TYPE = "binary"

# The manager is constructed with ../runs/<task>/<model>; presumably this is
# where run artifacts are written -- confirm in utils.
manager = ExperimentManager(f"../runs/{CLASSIFICATION_TYPE}/{MODEL_TYPE}")
# These six globals are overwritten by the multiclass section further down.
X_train, y_train, X_val, y_val, X_test, y_test = get_datasets(CLASSIFICATION_TYPE)

### Baseline with default scikit learn parameters and no optimization

In [7]:
# Baseline pipeline: join the two text columns, TF-IDF vectorise, then fit a
# logistic regression with scikit-learn's default hyperparameters.
def pipeline_factory(params):
    """Build a fresh, untuned baseline pipeline.

    A new Pipeline is constructed on every call so that repeated calls never
    hand back the same (possibly already fitted) estimator object.

    Args:
        params: Ignored; accepted only so the callable has the same
            ``pipeline_factory(params)`` shape as the tuned experiment's factory.

    Returns:
        A new, unfitted ``Pipeline`` with steps ``join -> tfidf -> clf``.
    """
    return Pipeline([
        ("join", FunctionTransformer(combine_text, validate=False)),
        ('tfidf', TfidfVectorizer()),  # Convert text to numeric
        ('clf', LogisticRegression(random_state=SEED))  # seeded from the notebook-wide SEED
    ])


# Unfitted template, kept so the module-level name `pipe` stays defined.
pipe = pipeline_factory(None)

experiment = Experiment(
    name=f"Baseline {CLASSIFICATION_TYPE} {MODEL_TYPE}",
    description=f"{CLASSIFICATION_TYPE} {MODEL_TYPE} with TF-IDF and no hyperparameter tuning.",
    pipeline_factory=pipeline_factory,
    splits=[(X_val, y_val), (X_test, y_test)],
    split_names=["Validation", "Test"]
)

# No param_space is given, so this is a plain fit on the training split
# followed by evaluation on the validation and test splits listed above.
manager.run_experiment(experiment, train_data=(X_train, y_train))


=== Running Experiment: Baseline binary LogReg ===
🏋️ Training model...
📊 Evaluating model performance...

--- Validation Evaluation ---
Accuracy: 0.6812
Macro F1: 0.6802
Micro F1: 0.6812

Detailed Classification Report:
                 precision     recall   f1-score    support

       Good Fit     0.6628     0.7376     0.6982        930
         No Fit     0.7042     0.6247     0.6621        930

       accuracy                           0.6812       1860
      macro avg     0.6835     0.6812     0.6802       1860
   weighted avg     0.6835     0.6812     0.6802       1860

Confusion Matrix:
[[686 244]
 [349 581]]

--- Test Evaluation ---
Accuracy: 0.6004
Macro F1: 0.6000
Micro F1: 0.6004

Detailed Classification Report:
                 precision     recall   f1-score    support

       Good Fit     0.5951     0.6278     0.6110        857
         No Fit     0.6062     0.5729     0.5891        857

       accuracy                           0.6004       1714
      macro avg     0.6

<utils.ExperimentManger.Experiment at 0x7f4e261ab830>

### Optimized Logistic Regression Experiment

A comprehensive parameter space

In [7]:
# CHANGE: Remove the tuple conversion (new ModelTrainer handles this)
def param_space(trial):
    """Sample one TF-IDF + Logistic Regression configuration from ``trial``.

    Args:
        trial: Object exposing ``suggest_categorical`` and ``suggest_float``
            (an Optuna trial in this notebook).

    Returns:
        Dict mapping ``<step>__<param>`` names to the sampled values, in the
        order they were sampled.
    """
    # Categorical choices for the vectoriser, sampled first.
    tfidf_choices = {
        'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],  # Direct tuples
        'tfidf__max_features': [5000, 10000, 15000, 20000],
        'tfidf__min_df': [2, 5, 0.01],
        'tfidf__max_df': [0.85, 0.9, 0.95],
        'tfidf__sublinear_tf': [True, False],
        'tfidf__stop_words': ['english', None],
    }
    # Categorical choices for the classifier, sampled after clf__C.
    clf_choices = {
        'clf__penalty': ['l1', 'l2'],
        'clf__solver': ['liblinear', 'saga'],
        'clf__class_weight': [None, 'balanced'],
        'clf__max_iter': [200, 500, 1000],
    }

    sampled = {}
    for name, options in tfidf_choices.items():
        sampled[name] = trial.suggest_categorical(name, options)

    # Regularisation strength is searched on a log scale.
    sampled['clf__C'] = trial.suggest_float('clf__C', 0.01, 100.0, log=True)

    for name, options in clf_choices.items():
        sampled[name] = trial.suggest_categorical(name, options)

    return sampled

In [8]:
def pipeline_factory(params):
    """Build a new join -> TF-IDF -> Logistic Regression pipeline.

    Args:
        params: Mapping of ``<step>__<param>`` names to values, applied with
            ``Pipeline.set_params``. A falsy value (``None`` or ``{}``) leaves
            every step at its defaults.

    Returns:
        A freshly constructed, unfitted ``Pipeline``.
    """
    steps = [
        ("join", FunctionTransformer(combine_text, validate=False)),
        ('tfidf', TfidfVectorizer()),
        ('clf', LogisticRegression(random_state=SEED)),
    ]
    model = Pipeline(steps)

    # Nothing to override: hand back the default configuration.
    if not params:
        return model

    model.set_params(**params)
    return model

In [10]:
# Extra keyword arguments forwarded to manager.run_experiment(...) for the
# tuned experiment.
optuna_kwargs = {
    "n_trials": 2,        # only 2 trials: a smoke-test budget; raise it for a real search
    "cv_folds": 5,         # presumably cross-validation folds per trial -- confirm in utils
    "scoring": "accuracy",
    "random_state": SEED
}

In [None]:
# Tuned experiment: same evaluation splits as the baseline, but with a
# param_space so the manager searches hyperparameters before evaluating.
# NOTE(review): the name below contains the typo "Optimizaed"; it is a runtime
# string (it appears in the run banner), so rename it deliberately, not in passing.
experiment = Experiment(
    name=f"Optimizaed {CLASSIFICATION_TYPE} classification {MODEL_TYPE} model",
    description=f"{CLASSIFICATION_TYPE} {MODEL_TYPE} with TF-IDF and {MODEL_TYPE} hyperparameter tuning using a comprehensive parameter space.",
    pipeline_factory=pipeline_factory,
    splits=[(X_val, y_val), (X_test, y_test)],
    param_space=param_space,
    split_names=["Validation", "Test"]
)

# optuna_kwargs supplies n_trials, cv_folds, scoring and random_state.
manager.run_experiment(experiment, train_data=(X_train, y_train), **optuna_kwargs)


=== Running Experiment: Optimizaed binary classification LogReg model ===
🏋️ Training model...


Hyperparameter Optimization:   0%|                                                             | 0/2 [00:00<?, ?trial/s][I 2025-07-08 18:49:04,140] A new study created in memory with name: no-name-ceaa8ac1-93cc-4607-a017-1fe806df6712


## Multiclass Classification

In [12]:
# Same import as in the binary section; repeated here, so it is a no-op when
# the notebook is run top to bottom.
from utils import ExperimentManager, Experiment

# Switch the task: from here on the run path, the experiment names and the data
# splits all refer to the multiclass problem.
CLASSIFICATION_TYPE = "multiclass"

# Rebinds `manager` and the six split variables that the binary section used.
manager = ExperimentManager(f"../runs/{CLASSIFICATION_TYPE}/{MODEL_TYPE}")
X_train, y_train, X_val, y_val, X_test, y_test = get_datasets(CLASSIFICATION_TYPE)

### Baseline with default scikit learn parameters and no optimization

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

def combine_text(X):
    """Join each row's resume and job description into a single string.

    NOTE: this re-defines the ``combine_text`` helper already declared in the
    Experiments section; keep the two definitions identical.

    Args:
        X: DataFrame with ``resume_text`` and ``job_description_text`` columns.
            It is only read, never modified.

    Returns:
        One string per row of the form ``"<resume> [SEP] <job description>"``,
        as returned by ``Series.values``.
    """
    # Blank out missing values before stringifying: astype(str) alone turns
    # NaN/None into the literal tokens "nan"/"None", which the downstream
    # TF-IDF step would then count as real words.
    resume = X["resume_text"].fillna("").astype(str)
    job_description = X["job_description_text"].fillna("").astype(str)

    return (resume + " [SEP] " + job_description).values

# Baseline pipeline: join the two text columns, TF-IDF vectorise, then fit a
# logistic regression with scikit-learn's default hyperparameters.
def pipeline_factory(params):
    """Build a fresh, untuned baseline pipeline.

    A new Pipeline is constructed on every call so that repeated calls never
    hand back the same (possibly already fitted) estimator object.

    Args:
        params: Ignored; accepted only so the callable has the same
            ``pipeline_factory(params)`` shape as the tuned experiment's factory.

    Returns:
        A new, unfitted ``Pipeline`` with steps ``join -> tfidf -> clf``.
    """
    return Pipeline([
        ("join", FunctionTransformer(combine_text, validate=False)),
        ('tfidf', TfidfVectorizer()),  # Convert text to numeric
        ('clf', LogisticRegression(random_state=SEED))  # seeded from the notebook-wide SEED
    ])


# Unfitted template, kept so the module-level name `pipe` stays defined.
pipe = pipeline_factory(None)

experiment = Experiment(
    name=f"Baseline {CLASSIFICATION_TYPE} {MODEL_TYPE}",
    description=f"{CLASSIFICATION_TYPE} {MODEL_TYPE} with TF-IDF and no hyperparameter tuning.",
    pipeline_factory=pipeline_factory,
    splits=[(X_val, y_val), (X_test, y_test)],
    split_names=["Validation", "Test"]
)

# No param_space is given, so this is a plain fit on the training split
# followed by evaluation on the validation and test splits listed above.
manager.run_experiment(experiment, train_data=(X_train, y_train))


=== Running Experiment: Baseline multiclass LogReg ===
🏋️ Training model...
📊 Evaluating model performance...

--- Validation Evaluation ---
Accuracy: 0.6048
Macro F1: 0.5994
Micro F1: 0.6048

Detailed Classification Report:
                 precision     recall   f1-score    support

       Good Fit     0.6341     0.7559     0.6897        463
         No Fit     0.5991     0.5875     0.5932        463
  Potential Fit     0.5692     0.4708     0.5154        463

       accuracy                           0.6048       1389
      macro avg     0.6008     0.6048     0.5994       1389
   weighted avg     0.6008     0.6048     0.5994       1389

Confusion Matrix:
[[350  60  53]
 [116 218 129]
 [ 86 105 272]]

--- Test Evaluation ---
Accuracy: 0.4324
Macro F1: 0.4317
Micro F1: 0.4324

Detailed Classification Report:
                 precision     recall   f1-score    support

       Good Fit     0.4559     0.4077     0.4304        444
         No Fit     0.3982     0.3964     0.3973        4

<utils.ExperimentManger.Experiment at 0x7f4e264afd70>