In [1]:
import sys
# Put the parent directory on the import path; presumably that is where the
# project-local `utils` package imported further down lives — TODO confirm.
sys.path.append('..')

# Data Loading

In [2]:
from pathlib import Path
import pandas as pd

def load_datasets(data_root: str | Path = "data",
                  tasks: tuple[str, ...] = ("binary", "multiclass"),
                  splits: tuple[str, ...] = ("train", "val", "test")) -> dict:

    data_root = Path(data_root)
    datasets  = {}

    for task in tasks:
        task_dir     = data_root / task
        task_dict    = {}

        for split in splits:
            split_dict = {}
            for kind in ("X", "y"):
                file_path = task_dir / f"{kind}_{split}.pkl"
                split_dict[kind] = pd.read_pickle(file_path)
            task_dict[split] = split_dict

        datasets[task] = task_dict

    return datasets


# ── usage ────────────────────────────────────────────────────────
# Load every task/split once; later cells read from this module-level dict.
datasets = load_datasets("../data")

# Quick sanity check: print each split's X shape next to its y length so a
# row-count mismatch is visible at a glance.
# The loop variables are deliberately NOT named `splits`: that name is bound
# further down to the (train, val, test) tuple passed to `run_experiment`, and
# re-running this cell would otherwise silently replace it with a dict.
for task_name, task_splits in datasets.items():
    for split_name, split_data in task_splits.items():
        print(f"{task_name:<10} {split_name:<5}  X shape = {split_data['X'].shape},  y len = {len(split_data['y'])}")

binary     train  X shape = (4336, 2),  y len = 4336
binary     val    X shape = (1860, 2),  y len = 1860
binary     test   X shape = (1714, 2),  y len = 1714
multiclass train  X shape = (3237, 2),  y len = 3237
multiclass val    X shape = (1389, 2),  y len = 1389
multiclass test   X shape = (1332, 2),  y len = 1332


# Experiments

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Model family tag; interpolated into the run directory and experiment names below.
MODEL_TYPE = "LogReg"
# Shared seed: passed to LogisticRegression(random_state=...) and to the search kwargs.
SEED = 42

In [5]:
def get_datasets(kind: str):
    """Unpack one task's splits from the module-level ``datasets`` dict.

    Args:
        kind: Top-level key of ``datasets`` (e.g. ``"binary"``).

    Returns:
        The tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)``.

    Raises:
        KeyError: If ``kind`` or one of the expected split / ``"X"`` / ``"y"``
            keys is missing.
    """
    task_splits = datasets[kind]
    return tuple(
        task_splits[split_name][part]
        for split_name in ("train", "val", "test")
        for part in ("X", "y")
    )


def combine_text(X):
    """Join each row's resume and job description into a single string.

    Args:
        X: DataFrame with ``resume_text`` and ``job_description_text`` columns.

    Returns:
        The ``.values`` of the concatenated Series: one
        ``"<resume> [SEP] <job description>"`` string per input row.

    Raises:
        KeyError: If either expected column is missing.
    """
    # `astype` and `+` both build new Series, so the caller's frame is never
    # mutated and no defensive copy of the whole DataFrame is needed.
    resume = X["resume_text"].astype(str)
    job_description = X["job_description_text"].astype(str)
    return (resume + " [SEP] " + job_description).values

## Binary Classification

In [6]:
from utils import ExperimentManager, Experiment

# Task key: selects the sub-dict of `datasets` and is interpolated into run paths and names.
CLASSIFICATION_TYPE = "binary"

# First argument looks like the output directory for this task/model; the second is
# the list of class display names.
# NOTE(review): the names are assumed to follow the label-encoding order — confirm.
manager = ExperimentManager(f"../runs/{CLASSIFICATION_TYPE}/{MODEL_TYPE}", ["Fit", "Not Fit"])
X_train, y_train, X_val, y_val, X_test, y_test = get_datasets(CLASSIFICATION_TYPE)

# (train, val, test) pairs, in the order later handed to `manager.run_experiment`.
splits = ((X_train, y_train), (X_val, y_val), (X_test, y_test))

### Baseline with default scikit-learn parameters and no optimization

In [7]:
# Baseline pipeline: join the two text columns, vectorise with a default
# TF-IDF, then fit a default LogisticRegression.
pipe = Pipeline([
    ("join", FunctionTransformer(combine_text, validate=False)),  # DataFrame -> one string per row
    ("tfidf", TfidfVectorizer()),  # Convert text to numeric
    # Seeded from the shared SEED constant (instead of a second hard-coded 42)
    # so the baseline and the tuned pipelines cannot drift apart.
    ("clf", LogisticRegression(random_state=SEED)),  # Logistic Regression model
])

def pipeline_factory(params):
    """Return the module-level baseline ``pipe``; ``params`` is ignored.

    Every call hands back the same object rather than a fresh pipeline.
    """
    # Since we're not using params here, we just return the static pipeline
    return pipe

# Describe the un-tuned baseline run; no `param_space` is passed for it.
experiment = Experiment(
    name=f"Baseline {CLASSIFICATION_TYPE} {MODEL_TYPE}",
    description=f"{CLASSIFICATION_TYPE} {MODEL_TYPE} with TF-IDF and no hyperparameter tuning.",
    pipeline_factory=pipeline_factory
)

# `splits` is the (train, val, test) tuple built in the setup cell above.
manager.run_experiment(experiment, splits=splits)


=== Running Experiment: Baseline binary LogReg ===

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,714
   Classes: 2
   Overall Accuracy: 0.6004

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.6000
   Micro F1:     0.6004
   Weighted F1:  0.6000

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.6007  R: 0.6004
   Micro    - P: 0.6004  R: 0.6004
   Weighted - P: 0.6007  R: 0.6004

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Fit                  0.5951     0.6278     0.6110        857
   Not Fit              0.6062     0.5729     0.5891        857
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.6007     0.6004     0.6000       1714
   weighted avg         0.6007     0.6004     0.6000       1714

🔢 CONFUSION MATRIX
   Rows: True Labels, Columns: Predicted Labels
   Predicted →
   True ↓        Fit  Not Fit 
   Fit     

<utils.ExperimentManger.Experiment at 0x7f718a88e9f0>

### Optimized L1 penalty parameter space

In [7]:
def moderate_l1_param_space(trial):
    """
    Search space for a logistic regression pinned to the L1 (Lasso) penalty.

    Values are requested from ``trial`` in a fixed order: all TF-IDF settings
    first, then the classifier settings. They are merged into one flat dict
    keyed by ``<pipeline step>__<parameter>``.

    Args:
        trial: Object providing ``suggest_categorical(name, choices)`` and
            ``suggest_float(name, low, high, log=...)``.

    Returns:
        dict: The sampled TF-IDF and classifier settings plus the fixed
        entries ``clf__penalty='l1'`` and ``clf__random_state=42``.
    """
    choose = trial.suggest_categorical

    # TF-IDF: vocabulary, weighting and normalisation options
    tfidf_space = {
        'tfidf__ngram_range': choose('tfidf__ngram_range', [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3)]),
        'tfidf__max_features': choose('tfidf__max_features', [5000, 10000, 15000, 20000, 25000]),
        'tfidf__min_df': choose('tfidf__min_df', [1, 2, 3, 5, 0.01]),
        'tfidf__max_df': choose('tfidf__max_df', [0.8, 0.85, 0.9, 0.95]),
        'tfidf__sublinear_tf': choose('tfidf__sublinear_tf', [True, False]),
        'tfidf__stop_words': choose('tfidf__stop_words', ['english', None]),
        'tfidf__use_idf': choose('tfidf__use_idf', [True, False]),
        'tfidf__norm': choose('tfidf__norm', ['l1', 'l2', None]),
    }

    # Logistic Regression: penalty is fixed to L1, so only L1-compatible solvers are offered
    clf_space = {
        'clf__penalty': 'l1',
        'clf__C': trial.suggest_float('clf__C', 0.01, 100.0, log=True),
        'clf__solver': choose('clf__solver', ['liblinear', 'saga']),
        'clf__class_weight': choose('clf__class_weight', [None, 'balanced']),
        'clf__max_iter': choose('clf__max_iter', [500, 1000, 2000]),
        'clf__random_state': 42,
    }

    return {**tfidf_space, **clf_space}

In [8]:
def pipeline_factory(params):
    """Build a new join -> TF-IDF -> LogisticRegression pipeline.

    Args:
        params: Mapping of ``<step>__<param>`` names to values, forwarded to
            ``Pipeline.set_params``. A falsy value (``None`` or ``{}``) leaves
            every step at its defaults.

    Returns:
        A newly constructed ``Pipeline``; each call creates a separate object.
    """
    steps = [
        ("join", FunctionTransformer(combine_text, validate=False)),
        ("tfidf", TfidfVectorizer()),
        ("clf", LogisticRegression(random_state=SEED)),
    ]
    model = Pipeline(steps)

    # Nothing to override: hand back the default configuration.
    if not params:
        return model

    model.set_params(**params)
    return model

In [9]:
# Keyword arguments unpacked into `manager.run_experiment` for the tuned runs.
optuna_kwargs = {
    "n_trials": 10,        # Number of search trials (the run log reports "Total trials: 10")
    "cv_folds": 5,         # NOTE(review): the run log says "Custom Val Split" — confirm cv_folds is used when a val split is supplied
    "scoring": "accuracy",
    "random_state": SEED,
    "optimise":True
}

In [10]:
# Tuned run with the penalty fixed to L1: `moderate_l1_param_space` defines what is
# sampled; `pipeline_factory` presumably builds the pipeline for each sampled
# parameter set — confirm against `utils`.
experiment = Experiment(
    name=f"Optimized model LogReg model with L1 penalty",
    description=f"An optimized model with L1 penalty fixed",
    pipeline_factory=pipeline_factory,
    param_space=moderate_l1_param_space,
)

# `optuna_kwargs` carries the trial count, scoring metric and seed for the search.
manager.run_experiment(experiment, splits=splits, **optuna_kwargs)


=== Running Experiment: Optimized model LogReg model with L1 penalty ===


Hyperparameter Optimization (Custom Val Split):   0%|                                         | 0/10 [00:00<?, ?trial/s][I 2025-07-10 12:41:51,623] A new study created in memory with name: no-name-144aa387-4e6e-4b91-a3c9-c5a1120cb88d
Hyperparameter Optimization (Custom Val Split):  10%| | 1/10 [00:12<01:49, 12.12s/trial, Train: 0.5000 | Val: 0.5000 | B[I 2025-07-10 12:42:03,746] Trial 0 finished with value: 0.5 and parameters: {'tfidf__ngram_range': (2, 3), 'tfidf__max_features': 10000, 'tfidf__min_df': 5, 'tfidf__max_df': 0.8, 'tfidf__sublinear_tf': True, 'tfidf__stop_words': 'english', 'tfidf__use_idf': True, 'tfidf__norm': 'l1', 'clf__C': 0.01711449083472073, 'clf__solver': 'liblinear', 'clf__class_weight': None, 'clf__max_iter': 1000}. Best is trial 0 with value: 0.5.
Hyperparameter Optimization (Custom Val Split):  20%|▏| 2/10 [01:05<04:50, 36.37s/trial, Train: 0.7315 | Val: 0.6758 | B[I 2025-07-10 12:42:57,090] Trial 1 finished with value: 0.6758064516129032 and parameters: {'tfi


🎯 Optimization completed using Custom Val Split!
   Best score: 0.7242
   Total trials: 10
🔧 Training final model with best parameters...
✅ Training complete!
📊 Logging optimization summary...
✅ Optimization summary logged!

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,714
   Classes: 2
   Overall Accuracy: 0.5951

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.5946
   Micro F1:     0.5951
   Weighted F1:  0.5946

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.5955  R: 0.5951
   Micro    - P: 0.5951  R: 0.5951
   Weighted - P: 0.5955  R: 0.5951

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Fit                  0.6020     0.5613     0.5809        857
   Not Fit              0.5891     0.6289     0.6084        857
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.5955     0.5951     0.5946       1714
   weighted avg 

<utils.ExperimentManger.Experiment at 0x7f3684343b30>

## Multiclass Classification

In [20]:
from utils import ExperimentManager, Experiment

# Switch to the three-class task. This rebinds `manager`, the X_*/y_* variables and
# `splits`, all of which the binary section also assigned.
CLASSIFICATION_TYPE = "multiclass"

# NOTE(review): class names are assumed to be listed in label-encoding order — confirm.
manager = ExperimentManager(f"../runs/{CLASSIFICATION_TYPE}/{MODEL_TYPE}", ["Good Fit", "Potential Fit", "No Fit"])
X_train, y_train, X_val, y_val, X_test, y_test = get_datasets(CLASSIFICATION_TYPE)

# (train, val, test) pairs, in the order later handed to `manager.run_experiment`.
splits = ((X_train, y_train), (X_val, y_val), (X_test, y_test))

### Baseline with default scikit-learn parameters and no optimization

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

def combine_text(X):
    """Join each row's resume and job description into a single string.

    Args:
        X: DataFrame with ``resume_text`` and ``job_description_text`` columns.

    Returns:
        The ``.values`` of the concatenated Series: one
        ``"<resume> [SEP] <job description>"`` string per input row.

    Raises:
        KeyError: If either expected column is missing.
    """
    # `astype` and `+` both build new Series, so the caller's frame is never
    # mutated and no defensive copy of the whole DataFrame is needed.
    resume = X["resume_text"].astype(str)
    job_description = X["job_description_text"].astype(str)
    return (resume + " [SEP] " + job_description).values

# Baseline pipeline: join the two text columns, vectorise with a default
# TF-IDF, then fit a default LogisticRegression.
pipe = Pipeline([
    ("join", FunctionTransformer(combine_text, validate=False)),  # DataFrame -> one string per row
    ("tfidf", TfidfVectorizer()),  # Convert text to numeric
    # Seeded from the shared SEED constant (instead of a second hard-coded 42)
    # so the baseline and the tuned pipelines cannot drift apart.
    ("clf", LogisticRegression(random_state=SEED)),  # Logistic Regression model
])

def pipeline_factory(params):
    """Return the module-level baseline ``pipe``; ``params`` is ignored.

    Every call hands back the same object rather than a fresh pipeline.
    """
    # Since we're not using params here, we just return the static pipeline
    return pipe

# Same call pattern as every executed experiment in this notebook: the
# Experiment only describes the run, and the (train, val, test) tuple built in
# the multiclass setup cell is handed to `run_experiment` through `splits=`.
# The previous form of this cell passed `splits=`/`split_names=` to Experiment
# and `train_data=` to `run_experiment`; no executed cell uses that form.
experiment = Experiment(
    name=f"Baseline {CLASSIFICATION_TYPE} {MODEL_TYPE}",
    description=f"{CLASSIFICATION_TYPE} {MODEL_TYPE} with TF-IDF and no hyperparameter tuning.",
    pipeline_factory=pipeline_factory,
)

manager.run_experiment(experiment, splits=splits)

### Baseline optimized classification

A baseline parameter space against which the other parameter spaces are compared.

In [21]:
def baseline_param_space(trial):
    """
    Small reference search space for an L2-penalised logistic regression.

    Only a handful of TF-IDF options, the regularisation strength ``C`` and
    the class weighting are sampled from ``trial``. Penalty, solver and
    iteration budget are fixed.

    Args:
        trial: Object providing ``suggest_categorical(name, choices)`` and
            ``suggest_float(name, low, high, log=...)``.

    Returns:
        dict: Flat ``<pipeline step>__<parameter>`` mapping.
    """
    space = {}

    # Essential TF-IDF parameters
    space['tfidf__ngram_range'] = trial.suggest_categorical('tfidf__ngram_range', [(1, 1), (1, 2)])
    space['tfidf__max_features'] = trial.suggest_categorical('tfidf__max_features', [5000, 10000, 20000])
    space['tfidf__min_df'] = trial.suggest_categorical('tfidf__min_df', [2, 5])
    space['tfidf__max_df'] = trial.suggest_categorical('tfidf__max_df', [0.9, 0.95])

    # Simple LogReg: only C and the class weighting are searched
    space['clf__C'] = trial.suggest_float('clf__C', 0.1, 10.0, log=True)
    space['clf__penalty'] = 'l2'
    space['clf__solver'] = 'lbfgs'
    space['clf__class_weight'] = trial.suggest_categorical('clf__class_weight', [None, 'balanced'])
    space['clf__max_iter'] = 1000

    return space

In [22]:
def pipeline_factory(params):
    """Build a new join -> TF-IDF -> LogisticRegression pipeline.

    Args:
        params: Mapping of ``<step>__<param>`` names to values, forwarded to
            ``Pipeline.set_params``. A falsy value (``None`` or ``{}``) leaves
            every step at its defaults.

    Returns:
        A newly constructed ``Pipeline``; each call creates a separate object.
    """
    steps = [
        ("join", FunctionTransformer(combine_text, validate=False)),
        ("tfidf", TfidfVectorizer()),
        ("clf", LogisticRegression(random_state=SEED)),
    ]
    model = Pipeline(steps)

    # Nothing to override: hand back the default configuration.
    if not params:
        return model

    model.set_params(**params)
    return model

In [23]:
# Keyword arguments unpacked into `manager.run_experiment` for the tuned runs.
optuna_kwargs = {
    "n_trials": 10,        # Number of search trials (the progress bar in the run log counts to 10)
    "cv_folds": 5,         # NOTE(review): the run log says "Custom Val Split" — confirm cv_folds is used when a val split is supplied
    "scoring": "accuracy",
    "random_state": SEED,
    "optimise":True
}

In [None]:
# Tuned run with the penalty fixed to L2: `baseline_param_space` defines what is
# sampled; `pipeline_factory` presumably builds the pipeline for each sampled
# parameter set — confirm against `utils`.
experiment = Experiment(
    name=f"Optimized LogReg model with L2 penalty",
    description=f"An optimized model with L2 penalty fixed",
    pipeline_factory=pipeline_factory,
    param_space=baseline_param_space,
)

# `optuna_kwargs` carries the trial count, scoring metric and seed for the search.
manager.run_experiment(experiment, splits=splits, **optuna_kwargs)


=== Running Experiment: Optimized LogReg model with L2 penalty ===


Hyperparameter Optimization (Custom Val Split):   0%|                                         | 0/10 [00:00<?, ?trial/s][I 2025-07-10 12:53:56,654] A new study created in memory with name: no-name-1d7d9760-c8e8-47df-9173-edbf73654ec5
