In [1]:
import sys
sys.path.append('..')

# Data Loading

In [2]:
from pathlib import Path
import pandas as pd

def load_datasets(data_root: str | Path = "data",
                  tasks: tuple[str, ...] = ("binary", "multiclass"),
                  splits: tuple[str, ...] = ("train", "val", "test")) -> dict:

    data_root = Path(data_root)
    datasets  = {}

    for task in tasks:
        task_dir     = data_root / task
        task_dict    = {}

        for split in splits:
            split_dict = {}
            for kind in ("X", "y"):
                file_path = task_dir / f"{kind}_{split}.pkl"
                split_dict[kind] = pd.read_pickle(file_path)
            task_dict[split] = split_dict

        datasets[task] = task_dict

    return datasets


# ── usage ────────────────────────────────────────────────────────
# Load every task/split once; later cells read from this module-level dict.
datasets = load_datasets("../data")

# quick sanity-check: print the size of each split and fail fast when the
# features and labels of a split do not line up.
# The loop names are deliberately distinct from the `splits` tuple that later
# cells build, so re-running this cell cannot overwrite it.
for task_name, task_splits in datasets.items():
    for split_name, split_data in task_splits.items():
        n_rows, n_labels = split_data['X'].shape[0], len(split_data['y'])
        assert n_rows == n_labels, (
            f"{task_name}/{split_name}: {n_rows} rows in X but {n_labels} labels in y"
        )
        print(f"{task_name:<10} {split_name:<5}  X shape = {split_data['X'].shape},  y len = {n_labels}")

binary     train  X shape = (4336, 2),  y len = 4336
binary     val    X shape = (1860, 2),  y len = 1860
binary     test   X shape = (1714, 2),  y len = 1714
multiclass train  X shape = (3237, 2),  y len = 3237
multiclass val    X shape = (1389, 2),  y len = 1389
multiclass test   X shape = (1332, 2),  y len = 1332


# Experiments

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Short model tag; interpolated into the run directory path and experiment names.
MODEL_TYPE = "LogReg"
# Seed reused for the classifier's random_state and the hyperparameter search settings.
SEED = 42

In [5]:
def get_datasets(kind: str):
    """Unpack one task's splits from the module-level ``datasets`` dict.

    Args:
        kind: Task key in ``datasets`` (this notebook uses "binary" and
            "multiclass").

    Returns:
        The 6-tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)``.

    Raises:
        KeyError: If ``kind`` or any expected split/part key is missing.
    """
    per_split = datasets[kind]
    return tuple(
        per_split[split_name][part]
        for split_name in ("train", "val", "test")
        for part in ("X", "y")
    )


def combine_text(X):
    """Join the resume and job-description columns into one string per row.

    Missing values are replaced by an empty string before joining; a plain
    ``astype(str)`` would turn them into the literal tokens "nan"/"None",
    which would then leak into the TF-IDF vocabulary.

    Args:
        X: DataFrame with "resume_text" and "job_description_text" columns.
            It is not modified.

    Returns:
        Array with one "<resume> [SEP] <job description>" string per row.
    """
    resume = X["resume_text"].fillna("").astype(str)
    job = X["job_description_text"].fillna("").astype(str)

    return (resume + " [SEP] " + job).values

## Binary Classification

In [6]:
from utils import ExperimentManager, Experiment

# Task selector: key into `datasets` and part of the run path / experiment names.
CLASSIFICATION_TYPE = "binary"

# First argument is a path under ../runs built from the task and model tags
# (presumably the run output directory — confirm against utils.ExperimentManager).
# Second argument lists the class names; presumably used for reporting — confirm.
manager = ExperimentManager(f"../runs/{CLASSIFICATION_TYPE}/{MODEL_TYPE}", ["Fit", "Not Fit"])
X_train, y_train, X_val, y_val, X_test, y_test = get_datasets(CLASSIFICATION_TYPE)

# (X, y) pairs in train, val, test order; passed as `splits=` to manager.run_experiment.
splits = ((X_train, y_train), (X_val, y_val), (X_test, y_test))

### Baseline with default scikit-learn parameters and no optimization

In [7]:
def pipeline_factory(params):
    """Return a fresh, untuned baseline pipeline.

    A new ``Pipeline`` is built on every call so that no fitted state is
    shared between calls (the previous version handed out one module-level
    instance every time).

    Args:
        params: Ignored; the baseline always uses scikit-learn defaults. The
            argument is kept so the factory has the same signature as the
            tuned factories.

    Returns:
        An unfitted join -> TF-IDF -> logistic-regression pipeline.
    """
    return Pipeline([
        ("join", FunctionTransformer(combine_text, validate=False)),  # merge the two text columns
        ("tfidf", TfidfVectorizer()),  # Convert text to numeric
        ("clf", LogisticRegression(random_state=SEED)),  # Logistic Regression model
    ])


experiment = Experiment(
    name=f"Baseline {CLASSIFICATION_TYPE} {MODEL_TYPE}",
    description=f"{CLASSIFICATION_TYPE} {MODEL_TYPE} with TF-IDF and no hyperparameter tuning.",
    pipeline_factory=pipeline_factory
)

# No tuning kwargs are passed for the baseline run.
manager.run_experiment(experiment, splits=splits)


=== Running Experiment: Baseline binary LogReg ===

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,714
   Classes: 2
   Overall Accuracy: 0.6004

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.6000
   Micro F1:     0.6004
   Weighted F1:  0.6000

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.6007  R: 0.6004
   Micro    - P: 0.6004  R: 0.6004
   Weighted - P: 0.6007  R: 0.6004

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Fit                  0.5951     0.6278     0.6110        857
   Not Fit              0.6062     0.5729     0.5891        857
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.6007     0.6004     0.6000       1714
   weighted avg         0.6007     0.6004     0.6000       1714

🔢 CONFUSION MATRIX
   Rows: True Labels, Columns: Predicted Labels
   Predicted →
   True ↓        Fit  Not Fit 
   Fit     

<utils.ExperimentManger.Experiment at 0x7f827d6fde80>

### Optimized L1 penalty parameter space

In [None]:
def moderate_l1_param_space(trial):
    """Sample one configuration from the moderate L1 (Lasso) search space.

    The penalty is pinned to L1, which favours sparse models and implicit
    feature selection; everything else is drawn from ``trial``.

    Args:
        trial: Object exposing ``suggest_categorical`` and ``suggest_float``.

    Returns:
        Dict keyed by ``<step>__<param>`` names for the "tfidf" and "clf"
        pipeline steps.
    """
    pick = trial.suggest_categorical

    # TF-IDF: More options
    vectorizer_space = {
        'tfidf__ngram_range': pick('tfidf__ngram_range',
                                   [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3)]),
        'tfidf__max_features': pick('tfidf__max_features',
                                    [5000, 10000, 15000, 20000, 25000]),
        'tfidf__min_df': pick('tfidf__min_df', [1, 2, 3, 5, 0.01]),
        'tfidf__max_df': pick('tfidf__max_df', [0.8, 0.85, 0.9, 0.95]),
        'tfidf__sublinear_tf': pick('tfidf__sublinear_tf', [True, False]),
        'tfidf__stop_words': pick('tfidf__stop_words', ['english', None]),
        'tfidf__use_idf': pick('tfidf__use_idf', [True, False]),
        'tfidf__norm': pick('tfidf__norm', ['l1', 'l2', None]),
    }

    # Logistic Regression: L1 penalty specific
    classifier_space = {
        'clf__penalty': 'l1',  # Fixed to L1
        'clf__C': trial.suggest_float('clf__C', 0.01, 100.0, log=True),
        'clf__solver': pick('clf__solver', ['liblinear', 'saga']),  # Only L1-compatible solvers
        'clf__class_weight': pick('clf__class_weight', [None, 'balanced']),
        'clf__max_iter': pick('clf__max_iter', [500, 1000, 2000]),
        'clf__random_state': 42,
    }

    return {**vectorizer_space, **classifier_space}

In [9]:
def pipeline_factory(params):
    """Build a new join -> TF-IDF -> logistic-regression pipeline.

    Args:
        params: Mapping of ``<step>__<param>`` names to values, applied with
            ``Pipeline.set_params``. A falsy value (``None`` or ``{}``)
            leaves the defaults untouched.

    Returns:
        The unfitted pipeline.
    """
    steps = [
        ("join", FunctionTransformer(combine_text, validate=False)),
        ("tfidf", TfidfVectorizer()),
        ("clf", LogisticRegression(random_state=SEED)),
    ]
    model = Pipeline(steps)

    # Nothing to override: hand back the default configuration.
    if not params:
        return model

    model.set_params(**params)
    return model

In [10]:
# Extra keyword arguments unpacked into manager.run_experiment for the tuned run.
optuna_kwargs = {
    "n_trials": 50,        # number of trials in the search (the run log reports "Total trials: 50")
    "cv_folds": 5,         # NOTE(review): the run log says "Custom Val Split", so this may be unused when a val split is passed — confirm
    "scoring": "accuracy", # presumably the metric the search optimises — confirm against ExperimentManager
    "random_state": SEED,
    "optimise":True        # presumably switches the hyperparameter search on — confirm
}

In [11]:
# Tuned binary run: the L1-only search space combined with the parameterised
# pipeline factory. Plain string literals are used because nothing is interpolated.
experiment = Experiment(
    name="Optimized model LogReg model with L1 penalty",
    description="An optimized model with L1 penalty fixed",
    pipeline_factory=pipeline_factory,
    param_space=moderate_l1_param_space,
)

# Search settings come from `optuna_kwargs`; the call's return value is the cell output.
manager.run_experiment(experiment, splits=splits, **optuna_kwargs)


=== Running Experiment: Optimized model LogReg model with L1 penalty ===


Hyperparameter Optimization (Custom Val Split):   0%|                                         | 0/50 [00:00<?, ?trial/s][I 2025-07-10 13:38:56,757] A new study created in memory with name: no-name-af31a46e-5484-43f5-9010-2fa52f6bc22f
Hyperparameter Optimization (Custom Val Split):   2%| | 1/50 [00:14<11:50, 14.51s/trial, Train: 0.7913 | Val: 0.7204 | B[I 2025-07-10 13:39:11,271] Trial 0 finished with value: 0.7204301075268817 and parameters: {'tfidf__ngram_range': (2, 3), 'tfidf__max_features': 10000, 'tfidf__min_df': 3, 'tfidf__max_df': 0.9, 'tfidf__sublinear_tf': False, 'tfidf__stop_words': None, 'tfidf__use_idf': False, 'tfidf__norm': None, 'clf__C': 0.30151391918266573, 'clf__solver': 'liblinear', 'clf__class_weight': None, 'clf__max_iter': 1000}. Best is trial 0 with value: 0.7204301075268817.
Hyperparameter Optimization (Custom Val Split):   4%| | 2/50 [00:26<10:14, 12.81s/trial, Train: 0.5000 | Val: 0.5000 | B[I 2025-07-10 13:39:22,882] Trial 1 finished with value: 0.5 and param


🎯 Optimization completed using Custom Val Split!
   Best score: 0.7812
   Total trials: 50
🔧 Training final model with best parameters...
✅ Training complete!
📊 Logging optimization summary...
✅ Optimization summary logged!

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,714
   Classes: 2
   Overall Accuracy: 0.6254

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.6246
   Micro F1:     0.6254
   Weighted F1:  0.6246

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.6266  R: 0.6254
   Micro    - P: 0.6254  R: 0.6254
   Weighted - P: 0.6266  R: 0.6254

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Fit                  0.6145     0.6733     0.6425        857
   Not Fit              0.6387     0.5776     0.6066        857
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.6266     0.6254     0.6246       1714
   weighted avg 

<utils.ExperimentManger.Experiment at 0x7f827d777830>

## Multiclass Classification

In [12]:
from utils import ExperimentManager, Experiment

# Task selector: key into `datasets` and part of the run path / experiment names.
# Rebinding it (and `manager` / `splits` below) switches every later cell to the
# multiclass task.
CLASSIFICATION_TYPE = "multiclass"

# First argument is a path under ../runs built from the task and model tags
# (presumably the run output directory — confirm against utils.ExperimentManager).
# NOTE(review): the evaluation report lists the classes as Good Fit, No Fit,
# Potential Fit, which differs from the order given here — confirm how
# ExperimentManager maps these names onto the labels.
manager = ExperimentManager(f"../runs/{CLASSIFICATION_TYPE}/{MODEL_TYPE}", ["Good Fit", "Potential Fit", "No Fit"])
X_train, y_train, X_val, y_val, X_test, y_test = get_datasets(CLASSIFICATION_TYPE)

# (X, y) pairs in train, val, test order; passed as `splits=` to manager.run_experiment.
splits = ((X_train, y_train), (X_val, y_val), (X_test, y_test))

### Baseline with default scikit-learn parameters and no optimization

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): this repeats the `combine_text` defined earlier in the notebook
# and silently replaces it from here on; keep the two in sync or delete this copy.
def combine_text(X):
    """Join the resume and job-description columns into one string per row.

    Missing values are replaced by an empty string before joining; a plain
    ``astype(str)`` would turn them into the literal tokens "nan"/"None",
    which would then leak into the TF-IDF vocabulary.

    Args:
        X: DataFrame with "resume_text" and "job_description_text" columns.
            It is not modified.

    Returns:
        Array with one "<resume> [SEP] <job description>" string per row.
    """
    resume = X["resume_text"].fillna("").astype(str)
    job = X["job_description_text"].fillna("").astype(str)

    return (resume + " [SEP] " + job).values

def pipeline_factory(params):
    """Return a fresh, untuned baseline pipeline.

    A new ``Pipeline`` is built on every call so that no fitted state is
    shared between calls (the previous version handed out one module-level
    instance every time).

    Args:
        params: Ignored; the baseline always uses scikit-learn defaults. The
            argument is kept so the factory has the same signature as the
            tuned factories.

    Returns:
        An unfitted join -> TF-IDF -> logistic-regression pipeline.
    """
    return Pipeline([
        ("join", FunctionTransformer(combine_text, validate=False)),  # merge the two text columns
        ("tfidf", TfidfVectorizer()),  # Convert text to numeric
        ("clf", LogisticRegression(random_state=SEED)),  # Logistic Regression model
    ])


experiment = Experiment(
    name=f"Baseline {CLASSIFICATION_TYPE} {MODEL_TYPE}",
    description=f"{CLASSIFICATION_TYPE} {MODEL_TYPE} with TF-IDF and no hyperparameter tuning.",
    pipeline_factory=pipeline_factory
)

# No tuning kwargs are passed for the baseline run.
manager.run_experiment(experiment, splits=splits)


=== Running Experiment: Baseline multiclass LogReg ===

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,332
   Classes: 3
   Overall Accuracy: 0.4324

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.4317
   Micro F1:     0.4324
   Weighted F1:  0.4317

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.4328  R: 0.4324
   Micro    - P: 0.4324  R: 0.4324
   Weighted - P: 0.4328  R: 0.4324

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Good Fit             0.4559     0.4077     0.4304        444
   No Fit               0.3982     0.3964     0.3973        444
   Potential Fit        0.4442     0.4932     0.4674        444
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.4328     0.4324     0.4317       1332
   weighted avg         0.4328     0.4324     0.4317       1332

🔢 CONFUSION MATRIX
   Rows: True Labels, Columns: Predic

<utils.ExperimentManger.Experiment at 0x7f826ed0f590>

### Baseline Optimized Classification

A baseline parameter space to compare other parameter spaces against

In [15]:
def baseline_param_space(trial):
    """Sample one configuration from the small reference search space.

    Only a handful of TF-IDF and logistic-regression settings are varied, so
    the space is quick to explore; the penalty, solver and iteration cap are
    fixed.

    Args:
        trial: Object exposing ``suggest_categorical`` and ``suggest_float``.

    Returns:
        Dict keyed by ``<step>__<param>`` names for the "tfidf" and "clf"
        pipeline steps.
    """
    pick = trial.suggest_categorical
    space = {}

    # Essential TF-IDF parameters
    space['tfidf__ngram_range'] = pick('tfidf__ngram_range', [(1, 1), (1, 2)])
    space['tfidf__max_features'] = pick('tfidf__max_features', [5000, 10000, 20000])
    space['tfidf__min_df'] = pick('tfidf__min_df', [2, 5])
    space['tfidf__max_df'] = pick('tfidf__max_df', [0.9, 0.95])

    # Simple LogReg: only C and the class weighting are searched
    space['clf__C'] = trial.suggest_float('clf__C', 0.1, 10.0, log=True)
    space['clf__penalty'] = 'l2'
    space['clf__solver'] = 'lbfgs'
    space['clf__class_weight'] = pick('clf__class_weight', [None, 'balanced'])
    space['clf__max_iter'] = 1000

    return space

In [16]:
def pipeline_factory(params):
    """Build a new join -> TF-IDF -> logistic-regression pipeline.

    Args:
        params: Mapping of ``<step>__<param>`` names to values, applied with
            ``Pipeline.set_params``. A falsy value (``None`` or ``{}``)
            leaves the defaults untouched.

    Returns:
        The unfitted pipeline.
    """
    model = Pipeline([
        ("join", FunctionTransformer(combine_text, validate=False)),
        ("tfidf", TfidfVectorizer()),
        ("clf", LogisticRegression(random_state=SEED)),
    ])

    # set_params returns the pipeline itself, so both branches yield `model`.
    return model.set_params(**params) if params else model

In [17]:
# Extra keyword arguments unpacked into manager.run_experiment for the tuned run.
optuna_kwargs = {
    "n_trials": 50,        # number of trials in the search (the run log reports "Total trials: 50")
    "cv_folds": 5,         # NOTE(review): the run log says "Custom Val Split", so this may be unused when a val split is passed — confirm
    "scoring": "accuracy", # presumably the metric the search optimises — confirm against ExperimentManager
    "random_state": SEED,
    "optimise":True        # presumably switches the hyperparameter search on — confirm
}

In [18]:
# Tuned multiclass run: the small L2 reference space combined with the
# parameterised pipeline factory. Plain string literals are used because
# nothing is interpolated.
experiment = Experiment(
    name="Optimized LogReg model with L2 penalty",
    description="An optimized model with L2 penalty fixed",
    pipeline_factory=pipeline_factory,
    param_space=baseline_param_space,
)

# Search settings come from `optuna_kwargs`; the call's return value is the cell output.
manager.run_experiment(experiment, splits=splits, **optuna_kwargs)


=== Running Experiment: Optimized LogReg model with L2 penalty ===


Hyperparameter Optimization (Custom Val Split):   0%|                                         | 0/50 [00:00<?, ?trial/s][I 2025-07-10 21:31:18,885] A new study created in memory with name: no-name-ebbe4140-4cf4-4a1b-a8de-0f3bc40ce38e
Hyperparameter Optimization (Custom Val Split):   2%| | 1/50 [00:03<03:07,  3.82s/trial, Train: 0.6769 | Val: 0.5803 | B[I 2025-07-10 21:31:22,708] Trial 0 finished with value: 0.5802735781137509 and parameters: {'tfidf__ngram_range': (1, 1), 'tfidf__max_features': 10000, 'tfidf__min_df': 5, 'tfidf__max_df': 0.95, 'clf__C': 0.3132450703202545, 'clf__class_weight': 'balanced'}. Best is trial 0 with value: 0.5802735781137509.
Hyperparameter Optimization (Custom Val Split):   4%| | 2/50 [00:11<05:02,  6.30s/trial, Train: 0.6555 | Val: 0.5738 | B[I 2025-07-10 21:31:30,754] Trial 1 finished with value: 0.5737940964722822 and parameters: {'tfidf__ngram_range': (1, 2), 'tfidf__max_features': 20000, 'tfidf__min_df': 5, 'tfidf__max_df': 0.9, 'clf__C': 0.12409125988


🎯 Optimization completed using Custom Val Split!
   Best score: 0.6659
   Total trials: 50
🔧 Training final model with best parameters...
✅ Training complete!
📊 Logging optimization summary...
✅ Optimization summary logged!

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,332
   Classes: 3
   Overall Accuracy: 0.4264

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.4218
   Micro F1:     0.4264
   Weighted F1:  0.4218

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.4325  R: 0.4264
   Micro    - P: 0.4264  R: 0.4264
   Weighted - P: 0.4325  R: 0.4264

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Good Fit             0.4671     0.3041     0.3683        444
   No Fit               0.3966     0.4662     0.4286        444
   Potential Fit        0.4338     0.5090     0.4684        444
   ---------------- ---------- ---------- ---------- ----------
   macro avg    

<utils.ExperimentManger.Experiment at 0x7f827d626360>