In [1]:
import sys
sys.path.append('..')

# Data Loading

In [2]:
from pathlib import Path
import pandas as pd

def load_datasets(data_root: str | Path = "data",
                  tasks: tuple[str, ...] = ("binary", "multiclass"),
                  splits: tuple[str, ...] = ("train", "val", "test")) -> dict:

    data_root = Path(data_root)
    datasets  = {}

    for task in tasks:
        task_dir     = data_root / task
        task_dict    = {}

        for split in splits:
            split_dict = {}
            for kind in ("X", "y"):
                file_path = task_dir / f"{kind}_{split}.pkl"
                split_dict[kind] = pd.read_pickle(file_path)
            task_dict[split] = split_dict

        datasets[task] = task_dict

    return datasets


# ── usage ────────────────────────────────────────────────────────
datasets = load_datasets("../data")

# quick sanity-check
for task, splits in datasets.items():
    for split, obj in splits.items():
        print(f"{task:<10} {split:<5}  X shape = {obj['X'].shape},  y len = {len(obj['y'])}")

binary     train  X shape = (4336, 2),  y len = 4336
binary     val    X shape = (1860, 2),  y len = 1860
binary     test   X shape = (1714, 2),  y len = 1714
multiclass train  X shape = (3237, 2),  y len = 3237
multiclass val    X shape = (1389, 2),  y len = 1389
multiclass test   X shape = (1332, 2),  y len = 1332


# Experiments

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

In [4]:
MODEL_TYPE = "RandomForest"
SEED = 42

In [5]:
def get_datasets(kind: str):
    dataset = datasets[kind]
    
    X_train = dataset["train"]["X"]
    y_train = dataset["train"]["y"]
    
    X_val = dataset["val"]["X"]
    y_val = dataset["val"]["y"]
    
    X_test = dataset["test"]["X"]
    y_test = dataset["test"]["y"]

    return X_train, y_train, X_val, y_val, X_test, y_test

def combine_text(X):
    X = X.copy() 

    combined = X["resume_text"].astype(str) + " [SEP] " + X["job_description_text"].astype(str)

    return combined.values


## Binary Classification

In [6]:
from utils import ExperimentManager, Experiment

CLASSIFICATION_TYPE = "binary"

manager = ExperimentManager(f"../runs/{CLASSIFICATION_TYPE}/{MODEL_TYPE}", ["Fit", "Not Fit"])
X_train, y_train, X_val, y_val, X_test, y_test = get_datasets(CLASSIFICATION_TYPE)

splits = ((X_train, y_train), (X_val, y_val), (X_test, y_test))

### Baseline with default scikit-learn random forest and no optimization

In [7]:
pipe = Pipeline([
    ("join", FunctionTransformer(combine_text, validate=False)),
    ("vec", TfidfVectorizer()),
    ("clf", RandomForestClassifier(random_state=SEED))
])

def pipeline_factory(params):
    # Since we’re not using params here, we just return the static pipeline
    return pipe

experiment = Experiment(
    name=f"Baseline {CLASSIFICATION_TYPE} {MODEL_TYPE}",
    description=f"{CLASSIFICATION_TYPE} {MODEL_TYPE} with TF-IDF and no hyperparameter tuning.",
    pipeline_factory=pipeline_factory,
)

manager.run_experiment(experiment, splits=splits)



=== Running Experiment: Baseline binary RandomForest ===

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,714
   Classes: 2
   Overall Accuracy: 0.6424

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.6419
   Micro F1:     0.6424
   Weighted F1:  0.6419

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.6431  R: 0.6424
   Micro    - P: 0.6424  R: 0.6424
   Weighted - P: 0.6431  R: 0.6424

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Fit                  0.6326     0.6791     0.6550        857
   Not Fit              0.6537     0.6056     0.6287        857
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.6431     0.6424     0.6419       1714
   weighted avg         0.6431     0.6424     0.6419       1714

🔢 CONFUSION MATRIX
   Rows: True Labels, Columns: Predicted Labels
   Predicted →
   True ↓        Fit  Not Fit 
   Fi

<utils.ExperimentManger.Experiment at 0x7f5343988860>

### Optimized RandomForest Model

In [11]:
def moderate_tfidf_randomforest_param_space(trial):
    """
    Moderate parameter space for TF-IDF + Random Forest
    Good for binary classification with robust ensemble learning
    Optimized for ~50 trials
    """
    params = {
        # TF-IDF: Text vectorization parameters
        'tfidf__ngram_range': trial.suggest_categorical('tfidf__ngram_range',
                                                      [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3)]),
        'tfidf__max_features': trial.suggest_categorical('tfidf__max_features',
                                                       [5000, 10000, 15000, 20000, 25000]),
        'tfidf__min_df': trial.suggest_categorical('tfidf__min_df', [1, 2, 3, 5, 0.01]),
        'tfidf__max_df': trial.suggest_categorical('tfidf__max_df', [0.8, 0.85, 0.9, 0.95]),
        'tfidf__sublinear_tf': trial.suggest_categorical('tfidf__sublinear_tf', [True, False]),
        'tfidf__stop_words': trial.suggest_categorical('tfidf__stop_words', ['english', None]),
        'tfidf__use_idf': trial.suggest_categorical('tfidf__use_idf', [True, False]),
        'tfidf__norm': trial.suggest_categorical('tfidf__norm', ['l1', 'l2', None]),
        
        # Random Forest: Ensemble parameters
        'clf__n_estimators': trial.suggest_categorical('clf__n_estimators', [50, 100, 200, 300, 500]),
        'clf__max_depth': trial.suggest_categorical('clf__max_depth', [5, 10, 15, 20, None]),
        'clf__min_samples_split': trial.suggest_categorical('clf__min_samples_split', [2, 5, 10, 20]),
        'clf__min_samples_leaf': trial.suggest_categorical('clf__min_samples_leaf', [1, 2, 4, 8]),
        'clf__max_features': trial.suggest_categorical('clf__max_features', ['sqrt', 'log2', None, 0.5, 0.8]),
        'clf__bootstrap': trial.suggest_categorical('clf__bootstrap', [True, False]),
        'clf__class_weight': trial.suggest_categorical('clf__class_weight', [None, 'balanced', 'balanced_subsample']),
        'clf__criterion': trial.suggest_categorical('clf__criterion', ['gini', 'entropy']),
        'clf__random_state': 42
    }

    return params

In [12]:
def pipeline_factory(params):
    """Create pipeline with hyperparameters applied"""
    
    # Create base pipeline
    pipe = Pipeline([
        ("join", FunctionTransformer(combine_text, validate=False)),
        ('tfidf', TfidfVectorizer()),
        ('clf', RandomForestClassifier(random_state=SEED))
    ])
    
    # Apply parameters using set_params if any parameters are provided
    if params:
        pipe.set_params(**params)
    
    return pipe

In [13]:
optuna_kwargs = {
    "n_trials": 50,        # Increase from 5 to 20 for better optimization
    "cv_folds": 5,         # Reduce from 10 to 5 for faster training
    "scoring": "accuracy",
    "random_state": SEED,
    "optimise":True
}

In [14]:
experiment = Experiment(
    name=f"Optimized binary RandomForest model",
    description=f"An optimized randomforest model optimized",
    pipeline_factory=pipeline_factory,
    param_space=moderate_tfidf_randomforest_param_space,
)

manager.run_experiment(experiment, splits=splits, **optuna_kwargs)


=== Running Experiment: Optimized binary RandomForest model ===


Hyperparameter Optimization (Custom Val Split):   0%|                                         | 0/50 [00:00<?, ?trial/s][I 2025-07-11 10:29:13,251] A new study created in memory with name: no-name-3b06d75d-ba99-459a-b4f4-68ea289b1bd3
Hyperparameter Optimization (Custom Val Split):   2%| | 1/50 [00:13<11:05, 13.59s/trial, Train: 0.7415 | Val: 0.6645 | B[I 2025-07-11 10:29:26,840] Trial 0 finished with value: 0.6645161290322581 and parameters: {'tfidf__ngram_range': (2, 2), 'tfidf__max_features': 25000, 'tfidf__min_df': 3, 'tfidf__max_df': 0.95, 'tfidf__sublinear_tf': False, 'tfidf__stop_words': None, 'tfidf__use_idf': True, 'tfidf__norm': None, 'clf__n_estimators': 500, 'clf__max_depth': 15, 'clf__min_samples_split': 10, 'clf__min_samples_leaf': 8, 'clf__max_features': 'sqrt', 'clf__bootstrap': False, 'clf__class_weight': 'balanced_subsample', 'clf__criterion': 'entropy'}. Best is trial 0 with value: 0.6645161290322581.
Hyperparameter Optimization (Custom Val Split):   4%| | 2/50 [00:23


🎯 Optimization completed using Custom Val Split!
   Best score: 0.7952
   Total trials: 50
🔧 Training final model with best parameters...
✅ Training complete!
📊 Logging optimization summary...
✅ Optimization summary logged!

🎯 TEST SET EVALUATION RESULTS

📊 OVERVIEW
   Test Samples: 1,714
   Classes: 2
   Overall Accuracy: 0.6937

🎯 MAIN PERFORMANCE METRICS
   Macro F1:     0.6935
   Micro F1:     0.6937
   Weighted F1:  0.6935

📈 PRECISION/RECALL SUMMARY
   Macro    - P: 0.6943  R: 0.6937
   Micro    - P: 0.6937  R: 0.6937
   Weighted - P: 0.6943  R: 0.6937

📋 DETAILED CLASSIFICATION REPORT
   Class             Precision     Recall   F1-Score    Support
   ---------------- ---------- ---------- ---------- ----------
   Fit                  0.6836     0.7211     0.7019        857
   Not Fit              0.7049     0.6663     0.6851        857
   ---------------- ---------- ---------- ---------- ----------
   macro avg            0.6943     0.6937     0.6935       1714
   weighted avg 

<utils.ExperimentManger.Experiment at 0x7f5ad4b752e0>

## Multiclass Classification

In [7]:
from utils import ExperimentManager, Experiment

CLASSIFICATION_TYPE = "multiclass"

manager = ExperimentManager(f"../runs/{CLASSIFICATION_TYPE}/{MODEL_TYPE}", ["Good Fit", "Potential Fit", "No Fit"])
X_train, y_train, X_val, y_val, X_test, y_test = get_datasets(CLASSIFICATION_TYPE)

splits = ((X_train, y_train), (X_val, y_val), (X_test, y_test))

### Baseline with default scikit-learn random forest and no optimization

In [9]:
pipe = Pipeline([
    ("join", FunctionTransformer(combine_text, validate=False)),
    ("vec", TfidfVectorizer()),
    ("clf", RandomForestClassifier(random_state=SEED))
])

def pipeline_factory(params):
    # Since we’re not using params here, we just return the static pipeline
    return pipe

experiment = Experiment(
    name=f"Baseline {CLASSIFICATION_TYPE} {MODEL_TYPE}",
    description=f"{CLASSIFICATION_TYPE} {MODEL_TYPE} with TF-IDF and no hyperparameter tuning.",
    pipeline_factory=pipeline_factory,
    splits=[(X_val, y_val), (X_test, y_test)],
    split_names=["Validation", "Test"]
)

manager.run_experiment(experiment, train_data=(X_train, y_train))



=== Running Experiment: Baseline multiclass RandomForest ===

--- Validation Evaluation ---
Accuracy: 0.6019
Macro F1: 0.5968
Micro F1: 0.6019

Detailed Classification Report:
                 precision     recall   f1-score    support

       Good Fit     0.6245     0.7473     0.6804        463
         No Fit     0.5857     0.5832     0.5844        463
  Potential Fit     0.5882     0.4752     0.5257        463

       accuracy                           0.6019       1389
      macro avg     0.5995     0.6019     0.5968       1389
   weighted avg     0.5995     0.6019     0.5968       1389

Confusion Matrix:
[[346  49  68]
 [120 220 123]
 [ 88 105 270]]

--- Test Evaluation ---
Accuracy: 0.4550
Macro F1: 0.4546
Micro F1: 0.4550

Detailed Classification Report:
                 precision     recall   f1-score    support

       Good Fit     0.4946     0.4122     0.4496        444
         No Fit     0.4072     0.4347     0.4205        444
  Potential Fit     0.4713     0.5180     0.49

<utils.ExperimentManger.Experiment at 0x7f69254d0a70>

### Optimized Multiclass Random Forest Classifier

In [8]:
def moderate_tfidf_randomforest_multiclass_param_space(trial):
    """
    Anti-overfitting parameter space for TF-IDF + Random Forest
    Optimized for 3-class multiclass classification with focus on generalization
    Good for ~30 trials with overfitting control
    """
    params = {
        # TF-IDF: Text vectorization parameters (anti-overfitting focused)
        'tfidf__ngram_range': trial.suggest_categorical('tfidf__ngram_range',
                                                      [(1, 1), (1, 2), (2, 2)]),  # Reduced complexity
        'tfidf__max_features': trial.suggest_int('tfidf__max_features', 5000, 20000),  # Continuous range
        'tfidf__min_df': trial.suggest_int('min_df', 2, 10),  # Higher min_df to reduce noise
        'tfidf__max_df': trial.suggest_float('max_df', 0.7, 0.9),  # More aggressive filtering
        'tfidf__sublinear_tf': trial.suggest_categorical('tfidf__sublinear_tf', [True, False]),
        'tfidf__stop_words': trial.suggest_categorical('tfidf__stop_words', ['english', None]),
        'tfidf__use_idf': True,  # Fixed - IDF helps with generalization
        'tfidf__norm': trial.suggest_categorical('tfidf__norm', ['l1', 'l2']),  # Removed None for regularization
        
        # Random Forest: Anti-overfitting parameters
        'clf__n_estimators': trial.suggest_int('clf__n_estimators', 100, 400),  # Continuous range
        'clf__max_depth': trial.suggest_int('clf__max_depth', 5, 20),  # Limited depth to prevent overfitting
        'clf__min_samples_split': trial.suggest_int('clf__min_samples_split', 5, 25),  # Higher minimum
        'clf__min_samples_leaf': trial.suggest_int('clf__min_samples_leaf', 2, 15),  # Higher minimum
        'clf__max_features': trial.suggest_categorical('clf__max_features', ['sqrt', 'log2', 0.3, 0.5, 0.7]),  # Reduced feature subsets
        'clf__bootstrap': True,  # Fixed - bootstrap helps generalization
        'clf__class_weight': None,  # Fixed - classes are balanced
        'clf__criterion': trial.suggest_categorical('clf__criterion', ['gini', 'entropy']),
        'clf__oob_score': True,  # Enable OOB scoring for overfitting detection
        'clf__random_state': 42,
        
        # Additional anti-overfitting parameters
        'clf__min_weight_fraction_leaf': trial.suggest_float('clf__min_weight_fraction_leaf', 0.0, 0.05),
        'clf__max_leaf_nodes': trial.suggest_int('clf__max_leaf_nodes', 50, 500),
        'clf__min_impurity_decrease': trial.suggest_float('clf__min_impurity_decrease', 0.0, 0.01),
        'clf__max_samples': trial.suggest_float('clf__max_samples', 0.6, 0.9),  # Continuous range
    }
    
    return params

In [9]:
def pipeline_factory(params):
    """Create pipeline with hyperparameters applied"""
    
    # Create base pipeline
    pipe = Pipeline([
        ("join", FunctionTransformer(combine_text, validate=False)),
        ('tfidf', TfidfVectorizer()),
        ('clf', RandomForestClassifier(random_state=SEED))
    ])
    
    # Apply parameters using set_params if any parameters are provided
    if params:
        pipe.set_params(**params)
    
    return pipe

In [10]:
optuna_kwargs = {
    "n_trials": 50,        # Increase from 5 to 20 for better optimization
    "cv_folds": 5,         # Reduce from 10 to 5 for faster training
    "scoring": "accuracy",
    "random_state": SEED,
    "optimise":True
}

In [11]:
experiment = Experiment(
    name=f"Optimized multiclass RandomForest model",
    description=f"An optimized randomforest model for multiclass",
    pipeline_factory=pipeline_factory,
    param_space=moderate_tfidf_randomforest_multiclass_param_space,
)

manager.run_experiment(experiment, splits=splits, **optuna_kwargs)


=== Running Experiment: Optimized multiclass RandomForest model ===


Hyperparameter Optimization (Custom Val Split):   0%|                                         | 0/50 [00:00<?, ?trial/s][I 2025-07-13 12:32:30,549] A new study created in memory with name: no-name-f1d0b3fd-2bbf-446d-a754-02d6401e637e
Hyperparameter Optimization (Custom Val Split):   2%| | 1/50 [00:43<35:49, 43.87s/trial, Train: 0.5765 | Val: 0.5349 | B[I 2025-07-13 12:33:14,419] Trial 0 finished with value: 0.5349172066234701 and parameters: {'tfidf__ngram_range': (1, 1), 'tfidf__max_features': 10142, 'min_df': 7, 'max_df': 0.8831223945291367, 'tfidf__sublinear_tf': False, 'tfidf__stop_words': 'english', 'tfidf__norm': 'l1', 'clf__n_estimators': 166, 'clf__max_depth': 17, 'clf__min_samples_split': 15, 'clf__min_samples_leaf': 6, 'clf__max_features': 0.5, 'clf__criterion': 'gini', 'clf__min_weight_fraction_leaf': 0.04339790066108817, 'clf__max_leaf_nodes': 60, 'clf__min_impurity_decrease': 0.006076267947088519, 'clf__max_samples': 0.7207822639376915}. Best is trial 0 with value: 0.53491


🎯 Optimization completed using Custom Val Split!
   Best score: 0.6328
   Total trials: 50
❌ Experiment 'Optimized multiclass RandomForest model' failed: Invalid parameter 'min_df' for estimator Pipeline(steps=[('join',
                 FunctionTransformer(func=<function combine_text at 0x7f998ae91e40>)),
                ('tfidf', TfidfVectorizer()),
                ('clf', RandomForestClassifier(random_state=42))]). Valid parameters are: ['memory', 'steps', 'transform_input', 'verbose'].


ValueError: Invalid parameter 'min_df' for estimator Pipeline(steps=[('join',
                 FunctionTransformer(func=<function combine_text at 0x7f998ae91e40>)),
                ('tfidf', TfidfVectorizer()),
                ('clf', RandomForestClassifier(random_state=42))]). Valid parameters are: ['memory', 'steps', 'transform_input', 'verbose'].