## UN Agencies Prediction via Multi-Label Classification

In [1]:
# Library imports
import pandas as pd
import numpy as np
import os
from pathlib import Path

In [2]:
# Set the base path for input data files
CURRENT_DIR = Path().resolve()
DATA_BASE_PATH = CURRENT_DIR.parent / "outputs" / "data_output"

In [3]:
# set the path for the modeling data file
output_dir = os.path.join("..", "outputs", "model_output")
os.makedirs(output_dir, exist_ok=True)

In [4]:
# Load data
df = pd.read_csv("../outputs/data_output/UN_Agencies_Cleaned.csv")

In [5]:
df_grouped = (
    df.groupby(['Country', 'Theme', 'Strategic priority code', 'SP_Label'])['Agencies']
    .apply(lambda x: list(set(x))).reset_index()
)

In [6]:
df_grouped.head()

Unnamed: 0,Country,Theme,Strategic priority code,SP_Label,Agencies
0,Afghanistan,crime,3.0,AFG_crime_3,[United Nations Assistance Mission in Afghanis...
1,Afghanistan,education,1.0,AFG_education_1,"[United Nations Educational, Scientific and Cu..."
2,Afghanistan,education,2.0,AFG_education_2,"[United Nations Educational, Scientific and Cu..."
3,Afghanistan,education,3.0,AFG_education_3,"[United Nations Educational, Scientific and Cu..."
4,Afghanistan,environment,1.0,AFG_environment_1,[United Nations High Commissioner for Refugees...


In [7]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

categorical = ['Country', 'Theme']
numeric = ['Strategic priority code']

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
    ('num', StandardScaler(), numeric)
])

In [8]:
from sklearn.preprocessing import MultiLabelBinarizer

y = df_grouped['Agencies']
mlb = MultiLabelBinarizer()
y_encoded = mlb.fit_transform(y)

In [9]:
from sklearn.model_selection import train_test_split

X = df_grouped[categorical + numeric]
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
import warnings

warnings.filterwarnings("ignore")

# Define candidate models
model_candidates = {
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
    "RidgeClassifier": RidgeClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "MLPClassifier": MLPClassifier(hidden_layer_sizes=(100,), max_iter=300),
    "SVM_OvR": OneVsRestClassifier(SVC(kernel='linear', probability=True))
}

results = []

for name, base_model in model_candidates.items():
    print(f"\n📌 Training model: {name}")
    try:
        # MultiOutput wrapping where needed
        if name in ["RandomForest", "LogisticRegression", "MLPClassifier", "RidgeClassifier"]:
            clf = MultiOutputClassifier(base_model)
        else:
            clf = base_model
        
        pipeline = Pipeline([
            ("preprocessing", preprocessor),
            ("classifier", clf)
        ])

        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        f1 = f1_score(y_test, y_pred, average='micro')
        hamming = hamming_loss(y_test, y_pred)
        acc = accuracy_score(y_test, y_pred)

        results.append({
            "Model": name,
            "F1 Micro": f1,
            "Hamming Loss": hamming,
            "Subset Accuracy": acc
        })

        print(f"✅ {name} completed. F1: {f1:.3f}, Hamming Loss: {hamming:.3f}")

    except Exception as e:
        print(f"❌ {name} failed: {str(e)}")


📌 Training model: RandomForest
✅ RandomForest completed. F1: 0.389, Hamming Loss: 0.047

📌 Training model: RidgeClassifier
✅ RidgeClassifier completed. F1: 0.401, Hamming Loss: 0.042

📌 Training model: XGBoost
✅ XGBoost completed. F1: 0.463, Hamming Loss: 0.041

📌 Training model: MLPClassifier
✅ MLPClassifier completed. F1: 0.427, Hamming Loss: 0.049

📌 Training model: SVM_OvR
✅ SVM_OvR completed. F1: 0.394, Hamming Loss: 0.043


In [11]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
from xgboost import XGBClassifier
import warnings

warnings.filterwarnings("ignore", category=UserWarning, module="xgboost")

pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', MultiOutputClassifier(XGBClassifier(eval_metric='logloss', use_label_encoder=False)))
])

# Define hyperparameter search space
param_dist = {
    'classifier__estimator__n_estimators': [300, 400, 500],
    'classifier__estimator__max_depth': [5, 6, 7, 8],
    'classifier__estimator__learning_rate': [0.03, 0.05, 0.07, 0.1],
    'classifier__estimator__subsample': [0.7, 0.8, 0.9],
    'classifier__estimator__colsample_bytree': [0.8, 0.9, 1.0],
    'classifier__estimator__reg_alpha': [0, 0.1, 0.25],
    'classifier__estimator__reg_lambda': [0.5, 1.0, 1.5],
    'classifier__estimator__scale_pos_weight': [0.8, 1.0, 1.2]
}

# Randomized SearchCV
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=10,
    scoring='f1_micro',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# Fit and Evaluate
random_search.fit(X_train, y_train)
y_pred = random_search.predict(X_test)

# Evaluation metrics
print("✅ Best Parameters:", random_search.best_params_)
print("🎯 F1 Micro:", f1_score(y_test, y_pred, average='micro'))
print("📉 Hamming Loss:", hamming_loss(y_test, y_pred))
print("🧮 Subset Accuracy:", accuracy_score(y_test, y_pred))

Fitting 3 folds for each of 10 candidates, totalling 30 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

[CV] END classifier__estimator__colsample_bytree=0.9, classifier__estimator__learning_rate=0.05, classifier__estimator__max_depth=7, classifier__estimator__n_estimators=300, classifier__estimator__reg_alpha=0.1, classifier__estimator__reg_lambda=1.0, classifier__estimator__scale_pos_weight=1.2, classifier__estimator__subsample=0.9; total time=   3.4s
[CV] END classifier__estimator__colsample_bytree=0.9, classifier__estimator__learning_rate=0.05, classifier__estimator__max_depth=7, classifier__estimator__n_estimators=300, classifier__estimator__reg_alpha=0.1, classifier__estimator__reg_lambda=1.0, classifier__estimator__scale_pos_weight=1.2, classifier__estimator__subsample=0.9; total time=   3.4s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END classifier__estimator__colsample_bytree=0.8, classifier__estimator__learning_rate=0.03, classifier__estimator__max_depth=8, classifier__estimator__n_estimators=400, classifier__estimator__reg_alpha=0.1, classifier__estimator__reg_lambda=1.5, classifier__estimator__scale_pos_weight=1.0, classifier__estimator__subsample=0.9; total time=   4.1s
[CV] END classifier__estimator__colsample_bytree=0.8, classifier__estimator__learning_rate=0.03, classifier__estimator__max_depth=8, classifier__estimator__n_estimators=400, classifier__estimator__reg_alpha=0.1, classifier__estimator__reg_lambda=1.5, classifier__estimator__scale_pos_weight=1.0, classifier__estimator__subsample=0.9; total time=   4.3s
[CV] END classifier__estimator__colsample_bytree=0.8, classifier__estimator__learning_rate=0.03, classifier__estimator__max_depth=8, classifier__estimator__n_estimators=400, classifier__estimator__reg_alpha=0.1, classifier__estimator__reg_lambda=1.5, classifier__estimator__scale_pos_weight=1.0

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END classifier__estimator__colsample_bytree=0.9, classifier__estimator__learning_rate=0.1, classifier__estimator__max_depth=6, classifier__estimator__n_estimators=500, classifier__estimator__reg_alpha=0.25, classifier__estimator__reg_lambda=0.5, classifier__estimator__scale_pos_weight=1.2, classifier__estimator__subsample=0.8; total time=   4.9s
[CV] END classifier__estimator__colsample_bytree=0.9, classifier__estimator__learning_rate=0.1, classifier__estimator__max_depth=6, classifier__estimator__n_estimators=500, classifier__estimator__reg_alpha=0.25, classifier__estimator__reg_lambda=0.5, classifier__estimator__scale_pos_weight=1.2, classifier__estimator__subsample=0.8; total time=   4.9s
[CV] END classifier__estimator__colsample_bytree=0.9, classifier__estimator__learning_rate=0.1, classifier__estimator__max_depth=6, classifier__estimator__n_estimators=500, classifier__estimator__reg_alpha=0.25, classifier__estimator__reg_lambda=0.5, classifier__estimator__scale_pos_weight=1.2

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END classifier__estimator__colsample_bytree=0.9, classifier__estimator__learning_rate=0.05, classifier__estimator__max_depth=7, classifier__estimator__n_estimators=300, classifier__estimator__reg_alpha=0.1, classifier__estimator__reg_lambda=1.0, classifier__estimator__scale_pos_weight=1.2, classifier__estimator__subsample=0.9; total time=   3.0s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END classifier__estimator__colsample_bytree=0.9, classifier__estimator__learning_rate=0.05, classifier__estimator__max_depth=6, classifier__estimator__n_estimators=400, classifier__estimator__reg_alpha=0, classifier__estimator__reg_lambda=0.5, classifier__estimator__scale_pos_weight=1.2, classifier__estimator__subsample=0.8; total time=   3.9s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END classifier__estimator__colsample_bytree=0.9, classifier__estimator__learning_rate=0.05, classifier__estimator__max_depth=6, classifier__estimator__n_estimators=400, classifier__estimator__reg_alpha=0, classifier__estimator__reg_lambda=0.5, classifier__estimator__scale_pos_weight=1.2, classifier__estimator__subsample=0.8; total time=   4.1s
[CV] END classifier__estimator__colsample_bytree=0.9, classifier__estimator__learning_rate=0.05, classifier__estimator__max_depth=6, classifier__estimator__n_estimators=400, classifier__estimator__reg_alpha=0, classifier__estimator__reg_lambda=0.5, classifier__estimator__scale_pos_weight=1.2, classifier__estimator__subsample=0.8; total time=   3.9s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END classifier__estimator__colsample_bytree=1.0, classifier__estimator__learning_rate=0.1, classifier__estimator__max_depth=7, classifier__estimator__n_estimators=400, classifier__estimator__reg_alpha=0, classifier__estimator__reg_lambda=1.5, classifier__estimator__scale_pos_weight=1.2, classifier__estimator__subsample=0.8; total time=   4.6s
[CV] END classifier__estimator__colsample_bytree=1.0, classifier__estimator__learning_rate=0.1, classifier__estimator__max_depth=7, classifier__estimator__n_estimators=400, classifier__estimator__reg_alpha=0, classifier__estimator__reg_lambda=1.5, classifier__estimator__scale_pos_weight=1.2, classifier__estimator__subsample=0.8; total time=   4.2s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END classifier__estimator__colsample_bytree=1.0, classifier__estimator__learning_rate=0.1, classifier__estimator__max_depth=7, classifier__estimator__n_estimators=400, classifier__estimator__reg_alpha=0, classifier__estimator__reg_lambda=1.5, classifier__estimator__scale_pos_weight=1.2, classifier__estimator__subsample=0.8; total time=   4.4s
[CV] END classifier__estimator__colsample_bytree=0.9, classifier__estimator__learning_rate=0.05, classifier__estimator__max_depth=8, classifier__estimator__n_estimators=400, classifier__estimator__reg_alpha=0.25, classifier__estimator__reg_lambda=1.0, classifier__estimator__scale_pos_weight=0.8, classifier__estimator__subsample=0.8; total time=   4.3s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END classifier__estimator__colsample_bytree=0.9, classifier__estimator__learning_rate=0.05, classifier__estimator__max_depth=8, classifier__estimator__n_estimators=400, classifier__estimator__reg_alpha=0.25, classifier__estimator__reg_lambda=1.0, classifier__estimator__scale_pos_weight=0.8, classifier__estimator__subsample=0.8; total time=   4.5s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END classifier__estimator__colsample_bytree=0.9, classifier__estimator__learning_rate=0.05, classifier__estimator__max_depth=8, classifier__estimator__n_estimators=400, classifier__estimator__reg_alpha=0.25, classifier__estimator__reg_lambda=1.0, classifier__estimator__scale_pos_weight=0.8, classifier__estimator__subsample=0.8; total time=   4.1s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END classifier__estimator__colsample_bytree=0.9, classifier__estimator__learning_rate=0.07, classifier__estimator__max_depth=6, classifier__estimator__n_estimators=500, classifier__estimator__reg_alpha=0.1, classifier__estimator__reg_lambda=0.5, classifier__estimator__scale_pos_weight=0.8, classifier__estimator__subsample=0.8; total time=   5.2s
[CV] END classifier__estimator__colsample_bytree=0.9, classifier__estimator__learning_rate=0.07, classifier__estimator__max_depth=6, classifier__estimator__n_estimators=500, classifier__estimator__reg_alpha=0.1, classifier__estimator__reg_lambda=0.5, classifier__estimator__scale_pos_weight=0.8, classifier__estimator__subsample=0.8; total time=   5.2s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END classifier__estimator__colsample_bytree=0.9, classifier__estimator__learning_rate=0.07, classifier__estimator__max_depth=6, classifier__estimator__n_estimators=500, classifier__estimator__reg_alpha=0.1, classifier__estimator__reg_lambda=0.5, classifier__estimator__scale_pos_weight=0.8, classifier__estimator__subsample=0.8; total time=   5.1s
[CV] END classifier__estimator__colsample_bytree=0.8, classifier__estimator__learning_rate=0.03, classifier__estimator__max_depth=6, classifier__estimator__n_estimators=500, classifier__estimator__reg_alpha=0.25, classifier__estimator__reg_lambda=0.5, classifier__estimator__scale_pos_weight=1.2, classifier__estimator__subsample=0.8; total time=   5.2s


Parameters: { "use_label_encoder" } are not used.



[CV] END classifier__estimator__colsample_bytree=0.8, classifier__estimator__learning_rate=0.03, classifier__estimator__max_depth=6, classifier__estimator__n_estimators=500, classifier__estimator__reg_alpha=0.25, classifier__estimator__reg_lambda=0.5, classifier__estimator__scale_pos_weight=1.2, classifier__estimator__subsample=0.8; total time=   5.2s
[CV] END classifier__estimator__colsample_bytree=0.9, classifier__estimator__learning_rate=0.03, classifier__estimator__max_depth=7, classifier__estimator__n_estimators=300, classifier__estimator__reg_alpha=0.1, classifier__estimator__reg_lambda=1.5, classifier__estimator__scale_pos_weight=1.2, classifier__estimator__subsample=0.8; total time=   3.6s
[CV] END classifier__estimator__colsample_bytree=0.8, classifier__estimator__learning_rate=0.03, classifier__estimator__max_depth=6, classifier__estimator__n_estimators=500, classifier__estimator__reg_alpha=0.25, classifier__estimator__reg_lambda=0.5, classifier__estimator__scale_pos_weight=1

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END classifier__estimator__colsample_bytree=0.9, classifier__estimator__learning_rate=0.03, classifier__estimator__max_depth=7, classifier__estimator__n_estimators=300, classifier__estimator__reg_alpha=0.1, classifier__estimator__reg_lambda=1.5, classifier__estimator__scale_pos_weight=1.2, classifier__estimator__subsample=0.8; total time=   3.5s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END classifier__estimator__colsample_bytree=0.9, classifier__estimator__learning_rate=0.03, classifier__estimator__max_depth=7, classifier__estimator__n_estimators=300, classifier__estimator__reg_alpha=0.1, classifier__estimator__reg_lambda=1.5, classifier__estimator__scale_pos_weight=1.2, classifier__estimator__subsample=0.8; total time=   2.9s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END classifier__estimator__colsample_bytree=0.9, classifier__estimator__learning_rate=0.05, classifier__estimator__max_depth=7, classifier__estimator__n_estimators=500, classifier__estimator__reg_alpha=0.25, classifier__estimator__reg_lambda=1.0, classifier__estimator__scale_pos_weight=1.2, classifier__estimator__subsample=0.8; total time=   4.3s
[CV] END classifier__estimator__colsample_bytree=0.9, classifier__estimator__learning_rate=0.05, classifier__estimator__max_depth=7, classifier__estimator__n_estimators=500, classifier__estimator__reg_alpha=0.25, classifier__estimator__reg_lambda=1.0, classifier__estimator__scale_pos_weight=1.2, classifier__estimator__subsample=0.8; total time=   4.0s
[CV] END classifier__estimator__colsample_bytree=0.9, classifier__estimator__learning_rate=0.05, classifier__estimator__max_depth=7, classifier__estimator__n_estimators=500, classifier__estimator__reg_alpha=0.25, classifier__estimator__reg_lambda=1.0, classifier__estimator__scale_pos_weight=

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, hamming_loss, accuracy_score

# Identify categorical and numeric columns
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Define preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numeric_cols)
    ]
)

# Final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', OneVsRestClassifier(
        XGBClassifier(
            n_estimators=500,
            max_depth=6,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.9,
            reg_alpha=0.25,
            reg_lambda=0.5,
            scale_pos_weight=1.2,
            use_label_encoder=False,
            objective='binary:logistic',
            eval_metric='logloss'
        )
    ))
])

# Fit
pipeline.fit(X_train, y_train)

# Predict
y_pred = pipeline.predict(X_test)

# Evaluate
print("🎯 F1 Micro:", f1_score(y_test, y_pred, average='micro'))
print("📉 Hamming Loss:", hamming_loss(y_test, y_pred))
print("🧮 Subset Accuracy:", accuracy_score(y_test, y_pred))

🎯 F1 Micro: 0.4909544603867748
📉 Hamming Loss: 0.041846153846153845
🧮 Subset Accuracy: 0.08076923076923077


In [13]:
import joblib

# Save the full pipeline
joblib.dump(pipeline, '../outputs/model_output/Agency_model.pkl')

['../outputs/model_output/Agency_model.pkl']