## Clustering for Agency Financial Efficiency

In [1]:
# Library imports
import pandas as pd
import numpy as np
import os
from pathlib import Path

In [2]:
# Set the base path for input data files
CURRENT_DIR = Path().resolve()
DATA_BASE_PATH = CURRENT_DIR.parent / "outputs" / "data_output"

In [3]:
# set the path for the modeling data file
output_dir = os.path.join("..", "outputs", "model_output")
os.makedirs(output_dir, exist_ok=True)

In [4]:
# Load data
df = pd.read_csv("../outputs/data_output/Financial_Cleaned.csv")

In [5]:
# Remove rows with very low or zero required resources to avoid ratio distortion
df = df[df['Total required resources'] > 0].copy()

# Drop rows with any nulls in financial columns
df.dropna(subset=[
    'Total required resources', 'Total available resources', 'Total expenditure resources'
], inplace=True)

# Clip large values to prevent outlier distortion
df['Total required resources'] = df['Total required resources'].clip(upper=df['Total required resources'].quantile(0.99))
df['Total available resources'] = df['Total available resources'].clip(upper=df['Total available resources'].quantile(0.99))
df['Total expenditure resources'] = df['Total expenditure resources'].clip(upper=df['Total expenditure resources'].quantile(0.99))

In [6]:
# Compute financial efficiency ratios
df['Avail_per_Req'] = df['Total available resources'] / (df['Total required resources'] + 1)
df['Exp_per_Req'] = df['Total expenditure resources'] / (df['Total required resources'] + 1)

# Log-transform to compress scale and reduce skew
df['log_Req'] = np.log1p(df['Total required resources'])
df['log_Avail_per_Req'] = np.log1p(df['Avail_per_Req'])
df['log_Exp_per_Req'] = np.log1p(df['Exp_per_Req'])

# Optional: normalize agency count per SP_Label (if repeated agencies exist)
df['Agency_Count'] = df['Agencies'].apply(lambda x: len(str(x).split(";")) if pd.notnull(x) else 1)

In [7]:
from sklearn.preprocessing import StandardScaler

# Select final features
feature_cols = ['log_Req', 'log_Avail_per_Req', 'log_Exp_per_Req']
X = df[feature_cols]

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [8]:
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score

models = {
    "KMeans": KMeans(n_clusters=3, random_state=42),
    "DBSCAN": DBSCAN(eps=0.5, min_samples=5),
    "Agglomerative": AgglomerativeClustering(n_clusters=3)
}

for name, model in models.items():
    try:
        labels = model.fit_predict(X_scaled)
        if len(set(labels)) > 1:
            score = silhouette_score(X_scaled, labels)
            print(f"{name} Score: {score:.4f}")
        else:
            print(f"{name} only found one cluster.")
    except Exception as e:
        print(f"{name} failed: {e}")

KMeans Score: 0.3198
DBSCAN Score: 0.2510
Agglomerative Score: 0.2405


In [9]:
from sklearn.metrics import silhouette_score

best_k = None
best_score = -1
for k in range(2, 7):
    km = KMeans(n_clusters=k, random_state=42)
    labels = km.fit_predict(X_scaled)
    score = silhouette_score(X_scaled, labels)
    print(f"KMeans (k={k}) Score: {score:.4f}")
    if score > best_score:
        best_score = score
        best_k = k

KMeans (k=2) Score: 0.2836
KMeans (k=3) Score: 0.3198
KMeans (k=4) Score: 0.3376
KMeans (k=5) Score: 0.3019
KMeans (k=6) Score: 0.2825


In [10]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Best k
best_k = 4

# Final clustering
kmeans = KMeans(n_clusters=best_k, random_state=42)
df['Cluster'] = kmeans.fit_predict(X_scaled)

In [11]:
# Label clusters for interpretability
cluster_summary = df.groupby('Cluster')[
    ['Total required resources', 'Total available resources', 'Total expenditure resources',
     'Avail_per_Req', 'Exp_per_Req']
].mean().round(2)

cluster_summary

Unnamed: 0_level_0,Total required resources,Total available resources,Total expenditure resources,Avail_per_Req,Exp_per_Req
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1221672000.0,934430700.0,536494800.0,0.81,0.47
1,11758880.0,15511510.0,13418040.0,1.31,0.93
2,8811023.0,7969604.0,819689.8,0.87,0.15
3,232348400.0,57530520.0,18969230.0,0.24,0.11


In [12]:
def label_cluster(row):
    if row['Cluster'] == 1:
        return "Top Performer"
    elif row['Cluster'] == 0:
        return "Low Performer"
    elif row['Cluster'] == 2:
        return "Execution Gap"
    elif row['Cluster'] == 3:
        return "Moderate Performer"
    else:
        return "Unlabeled"

df['Performance_Label'] = df.apply(label_cluster, axis=1)

In [13]:
df.to_csv("../outputs/model_output/un_agency.csv", index=False)