## 1. Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.decomposition import PCA
import joblib
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")


## 2. Load & Filter Data (Sept 2022 – Sept 2025)

In [2]:
# Forward slashes resolve on every OS. The previous literal used backslashes,
# which Python treats as (invalid) escape sequences and which only work as a
# path separator on Windows. The path is relative to the working directory.
df = pd.read_csv("data/processed/integrated_data.csv")

# Unparseable dates become NaT; NaT fails both comparisons below, so those
# rows are removed by the window filter.
df['loanDate'] = pd.to_datetime(df['loanDate'], errors="coerce")

# Half-open window [2022-09-01, 2025-10-01). Comparing with "<= 2025-09-30"
# means "<= 2025-09-30 00:00:00" and would drop any loan stamped later in the
# day on 30 Sept 2025; for date-only values both forms select the same rows.
df = df[
    (df['loanDate'] >= "2022-09-01") &
    (df['loanDate'] < "2025-10-01")
].copy()

print("Filtered dataset:", len(df))


FileNotFoundError: [Errno 2] No such file or directory: 'data\\processed\\integrated_data.csv'

## 3. Client-Level Aggregation

In [None]:
# Per-client roll-up: one aggregation rule per source column of `df`.
agg_rules = {
    'loanAmount': 'max',
    'isBlacklisted': 'first',
    'isBlocked': 'first',
    'isEligible': 'first',
    'isActive': 'first',
    'isOwing': 'first',
    'dailyLoanCount': 'mean',
    'debt': 'mean',
    'clientMaxAmount': 'max',
    'penaltyDebt': 'mean',
    'totalRefunded': 'mean',
    'amountPaid': 'sum',
    'totalDueAmount': 'sum',
    'refund_amount_mean': 'mean',
    'refund_amount_std': 'mean',
}

per_client = df.groupby('clientId').agg(agg_rules)

# The largest loan seen for a client is exposed as 'prev_max_loan'.
per_client = per_client.rename(columns={'loanAmount': 'prev_max_loan'})

# Turn the clientId group key back into an ordinary column.
client = per_client.reset_index()


## 4. Extra Behavioural Features (Best Set) / Feature Engineering

In [None]:
# 1 when the client's mean penalty debt is positive, else 0.
client['has_late_payments'] = (client['penaltyDebt'] > 0).astype(int)

# The per-client statistics below are Series indexed by clientId, whereas
# `client` carries a plain RangeIndex after reset_index(). Assigning such a
# Series directly aligns on index labels (clientId vs. row number) and puts
# values on the wrong rows or leaves NaN, so every statistic is attached with
# an explicit clientId lookup instead of by index alignment or row position.
loans_by_client = df.groupby('clientId')

# Loan tenure: days between a client's earliest and latest loanDate
dates = loans_by_client['loanDate']
tenure_days = (dates.max() - dates.min()).dt.days
client['loan_tenure_days'] = client['clientId'].map(tenure_days).fillna(0)

# Number of non-null loanAmount rows per client
loan_counts = loans_by_client['loanAmount'].count()
client['num_loans'] = client['clientId'].map(loan_counts)

refund_frequency = loans_by_client['refund_amount_count'].mean()
client['mean_refund_frequency'] = client['clientId'].map(refund_frequency)

# +1 in the denominator avoids division by zero when clientMaxAmount is 0.
client['debt_to_max'] = client['debt'] / (client['clientMaxAmount'] + 1)

# Paid / due, with division-by-zero results (inf, NaN) set to 0 and the
# ratio bounded to [0, 1].
client['repayment_rate'] = (
    client['amountPaid'] / client['totalDueAmount']
).replace([np.inf, -np.inf], 0).fillna(0).clip(0, 1)

# Keep clients whose largest loan lies in [500, 10000]. The explicit copy
# makes later column assignments (e.g. 'segment') act on an independent frame.
client = client.query("prev_max_loan >= 500 and prev_max_loan <= 10000").copy()


## 5. Final Feature Set

In [None]:
# Columns used for clustering, in the order they are passed to the scaler.
features = [
    'prev_max_loan', 'clientMaxAmount',
    'repayment_rate', 'num_loans', 'loan_tenure_days',
    'debt_to_max', 'penaltyDebt', 'totalRefunded',
    'mean_refund_frequency', 'has_late_payments',
]

# Feature matrix; missing values are replaced with 0.
X = client.loc[:, features].fillna(0)


## 6. Scale Features

In [None]:
# Learn per-feature mean/variance on X, then standardise X with them.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)


## 7. Auto-Select Best k (3–6)

In [None]:
# One (k, silhouette, davies_bouldin, calinski_harabasz) tuple per candidate k.
scores = []

for k in (3, 4, 5, 6):
    candidate = KMeans(n_clusters=k, n_init=20, max_iter=300, random_state=42)
    labels = candidate.fit_predict(X_scaled)

    # Evaluate the three cluster-quality metrics on this labelling.
    sil = silhouette_score(X_scaled, labels)
    db = davies_bouldin_score(X_scaled, labels)
    ch = calinski_harabasz_score(X_scaled, labels)

    record = (k, sil, db, ch)
    scores.append(record)
    print(f"k={k} | Sil={sil:.3f} | DB={db:.3f} | CH={ch:.1f}")

# The winner is the k with the highest silhouette score
# (the first such k if several tie).
best_record = max(scores, key=lambda entry: entry[1])
best_k = best_record[0]
print("\nSelected best k =", best_k)


## 8. Fit Final KMeans

In [None]:
# Refit KMeans with the selected k and store each client's cluster id.
kmeans = KMeans(n_clusters=best_k, n_init=20, max_iter=300, random_state=42)
segment_ids = kmeans.fit_predict(X_scaled)
client['segment'] = segment_ids

# === FINAL CLUSTER QUALITY METRICS ===
from sklearn.metrics import (
    calinski_harabasz_score,
    davies_bouldin_score,
    silhouette_score,
)

labels = client['segment'].values

# Same three metrics as in the k search, evaluated on the final labelling.
sil, db, ch = (
    metric(X_scaled, labels)
    for metric in (silhouette_score, davies_bouldin_score, calinski_harabasz_score)
)

print("\n=== FINAL CLUSTER QUALITY METRICS (k = {}) ===".format(kmeans.n_clusters))
print(f"Silhouette Score:       {sil:.3f}")
print(f"Davies-Bouldin Index:   {db:.3f}")
print(f"Calinski-Harabasz Index:{ch:.1f}")


## 9. Map Clusters → Recommended Loan Amount

In [None]:
# Order the clusters by their mean 'prev_max_loan', lowest first.
mean_by_segment = client.groupby('segment')['prev_max_loan'].mean()
segment_means = mean_by_segment.sort_values()

# best_k evenly spaced amounts between 500 and 10000, rounded to the nearest 100.
tiers = np.linspace(500, 10000, best_k).round(-2)

# i-th lowest cluster -> i-th lowest tier
mapping = {segment: tier for segment, tier in zip(segment_means.index, tiers)}
client['recommended_max_loan'] = client['segment'].map(mapping)


## 10.  Save Model Artifacts

In [None]:
# Bundle the fitted scaler, the fitted KMeans model, the feature order and the
# cluster -> amount mapping into a single file.
artifacts = dict(
    scaler=scaler,
    kmeans=kmeans,
    features=features,
    mapping=mapping,
)
joblib.dump(artifacts, "loan_model.pkl")

print("Model saved ✔️")


## 11. Scoring Function (Backend-Ready)

In [None]:
def score_client(client_row, model=None):
    """
    Assign a client to a cluster and return the amount mapped to that cluster.

    Parameters
    ----------
    client_row : dict or pandas.Series
        Must contain every name listed in ``model['features']``. Values that
        are None/NaN are replaced with 0, as is done when the feature matrix
        is built for training.
    model : dict, optional
        Already-loaded artifact bundle with the keys 'scaler', 'kmeans',
        'mapping' and 'features'. When omitted, "loan_model.pkl" is loaded
        from the working directory on every call; pass the bundle explicitly
        to avoid that repeated disk read.

    Returns
    -------
    dict
        Keys 'segment' (int), 'recommended_amount' (int) and 'amount_fcfa'
        (amount formatted with thousands separators plus " FCFA").

    Raises
    ------
    KeyError
        If ``client_row`` lacks one of the model's feature names.
    """
    if model is None:
        # NOTE: joblib.load unpickles the file; only load artifacts you trust.
        model = joblib.load("loan_model.pkl")
    scaler = model['scaler']
    kmeans = model['kmeans']
    mapping = model['mapping']
    feats = model['features']

    missing = [name for name in feats if name not in client_row]
    if missing:
        raise KeyError(f"client_row is missing feature(s): {missing}")

    # Per-feature lookup works for both dicts and Series (indexing a dict with
    # a list of names does not), and fixes the column order expected by the
    # scaler.
    values = {name: client_row[name] for name in feats}
    row = pd.DataFrame([values], columns=feats).astype(float).fillna(0)

    scaled = scaler.transform(row)
    seg = kmeans.predict(scaled)[0]
    amount = mapping[seg]

    return {
        "segment": int(seg),
        "recommended_amount": int(amount),
        "amount_fcfa": f"{int(amount):,} FCFA"
    }


## 12.  PCA Visualisation

In [None]:
from pathlib import Path

# Project the scaled features onto their first two principal components.
# random_state pins the projection in case scikit-learn selects a randomized
# SVD solver, matching the fixed seed used for KMeans.
pca = PCA(n_components=2, random_state=42).fit_transform(X_scaled)

# Plot: one point per client, coloured by cluster id
plt.figure(figsize=(7,5))
plt.scatter(
    pca[:,0], pca[:,1],
    c=client['segment'],
    cmap='tab10',
    alpha=0.7
)
plt.title("Client Behaviour Clusters")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.colorbar(label="Cluster")
plt.tight_layout()

# Save first, then show. savefig does not create missing directories, so make
# sure the output folder exists.
output_path = Path('outputs') / 'client_behaviour_clusters.png'
output_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(output_path, dpi=300)
plt.show()

In [None]:
# ===========================
# RANDOM SAMPLE CHECK (helper used in section 13)
# ===========================

def inspect_random_clients(
    client,
    features,
    scaler,
    kmeans,
    mapping,
    n_samples=5,
    seed=None
):
    """
    Print a one-line summary for a random sample of clients.

    Each line shows:
      - clientId
      - cluster (segment), re-predicted from the client's feature values
      - prev_max_loan
      - recommended max loan (``mapping`` entry for the predicted cluster)
      - repayment_rate
      - has_late_payments (0 when the column is absent)

    Parameters
    ----------
    client : pandas.DataFrame
        Frame containing 'clientId', 'prev_max_loan', 'repayment_rate' and
        every column named in ``features``.
    features : list of str
        Feature columns, in the order expected by ``scaler``.
    scaler, kmeans
        Objects providing ``transform`` and ``predict`` respectively.
    mapping : dict
        Cluster id -> recommended amount.
    n_samples : int
        Number of clients to draw; capped at ``len(client)``.
    seed : int or None
        Passed to ``DataFrame.sample`` as ``random_state``
        (None -> different clients on every run).

    Returns
    -------
    None
        The summary is printed, not returned.
    """
    # DataFrame.sample raises when asked for more rows than exist, so cap
    # the request at the population size.
    n_draw = min(n_samples, len(client))
    print(f"\n=== RANDOM SAMPLE CLIENTS (n={n_draw}) ===")

    samples = client.sample(n=n_draw, random_state=seed)
    if samples.empty:
        return

    # One transform/predict call for the whole sample instead of one per row.
    feats = samples[features].fillna(0)
    segments = kmeans.predict(scaler.transform(feats))

    for (_, row), seg in zip(samples.iterrows(), segments):
        rec = mapping[seg]

        print(
            f"ClientId: {row['clientId']} | "
            f"Cluster: {seg} | "
            f"PrevMaxLoan: {row['prev_max_loan']:.0f} FCFA → "
            f"Recommended Max Loan: {rec:.0f} FCFA | "
            f"RepaymentRate: {row['repayment_rate']:.2%} | "
            f"LatePayments: {int(row.get('has_late_payments', 0))}"
        )


## 13. Client Testing

In [None]:
# Spot-check five randomly drawn clients (no seed given).
inspect_random_clients(client, features, scaler, kmeans, mapping, n_samples=5)

In [None]:
# Three draws of three clients each: one unseeded, then seeds 5 and 99.
for seed in (None, 5, 99):
    inspect_random_clients(client, features, scaler, kmeans, mapping, n_samples=3, seed=seed)


## 14. Scoring Logic for New Clients

In [None]:
def score_new_client_with_progression(client_data, model=None):
    """
    Enhanced scoring with three tiers:
    - New clients (no history): fixed starter amount, segment -1
    - Early clients (1–2 loans): rule-based amount, segment -2
    - Mature clients (3+ loans): amount from the clustering model

    Parameters
    ----------
    client_data : dict or pandas.Series
        Must contain 'num_loans'. 'repayment_rate' is optional for early
        clients (treated as 0 when absent or None). Mature clients must
        provide every name in ``model['features']``; None/NaN feature values
        are replaced with 0, as is done when the training matrix is built.
    model : dict, optional
        Already-loaded artifact bundle with the keys 'scaler', 'kmeans',
        'mapping' and 'features'. When omitted, "loan_model.pkl" is loaded
        from the working directory (mature clients only).

    Returns
    -------
    dict
        Keys 'segment', 'recommended_amount', 'risk_level', 'color' and
        'comment'.

    Raises
    ------
    KeyError
        If 'num_loans' is absent, or a mature client lacks a model feature.
    """
    num_loans = client_data['num_loans']

    # 1. Completely new client → no loan history
    if num_loans == 0:
        return {
            'segment': -1,
            'recommended_amount': 500,   # entry amount
            'risk_level': 'NEW CLIENT (UNKNOWN RISK)',
            'color': 'GRAY',
            'comment': 'New client with no history. Assigned starter loan.'
        }

    # 2. Early-phase client (1–2 loans)
    if num_loans <= 2:
        # Simple rule: if repayment good so far → small upgrade.
        # `or 0` also covers an explicit None, which cannot be compared to 0.8.
        repayment_rate = client_data.get('repayment_rate', 0) or 0
        amt = 1500 if repayment_rate > 0.8 else 800

        return {
            'segment': -2,
            'recommended_amount': amt,
            'risk_level': 'EARLY CLIENT (LIMITED HISTORY)',
            'color': 'LIGHT BLUE',
            'comment': 'Early customer. Temporary rule-based limit.'
        }

    # 3. Mature client (3+ loans) → use clustering
    if model is None:
        import joblib
        # NOTE: joblib.load unpickles the file; only load artifacts you trust.
        model = joblib.load("loan_model.pkl")
    scaler = model['scaler']
    kmeans = model['kmeans']
    mapping = model['mapping']
    features = model['features']

    missing = [name for name in features if name not in client_data]
    if missing:
        raise KeyError(f"client_data is missing feature(s): {missing}")

    # Per-feature lookup works for both dicts and Series (indexing a dict with
    # a list of names does not), and fixes the column order expected by the
    # scaler.
    values = {name: client_data[name] for name in features}
    row = pd.DataFrame([values], columns=features).astype(float).fillna(0)

    scaled = scaler.transform(row)
    seg = kmeans.predict(scaled)[0]
    amount = mapping[seg]

    return {
        'segment': int(seg),
        'recommended_amount': int(amount),
        'risk_level': 'BEHAVIORAL MODEL APPLIED',
        'color': 'GREEN',
        'comment': 'Behavioural cluster-based credit limit assigned.'
    }
