<a href="https://colab.research.google.com/github/anucodes-hub/hybrid-aml-detection-engine/blob/main/Risk_scoring_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
      #                                                   THIS IS THE MAIN FINAL MODIFIED CODE

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, classification_report

import shap

# =========================================================
# 1. Synthetic dataset generator (with stronger signal)
# =========================================================

def generate_customer_risk_dataset(n: int = 5000, seed: int = 42) -> pd.DataFrame:
    np.random.seed(seed)

    # --- Core KYC / customer features ---
    country_risk = np.random.choice(
        [5, 10, 20, 40, 60, 80],
        size=n,
        p=[0.2, 0.25, 0.25, 0.15, 0.1, 0.05],
    )

    is_pep = np.random.binomial(1, 0.03, size=n)
    occupation_risk = np.random.choice(
        [5, 10, 20, 40, 60, 80],
        size=n,
        p=[0.25, 0.25, 0.2, 0.15, 0.1, 0.05],
    )

    face_match_score = np.clip(
        np.random.normal(90, 5, size=n),
        60,
        100,
    ).astype(int)

    kyc_quality = np.clip(
        np.random.normal(85, 7, size=n),
        50,
        100,
    ).astype(int)

    failed_attempts = np.random.poisson(0.3, size=n)

    avg_txn_amount_30d = np.random.lognormal(
        mean=10,
        sigma=0.6,
        size=n,
    ).astype(int)

    txn_count_7d = np.random.poisson(10, size=n)

    # --- Receiver / counterparty features ---
    receiver_type = np.random.choice(
        [0, 1, 2, 3],
        size=n,
        p=[0.5, 0.2, 0.2, 0.1],
    )

    receiver_risk = np.random.choice(
        [10, 30, 50, 70, 90],
        size=n,
        p=[0.3, 0.25, 0.2, 0.15, 0.1],
    )

    is_receiver_new = np.random.binomial(1, 0.4, size=n)

    receiver_country_risk = np.random.choice(
        [5, 20, 40, 60, 80],
        size=n,
        p=[0.3, 0.25, 0.2, 0.15, 0.1],
    )

    # --- Geo / channel features ---
    is_cross_border = np.random.binomial(1, 0.25, size=n)

    channel_risk = np.random.choice(
        [10, 30, 50, 70, 90],
        size=n,
        p=[0.4, 0.25, 0.2, 0.1, 0.05],
    )

    geo_distance_score = np.random.choice(
        [10, 30, 50, 70, 90],
        size=n,
        p=[0.4, 0.25, 0.2, 0.1, 0.05],
    )

    # --- Behavioural / relationship features ---
    days_since_first_txn = np.random.randint(1, 365, size=n)
    txn_to_high_risk_geo_30d = np.random.poisson(0.5, size=n)
    receiver_txn_count_30d = np.random.poisson(3, size=n)
    alerts_last_90d = np.random.poisson(0.4, size=n)

    # Flags for regimes
    very_high_amount = (avg_txn_amount_30d > 200000).astype(float)
    new_short_history = ((days_since_first_txn < 30) & (is_receiver_new == 1)).astype(float)
    high_risk_corridor = ((country_risk >= 60) & (receiver_country_risk >= 60)).astype(float)

    # --- Latent risk to build labels (richer, with interactions) ---
    latent_risk = (
        0.18 * (country_risk / 100)
        + 0.18 * (receiver_country_risk / 100)
        + 0.20 * is_pep
        + 0.16 * (occupation_risk / 100)
        + 0.20 * (receiver_risk / 100)
        + 0.14 * is_receiver_new
        + 0.16 * is_cross_border
        + 0.16 * (channel_risk / 100)
        + 0.16 * (txn_to_high_risk_geo_30d / 5)
        + 0.18 * (alerts_last_90d / 5)
        - 0.15 * (face_match_score / 100)
        - 0.15 * (kyc_quality / 100)
        + 0.20 * very_high_amount
        + 0.18 * new_short_history
        + 0.20 * high_risk_corridor
    )

    # scale to [0, 1]
    latent_risk = (latent_risk - latent_risk.min()) / (
        latent_risk.max() - latent_risk.min()
    )

    # deterministic label with margin band
    is_high_risk = np.zeros(n, dtype=int)
    is_high_risk[latent_risk >= 0.65] = 1
    is_high_risk[latent_risk <= 0.35] = 0

    mid_band = (latent_risk > 0.35) & (latent_risk < 0.65)
    noisy_labels = np.random.binomial(1, latent_risk[mid_band])
    is_high_risk[mid_band] = noisy_labels

    # IDs
    person_id = [f"user_{i:05d}" for i in range(n)]
    transaction_id = [f"txn_{i:07d}" for i in range(n)]

    df = pd.DataFrame({
        "person_id": person_id,
        "transaction_id": transaction_id,
        "country_risk": country_risk,
        "is_pep": is_pep,
        "occupation_risk": occupation_risk,
        "face_match_score": face_match_score,
        "kyc_quality": kyc_quality,
        "failed_attempts": failed_attempts,
        "avg_txn_amount_30d": avg_txn_amount_30d,
        "txn_count_7d": txn_count_7d,
        "receiver_type": receiver_type,
        "receiver_risk": receiver_risk,
        "is_receiver_new": is_receiver_new,
        "receiver_country_risk": receiver_country_risk,
        "is_cross_border": is_cross_border,
        "channel_risk": channel_risk,
        "geo_distance_score": geo_distance_score,
        "days_since_first_txn": days_since_first_txn,
        "txn_to_high_risk_geo_30d": txn_to_high_risk_geo_30d,
        "receiver_txn_count_30d": receiver_txn_count_30d,
        "alerts_last_90d": alerts_last_90d,
        "is_high_risk": is_high_risk,
    })

    return df

# =========================================================
# 2. Train Random Forest risk model (tuned)
# =========================================================

FEATURE_COLS = [
    "country_risk",
    "is_pep",
    "occupation_risk",
    "face_match_score",
    "kyc_quality",
    "failed_attempts",
    "avg_txn_amount_30d",
    "txn_count_7d",
    "receiver_type",
    "receiver_risk",
    "is_receiver_new",
    "receiver_country_risk",
    "is_cross_border",
    "channel_risk",
    "geo_distance_score",
    "days_since_first_txn",
    "txn_to_high_risk_geo_30d",
    "receiver_txn_count_30d",
    "alerts_last_90d",
]

TARGET_COL = "is_high_risk"

df = generate_customer_risk_dataset()

X = df[FEATURE_COLS]
y = df[TARGET_COL]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

rf_base = RandomForestClassifier(
    n_estimators=600,
    class_weight="balanced_subsample",
    n_jobs=-1,
    random_state=42,
)

param_dist = {
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 3, 5],
    "max_features": ["sqrt", "log2"],
}

search = RandomizedSearchCV(
    rf_base,
    param_distributions=param_dist,
    n_iter=12,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
    random_state=42,
)

search.fit(X_train, y_train)
rf = search.best_estimator_
print("Best RF params:", search.best_params_)

y_proba = rf.predict_proba(X_test)[:, 1]
y_pred = (y_proba >= 0.5).astype(int)

print("Random Forest ROC-AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))

# =========================================================
# 3. Isolation Forest anomaly model
# =========================================================

IF_FEATURE_COLS = [
    "avg_txn_amount_30d",
    "txn_count_7d",
    "receiver_type",
    "receiver_risk",
    "is_receiver_new",
    "receiver_country_risk",
    "is_cross_border",
    "channel_risk",
    "geo_distance_score",
    "days_since_first_txn",
    "txn_to_high_risk_geo_30d",
    "receiver_txn_count_30d",
    "alerts_last_90d",
]

X_if = df[IF_FEATURE_COLS].values

if_model = IsolationForest(
    n_estimators=500,
    max_samples="auto",
    contamination=0.05,
    max_features=1.0,
    n_jobs=-1,
    random_state=42,
)

if_model.fit(X_if)

scores_raw_all = if_model.score_samples(X_if)
scores_flipped_all = -scores_raw_all
IF_MIN = scores_flipped_all.min()
IF_MAX = scores_flipped_all.max()

def compute_anomaly_score(if_model, params: dict) -> dict:
    x_if = pd.DataFrame([params])[IF_FEATURE_COLS].values
    raw = if_model.score_samples(x_if)[0]
    flipped = -raw
    norm = (flipped - IF_MIN) / (IF_MAX - IF_MIN)
    score = int(round(norm * 100))

    if score < 30:
        level = "NORMAL"
    elif score < 70:
        level = "ELEVATED"
    else:
        level = "HIGH"

    return {
        "anomaly_score": score,
        "anomaly_level": level,
    }

# =========================================================
# 3b. Simple AML rule engine
# =========================================================

def rule_engine_score(params: dict) -> dict:
    score = 0

    # --- Existing High-risk country check ---
    if params["country_risk"] >= 60 or params["receiver_country_risk"] >= 60:
        score += 20
    if params["country_risk"] >= 60 and params["receiver_country_risk"] >= 60:
        score += 15

    # --- LOOPHOLE 3 FIX: TEMPORAL VELOCITY CHECK ---
    # 1. High Frequency Check: Many transactions in a short window
    if params.get("txn_count_7d", 0) > 40:
        score += 25  # Significant penalty for "Smurfing" frequency

    # 2. New Account Burst Check: Rapid activity on a fresh account
    if params.get("days_since_first_txn", 365) < 7 and params.get("txn_count_7d", 0) > 10:
        score += 30  # Classic "Mule Account" footprint

    # --- Existing Behaviour & KYC checks ---
    if params["alerts_last_90d"] >= 3:
        score += 20
    if params["avg_txn_amount_30d"] > 200000:
        score += 15
    if params["kyc_quality"] < 70 or params["face_match_score"] < 80:
        score += 10

    score = max(0, min(100, score))
    level = "HIGH" if score >= 70 else "MEDIUM" if score >= 30 else "LOW"

    return {"rule_score": score, "rule_level": level}



# =========================================================
# 4. SHAP explainability (better background, robust)
# =========================================================

bg_low = X_train[y_train == 0].sample(min(250, (y_train == 0).sum()), random_state=42)
bg_high = X_train[y_train == 1].sample(min(250, (y_train == 1).sum()), random_state=42)
rf_background = pd.concat([bg_low, bg_high], axis=0)

rf_explainer = shap.TreeExplainer(
    rf,
    rf_background,
    feature_perturbation="interventional",
    model_output="probability",
)

def explain_rf(params: dict) -> dict:
    x = pd.DataFrame([params])[FEATURE_COLS]
    shap_values_output = rf_explainer.shap_values(x)
    exp = rf_explainer.expected_value

    if isinstance(shap_values_output, list):
        phi = np.array(shap_values_output[1][0])
        if isinstance(exp, (list, np.ndarray)):
            exp_val = exp[1]
        else:
            exp_val = exp
    else:
        phi = np.array(shap_values_output[0])
        if isinstance(exp, (list, np.ndarray)):
            exp_val = exp[0]
        else:
            exp_val = exp

    return {
        "expected_value": float(np.array(exp_val).ravel()[0]),
        "shap_values": dict(zip(FEATURE_COLS, phi.ravel().tolist())),
    }

def explain_rf_advanced(params: dict) -> dict:
    x = pd.DataFrame([params])[FEATURE_COLS]
    shap_values_output = rf_explainer.shap_values(x)
    shap_interactions_output = rf_explainer.shap_interaction_values(x)

    if isinstance(shap_values_output, list):
        phi = np.array(shap_values_output[1][0])
        inter = np.array(shap_interactions_output[1][0])
    else:
        phi = np.array(shap_values_output[0])
        inter = np.array(shap_interactions_output[0])

    main_effects = dict(zip(FEATURE_COLS, phi.ravel().tolist()))

    interaction_pairs = []
    for i, fi in enumerate(FEATURE_COLS):
        for j in range(i + 1, len(FEATURE_COLS)):
            fj = FEATURE_COLS[j]
            interaction_pairs.append(((fi, fj), abs(inter[i, j])))
    interaction_pairs.sort(key=lambda t: t[1], reverse=True)
    top_interactions = [
        {"pair": pair, "strength": float(strength)}
        for (pair, strength) in interaction_pairs[:3]
    ]

    return {
        "main_effects": main_effects,
        "top_interactions": top_interactions,
    }

# SHAP for Isolation Forest (optional)
if_explainer = shap.TreeExplainer(if_model, X_if)

def explain_if(params: dict) -> dict:
    x = pd.DataFrame([params])[IF_FEATURE_COLS]
    shap_vals = if_explainer.shap_values(x)
    exp = if_explainer.expected_value

    shap_vals_row = np.array(shap_vals)[0]
    exp_val = float(np.array(exp).ravel()[0])

    return {
        "expected_value": exp_val,
        "shap_values": dict(zip(IF_FEATURE_COLS, shap_vals_row.tolist())),
    }

# =========================================================
# 5. Helpers: risk score, status, and SHAP-based reasons
# =========================================================

def compute_risk_score(model, params: dict) -> dict:
    x = pd.DataFrame([params])[FEATURE_COLS]
    proba_high = model.predict_proba(x)[0, 1]
    score = int(round(proba_high * 100))

    if score < 30:
        level = "LOW"
    elif score < 70:
        level = "MEDIUM"
    else:
        level = "HIGH"

    return {
        "probability_high_risk": float(proba_high),
        "risk_score": score,
        "risk_level": level,
    }

def top_reasons_from_shap(shap_values: dict, top_n: int = 2) -> list:
    sorted_feats = sorted(
        shap_values.items(),
        key=lambda kv: abs(kv[1]),
        reverse=True,
    )
    return sorted_feats[:top_n]

REASON_LABELS = {
    "country_risk": "country risk is high",
    "is_pep": "customer is a politically exposed person",
    "occupation_risk": "occupation has higher risk",
    "receiver_risk": "receiver risk is high",
    "is_receiver_new": "receiver is new for this customer",
    "receiver_country_risk": "receiver’s country risk is high",
    "is_cross_border": "transaction is cross-border",
    "channel_risk": "channel risk is high",
    "alerts_last_90d": "customer has multiple past alerts",
    "txn_to_high_risk_geo_30d": "many transactions to high-risk countries",
}

def build_reason_text(shap_values: dict, params: dict) -> str:
    ranked = sorted(
        shap_values.items(),
        key=lambda kv: abs(kv[1]),
        reverse=True,
    )

    filtered = []
    for feat, val in ranked:
        if feat == "is_pep" and params.get("is_pep", 0) == 0:
            continue
        if feat == "is_cross_border" and params.get("is_cross_border", 0) == 0:
            continue
        if feat == "is_receiver_new" and params.get("is_receiver_new", 0) == 0:
            continue
        filtered.append((feat, val))

    if not filtered:
        return "Model detected elevated risk based on multiple factors."

    (f1, v1) = filtered[0]
    (f2, v2) = filtered[1] if len(filtered) > 1 else (None, 0.0)

    label1 = REASON_LABELS.get(f1, f1)
    label2 = REASON_LABELS.get(f2, f2) if f2 else None

    strength = abs(v1)
    if strength < 0.02:
        prefix = "Risk is slightly influenced by "
    elif strength < 0.05:
        prefix = "Risk is moderately influenced by "
    else:
        prefix = "Risk is strongly driven by "

    if label2 and abs(v1) - abs(v2) < 0.01:
        return f"{prefix}{label1} and {label2}."
    else:
        return f"{prefix}{label1}."

# Official FIU-IND Grounds of Suspicion (GoS) Mapping
GOS_MAPPING = {
    "country_risk": "GOS_101",          # High Risk Jurisdictions
    "is_pep": "GOS_402",                # PEP Involvement
    "avg_txn_amount_30d": "GOS_201",    # Unusual Volume
    "txn_count_7d": "GOS_204",          # High Velocity/Structuring
    "kyc_quality": "GOS_501",           # Suspicious Documents
    "face_match_score": "GOS_502"       # Identity Mismatch
}

def derive_regulatory_status(risk_score, anomaly_score, rule_score, params):
    """
    Maps AI scores to official FIU-IND reporting priorities and GoS tags.
    """
    aggregate = max(risk_score, anomaly_score, rule_score)

    # 1. Determine Report Priority
    if aggregate >= 90:
        status, priority = "STR_P1", "URGENT_TERROR_FINANCING"
    elif aggregate >= 75:
        status, priority = "STR_P2", "HIGH_ML_SUSPICION"
    elif aggregate >= 50:
        status, priority = "MANUAL_REVIEW", "MEDIUM"
    else:
        status, priority = "ACCEPTED", "LOW"

    # 2. Extract GoS Tags based on triggering features
    triggered_gos = []
    if params.get("country_risk", 0) >= 60: triggered_gos.append(GOS_MAPPING["country_risk"])
    if params.get("is_pep"): triggered_gos.append(GOS_MAPPING["is_pep"])
    if params.get("txn_count_7d", 0) > 40: triggered_gos.append(GOS_MAPPING["txn_count_7d"])
    if params.get("avg_txn_amount_30d", 0) > 200000: triggered_gos.append(GOS_MAPPING["avg_txn_amount_30d"])

    return {
        "regulatory_status": status,
        "priority_level": priority,
        "gos_tags": triggered_gos
    }


'''def derive_status(risk_score: int, anomaly_score: int, rule_score: int) -> str:
    """
    Combine RF risk, anomaly, and rules into status.
    """
    aggregate = max(risk_score, anomaly_score, rule_score)

    if aggregate >= 90:
        return "REPORTED"
    if aggregate >= 80:
        return "FLAGGED"
    if aggregate >= 65:
        return "MANUAL_REVIEW"
    if aggregate >= 45:
        return "UNDER_OBSERVATION"
    if 30 <= aggregate < 45:
        return "ACCEPTED_UNDER_OBSERVATION"
    return "ACCEPTED" '''

# =========================================================
# 6. Compact API
# =========================================================

def score_compact(params: dict) -> dict:
    """
    Returns fully formatted summary:
      - risk_score (0–100)
      - anomaly_score (0–100)
      - rule_score (0–100)
      - risk_level (LOW / MEDIUM / HIGH)
      - anomaly_level (NORMAL / ELEVATED / HIGH)
      - rule_level (LOW / MEDIUM / HIGH)
      - explanation (string)
      - status (ACCEPTED, UNDER_OBSERVATION, MANUAL_REVIEW, FLAGGED, REPORTED, etc.)
    """
    risk = compute_risk_score(rf, params)
    anomaly = compute_anomaly_score(if_model, params)
    rules = rule_engine_score(params)

    rf_shap = explain_rf(params)
    explanation = build_reason_text(rf_shap["shap_values"], params)

    # FIX: Use the new regulatory status instead of the old derive_status
    reg_status = derive_regulatory_status(
        risk["risk_score"],
        anomaly["anomaly_score"],
        rules["rule_score"],
        params
    )

    return {
        "risk_score": risk["risk_score"],
        "anomaly_score": anomaly["anomaly_score"],
        "rule_score": rules["rule_score"],
        "risk_level": risk["risk_level"],
        "anomaly_level": anomaly["anomaly_level"],
        "rule_level": rules["rule_level"],
        "explanation": explanation,
        "status": reg_status["regulatory_status"], # Use official regulatory status
        "priority": reg_status["priority_level"]
    }

# =========================================================
# 7. Example usage
# =========================================================

example_input = {
 "country_risk": 80,              # High-risk country (e.g., Sanctioned)
    "is_pep": 1,                     # Customer is a Politically Exposed Person
    "occupation_risk": 80,           # High-risk industry (e.g., Arms or Crypto)
    "face_match_score": 65,          # Low match (potential identity fraud)
    "kyc_quality": 55,               # Poor documentation
    "failed_attempts": 3,            # Multiple failed KYC tries
    "avg_txn_amount_30d": 250000,    # Massive volume spike (> 200,000)
    "txn_count_7d": 45,              # High velocity/frequency
    "receiver_type": 3,              # High-risk receiver type
    "receiver_risk": 90,             # Receiver is already blacklisted
    "is_receiver_new": 1,            # First time sending to this person
    "receiver_country_risk": 80,     # Sending to a high-risk corridor
    "is_cross_border": 1,            # International movement
    "channel_risk": 90,              # High-risk channel (e.g., Dark Web wallet)
    "geo_distance_score": 90,        # Transaction from an impossible distance
    "days_since_first_txn": 5,       # Very new account (high risk for "mules")
    "txn_to_high_risk_geo_30d": 8,   # Pattern of sending to risky areas
    "receiver_txn_count_30d": 20,    # Receiver is receiving too many txns
    "alerts_last_90d": 5,            # Repeat offender/Multiple past alerts

}

summary = score_compact(example_input)
print(summary)




#ALERTS SENT TO GOVT SERVER THROUGH WEBHOOK
#We aren't just sending data; we are sending Authenticated Intelligence.
#By using HMAC-SHA256 signatures, we ensure our AML alerts meet the highest cybersecurity standards for financial infrastructure,
#preventing any adversarial tampering between our AI and the Regulator"

import requests
import json
import uuid
import hmac
import hashlib
from datetime import datetime, timezone

# 1. SECURITY SETTINGS (In production, these come from environment variables)
WEBHOOK_URL = "https://webhook.site/b687ed92-0daf-4454-842e-64734c026e04"
SECRET_KEY = "HACKATHON_GOVT_SECRET_2026"  # The shared secret key

summary = score_compact(example_input)
def send_secure_report(summary_data, original_input):
    """
    Wraps the AI summary in metadata and signs it with a digital signature.
    """
    reg_data = derive_regulatory_status(
        summary_data["risk_score"],
        summary_data["anomaly_score"],
        summary_data["rule_score"],
        original_input
    )
    # Create Payload
    payload = {
        "metadata": {
            "report_id": str(uuid.uuid4()),
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "entity": "AstuComply",
            "fiu_report_type": "STR"  # Suspicious Transaction Report
        },
        "regulatory_analysis": reg_data, # NEW: Includes official GoS Tags and Priority
        "ai_analysis": summary_data
    }

    # Convert payload to string for signing
    payload_str = json.dumps(payload)

    # --- LOOPHOLE 4 FIX: GENERATE DIGITAL SIGNATURE ---
    # We sign the data using the secret key so the Govt knows it's us
    signature = hmac.new(
        SECRET_KEY.encode(),
        payload_str.encode(),
        hashlib.sha256
    ).hexdigest()

    # Add Security Headers
    headers = {
        "Content-Type": "application/json",
        "X-AML-Signature": signature,  # The digital "seal"
        "X-API-KEY": "SECRET_KEY_12345" # Second layer of authentication
    }

    # Send the Secure POST
    response = requests.post(WEBHOOK_URL, data=payload_str, headers=headers)

    if response.status_code == 200:
        print("✅ SUCCESS: Encrypted & Signed Report sent!")
        print(f"Digital Signature (HMAC): {signature[:10]}...")
    else:
        print(f"❌ SECURITY ERROR: Code {response.status_code}")

# Execute
send_secure_report(summary, example_input)




Best RF params: {'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'max_depth': None}
Random Forest ROC-AUC: 0.8752531301034295
              precision    recall  f1-score   support

           0       0.90      0.93      0.91       835
           1       0.56      0.45      0.50       165

    accuracy                           0.85      1000
   macro avg       0.73      0.69      0.71      1000
weighted avg       0.84      0.85      0.84      1000

{'risk_score': 65, 'anomaly_score': 150, 'rule_score': 100, 'risk_level': 'MEDIUM', 'anomaly_level': 'HIGH', 'rule_level': 'HIGH', 'explanation': 'Risk is strongly driven by country risk is high and customer is a politically exposed person.', 'status': 'STR_P1', 'priority': 'URGENT_TERROR_FINANCING'}
✅ SUCCESS: Encrypted & Signed Report sent!
Digital Signature (HMAC): d8cb2b75e7...
