In [None]:
!pip -q install lime scikit-learn pandas numpy


In [None]:
import pandas as pd
import numpy as np

FILE_PATH = "/content/dataset/loan_fraud_dataset_100k.csv"   # <-- change
LABEL_COL = "loan_approved"


df = pd.read_csv(FILE_PATH)

print("Shape:", df.shape)
print("First columns:", df.columns.tolist()[:25])
print("\nLabel counts:")
print(df[LABEL_COL].value_counts(dropna=False).head(10))


Shape: (100000, 22)
First columns: ['age', 'employment_years', 'education_level', 'employment_type', 'monthly_income', 'fixed_monthly_expenses', 'debt_to_income_ratio', 'savings_balance', 'loan_amount', 'loan_duration_months', 'loan_purpose', 'credit_score', 'late_payments_12m', 'missed_payments_12m', 'utility_bill_on_time_ratio', 'income_inflation_ratio', 'document_mismatch_flag', 'application_velocity', 'geo_location_mismatch', 'metadata_anomaly_score', 'is_fraud', 'loan_approved']

Label counts:
loan_approved
1    93627
0     6373
Name: count, dtype: int64


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

y = df[LABEL_COL].astype(int)
FRAUD_ONLY_COLS = [
    "is_fraud",
    "document_mismatch_flag",
    "geo_location_mismatch",
    "metadata_anomaly_score",
    "income_inflation_ratio",
    "application_velocity"
]

X = df.drop(columns=[LABEL_COL] + FRAUD_ONLY_COLS)


num_cols = X.select_dtypes(include=["number"]).columns.tolist()
cat_cols = X.select_dtypes(exclude=["number"]).columns.tolist()

numeric_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

categorical_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, num_cols),
        ("cat", categorical_pipe, cat_cols),
    ],
    remainder="drop"
)

clf = LogisticRegression(max_iter=2000, class_weight="balanced")

pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("clf", clf)
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

pipe.fit(X_train, y_train)
proba = pipe.predict_proba(X_test)[:, 1]
print("ROC AUC:", round(roc_auc_score(y_test, proba), 4))


ROC AUC: 0.8522


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
APPROVE_TH = 0.70
REJECT_TH  = 0.40

def risk_band_from_pd(pd: float) -> str:
    if pd >= APPROVE_TH:
        return "APPROVED"
    if pd <= REJECT_TH:
        return "REJECTED"
    return "MIDDLE"

SNAPSHOT_FEATURES = [
    "monthly_income",
    "fixed_monthly_expenses",
    "debt_to_income_ratio",
    "employment_years",
    "employment_type",
    "loan_amount",
    "loan_duration_months",
    "utility_bill_on_time_ratio"
]





In [None]:
def fraud_check(raw: dict):
    flags = []
    score = 0.0

    # -------------------------
    # HARD flags (auto block)
    # -------------------------
    if int(raw.get("is_fraud", 0)) == 1:
        flags.append({"name": "explicit_fraud_label", "severity": "hard"})
        score += 0.60

    if int(raw.get("document_mismatch_flag", 0)) == 1:
        flags.append({"name": "document_mismatch", "severity": "hard"})
        score += 0.35

    if float(raw.get("metadata_anomaly_score", 0.0)) >= 0.80:
        flags.append({"name": "metadata_anomaly_high", "severity": "hard"})
        score += 0.35

    if float(raw.get("income_inflation_ratio", 1.0)) >= 2.50:
        flags.append({"name": "income_inflation_extreme", "severity": "hard"})
        score += 0.35

    # -------------------------
    # SOFT flags (review)
    # -------------------------
    if int(raw.get("geo_location_mismatch", 0)) == 1:
        flags.append({"name": "geo_location_mismatch", "severity": "soft"})
        score += 0.15

    # Financial inconsistency
    income = raw.get("monthly_income", None)
    expenses = raw.get("fixed_monthly_expenses", None)
    if income is not None and expenses is not None:
        try:
            income = float(income); expenses = float(expenses)
            if income > 0 and expenses > income:
                flags.append({"name": "expenses_gt_income", "severity": "soft"})
                score += 0.15
        except Exception:
            pass

    # Unusual application behavior
    if int(raw.get("application_velocity", 0)) >= 3:
        flags.append({"name": "rapid_multiple_applications", "severity": "soft"})
        score += 0.15

    # Payment behavior red flags (soft)
    if int(raw.get("missed_payments_12m", 0)) >= 3:
        flags.append({"name": "many_missed_payments_12m", "severity": "soft"})
        score += 0.12

    if float(raw.get("utility_bill_on_time_ratio", 1.0)) < 0.30:
        flags.append({"name": "low_utility_on_time_ratio", "severity": "soft"})
        score += 0.12

    # Moderate income inflation (soft)
    infl = float(raw.get("income_inflation_ratio", 1.0))
    if 1.50 <= infl < 2.50:
        flags.append({"name": "income_inflation_moderate", "severity": "soft"})
        score += 0.12

    score = float(min(score, 1.0))
    decision = "BLOCK" if any(f["severity"] == "hard" for f in flags) else "PASS"

    return {"decision": decision, "fraud_score": round(score, 3), "flags": flags}


In [None]:
from lime.lime_tabular import LimeTabularExplainer

prep = pipe.named_steps["prep"]
clf  = pipe.named_steps["clf"]

# transformed training data for LIME
X_train_trans = prep.transform(X_train)

# names of transformed features (including one-hot)
try:
    feature_names_trans = prep.get_feature_names_out()
except Exception:
    feature_names_trans = np.array([f"f{i}" for i in range(X_train_trans.shape[1])])

explainer = LimeTabularExplainer(
    training_data=np.array(X_train_trans),
    feature_names=feature_names_trans.tolist(),
    mode="classification",
    discretize_continuous=True
)

def predict_proba_on_transformed(Z):
    # Z is already transformed numeric
    return clf.predict_proba(Z)


In [None]:
def get_top_negative_lime(model_explainer, X_row_raw: pd.DataFrame, max_neg=4):
    # transform single row
    X_row_trans = prep.transform(X_row_raw)
    x = np.array(X_row_trans)[0]

    exp = model_explainer.explain_instance(
        x,
        predict_proba_on_transformed,
        num_features=12
    )

    # exp.as_list() returns pairs (feature_description, weight)
    # Keep only negative contributions (sorted strongest negative first)
    neg = [(feat, w) for feat, w in exp.as_list() if w < 0]
    neg = sorted(neg, key=lambda t: t[1])[:max_neg]  # most negative weights

    out = []
    for feat_desc, w in neg:
        out.append({
            "feature": feat_desc,                 # transformed feature name/condition
            "contribution": round(float(w), 3)
        })
    return out


In [None]:
FEATURE_DICTIONARY = {
    "age": "Applicant age",
    "employment_years": "Years of employment history",
    "education_level": "Highest education level",
    "employment_type": "Employment type (e.g., permanent/contract)",
    "monthly_income": "Declared monthly income",
    "fixed_monthly_expenses": "Fixed monthly expenses",
    "debt_to_income_ratio": "Debt-to-income ratio",
    "savings_balance": "Savings balance",
    "loan_amount": "Requested loan amount",
    "loan_duration_months": "Requested loan duration in months",
    "loan_purpose": "Purpose of the loan",
    "credit_score": "Credit score (given field in dataset)",
    "late_payments_12m": "Number of late payments in last 12 months",
    "missed_payments_12m": "Number of missed payments in last 12 months",
    "utility_bill_on_time_ratio": "Share of utility bills paid on time",
    "income_inflation_ratio": "Declared income / expected income indicator",
    "document_mismatch_flag": "1 if document template/fields mismatch expected patterns",
    "application_velocity": "Number of applications in a short time window",
    "geo_location_mismatch": "1 if location signals mismatch",
    "metadata_anomaly_score": "Anomaly score from document metadata",
    "is_fraud": "Fraud label/flag in dataset",
    "loan_approved": "Approval decision label"
}


In [None]:
def build_feature_snapshot(X_row_raw: pd.DataFrame):
    snap = {}
    row = X_row_raw.iloc[0].to_dict()
    for f in SNAPSHOT_FEATURES:
        if f in row:
            v = row[f]
            # make JSON-safe
            if isinstance(v, (np.integer, np.floating)):
                v = float(v)
            snap[f] = v
    return snap

def build_applicant_output(X_row_raw: pd.DataFrame):
    # 1) PD from scoring model
    pd_val = float(pipe.predict_proba(X_row_raw)[0, 1])

    # 2) Band from PD thresholds
    band = risk_band_from_pd(pd_val)

    # 3) Fraud check (uses raw values)
    raw = X_row_raw.iloc[0].to_dict()
    fraud = fraud_check(raw)

    # 4) Fraud override (final authority)
    reason_type = "CREDIT"
    if fraud["decision"] == "BLOCK":
        band = "REJECTED"
        reason_type = "FRAUD"

    # 5) LIME negatives (still ok to show even for fraud; you can also empty it if you want)
    lime_neg = get_top_negative_lime(explainer, X_row_raw, max_neg=4)

    return {
        "pd": round(pd_val, 3),
        "risk_band": band,
        "reason_type": reason_type,
        "thresholds": {
            "approve": float(APPROVE_TH),
            "reject": float(REJECT_TH)
        },
        "top_negative_factors": lime_neg,
        "feature_snapshot": build_feature_snapshot(X_row_raw),
        "fraud": fraud
    }



In [None]:
i = 0  # change index
X_one = X.iloc[[i]].copy()

out = build_applicant_output(X_one)
out


{'pd': 0.325,
 'risk_band': 'REJECTED',
 'reason_type': 'CREDIT',
 'thresholds': {'approve': 0.7, 'reject': 0.4},
 'top_negative_factors': [{'feature': 'num__credit_score <= 589.00',
   'contribution': -0.293},
  {'feature': 'num__debt_to_income_ratio > 0.65', 'contribution': -0.108},
  {'feature': '0.00 < cat__employment_type_temporary <= 1.00',
   'contribution': -0.101},
  {'feature': '0.00 < cat__education_level_high_school <= 1.00',
   'contribution': -0.092}],
 'feature_snapshot': {'monthly_income': 1047.36,
  'fixed_monthly_expenses': 724.24,
  'debt_to_income_ratio': 0.691,
  'employment_years': 9.1,
  'employment_type': 'temporary',
  'loan_amount': 895.72,
  'loan_duration_months': 60,
  'utility_bill_on_time_ratio': 0.91},
 'fraud': {'decision': 'PASS', 'fraud_score': 0.0, 'flags': []}}

In [None]:
# Check PD distribution on test set
pd_values = pipe.predict_proba(X_test)[:, 1]

print("PD min:", pd_values.min())
print("PD mean:", pd_values.mean())
print("PD max:", pd_values.max())


PD min: 0.017670301198973944
PD mean: 0.6550188535777478
PD max: 0.9960528018950798


In [None]:
df["loan_approved"].value_counts(normalize=True)


Unnamed: 0_level_0,proportion
loan_approved,Unnamed: 1_level_1
1,0.93627
0,0.06373


In [None]:
pd_values = pipe.predict_proba(X_test)[:, 1]


In [None]:
import numpy as np

APPROVE_TH = np.percentile(pd_values, 70)   # top 30% → APPROVED
REJECT_TH  = np.percentile(pd_values, 30)   # bottom 30% → REJECTED

print("APPROVE_TH:", APPROVE_TH)
print("REJECT_TH:", REJECT_TH)


APPROVE_TH: 0.8163300746270503
REJECT_TH: 0.5448854156317134


In [None]:
def risk_band_from_pd(pd):
    if pd >= APPROVE_TH:
        return "APPROVED"
    elif pd <= REJECT_TH:
        return "REJECTED"
    else:
        return "MIDDLE"


In [None]:
bands = {
    "APPROVED": (pd_values >= APPROVE_TH).mean(),
    "REJECTED": (pd_values <= REJECT_TH).mean(),
    "MIDDLE": ((pd_values > REJECT_TH) & (pd_values < APPROVE_TH)).mean()
}

bands


{'APPROVED': np.float64(0.3),
 'REJECTED': np.float64(0.3),
 'MIDDLE': np.float64(0.4)}

In [None]:
i = 10
result = build_applicant_output(X.iloc[[i]])
result


{'pd': 0.688,
 'risk_band': 'MIDDLE',
 'reason_type': 'CREDIT',
 'thresholds': {'approve': 0.8163300746270503, 'reject': 0.5448854156317134},
 'top_negative_factors': [{'feature': 'cat__employment_type_self_employed > 0.00',
   'contribution': -0.122},
  {'feature': 'cat__education_level_master > 0.00', 'contribution': -0.103}],
 'feature_snapshot': {'monthly_income': 1272.4,
  'fixed_monthly_expenses': 719.13,
  'debt_to_income_ratio': 0.565,
  'employment_years': 7.6,
  'employment_type': 'self_employed',
  'loan_amount': 1100.18,
  'loan_duration_months': 36,
  'utility_bill_on_time_ratio': 0.99},
 'fraud': {'decision': 'PASS', 'fraud_score': 0.0, 'flags': []}}

In [None]:
X_fraud = X.iloc[[i]].copy()
X_fraud["document_mismatch_flag"] = 1
X_fraud["metadata_anomaly_score"] = 0.95

build_applicant_output(X_fraud)


{'pd': 0.688,
 'risk_band': 'REJECTED',
 'reason_type': 'FRAUD',
 'thresholds': {'approve': 0.8163300746270503, 'reject': 0.5448854156317134},
 'top_negative_factors': [{'feature': 'cat__employment_type_self_employed > 0.00',
   'contribution': -0.125},
  {'feature': 'cat__education_level_master > 0.00', 'contribution': -0.118}],
 'feature_snapshot': {'monthly_income': 1272.4,
  'fixed_monthly_expenses': 719.13,
  'debt_to_income_ratio': 0.565,
  'employment_years': 7.6,
  'employment_type': 'self_employed',
  'loan_amount': 1100.18,
  'loan_duration_months': 36,
  'utility_bill_on_time_ratio': 0.99},
 'fraud': {'decision': 'BLOCK',
  'fraud_score': 0.7,
  'flags': [{'name': 'document_mismatch', 'severity': 'hard'},
   {'name': 'metadata_anomaly_high', 'severity': 'hard'}]}}

In [116]:
REQUIRED_KEYS = {
    "pd", "risk_band", "thresholds", "top_negative_factors",
    "feature_snapshot", "fraud"
}

ALLOWED_BANDS = {"APPROVED", "MIDDLE", "REJECTED"}
ALLOWED_FRAUD_DECISIONS = {"PASS", "BLOCK"}

def validate_output(out: dict):
    # keys
    assert REQUIRED_KEYS.issubset(out.keys()), f"Missing keys: {REQUIRED_KEYS - set(out.keys())}"

    # pd
    assert isinstance(out["pd"], (int, float)), "pd must be numeric"
    assert 0.0 <= out["pd"] <= 1.0, f"pd out of range: {out['pd']}"

    # thresholds
    th = out["thresholds"]
    assert "approve" in th and "reject" in th, "thresholds must have approve/reject"
    assert isinstance(th["approve"], (int, float)) and isinstance(th["reject"], (int, float)), "thresholds must be numeric"
    assert th["reject"] < th["approve"], f"thresholds invalid: reject={th['reject']} approve={th['approve']}"

    # risk band
    assert out["risk_band"] in ALLOWED_BANDS, f"invalid risk_band: {out['risk_band']}"

    # fraud block override
    fraud = out["fraud"]
    assert "decision" in fraud and "flags" in fraud, "fraud must have decision + flags"
    assert fraud["decision"] in ALLOWED_FRAUD_DECISIONS, f"invalid fraud decision: {fraud['decision']}"
    if fraud["decision"] == "BLOCK":
        assert out["risk_band"] == "REJECTED", "fraud BLOCK must force REJECTED"

    # top negative factors
    factors = out["top_negative_factors"]
    assert isinstance(factors, list), "top_negative_factors must be list"
    assert len(factors) <= 4, "top_negative_factors must be max 4"
    for f in factors:
        assert "feature" in f and "contribution" in f, "each factor must have feature + contribution"
        assert isinstance(f["contribution"], (int, float)), "contribution must be numeric"
        assert f["contribution"] <= 0, f"factor is not negative: {f}"  # keep only negatives

    # snapshot
    snap = out["feature_snapshot"]
    assert isinstance(snap, dict), "feature_snapshot must be dict"
    for k in snap.keys():
        assert k in SNAPSHOT_FEATURES, f"snapshot contains unexpected key: {k}"

    return True


In [150]:
def analyze_application(raw: dict):
    """
    Core decision intelligence logic
    """

    # ---- Prediction ----
    X = pd.DataFrame([raw])
    proba = pipe.predict_proba(X)[0]

    # approve class = 1 (based on your LABEL_COL)
    approve_idx = list(pipe.classes_).index(1)
    p_approve = proba[approve_idx]
    pd_score = 1.0 - p_approve

    band = risk_band_from_pd(p_approve)

    # ---- Fraud ----
    fraud_result = fraud_check(raw)

    # ---- LIME ----
    exp = explainer.explain_instance(
        prep.transform(X)[0],
        clf.predict_proba,
        num_features=10
    )

    lime_features = [
        {"feature": f, "weight": float(w)}
        for f, w in exp.as_list()
    ]

    # ---- Decision logic ----
    final_decision = band

    # fraud overrides model decision
    if fraud_result.get("decision") == "BLOCK":
        final_decision = "BLOCKED_FRAUD"
    elif fraud_result.get("severity") == "soft" or fraud_result.get("fraud_score", 0) >= 0.25:
        # borderline or suspicious cases
        if band == "APPROVED":
            final_decision = "MANUAL_REVIEW"
        else:
            final_decision = band


    return {
    "pd_percent": round(pd_score * 100, 2),
    "approve_probability": round(p_approve * 100, 2),
    "risk_band": band,
    "fraud": fraud_result,
    "lime_features": lime_features,
    "recommendation": final_decision,
    "summary": build_summary(
        final_decision,
        round(pd_score * 100, 2),
        fraud_result,
        lime_features
    )
}



In [151]:
test_app = {
    "age": 28,
    "education_level": 2,
    "employment_type": 1,
    "employment_years": 3,
    "monthly_income": 1800,
    "fixed_monthly_expenses": 700,
    "debt_to_income_ratio": 0.25,
    "savings_balance": 1200,
    "loan_amount": 4000,
    "loan_duration_months": 18,
    "loan_purpose": 3,
    "credit_score": 650,
    "late_payments_12m": 1,
    "missed_payments_12m": 0,
    "utility_bill_on_time_ratio": 0.9,
    "is_fraud": 0,
    "document_mismatch_flag": 0
}

analyze_application(test_app)


{'pd_percent': np.float64(3.89),
 'approve_probability': np.float64(96.11),
 'risk_band': 'APPROVED',
 'fraud': {'decision': 'PASS', 'fraud_score': 0.0, 'flags': []},
 'lime_features': [{'feature': 'num__missed_payments_12m <= 0.00',
   'weight': 0.1332194563435222},
  {'feature': 'cat__employment_type_self_employed <= 0.00',
   'weight': 0.12211531675549658},
  {'feature': 'cat__employment_type_permanent <= 0.00',
   'weight': 0.11161905204643227},
  {'feature': 'cat__education_level_bachelor <= 0.00',
   'weight': 0.1107440230276383},
  {'feature': 'cat__education_level_master <= 0.00',
   'weight': 0.10966383234424634},
  {'feature': 'num__employment_years <= 4.60', 'weight': -0.1052956358586917},
  {'feature': 'cat__loan_purpose_car <= 0.00', 'weight': 0.10200399931072701},
  {'feature': 'cat__employment_type_temporary <= 0.00',
   'weight': 0.0954239943197001},
  {'feature': '649.00 < num__credit_score <= 711.00',
   'weight': 0.09342210482114167},
  {'feature': 'cat__education_le

In [152]:
def run_full_workflow(test_case_name: str, app_data: dict):
    print("=" * 60)
    print(f"TEST CASE: {test_case_name}")
    print("=" * 60)

    result = analyze_application(app_data)

    print("Decision:", result["recommendation"])
    print("Risk band:", result["risk_band"])
    print("PD (%):", result["pd_percent"])
    print("Approve probability (%):", result["approve_probability"])

    print("\nFraud check:")
    print(result["fraud"])

    print("\nTop LIME factors:")
    for f in result["lime_features"][:5]:
        print(f"- {f['feature']}: {round(f['weight'], 3)}")

    print("\nSTATUS: WORKFLOW COMPLETED ✅")


In [153]:
borderline_case = {
    "age": 30,
    "education_level": 2,
    "employment_type": 1,
    "employment_years": 2.5,
    "monthly_income": 1500,
    "fixed_monthly_expenses": 950,
    "debt_to_income_ratio": 0.55,
    "savings_balance": 400,
    "loan_amount": 5000,
    "loan_duration_months": 24,
    "loan_purpose": 3,
    "credit_score": 610,
    "late_payments_12m": 2,
    "missed_payments_12m": 0,
    "utility_bill_on_time_ratio": 0.75,
    "is_fraud": 0,
    "document_mismatch_flag": 0
}

run_full_workflow("BORDERLINE CASE", borderline_case)


TEST CASE: BORDERLINE CASE
Decision: APPROVED
Risk band: APPROVED
PD (%): 12.53
Approve probability (%): 87.47

Fraud check:
{'decision': 'PASS', 'fraud_score': 0.0, 'flags': []}

Top LIME factors:
- num__missed_payments_12m <= 0.00: 0.129
- cat__employment_type_self_employed <= 0.00: 0.117
- cat__education_level_master <= 0.00: 0.112
- cat__education_level_bachelor <= 0.00: 0.111
- cat__employment_type_temporary <= 0.00: 0.111

STATUS: WORKFLOW COMPLETED ✅


In [154]:
def build_summary(decision, pd_percent, fraud_result, lime_features):
    if decision == "BLOCKED_FRAUD":
        return "Blocked due to high-confidence document inconsistency. Manual verification required before any approval."

    top = lime_features[:3]
    reasons = ", ".join([t["feature"].split(":")[0].replace("num__", "").replace("cat__", "") for t in top])

    if decision == "APPROVED":
        return f"Approved with low risk (PD {pd_percent}%). Key supporting factors: {reasons}."
    if decision == "REJECTED":
        return f"Rejected due to high risk (PD {pd_percent}%). Key drivers: {reasons}."
    return f"Manual review needed (PD {pd_percent}%). Key signals: {reasons}."


In [155]:
def build_summary(decision, pd_percent, fraud_result, lime_features):
    # Fraud override message
    if decision == "BLOCKED_FRAUD":
        flags = fraud_result.get("flags", [])
        flag_names = ", ".join([f.get("name", "unknown_flag") for f in flags]) if flags else "inconsistency"
        return f"Blocked due to fraud signal: {flag_names}. Manual verification required."

    # Build short reasons from top LIME features
    top = lime_features[:3] if lime_features else []
    cleaned = []
    for t in top:
        feat = t.get("feature", "")
        feat = feat.split(":")[0]
        feat = feat.replace("num__", "").replace("cat__", "")
        cleaned.append(feat)

    reasons = ", ".join(cleaned) if cleaned else "key risk factors"

    if decision == "APPROVED":
        return f"Approved with low estimated risk (PD {pd_percent}%). Key factors: {reasons}."
    if decision == "REJECTED":
        return f"Rejected due to high estimated risk (PD {pd_percent}%). Main drivers: {reasons}."
    return f"Manual review required (PD {pd_percent}%). Key signals: {reasons}."


In [156]:
res = analyze_application(approved_case)
res["summary"]


'Approved with low estimated risk (PD 0.64%). Key factors: credit_score > 711.00, missed_payments_12m <= 0.00, employment_type_self_employed <= 0.00.'

In [157]:
import pandas as pd

def build_summary(decision, pd_percent, fraud_result, lime_features):
    if decision == "BLOCKED_FRAUD":
        flags = fraud_result.get("flags", [])
        flag_names = ", ".join([f.get("name", "unknown_flag") for f in flags]) if flags else "inconsistency"
        return f"Blocked due to fraud signal: {flag_names}. Manual verification required."

    top = lime_features[:3] if lime_features else []
    cleaned = []
    for t in top:
        feat = t.get("feature", "")
        feat = feat.split(":")[0].replace("num__", "").replace("cat__", "")
        cleaned.append(feat)

    reasons = ", ".join(cleaned) if cleaned else "key risk factors"

    if decision == "APPROVED":
        return f"Approved with low estimated risk (PD {pd_percent}%). Key factors: {reasons}."
    if decision == "REJECTED":
        return f"Rejected due to high estimated risk (PD {pd_percent}%). Main drivers: {reasons}."
    return f"Manual review required (PD {pd_percent}%). Key signals: {reasons}."


def analyze_application(raw: dict, num_lime_features: int = 10):
    # 1) Fill missing fields to avoid sklearn missing-column errors
    row = {col: raw.get(col, DEFAULTS.get(col, 0)) for col in FEATURE_COLS}
    X = pd.DataFrame([row])

    # 2) Model prediction
    proba = pipe.predict_proba(X)[0]
    approve_idx = list(pipe.classes_).index(1)  # assumes approve label = 1
    p_approve = float(proba[approve_idx])
    pd_score = 1.0 - p_approve  # PD in 0..1

    # 3) Band (your function uses approve probability thresholds)
    band = risk_band_from_pd(p_approve)

    # 4) Fraud layer
    fraud_result = fraud_check(row)

    # 5) LIME explanations (uses transformed row)
    exp = explainer.explain_instance(
        prep.transform(X)[0],
        clf.predict_proba,
        num_features=num_lime_features
    )

    lime_features = [{"feature": f, "weight": float(w)} for f, w in exp.as_list()]

    # 6) Final decision override logic
    final_decision = band
    if fraud_result.get("decision") == "BLOCK":
        final_decision = "BLOCKED_FRAUD"
    elif band == "MIDDLE":
        final_decision = "MANUAL_REVIEW"

    # 7) Summary text
    pd_percent = round(pd_score * 100, 2)
    summary = build_summary(final_decision, pd_percent, fraud_result, lime_features)

    return {
        "recommendation": final_decision,
        "risk_band": band,
        "approve_probability_percent": round(p_approve * 100, 2),
        "pd_percent": pd_percent,
        "fraud": fraud_result,
        "lime_features": lime_features,
        "summary": summary,
        "input_row": row
    }


def print_decision_report(case_name: str, result: dict, top_k: int = 5):
    print("=" * 72)
    print(f"CASE: {case_name}")
    print("=" * 72)

    print(f"Final decision: {result['recommendation']}")
    print(f"Model band:     {result['risk_band']}")
    print(f"Approve prob:   {result['approve_probability_percent']}%")
    print(f"PD:            {result['pd_percent']}%")

    print("\nFraud:")
    print(result["fraud"])

    print("\nDecision summary:")
    print(result["summary"])

    print("\nTop LIME factors:")
    for item in result["lime_features"][:top_k]:
        print(f"- {item['feature']}: {round(item['weight'], 3)}")

    print("\nSTATUS: FULL WORKFLOW OK ✅")
    print("=" * 72)



In [158]:
APPROVED_CASE = {
    "age": 32,
    "education_level": 3,
    "employment_type": 1,
    "employment_years": 6,
    "monthly_income": 3000,
    "fixed_monthly_expenses": 800,
    "debt_to_income_ratio": 0.20,
    "savings_balance": 5000,
    "loan_amount": 6000,
    "loan_duration_months": 24,
    "loan_purpose": 2,
    "credit_score": 750,
    "late_payments_12m": 0,
    "missed_payments_12m": 0,
    "utility_bill_on_time_ratio": 0.98,
    "is_fraud": 0,
    "document_mismatch_flag": 0
}

REJECTED_CASE = {
    "age": 24,
    "education_level": 1,
    "employment_type": 0,
    "employment_years": 1,
    "monthly_income": 900,
    "fixed_monthly_expenses": 700,
    "debt_to_income_ratio": 0.78,
    "savings_balance": 100,
    "loan_amount": 8000,
    "loan_duration_months": 36,
    "loan_purpose": 4,
    "credit_score": 420,
    "late_payments_12m": 4,
    "missed_payments_12m": 2,
    "utility_bill_on_time_ratio": 0.40,
    "is_fraud": 0,
    "document_mismatch_flag": 0
}

# Borderline: should land in your "MIDDLE" zone (may need tiny tweaks)
MANUAL_REVIEW_CASE = {
    "age": 30,
    "education_level": 2,
    "employment_type": 1,
    "employment_years": 2.5,
    "monthly_income": 1500,
    "fixed_monthly_expenses": 950,
    "debt_to_income_ratio": 0.55,
    "savings_balance": 400,
    "loan_amount": 5000,
    "loan_duration_months": 24,
    "loan_purpose": 3,
    "credit_score": 610,
    "late_payments_12m": 2,
    "missed_payments_12m": 0,
    "utility_bill_on_time_ratio": 0.75,
    "is_fraud": 0,
    "document_mismatch_flag": 0
}

# Fraud override: model may approve, but fraud blocks
FRAUD_BLOCK_CASE = {
    "age": 29,
    "education_level": 2,
    "employment_type": 1,
    "employment_years": 2,
    "monthly_income": 2200,
    "fixed_monthly_expenses": 900,
    "debt_to_income_ratio": 0.45,
    "savings_balance": 800,
    "loan_amount": 5000,
    "loan_duration_months": 18,
    "loan_purpose": 3,
    "credit_score": 610,
    "late_payments_12m": 1,
    "missed_payments_12m": 0,
    "utility_bill_on_time_ratio": 0.85,
    "is_fraud": 0,
    "document_mismatch_flag": 1
}


In [160]:
# All model input features (same as training)
FEATURE_COLS = [c for c in df.columns if c != LABEL_COL]

print("Number of features:", len(FEATURE_COLS))
print("First 10 features:", FEATURE_COLS[:10])


Number of features: 21
First 10 features: ['age', 'employment_years', 'education_level', 'employment_type', 'monthly_income', 'fixed_monthly_expenses', 'debt_to_income_ratio', 'savings_balance', 'loan_amount', 'loan_duration_months']


In [161]:
# Numeric defaults = median
num_defaults = df[FEATURE_COLS].select_dtypes(include=["number"]).median().to_dict()

# Categorical defaults = most frequent value
cat_defaults = (
    df[FEATURE_COLS]
    .select_dtypes(exclude=["number"])
    .mode(dropna=True)
    .iloc[0]
    .to_dict()
)

DEFAULTS = {**num_defaults, **cat_defaults}

print("Defaults ready for", len(DEFAULTS), "features")


Defaults ready for 21 features


In [164]:
import os, pickle

os.makedirs("models", exist_ok=True)

with open("models/loan_model.pkl", "wb") as f:
    pickle.dump(pipe, f)

print("Model saved at:", os.path.abspath("models/loan_model.pkl"))


Model saved at: /content/models/loan_model.pkl
