<a href="https://colab.research.google.com/github/appliedcode/mthree-c422/blob/mthree-c422-Likhitha/Responsible_AI_DC_Practice_md.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import json
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
import shap
import matplotlib.pyplot as plt


In [2]:
# Seed NumPy's global RNG so the synthetic cohort below is the same on every run.
np.random.seed(2025)
# Number of synthetic patient records to generate.
n = 300
# Category vocabularies for the categorical columns.
genders = ["Male", "Female"]
age_groups = ["18-30", "31-45", "46-60", "60+"]
trust_levels = ["High", "Medium", "Low"]

# Build the synthetic patient table. Each column is sampled independently from
# the global RNG; AI_Diagnosis in particular is drawn without reference to the
# clinical columns, so it carries no learnable signal by construction.
# NOTE: the np.random calls are evaluated in the order the keys are written, so
# adding, removing or reordering columns changes the generated values even
# though the seed stays the same.
df = pd.DataFrame({
    "Patient_ID": range(1, n+1),
    # Slightly more "Female" than "Male" (p = 0.52 vs 0.48).
    "Gender": np.random.choice(genders, n, p=[0.48, 0.52]),
    # No p= given, so the four age bands are equally likely.
    "Age_Group": np.random.choice(age_groups, n),
    # randint excludes the upper bound: integers in [90, 180) and [150, 300).
    "Blood_Pressure": np.random.randint(90, 180, n),
    "Cholesterol_Level": np.random.randint(150, 300, n),
    # Binary flag, 1 with probability 0.4.
    "Family_History_Risk": np.random.choice([0,1], n, p=[0.6,0.4]),
    # Three-class target with a 50/30/20 class prior.
    "AI_Diagnosis": np.random.choice(["Low Risk","Medium Risk","High Risk"], n, p=[0.5,0.3,0.2]),
    "Public_Trust_in_AI": np.random.choice(trust_levels, n, p=[0.35,0.45,0.2])
})

In [3]:
# Integer-encode the three risk classes; 2 ("High Risk") is the class the
# fairness and SHAP cells treat as the positive outcome.
label_map = {"Low Risk":0, "Medium Risk":1, "High Risk":2}
df["label"] = df["AI_Diagnosis"].map(label_map)
# Series.map yields NaN for any value that is not a key of label_map. Fail here
# with a clear message rather than later inside the stratified split or the fit.
assert df["label"].notna().all(), "AI_Diagnosis contains values missing from label_map"
# One-hot encode the categorical features; drop_first=True drops one dummy per
# categorical column, which then acts as the reference level.
X = pd.get_dummies(df[["Gender","Age_Group","Blood_Pressure","Cholesterol_Level","Family_History_Risk"]], drop_first=True)
y = df["label"]


In [4]:
# Stratified 75/25 hold-out split. df is split together with X and y so the
# demographic columns of every test row stay available for group-wise metrics.
(
    X_train, X_test,
    y_train, y_test,
    df_train, df_test,
) = train_test_split(
    X, y, df,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# fit() returns the fitted estimator itself, so construction and training chain.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [5]:
# Share of held-out rows whose predicted class equals the encoded AI_Diagnosis label.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Overall accuracy (proxy): {acc:.3f}")

Overall accuracy (proxy): 0.427


In [6]:
def group_fairness_table(df_slice, preds, true, group_col, positive_label=2):
    """Summarise positive-class prediction behaviour for each group.

    Parameters
    ----------
    df_slice : pandas.DataFrame
        Rows being evaluated; must contain ``group_col``. Only the values of
        that column are used, positionally (the index is ignored).
    preds : array-like
        Predicted labels, one per row of ``df_slice``, in the same order.
    true : array-like or pandas.Series
        True labels, one per row of ``df_slice``, in the same order. A Series
        is read positionally; its index is ignored.
    group_col : str
        Name of the column that defines the groups.
    positive_label : optional
        Label treated as the positive class. Defaults to 2, the code used for
        "High Risk" in this notebook.

    Returns
    -------
    pandas.DataFrame
        One row per group (sorted by group value) with columns:
        ``support`` - number of rows in the group;
        ``PPR`` - fraction of the group predicted positive;
        ``TPR`` - among rows whose true label is positive, the fraction
        predicted positive, or the string ``"NA"`` when the group has no
        truly positive rows.
    """
    # Accept either a Series (use its underlying array) or a plain array-like.
    y_true = getattr(true, "values", true)
    y_hat = getattr(preds, "values", preds)
    base = pd.DataFrame({
        group_col: df_slice[group_col].values,
        "y_true": y_true,
        "y_pred": y_hat,
    })

    out = {}
    for g, sub in base.groupby(group_col):
        predicted_pos = sub["y_pred"] == positive_label
        actual_pos = sub["y_true"] == positive_label
        ppr = float(predicted_pos.mean())
        if actual_pos.any():
            tpr = float(predicted_pos[actual_pos].mean())
        else:
            # TPR is undefined without positives. A string sentinel is stored
            # instead of NaN (presumably to keep the JSON export free of NaN).
            tpr = "NA"
        out[g] = {"support": int(len(sub)), "PPR": ppr, "TPR": tpr}
    # Groups are built as columns and then transposed, so every metric shares
    # one dtype per table (which is why support is shown as e.g. 40.0).
    return pd.DataFrame(out).T

# Per-group High Risk metrics on the held-out rows, by gender and by age band.
fair_gender = group_fairness_table(df_slice=df_test, preds=y_pred, true=y_test, group_col="Gender")
fair_age = group_fairness_table(df_slice=df_test, preds=y_pred, true=y_test, group_col="Age_Group")

# sep="\n" puts each table on the line after its heading.
print("\nFairness by Gender (HighRisk PPR & TPR):", fair_gender, sep="\n")
print("\nFairness by Age_Group (HighRisk PPR & TPR):", fair_age, sep="\n")

def disparity_summary(fair_df):
    """Return the demographic-parity spread of a per-group fairness table.

    ``fair_df`` has one row per group and a ``PPR`` column (positive prediction
    rate). The result is a dict with the smallest rate (``min_ppr``), the
    largest rate (``max_ppr``) and their difference (``dp_diff``), i.e.
    demographic parity difference = max PPR - min PPR.
    """
    per_group = fair_df.to_dict(orient="index")
    rates = [metrics["PPR"] for metrics in per_group.values()]
    lowest = min(rates)
    highest = max(rates)
    return {"min_ppr": lowest, "max_ppr": highest, "dp_diff": highest - lowest}
# Demographic-parity spread for each protected attribute.
dg = disparity_summary(fair_gender)
da = disparity_summary(fair_age)
print("\nDisparity summary (Gender):", dg)
print("Disparity summary (Age):", da)



Fairness by Gender (HighRisk PPR & TPR):
        support   PPR  TPR
Female     40.0  0.25  0.3
Male       35.0  0.00  0.0

Fairness by Age_Group (HighRisk PPR & TPR):
       support       PPR       TPR
18-30     16.0  0.187500  0.333333
31-45     12.0  0.083333  0.000000
46-60     22.0  0.272727  1.000000
60+       25.0  0.000000  0.000000

Disparity summary (Gender): {'min_ppr': 0.0, 'max_ppr': 0.25, 'dp_diff': 0.25}
Disparity summary (Age): {'min_ppr': 0.0, 'max_ppr': 0.2727272727272727, 'dp_diff': 0.2727272727272727}


In [7]:
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(X_test)
# Select the High Risk attributions (label 2; assumes clf.classes_ is [0, 1, 2]
# so that position 2 is the High Risk class).
# The multiclass return type depends on the SHAP version: older releases return
# a list with one (n_samples, n_features) array per class, newer releases return
# a single (n_samples, n_features, n_classes) array. A list-only check would let
# the full 3-D array through unchanged, so both layouts are handled explicitly.
high_risk_idx = 2
if isinstance(shap_values, list):
    sv = shap_values[high_risk_idx] if len(shap_values) > high_risk_idx else shap_values
elif getattr(shap_values, "ndim", 0) == 3 and shap_values.shape[-1] > high_risk_idx:
    sv = shap_values[..., high_risk_idx]
else:
    # Single-output layout: nothing to select.
    sv = shap_values
plt.figure(figsize=(6,3))
# show=False keeps the figure open so it can be titled and saved below.
shap.summary_plot(sv, X_test, plot_type="bar", show=False)
plt.title("SHAP: Top features for High Risk")
plt.tight_layout()
plt.savefig("shap_highrisk_bar.png", dpi=120)
# Close the figure so it is written to disk only and not rendered inline.
plt.close()
print("\nSaved SHAP summary plot -> shap_highrisk_bar.png")


  shap.summary_plot(sv, X_test, plot_type="bar", show=False)



Saved SHAP summary plot -> shap_highrisk_bar.png


In [8]:
# Collect the run's headline numbers into one JSON-serialisable record.
audit = {
    "timestamp": pd.Timestamp.now().isoformat(),
    "overall_accuracy": acc,
    # DataFrame.to_dict() nests as {metric: {group: value}}.
    "gender_fairness": fair_gender.to_dict(),
    "age_fairness": fair_age.to_dict(),
    "gender_dp_diff": dg["dp_diff"],
    "age_dp_diff": da["dp_diff"],
    "notes": "High Risk treated as positive (label==2). Labels are synthetic; treat results as demonstration."
}
# Encode before opening the file: if any value is not JSON-serialisable the
# error is raised here, and an existing audit_summary.json is not truncated
# or left half-written.
audit_json = json.dumps(audit, indent=2)
with open("audit_summary.json", "w", encoding="utf-8") as f:
    f.write(audit_json)
print("Audit summary written -> audit_summary.json")

# ---------- Short action plan (print)
# The recommendations are fixed text; they are not derived from the metrics
# computed in the earlier cells.
action_plan = (
    "1) Data: collect more clinical ground-truth outcomes; ensure representative sampling by gender/age.",
    "2) Model: test per-group calibration; consider group-specific thresholds or reweighting if dp_diff > 0.05.",
    "3) Transparency: publish feature importances & decision rules; integrate SHAP explanations in clinician UI.",
    "4) Oversight: human-in-the-loop review for High Risk cases; monitoring pipeline for model drift.",
    "5) Privacy & Accountability: log predictions, keep audit trail, and enforce access controls.",
)
print("\nAction Plan (short):")
# One item per line, exactly as five separate print calls would emit them.
print(*action_plan, sep="\n")

Audit summary written -> audit_summary.json

Action Plan (short):
1) Data: collect more clinical ground-truth outcomes; ensure representative sampling by gender/age.
2) Model: test per-group calibration; consider group-specific thresholds or reweighting if dp_diff > 0.05.
3) Transparency: publish feature importances & decision rules; integrate SHAP explanations in clinician UI.
4) Oversight: human-in-the-loop review for High Risk cases; monitoring pipeline for model drift.
5) Privacy & Accountability: log predictions, keep audit trail, and enforce access controls.
