<a href="https://colab.research.google.com/github/Zuhra66/AI-Powered-Fraud-Detection-IBM-AISkillsBuild/blob/main/AI_Powered_Fraud_Detection_IBM_AISkillsBuild.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [98]:
# Install the LLM-side dependencies used by the next cell: the IBM Granite
# community utils (straight from GitHub), langchain_community constrained to
# versions below 0.3.0, and the replicate client.
!pip install git+https://github.com/ibm-granite-community/utils \
    "langchain_community<0.3.0" \
    replicate

Collecting git+https://github.com/ibm-granite-community/utils
  Cloning https://github.com/ibm-granite-community/utils to /tmp/pip-req-build-kzad5ldl
  Running command git clone --filter=blob:none --quiet https://github.com/ibm-granite-community/utils /tmp/pip-req-build-kzad5ldl
  Resolved https://github.com/ibm-granite-community/utils to commit aa05c43dc5ee022083221f3db59adc2ec869d50a
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [99]:
from ibm_granite_community.notebook_utils import get_env_var
from langchain_community.llms import Replicate

# LangChain `Replicate` LLM handle for the model id
# "ibm-granite/granite-3.3-8b-instruct". The API token is fetched through
# get_env_var('REPLICATE_API_TOKEN') instead of being written into the notebook.
# Generation settings: at most 1024 tokens, temperature 0.2.
# NOTE(review): `model` is not referenced by any later cell visible in this
# notebook -- confirm whether it is still needed.
model = Replicate(
    model="ibm-granite/granite-3.3-8b-instruct",
    replicate_api_token=get_env_var('REPLICATE_API_TOKEN'),
    model_kwargs={"max_tokens":1024, "temperature":0.2},
)

In [100]:
# Tiny hand-written sample (5 rows) using the column layout the fraud UI
# expects; the last column, is_fraud, is the label.
csv_text = """amount,hour,location_risk,merchant_cat,device_score,repeat_customer,is_fraud
120.5,14,med,grocery,0.62,1,0
980.0,2,high,luxury,0.30,0,1
45.2,18,low,online,0.85,1,0
210.0,3,med,travel,0.55,0,1
67.9,11,low,electronics,0.77,1,0
"""

# Write the sample to the same path the UI's "CSV Path" box uses as its default.
# NOTE(review): the /content directory presumably only exists on Colab --
# confirm before running this cell in another environment.
path = "/content/fraud_example.csv"
with open(path, "w") as f:
    f.write(csv_text)

# Read the file back to confirm it parses, then show the frame as the cell output.
import pandas as pd
df = pd.read_csv(path)
print("Saved to:", path)
df


Saved to: /content/fraud_example.csv


Unnamed: 0,amount,hour,location_risk,merchant_cat,device_score,repeat_customer,is_fraud
0,120.5,14,med,grocery,0.62,1,0
1,980.0,2,high,luxury,0.3,0,1
2,45.2,18,low,online,0.85,1,0
3,210.0,3,med,travel,0.55,0,1
4,67.9,11,low,electronics,0.77,1,0


In [101]:
# AI-Powered Fraud Detection System (simple, notebook-friendly UI with ipywidgets)

import io
import sys
import math
import warnings
warnings.filterwarnings("ignore")

# Try imports and install if missing (works in most notebook environments)
def _ensure(pkgs):
    import importlib
    for p in pkgs:
        try:
            importlib.import_module(p)
        except ImportError:
            get_ipython().system(f"pip -q install {p}")

# Bootstrap the third-party packages needed by the imports that follow.
# NOTE(review): the entries are pip distribution names; "scikit-learn" is
# imported as `sklearn`, so confirm _ensure resolves that name as intended.
_ensure(["pandas", "numpy", "scikit-learn", "ipywidgets", "matplotlib"])

import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display, clear_output
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, RocCurveDisplay
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# -----------------------------
# UI: Header & Theme
# -----------------------------
# Static banner (title + one-line description); it is the first child of the
# `page` container assembled at the bottom of this cell.
header = widgets.HTML(
    value="""
<div style='background:#222;color:#fff;padding:22px;border-radius:12px;margin:10px 0;'>
  <h1 style='margin:0;color:#fff;text-align:center;font-family:Arial, sans-serif;'>AI-Powered Fraud Detection</h1>
  <p style='margin:6px 0 0;text-align:center;opacity:.9;'>Train a simple model to flag potentially fraudulent transactions.</p>
</div>
"""
)

# -----------------------------
# Helpers: Data
# -----------------------------
# Human-readable description of the CSV schema. It is the initial content of
# the `status` widget and is re-appended to the CSV-load error messages.
EXPECTED_DOC = """Expected columns in CSV:
- amount (float)
- hour (0-23)
- location_risk (low|med|high)
- merchant_cat (string)
- device_score (0-1)
- repeat_customer (0 or 1)
- is_fraud (0 or 1)  ← label"""

def make_synthetic(n=3000, fraud_rate=0.08, seed=7):
    """Build a reproducible toy transaction table with a rule-derived fraud label.

    Each feature is sampled independently from a fixed distribution. A
    hand-written additive risk score is then computed per row, and the rows
    whose score reaches the ``1 - fraud_rate`` quantile are labelled as fraud.

    Args:
        n: number of rows to generate.
        fraud_rate: share of rows to flag, applied as an upper quantile of the
            risk score.
        seed: seed passed to ``np.random.default_rng``.

    Returns:
        DataFrame with the columns amount, hour, location_risk, merchant_cat,
        device_score, repeat_customer and is_fraud.
    """
    gen = np.random.default_rng(seed)

    # The draw order below determines the values produced for a given seed.
    amount = np.round(gen.gamma(2.0, 60.0, n), 2)
    hour = gen.integers(0, 24, n)
    location_risk = gen.choice(["low", "med", "high"], size=n, p=[0.6, 0.3, 0.1])
    merchant_cat = gen.choice(
        ["grocery", "electronics", "travel", "luxury", "online"],
        size=n,
        p=[0.35, 0.20, 0.15, 0.05, 0.25],
    )
    device_score = np.clip(gen.normal(0.7, 0.2, n), 0, 1)
    repeat_customer = gen.choice([0, 1], size=n, p=[0.4, 0.6])

    # Individual contributions to the risk score (np.isin handles array membership).
    amount_term = 0.015 * amount
    night_term = 0.35 * (hour < 6)
    location_term = np.select(
        [location_risk == "high", location_risk == "med"], [0.9, 0.3], default=0
    )
    merchant_term = np.where(np.isin(merchant_cat, ["luxury", "travel"]), 0.35, 0)
    device_term = 0.6 - device_score
    first_time_term = (1 - repeat_customer) * 0.25
    risk = (
        amount_term + night_term + location_term
        + merchant_term + device_term + first_time_term
    )

    # Rows at or above the cutoff become the positive class.
    cutoff = np.quantile(risk, 1 - fraud_rate)

    columns = {
        "amount": amount,
        "hour": hour,
        "location_risk": location_risk,
        "merchant_cat": merchant_cat,
        "device_score": np.round(device_score, 3),
        "repeat_customer": repeat_customer,
        "is_fraud": (risk >= cutoff).astype(int),
    }
    return pd.DataFrame(columns)

# -----------------------------
# Widgets: Data Ingestion (Option B: load by path)
# -----------------------------
# Text box holding the path that on_load_csv passes to pd.read_csv.
csv_path = widgets.Text(
    value="/content/fraud_example.csv",  # change to your file path
    description="CSV Path",
    layout=widgets.Layout(width="420px")
)
# Buttons for the two data sources; their click handlers (on_load_csv and
# on_generate) are attached further down, next to the handler definitions.
load_btn = widgets.Button(description="Load CSV from Path", button_style="primary")
gen_btn = widgets.Button(description="Generate Synthetic Data", button_style="info")
# Status line: starts out showing the expected schema and is overwritten by the
# callbacks with load results and error messages.
status = widgets.HTML(value=f"<pre style='margin:0'>{EXPECTED_DOC}</pre>")

# -----------------------------
# Widgets: Model & Training
# -----------------------------
# Model selector; the option values "lr" / "rf" are the keys that
# build_pipeline and _toggle_hparams branch on.
model_choice = widgets.ToggleButtons(
    options=[("Logistic Regression", "lr"), ("Random Forest", "rf")],
    description="Model:",
)
# LR hyperparam
# Log-scale slider (base 10, exponent -2..2); its value is passed as C to LogisticRegression.
lr_c = widgets.FloatLogSlider(value=1.0, base=10, min=-2, max=2, step=0.1, description="LR C")
# RF hyperparams
# Passed to RandomForestClassifier as n_estimators and max_depth respectively.
rf_n = widgets.IntSlider(value=200, min=50, max=600, step=50, description="Trees")
rf_depth = widgets.IntSlider(value=8, min=2, max=20, step=1, description="Max Depth")

# Training trigger plus the two output areas on_train renders into:
# metrics_out gets metrics/tables, plot_out gets the ROC curve.
train_btn = widgets.Button(description="Train Model", button_style="success")
metrics_out = widgets.Output()
plot_out = widgets.Output()

# -----------------------------
# Widgets: Single Prediction
# -----------------------------
# Input widgets for one hand-entered transaction; on_predict reads each value
# into the feature column of the matching name.
amount_w = widgets.FloatText(value=120.0, description="Amount")
hour_w = widgets.IntSlider(value=14, min=0, max=23, description="Hour")
loc_w = widgets.Dropdown(options=["low","med","high"], value="med", description="Location Risk")
cat_w = widgets.Dropdown(options=["grocery","electronics","travel","luxury","online"], value="grocery", description="Merchant")
dev_w = widgets.FloatSlider(value=0.6, min=0, max=1, step=0.01, description="Device Score")
rep_w = widgets.ToggleButtons(options=[("No",0),("Yes",1)], value=1, description="Repeat Cust")
# Decision threshold for the single prediction only; the training metrics in
# on_train use a fixed 0.5 cut-off instead.
thresh_w = widgets.FloatSlider(value=0.5, min=0.05, max=0.95, step=0.05, description="Threshold")
predict_btn = widgets.Button(description="Predict Risk", button_style="warning")

# Accessible prediction block
# pred_html holds the rendered verdict (filled by show_prediction); pred_out is
# the output area in which on_predict displays it.
pred_out = widgets.Output()
pred_html = widgets.HTML()

def show_prediction(prob, thr):
    """Fill ``pred_html`` with a high-contrast summary of one prediction.

    The transaction is reported as "Fraudulent" when ``prob >= thr`` and as
    "Legit" otherwise. The probability and the verdict share one accent colour
    (red for fraud, green for legit); the threshold is always shown in blue.

    Args:
        prob: predicted fraud probability.
        thr: decision threshold the probability is compared against.
    """
    if prob >= thr:
        verdict, accent = "Fraudulent", "#c62828"   # red
    else:
        verdict, accent = "Legit", "#2e7d32"        # green
    threshold_accent = "#1565c0"                    # blue

    # Colab theme variables, with light-theme fallbacks.
    foreground = "var(--colab-primary-text-color, #202124)"
    surface = "var(--colab-cell-output-bg-color, #ffffff)"

    pred_html.value = f"""
    <div style="background:{surface}; color:{foreground}; border:1px solid #e0e0e0;
                border-radius:10px; padding:12px; font-family:system-ui, Arial;">
      <div style="margin-bottom:4px;">
        <b>Probability of Fraud:</b>
        <span style="color:{accent}; font-weight:600;">{prob:.3f}</span>
      </div>
      <div style="margin-bottom:8px;">
        <b>Threshold:</b>
        <span style="color:{threshold_accent}; font-weight:600;">{thr:.2f}</span>
      </div>
      <div style="font-weight:700; color:{accent};">Prediction: {verdict}</div>
    </div>
    """

# -----------------------------
# Containers & visibility
# -----------------------------
def _toggle_hparams(*_):
    """Show only the hyper-parameter sliders that belong to the selected model.

    Accepts and ignores any positional arguments so it can be used both as a
    traitlets observer and as a plain call.
    """
    slider_owner = ((lr_c, "lr"), (rf_n, "rf"), (rf_depth, "rf"))
    for slider, owner in slider_owner:
        slider.layout.display = "flex" if model_choice.value == owner else "none"

# Re-evaluate visibility whenever the selection changes, and once up front so
# the initial layout matches the default selection.
model_choice.observe(_toggle_hparams, names="value")
_toggle_hparams()

# -----------------------------
# Global state for the session
# -----------------------------
# Dataset currently loaded (assigned by on_load_csv / on_generate); None until then.
DATA = None
# Fitted pipeline from the last training run (assigned by on_train, read by on_predict).
PIPE = None
# Transformed feature names from the last training run, or None when they
# could not be derived.
FEATURE_NAMES = None

# -----------------------------
# Build / Train
# -----------------------------
def build_pipeline(model_key):
    """Assemble an unfitted preprocessing + classifier pipeline.

    Numeric columns are standardised and categorical columns are one-hot
    encoded (unknown categories are ignored at predict time); any other column
    in the input is dropped. Hyper-parameters are read from the sliders at call
    time.

    Args:
        model_key: "lr" selects logistic regression; any other value selects a
            random forest.

    Returns:
        Tuple ``(pipeline, num_cols, cat_cols)`` where the two lists name the
        numeric and categorical input columns.
    """
    num_cols = ["amount", "hour", "device_score", "repeat_customer"]
    cat_cols = ["location_risk", "merchant_cat"]

    if model_key == "lr":
        estimator = LogisticRegression(max_iter=200, C=float(lr_c.value))
    else:
        estimator = RandomForestClassifier(
            n_estimators=int(rf_n.value),
            max_depth=int(rf_depth.value),
            random_state=42,
            n_jobs=-1,
        )

    scale_step = ("num", StandardScaler(), num_cols)
    onehot_step = (
        "cat",
        OneHotEncoder(handle_unknown="ignore", sparse_output=False),
        cat_cols,
    )
    preprocessor = ColumnTransformer(
        transformers=[scale_step, onehot_step],
        remainder="drop",
    )

    return Pipeline([("pre", preprocessor), ("clf", estimator)]), num_cols, cat_cols

def feature_names_from(pipe, num_cols, cat_cols, X_fit):
    """List the feature names produced by the fitted pipeline's preprocessor.

    The numeric column names come first, followed by the one-hot names the
    fitted "cat" transformer reports for ``cat_cols``.

    Args:
        pipe: fitted pipeline with a "pre" step exposing ``named_transformers_``.
        num_cols: list of numeric input column names.
        cat_cols: categorical input column names.
        X_fit: not used by this function.

    Returns:
        list of feature names.
    """
    fitted_pre = pipe.named_steps["pre"]
    encoded = fitted_pre.named_transformers_["cat"].get_feature_names_out(cat_cols)
    return num_cols + [name for name in encoded]

# -----------------------------
# Callbacks
# -----------------------------
def on_generate(_):
    """Button callback: replace ``DATA`` with a fresh synthetic dataset.

    Uses the defaults of ``make_synthetic`` and reports the resulting shape in
    the status line. The button argument is ignored.
    """
    global DATA
    synthetic = make_synthetic()
    DATA = synthetic
    status.value = f"<b>Loaded synthetic dataset</b> with shape {synthetic.shape}."

gen_btn.on_click(on_generate)

def on_load_csv(_):
    """Button callback: load the CSV at ``csv_path.value`` into ``DATA``.

    The file is accepted only if it can be read, contains every column the
    training pipeline uses, and its 0/1 columns (``repeat_customer`` and
    ``is_fraud``) convert to integers. On any failure ``DATA`` is left
    unchanged and the reason, followed by the expected schema, is shown in the
    status line. The button argument is ignored.
    """
    global DATA
    from html import escape

    # Everything build_pipeline / on_train will look up, plus the label.
    required = [
        "amount", "hour", "location_risk", "merchant_cat",
        "device_score", "repeat_customer", "is_fraud",
    ]

    def _fail(message):
        # Messages may contain file paths or exception text, so escape them
        # before they are placed into HTML.
        status.value = (
            f"<span style='color:#b00'>{escape(message)}</span>"
            f"<pre>{EXPECTED_DOC}</pre>"
        )

    try:
        df = pd.read_csv(csv_path.value)
    except Exception as e:
        _fail(f"Failed to read CSV: {e}")
        return

    # Check all columns here so a bad file is rejected at load time rather
    # than surfacing as a cryptic error during training.
    missing = [col for col in required if col not in df.columns]
    if missing:
        _fail(f"Missing required column(s): {', '.join(missing)}")
        return

    try:
        # Raises on NaN or non-numeric entries.
        df = df.astype({"repeat_customer": int, "is_fraud": int})
    except (TypeError, ValueError) as e:
        _fail(f"Columns repeat_customer and is_fraud must contain integers (0 or 1): {e}")
        return

    DATA = df
    status.value = f"<b>Loaded CSV</b> with shape {DATA.shape}."
load_btn.on_click(on_load_csv)

def on_train(_):
    """Button callback: train the selected model on ``DATA`` and render its evaluation.

    ``DATA`` is split 75/25 (stratified on the label), the pipeline returned by
    ``build_pipeline`` is fitted on the larger part, and the held-out part is
    scored at a fixed 0.5 threshold. The metric tiles, confusion matrix and
    top-feature table are written to ``metrics_out``; the ROC curve is written
    to ``plot_out``.

    On success the fitted pipeline is stored in ``PIPE`` and the transformed
    feature names in ``FEATURE_NAMES``. If the split/fit/evaluate stage raises,
    the error is shown in ``metrics_out`` and ``PIPE`` is left untouched. The
    button argument is ignored.
    """
    global DATA, PIPE, FEATURE_NAMES
    from html import escape

    if DATA is None:
        status.value = "<span style='color:#b00'>No data loaded. Load CSV by path or click 'Generate Synthetic Data'.</span>"
        return

    with metrics_out:
        clear_output()
        display(widgets.HTML("<h3 style='margin:4px 0;'>Training…</h3>"))

    try:
        pipe, num_cols, cat_cols = build_pipeline(model_choice.value)
        X = DATA.drop(columns=["is_fraud"])
        y = DATA["is_fraud"].astype(int)

        Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
        pipe.fit(Xtr, ytr)

        proba = pipe.predict_proba(Xte)[:, 1]
        pred = (proba >= 0.5).astype(int)
        acc = accuracy_score(yte, pred)
        prec = precision_score(yte, pred, zero_division=0)
        rec = recall_score(yte, pred, zero_division=0)
        f1 = f1_score(yte, pred, zero_division=0)

        # ROC-AUC (and the ROC curve) are undefined when the held-out labels
        # contain a single class; report NaN instead of raising.
        both_classes = yte.nunique() > 1
        auc = roc_auc_score(yte, proba) if both_classes else float("nan")

        # Pin the label order so the matrix is always 2x2, even if one class
        # is absent from both the held-out labels and the predictions.
        cm = confusion_matrix(yte, pred, labels=[0, 1])
    except Exception as e:
        # A widget callback has nowhere to propagate to: without this the
        # "Training…" banner would stay on screen and the cause stay hidden.
        detail = escape(f"{type(e).__name__}: {e}")
        with metrics_out:
            clear_output()
            display(widgets.HTML(f"<span style='color:#b00'>Training failed: {detail}</span>"))
        with plot_out:
            clear_output()
        return

    try:
        FEATURE_NAMES = feature_names_from(pipe, num_cols, cat_cols, Xtr)
    except Exception:
        # Best effort: the feature table is optional.
        FEATURE_NAMES = None

    # Publish the model before rendering so a display problem cannot lose it.
    PIPE = pipe

    with metrics_out:
        clear_output()
        metrics_html = f"""
<div style="display:grid;grid-template-columns:repeat(5,1fr);gap:8px;margin:6px 0;">
  <div style="background:#f5f5f7;padding:10px;border-radius:8px;"><b>Accuracy</b><div>{acc:.3f}</div></div>
  <div style="background:#f5f5f7;padding:10px;border-radius:8px;"><b>Precision</b><div>{prec:.3f}</div></div>
  <div style="background:#f5f5f7;padding:10px;border-radius:8px;"><b>Recall</b><div>{rec:.3f}</div></div>
  <div style="background:#f5f5f7;padding:10px;border-radius:8px;"><b>F1</b><div>{f1:.3f}</div></div>
  <div style="background:#f5f5f7;padding:10px;border-radius:8px;"><b>ROC-AUC</b><div>{auc:.3f}</div></div>
</div>
"""
        display(widgets.HTML(metrics_html))

        display(widgets.HTML("<b>Confusion Matrix (threshold=0.5)</b>"))
        df_cm = pd.DataFrame(cm, index=["True 0","True 1"], columns=["Pred 0","Pred 1"])
        display(df_cm)

        clf = pipe.named_steps["clf"]
        if FEATURE_NAMES is not None:
            try:
                if hasattr(clf, "feature_importances_"):
                    fi = pd.Series(clf.feature_importances_, index=FEATURE_NAMES).sort_values(ascending=False)
                elif hasattr(clf, "coef_"):
                    # Rank linear-model features by coefficient magnitude.
                    fi = pd.Series(clf.coef_.ravel(), index=FEATURE_NAMES).abs().sort_values(ascending=False)
                else:
                    fi = None
            except Exception as e:
                # Still best effort, but say why the table is missing.
                fi = None
                display(widgets.HTML(f"<i>Top features unavailable: {escape(str(e))}</i>"))
            if fi is not None:
                display(widgets.HTML("<b>Top Features</b>"))
                display(fi.head(12).to_frame("importance"))

    with plot_out:
        clear_output()
        if both_classes:
            fig, ax = plt.subplots(figsize=(5,4), dpi=120)
            RocCurveDisplay.from_predictions(yte, proba, ax=ax)
            ax.set_title("ROC Curve")
            plt.show()
            # Release the figure so repeated training does not accumulate them.
            plt.close(fig)

train_btn.on_click(on_train)

def on_predict(_):
    """Button callback: score the hand-entered transaction with the trained model.

    Clears the prediction area, then either asks the user to train a model
    first (when ``PIPE`` is None) or builds a one-row frame from the input
    widgets, takes the predicted probability of class 1 and renders it against
    the chosen threshold. The button argument is ignored.
    """
    with pred_out:
        clear_output()
        if PIPE is None:
            display(widgets.HTML("<span style='color:#b00'>Train a model first.</span>"))
            return

    # One-row frame using the feature names the pipeline expects.
    features = {
        "amount": float(amount_w.value),
        "hour": int(hour_w.value),
        "location_risk": str(loc_w.value),
        "merchant_cat": str(cat_w.value),
        "device_score": float(dev_w.value),
        "repeat_customer": int(rep_w.value),
    }
    fraud_probability = PIPE.predict_proba(pd.DataFrame([features]))[0, 1]

    show_prediction(fraud_probability, float(thresh_w.value))
    with pred_out:
        display(pred_html)

predict_btn.on_click(on_predict)

# -----------------------------
# Layout & Display
# -----------------------------
# Section card for step 1 (static HTML; the interactive controls follow in controls_row).
data_card = widgets.HTML(
    value="""
<div style='background:#f5f5f7;padding:14px;border-radius:10px;margin:8px 0;'>
  <b>1) Load Data</b>
  <p style='margin:4px 0;'>Load a CSV by path <i>or</i> generate a synthetic dataset.</p>
</div>
"""
)

# Controls row now uses CSV path + Load button (Option B) + Generate Synthetic
# The empty fixed-width HTML divs are horizontal spacers between the controls.
controls_row = widgets.HBox([csv_path, widgets.HTML("<div style='width:10px'></div>"), load_btn,
                             widgets.HTML("<div style='width:16px'></div>"), gen_btn])

# Section card for step 2.
model_card = widgets.HTML(
    value="""
<div style='background:#f5f5f7;padding:14px;border-radius:10px;margin:8px 0;'>
  <b>2) Choose Model & Train</b>
</div>
"""
)
# All three sliders share one row; _toggle_hparams hides the ones that do not
# apply to the selected model.
hparams = widgets.HBox([lr_c, rf_n, rf_depth])
train_row = widgets.HBox([model_choice, widgets.HTML("<div style='width:16px'></div>"), train_btn])

# Section card for step 3.
predict_card = widgets.HTML(
    value="""
<div style='background:#f5f5f7;padding:14px;border-radius:10px;margin:8px 0;'>
  <b>3) Try a Single Prediction</b>
</div>
"""
)

# Single-prediction inputs arranged as two columns, with the action button below.
pred_inputs_left = widgets.VBox([amount_w, hour_w, dev_w])
pred_inputs_right = widgets.VBox([loc_w, cat_w, rep_w, thresh_w])
pred_actions = widgets.HBox([predict_btn])

# Whole page, top to bottom: banner, data loading, model/training (with its
# metric and plot outputs), then the single-prediction form and its output.
page = widgets.VBox([
    header,
    data_card, controls_row, status,
    model_card, train_row, hparams, metrics_out, plot_out,
    predict_card, widgets.HBox([pred_inputs_left, pred_inputs_right]), pred_actions, pred_out
])

# Render the assembled UI as this cell's output.
display(page)


VBox(children=(HTML(value="\n<div style='background:#222;color:#fff;padding:22px;border-radius:12px;margin:10p…