In [15]:
# ===============================
# Cell 1: Imports & Global Config
# ===============================

import numpy as np
import pandas as pd

# Visualization (minimal, for inspection only)
import matplotlib.pyplot as plt
import seaborn as sns

# ML utilities
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score,
    classification_report,
    confusion_matrix
)

# XGBoost
from xgboost import XGBClassifier

# Serialization
import pickle

# Reproducibility
# One seed for the whole notebook: it seeds NumPy's legacy global RNG here and
# is passed as random_state to train_test_split and to both XGBClassifier
# instances further down.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)


In [16]:
# ===============================
# Cell 2: Load Dataset
# ===============================

# Load the dataset
# Relative path: "Fraud.csv" must be in the kernel's working directory.
df = pd.read_csv("Fraud.csv")

# Basic confirmation
# Only reached if read_csv returned without raising.
print("Dataset loaded successfully")
print("Shape:", df.shape)


Dataset loaded successfully
Shape: (6362620, 11)


In [17]:
# ======================================
# Cell 3: Columns & Basic Data Inspection
# ======================================

# Display column names
print("Columns:\n", df.columns.tolist())

# Per-column dtypes as inferred by read_csv (no dtypes were specified on load).
print("\nData types:")
print(df.dtypes)

# df.head() is the cell's last expression, so the notebook renders it as a
# table below the printed text.
print("\nFirst 5 rows:")
df.head()


Columns:
 ['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig', 'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud']

Data types:
step                int64
type               object
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest           object
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
isFlaggedFraud      int64
dtype: object

First 5 rows:


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [18]:
# ======================================
# Cell 4: Fraud Label Distribution
# ======================================

# Absolute number of rows per isFraud label.
fraud_counts = df['isFraud'].value_counts()
# Same breakdown as a share of all rows, scaled from fractions to percent.
fraud_percentage = df['isFraud'].value_counts(normalize=True) * 100

print("Fraud Counts:")
print(fraud_counts)

# Rounded to 4 decimals for display only; fraud_percentage keeps full precision.
print("\nFraud Percentage (%):")
print(fraud_percentage.round(4))


Fraud Counts:
isFraud
0    6354407
1       8213
Name: count, dtype: int64

Fraud Percentage (%):
isFraud
0    99.8709
1     0.1291
Name: proportion, dtype: float64


In [19]:
# ======================================
# Cell 5: Feature Categorization
# ======================================

# Name of the label column the model will predict.
target_col = "isFraud"

print("Target column:", target_col)

# "Categorical" here simply means object dtype (strings).
print("\nCategorical columns:")
print(df.select_dtypes(include=["object"]).columns.tolist())

# "Numerical" is everything that is not object dtype, so this list still
# contains the target column itself.
print("\nNumerical columns:")
print(df.select_dtypes(exclude=["object"]).columns.tolist())


Target column: isFraud

Categorical columns:
['type', 'nameOrig', 'nameDest']

Numerical columns:
['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud']


In [20]:
# ======================================
# Cell 6: Create Transaction ID (Metadata)
# ======================================

import hashlib

def generate_transaction_id(row):
    """
    Build a deterministic identifier for a single transaction record.

    The sender, receiver and step values are joined with underscores and the
    SHA-256 hex digest of that text is returned.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by 'nameOrig', 'nameDest' and 'step'.

    Returns
    -------
    str
        Hexadecimal SHA-256 digest (64 characters).
    """
    key_parts = (row['nameOrig'], row['nameDest'], row['step'])
    raw_string = "_".join(format(part) for part in key_parts)
    digest = hashlib.sha256(raw_string.encode())
    return digest.hexdigest()

# Create transaction_id column
# DataFrame.apply(..., axis=1) builds a full Series object for every row, which
# is very slow on a frame of this size. Walking only the three key columns in
# lockstep feeds generate_transaction_id the same values and yields the same
# ids without that per-row overhead.
# NOTE: the id depends only on (nameOrig, nameDest, step), so two records that
# share all three values receive the same id.
id_source_cols = ['nameOrig', 'nameDest', 'step']
df['transaction_id'] = [
    generate_transaction_id(dict(zip(id_source_cols, values)))
    for values in zip(*(df[col] for col in id_source_cols))
]

# Verify
df[['transaction_id', 'nameOrig', 'nameDest', 'step']].head()


Unnamed: 0,transaction_id,nameOrig,nameDest,step
0,61d00e74e239769e7a15adcd44c5b266f28681788ec62b...,C1231006815,M1979787155,1
1,3912994d1b4421f3ec243dbc914d220ee8355b3e08959d...,C1666544295,M2044282225,1
2,da54cf2af2d69b3b9f9b84b2c666292dc8709d861c9e1f...,C1305486145,C553264065,1
3,a133687e039901e9dc2873fc8b8c94314a09f74d300a6d...,C840083671,C38997010,1
4,02fa004d6d1022cacdaf01e2e4fd6972c1f5fe7678e733...,C2048537720,M1230701703,1


In [21]:
# ======================================
# Cell 7: Drop Forbidden Columns & Split X / y
# ======================================

# Define target
# (same value as assigned in the feature-categorization cell; repeated so this
# cell does not depend on that one having been run)
target_col = "isFraud"

# Columns to drop from model inputs
# Removes the label itself, the isFlaggedFraud column, the two account-name
# columns and the derived transaction_id; everything else becomes a feature.
drop_cols = [
    "isFraud",
    "isFlaggedFraud",
    "nameOrig",
    "nameDest",
    "transaction_id"
]

# Separate features and target
# df.drop returns a new frame, so df keeps every column (transaction_id is
# read back from df later when building the decision output).
X = df.drop(columns=drop_cols)
y = df[target_col]

# Verify
print("Input features (X):", X.columns.tolist())
print("Target (y):", target_col)

print("\nX shape:", X.shape)
print("y shape:", y.shape)


Input features (X): ['step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
Target (y): isFraud

X shape: (6362620, 7)
y shape: (6362620,)


In [22]:
# ======================================
# Cell 8: Encode 'type' Categorical Column
# ======================================

# Define fixed mapping
# Explicit string -> integer codes so the same encoding can be shipped with
# the exported model and reused at inference time.
type_mapping = {
    "PAYMENT": 0,
    "TRANSFER": 1,
    "CASH_OUT": 2,
    "DEBIT": 3,
    "CASH_IN": 4
}

# Apply mapping
# Always encode from the untouched df['type'] instead of from X['type'] itself.
# Mapping X['type'] in place is not idempotent: running the cell a second time
# would look up the already-encoded integers in a string-keyed dict and
# silently replace the whole column with NaN. Assignment aligns on the index,
# which X inherited from df.
X['type'] = df['type'].map(type_mapping)

# Sanity checks
# Any category missing from type_mapping shows up as NaN in the count below.
print("Unique encoded 'type' values:", X['type'].unique())
print("\nMissing values after encoding:", X['type'].isna().sum())


Unique encoded 'type' values: [0 1 2 3 4]

Missing values after encoding: 0


In [23]:
# ======================================
# Cell 9: Train / Validation Split
# ======================================

# 80/20 split; stratify=y keeps the fraud/non-fraud ratio the same in both
# parts, and random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y
)

# Verify split sizes
print("Training set shape:", X_train.shape)
print("Validation set shape:", X_val.shape)

# Verify fraud distribution
# Both percentages should be near-identical because of the stratified split.
print("\nFraud rate in training set:")
print(y_train.value_counts(normalize=True) * 100)

print("\nFraud rate in validation set:")
print(y_val.value_counts(normalize=True) * 100)


Training set shape: (5090096, 7)
Validation set shape: (1272524, 7)

Fraud rate in training set:
isFraud
0    99.870926
1     0.129074
Name: proportion, dtype: float64

Fraud rate in validation set:
isFraud
0    99.870887
1     0.129113
Name: proportion, dtype: float64


In [24]:
# ======================================
# Cell 10: Baseline XGBoost Model
# ======================================

# Class-imbalance weight: number of negative training labels per positive one
neg, pos = ((y_train == label).sum() for label in (0, 1))
scale_pos_weight = neg / pos

print("scale_pos_weight:", scale_pos_weight)

# Baseline hyperparameters, collected in one place before building the model
xgb_params = dict(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    objective="binary:logistic",
    eval_metric="auc",
    tree_method="hist",
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
xgb_model = XGBClassifier(**xgb_params)

# Fit on the training split only; fit() is the last expression so the
# notebook renders the fitted estimator
xgb_model.fit(X_train, y_train)


scale_pos_weight: 773.7482496194825


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [25]:
# ======================================
# Cell 11: Model Evaluation
# ======================================

# Predict probabilities for validation set
# Column 1 of predict_proba is the probability of the positive class (isFraud == 1).
y_val_proba = xgb_model.predict_proba(X_val)[:, 1]

# ROC-AUC score
# Threshold-independent: computed from the probabilities, not from hard labels.
roc_auc = roc_auc_score(y_val, y_val_proba)
print("Validation ROC-AUC:", round(roc_auc, 4))

# Default threshold predictions (0.5)
y_val_pred = (y_val_proba >= 0.5).astype(int)

# Confusion Matrix
# Rows are true labels, columns are predicted labels (0 first, then 1).
cm = confusion_matrix(y_val, y_val_pred)
print("\nConfusion Matrix (threshold=0.5):")
print(cm)

# Classification Report
# Per-class precision / recall / F1 at the 0.5 threshold, printed to 4 decimals.
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred, digits=4))


Validation ROC-AUC: 0.9997

Confusion Matrix (threshold=0.5):
[[1254320   16561]
 [      3    1640]]

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9870    0.9934   1270881
           1     0.0901    0.9982    0.1653      1643

    accuracy                         0.9870   1272524
   macro avg     0.5451    0.9926    0.5794   1272524
weighted avg     0.9988    0.9870    0.9924   1272524



In [26]:
# ======================================
# Cell 11: Model Evaluation
# NOTE(review): verbatim repeat of the preceding evaluation cell. It
# recomputes the same variables from the same inputs, so it adds nothing
# and can be deleted.
# ======================================

# Predict probabilities for validation set
y_val_proba = xgb_model.predict_proba(X_val)[:, 1]

# ROC-AUC score
roc_auc = roc_auc_score(y_val, y_val_proba)
print("Validation ROC-AUC:", round(roc_auc, 4))

# Default threshold predictions (0.5)
y_val_pred = (y_val_proba >= 0.5).astype(int)

# Confusion Matrix
cm = confusion_matrix(y_val, y_val_pred)
print("\nConfusion Matrix (threshold=0.5):")
print(cm)

# Classification Report
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred, digits=4))


Validation ROC-AUC: 0.9997

Confusion Matrix (threshold=0.5):
[[1254320   16561]
 [      3    1640]]

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9870    0.9934   1270881
           1     0.0901    0.9982    0.1653      1643

    accuracy                         0.9870   1272524
   macro avg     0.5451    0.9926    0.5794   1272524
weighted avg     0.9988    0.9870    0.9924   1272524



In [27]:
# ======================================
# Cell 12: Feature Importance
# ======================================

# Importance values come from the baseline model (fit on the training split);
# the names are taken from X_train's columns so the two line up positionally.
importances = xgb_model.feature_importances_
feature_names = X_train.columns

# One row per feature, most important first.
feature_importance_df = pd.DataFrame({
    "feature": feature_names,
    "importance": importances
}).sort_values(by="importance", ascending=False)

feature_importance_df


Unnamed: 0,feature,importance
4,newbalanceOrig,0.480603
3,oldbalanceOrg,0.172651
1,type,0.116637
2,amount,0.089386
5,oldbalanceDest,0.06051
6,newbalanceDest,0.047737
0,step,0.032476


In [28]:
# ======================================
# Cell 13: Probability → Risk Score
# ======================================

def probability_to_risk_score(prob):
    """
    Convert fraud probability (0–1) to risk score (0–100)

    The probability is promoted to float64 before scaling. Rounding a float32
    value to two decimals cannot represent most results exactly and produces
    artefacts such as 34.790001 instead of 34.79.

    Parameters
    ----------
    prob : float, numpy scalar, numpy.ndarray or pandas.Series
        Fraud probability or probabilities in the range 0–1.

    Returns
    -------
    Same kind of container as the input, as float64
        ``prob * 100`` rounded to two decimals.
    """
    # astype keeps the container (ndarray / Series / numpy scalar); plain
    # Python numbers have no astype and are wrapped directly.
    if hasattr(prob, "astype"):
        prob = prob.astype(np.float64)
    else:
        prob = np.float64(prob)
    return np.round(prob * 100, 2)

# Apply on validation set for inspection
# Called once on the whole probability array; the result has one score per
# validation row, in the same order as y_val_proba.
val_risk_scores = probability_to_risk_score(y_val_proba)

# Preview
# First ten probabilities next to their scores, for a visual check only.
risk_preview = pd.DataFrame({
    "fraud_probability": y_val_proba[:10],
    "risk_score": val_risk_scores[:10]
})

risk_preview


Unnamed: 0,fraud_probability,risk_score
0,0.002205,0.22
1,0.000632,0.06
2,0.002015,0.2
3,0.002773,0.28
4,0.004409,0.44
5,0.005993,0.6
6,0.347911,34.790001
7,0.004664,0.47
8,0.007336,0.73
9,0.000155,0.02


In [29]:
# ======================================
# Cell 14: Risk Level Assignment
# ======================================

def assign_risk_level(risk_score, low_max=20, medium_max=70):
    """
    Bucket a 0–100 risk score into "LOW", "MEDIUM" or "HIGH".

    The cut-offs are parameters so the values stored in the exported
    artifact's "risk_thresholds" ("low_max", "medium_max") can be passed
    straight in. The defaults reproduce the original hard-coded 20 / 70.

    Parameters
    ----------
    risk_score : float
        Risk score to classify.
    low_max : float, default 20
        Scores strictly below this value are "LOW".
    medium_max : float, default 70
        Scores from ``low_max`` up to, but not including, this value are
        "MEDIUM"; anything else is "HIGH".

    Returns
    -------
    str
        "LOW", "MEDIUM" or "HIGH". A NaN score fails both comparisons and is
        therefore labelled "HIGH".
    """
    if risk_score < low_max:
        return "LOW"
    if risk_score < medium_max:
        return "MEDIUM"
    return "HIGH"

# Apply on validation set
# Wrapping the score array in a Series gives it a fresh 0..n-1 index, so the
# result is positional and not aligned to X_val's original row labels.
val_risk_levels = pd.Series(val_risk_scores).apply(assign_risk_level)

# Preview distribution
# Share of validation rows per level, in percent.
risk_level_distribution = val_risk_levels.value_counts(normalize=True) * 100

print("Risk level distribution (%):")
print(risk_level_distribution.round(3))


Risk level distribution (%):
LOW       97.365
MEDIUM     1.748
HIGH       0.887
Name: proportion, dtype: float64


In [30]:
# ======================================
# Cell 15: Confidence Score
# ======================================

def compute_confidence(prob):
    """
    Express how far a probability lies from 0.5 as a percentage (0–100).

    A probability of exactly 0.5 gives 0; a probability of 0 or 1 gives 100.
    The result is rounded to two decimals.
    """
    distance_from_midpoint = abs(prob - 0.5)
    confidence_percent = distance_from_midpoint * 2 * 100
    return np.round(confidence_percent, 2)

# Apply on validation set
# compute_confidence uses only array-friendly operations (subtraction, abs,
# scaling, np.round), so one call on the whole array replaces the original
# per-element Python loop over every validation probability.
# The array is promoted to float64 first so the two-decimal rounding does not
# pick up float32 representation artefacts (predict_proba output appears to be
# float32 here - TODO confirm).
val_confidence_scores = compute_confidence(np.asarray(y_val_proba, dtype=np.float64))

# Preview
# First ten rows of every derived quantity side by side. val_risk_levels is a
# Series with a 0..n-1 index, which lines up with the frame's default index.
confidence_preview = pd.DataFrame({
    "fraud_probability": y_val_proba[:10],
    "confidence_percent": val_confidence_scores[:10],
    "risk_score": val_risk_scores[:10],
    "risk_level": val_risk_levels[:10]
})

confidence_preview


Unnamed: 0,fraud_probability,confidence_percent,risk_score,risk_level
0,0.002205,99.56,0.22,LOW
1,0.000632,99.87,0.06,LOW
2,0.002015,99.6,0.2,LOW
3,0.002773,99.45,0.28,LOW
4,0.004409,99.12,0.44,LOW
5,0.005993,98.8,0.6,LOW
6,0.347911,30.42,34.790001,MEDIUM
7,0.004664,99.07,0.47,LOW
8,0.007336,98.53,0.73,LOW
9,0.000155,99.97,0.02,LOW


In [31]:
# ======================================
# Cell 16: MCP-Style Risk Decision Output
# ======================================

# Version tag attached to every decision row and to the exported artifact
MODEL_VERSION = "xgb_fraud_v1"

def decision_from_risk(risk_level):
    """
    Translate a risk level into the action to take on the transaction.

    "LOW" -> "APPROVE", "MEDIUM" -> "STEP_UP_AUTH"; every other value
    (including "HIGH") -> "BLOCK".
    """
    if risk_level == "LOW":
        return "APPROVE"
    if risk_level == "MEDIUM":
        return "STEP_UP_AUTH"
    # Anything that is neither LOW nor MEDIUM falls through to BLOCK
    return "BLOCK"

# Build structured output for validation samples
# transaction_id was dropped from X, so it is read back from df using the row
# labels that the split assigned to X_val. .values strips the pandas index so
# every column is combined positionally, in X_val order.
mcp_style_output = pd.DataFrame({
    "transaction_id": df.loc[X_val.index, "transaction_id"].values,
    "fraud_probability": np.round(y_val_proba, 6),
    "risk_score": val_risk_scores,
    "risk_level": val_risk_levels.values,
    "confidence_percent": val_confidence_scores,
})

# The decision is derived purely from the risk level; model_version is the
# same constant on every row.
mcp_style_output["decision"] = mcp_style_output["risk_level"].apply(decision_from_risk)
mcp_style_output["model_version"] = MODEL_VERSION

# Preview
mcp_style_output.head(10)


Unnamed: 0,transaction_id,fraud_probability,risk_score,risk_level,confidence_percent,decision,model_version
0,550fa3f271fde2d87ad6216837adbb4f61c1b1c5dd7077...,0.002205,0.22,LOW,99.56,APPROVE,xgb_fraud_v1
1,72b1666538d5f94eed4a2de4df8dec7492b134ce5c9406...,0.000632,0.06,LOW,99.87,APPROVE,xgb_fraud_v1
2,748119f5f7b79cc5c658cc6006adfc17fd63f01960d8b1...,0.002015,0.2,LOW,99.6,APPROVE,xgb_fraud_v1
3,921867ace4189a6e5a259b269c01aa678c573454af40bf...,0.002773,0.28,LOW,99.45,APPROVE,xgb_fraud_v1
4,60afbcaf3fe69fc6cc898a33d51a6329a34fd81b6669c4...,0.004409,0.44,LOW,99.12,APPROVE,xgb_fraud_v1
5,9826a1d5bd5f81a20ce192b19981cc23ded5047db46fb4...,0.005993,0.6,LOW,98.8,APPROVE,xgb_fraud_v1
6,08a373be4c42a7266968d7c8471f5afa458dafdcedc5fd...,0.347911,34.790001,MEDIUM,30.42,STEP_UP_AUTH,xgb_fraud_v1
7,839ceabcdfb31e23d4fc9b48bed7be7069a2ced2eaf30e...,0.004664,0.47,LOW,99.07,APPROVE,xgb_fraud_v1
8,5c790ea8c0932b46cfa28804edf2a4b530cb6e440650a1...,0.007336,0.73,LOW,98.53,APPROVE,xgb_fraud_v1
9,0b36dda011c9ac5202950d5b8c80c1d76a3fbaa5e42359...,0.000155,0.02,LOW,99.97,APPROVE,xgb_fraud_v1


In [32]:
# ======================================
# Cell 17: Final Training on Full Dataset
# ======================================

# Class-imbalance weight recomputed over every label, not just the training split
neg_full, pos_full = ((y == label).sum() for label in (0, 1))
scale_pos_weight_full = neg_full / pos_full

print("Final scale_pos_weight:", scale_pos_weight_full)

# Same hyperparameters as the baseline model; only the class weight differs
final_xgb_params = dict(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight_full,
    objective="binary:logistic",
    eval_metric="auc",
    tree_method="hist",
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
final_xgb_model = XGBClassifier(**final_xgb_params)

# Fit on all rows (training and validation parts together)
final_xgb_model.fit(X, y)

print("Final model training complete.")


Final scale_pos_weight: 773.7010836478753
Final model training complete.


In [33]:
# ======================================
# Cell 18: Export Model & Metadata (.pkl)
# ======================================

# Everything needed to score a transaction outside this notebook: the model,
# the feature order it was fit with, the 'type' encoding and the cut-offs
# that turn a risk score into a risk level.
export_artifact = {
    # Ship the model refit on the full dataset. The original exported
    # `xgb_model`, the baseline fit on the 80% training split, which left
    # `final_xgb_model` trained but never used.
    "model": final_xgb_model,
    "feature_names": X.columns.tolist(),
    "type_mapping": type_mapping,
    # Same cut-offs as assign_risk_level: < low_max is LOW, < medium_max is
    # MEDIUM, otherwise HIGH.
    "risk_thresholds": {
        "low_max": 20,
        "medium_max": 70
    },
    "model_version": MODEL_VERSION
}

# Save to pickle
# NOTE(review): a pickled estimator can only be loaded where compatible
# xgboost / scikit-learn versions are installed, and pickle files must never
# be loaded from untrusted sources. Consider also saving the booster with
# XGBoost's own save_model for long-term storage.
with open("upi_fraud_risk_model.pkl", "wb") as f:
    pickle.dump(export_artifact, f)

print("Model exported successfully as upi_fraud_risk_model.pkl")


Model exported successfully as upi_fraud_risk_model.pkl
