
##  Objective
Build a scalable and interpretable Machine Learning system to **proactively detect fraudulent transactions** and provide **actionable business insights**.

Dataset size: **6.3 million rows × 10 columns**

## 1. Import Libraries & Configuration

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    precision_recall_curve
)

import warnings
warnings.filterwarnings('ignore')

## 2. Load Dataset (Memory Optimized)

In [2]:
dtypes = {
    'step': 'int16',
    'type': 'category',
    'amount': 'float32',
    'oldbalanceOrg': 'float32',
    'newbalanceOrig': 'float32',
    'oldbalanceDest': 'float32',
    'newbalanceDest': 'float32',
    'isFraud': 'int8',
    'isFlaggedFraud': 'int8'
}

df = pd.read_csv('fraud.csv', dtype=dtypes)
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.639648,C1231006815,170136.0,160296.359375,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.280029,C1666544295,21249.0,19384.720703,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.139648,C2048537720,41554.0,29885.859375,M1230701703,0.0,0.0,0,0


## 3. Data Quality Checks

In [3]:
df.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

### Target Variable Distribution (Class Imbalance)

In [4]:
df['isFraud'].value_counts(normalize=True)

isFraud
0    0.998709
1    0.001291
Name: proportion, dtype: float64

## 4. Feature Engineering

In [5]:
df['log_amount'] = np.log1p(df['amount'])
df['balance_diff_orig'] = df['oldbalanceOrg'] - df['newbalanceOrig']
df['balance_diff_dest'] = df['newbalanceDest'] - df['oldbalanceDest']

## 5. Exploratory Data Analysis

In [6]:
pd.crosstab(df['type'], df['isFraud'], normalize='index')

isFraud,0,1
type,Unnamed: 1_level_1,Unnamed: 2_level_1
CASH_IN,1.0,0.0
CASH_OUT,0.99816,0.00184
DEBIT,1.0,0.0
PAYMENT,1.0,0.0
TRANSFER,0.992312,0.007688


In [7]:
df.select_dtypes(include='object').head()


Unnamed: 0,nameOrig,nameDest
0,C1231006815,M1979787155
1,C1666544295,M2044282225
2,C1305486145,C553264065
3,C840083671,C38997010
4,C2048537720,M1230701703


In [8]:
df = df.drop(columns=['nameOrig', 'nameDest'])


## 6. Train–Validation Split

In [9]:
X = df.drop(columns=['isFraud'])
y = df['isFraud']

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

## 7. Model Development – Logistic Regression

In [10]:
categorical_cols = ['type']
numerical_cols = X_train.select_dtypes(exclude='category').columns

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', 'passthrough', numerical_cols)
])

model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    n_jobs=-1
)

pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', model)
])

pipeline.fit(X_train, y_train)

## 8. Model Evaluation

In [11]:
y_pred_proba = pipeline.predict_proba(X_val)[:, 1]
roc_auc_score(y_val, y_pred_proba)

np.float64(0.9911838301148915)

### Classification Report

In [12]:
y_pred = (y_pred_proba > 0.5).astype(int)
print(classification_report(y_val, y_pred))
confusion_matrix(y_val, y_pred)

              precision    recall  f1-score   support

           0       1.00      0.95      0.98   1270881
           1       0.03      0.95      0.05      1643

    accuracy                           0.95   1272524
   macro avg       0.51      0.95      0.51   1272524
weighted avg       1.00      0.95      0.97   1272524



array([[1211016,   59865],
       [     77,    1566]])

## 9. Feature Importance (Interpretability)

In [13]:
feature_names = pipeline.named_steps['preprocess'].get_feature_names_out()
coef_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': pipeline.named_steps['model'].coef_[0]
}).sort_values(by='Coefficient', ascending=False)

coef_df.head(10)

Unnamed: 0,Feature,Coefficient
4,cat__type_TRANSFER,4.732647
1,cat__type_CASH_OUT,2.636967
11,num__isFlaggedFraud,0.08287107
5,num__step,0.002952366
10,num__newbalanceDest,0.0001780526
7,num__oldbalanceOrg,1.501648e-05
13,num__balance_diff_orig,1.095531e-05
6,num__amount,2.262647e-07
8,num__newbalanceOrig,-1.509865e-05
9,num__oldbalanceDest,-0.0001780861
