# 01 — Data Cleaning & Feature Engineering

### 🎯 Business Objective
Digital transactions carry fraud risk that increases business losses. To detect and prevent fraudulent transactions efficiently, this notebook cleans the raw transaction data and engineers features for a reliable fraud detection model.

**Output of this notebook:**  
A processed, feature-engineered dataset saved to  
`data/processed/onlinefraud_clean.csv`


In [None]:
import os
import sys

# Make project root importable (so `src` can be found).
# Guarded so that re-running this cell does not keep appending duplicate
# entries to `sys.path`.
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

import pandas as pd
import numpy as np

from src.features import (
    add_party_type_features,
    add_balance_features,
    encode_categoricals,
    drop_leakage_columns,
)

# Show every column when a DataFrame is displayed (no "..." truncation).
pd.options.display.max_columns = None

# Output location for the feature-engineered dataset, and the raw input it is
# built from. Both are relative paths, resolved against the working directory.
PROCESSED_PATH = "../data/processed/onlinefraud_clean.csv"
RAW_PATH = "../data/raw/onlinefraud.csv"


In [None]:
# Read the raw transactions CSV into memory. Later cells start from this `df`.
df = pd.read_csv(filepath_or_buffer=RAW_PATH)

# Report (rows, columns), then preview the first five records.
print("Raw data shape:", df.shape)
df.head(5)




Raw data shape: (6362620, 11)


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [15]:
# Column dtypes and memory footprint (printed by `info` itself).
df.info()

# Summary statistics for the numeric columns, transposed so each column
# becomes a row -- easier to read for a frame with many numeric columns.
df.describe().transpose()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
step,6362620.0,243.3972,142.332,1.0,156.0,239.0,335.0,743.0
amount,6362620.0,179861.9,603858.2,0.0,13389.57,74871.94,208721.5,92445520.0
oldbalanceOrg,6362620.0,833883.1,2888243.0,0.0,0.0,14208.0,107315.2,59585040.0
newbalanceOrig,6362620.0,855113.7,2924049.0,0.0,0.0,0.0,144258.4,49585040.0
oldbalanceDest,6362620.0,1100702.0,3399180.0,0.0,0.0,132705.665,943036.7,356015900.0
newbalanceDest,6362620.0,1224996.0,3674129.0,0.0,0.0,214661.44,1111909.0,356179300.0
isFraud,6362620.0,0.00129082,0.0359048,0.0,0.0,0.0,0.0,1.0
isFlaggedFraud,6362620.0,2.514687e-06,0.001585775,0.0,0.0,0.0,0.0,1.0


In [18]:
# Null count for every column currently present in `df`.
print("Missing values per column:", df.isna().sum(), sep="\n")

# Class balance of the target, expressed as a percentage of all rows.
print("\nFraud distribution (%):")
print(100 * df["isFraud"].value_counts(normalize=True))


Missing values per column:
step               0
type               0
amount             0
nameOrig           0
oldbalanceOrg      0
newbalanceOrig     0
nameDest           0
oldbalanceDest     0
newbalanceDest     0
isFraud            0
isFlaggedFraud     0
orig_party_type    0
dest_party_type    0
dtype: int64

Fraud distribution (%):
isFraud
0    99.870918
1     0.129082
Name: proportion, dtype: float64


In [19]:
# Feature engineering, applied in order:
#   1. customer / merchant roles for the origin and destination parties
#   2. balance movement features
#
# NOTE: this rebinds `df`. Any earlier cell that is re-run after this one will
# see the engineered columns as well (e.g. `orig_party_type`), so use
# Restart & Run All when regenerating outputs.
df = (
    df
    .pipe(add_party_type_features)
    .pipe(add_balance_features)
)

# Quick peek at the new columns next to the raw fields they sit beside.
df.loc[:, [
    "nameOrig", "orig_party_type",
    "nameDest", "dest_party_type",
    "oldbalanceOrg", "newbalanceOrig", "orig_balance_change",
    "oldbalanceDest", "newbalanceDest", "dest_balance_change", "net_balance_change",
]].head(5)


Unnamed: 0,nameOrig,orig_party_type,nameDest,dest_party_type,oldbalanceOrg,newbalanceOrig,orig_balance_change,oldbalanceDest,newbalanceDest,dest_balance_change,net_balance_change
0,C1231006815,customer,M1979787155,merchant,170136.0,160296.36,-9839.64,0.0,0.0,0.0,-9839.64
1,C1666544295,customer,M2044282225,merchant,21249.0,19384.72,-1864.28,0.0,0.0,0.0,-1864.28
2,C1305486145,customer,C553264065,customer,181.0,0.0,-181.0,0.0,0.0,0.0,-181.0
3,C840083671,customer,C38997010,customer,181.0,0.0,-181.0,21182.0,0.0,-21182.0,-21363.0
4,C2048537720,customer,M1230701703,merchant,41554.0,29885.86,-11668.14,0.0,0.0,0.0,-11668.14


In [20]:
# Encode the categorical columns via the project helper. In the recorded run
# this yields boolean indicator columns such as `type_*` and
# `dest_party_type_merchant`.
df_encoded = df.pipe(encode_categoricals)

# Label and shape on separate lines, followed by a five-row preview.
print("After encoding categoricals:", df_encoded.shape, sep="\n")
df_encoded.head(5)


After encoding categoricals:
(6362620, 18)


Unnamed: 0,step,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,orig_balance_change,dest_balance_change,net_balance_change,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,dest_party_type_merchant
0,1,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,-9839.64,0.0,-9839.64,False,False,True,False,True
1,1,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,-1864.28,0.0,-1864.28,False,False,True,False,True
2,1,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,-181.0,0.0,-181.0,False,False,False,True,False
3,1,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,-181.0,-21182.0,-21363.0,True,False,False,False,False
4,1,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,-11668.14,0.0,-11668.14,False,False,True,False,True


In [21]:
# Remove columns that would leak information into the model. In the recorded
# run the `nameOrig` / `nameDest` identifier columns disappear (18 -> 16 columns).
df_clean = df_encoded.pipe(drop_leakage_columns)

# Label and shape on separate lines, followed by a five-row preview.
print("After dropping ID columns:", df_clean.shape, sep="\n")
df_clean.head(n=5)


After dropping ID columns:
(6362620, 16)


Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,orig_balance_change,dest_balance_change,net_balance_change,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,dest_party_type_merchant
0,1,9839.64,170136.0,160296.36,0.0,0.0,0,0,-9839.64,0.0,-9839.64,False,False,True,False,True
1,1,1864.28,21249.0,19384.72,0.0,0.0,0,0,-1864.28,0.0,-1864.28,False,False,True,False,True
2,1,181.0,181.0,0.0,0.0,0.0,1,0,-181.0,0.0,-181.0,False,False,False,True,False
3,1,181.0,181.0,0.0,21182.0,0.0,1,0,-181.0,-21182.0,-21363.0,True,False,False,False,False
4,1,11668.14,41554.0,29885.86,0.0,0.0,0,0,-11668.14,0.0,-11668.14,False,False,True,False,True


In [22]:
# Final schema check: dtypes and memory usage of the cleaned frame.
df_clean.info()

# Re-check the target's class balance (in percent) on the cleaned frame so it
# can be compared with the distribution printed for the raw data.
print(
    "\nFraud distribution in cleaned data (%):",
    100 * df_clean["isFraud"].value_counts(normalize=True),
    sep="\n",
)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 16 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   step                      int64  
 1   amount                    float64
 2   oldbalanceOrg             float64
 3   newbalanceOrig            float64
 4   oldbalanceDest            float64
 5   newbalanceDest            float64
 6   isFraud                   int64  
 7   isFlaggedFraud            int64  
 8   orig_balance_change       float64
 9   dest_balance_change       float64
 10  net_balance_change        float64
 11  type_CASH_OUT             bool   
 12  type_DEBIT                bool   
 13  type_PAYMENT              bool   
 14  type_TRANSFER             bool   
 15  dest_party_type_merchant  bool   
dtypes: bool(5), float64(8), int64(3)
memory usage: 564.3 MB

Fraud distribution in cleaned data (%):
isFraud
0    99.870918
1     0.129082
Name: proportion, dtype: float64


In [23]:
# Ensure the output directory exists; `exist_ok=True` makes this safe to re-run.
os.makedirs(name=os.path.dirname(PROCESSED_PATH), exist_ok=True)

# Persist the cleaned dataset without the index column, then confirm the
# destination path.
df_clean.to_csv(path_or_buf=PROCESSED_PATH, index=False)
print("Saved cleaned dataset to:", PROCESSED_PATH)


Saved cleaned dataset to: ../data/processed/onlinefraud_clean.csv
