<h3>Setup</h3>

In [1]:
# Imports
import kagglehub
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [20]:
# Load Kaggle dataset
def load_kaggle_csv(dataset, filename):
    path = Path(kagglehub.dataset_download(dataset))
    return pd.read_csv(path / filename)

df = load_kaggle_csv(
    "chitwanmanchanda/fraudulent-transactions-data",
    "Fraud.csv"
)

# Sort by time
df = df.sort_values('step').reset_index(drop=True)



<h3>Feature Engineering</h3>

In [21]:
# Calculate change in balance for both accounts
df['orig_delta'] = df['newbalanceOrig'] - df['oldbalanceOrg']
df['dest_delta'] = df['newbalanceDest'] - df['oldbalanceDest']

# Calculate change in balance for origin
mask_orig_customer = ~df['nameOrig'].str.startswith('M')
df['orig_delta'] = 0  # default 0 for merchants
df.loc[mask_orig_customer, 'orig_delta'] = (
    df.loc[mask_orig_customer, 'newbalanceOrig'] - 
    df.loc[mask_orig_customer, 'oldbalanceOrg']
)

# Calculate change in balance for destination
mask_dest_customer = ~df['nameDest'].str.startswith('M')
df['dest_delta'] = 0  # default 0 for merchants
df.loc[mask_dest_customer, 'dest_delta'] = (
    df.loc[mask_dest_customer, 'newbalanceDest'] - 
    df.loc[mask_dest_customer, 'oldbalanceDest']
)

# Initialize previous fraud flag
df['orig_prev_fraud'] = 0
df['dest_prev_fraud'] = 0

# Keep track of seen customers
customer_fraud_history = {}

for i, row in df.iterrows():
    orig = row['nameOrig']
    dest = row['nameDest']
    
    # Set flag for origin customer
    df.at[i, 'orig_prev_fraud'] = customer_fraud_history.get(orig, 0)
    # Set flag for destination customer
    df.at[i, 'dest_prev_fraud'] = customer_fraud_history.get(dest, 0)
    
    # Update history if this transaction is fraud
    if row['isFraud'] == 1:
        customer_fraud_history[orig] = 1
        customer_fraud_history[dest] = 1

# Drop balance figures and answer column
df = df.drop(columns=[
    'oldbalanceOrg', 'newbalanceOrig',
    'oldbalanceDest', 'newbalanceDest',
    'isFlaggedFraud'
])

# Apply one hot encording for transaction type
pd.get_dummies(df, columns=['type'], drop_first=True)

Unnamed: 0,step,amount,nameOrig,nameDest,isFraud,orig_delta,dest_delta,orig_prev_fraud,dest_prev_fraud,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,1,9839.64,C1231006815,M1979787155,0,-9839.64,0.00,0,0,0,0,1,0
1,1,5157.05,C1562950869,M2021835850,0,-1667.92,0.00,0,0,0,0,1,0
2,1,5746.44,C845388562,M550572371,0,0.00,0.00,0,0,0,0,1,0
3,1,5607.36,C948424584,M1447685190,0,-5202.00,0.00,0,0,0,0,1,0
4,1,6360.79,C2027701910,M1345293143,0,-3731.00,0.00,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,339682.13,C2013999242,C1850423904,1,-339682.13,0.00,0,0,0,0,0,1
6362616,743,339682.13,C786484425,C776919290,1,-339682.13,339682.13,0,0,1,0,0,0
6362617,743,6311409.28,C1529008245,C1881841831,1,-6311409.28,0.00,0,0,0,0,0,1
6362618,743,6311409.28,C1162922333,C1365125890,1,-6311409.28,6311409.27,0,0,1,0,0,0


In [22]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,nameDest,isFraud,orig_delta,dest_delta,orig_prev_fraud,dest_prev_fraud
0,1,PAYMENT,9839.64,C1231006815,M1979787155,0,-9839.64,0.0,0,0
1,1,PAYMENT,5157.05,C1562950869,M2021835850,0,-1667.92,0.0,0,0
2,1,PAYMENT,5746.44,C845388562,M550572371,0,0.0,0.0,0,0
3,1,PAYMENT,5607.36,C948424584,M1447685190,0,-5202.0,0.0,0,0
4,1,PAYMENT,6360.79,C2027701910,M1345293143,0,-3731.0,0.0,0,0


<h3>Machine Learning</h3>

In [23]:
X = df.drop('isFraud', axis=1)
y = df['isFraud']

split_index = int(len(df) * 0.8)
X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

In [4]:
print(f"X_train shape {X_train.shape}")
print(f"X_test shape {X_test.shape}")

X_train shape (5090096, 10)
X_test shape (1272524, 10)


In [7]:
X_train

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFlaggedFraud
292779,15,PAYMENT,9914.74,C482751146,44248.00,34333.26,M1651188591,0.00,0.00,0
499763,20,PAYMENT,6854.53,C188264521,0.00,0.00,M1469015863,0.00,0.00,0
2970411,231,CASH_OUT,361211.80,C593201095,0.00,0.00,C1985763166,489745.16,850956.95,0
3137549,236,PAYMENT,7083.51,C1617277615,0.00,0.00,M1529547196,0.00,0.00,0
1500682,143,CASH_IN,218019.51,C1705563354,13045685.58,13263705.09,C2121401221,2438123.98,2220104.47,0
...,...,...,...,...,...,...,...,...,...,...
1524870,153,PAYMENT,1895.99,C1302053063,0.00,0.00,M1494049570,0.00,0.00,0
5834821,402,CASH_OUT,347110.99,C788162540,103785.00,0.00,C1771727447,87871.75,434982.74,0
4182953,304,PAYMENT,13259.63,C545341724,0.00,0.00,M253734860,0.00,0.00,0
3985280,298,PAYMENT,24122.92,C1831600144,0.00,0.00,M1382629737,0.00,0.00,0


In [5]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

ValueError: could not convert string to float: 'PAYMENT'

In [None]:
linkcode
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))