# Transactional Fraud Modeling

In [5]:
import pandas as pd
import numpy as np

# Load the dataset
file_path = 'data/Fraud.csv'  # Update the file path as necessary

df = pd.read_csv(file_path)
df = df.drop('isFlaggedFraud', axis = 1)
# View basic info about the data
print(df.info())
print(df.describe())

# Handle Missing Values
# Assume that missing balances for 'oldbalanceDest' and 'newbalanceDest' are 0 for merchants
df['oldbalanceDest'].fillna(0, inplace=True)
df['newbalanceDest'].fillna(0, inplace=True)

# Remove duplicates
df = df.drop_duplicates()

# Convert data types
# Convert 'type' to a categorical feature
df['type'] = df['type'].astype('category')

# Feature Engineering
# Encode 'type' as numeric
df['type'] = df['type'].cat.codes

# Derive new features related to balance changes
df['balanceOrg_diff'] = df['newbalanceOrig'] - df['oldbalanceOrg'] - df['amount']
df['balanceDest_diff'] = df['newbalanceDest'] - df['oldbalanceDest'] + df['amount']

# Drop irrelevant columns
df = df.drop(columns=['nameOrig', 'nameDest'])

# Standardize numerical features
from sklearn.preprocessing import StandardScaler

numerical_columns = ['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'balanceOrg_diff', 'balanceDest_diff']
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# Verify cleaning
print(df.info())
print(df.describe())

# Save the cleaned dataset
df.to_csv('data/cleaned_fraudulent_transactions.csv', index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 10 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
dtypes: float64(5), int64(2), object(3)
memory usage: 485.4+ MB
None
               step        amount  oldbalanceOrg  newbalanceOrig  \
count  6.362620e+06  6.362620e+06   6.362620e+06    6.362620e+06   
mean   2.433972e+02  1.798619e+05   8.338831e+05    8.551137e+05   
std    1.423320e+02  6.038582e+05   2.888243e+06    2.924049e+06   
min    1.000000e+00  0.000000e+00   0.000000e+00    0.000000e+00   
25%    1.560000e+02  1.338957e+04   0.000000e+00    0.000000e+00   
50%    2.390000e+02  7.487194e+04   1.420800e+04    0.000000e+

In [6]:
from sklearn.model_selection import train_test_split

# Assuming `df` is the cleaned DataFrame and 'is_fraud' is the target column
X = df.drop('isFraud', axis=1)
y = df['isFraud']

# Split the data into training (80%) and temporary (20%) sets with stratification to handle the imbalance
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Print the size of each split to verify
print(f"Training Set: {X_train.shape}, Test Set: {X_test.shape}")
print(f"Fraud in Training Set: {y_train.sum()}, Test Set: {y_test.sum()}")

Training Set: (5090096, 9), Test Set: (1272524, 9)
Fraud in Training Set: 6570, Test Set: 1643


In [11]:
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

classes = np.array([0, 1])
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
weights = dict(zip(classes, class_weights))

# Logistic Regression
log_reg = LogisticRegression(class_weight=weights, random_state=42)
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_log))

# Decision Tree
decision_tree = DecisionTreeClassifier(class_weight=weights, random_state=42)
decision_tree.fit(X_train, y_train)
y_pred_tree = decision_tree.predict(X_test)
print("\nDecision Tree Classification Report:")
print(classification_report(y_test, y_pred_tree))


Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98    635441
           1       0.03      0.86      0.06       821

    accuracy                           0.96    636262
   macro avg       0.51      0.91      0.52    636262
weighted avg       1.00      0.96      0.98    636262


Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    635441
           1       0.89      0.87      0.88       821

    accuracy                           1.00    636262
   macro avg       0.94      0.94      0.94    636262
weighted avg       1.00      1.00      1.00    636262

