In [1]:
import pandas as pd
import numpy as np

# Read the synthetic transactions CSV; "timestamp" is parsed as datetimes.
raw_csv_path = "../data/raw/bank_transactions.csv"
df = pd.read_csv(raw_csv_path, parse_dates=["timestamp"])

# Sanity report: dimensions, first rows, and the null count of every column.
null_counts = df.isna().sum()
print("Shape:", df.shape)
print("\nPreview:", df.head(), sep="\n")
print("\nMissing values:", null_counts, sep="\n")


Shape: (174780, 9)

Preview:
   transaction_id  customer_id           timestamp    amount  \
0          159943         4575 2023-01-01 00:10:00  35747.94   
1             887           25 2023-01-01 00:10:00  41011.80   
2            2381           68 2023-01-01 00:21:00  97614.54   
3          121485         3455 2023-01-01 00:36:00  58350.27   
4           88703         2519 2023-01-01 00:51:00  93539.79   

  merchant_category merchant_id    device_id location  is_fraud  
0       Restaurants       M8906    iPhone_12   Ibadan         0  
1       Restaurants       M4162   Windows_PC    Abuja         1  
2           Fashion       M9815  Samsung_S21    Lagos         0  
3            Health       M9060  Infinix_Hot   Kaduna         0  
4       Restaurants       M6597  Infinix_Hot     Kano         0  

Missing values:
transaction_id       0
customer_id          0
timestamp            0
amount               0
merchant_category    0
merchant_id          0
device_id            0
location    

In [2]:
# Keep only the first row seen for each transaction_id.
df = df.drop_duplicates(subset=["transaction_id"])

# Build one per-column fill mapping: "Unknown" for the categorical columns,
# and the median of the de-duplicated amounts for "amount".
categorical_cols = ["merchant_category", "merchant_id", "device_id", "location"]
fill_values = dict.fromkeys(categorical_cols, "Unknown")
fill_values["amount"] = df["amount"].median()

# Apply every fill in a single pass; columns not in the mapping are untouched.
df = df.fillna(value=fill_values)


In [3]:
# Extracting time based features
# Hour of day and weekday number are read off the parsed "timestamp" column
# (pandas numbers weekdays Monday=0 ... Sunday=6).
df["hour_of_day"] = df["timestamp"].dt.hour
df["day_of_week"] = df["timestamp"].dt.dayofweek

# Flags are computed with vectorized comparisons instead of a per-row Python
# lambda; the resulting 0/1 values are the same.
# is_night: hour is before 06:00 or after 22:59 (i.e. hours 23 and 0-5).
df["is_night"] = ((df["hour_of_day"] < 6) | (df["hour_of_day"] > 22)).astype("int64")
# is_weekend: weekday number 5 or 6.
df["is_weekend"] = (df["day_of_week"] >= 5).astype("int64")


In [4]:
from sklearn.preprocessing import LabelEncoder

# One fresh encoder per categorical column, kept in label_encoders so the
# fitted encoder for each column remains available after this cell.
categorical_cols = ["merchant_category", "merchant_id", "device_id", "location"]
label_encoders = {column_name: LabelEncoder() for column_name in categorical_cols}

# Fit each encoder on its column and overwrite the column with the codes.
for col, encoder in label_encoders.items():
    df[col] = encoder.fit_transform(df[col])

print("Categorical features encoded successfully.")


Categorical features encoded successfully.


In [None]:
# scale data
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the single-column "amount" frame and keep the scaled
# values in a new "amount_scaled" column; "amount" itself is left unchanged.
scaler = StandardScaler()
amount_frame = df[["amount"]]
scaled_amount = scaler.fit_transform(amount_frame)
df["amount_scaled"] = scaled_amount


In [7]:
from pathlib import Path

# Write the preprocessed frame to disk without the index column. The output
# directory is created first (no-op if it already exists) so the save does
# not fail when ../data/processed is missing, e.g. on a fresh checkout.
output_path = Path("../data/processed/preprocessed_transactions.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print("✅ Preprocessing complete. Saved as preprocessed_transactions.csv")


✅ Preprocessing complete. Saved as preprocessed_transactions.csv
