In [3]:
import pandas as pd
import numpy as np

# Load the preprocessed dataset
df = pd.read_csv("../data/processed/preprocessed_transactions.csv", parse_dates=["timestamp"])
print("Data loaded:", df.shape)
df.head()


Data loaded: (174780, 14)


Unnamed: 0,transaction_id,customer_id,timestamp,amount,merchant_category,merchant_id,device_id,location,is_fraud,hour_of_day,day_of_week,is_night,is_weekend,amount_scaled
0,159943,4575,2023-01-01 00:10:00,35747.94,7,7906,5,2,0,0,6,1,1,-0.580246
1,887,25,2023-01-01 00:10:00,41011.8,7,3162,4,0,1,0,6,1,1,-0.457207
2,2381,68,2023-01-01 00:21:00,97614.54,2,8815,2,5,0,0,6,1,1,0.865847
3,121485,3455,2023-01-01 00:36:00,58350.27,5,8060,0,3,0,0,6,1,1,-0.051931
4,88703,2519,2023-01-01 00:51:00,93539.79,7,5597,0,4,0,0,6,1,1,0.770603


In [None]:
# Compute customer-level aggregates
# These features summarize each customer’s typical transaction patterns
customer_agg = df.groupby("customer_id").agg(
    avg_amount_per_user=("amount", "mean"),
    std_amount_per_user=("amount", "std"),
    unique_devices_per_user=("device_id", "nunique"),
    unique_locations_per_user=("location", "nunique"),
    total_txns=("transaction_id", "count")
).reset_index()

# Fill std NaNs (in case user has only 1 txn)
customer_agg["std_amount_per_user"] = customer_agg["std_amount_per_user"].fillna(0)
customer_agg.head()


Unnamed: 0,customer_id,avg_amount_per_user,std_amount_per_user,unique_devices_per_user,unique_locations_per_user,total_txns
0,1,68131.1836,27485.836054,6,4,50
1,2,103639.102766,48843.224566,6,4,47
2,3,51499.4915,28547.115209,6,2,20
3,4,120810.399167,42042.801031,6,3,36
4,5,61219.286,25127.247508,6,5,40


In [4]:
# Step 3: Merge aggregates back to transactions

#We join these customer profiles back into each transaction to enrich it.

df = df.merge(customer_agg, on="customer_id", how="left")
df.head()




Unnamed: 0,transaction_id,customer_id,timestamp,amount,merchant_category,merchant_id,device_id,location,is_fraud,hour_of_day,day_of_week,is_night,is_weekend,amount_scaled,avg_amount_per_user,std_amount_per_user,unique_devices_per_user,unique_locations_per_user,total_txns
0,159943,4575,2023-01-01 00:10:00,35747.94,7,7906,5,2,0,0,6,1,1,-0.580246,59793.6052,25932.929281,6,4,50
1,887,25,2023-01-01 00:10:00,41011.8,7,3162,4,0,1,0,6,1,1,-0.457207,30864.253333,24340.600055,6,2,42
2,2381,68,2023-01-01 00:21:00,97614.54,2,8815,2,5,0,0,6,1,1,0.865847,56124.793939,23407.726005,6,3,33
3,121485,3455,2023-01-01 00:36:00,58350.27,5,8060,0,3,0,0,6,1,1,-0.051931,27133.586522,21855.917304,6,1,23
4,88703,2519,2023-01-01 00:51:00,93539.79,7,5597,0,4,0,0,6,1,1,0.770603,90759.971026,38881.60798,6,2,39


In [5]:
df.columns

Index(['transaction_id', 'customer_id', 'timestamp', 'amount',
       'merchant_category', 'merchant_id', 'device_id', 'location', 'is_fraud',
       'hour_of_day', 'day_of_week', 'is_night', 'is_weekend', 'amount_scaled',
       'avg_amount_per_user', 'std_amount_per_user', 'unique_devices_per_user',
       'unique_locations_per_user', 'total_txns'],
      dtype='object')

In [6]:
# Ratio of current amount to user's average
df["amount_to_avg_ratio"] = df["amount"] / (df["avg_amount_per_user"] + 1e-6)

# Flag if amount is far outside normal range
df["is_high_deviation"] = (abs(df["amount"] - df["avg_amount_per_user"]) > 2 * df["std_amount_per_user"]).astype(int)


In [7]:
df = df.sort_values(["customer_id", "timestamp"])

def rolling_counts(group):
    group = group.sort_values("timestamp")
    group["txns_last_7d"] = (
        group.rolling("7D", on="timestamp").count()["customer_id"] - 1
    )
    group["txns_last_30d"] = (
        group.rolling("30D", on="timestamp").count()["customer_id"] - 1
    )
    return group

df = df.groupby("customer_id", group_keys=False).apply(rolling_counts)



  df = df.groupby("customer_id", group_keys=False).apply(rolling_counts)


In [8]:
feature_cols = [
    "amount", "amount_scaled", "merchant_category", "merchant_id",
    "device_id", "location", "hour_of_day", "day_of_week", "is_night", "is_weekend",
    "avg_amount_per_user", "std_amount_per_user", "unique_devices_per_user",
    "unique_locations_per_user", "amount_to_avg_ratio", "is_high_deviation",
    "txns_last_7d", "txns_last_30d", "is_new_device", "is_new_location"
]

target_col = "is_fraud"


In [9]:
df[feature_cols + [target_col]].to_csv("../artifacts/feature_engineered_transactions.csv", index=False)
print("✅ Feature engineering complete! Saved as feature_engineered_transactions.csv")


KeyError: "['is_new_device', 'is_new_location'] not in index"

In [10]:
set(feature_cols) - set(df.columns)


{'is_new_device', 'is_new_location'}