# 02 - Feature Engineering


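This notebook performs feature engineering on the fraud detection dataset: it derives customer-behavior, temporal, amount, and distance features from the raw transactions and saves the enriched table to `../data/processed/`.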

**Focus: Minimizing False Alerts for Legitimate Frequent Customers**

***

In [134]:
import pandas as pd
import numpy as np
import os

# Input / output locations, relative to the notebook's working directory.
DATA_FILE = 'transactions.csv'
RAW_DATA_PATH = '../data/raw/'
PROCESSED_DATA_PATH = '../data/processed/'

# Do not truncate long cell values when frames are rendered.
pd.set_option("display.max_colwidth", None)

# The save cell at the end writes into this directory, so create it up front.
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)

In [135]:
# Resolve the raw CSV path and announce it before touching the filesystem,
# so the message is visible even when the file turns out to be missing.
input_filepath = os.path.join(RAW_DATA_PATH, DATA_FILE)
print(f"Loading dataset from: {input_filepath}...")

if os.path.exists(input_filepath):
    # NOTE(review): the rendered frame further down shows an "Unnamed: 0"
    # column, which suggests the CSV carries a saved index - confirm whether
    # downstream consumers need it before switching to index_col=0.
    df = pd.read_csv(input_filepath)
else:
    # Fail with the resolved path rather than pandas' generic error.
    raise FileNotFoundError(f"Data file not found at: {input_filepath}")

Loading dataset from: ../data/raw/transactions.csv...


In [136]:
# Derive a proper datetime column from `unix_time`, which is interpreted as
# seconds since the Unix epoch (unit='s'). The feature builders below sort by
# and diff this column.
df['transaction_datetime'] = pd.to_datetime(df['unix_time'], unit='s')

In [137]:
def create_customer_behavior_features(
    df: pd.DataFrame, freq_threshold: int = 4
) -> pd.DataFrame:
    """
    Flag frequent shoppers and put their merchant activity in context.

    Reads ``cc_num``, ``trans_month``, ``trans_num``,
    ``times_shopped_at_merchant_month`` and ``transaction_datetime`` and adds:

    * ``is_frequent_customer`` - True when ``times_shopped_at_merchant_month``
      is at least ``freq_threshold``.
    * ``customer_monthly_txns`` - count of ``trans_num`` per (card, month).
    * ``merchant_share_of_cust_month`` - merchant visits divided by the
      card's monthly transaction count, capped at 1, with NaN replaced by 0.
    * ``days_since_prev_txn`` - fractional days since the card's previous
      transaction; -1 marks the first record of each card.

    Parameters
    ----------
    df : pd.DataFrame
        Transaction table. It is not modified.
    freq_threshold : int, default 4
        Minimum monthly visits to a merchant to count as a frequent customer.

    Returns
    -------
    pd.DataFrame
        A new frame sorted by ``cc_num`` then ``transaction_datetime`` with
        the four columns above appended.
    """
    # Sort first: sort_values returns a new frame, so every column below is
    # written to that copy. Adding columns before the sort would leave the
    # caller's frame with only some of the new columns.
    df = df.sort_values(["cc_num", "transaction_datetime"])

    df["is_frequent_customer"] = df["times_shopped_at_merchant_month"] >= freq_threshold

    # How busy is the customer overall this month?
    cust_month_txns = df.groupby(["cc_num", "trans_month"])["trans_num"].transform(
        "count"
    )
    df["customer_monthly_txns"] = cust_month_txns

    # Share of customer-month traffic captured by *this* merchant
    df["merchant_share_of_cust_month"] = (
        (df["times_shopped_at_merchant_month"] / cust_month_txns)
        .clip(upper=1)
        .fillna(0)
    )

    # Days since the previous transaction on the same card. The frame is
    # already in chronological order per card, so diff() compares each row
    # with its predecessor; the first row of each card has none and gets -1.
    days_since = (
        pd.to_datetime(df["transaction_datetime"])
        .groupby(df["cc_num"])
        .diff()
        .dt.total_seconds()
        .div(86_400)
    )
    df["days_since_prev_txn"] = days_since.fillna(-1)

    return df

In [138]:
def create_temporal_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Capture weekday vs weekend and night-time usage anomalies.

    Reads ``transaction_datetime`` and ``hour`` and adds:

    * ``is_weekend`` - True for Saturday/Sunday (``dayofweek >= 5``).
    * ``is_night`` - True for hours 22, 23 and 0-5, i.e. 10 p.m. up to 6 a.m.
    * ``hour_sin`` / ``hour_cos`` - the hour mapped onto a 24-hour circle so
      that 23:00 and 00:00 end up next to each other.

    The columns are written onto ``df`` itself, and that same object is
    returned.
    """
    # errors='coerce' turns unparseable timestamps into NaT instead of raising.
    tx_time = pd.to_datetime(df['transaction_datetime'], errors='coerce')

    df['is_weekend'] = tx_time.dt.dayofweek >= 5           # Sat/Sun
    # ">= 22" so the 10 p.m. hour itself is included; "> 22" would only
    # match hour 23 and miss 22:00-22:59.
    df['is_night'] = (df['hour'] < 6) | (df['hour'] >= 22)  # 10 p.m.–6 a.m.

    # Simple circular representation of hour (helps trees & distances)
    radians = 2 * np.pi * df['hour'] / 24
    df['hour_sin'] = np.sin(radians)
    df['hour_cos'] = np.cos(radians)

    return df

In [139]:
def create_amount_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    More robust and context-aware amount features.

    Reads ``cc_num``, ``trans_month``, ``merchant``, ``amt`` and
    ``transaction_datetime`` and adds:

    * ``amt_dev_from_mean`` / ``amt_zscore_cust_month`` - deviation and
      z-score of ``amt`` against the card's mean/std for that month.
    * ``amt_median_last5`` / ``amt_std_last5`` - median and std of up to five
      transactions *preceding* the current one on the same card (NaN where
      there is not enough history).
    * ``amt_dev_from_median5`` / ``amt_zscore_last5`` - deviation from that
      rolling median, and the deviation divided by the rolling std.
    * ``amt_dev_from_cust_merchant_mean`` - deviation from the card's mean
      amount at this merchant.
    * ``log_amt``, ``amt_bin``, ``is_small_amt`` - log1p transform, a
      five-level size bucket and a flag for amounts below 10.

    Both z-scores are 0 wherever the std is missing or zero.

    NOTE(review): the monthly and per-merchant means/stds are computed over
    the whole group, so they include transactions that happen after the
    current row. Confirm that this is acceptable for the downstream model.

    Returns
    -------
    pd.DataFrame
        A new frame sorted by ``cc_num`` then ``transaction_datetime``; the
        input frame is not modified.
    """
    # Sort first: sort_values returns a new frame, so all new columns go onto
    # that copy and the caller's frame is left untouched. The rolling
    # features below also need chronological order within each card.
    df = df.sort_values(["cc_num", "transaction_datetime"])

    # 1. Standard per-customer-per-month
    cust_month_mean = df.groupby(["cc_num", "trans_month"])["amt"].transform("mean")
    # A zero std would turn the z-score into +/-inf; NaN is mapped to 0 below.
    cust_month_std = df.groupby(["cc_num", "trans_month"])["amt"].transform("std").replace(0, np.nan)
    df["amt_dev_from_mean"] = df["amt"] - cust_month_mean
    df["amt_zscore_cust_month"] = (df["amt_dev_from_mean"] / cust_month_std).fillna(0)

    # 2. Per-customer rolling stats (recent context, not just month).
    #    shift(1) inside the per-card lambda excludes the current transaction
    #    from its own reference window.
    df["amt_median_last5"] = (
        df.groupby("cc_num")["amt"].transform(lambda x: x.rolling(window=5, min_periods=1).median().shift(1))
    )
    df["amt_dev_from_median5"] = df["amt"] - df["amt_median_last5"]
    df["amt_std_last5"] = (
        df.groupby("cc_num")["amt"].transform(lambda x: x.rolling(window=5, min_periods=2).std().shift(1))
    )
    # Same zero-std guard as the monthly z-score: identical recent amounts
    # give std == 0, and dividing by it would yield +/-inf, which fillna(0)
    # does not catch. The stored amt_std_last5 column keeps the raw value.
    rolling_std_nonzero = df["amt_std_last5"].replace(0, np.nan)
    df["amt_zscore_last5"] = (df["amt_dev_from_median5"] / rolling_std_nonzero).fillna(0)

    # 3. Customer+Merchant mean and deviation (how normal is this at this merchant?)
    cust_merch_mean = df.groupby(["cc_num", "merchant"])["amt"].transform("mean")
    df["amt_dev_from_cust_merchant_mean"] = df["amt"] - cust_merch_mean

    # 4. Log-transform and bins (bins are right-inclusive, so 10 is "very_small")
    df["log_amt"] = np.log1p(df["amt"])
    df["amt_bin"] = pd.cut(df["amt"], bins=[-1, 10, 50, 200, 1000, np.inf], labels=["very_small", "small", "med", "large", "huge"])

    # 5. Micro-payment flag (strictly below 10)
    df["is_small_amt"] = df["amt"] < 10

    return df


In [140]:
def create_distance_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compare each transaction's client-to-merchant distance with what is
    typical for that card.

    Adds ``dist_vs_cust_median`` (distance minus the card's median distance)
    and ``is_farther_than_usual`` (distance above 1.5x that median). The
    columns are written onto ``df`` itself, which is also returned.
    """
    dist_col = "dist_between_client_and_merch"

    # Per-card median, broadcast back to one value per row.
    typical_dist = df.groupby("cc_num")[dist_col].transform("median")

    df["dist_vs_cust_median"] = df[dist_col] - typical_dist
    df["is_farther_than_usual"] = df[dist_col] > 1.5 * typical_dist
    return df

In [141]:
# Run the feature builders in order; each step receives the frame returned by
# the previous one (some steps return a re-sorted copy, so the result must be
# rebound to `df`).
df = (
    df.pipe(create_customer_behavior_features)
    .pipe(create_temporal_features)
    .pipe(create_amount_features)
    .pipe(create_distance_features)
)

In [142]:
# Preview the engineered frame; pandas abbreviates large frames in the
# rendered output, so only the first/last rows and columns are shown.
df

Unnamed: 0,cc_num,merchant,category,amt,first,last,gender,street,city,state,...,amt_median_last5,amt_dev_from_median5,amt_std_last5,amt_zscore_last5,amt_dev_from_cust_merchant_mean,log_amt,amt_bin,is_small_amt,dist_vs_cust_median,is_farther_than_usual
1017,60416207185,"fraud_Jones, Sawayn and Romaguera",misc_net,7.27,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,WY,...,,,,0.000000,-11.236000,2.112635,very_small,True,52.794632,True
2724,60416207185,fraud_Berge LLC,gas_transport,52.94,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,WY,...,7.270,45.670,,0.000000,-5.084167,3.987872,med,False,35.368328,False
2726,60416207185,fraud_Luettgen PLC,gas_transport,82.08,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,WY,...,30.105,51.975,32.293567,1.609454,22.610000,4.419804,med,False,-52.992838,False
2882,60416207185,fraud_Daugherty LLC,kids_pets,34.79,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,WY,...,52.940,-18.150,37.708144,-0.481328,-12.430000,3.577669,small,False,12.289756,False
2907,60416207185,fraud_Beier and Sons,home,27.18,Mary,Diaz,F,9886 Anita Drive,Fort Washakie,WY,...,43.865,-16.685,31.430534,-0.530853,-12.086667,3.338613,small,False,-0.481788,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1849259,4992346398065154184,fraud_Parisian and Sons,gas_transport,66.11,Benjamin,Kim,M,920 Patrick Light,Mc Nabb,IL,...,91.570,-25.460,133.339558,-0.190941,10.874286,4.206333,med,False,-31.977231,False
1849567,4992346398065154184,fraud_Bogisich-Homenick,misc_net,4.58,Benjamin,Kim,M,920 Patrick Light,Mc Nabb,IL,...,66.110,-61.530,46.691364,-1.317803,-8.965000,1.719189,very_small,True,4.619634,False
1850234,4992346398065154184,fraud_Erdman-Kertzmann,gas_transport,95.96,Benjamin,Kim,M,920 Patrick Light,Mc Nabb,IL,...,66.110,29.850,46.019584,0.648637,29.148333,4.574299,med,False,-40.889969,False
1850235,4992346398065154184,fraud_Stracke-Lemke,grocery_pos,149.48,Benjamin,Kim,M,920 Patrick Light,Mc Nabb,IL,...,66.110,83.370,46.002856,1.812279,7.835000,5.013830,med,False,5.032798,False


In [143]:
# Build the destination once so the file written and the path reported
# cannot drift apart.
output_filepath = os.path.join(PROCESSED_DATA_PATH, 'transactions_processed.csv')

# index=False: the row index is not written to the file.
df.to_csv(output_filepath, index=False)
print("Saved to: ", output_filepath)


Saved to:  ../data/processed/transactions_processed.csv


In [144]:
# Share of rows flagged as "frequent customer".
#
# Count by value, not by position: value_counts() is ordered by frequency, so
# value_counts()[0] / [1] only mean False / True while False is the majority,
# and integer keys on a boolean-labelled Series use a deprecated positional
# fallback.
is_frequent = df['is_frequent_customer'].astype(bool)
freq = int(is_frequent.sum())
non_freq = int(len(is_frequent) - freq)

# A percentage needs all rows in the denominator; freq / non_freq is a ratio
# of the two groups and overstates the share. Guard against an empty frame.
# Note: this counts rows (transactions), not distinct cards.
total_rows = freq + non_freq
percentage_frequent_customers = freq / total_rows * 100 if total_rows else 0.0

print(f'{percentage_frequent_customers:.1f}%')

1.4%


  non_freq, freq = df['is_frequent_customer'].value_counts()[0], df['is_frequent_customer'].value_counts()[1]
