# Feature Engineering and Data Preprocessing


In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Read both raw inputs in one pass: the transaction table and the
# IP-range -> country lookup table.
fraud_df, ip_df = map(
    pd.read_csv,
    ("../data/raw/Fraud_Data.csv", "../data/raw/IpAddress_to_Country.csv"),
)


In [3]:
# Integer form of the transaction IP, used as the join key against the
# IP-range table. The width is spelled out as 64 bits on purpose: IPv4 values
# go up to 2**32 - 1, which does not fit in a signed 32-bit integer, and a
# bare `int` resolves to 32 bits on Windows with NumPy < 2.
fraud_df["ip_address_int"] = fraud_df["ip_address"].astype("int64")


In [4]:
# Cast both range bounds to an explicit 64-bit integer. A bare `int` resolves
# to 32 bits on Windows with NumPy < 2, which cannot hold IPv4 values above
# 2**31 - 1.
ip_df["lower_bound_ip_address"] = ip_df["lower_bound_ip_address"].astype("int64")
ip_df["upper_bound_ip_address"] = ip_df["upper_bound_ip_address"].astype("int64")

# pd.merge_asof requires both frames to be sorted by their join key.
ip_df = ip_df.sort_values("lower_bound_ip_address")
fraud_df = fraud_df.sort_values("ip_address_int")


In [5]:
# Range join: for every transaction, pick the IP range with the greatest
# lower bound that is <= the transaction's IP (direction="backward").
merged_df = pd.merge_asof(
    fraud_df,
    ip_df,
    left_on="ip_address_int",
    right_on="lower_bound_ip_address",
    direction="backward"
)

# The asof match only checks the lower bound, so confirm the IP is also within
# the upper bound. Rows with no candidate range have a NaN upper bound, the
# comparison is False for them, and they are dropped together with IPs that
# fall in a gap between ranges.
in_range = merged_df["ip_address_int"] <= merged_df["upper_bound_ip_address"]

# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
merged_df = merged_df[in_range].copy()


In [6]:
# Convert the two timestamp columns to datetime, then derive the
# time-based features from the parsed values.
signup_ts = pd.to_datetime(merged_df["signup_time"])
merged_df["signup_time"] = signup_ts
purchase_ts = pd.to_datetime(merged_df["purchase_time"])
merged_df["purchase_time"] = purchase_ts

merged_df["hour_of_day"] = purchase_ts.dt.hour
merged_df["day_of_week"] = purchase_ts.dt.dayofweek  # Monday=0 ... Sunday=6

# Seconds elapsed between signup and purchase.
elapsed = purchase_ts - signup_ts
merged_df["time_since_signup"] = elapsed.dt.total_seconds()


In [7]:
from sklearn.preprocessing import StandardScaler

# Standardise the continuous features (zero mean, unit variance).
# NOTE(review): the scaler is fitted on every row of merged_df. If a
# train/test split is made downstream, these statistics include the test
# rows -- confirm whether scaling should move into the modelling pipeline.
num_cols = ["purchase_value", "age", "time_since_signup"]
scaler = StandardScaler()
scaler.fit(merged_df[num_cols])
merged_df[num_cols] = scaler.transform(merged_df[num_cols])

# One-hot encode the categorical features; drop_first removes the first level
# of each so the indicator columns are not perfectly collinear.
categorical_cols = ["browser", "source", "sex", "country"]
merged_df = pd.get_dummies(merged_df, columns=categorical_cols, drop_first=True)


In [8]:
# Persist the engineered dataset. The output directory is created first
# because to_csv raises OSError when the parent directory does not exist
# (e.g. on a fresh clone where data/processed/ is not tracked).
processed_path = Path("../data/processed/fraud_processed.csv")
processed_path.parent.mkdir(parents=True, exist_ok=True)
merged_df.to_csv(processed_path, index=False)


## Summary
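- IP geolocation integrated: each transaction's IP address is mapped to a country through a range join, and transactions whose IP matches no range are dropped
- Time-based and behavioral features engineered (`hour_of_day`, `day_of_week`, `time_since_signup`)
- Dataset prepared for modeling: numeric features standardized, categorical features one-hot encoded, and the result saved to `../data/processed/fraud_processed.csv`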
