Feature Engineering

In [2]:
import pandas as pd
import sys
import os

# Resolve the parent of the current working directory; it is added to the
# import path so that the project's `src` package can be imported below.
project_root = os.path.abspath("..")

# Register the directory only once: re-running this cell must not keep
# appending duplicate entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [3]:
from src.feat_eng import FeatureEngineer

# Load Fraud Data
# The CSV location is derived from `project_root` (defined in the setup cell)
# rather than a machine-specific absolute path, so the notebook runs on any
# checkout of the repository.
# NOTE(review): assumes the data lives under <project_root>/data/raw — confirm.
fraud_csv_path = os.path.join(project_root, "data", "raw", "Fraud_data_cleaned.csv")
df = pd.read_csv(fraud_csv_path)

# Create engineer object
fe = FeatureEngineer(df)

# Generate all features
# NOTE(review): presumably adds the time- and transaction-count-based columns
# shown in the preview below — verify against src/feat_eng.py.
df_features = fe.run_all()

df_features.head()


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,hour_of_day,day_of_week,time_since_signup,txn_count_1h,txn_count_24h,txn_velocity_24h
116708,2,2015-01-11 03:47:13,2015-02-21 10:03:37,54,FGBQNDNBETFJJ,SEO,Chrome,F,25,880217500.0,0,10,5,990.273333,1.0,1.0,0.041667
15108,4,2015-06-02 16:40:57,2015-09-26 21:32:16,41,MKFUIVOHLJBYN,Direct,Safari,F,38,2785906000.0,0,21,5,2788.855278,1.0,1.0,0.041667
46047,8,2015-05-28 07:53:06,2015-08-13 11:53:07,47,SCQGQALXBUQZJ,SEO,Chrome,M,25,356056700.0,0,11,3,1852.000278,1.0,1.0,0.041667
67650,9,2015-05-16 15:58:32,2015-05-20 23:06:42,62,IEZOHXPZBIRTE,SEO,FireFox,M,21,759104700.0,0,23,2,103.136111,1.0,1.0,0.041667
109067,12,2015-01-10 06:25:12,2015-03-04 20:56:37,35,MSNWCFEHKTIOY,Ads,Safari,M,19,2985180000.0,0,20,2,1286.523611,1.0,1.0,0.041667


Data Transformation

In [9]:
import pandas as pd
from src.transformer import DataTransformer

# Continue from the engineered frame produced by the feature-engineering cell.
df = df_features

# Numeric columns handed to the transformer for scaling.
numerical_cols = [
    "purchase_value",
    "time_since_signup",
    "txn_count_1h",
    "txn_count_24h",
    "txn_velocity_24h",
]

# Columns handed to the transformer as categoricals.
# NOTE(review): "class" is the target label; it is listed here, so it comes out
# as `class_0` / `class_1` indicator columns and has to be rebuilt downstream.
categorical_cols = ["browser", "source", "class"]

# NOTE(review): the transformer is fitted on the full frame before any
# train/test split — check whether this leaks test-set statistics.
transformer = DataTransformer(
    df,
    numerical_cols=numerical_cols,
    categorical_cols=categorical_cols,
    scaling="standard",
)
df_transformed = transformer.fit_transform()

df_transformed.head()


Unnamed: 0,purchase_value,time_since_signup,txn_count_1h,txn_count_24h,txn_velocity_24h,browser_Chrome,browser_FireFox,browser_IE,browser_Opera,browser_Safari,source_Ads,source_Direct,source_SEO,class_0,class_1
0,0.931338,-0.437279,0.0,0.0,-6.938894e-18,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,0.221836,1.633857,0.0,0.0,-6.938894e-18,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,0.549298,0.555032,0.0,0.0,-6.938894e-18,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,1.367955,-1.458852,0.0,0.0,-6.938894e-18,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,-0.105627,-0.096136,0.0,0.0,-6.938894e-18,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0


Handle Class Imbalance

In [11]:
from sklearn.model_selection import train_test_split
from src.imbalance import ImbalanceHandler

# -----------------------------
# 1. Reconstruct target column
# -----------------------------
# The label arrives as two indicator columns, `class_0` and `class_1`.
target_cols = ["class_0", "class_1"]

# Fail loudly if the indicators are not strictly one-hot (exactly one of the
# two set per row); otherwise the rebuilt labels would be silently wrong.
assert (df_transformed[target_cols].sum(axis=1) == 1).all(), (
    "class_0/class_1 are not a valid one-hot encoding of the target"
)

# A row is labelled 1 exactly when `class_1` is the larger indicator. This
# vectorised comparison yields the same 0/1 labels as taking `idxmax(axis=1)`
# and parsing the column-name suffix (ties resolve to class_0 -> 0 in both),
# without a Python-level function call per row.
y = (df_transformed["class_1"] > df_transformed["class_0"]).astype("int64")

# Define features: everything except the target indicator columns
X = df_transformed.drop(columns=target_cols)

# Check shapes and distribution
print("X shape:", X.shape)
print("y shape:", y.shape)
print(y.value_counts())

# -----------------------------
# 2. Split dataset
# -----------------------------
# Stratify on y so both splits keep the original class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -----------------------------
# 3. Handle class imbalance
# -----------------------------
# Only the training split is resampled; the test split is left untouched.
# NOTE(review): no seed is passed to the handler here — confirm whether
# ImbalanceHandler fixes one internally so the resampling is reproducible.
imb = ImbalanceHandler(method="smote")
X_train_bal, y_train_bal = imb.fit_resample(X_train, y_train)


X shape: (151112, 13)
y shape: (151112,)
0    136961
1     14151
Name: count, dtype: int64
📊 BEFORE RESAMPLING:
0    109568
1     11321
dtype: int64

📊 AFTER RESAMPLING:
0    109568
1    109568
dtype: int64
