# Task 3 - Feature Engineering

**Objective**: Build a robust, automated, and reproducible data processing script that transforms raw data into a model-ready format.

**Steps**:
1. Create Aggregate Features
2. Extract Features
3. Encode Categorical Variables
4. Handle Missing Values
5. Normalize/Standardize Numerical Features
6. Feature Engineering with WoE and IV

In [None]:
# Install the third-party packages used by this notebook.
# `%pip` is an IPython magic, so this line only runs inside a notebook/IPython kernel.
%pip install xverse pandas numpy scikit-learn seaborn matplotlib

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
from pathlib import Path
import importlib

# Resolve the project root as the parent of the current working directory.
# NOTE(review): this assumes the notebook is launched from a direct
# sub-directory of the project (e.g. notebooks/) - confirm.
project_root = Path.cwd().parent

# Make the `src` package importable. The membership check keeps sys.path
# free of duplicate entries when this cell is re-run.
project_root_str = str(project_root)
if project_root_str not in sys.path:
    sys.path.append(project_root_str)

# Import the processing module and reload it so that edits made to
# src.data_processing are picked up without restarting the kernel.
import src.data_processing
importlib.reload(src.data_processing)

from src.data_processing import (
    extract_temporal_features,
    aggregate_features,
    handle_missing_values,
    encode_categorical_features,
    normalize_features,
    woe_transformation
)

# Load the raw transaction data and show a quick preview
data_path = project_root / 'data' / 'raw' / 'data.csv'
df = pd.read_csv(data_path)
print(f"Data Shape: {df.shape}")
df.head()

Data Shape: (95662, 16)


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


## Step 1 & 2: Extract and Aggregate Features

We will extract temporal features from `TransactionStartTime` and then aggregate the data to the customer level (RFM-style summaries such as transaction count and total/average amount per customer).

In [7]:
# Add time-derived columns based on `TransactionStartTime`; `df` is rebound
# to the enriched transaction-level frame.
df = df.pipe(extract_temporal_features)

# Roll the transactions up to the customer level
customer_df = df.pipe(aggregate_features)

print(f"Customer Level Data Shape: {customer_df.shape}")
customer_df.head()

Customer Level Data Shape: (3742, 16)


Unnamed: 0,CustomerId,TransactionCount,TotalAmount,AvgAmount,StdAmount,Amount_min,Amount_max,Value_sum,Value_mean,Value_std,TransactionHour_mean,HasFraud,PrimaryChannel,PrimaryCategory,PrimaryPricing,PrimaryCurrency
0,CustomerId_1,1,-10000.0,-10000.0,,-10000.0,-10000.0,10000,10000.0,,16.0,0,ChannelId_2,airtime,4,UGX
1,CustomerId_10,1,-10000.0,-10000.0,,-10000.0,-10000.0,10000,10000.0,,16.0,0,ChannelId_2,airtime,4,UGX
2,CustomerId_1001,5,20000.0,4000.0,6558.963333,-5000.0,10000.0,30400,6080.0,4100.243895,7.8,0,ChannelId_3,financial_services,2,UGX
3,CustomerId_1002,11,4225.0,384.090909,560.498966,-75.0,1500.0,4775,434.090909,518.805446,13.454545,0,ChannelId_2,financial_services,2,UGX
4,CustomerId_1003,6,20000.0,3333.333333,6030.478146,-5000.0,10000.0,32000,5333.333333,3945.461528,14.333333,0,ChannelId_3,airtime,2,UGX


## Step 3 & 4: Handle Missing Values and Encode Categorical Variables

We will handle missing values (e.g., `StdAmount` might be NaN for single transactions) and encode categorical variables.

In [8]:
# Clean up missing values first, then add encoded versions of the
# categorical columns; both steps are chained on the customer-level frame.
customer_df = (
    customer_df
    .pipe(handle_missing_values)
    .pipe(encode_categorical_features)
)

customer_df.head()

Unnamed: 0,CustomerId,TransactionCount,TotalAmount,AvgAmount,StdAmount,Amount_min,Amount_max,Value_sum,Value_mean,Value_std,TransactionHour_mean,HasFraud,PrimaryChannel,PrimaryCategory,PrimaryPricing,PrimaryCurrency,PrimaryChannel_Encoded,PrimaryCategory_Encoded,PrimaryPricing_Encoded,PrimaryCurrency_Encoded
0,CustomerId_1,1.0,-10000.0,-10000.0,0.0,-10000.0,-10000.0,10000,10000.0,0.0,16.0,0,ChannelId_2,airtime,4,UGX,1,0,3,0
1,CustomerId_10,1.0,-10000.0,-10000.0,0.0,-10000.0,-10000.0,10000,10000.0,0.0,16.0,0,ChannelId_2,airtime,4,UGX,1,0,3,0
2,CustomerId_1001,5.0,20000.0,4000.0,6558.963333,-5000.0,10000.0,30400,6080.0,4100.243895,7.8,0,ChannelId_3,financial_services,2,UGX,2,2,2,0
3,CustomerId_1002,11.0,4225.0,384.090909,560.498966,-75.0,1500.0,4775,434.090909,518.805446,13.454545,0,ChannelId_2,financial_services,2,UGX,1,2,2,0
4,CustomerId_1003,6.0,20000.0,3333.333333,6030.478146,-5000.0,10000.0,32000,5333.333333,3945.461528,14.333333,0,ChannelId_3,airtime,2,UGX,2,0,2,0


## Step 5: Normalize/Standardize Numerical Features

We will scale the numerical features to have mean 0 and std 1.

In [9]:
# Standardize the numerical features of the customer-level frame.
# NOTE(review): in the saved output only TransactionCount, TotalAmount,
# AvgAmount and TransactionHour_mean look scaled, while StdAmount,
# Amount_min/Amount_max and the Value_* columns keep their raw magnitudes -
# confirm that this is the intended set of columns.
customer_df = customer_df.pipe(normalize_features)
customer_df.head()

Unnamed: 0,CustomerId,TransactionCount,TotalAmount,AvgAmount,StdAmount,Amount_min,Amount_max,Value_sum,Value_mean,Value_std,TransactionHour_mean,HasFraud,PrimaryChannel,PrimaryCategory,PrimaryPricing,PrimaryCurrency,PrimaryChannel_Encoded,PrimaryCategory_Encoded,PrimaryPricing_Encoded,PrimaryCurrency_Encoded
0,CustomerId_1,-0.253459,-0.066891,-0.153364,0.0,-10000.0,-10000.0,10000,10000.0,0.0,0.883284,0,ChannelId_2,airtime,4,UGX,1,0,3,0
1,CustomerId_10,-0.253459,-0.066891,-0.153364,0.0,-10000.0,-10000.0,10000,10000.0,0.0,0.883284,0,ChannelId_2,airtime,4,UGX,1,0,3,0
2,CustomerId_1001,-0.212186,-0.055849,-0.06987,6558.963333,-5000.0,10000.0,30400,6080.0,4100.243895,-1.222654,0,ChannelId_3,financial_services,2,UGX,2,2,2,0
3,CustomerId_1002,-0.150278,-0.061655,-0.091435,560.498966,-75.0,1500.0,4775,434.090909,518.805446,0.229556,0,ChannelId_2,financial_services,2,UGX,1,2,2,0
4,CustomerId_1003,-0.201868,-0.055849,-0.073846,6030.478146,-5000.0,10000.0,32000,5333.333333,3945.461528,0.455248,0,ChannelId_3,airtime,2,UGX,2,0,2,0


## Step 6: Feature Engineering with WoE and IV

We will use Weight of Evidence (WoE) to transform categorical variables and calculate Information Value (IV) to assess how predictive each feature is.
We use `HasFraud` as the target variable for this demonstration.

In [14]:
# Categorical columns to transform and the target column used for WoE/IV
features_for_woe = ['PrimaryChannel', 'PrimaryCategory', 'PrimaryPricing']
target = 'HasFraud'

# Calculate WoE and IV using the project's custom function
woe_df, iv_dict = woe_transformation(customer_df, target, features_for_woe)

# Show the Information Value of each feature as a small table
print("Information Value (IV):")
iv_df = pd.DataFrame(list(iv_dict.items()), columns=['Feature', 'IV'])
print(iv_df)

# Join WoE features back. Columns with the same names are dropped first so
# that re-running this cell replaces the previous WoE columns instead of
# appending duplicates.
customer_df = pd.concat(
    [customer_df.drop(columns=woe_df.columns, errors='ignore'), woe_df],
    axis=1,
)
customer_df.head()

Information Value (IV):
           Feature        IV
0   PrimaryChannel  0.183556
1  PrimaryCategory  0.180466
2   PrimaryPricing  0.325191


Unnamed: 0,CustomerId,TransactionCount,TotalAmount,AvgAmount,StdAmount,Amount_min,Amount_max,Value_sum,Value_mean,Value_std,...,PrimaryCategory,PrimaryPricing,PrimaryCurrency,PrimaryChannel_Encoded,PrimaryCategory_Encoded,PrimaryPricing_Encoded,PrimaryCurrency_Encoded,PrimaryChannel_WoE,PrimaryCategory_WoE,PrimaryPricing_WoE
0,CustomerId_1,-0.253459,-0.066891,-0.153364,0.0,-10000.0,-10000.0,10000,10000.0,0.0,...,airtime,4,UGX,1,0,3,0,0.929436,0.429448,0.908407
1,CustomerId_10,-0.253459,-0.066891,-0.153364,0.0,-10000.0,-10000.0,10000,10000.0,0.0,...,airtime,4,UGX,1,0,3,0,0.929436,0.429448,0.908407
2,CustomerId_1001,-0.212186,-0.055849,-0.06987,6558.963333,-5000.0,10000.0,30400,6080.0,4100.243895,...,financial_services,2,UGX,2,2,2,0,-0.094894,-0.293483,-0.001663
3,CustomerId_1002,-0.150278,-0.061655,-0.091435,560.498966,-75.0,1500.0,4775,434.090909,518.805446,...,financial_services,2,UGX,1,2,2,0,0.929436,-0.293483,-0.001663
4,CustomerId_1003,-0.201868,-0.055849,-0.073846,6030.478146,-5000.0,10000.0,32000,5333.333333,3945.461528,...,airtime,2,UGX,2,0,2,0,-0.094894,0.429448,-0.001663
