# üìì 04_feature_engineering.ipynb (Feature Creation)

In [1]:
# # Feature Engineering
# Create new features from raw data


# ## 1. Import Libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

print("üîß Starting feature engineering...")

üîß Starting feature engineering...


In [2]:
# Load geolocation-enhanced data
df = pd.read_csv('../data/processed/fraud_data_with_country.csv')

print(f"üìä Dataset shape: {df.shape}")
df.head()


üìä Dataset shape: (129146, 18)


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,purchase_hour,purchase_day,ip_address_int,lower_bound,upper_bound,country,is_high_risk_country
0,247547,2015-06-28 03:00:34,2015-08-09 03:57:29,47,KIXYSVCHIPQBR,SEO,Safari,F,30,16778860.0,0,3,Sunday,16778864,16778240.0,16779263.0,Australia,0
1,220737,2015-01-28 14:21:11,2015-02-11 20:28:28,15,PKYOWQKWGJNJI,SEO,Chrome,F,34,16842050.0,0,20,Wednesday,16842045,16809984.0,16842751.0,Thailand,0
2,390400,2015-03-19 20:49:09,2015-04-11 23:41:23,44,LVCSXLISZHVUO,Ads,IE,M,29,16843660.0,0,23,Saturday,16843656,16843264.0,16843775.0,China,0
3,69592,2015-02-24 06:11:57,2015-05-23 16:40:14,55,UHAUHNXXUADJE,Direct,Chrome,F,30,16938730.0,0,16,Saturday,16938732,16924672.0,16941055.0,China,0
4,174987,2015-07-07 12:58:11,2015-11-03 04:04:30,51,XPGPMOHIDRMGE,SEO,Chrome,F,37,16971980.0,0,4,Tuesday,16971984,16941056.0,16973823.0,Thailand,0


In [3]:
# Ensure Correct Data Types
# Convert datetime columns
df['signup_time'] = pd.to_datetime(df['signup_time'])
df['purchase_time'] = pd.to_datetime(df['purchase_time'])

# Confirm target variable
assert 'class' in df.columns, "Target column 'class' missing"

print("‚úÖ Data types corrected")


‚úÖ Data types corrected


In [4]:

# Core Time-Based Features (SAFE & IMPORTANT)
print("‚è∞ Creating core time-based features...")

# Time since signup (hours)
df['time_since_signup_hours'] = (
    df['purchase_time'] - df['signup_time']
).dt.total_seconds() / 3600

# Purchase hour & day
df['purchase_hour'] = df['purchase_time'].dt.hour
df['purchase_day_of_week'] = df['purchase_time'].dt.dayofweek  # 0=Mon

print("‚úÖ Time-based features created")


‚è∞ Creating core time-based features...
‚úÖ Time-based features created


In [5]:
# New Customer Flags
print("üÜï Creating new customer risk flags...")

df['new_customer_1hr'] = (df['time_since_signup_hours'] < 1).astype(int)
df['new_customer_24hr'] = (df['time_since_signup_hours'] < 24).astype(int)
df['new_customer_7days'] = (df['time_since_signup_hours'] < 168).astype(int)

print("‚úÖ New customer flags created")


üÜï Creating new customer risk flags...
‚úÖ New customer flags created


In [6]:
# Purchase Value Behavior (SAFE)
print("üí∞ Creating purchase value features...")

# High-value thresholds (global)
df['high_value_purchase'] = (
    df['purchase_value'] > df['purchase_value'].quantile(0.75)
).astype(int)

df['very_high_value_purchase'] = (
    df['purchase_value'] > df['purchase_value'].quantile(0.90)
).astype(int)

print("‚úÖ Purchase value features created")


üí∞ Creating purchase value features...
‚úÖ Purchase value features created


In [7]:
# Device Risk Features (SAFE & STRONG)
print("üíª Creating device risk features...")

# Frequency of device usage
device_counts = df['device_id'].value_counts()
df['device_frequency'] = df['device_id'].map(device_counts)

# Number of unique users per device
device_user_counts = (
    df.groupby('device_id')['user_id']
    .nunique()
    .reset_index(name='unique_users_per_device')
)

df = df.merge(device_user_counts, on='device_id', how='left')

# Suspicious device flag
df['suspicious_device'] = (df['unique_users_per_device'] > 3).astype(int)

print("‚úÖ Device features created")


üíª Creating device risk features...
‚úÖ Device features created


In [8]:
# Location Risk Feature
print("üìç Confirming location risk features...")

# is_high_risk_country already created in geolocation step
assert 'is_high_risk_country' in df.columns, "Missing is_high_risk_country"

print("‚úÖ Location risk feature ready")


üìç Confirming location risk features...
‚úÖ Location risk feature ready


In [9]:
# Age Feature
print("üë§ Processing age feature...")

# Keep age as numeric (do NOT bin)
df['age'] = df['age'].clip(lower=18, upper=90)

print("‚úÖ Age feature cleaned")


üë§ Processing age feature...
‚úÖ Age feature cleaned


In [10]:
print("üßπ Dropping problematic / unsafe features...")

features_to_drop = [
    # Broken velocity features
    'time_between_purchases_hours',
    'total_purchases_per_user',
    'multiple_purchases_flag',
    'high_frequency_flag',
    'very_high_frequency_flag',
    
    # Weak or noisy
    'high_risk_browser',
    'rare_country',
    'suspicious_age',
    
    # Raw identifiers (not model features)
    'device_id',
    'ip_address',
    'ip_address_int',
    'lower_bound',
    'upper_bound',
    'country'
]

existing_drops = [f for f in features_to_drop if f in df.columns]
df.drop(columns=existing_drops, inplace=True)

print(f"‚úÖ Dropped {len(existing_drops)} columns")


üßπ Dropping problematic / unsafe features...
‚úÖ Dropped 6 columns


In [11]:
# Final Feature Review
print("üìä Final feature set:")

feature_cols = [c for c in df.columns if c not in ['class', 'signup_time', 'purchase_time']]
print(f"üéØ Total model features: {len(feature_cols)}")

feature_cols


üìä Final feature set:
üéØ Total model features: 19


['user_id',
 'purchase_value',
 'source',
 'browser',
 'sex',
 'age',
 'purchase_hour',
 'purchase_day',
 'is_high_risk_country',
 'time_since_signup_hours',
 'purchase_day_of_week',
 'new_customer_1hr',
 'new_customer_24hr',
 'new_customer_7days',
 'high_value_purchase',
 'very_high_value_purchase',
 'device_frequency',
 'unique_users_per_device',
 'suspicious_device']

In [13]:
# Sanity Check: Fraud Rates (NO LEAKAGE CHECK)
print("üîç Sanity check: fraud rates")

check_features = [
    'new_customer_1hr',
    'new_customer_24hr',
    'high_value_purchase',
    'is_high_risk_country',
    'suspicious_device'
]

for f in check_features:
    if f in df.columns:
        rates = df.groupby(f)['class'].mean()
        print(f"\n{f}:")
        print(rates)


üîç Sanity check: fraud rates

new_customer_1hr:
new_customer_1hr
0    0.045868
1    0.994914
Name: class, dtype: float64

new_customer_24hr:
new_customer_24hr
0    0.045930
1    0.873449
Name: class, dtype: float64

high_value_purchase:
high_value_purchase
0    0.094124
1    0.097763
Name: class, dtype: float64

is_high_risk_country:
is_high_risk_country
0    0.089531
1    0.129267
Name: class, dtype: float64

suspicious_device:
suspicious_device
0    0.045902
1    0.909301
Name: class, dtype: float64


In [14]:
# Save Final Feature Dataset
print("üíæ Saving final feature-engineered dataset...")

df.to_csv('../data/processed/fraud_data_final_features.csv', index=False)

print("‚úÖ Saved to ../data/processed/fraud_data_final_features.csv")
print(f"üìä Final shape: {df.shape}")


üíæ Saving final feature-engineered dataset...
‚úÖ Saved to ../data/processed/fraud_data_final_features.csv
üìä Final shape: (129146, 22)
