### Feature engineering

In [1]:
import pandas as pd
import numpy as np

def _ip_to_int(value):
    """Convert one raw IP value to its 32-bit integer form, or None if unusable.

    Accepts dotted-quad strings ("1.2.3.4"), numeric strings ("16777216.0")
    and plain numbers (floats are truncated toward zero). Anything missing,
    unparsable, or outside the IPv4 range 0..2**32-1 yields None.
    """
    if isinstance(value, str):
        text = value.strip()
        parts = text.split('.')
        if len(parts) == 4 and all(p.isascii() and p.isdigit() for p in parts):
            octets = [int(p) for p in parts]
            if max(octets) > 255:
                return None
            a, b, c, d = octets
            # Positional (base-256) value, NOT the digits glued together.
            return (a << 24) | (b << 16) | (c << 8) | d
        try:
            value = float(text)
        except ValueError:
            return None

    if isinstance(value, (float, np.floating)):
        if not np.isfinite(value):  # NaN / inf mark a missing address
            return None
    elif not isinstance(value, (int, np.integer)):
        return None

    number = int(value)
    return number if 0 <= number <= 0xFFFFFFFF else None


def _lookup_countries(ip_ints, ip_country):
    """Map integer IPs to countries with a binary search over the range table.

    Args:
        ip_ints: sequence of ints or None (one entry per transaction).
        ip_country: DataFrame with 'lower_bound_ip_address',
            'upper_bound_ip_address' and 'country' columns. Ranges are
            assumed not to overlap; with overlapping ranges the one with the
            greatest lower bound <= ip is the only candidate checked.

    Returns:
        Object ndarray, same length as ip_ints, holding the country or None.
    """
    result = np.full(len(ip_ints), None, dtype=object)
    known = np.array([ip is not None for ip in ip_ints], dtype=bool)
    if len(ip_country) == 0 or not known.any():
        return result

    lowers = ip_country['lower_bound_ip_address'].astype('int64').to_numpy()
    uppers = ip_country['upper_bound_ip_address'].astype('int64').to_numpy()
    countries = ip_country['country'].to_numpy(dtype=object)

    # searchsorted needs sorted lower bounds; keep the three arrays aligned.
    order = np.argsort(lowers, kind='stable')
    lowers, uppers, countries = lowers[order], uppers[order], countries[order]

    values = np.array([ip for ip in ip_ints if ip is not None], dtype='int64')
    # Index of the last range whose lower bound is <= ip (-1 if none).
    pos = np.searchsorted(lowers, values, side='right') - 1
    inside = pos >= 0
    inside[inside] = values[inside] <= uppers[pos[inside]]

    matched = np.full(values.shape, None, dtype=object)
    matched[inside] = countries[pos[inside]]
    result[known] = matched
    return result


def engineer_fraud_features(df, ip_country, rare_threshold=0.01):
    """
    Build the fraud-detection feature matrix from cleaned transactions.

    Neither input DataFrame is modified; all work happens on a copy.

    Args:
        df: Cleaned e-commerce transaction data (pandas DataFrame). Must
            contain 'user_id', 'device_id', 'signup_time', 'purchase_time',
            'purchase_value', 'ip_address', 'source', 'browser' and 'sex'.
        ip_country: IP range to country mapping (pandas DataFrame) with
            'lower_bound_ip_address', 'upper_bound_ip_address', 'country'.
        rare_threshold: Categories whose relative frequency is below this
            value are merged into 'OTHER' before one-hot encoding.

    Returns:
        DataFrame with engineered features, sorted by user and purchase
        time, with a fresh RangeIndex. Identifier and raw timestamp/IP
        columns are removed.
    """
    df = df.copy()

    # ======================
    # 1. Geolocation Features
    # ======================

    # Country-level features help surface high-risk jurisdictions and
    # cross-border anomalies. IPs are normalised to 32-bit integers so they
    # can be compared against the integer range table.
    ip_ints = [_ip_to_int(raw) for raw in df['ip_address']]
    df['country'] = _lookup_countries(ip_ints, ip_country)

    # ======================
    # 2. Temporal Features
    # ======================

    # No-op when the columns are already datetimes; tolerates string input.
    df['purchase_time'] = pd.to_datetime(df['purchase_time'])
    df['signup_time'] = pd.to_datetime(df['signup_time'])

    # Fraud often shows distinct time-of-day / day-of-week distributions.
    df['hour_of_day'] = df['purchase_time'].dt.hour
    df['day_of_week'] = df['purchase_time'].dt.dayofweek

    # Account age at transaction time, in hours (new accounts are riskier).
    account_age = df['purchase_time'] - df['signup_time']
    df['time_since_signup'] = account_age.dt.total_seconds() / 3600

    # ======================
    # 3. Behavioral Features
    # ======================

    # Transaction count per user (computed over the whole input frame).
    df['user_transaction_count'] = df['user_id'].map(df['user_id'].value_counts())

    # Seconds since the same user's previous transaction (NaN for the first).
    df = df.sort_values(['user_id', 'purchase_time'])
    df['time_since_last_txn'] = (
        df.groupby('user_id')['purchase_time'].diff().dt.total_seconds()
    )

    # ======================
    # 4. Device & Session Features
    # ======================

    # Many accounts sharing one device is a classic fraud signal.
    device_stats = df.groupby('device_id').agg({
        'user_id': 'nunique',
        'purchase_value': 'mean'
    }).rename(columns={
        'user_id': 'users_per_device',
        'purchase_value': 'avg_device_spend'
    })
    # Left join: rows with a missing device_id keep NaN stats instead of
    # being silently dropped from the dataset.
    df = df.merge(device_stats, on='device_id', how='left')

    # ======================
    # 5. Feature Encoding
    # ======================

    categorical_cols = ['source', 'browser', 'sex', 'country']
    for col in categorical_cols:
        # Collapse rare categories so they do not each spawn a sparse dummy.
        freq = df[col].value_counts(normalize=True)
        rare = freq[freq < rare_threshold].index
        df[col] = df[col].mask(df[col].isin(rare), 'OTHER')

    df = pd.get_dummies(df, columns=categorical_cols,
                        drop_first=True, prefix_sep=':')

    # ======================
    # 6. Feature Selection
    # ======================

    # Drop identifiers and raw timestamp/IP columns: they are not model
    # features themselves and have already been turned into derived ones.
    cols_to_drop = ['user_id', 'device_id', 'signup_time', 'purchase_time', 'ip_address']
    df = df.drop(columns=cols_to_drop)

    return df

if __name__ == "__main__":
    # Script entry point: read the cleaned transactions and the IP-range
    # table, build the feature matrix, and write it out gzip-compressed.
    clean_csv = '../data/processed/fraud_data_clean.csv'
    ip_map_csv = '../data/raw/IpAddress_to_Country.csv'
    features_csv = '../data/processed/fraud_data_fe.csv.gz'

    # Timestamps are parsed on load because the feature code uses the
    # .dt accessor on both columns.
    fraud_data_clean = pd.read_csv(
        clean_csv,
        parse_dates=['signup_time', 'purchase_time'],
    )
    ip_country = pd.read_csv(ip_map_csv)

    fraud_data_fe = engineer_fraud_features(fraud_data_clean, ip_country)

    # index=False: the row index carries no information worth persisting.
    fraud_data_fe.to_csv(features_csv, index=False, compression='gzip')

    print("Feature engineering complete. Processed data saved to ../data/processed/fraud_data_fe.csv.gz")


Feature engineering complete. Processed data saved to ../data/processed/fraud_data_fe.csv.gz
