### Data processing

In [4]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Read the three raw CSV inputs in one pass, in this order:
# fraud transactions, IP-range-to-country table, credit-card transactions.
fraud_data, ip_country, credit_data = (
    pd.read_csv(raw_path)
    for raw_path in (
        '../data/raw/Fraud_Data.csv',
        '../data/raw/IpAddress_to_Country.csv',
        '../data/raw/creditcard.csv',
    )
)

# Data cleaning functions
def clean_fraud_data(df):
    """Return a cleaned copy of the fraud transactions table.

    Steps, in order: drop rows containing any missing value, parse
    ``signup_time`` and ``purchase_time`` into datetimes, then drop fully
    duplicated rows. The input frame is left untouched and the surviving
    rows keep their original index labels.

    Args:
        df: DataFrame with at least ``signup_time`` and ``purchase_time``
            columns holding values ``pd.to_datetime`` can parse.

    Returns:
        A new DataFrame with the cleaning steps applied.
    """
    # ``assign`` works on its own copy, so writing the parsed columns never
    # targets the intermediate ``dropna`` result (which is what triggers
    # pandas' SettingWithCopyWarning when assigned to directly).
    return (
        df.dropna()
        .assign(
            signup_time=lambda frame: pd.to_datetime(frame['signup_time']),
            purchase_time=lambda frame: pd.to_datetime(frame['purchase_time']),
        )
        # Deduplicate after parsing so rows are compared on timestamp values.
        .drop_duplicates()
    )

def clean_credit_data(df):
    """Return a copy of the credit-card table with ``Amount`` standardized.

    ``Amount`` is replaced by the output of scikit-learn's
    ``StandardScaler`` fitted on that same column. No missing-value
    handling is performed (the original author notes this dataset has
    none). The caller's DataFrame is not modified.

    Args:
        df: DataFrame containing a numeric ``Amount`` column.

    Returns:
        A new DataFrame identical to ``df`` except for the scaled ``Amount``.
    """
    # Imported here so the dependency is only needed when this step runs.
    from sklearn.preprocessing import StandardScaler

    # Work on a copy: assigning into ``df`` directly would silently rescale
    # the caller's raw frame as a side effect.
    scaled = df.copy()

    # StandardScaler expects a 2-D (n_samples, n_features) array.
    amount_2d = scaled['Amount'].to_numpy().reshape(-1, 1)
    scaled['Amount'] = StandardScaler().fit_transform(amount_2d).ravel()

    return scaled

# Run each raw table through its dedicated cleaning routine
fraud_data_clean, credit_data_clean = (
    clean_fraud_data(fraud_data),
    clean_credit_data(credit_data),
)

# Persist both cleaned tables; the row index is not written to disk
fraud_data_clean.to_csv(
    '../data/processed/fraud_data_clean.csv',
    index=False,
)
credit_data_clean.to_csv(
    '../data/processed/credit_data_clean.csv',
    index=False,
)

### Feature engineering

In [None]:
def _parse_ip_column(ip_series):
    """Convert an IP column to whole numbers; unparseable entries become NaN.

    Numeric values (and numeric strings) are floored to a whole number.
    Dotted-quad strings such as ``'192.168.1.1'`` are converted to their
    integer value with the standard ``ipaddress`` module.

    Returns:
        A float64 ndarray positionally aligned with ``ip_series``.
    """
    import ipaddress

    def dotted_quad_to_float(value):
        try:
            return float(int(ipaddress.IPv4Address(str(value).strip())))
        except ValueError:  # AddressValueError is a ValueError subclass
            return np.nan

    numeric = (
        pd.to_numeric(ip_series, errors='coerce')
        .astype('float64')
        .to_numpy(copy=True)
    )

    # Only non-null entries that failed numeric parsing get a second chance.
    unparsed = np.isnan(numeric) & ip_series.notna().to_numpy()
    if unparsed.any():
        raw_values = ip_series.to_numpy(dtype=object)
        numeric[unparsed] = [dotted_quad_to_float(v) for v in raw_values[unparsed]]

    # Infinite values cannot be cast to an integer; treat them as missing.
    numeric[~np.isfinite(numeric)] = np.nan
    return np.floor(numeric)


def _lookup_country(ip_values, ip_country):
    """Map integer IPs to countries via inclusive [lower, upper] ranges.

    Returns:
        An object ndarray, one entry per IP, holding the matching country
        or ``None`` when no range contains the IP.
    """
    lower = ip_country['lower_bound_ip_address'].astype('int64').to_numpy()
    upper = ip_country['upper_bound_ip_address'].astype('int64').to_numpy()
    names = ip_country['country'].to_numpy(dtype=object)

    matched = np.full(len(ip_values), None, dtype=object)
    if len(ip_values) == 0 or len(lower) == 0:
        return matched

    order = np.argsort(lower, kind='stable')
    lower_sorted = lower[order]
    upper_sorted = upper[order]

    # If any range overlaps its predecessor, a binary search could disagree
    # with "first matching row wins", so keep the exhaustive scan there.
    if (lower_sorted[1:] <= upper_sorted[:-1]).any():
        ranges = list(zip(lower.tolist(), upper.tolist(), names.tolist()))
        for pos, ip in enumerate(ip_values.tolist()):
            matched[pos] = next(
                (name for lo, hi, name in ranges if lo <= ip <= hi), None
            )
        return matched

    # Disjoint ranges: the only candidate is the last range whose lower
    # bound is <= ip; it matches when ip is also <= that range's upper bound.
    slot = np.searchsorted(lower_sorted, ip_values, side='right') - 1
    safe_slot = np.clip(slot, 0, None)
    hit = (slot >= 0) & (ip_values <= upper_sorted[safe_slot])
    matched[hit] = names[order][safe_slot[hit]]
    return matched


def engineer_fraud_features(df, ip_country):
    """Build model features for the fraud transactions table.

    Neither argument is modified; all work happens on copies.

    Steps:
      1. Convert ``ip_address`` to an integer (numeric values are floored,
         dotted-quad strings are decoded); rows whose IP is missing or
         cannot be parsed are dropped.
      2. Attach ``country`` by locating each IP inside the inclusive
         ranges of ``ip_country``; rows without a match are dropped.
      3. Add ``hour_of_day``, ``day_of_week`` and ``time_since_signup``
         (hours).
      4. Add ``user_transaction_count`` and ``time_since_last_transaction``
         (hours; NaN for a user's first purchase).
      5. One-hot encode ``source``, ``browser``, ``sex`` and ``country``
         with ``drop_first=True``.

    Args:
        df: Fraud transactions with columns ``ip_address``, ``user_id``,
            ``signup_time``, ``purchase_time``, ``source``, ``browser``
            and ``sex``.
        ip_country: Range table with columns ``lower_bound_ip_address``,
            ``upper_bound_ip_address`` and ``country``.

    Returns:
        A new DataFrame sorted by ``user_id`` then ``purchase_time``,
        keeping the original index labels of the surviving rows.
    """
    # Stripping the '.' from the text form (the previous approach) turns
    # 732758368.79 into 73275836879, which matches no range; parse instead.
    ip_numeric = _parse_ip_column(df['ip_address'])
    has_ip = ~np.isnan(ip_numeric)
    df = df.loc[has_ip].copy()
    df['ip_address'] = ip_numeric[has_ip].astype('int64')

    # Country lookup; rows whose IP falls in no known range are discarded.
    df['country'] = _lookup_country(df['ip_address'].to_numpy(), ip_country)
    df = df.loc[df['country'].notna()].copy()

    # Time-based features. Parsing is a no-op for datetime columns and
    # lets the function accept frames reloaded from CSV (string timestamps).
    df['signup_time'] = pd.to_datetime(df['signup_time'])
    df['purchase_time'] = pd.to_datetime(df['purchase_time'])
    df['hour_of_day'] = df['purchase_time'].dt.hour
    df['day_of_week'] = df['purchase_time'].dt.dayofweek
    df['time_since_signup'] = (
        (df['purchase_time'] - df['signup_time']).dt.total_seconds() / 3600
    )

    # Transaction velocity: number of (remaining) rows per user.
    df['user_transaction_count'] = df['user_id'].map(df['user_id'].value_counts())

    # Gap to the same user's previous purchase (requires sorting first).
    df = df.sort_values(['user_id', 'purchase_time'])
    df['time_since_last_transaction'] = (
        df.groupby('user_id')['purchase_time'].diff().dt.total_seconds() / 3600
    )

    # Encode categorical features
    df = pd.get_dummies(
        df,
        columns=['source', 'browser', 'sex', 'country'],
        drop_first=True,
    )

    return df

# Build the engineered fraud table from the cleaned transactions
fraud_data_fe = engineer_fraud_features(
    fraud_data_clean,
    ip_country,
)

# Credit data needs no extra engineering (per the original author's note,
# V1-V28 already come from PCA), so only split predictors from the target.
y_credit = credit_data_clean['Class']
X_credit = credit_data_clean.drop('Class', axis=1)

# Write the engineered fraud table to disk without the row index
fraud_data_fe.to_csv(
    '../data/processed/fraud_data_fe.csv',
    index=False,
)