## 1. Import Libraries, Load Raw Data, and Clean It

In [4]:
# ======================================
# 1. Setup & Imports
# ======================================
import pandas as pd
import numpy as np
import os
import sys

# Extend Python path to use modules from src/.
# Guarded so that re-running this cell does not append duplicate entries.
if "../src" not in sys.path:
    sys.path.append("../src")

In [5]:
# Custom modules
from data_loader import load_all_data
from preprocessing import (
    clean_fraud_data,
    clean_creditcard_data,
    clean_ip_country_data
)
from feature_engineering import (
    extract_time_features,
    map_ip_to_country,
    transaction_frequency_features,
    encode_categorical_features,
    scale_numeric_features
)
from sklearn.preprocessing import StandardScaler

In [None]:
# ======================================
# Load, clean, and persist the base datasets
# ======================================
raw_dir = "../data/raw"
processed_dir = "../data/processed"

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before anything is written to it.
os.makedirs(processed_dir, exist_ok=True)

# Load all data
fraud_df_raw, credit_df_raw, ip_df_raw = load_all_data(raw_dir)

# Clean datasets (the raw frames are kept under their *_raw names)
fraud_df = clean_fraud_data(fraud_df_raw)
credit_df = clean_creditcard_data(credit_df_raw)
ip_country_df = clean_ip_country_data(ip_df_raw)

# Save cleaned data
fraud_df.to_csv(f"{processed_dir}/fraud_data_cleaned.csv", index=False)
credit_df.to_csv(f"{processed_dir}/creditcard_cleaned.csv", index=False)
ip_country_df.to_csv(f"{processed_dir}/ip_country_cleaned.csv", index=False)

Loaded Fraud Data: (151112, 11) rows, columns: ['user_id', 'signup_time', 'purchase_time', 'purchase_value', 'device_id', 'source', 'browser', 'sex', 'age', 'ip_address', 'class']
Loaded Credit Card Data: (284807, 31) rows, columns: ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class']
Loaded IP-Country Data: (138846, 3) rows, columns: ['lower_bound_ip_address', 'upper_bound_ip_address', 'country']


## 2. Feature Engineering on Fraud Dataset (fraud_df)

### A. Extract Time-Based Features

In [7]:
# Add time-based columns using the project helper imported from feature_engineering.
# NOTE(review): which columns are added is defined in src/feature_engineering.py — confirm there.
# The result is bound back to `fraud_df`, so re-running this cell feeds the
# already-processed frame into the helper again.
fraud_df = extract_time_features(fraud_df)

### B. Transaction Frequency

In [12]:
def transaction_frequency_features(df, user_col='user_id', time_col='timestamp', window='1D'):
    """Count each user's transactions inside a trailing time window.

    For every row, the new column ``txn_freq_last_1D`` holds the number of
    rows of the same user whose timestamp falls inside the trailing
    ``window`` ending at that row's timestamp (the row itself included,
    using pandas' default closure for offset windows).

    NOTE(review): this definition shadows the function of the same name
    imported from ``feature_engineering`` earlier in the notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Input transactions. It is not modified; a copy is returned.
    user_col : str
        Column identifying the user (grouping key).
    time_col : str
        Column with the transaction time; parsed with ``pd.to_datetime``.
    window : str or offset
        Trailing window size passed to ``rolling`` (for example ``'1D'``).
        The output column is always named ``txn_freq_last_1D``, whatever
        the window, so existing consumers keep working.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` sorted by ``[user_col, time_col]`` (original index
        labels preserved) with the extra float column ``txn_freq_last_1D``.
        Rows whose ``user_col`` is missing get NaN.
    """
    # Work on a copy so the caller's frame is left untouched.
    out = df.copy()
    out[time_col] = pd.to_datetime(out[time_col])

    # Sort by user and time: offset-based rolling windows need each user's
    # timeline to be monotonic.
    out = out.sort_values(by=[user_col, time_col])

    if out.empty:
        out['txn_freq_last_1D'] = pd.Series(dtype='float64')
        return out

    # Minimal helper frame on a unique positional index. The original index
    # may contain duplicates, which would make label alignment ambiguous.
    helper = out[[user_col, time_col]].reset_index(drop=True)
    helper['_txn'] = 1.0

    # Summing a column of ones over the window yields the transaction count.
    rolled = helper.groupby(user_col).rolling(window, on=time_col).sum()

    # Drop the group-key level so only the positional labels remain, then
    # restore row order (rows with a missing user key come back as NaN).
    counts = rolled['_txn'].droplevel(0).reindex(helper.index)
    out['txn_freq_last_1D'] = counts.to_numpy()

    return out


### C. Geolocation (Country) Feature

In [13]:
# Store an int64 version of the IP address in a new `ip_int` column.
# NOTE(review): astype('int64') raises if `ip_address` still holds NaN or
# non-numeric values — presumably ruled out by clean_fraud_data; confirm.
fraud_df['ip_int'] = fraud_df['ip_address'].astype('int64')
# Attach country information via the project helper, using the cleaned IP-range table.
# NOTE(review): presumably matches `ip_int` against the lower/upper bound
# columns of ip_country_df — confirm in src/feature_engineering.py.
fraud_df = map_ip_to_country(fraud_df, ip_country_df)

### D. Fill remaining missing values (if any)

In [14]:
# Replace every remaining missing value, in any column, with the sentinel -1.
fraud_df = fraud_df.fillna(-1)

## 3. Feature Engineering on Credit Card Dataset (credit_df)

### A. Time Feature

In [15]:
# 'Time' counts seconds since the first transaction. Convert to whole hours
# and wrap at 24, treating second 0 as the start of a day.
credit_df['hour_of_day'] = credit_df['Time'].floordiv(3600).mod(24)

### B. Scale Amount

In [16]:
# Scale 'Amount' with the project helper, then relabel the column so its
# name shows it no longer holds raw values.
scaler = StandardScaler()
credit_df = (
    scale_numeric_features(credit_df, ['Amount'], scaler)
    .rename(columns={'Amount': 'Amount_scaled'})
)

### C. Amount Normalization and Binning

In [22]:
# Sanity check: list the column names present after scaling and renaming.
print(list(credit_df.columns))


['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount_scaled', 'Class', 'hour_of_day']


In [28]:
# 1. Standardise 'Amount_scaled' into 'Amount_scaled1'.
#    NOTE(review): 'Amount_scaled' was already passed through
#    scale_numeric_features in section 3B, so this column is presumably a
#    near-duplicate — confirm whether anything downstream needs it.
#    A throwaway scaler is used so the notebook-level `scaler` from section 3B
#    is not replaced by one fitted on already-scaled values.
credit_df['Amount_scaled1'] = StandardScaler().fit_transform(credit_df[['Amount_scaled']])

# 2. Bin 'Amount_scaled' into 5 quantile-based bins; labels=False returns
#    integer bin codes instead of interval labels.
credit_df['Amount_bin'] = pd.qcut(credit_df['Amount_scaled'], q=5, labels=False)

# Show result
credit_df[['Amount_scaled', 'Amount_bin']].head()


Unnamed: 0,Amount_scaled,Amount_bin
0,0.2442,4
1,-0.342584,0
2,1.1589,4
3,0.139886,4
4,-0.073813,3


## 4. Encoding Categorical Variables (Fraud Dataset)

In [30]:
# One-hot encode the categorical columns 'source', 'browser', 'sex' and 'country'
# using the project helper from feature_engineering.
categorical_cols = ['source', 'browser', 'sex', 'country']
fraud_df = encode_categorical_features(fraud_df, categorical_cols)

## 5. Handle Missing Values & Final Checks

In [31]:
# Report per-column missing-value counts after feature engineering
print(fraud_df.isna().sum())

# Replace whatever is still missing: -1 in the fraud frame, 0 in the credit-card frame
fraud_df = fraud_df.fillna(-1)
credit_df = credit_df.fillna(0)

user_id                          0
signup_time                      0
purchase_time                    0
purchase_value                   0
device_id                        0
                                ..
country_Viet Nam                 0
country_Virgin Islands (U.S.)    0
country_Yemen                    0
country_Zambia                   0
country_Zimbabwe                 0
Length: 202, dtype: int64


## 6. Save the New Feature-Enhanced Data

In [32]:
# Write the feature-enhanced datasets next to the cleaned ones.
# Reuses `processed_dir` (set in the data-loading cell) instead of repeating
# the path literal, and makes sure the folder exists because to_csv does not
# create missing parent directories.
os.makedirs(processed_dir, exist_ok=True)
fraud_df.to_csv(f"{processed_dir}/fraud_data_features.csv", index=False)
credit_df.to_csv(f"{processed_dir}/creditcard_features.csv", index=False)