In [None]:
# Fraud Detection - Feature Engineering Notebook
# Task 1: Data Analysis and Preprocessing

import sys
import os
import warnings
warnings.filterwarnings('ignore')

# Add src to path
sys.path.append('../src')

# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

# Import custom modules
from data_cleaning import FraudDataCleaner
from eda import FraudEDA
from geolocation import IPGeolocationMapper
from feature_engineering import FraudFeatureEngineer
from data_transformation import FraudDataTransformer

print("All libraries imported successfully!")

In [None]:
# Step 1: Load and Clean Data
# Reads the raw fraud CSV through the project's cleaner, prints a quick
# overview of the raw frame, then produces the cleaned frame (`df_fraud_clean`)
# and a cleaning report that later cells can reference.
print("="*80)
print("STEP 1: DATA LOADING AND CLEANING")
print("="*80)

# Initialize cleaner
cleaner = FraudDataCleaner(verbose=True)

# Load fraud data (path is relative to the notebook's working directory)
fraud_data_path = '../data/raw/Fraud_Data.csv'
df_fraud = cleaner.load_data(fraud_data_path)

# Display initial info
# The header string previously contained a mis-encoded emoji (UTF-8 bytes
# decoded as cp1252); it is restored to the intended character here.
print("\n📊 Dataset Information:")
print(f"Shape: {df_fraud.shape}")
print(f"Columns: {df_fraud.columns.tolist()}")
print("\nSample data:")
print(df_fraud.head())

# Comprehensive cleaning; the raw frame is kept under its own name so this
# cell can be re-run without losing the original data.
df_fraud_clean = cleaner.clean_fraud_data(df_fraud)

# Generate cleaning report
cleaning_report = cleaner.generate_cleaning_report()

In [None]:
# Step 2: Exploratory Data Analysis
# Runs the project's EDA helper over the cleaned fraud frame, one analysis
# at a time, and finishes by collecting the EDA report.
print("\n" + "="*80, "STEP 2: EXPLORATORY DATA ANALYSIS (EDA)", "="*80, sep="\n")

# EDA helper; the figure size is handed straight to FraudEDA
eda = FraudEDA(figsize=(12, 8))

# 2.1 Class distribution of the target column
class_stats = eda.analyze_class_distribution(
    df_fraud_clean,
    target_col='class',
)

# 2.2 Univariate analysis
eda.univariate_analysis(df_fraud_clean)

# 2.3 Bivariate analysis against the target column
eda.bivariate_analysis(
    df_fraud_clean,
    target_col='class',
)

# 2.4 Correlation analysis
eda.correlation_analysis(df_fraud_clean)

# 2.5 Outlier analysis
eda.outlier_analysis(df_fraud_clean)

# Collect the report produced by the EDA helper
eda_report = eda.generate_eda_report()

In [None]:
# Step 3: Geolocation Integration
# Loads the IP-to-country lookup table, derives a `country` column for a
# copy of the cleaned frame, then runs the mapper's per-country fraud
# analysis and its mapping validation.
print("\n" + "="*80, "STEP 3: GEOLOCATION INTEGRATION", "="*80, sep="\n")

# Mapper plus its lookup table (path relative to the working directory)
geo_mapper = IPGeolocationMapper()
ip_mapping_path = '../data/raw/IpAddress_to_Country.csv'
geo_mapper.load_ip_country_mapping(ip_mapping_path)

# `assign` works on a copy, so df_fraud_clean itself is left untouched;
# the callable receives that copy and maps its `ip_address` column.
df_fraud_geo = df_fraud_clean.assign(
    country=lambda frame: geo_mapper.map_ips_to_countries(frame['ip_address'])
)

# Fraud breakdown per country
fraud_by_country = geo_mapper.analyze_fraud_by_country(
    df_fraud_geo,
    ip_col='ip_address',
)

# Validate the mapping
geo_mapper.validate_mapping()

In [None]:
# Step 4: Feature Engineering
# Builds the engineered feature frame from the geolocated data, prints the
# engineer's feature statistics, requests a top-20 feature importance report,
# and writes the engineered frame to disk as CSV.
print("\n" + "="*80)
print("STEP 4: FEATURE ENGINEERING")
print("="*80)

# Initialize feature engineer
feature_engineer = FraudFeatureEngineer(verbose=True)

# Create all features
df_fraud_features = feature_engineer.create_all_features(df_fraud_geo, target_col='class')

# Display feature statistics
# (emoji restored: the literal was previously mis-encoded)
print("\n📈 Feature Engineering Statistics:")
print(feature_engineer.feature_stats)

# Analyze feature importance
importance_report = feature_engineer.get_feature_importance_report(
    df_fraud_features, target_col='class', top_n=20
)

# Save engineered data
output_path = '../data/processed/fraud_data_engineered.csv'
# to_csv does not create missing parent directories; make sure the target
# directory exists so a fresh checkout does not fail at the very last step.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_fraud_features.to_csv(output_path, index=False)
print(f"\n💾 Engineered data saved to: {output_path}")
print(f"   Shape: {df_fraud_features.shape}")

In [None]:
# Step 5: Data Transformation
# Runs the project's transformation pipeline on the engineered features,
# unpacks the resulting train/test splits, prints their shapes and target
# means, and writes all four pieces to CSV.
print("\n" + "="*80)
print("STEP 5: DATA TRANSFORMATION")
print("="*80)

# Initialize transformer (fixed random_state for reproducible runs)
transformer = FraudDataTransformer(random_state=42, verbose=True)

# Run complete transformation pipeline
# NOTE(review): how/where normalization, encoding and SMOTE are applied is
# decided inside full_pipeline — confirm resampling touches the training
# split only.
transformed_data = transformer.full_pipeline(
    df=df_fraud_features,
    target_col='class',
    normalize_method='standard',
    encode_method='label',
    imbalance_method='smote',
    test_size=0.2
)

# Display transformation results
X_train = transformed_data['X_train']
X_test = transformed_data['X_test']
y_train = transformed_data['y_train']
y_test = transformed_data['y_test']

# (emoji restored: the literal was previously mis-encoded)
print("\n✅ Transformation Complete:")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
# The mean is reported as a fraud rate; assumes a 0/1 target — TODO confirm.
print(f"y_train fraud rate: {y_train.mean():.4f}")
print(f"y_test fraud rate: {y_test.mean():.4f}")

# Save transformed data
train_output = '../data/processed/X_train.csv'
test_output = '../data/processed/X_test.csv'
# to_csv does not create missing parent directories; ensure the output
# directory exists before any of the four files is written.
os.makedirs(os.path.dirname(train_output), exist_ok=True)
pd.DataFrame(X_train).to_csv(train_output, index=False)
pd.DataFrame(X_test).to_csv(test_output, index=False)
y_train.to_csv('../data/processed/y_train.csv', index=False)
y_test.to_csv('../data/processed/y_test.csv', index=False)

print("\n💾 Transformed data saved to data/processed/")