In [None]:
import sys

# The parent directory must be on sys.path BEFORE any `src.*` import runs;
# otherwise a fresh kernel (Restart & Run All) cannot resolve the package.
# Presumably '..' is the project root that contains `src/` - confirm.
# The guard keeps re-running this cell from appending duplicate entries.
if '..' not in sys.path:
    sys.path.append('..')

import numpy as np
import pandas as pd

# NOTE(review): the recorded output of this cell is an ImportError for
# `_is_pandas_df` from sklearn.utils.validation - presumably a version mismatch
# between scikit-learn and a library built on top of it; verify the pinned
# versions in the virtual environment.
from src.data_loader import load_fraud_data
from src.data_preprocessing import (
    prepare_features_for_modeling,
    encode_categorical_features,
    scale_numerical_features,
    handle_class_imbalance,
    split_data_stratified
)
from src.feature_engineering import create_time_features

ImportError: cannot import name '_is_pandas_df' from 'sklearn.utils.validation' (c:\Users\arwa\fraud_detection\venv\Lib\site-packages\sklearn\utils\validation.py)

In [None]:
print("Loading processed data...")
try:
    # Fast path: reuse the feature table written by an earlier run.
    df = pd.read_csv('../data/processed/fraud_data_with_features.csv',
                     parse_dates=['signup_time', 'purchase_time'])
    print(f"Loaded processed data: {df.shape}")
except FileNotFoundError:
    # Only a missing cache file triggers the rebuild. Any other failure
    # (missing date columns, corrupt CSV, keyboard interrupt) now surfaces
    # instead of being reported as "not found" and silently masked.
    print("Processed data not found, creating from scratch...")
    from src.data_loader import load_fraud_data, load_country_mapping
    from src.geolocation import add_ip_integer_columns, merge_with_country
    from src.feature_engineering import create_time_features

    # Rebuild pipeline: raw fraud data -> IP integer columns -> country merge
    # -> time-based features.
    fraud_df = load_fraud_data()
    country_df = load_country_mapping()
    fraud_df, country_df = add_ip_integer_columns(fraud_df, country_df)
    df = merge_with_country(fraud_df, country_df)
    df = create_time_features(df)
    # Mirror the success-path message so both branches report the shape.
    print(f"Rebuilt feature data: {df.shape}")

In [None]:
# Step 5 banner followed by the label of its first sub-step.
print(
    "\n STEP 5: DATA TRANSFORMATION ",
    "\n1. Separating features and target...",
    sep="\n",
)

# The helper returns the feature matrix, the target and the two column-name
# groups that the encoding and scaling cells consume.
X, y, categorical_cols, numerical_cols = prepare_features_for_modeling(df)

# Report what was selected so the reader can sanity-check the column groups.
print(
    "Target variable: class",
    f"Categorical features: {categorical_cols}",
    f"Numerical features: {numerical_cols}",
    sep="\n",
)

In [None]:
# Sub-step 2: encode the categorical columns; the fitted encoder is kept
# alongside the transformed frame.
print("\n2. Encoding categorical features...")
(X_encoded, encoder) = encode_categorical_features(X, categorical_cols)
print("Shape after encoding: {}".format(X_encoded.shape))

In [None]:
# Sub-step 3: scale the numerical columns with the 'standard' scaler option.
# NOTE(review): X_encoded still holds every row - the train/test split happens
# in a later cell. If scale_numerical_features fits the scaler on its input,
# test rows influence the scaling statistics; confirm, and consider fitting on
# the training split only.
print("\n3. Scaling numerical features...")
X_scaled, scaler = scale_numerical_features(
    X_encoded,
    numerical_cols,
    scaler_type='standard',
)
print("Shape after scaling: {}".format(X_scaled.shape))

# Preview the first five rows of the scaled numerical columns.
print(
    "\nSample of scaled numerical features (first 5 rows):",
    X_scaled[numerical_cols].head(),
    sep="\n",
)

In [None]:
# Sub-step 4: hold out 20% of the rows for testing; the fixed random_state
# keeps the split reproducible across runs.
print("\n4. Creating stratified train-test split...")
X_train, X_test, y_train, y_test = split_data_stratified(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Step 6 banner plus the reminder that resampling touches the training split only.
print(
    "\n STEP 6: HANDLE CLASS IMBALANCE ",
    "\nIMPORTANT: We apply SMOTE ONLY to training data, NOT to test data!",
    sep="\n",
)

# Rationale shown to the reader.
# NOTE(review): the 1.96% / 98% / 49-out-of-50 figures below are hard-coded
# literals, not computed from y_train - confirm they match the class
# distribution this notebook reports for the data actually loaded.
print("""
WHY WE CHOOSE SMOTE OVER UNDERSAMPLING:

1. SMOTE creates SYNTHETIC minority samples instead of duplicating
2. It preserves the information in the majority class (unlike undersampling)
3. Fraud detection has VERY FEW positive cases (1.96%)
4. With undersampling, we'd lose 98% of our legitimate transactions
5. SMOTE helps the model learn better patterns without losing data

Alternative considered: Random Under-sampling
- Would discard 49 out of 50 legitimate transactions
- Too much information loss for fraud detection
- Not suitable for such severe imbalance
""")


# Resample the training split only; X_test / y_test are deliberately not passed.
X_train_resampled, y_train_resampled, sampler = handle_class_imbalance(
    X_train,
    y_train,
    method='smote',
    random_state=42,
)

In [None]:
import os

# Persist every split (original and resampled) as CSV under ../data/processed.
print("\n Saving Processed Datasets ")
os.makedirs('../data/processed', exist_ok=True)
datasets = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    X_train_resampled=X_train_resampled,
    y_train_resampled=y_train_resampled,
)

for name in datasets:
    data = datasets[name]
    # One call covers both feature frames and target series: header=True is
    # already the DataFrame default, so passing it explicitly changes nothing
    # for frames and keeps the header row for series.
    data.to_csv(f'../data/processed/{name}.csv', index=False, header=True)
    print(f"Saved {name}: {data.shape}")

print("\nAll datasets saved to ../data/processed/")

In [None]:
# Recap of the shapes produced by each preprocessing stage, in pipeline order.
print(
    "\n PREPROCESSING SUMMARY ",
    f"1. Original data shape: {df.shape}",
    f"2. After encoding categorical features: {X_encoded.shape}",
    f"3. After scaling numerical features: {X_scaled.shape}",
    f"4. Training set (before SMOTE): {X_train.shape}",
    f"5. Training set (after SMOTE): {X_train_resampled.shape}",
    f"6. Test set (no SMOTE): {X_test.shape}",
    sep="\n",
)

# Guidance for the modelling notebook: which splits to train and evaluate on.
print(
    "\nKey points for Task 2 (Model Building):",
    "• Use X_train_resampled, y_train_resampled for training",
    "• Use X_test, y_test for evaluation (NO SMOTE on test data!)",
    "• Test data preserves original class distribution",
    "• Training data has balanced classes (50% fraud, 50% legitimate)",
    sep="\n",
)

In [None]:
print("\n CLASS DISTRIBUTION DOCUMENTATION ")


def print_distribution_stats(name, y_data):
    """Print the legitimate/fraudulent breakdown of a collection of labels.

    Args:
        name: Heading printed above the statistics.
        y_data: Labels supporting ``len()`` and ``.sum()``. The sum is used as
            the fraud count, so labels are presumably 0/1 with 1 = fraud.

    Returns:
        None. The report is written to stdout.
    """
    total = len(y_data)
    fraud = y_data.sum()
    legit = total - fraud
    # An empty label set has no meaningful percentages. Report 0.0% instead of
    # dividing by zero (nan plus a RuntimeWarning with NumPy scalars,
    # ZeroDivisionError with plain ints).
    legit_pct = legit / total * 100 if total else 0.0
    fraud_pct = fraud / total * 100 if total else 0.0
    # Legitimate samples per fraudulent sample, truncated to an integer;
    # undefined when there is no fraud at all.
    ratio = int(legit / fraud) if fraud > 0 else 'N/A'

    print(f"\n{name}:")
    print(f"  Total samples: {total}")
    print(f"  Legitimate (0): {legit} ({legit_pct:.1f}%)")
    print(f"  Fraudulent (1): {fraud} ({fraud_pct:.1f}%)")
    print(f"  Imbalance ratio: 1:{ratio}")


# Class balance of the full target and of both splits before any resampling.
print("BEFORE RESAMPLING:")
for split_name, split_labels in (
    ("Original dataset", y),
    ("Training set (before SMOTE)", y_train),
    ("Test set", y_test),
):
    print_distribution_stats(split_name, split_labels)

# Only the training labels were resampled, so only they are reported here.
print("\nAFTER RESAMPLING:")
print_distribution_stats(name="Training set (after SMOTE)", y_data=y_train_resampled)