In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import os
# Add the project root directory (the parent of the notebook's working
# directory) to the Python path so that the `src` package can be imported.
import sys

# `__file__` is not defined inside a notebook. The previous code passed the
# *string* '__file__' to os.path.dirname, which always yields '' and so resolved
# '..' against the current working directory anyway; state that explicitly.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Guard the append so re-running this cell does not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
from src.data_preprocessing import DataPreprocessor

In [3]:
# Create the preprocessing helper with its default constructor arguments;
# the cells below call its load / preprocess / distribution methods in order.
preprocessor = DataPreprocessor()

In [None]:
  # Load data
# Gather the three raw CSV locations (relative to the notebook's working
# directory) under the keyword names load_data takes, then load them in one call.
raw_csv_paths = {
    'fraud_path': '../data/raw/Fraud_Data.csv',
    'ip_country_path': '../data/raw/IpAddress_to_Country.csv',
    'credit_card_path': '../data/raw/creditcard.csv',
}
preprocessor.load_data(**raw_csv_paths)

INFO:src.data_preprocessing:Loading datasets...
INFO:src.data_preprocessing:Loaded fraud data: (151112, 11)
INFO:src.data_preprocessing:Loaded IP country data: (138846, 3)
INFO:src.data_preprocessing:Loaded credit card data: (284807, 31)


In [5]:
  # Preprocess fraud data
# Run the fraud-data preprocessing on the data loaded above and keep the result
# for the later cells (class distribution, CSV export, summary).
# NOTE(review): the recorded run logs "Countries found: 1" right after the
# geolocation merge, which suggests the IP-to-country lookup may not be
# matching real ranges — confirm in src.data_preprocessing before relying on
# the country column.
fraud_processed = preprocessor.preprocess_fraud_data()
    

INFO:src.data_preprocessing:Starting fraud data preprocessing...
INFO:src.data_preprocessing:Handling missing values using strategy: impute
INFO:src.data_preprocessing:Imputed missing values
INFO:src.data_preprocessing:Removed 0 duplicate rows
INFO:src.data_preprocessing:Corrected data types
INFO:src.data_preprocessing:Merging fraud data with geolocation information...
INFO:src.data_preprocessing:Merged data shape: (151112, 13)
INFO:src.data_preprocessing:Countries found: 1
INFO:src.data_preprocessing:Fraud data preprocessing completed


In [6]:
   # Preprocess credit card data
# Run the credit-card preprocessing on the data loaded above and keep the
# result for the later cells. Per the log of the recorded run this imputes
# missing values, removes duplicate rows and corrects data types.
credit_processed = preprocessor.preprocess_credit_card_data()
    

INFO:src.data_preprocessing:Starting credit card data preprocessing...
INFO:src.data_preprocessing:Handling missing values using strategy: impute
INFO:src.data_preprocessing:Imputed missing values
INFO:src.data_preprocessing:Removed 1081 duplicate rows
INFO:src.data_preprocessing:Corrected data types
INFO:src.data_preprocessing:Credit card data preprocessing completed


In [7]:
  # Analyze class distribution
# Compute the class balance of each processed dataset with the same helper.
# The fraud data uses the method's default second argument; the credit-card
# data passes 'Class' explicitly (presumably its target column — verify against
# DataPreprocessor.get_class_distribution).
class_distribution_of = preprocessor.get_class_distribution
fraud_dist = class_distribution_of(fraud_processed)
credit_dist = class_distribution_of(credit_processed, 'Class')
    

INFO:src.data_preprocessing:Class distribution: {'class_counts': {0: 136961, 1: 14151}, 'total_samples': 151112, 'imbalance_ratio': np.float64(0.10332138345952498), 'fraud_percentage': np.float64(9.364577267192546)}
INFO:src.data_preprocessing:Class distribution: {'class_counts': {0: 283253, 1: 473}, 'total_samples': 283726, 'imbalance_ratio': np.float64(0.0016698852262818046), 'fraud_percentage': np.float64(0.1667101358352777)}


In [8]:
# Announce the save step, then make sure the destination folder exists.
print("Saving preprocessed data...")
output_dir = '../data/processed'
# exist_ok=True keeps re-runs of this cell from failing once the folder exists.
os.makedirs(name=output_dir, exist_ok=True)

Saving preprocessed data...


In [9]:
 # Save fraud data
# Write the processed fraud dataset as CSV (row index omitted) into output_dir,
# then report its shape and destination.
fraud_output_path = os.path.join(output_dir, 'fraud_processed.csv')
fraud_processed.to_csv(path_or_buf=fraud_output_path, index=False)
print(f"✅ Saved fraud processed data: {fraud_processed.shape} -> {fraud_output_path}")
    

✅ Saved fraud processed data: (151112, 13) -> ../data/processed\fraud_processed.csv


In [10]:
 # Save credit card data
# Write the processed credit-card dataset as CSV (row index omitted) into
# output_dir, then report its shape and destination.
credit_output_path = os.path.join(output_dir, 'credit_processed.csv')
credit_processed.to_csv(path_or_buf=credit_output_path, index=False)
print(f"✅ Saved credit card processed data: {credit_processed.shape} -> {credit_output_path}")
    

✅ Saved credit card processed data: (283726, 31) -> ../data/processed\credit_processed.csv


In [11]:
  # Save summary statistics
# Assemble a summary of both processed datasets plus bookkeeping about the run.
#
# Whether the geolocation merge left a 'country' column is checked once, so that
# 'geolocation_merged' and 'countries_found' cannot contradict each other
# (the flag used to be hard-coded to True even though the count was already
# guarded against the column being absent).
has_country_column = 'country' in fraud_processed.columns

summary_stats = {
    'fraud_data': {
        'shape': fraud_processed.shape,
        'class_distribution': fraud_dist,
        'columns': list(fraud_processed.columns),
        'file_path': fraud_output_path
    },
    'credit_card_data': {
        'shape': credit_processed.shape,
        'class_distribution': credit_dist,
        'columns': list(credit_processed.columns),
        'file_path': credit_output_path
    },
    'preprocessing_info': {
        # The literals below are copied from the logged output of the recorded
        # run (dataset shapes at load time, credit-card duplicate count); they
        # must be updated by hand if the raw data changes.
        'fraud_data_original_shape': (151112, 11),
        'credit_card_original_shape': (284807, 31),
        'duplicates_removed': 1081,
        'missing_values_handled': 'imputed',
        'geolocation_merged': has_country_column,
        'countries_found': fraud_processed['country'].nunique() if has_country_column else 0
    }
}