In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
# Make the repository root (the parent of the notebook's working directory)
# importable so that `src.*` modules resolve from this notebook.
# Notebooks do not define `__file__`; the previous code passed the *string*
# '__file__' to os.path.dirname, which always yields '' and therefore silently
# depended on the kernel's working directory. That dependency is now explicit.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
if PROJECT_ROOT not in sys.path:  # keep the cell idempotent when it is re-run
    sys.path.append(PROJECT_ROOT)
# Notebook-wide plotting defaults: seaborn "whitegrid" theme and
# 10x6 inch figures rendered at 100 dpi.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'figure.dpi': 100,
})


In [3]:
from src.data_processing import DataProcessor, EDAReportGenerator

In [4]:
# File locations, written relative to the notebook's working directory.
# Destination of the processed customer-level table written by the pipeline.
processed_data_path = "../data/processed/processed_customer_data.csv"
# Raw input file; adjust this to wherever your copy lives.
raw_data_path = "../data/raw/data.csv"

In [5]:
# Initialize and run processing pipeline.
# Per the log printed by this cell, the pipeline loads the raw file, preprocesses
# it, calculates RFM features, defines the proxy default variable and saves the
# result to processed_data_path.
processor = DataProcessor(raw_data_path)
rfm_df = processor.run_pipeline(processed_data_path)

# Display the processed RFM data (first ten rows only).
if rfm_df is not None:
    display(rfm_df.head(10))
else:
    # Previously this case produced no output at all, which made a pipeline that
    # returned nothing indistinguishable from a cell that had not been run.
    print("run_pipeline() returned None; no RFM table to display. Check the log above.")

Data loaded successfully from ../data/raw/data.csv. Shape: (95662, 16)
Starting data preprocessing...
TransactionStartTime converted and date extracted.
Calculating RFM features...
RFM features calculated.
Defining proxy default variable...
Proxy default variable defined.
Processed customer data saved to ../data/processed/processed_customer_data.csv


Unnamed: 0,CustomerId,Recency,Frequency,Monetary,Default
0,CustomerId_1,83,1,10000,0
1,CustomerId_10,83,1,10000,0
2,CustomerId_1001,89,5,30400,0
3,CustomerId_1002,25,11,4775,0
4,CustomerId_1003,11,6,32000,0
5,CustomerId_1004,52,1,2000,0
6,CustomerId_1005,46,9,61200,0
7,CustomerId_1006,64,1,1000,0
8,CustomerId_1007,81,1,28000,0
9,CustomerId_1008,65,2,20000,0


In [6]:
# Run EDA on the original dataset (not just RFM).
# Every later cell uses `eda`. If processor.df is None, the old guard simply
# skipped the assignment and the failure only surfaced in the next cell as a
# NameError; fail here instead, with a message that points at the cause.
if processor.df is None:
    raise RuntimeError(
        "processor.df is None; cannot build EDAReportGenerator. "
        "Check raw_data_path and re-run the processing cell."
    )
eda = EDAReportGenerator(processor.df)


In [7]:
# Report section 1: row count, column count and per-column dtypes.
overview = eda.get_data_overview()
print(overview)

### 1. Overview of the Data
  - Number of Rows: 95662
  - Number of Columns: 17
  - Data Types:
TransactionId                        object
BatchId                              object
AccountId                            object
SubscriptionId                       object
CustomerId                           object
CurrencyCode                         object
CountryCode                           int64
ProviderId                           object
ProductId                            object
ProductCategory                      object
ChannelId                            object
Amount                              float64
Value                                 int64
TransactionStartTime    datetime64[ns, UTC]
PricingStrategy                       int64
FraudResult                           int64
transaction_date                     object



In [8]:
# Report section 2: describe()-style summary statistics of the numerical columns.
summary = eda.get_summary_statistics()
print(summary)


### 2. Summary Statistics (Numerical Features)
       CountryCode        Amount         Value  PricingStrategy   FraudResult
count      95662.0  9.566200e+04  9.566200e+04     95662.000000  95662.000000
mean         256.0  6.717846e+03  9.900584e+03         2.255974      0.002018
std            0.0  1.233068e+05  1.231221e+05         0.732924      0.044872
min          256.0 -1.000000e+06  2.000000e+00         0.000000      0.000000
25%          256.0 -5.000000e+01  2.750000e+02         2.000000      0.000000
50%          256.0  1.000000e+03  1.000000e+03         2.000000      0.000000
75%          256.0  2.800000e+03  5.000000e+03         2.000000      0.000000
max          256.0  9.880000e+06  9.880000e+06         4.000000      1.000000



In [9]:
# Report section 3: mean, median, skewness and standard deviation
# for each numerical column.
numerical_distribution = eda.analyze_numerical_distribution()
print(numerical_distribution)


### 3. Distribution of Numerical Features
  - **CountryCode**:
    - Mean: 256.00
    - Median: 256.00
    - Skewness: 0.00 (Positive: Right-skewed, Negative: Left-skewed, ~0: Symmetric)
    - Std Dev: 0.00
  - **Amount**:
    - Mean: 6717.85
    - Median: 1000.00
    - Skewness: 51.10 (Positive: Right-skewed, Negative: Left-skewed, ~0: Symmetric)
    - Std Dev: 123306.80
    - Range: -1000000.00 to 9880000.00
    - Observations: Typically highly skewed with many small transactions and a few very large ones. 'Amount' includes negative values (debits/credits).
  - **Value**:
    - Mean: 9900.58
    - Median: 1000.00
    - Skewness: 51.29 (Positive: Right-skewed, Negative: Left-skewed, ~0: Symmetric)
    - Std Dev: 123122.09
    - Range: 2.00 to 9880000.00
    - Observations: Typically highly skewed with many small transactions and a few very large ones. 'Amount' includes negative values (debits/credits).
  - **PricingStrategy**:
    - Mean: 2.26
    - Median: 2.00
    - Skewness: 1.66 

In [10]:
# Report section 4: unique-value counts for identifier-like columns and
# top-5 value shares for the remaining categorical columns.
categorical_distribution = eda.analyze_categorical_distribution()
print(categorical_distribution)


### 4. Distribution of Categorical Features
  - **TransactionId**: 95662 unique values. Primarily identifiers or datetime strings.
  - **BatchId**: 94809 unique values. Primarily identifiers or datetime strings.
  - **AccountId**: 3633 unique values. Primarily identifiers or datetime strings.
  - **SubscriptionId**: 3627 unique values. Primarily identifiers or datetime strings.
  - **CustomerId**: 3742 unique values. Primarily identifiers or datetime strings.
  - **CurrencyCode** (Top 5):
CurrencyCode
UGX    100.0%
    - Observations: Highly concentrated, e.g., mostly UGX and Country Code 256.
  - **ProviderId** (Top 5):
ProviderId
ProviderId_4    39.920763
ProviderId_6    35.736238
ProviderId_5    15.201438
ProviderId_1     5.898894
ProviderId_3     3.223851%
  - **ProductId** (Top 5):
ProductId
ProductId_6     34.114905
ProductId_3     25.447931
ProductId_10    16.081621
ProductId_15    12.506533
ProductId_1      3.015827%
  - **ProductCategory** (Top 5):
ProductCategory
financial_s

In [11]:
# Report section 5: correlation matrix of the numerical columns.
# CountryCode shows up as NaN in the matrix; its standard deviation is 0 in the
# summary statistics, i.e. the column is constant.
correlation_analysis = eda.perform_correlation_analysis()
print(correlation_analysis)


### 5. Correlation Analysis (Numerical Features)
  - Correlation Matrix:
                 CountryCode    Amount     Value  PricingStrategy  FraudResult
CountryCode              NaN       NaN       NaN              NaN          NaN
Amount                   NaN  1.000000  0.989692        -0.061931     0.557370
Value                    NaN  0.989692  1.000000        -0.017020     0.566739
PricingStrategy          NaN -0.061931 -0.017020         1.000000    -0.033821
FraudResult              NaN  0.557370  0.566739        -0.033821     1.000000
  - Key Observations:
    - Strong positive correlation between 'Amount' (absolute magnitude) and 'Value'.
    - 'FraudResult' correlations with other numerical features might be weak due to its imbalance, but large magnitudes in 'Amount'/'Value' could be indicative.



In [12]:
# Report section 6: missing-value check across the dataset.
missing_values = eda.identify_missing_values()
print(missing_values)


### 6. Identifying Missing Values
  - No missing values found in the dataset.



In [13]:
# Report section 7: Z-score outlier report (|Z-score| > 3) for numerical columns.
# This only reports on outliers; removal happens in a later cell.
outliers = eda.detect_outliers()
print(outliers)


### 7. Outlier Detection (Numerical Features) - Z-Score Method
  - **Amount**:
    - Mean: 6717.85
    - Standard Deviation: 123306.80
    - Number of Outliers (|Z-score| > 3): 269
    - Percentage of Outliers: 0.28%
    - Outlier Range: -1000000.00 to 9880000.00
    - Max Z-score: 80.07
    - Observations: Z-score method identifies extreme values that deviate significantly from the mean. High Z-scores may indicate potential fraud, data entry errors, or legitimate large transactions.
  - **Value**:
    - Mean: 9900.58
    - Standard Deviation: 123122.09
    - Number of Outliers (|Z-score| > 3): 269
    - Percentage of Outliers: 0.28%
    - Outlier Range: 380000.00 to 9880000.00
    - Max Z-score: 80.17
    - Observations: Z-score method identifies extreme values that deviate significantly from the mean. High Z-scores may indicate potential fraud, data entry errors, or legitimate large transactions.
  - **FraudResult**: As a binary flag, 'FraudResult' doesn't have numerical outliers in

In [14]:
# Remove outliers from all numerical columns with default threshold
# (|Z-score| > 3 according to the log this call prints).
# NOTE(review): the log shows FraudResult and PricingStrategy are treated as
# numerical columns too. 193 rows are dropped for FraudResult; with a fraud mean
# of 0.002018 over 95662 rows that is presumably every fraud case - confirm, and
# exclude binary/ordinal columns if the method supports selecting columns.
# NOTE(review): this call appears to modify the processor's data in place (the
# cleaning step in the next cell starts from 95044 rows), so re-running this cell
# or cleaning again afterwards trims the data further using a recomputed mean/std.
processor.remove_outliers_zscore()

Removing outliers with Z-score > 3...
  - Amount: 269 outliers removed
  - Value: 269 outliers removed
  - PricingStrategy: 385 outliers removed
  - FraudResult: 193 outliers removed
Outlier removal completed:
  - Initial rows: 95662
  - Final rows: 95044
  - Rows removed: 618
  - Percentage removed: 0.65%


True

In [15]:
# Clean data (remove outliers with Z-score > 3) and save to CSV
# NOTE(review): per its own log, clean_and_save_data runs a Z-score removal pass
# before saving. Outliers were already removed by the preceding
# remove_outliers_zscore() call, so this is a second pass on already-trimmed data
# (a further 1575 rows dropped) and the saved file reports 0 fraud cases.
# Confirm that is intended before using cleaned-data.csv for modelling.
processor.clean_and_save_data("cleaned-data.csv")

=== Data Cleaning and Saving Pipeline ===

1. Removing outliers (Z-score > 3)...
Removing outliers with Z-score > 3...
  - Amount: 1571 outliers removed
  - Value: 1109 outliers removed
Outlier removal completed:
  - Initial rows: 95044
  - Final rows: 93469
  - Rows removed: 1575
  - Percentage removed: 1.66%

2. Saving cleaned data to e:\AI\Week-5\Week 5 Project\credit-risk-model\data\processed\cleaned-data.csv...
✅ Successfully saved cleaned data!
📊 Final dataset shape: (93469, 17)
💾 File saved at: e:\AI\Week-5\Week 5 Project\credit-risk-model\data\processed\cleaned-data.csv

📈 Dataset Summary:
   - Total rows: 93469
   - Total columns: 17
   - Fraud cases: 0 (0.0000)


True