# Credit Card Fraud Detection - Exploratory Data Analysis

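This notebook performs exploratory data analysis (EDA) on the credit card fraud detection dataset (`../data/creditcard_2023.csv`). It covers a dataset overview, the distribution of the `Class` target, the distributions of the `V*` features, and each feature's correlation with `Class`.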

## 1. Setup and Data Loading

In [None]:
import sys
import os
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from src.utils import (
    load_data, 
    print_dataset_summary, 
    plot_class_distribution,
    plot_feature_distributions,
    create_correlation_matrix
)

# --- Notebook-wide configuration -------------------------------------------
# Silence every warning category for the rest of the session. This is set
# first so the styling calls below are covered as well.
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib's stock style sheet plus seaborn's "husl" palette.
plt.style.use('default')
sns.set_palette("husl")

# Never truncate wide frames when they are displayed.
pd.options.display.max_columns = None

print("Libraries imported successfully!")

In [None]:
# Load the dataset from the project's data directory (path is relative to
# this notebook's working directory).
data_path = '../data/creditcard_2023.csv'
df = load_data(data_path)

# Fail fast, right next to the load, if the target column is missing: the
# class-distribution, correlation and summary cells all index df['Class'],
# and a KeyError raised there would point away from the real cause.
if 'Class' not in df.columns:
    raise KeyError(
        f"Expected a 'Class' target column in {data_path}; "
        f"found columns: {list(df.columns)}"
    )

print_dataset_summary(df)

## 2. Dataset Overview

In [None]:
# Preview the first rows using the notebook's rich (HTML) rendering.
print("First 5 rows:")
display(df.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("\nDataset info:")
df.info()

# Summary statistics for the columns, again rendered as a rich table.
print("\nBasic statistics:")
display(df.describe())

## 3. Class Distribution Analysis

In [None]:
# Chart of the target distribution (helper comes from src.utils).
plot_class_distribution(df)

# Absolute counts and percentage share for each value of the Class column.
class_stats = df['Class'].value_counts()
class_percentages = 100 * df['Class'].value_counts(normalize=True)

# Label 0 is reported as legitimate and label 1 as fraudulent; the ratio is
# legitimate-to-fraudulent counts.
print(
    f"Legitimate: {class_stats[0]:,} ({class_percentages[0]:.3f}%)",
    f"Fraudulent: {class_stats[1]:,} ({class_percentages[1]:.3f}%)",
    f"Imbalance ratio: {class_stats[0] / class_stats[1]:.1f}:1",
    sep="\n",
)

## 4. Feature Analysis

In [None]:
# Collect, in column order, every column whose name begins with 'V'.
v_features = list(filter(lambda name: name.startswith('V'), df.columns))
print(f"Number of V features: {len(v_features)}")

# Only the first 16 V columns are passed to the plotting helper.
plot_feature_distributions(df, features=v_features[0:16])

## 5. Correlation Analysis

In [None]:
# Correlation helper from src.utils (presumably renders the full correlation
# matrix; confirm against its definition).
create_correlation_matrix(df)

# Rank the columns by the absolute value of their correlation with the target.
# The target's own entry is dropped first: it is not a feature, and leaving it
# in would spend one of the 15 printed slots on Class's correlation with itself.
correlations_with_target = (
    df.corr()['Class']
    .drop('Class')
    .abs()
    .sort_values(ascending=False)
)
print("Top features correlated with Class:")
print(correlations_with_target.head(15))

## 6. Summary

In [None]:
# Headline numbers: row count, rows labelled Class == 1, and their share in %.
total_transactions = df.shape[0]
fraud_transactions = (df['Class'] == 1).sum()
fraud_rate = (fraud_transactions / total_transactions) * 100

# Emit the report in one call, one entry per line. The feature count is every
# column except one (the Class target).
print(
    "📊 DATASET SUMMARY",
    "=" * 30,
    f"Total transactions: {total_transactions:,}",
    f"Fraudulent transactions: {fraud_transactions:,}",
    f"Fraud rate: {fraud_rate:.3f}%",
    f"Features: {df.shape[1] - 1}",
    sep="\n",
)
print("\n✅ EDA COMPLETE")