# Fraud Detection - Data Exploration

In this notebook, we explore the Credit Card Fraud Detection dataset to understand the data distribution, check for class imbalance, and identify potentially useful features.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
import importlib

# Make the project's src/ directory (one level above the working directory)
# importable. The path is built from os.getcwd(), so this assumes the notebook
# is launched from its own folder — TODO confirm.
src_dir = os.path.join(os.getcwd(), '..', 'src')
if src_dir not in sys.path:
    # Guard keeps the cell idempotent: re-running it no longer piles up
    # duplicate sys.path entries.
    sys.path.append(src_dir)

# Import the module object first so it can be reloaded; the reload picks up
# edits to the load_data module without restarting the kernel.
import load_data
importlib.reload(load_data)
# NOTE: after this line the name `load_data` refers to the function, not the module.
from load_data import load_data

# Notebook-wide display and plotting defaults
sns.set_style("whitegrid")  # seaborn "whitegrid" style for the figures below
pd.set_option('display.max_columns', None)  # None removes the cap on displayed columns

## 1. Load Data

In [None]:
# Load the dataset through the project helper imported above.
df = load_data()
if df is not None:
    print(f"Dataset Shape: {df.shape}")
else:
    # The analysis cells are guarded by `df is not None` and would otherwise
    # finish without any output, so state explicitly why nothing is shown.
    print("load_data() returned None; the analysis cells below will be skipped.")

## 2. Basic Inspection

In [None]:
if df is not None:
    # Peek at the first rows (DataFrame.head defaults to five)
    first_rows = df.head()
    display(first_rows)

In [None]:
if df is not None:
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in display() would only add a stray "None" below the report.
    df.info()

In [None]:
if df is not None:
    # Summary statistics as produced by DataFrame.describe()
    summary_stats = df.describe()
    display(summary_stats)

## 3. Class Imbalance Analysis

In [None]:
if df is not None:
    # Absolute and relative frequency of each value in the target column
    labels = df['Class']
    class_counts = labels.value_counts()
    class_ratio = labels.value_counts(normalize=True)

    print("Class Counts:\n", class_counts)
    print("\nClass Ratios:\n", class_ratio)

    # Bar chart of the same counts
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(x='Class', data=df, ax=ax)
    ax.set_title('Class Distribution (0: No Fraud, 1: Fraud)')
    plt.show()

## 4. Time and Amount Analysis
We investigate `Time` and `Amount` as they are the only non-PCA features.

In [None]:
if df is not None:
    # Two side-by-side panels: Amount on the left, Time on the right
    fig, ax = plt.subplots(1, 2, figsize=(18, 4))
    amount_ax, time_ax = ax

    # Left panel: histogram + KDE of the transaction amount
    sns.histplot(df['Amount'], bins=50, ax=amount_ax, kde=True)
    amount_ax.set_title('Distribution of Transaction Amount')
    # Only the view is cropped to [0, 2000]; the histogram is still computed on all rows
    amount_ax.set_xlim([0, 2000])

    # Right panel: histogram + KDE of the transaction time
    sns.histplot(df['Time'], bins=50, ax=time_ax, kde=True)
    time_ax.set_title('Distribution of Transaction Time')

    plt.show()

In [None]:
if df is not None:
    # Box plot of Amount, one box per Class value
    fig, ax = plt.subplots(figsize=(10, 6))
    # showfliers=False only suppresses drawing of the outlier points; the data is not filtered
    sns.boxplot(x='Class', y='Amount', data=df, showfliers=False, ax=ax)
    ax.set_title('Transaction Amount by Class (Outliers Removed)')
    plt.show()

## 5. Correlation Analysis
We check the correlation between features and the Class variable.

In [None]:
if df is not None:
    # Pairwise correlation of every column, drawn as a heatmap.
    fig, ax = plt.subplots(figsize=(24, 20))
    corr = df.corr()
    # center=0 pins the neutral colour of the diverging colormap to zero
    # correlation, so the two hues separate positive from negative values.
    # The former annot_kws argument was dropped: it only styles cell
    # annotations, and none are drawn because annot is not enabled.
    sns.heatmap(corr, cmap='coolwarm_r', center=0, ax=ax)
    ax.set_title('Correlation Matrix', fontsize=14)
    plt.show()

In [None]:
if df is not None:
    # Correlation of each feature with the target column.
    # 'Class' itself is dropped first: its self-correlation is always 1.0 and
    # would otherwise take the top slot of the "positive" list. corrwith also
    # avoids building the full feature-by-feature matrix just to read one column.
    class_corr = (
        df.drop(columns='Class')
        .corrwith(df['Class'])
        .sort_values(ascending=False)
    )
    # Sorted descending: head() holds the strongest positive correlations,
    # tail() the strongest negative ones.
    print("Top Positive Correlations:\n", class_corr.head(5))
    print("\nTop Negative Correlations:\n", class_corr.tail(5))