# Exploratory Data Analysis for Fraud Detection

This notebook explores the financial transactions dataset to understand patterns, distributions, and relationships that might be relevant for fraud detection.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os

# Set plot style
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*' and
# later releases dropped the old names, so pick whichever spelling this install has.
_whitegrid_candidates = ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
_whitegrid_style = next(
    (name for name in _whitegrid_candidates if name in plt.style.available),
    None,
)
if _whitegrid_style is not None:
    plt.style.use(_whitegrid_style)
else:
    # Neither style sheet is bundled: fall back to seaborn's own white-grid theme
    sns.set_style('whitegrid')
sns.set_palette('viridis')
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

## 1. Load the Dataset

First, we'll load the raw transaction data from the CSV file.

In [None]:
# Define the path to the raw data file
data_path = '../data/raw/transactions.csv'

# Fail fast when the file is missing: every later cell needs `df`, so only
# printing a hint here would defer the failure to an unrelated NameError.
if not os.path.exists(data_path):
    raise FileNotFoundError(
        f"File not found: {data_path}. "
        "Please download the dataset from Kaggle and place it in the data/raw directory."
    )

# Load the data
df = pd.read_csv(data_path)
print(f"Dataset loaded with {df.shape[0]} rows and {df.shape[1]} columns")

## 2. Data Overview

Let's examine the structure and content of the dataset.

In [None]:
# Preview the top five records to get a feel for the columns and their values
df.head(5)

In [None]:
# Print a per-column summary (dtype and non-null count) to spot wrongly typed
# columns and columns with missing values
df.info()

In [None]:
# Descriptive statistics, transposed so that each column of df becomes one row
df.describe().transpose()

In [None]:
# Count nulls per column and express them as a share of all rows
missing_values = df.isna().sum()
missing_percent = missing_values / len(df) * 100

# Assemble both measures side by side, one row per column of df
missing_df = pd.concat(
    [
        missing_values.rename('Missing Values'),
        missing_percent.rename('Percentage'),
    ],
    axis=1,
)

# Show only the columns that actually have gaps, worst first
has_missing = missing_df['Missing Values'] > 0
missing_df.loc[has_missing].sort_values('Percentage', ascending=False)

## 3. Fraud Distribution Analysis

Let's examine the distribution of fraudulent vs. legitimate transactions.

In [None]:
# Check the distribution of the target variable (assuming it's called 'is_fraud')
if 'is_fraud' in df.columns:
    fraud_distribution = df['is_fraud'].value_counts(normalize=True) * 100
    # .get() with a 0.0 default keeps the report working when one class is absent
    # (e.g. on a filtered sample); plain [0] / [1] lookups would raise KeyError.
    print("Fraud distribution:")
    print(f"Legitimate transactions: {fraud_distribution.get(0, 0.0):.2f}%")
    print(f"Fraudulent transactions: {fraud_distribution.get(1, 0.0):.2f}%")

    # Plot the distribution
    fig, ax = plt.subplots(figsize=(10, 6))
    # Pin the bar order so the tick labels below always name the class drawn at that position
    sns.countplot(x='is_fraud', data=df, order=[0, 1], ax=ax)
    ax.set_title('Distribution of Fraudulent vs. Legitimate Transactions')
    ax.set_xlabel('Is Fraud')
    ax.set_ylabel('Count')
    ax.set_xticks([0, 1])
    ax.set_xticklabels(['Legitimate', 'Fraudulent'])
    plt.show()
else:
    print("Target variable 'is_fraud' not found in the dataset.")
    print("Available columns:", df.columns.tolist())

## 4. Transaction Amount Analysis

Let's analyze the transaction amounts and how they differ between fraudulent and legitimate transactions.

In [None]:
# Check if 'amount' column exists
if 'amount' in df.columns and 'is_fraud' in df.columns:
    # Basic statistics of transaction amounts by fraud status
    amount_stats = df.groupby('is_fraud')['amount'].describe()
    print("Transaction amount statistics by fraud status:")
    print(amount_stats)

    # Plot from a small labelled copy so seaborn builds the legend itself.
    # Overriding it with plt.legend([...]) attaches the names to whichever bar
    # patches come first in the axes, which does not follow the hue colours.
    # Assumes is_fraud is coded 0/1; any other value maps to NaN and is not drawn.
    status_labels = {0: 'Legitimate', 1: 'Fraudulent'}
    status_order = list(status_labels.values())
    amount_plot_df = pd.DataFrame({
        'amount': df['amount'],
        'Status': df['is_fraud'].map(status_labels),
    })

    # Plot distribution of transaction amounts: linear scale left, log scale right
    fig, (ax_linear, ax_log) = plt.subplots(1, 2, figsize=(12, 6))

    sns.histplot(data=amount_plot_df, x='amount', hue='Status', hue_order=status_order,
                 bins=50, alpha=0.7, ax=ax_linear)
    ax_linear.set_title('Distribution of Transaction Amounts')
    ax_linear.set_xlabel('Amount')
    ax_linear.set_ylabel('Count')

    # Use log scale for better visualization.
    # Non-positive amounts have no logarithm, so they are left out of this panel only.
    positive_amounts = amount_plot_df[amount_plot_df['amount'] > 0]
    sns.histplot(data=positive_amounts, x='amount', hue='Status', hue_order=status_order,
                 bins=50, alpha=0.7, log_scale=True, ax=ax_log)
    ax_log.set_title('Distribution of Transaction Amounts (Log Scale)')
    ax_log.set_xlabel('Amount (Log Scale)')
    ax_log.set_ylabel('Count')

    fig.tight_layout()
    plt.show()

    # Box plot of amounts by fraud status; the explicit order keeps the boxes
    # in Legitimate / Fraudulent order without relabelling ticks by position.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x='Status', y='amount', data=amount_plot_df, order=status_order, ax=ax)
    ax.set_title('Transaction Amounts by Fraud Status')
    ax.set_xlabel('Is Fraud')
    ax.set_ylabel('Amount')
    plt.show()
else:
    print("Required columns ('amount' or 'is_fraud') not found in the dataset.")

## 5. Temporal Analysis

Let's analyze the temporal patterns of transactions and fraud.

In [None]:
# Helper shared by the hour-of-day and day-of-week charts below
def _plot_transactions_by(time_col, period_name, xlabel):
    """Draw a bar chart of transaction counts per value of `time_col`.

    Reads the notebook-level `df`. When `df` has an 'is_fraud' column the bars
    are stacked by fraud status, otherwise plain counts are shown.

    Parameters
    ----------
    time_col : str
        Name of the column in `df` to group by (e.g. 'hour').
    period_name : str
        Human-readable name of the period, used in the chart title.
    xlabel : str
        Label for the x axis.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    if 'is_fraud' in df.columns:
        # Group by the time component and fraud status
        counts = df.groupby([time_col, 'is_fraud']).size().unstack(fill_value=0)
        # Name the series by label rather than by position so the legend stays
        # right even if only one class is present (assumes 0/1 coding).
        counts = counts.rename(columns={0: 'Legitimate', 1: 'Fraudulent'}).rename_axis(columns=None)
        # Pass ax explicitly: DataFrame.plot opens a new figure otherwise,
        # which would leave the figure created above blank.
        counts.plot(kind='bar', stacked=True, ax=ax)
        ax.set_title(f'Transactions by {period_name} and Fraud Status')
    else:
        # Group by the time component only
        df[time_col].value_counts().sort_index().plot(kind='bar', ax=ax)
        ax.set_title(f'Transactions by {period_name}')

    ax.set_xlabel(xlabel)
    ax.set_ylabel('Number of Transactions')
    ax.tick_params(axis='x', rotation=0)
    plt.show()


# Check if 'timestamp' column exists
if 'timestamp' in df.columns:
    # Convert timestamp to datetime if it's not already (any datetime64 flavour counts)
    if not pd.api.types.is_datetime64_any_dtype(df['timestamp']):
        df['timestamp'] = pd.to_datetime(df['timestamp'])

    # Extract time components
    df['hour'] = df['timestamp'].dt.hour
    df['day'] = df['timestamp'].dt.day
    df['dayofweek'] = df['timestamp'].dt.dayofweek
    df['month'] = df['timestamp'].dt.month

    # Plot transactions by hour of day
    _plot_transactions_by('hour', 'Hour of Day', 'Hour of Day')

    # Plot transactions by day of week
    _plot_transactions_by('dayofweek', 'Day of Week', 'Day of Week (0=Monday, 6=Sunday)')
else:
    print("Timestamp column not found in the dataset.")

## 6. Categorical Features Analysis

Let's analyze categorical features such as merchant category and payment method, looking at how each one is distributed and how the fraud rate varies across its values.

In [None]:
# Identify categorical columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Categorical columns: {categorical_cols}")

# For each categorical column, analyze its distribution and relationship with fraud
for col in categorical_cols[:5]:  # Limit to first 5 to avoid too many plots
    print(f"\nAnalyzing column: {col}")

    # Get value counts
    value_counts = df[col].value_counts().head(10)  # Top 10 values
    print(f"Top 10 values:\n{value_counts}")

    # Plot distribution on an explicit axes so each figure holds exactly one chart
    fig, ax = plt.subplots(figsize=(12, 6))

    if 'is_fraud' in df.columns:
        # Fraud rate per category = mean of the target within the category
        # (assumes is_fraud is coded 0/1). This matches
        # crosstab[1] / (crosstab[0] + crosstab[1]) but does not raise KeyError
        # when one of the two classes never occurs in the data.
        fraud_rate = df.groupby(col, observed=True)['is_fraud'].mean() * 100
        fraud_rate = fraud_rate.sort_values(ascending=False).head(10)

        # Plot fraud rate by category
        fraud_rate.plot(kind='bar', ax=ax)
        ax.set_title(f'Fraud Rate by {col} (Top 10)')
        ax.set_ylabel('Fraud Rate (%)')
    else:
        # Simple value counts
        value_counts.plot(kind='bar', ax=ax)
        ax.set_title(f'Distribution of {col} (Top 10)')
        ax.set_ylabel('Count')

    ax.set_xlabel(col)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    plt.show()

## 7. Correlation Analysis

Let's examine the correlations among the numeric features and between each numeric feature and the target variable.

In [None]:
# Select numeric columns of every width; listing only int64/float64 would
# silently skip e.g. int32 or float32 columns
numeric_cols = df.select_dtypes(include='number').columns.tolist()
print(f"Numeric columns: {numeric_cols}")

# Calculate correlation matrix
if len(numeric_cols) > 0:
    corr_matrix = df[numeric_cols].corr()

    # Plot correlation heatmap
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
    ax.set_title('Correlation Matrix of Numeric Features')
    fig.tight_layout()
    plt.show()

    # If target variable exists, show correlation with it
    if 'is_fraud' in numeric_cols:
        target_corr = corr_matrix['is_fraud'].sort_values(ascending=False)
        print("\nCorrelation with target variable (is_fraud):")
        print(target_corr)

        # Drop the target by label (not by position) and ignore undefined
        # correlations, which pandas reports as NaN.
        feature_corr = target_corr.drop('is_fraud').dropna()
        # Rank by absolute value so strong negative correlations are not
        # pushed out of the "top 10"; the plotted bars keep their sign.
        strongest = feature_corr.abs().sort_values(ascending=False).head(10).index
        top_corr = feature_corr.loc[strongest]

        # Plot top correlations with target (skipped when no other feature is left)
        if not top_corr.empty:
            fig, ax = plt.subplots(figsize=(12, 6))
            top_corr.plot(kind='bar', ax=ax)
            ax.set_title('Top 10 Features Correlated with Fraud')
            ax.set_xlabel('Feature')
            ax.set_ylabel('Correlation Coefficient')
            plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
            fig.tight_layout()
            plt.show()
else:
    print("No numeric columns found in the dataset.")

## 8. Feature Engineering Ideas

Based on the exploratory analysis, here are some feature engineering ideas that might be useful for fraud detection:

### 8.1 Time-based Features

- Hour of day, day of week, month
- Is weekend/weekday
- Time since last transaction (for the same user)
- Transaction velocity (number of transactions in last 1h, 24h, 7d)

### 8.2 Amount-based Features

- Log-transformed amount
- Amount bins/quantiles
- Deviation from user's average transaction amount
- Ratio to maximum previous transaction

### 8.3 Behavioral Features

- User's transaction history statistics
- Merchant-specific patterns
- Location-based features (new location, distance from usual locations)
- Device-related features

## 9. Next Steps

Based on this exploratory analysis, the next steps would be:

1. Data preprocessing:
   - Handle missing values
   - Convert categorical features to numeric (encoding)
   - Scale/normalize numeric features

2. Feature engineering:
   - Implement the feature ideas identified above
   - Create aggregated features by user, merchant, etc.

3. Model development:
   - Split data into train/validation/test sets
   - Train classification and anomaly detection models
   - Evaluate and compare model performance