In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import time
import warnings
# Silence every warning category for the rest of the session. This keeps the
# demo output tidy, but it also hides deprecation notices from the libraries.
warnings.filterwarnings(action='ignore')

# Plot appearance: apply the matplotlib style sheet, then the seaborn palette.
PLOT_STYLE = 'seaborn-v0_8-darkgrid'
COLOR_PALETTE = "husl"
plt.style.use(PLOT_STYLE)
sns.set_palette(COLOR_PALETTE)

# Wall-clock reference point; the summary cell subtracts it to report runtime.
start_time = time.time()
print("Starting Exploratory Data Analysis...")

In [None]:
# Load the dataset (path is relative to the notebook's working directory)
df = pd.read_csv('../data/creditcard.csv')

print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
print("\nFirst 5 rows:")
display(df.head())

# Check data types and missing values.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("\nData Info:")
df.info()
print(f"\nMissing values: {df.isnull().sum().sum()}")

# Class distribution. Reindexing on the two expected labels guarantees that
# both 0 and 1 are present (count 0 when a class is missing), so the lookups
# below cannot raise KeyError on a filtered or truncated file.
fraud_counts = df['Class'].value_counts().reindex([0, 1], fill_value=0)
fraud_rate = (fraud_counts[1] / len(df)) * 100

print("\nClass Distribution:")
print(f"Legitimate transactions: {fraud_counts[0]:,} ({100-fraud_rate:.2f}%)")
print(f"Fraudulent transactions: {fraud_counts[1]:,} ({fraud_rate:.2f}%)")

In [None]:
# Statistical summary for Amount and Time
print("="*60)
print("STATISTICAL SUMMARY")
print("="*60)

# Amount statistics
amount_stats = df['Amount'].describe()
print("\nTransaction Amount Statistics:")
print(amount_stats)

# Time statistics: fold the seconds in `Time` into a 24-hour cycle
# (86400 s per day, 3600 s per hour), giving a fractional hour in [0, 24).
# The column is added to `df` because the later plotting and preprocessing
# cells read df['Hour'].
df['Hour'] = (df['Time'] % 86400) / 3600
time_stats = df['Hour'].describe()
# The label says "hour of day" because the modulo above discards the day;
# these are not cumulative hours since the first transaction.
print("\nTime Statistics (hour of day, 0-24):")
print(time_stats)

# Fraud rate highlight (fraud_rate is computed in the data-loading cell)
print(f"\n⚠️  FRAUD RATE: {fraud_rate:.3f}% ⚠️")
print("This is a highly imbalanced dataset!")

In [None]:
# Create amount distribution comparison


def _plot_amount_hist(ax, amounts, class_label, bar_color, median_color):
    """Draw a log-frequency histogram of `amounts` on `ax` and mark its median.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    amounts : pandas Series of transaction amounts for one class.
    class_label : first line of the panel title (e.g. 'Fraudulent Transactions').
    bar_color : fill colour of the histogram bars.
    median_color : colour of the dashed vertical median line.

    Returns
    -------
    The median of `amounts`, so the caller can reuse it in later output.
    """
    median = amounts.median()
    ax.hist(amounts, bins=50, color=bar_color, alpha=0.7, edgecolor='black')
    ax.axvline(median, color=median_color, linestyle='--', linewidth=2,
               label=f'Median: ${median:.2f}')
    # Counts span several orders of magnitude, hence the log-scaled frequency axis.
    ax.set_yscale('log')
    ax.set_xlabel('Transaction Amount ($)')
    ax.set_ylabel('Frequency (log scale)')
    ax.set_title(f'{class_label}\nMedian Amount: ${median:.2f}')
    ax.legend()
    ax.grid(True, alpha=0.3)
    return median


fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Single-step .loc selection instead of chained indexing (df[mask]['Amount']).
legit_amounts = df.loc[df['Class'] == 0, 'Amount']
fraud_amounts = df.loc[df['Class'] == 1, 'Amount']

# Both panels share one helper so they cannot drift apart in styling.
legit_median = _plot_amount_hist(
    axes[0], legit_amounts, 'Legitimate Transactions',
    bar_color='blue', median_color='red')
fraud_median = _plot_amount_hist(
    axes[1], fraud_amounts, 'Fraudulent Transactions',
    bar_color='red', median_color='blue')

plt.suptitle('Transaction Amount Distribution by Class', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

print(f"Key Insight: Fraud median (${fraud_median:.2f}) vs Legitimate median (${legit_median:.2f})")

In [None]:
# Analyze fraud patterns throughout the day
fig, ax1 = plt.subplots(figsize=(14, 7))

# Scatter plot of transactions. Legitimate rows are down-sampled for plotting
# performance; the sample size is capped at the number of available rows so
# that DataFrame.sample does not raise ValueError on a smaller dataset.
legit_all = df[df['Class'] == 0]
legit_data = legit_all.sample(n=min(10000, len(legit_all)), random_state=42)
fraud_data = df[df['Class'] == 1]

ax1.scatter(legit_data['Hour'], legit_data['Amount'],
            alpha=0.3, s=2, c='blue', label='Legitimate')
ax1.scatter(fraud_data['Hour'], fraud_data['Amount'],
            alpha=0.8, s=10, c='red', label='Fraud')

ax1.set_xlabel('Hour of Day')
ax1.set_ylabel('Transaction Amount ($)', color='black')
# NOTE(review): a log axis cannot show non-positive values; if Amount contains
# zeros those points will not be visible here - confirm that is acceptable.
ax1.set_yscale('log')
ax1.tick_params(axis='y', labelcolor='black')
ax1.set_title('Fraud Patterns Throughout the Day', fontsize=14, fontweight='bold')
ax1.grid(True, alpha=0.3)
ax1.legend(loc='upper left')

# Calculate hourly fraud rate: bucket by the integer hour, then
# sum = fraud count (Class is 0/1 per the filters above), count = all transactions.
hourly_fraud = df.groupby(df['Hour'].astype(int))['Class'].agg(['sum', 'count'])
hourly_fraud['rate'] = (hourly_fraud['sum'] / hourly_fraud['count']) * 100

# Add fraud rate line on a secondary y-axis that shares the hour axis
ax2 = ax1.twinx()
ax2.plot(hourly_fraud.index, hourly_fraud['rate'],
         color='orange', linewidth=2, marker='o', label='Fraud Rate')
ax2.set_ylabel('Fraud Rate (%)', color='orange')
ax2.tick_params(axis='y', labelcolor='orange')
ax2.legend(loc='upper right')

plt.tight_layout()
plt.show()

# Find peak fraud hours (the three hour buckets with the highest fraud rate)
peak_hours = hourly_fraud.nlargest(3, 'rate').index.tolist()
print(f"Peak fraud hours: {peak_hours}")

In [None]:
print("\n" + "="*60)
print("DATA PREPROCESSING")
print("="*60)

# Decide the train/test membership FIRST, on row positions, so that the
# scalers below can be fit on training rows only. The seed, test size and
# stratification are the same ones used for the final split variables.
row_positions = np.arange(len(df))
train_pos, test_pos = train_test_split(
    row_positions, test_size=0.2, random_state=42, stratify=df['Class']
)

# Fit one scaler per feature on the training rows only. Fitting on the full
# frame would leak test-set mean/variance into the features, and reusing a
# single scaler object for both columns would discard the Amount fit.
train_rows = df.iloc[train_pos]
amount_scaler = StandardScaler().fit(train_rows[['Amount']])
time_scaler = StandardScaler().fit(train_rows[['Time']])
# Backward-compatible alias: the name `scaler` previously ended up holding
# the fit for the Time column.
scaler = time_scaler

# Create a copy for processing and apply the train-fitted scalers to all rows
df_processed = df.copy()
df_processed['scaled_amount'] = amount_scaler.transform(df[['Amount']]).ravel()
df_processed['scaled_time'] = time_scaler.transform(df[['Time']]).ravel()

# Drop original Amount, Time, and Hour columns
# ('Hour' is the helper column added by the statistical-summary cell)
df_processed = df_processed.drop(['Amount', 'Time', 'Hour'], axis=1)

print("✓ Features scaled")

# Materialise the stratified train-test split from the positions chosen above
X = df_processed.drop('Class', axis=1)
y = df_processed['Class']

X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

print("✓ Train-test split created")
print(f"  Training set: {len(X_train):,} samples")
print(f"  Test set: {len(X_test):,} samples")

# Save preprocessed data and indices.
# The CSV is written without its index, so the saved index labels identify
# rows by position only as long as `df` keeps the default index from read_csv.
df_processed.to_csv('../data/preprocessed_creditcard.csv', index=False)
train_indices = X_train.index.tolist()
test_indices = X_test.index.tolist()
np.save('../data/train_indices.npy', train_indices)
np.save('../data/test_indices.npy', test_indices)

print("✓ Data saved for reproducibility")

In [None]:
# Summary dashboard: collect the headline numbers computed by the earlier
# cells and print them as one framed report.
elapsed_time = time.time() - start_time

banner = "=" * 60
summary_lines = [
    banner,
    "        DATASET OVERVIEW - FRAUD DETECTION DEMO",
    banner,
    f"Total Transactions: {len(df):,}",
    f"Fraud Rate: {fraud_rate:.3f}%",
    f"Legitimate Median Amount: ${legit_median:.2f}",
    f"Fraud Median Amount: ${fraud_median:.2f}",
    f"Peak Fraud Hours: {peak_hours}",
    f"Training Set Size: {len(X_train):,}",
    f"Test Set Size: {len(X_test):,}",
    f"Total Features: {X_train.shape[1]}",
    f"Execution Time: {elapsed_time:.2f} seconds",
    banner,
]
print("\n".join(summary_lines))
print("\n✅ Segment 1 Complete! Ready for baseline model development.")