# Cell 1: Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import time
import warnings
# Silence every warning category for the rest of the session so library
# deprecation notices do not clutter the cell output.
warnings.filterwarnings(action='ignore')

# Plot appearance shared by all figures below.
plt.style.use(style='seaborn-v0_8-darkgrid')
sns.set_palette(palette="husl")

# Wall-clock reference point; the summary cell reports time elapsed since here.
start_time = time.time()
print("Starting EDA...")

# Cell 2: Load Data

In [None]:
# Load the raw transactions and report basic size / completeness figures.
df = pd.read_csv('../data/creditcard.csv')
print(f"Shape: {df.shape}")
print(f"Missing values: {df.isnull().sum().sum()}")

# Rows per label in the 'Class' column; label 1 is the fraud class.
fraud_counts = df['Class'].value_counts()
# .get() keeps the cell working on an extract that happens to contain no
# rows labelled 1 (plain indexing would raise KeyError in that case).
fraud_rate = (fraud_counts.get(1, 0) / len(df)) * 100
print(f"Fraud rate: {fraud_rate:.3f}%")

# Cell 3: Amount Distribution Visualization

In [None]:
# Side-by-side histograms of transaction amount, one panel per class,
# each with its median marked by a dashed vertical line.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

legit_amounts = df.loc[df['Class'] == 0, 'Amount']
fraud_amounts = df.loc[df['Class'] == 1, 'Amount']
legit_median = legit_amounts.median()
fraud_median = fraud_amounts.median()

# (axis, data, median, bar colour, median-line colour, title label)
amount_panels = (
    (axes[0], legit_amounts, legit_median, 'blue', 'red', 'Legitimate'),
    (axes[1], fraud_amounts, fraud_median, 'red', 'blue', 'Fraud'),
)
for panel_ax, panel_amounts, panel_median, bar_color, line_color, panel_label in amount_panels:
    panel_ax.hist(panel_amounts, bins=50, color=bar_color, alpha=0.7)
    panel_ax.axvline(panel_median, color=line_color, linestyle='--', linewidth=2)
    # Log-scaled counts keep the sparse high-amount bins visible.
    panel_ax.set_yscale('log')
    panel_ax.set_title(f'{panel_label} (Median: ${panel_median:.2f})')

plt.suptitle('Transaction Amount Distribution')
plt.tight_layout()
plt.show()

# Cell 4: Time Pattern Analysis

In [None]:
# Fold the 'Time' column (treated here as seconds) onto a 24-hour clock:
# 86400 s per day, 3600 s per hour. The result is a fractional hour in [0, 24).
df['Hour'] = (df['Time'] % 86400) / 3600
fig, ax1 = plt.subplots(figsize=(14, 7))

# Plot only a subsample of legitimate rows to keep the scatter readable.
# The sample size is capped at the number of rows available so the cell also
# runs on small extracts (sample() raises ValueError when n exceeds the
# population and replace=False).
legit_rows = df[df['Class'] == 0]
legit_sample = legit_rows.sample(n=min(10000, len(legit_rows)), random_state=42)
fraud_data = df[df['Class'] == 1]

ax1.scatter(legit_sample['Hour'], legit_sample['Amount'], alpha=0.3, s=2, c='blue', label='Legitimate')
ax1.scatter(fraud_data['Hour'], fraud_data['Amount'], alpha=0.8, s=10, c='red', label='Fraud')
ax1.set_yscale('log')
ax1.set_xlabel('Hour of Day')
ax1.set_ylabel('Amount ($)')
ax1.legend()

# Per-hour fraud rate: 'sum' counts fraud rows (Class == 1), 'count' counts
# all rows falling in that integer hour bucket.
hourly_fraud = df.groupby(df['Hour'].astype(int))['Class'].agg(['sum', 'count'])
hourly_fraud['rate'] = (hourly_fraud['sum'] / hourly_fraud['count']) * 100
# The three hour buckets with the highest fraud rate; reported in the summary.
peak_hours = hourly_fraud.nlargest(3, 'rate').index.tolist()

# Overlay the rate on a secondary y-axis sharing the hour-of-day x-axis.
ax2 = ax1.twinx()
ax2.plot(hourly_fraud.index, hourly_fraud['rate'], color='orange', linewidth=2, marker='o')
ax2.set_ylabel('Fraud Rate (%)', color='orange')
plt.title('Fraud Patterns Throughout Day')
plt.show()

# Cell 5: Preprocessing

In [None]:
# Decide the train/test membership FIRST, on row positions only, so the
# scalers below can be fit without seeing any test rows. The split depends
# only on the number of rows, the stratification labels and random_state, so
# it selects the same rows as splitting the feature frame directly.
row_positions = np.arange(len(df))
train_pos, test_pos = train_test_split(
    row_positions, test_size=0.2, random_state=42, stratify=df['Class']
)

# Fit one scaler per column on the training rows only (fitting on the full
# dataset would leak test-set mean/variance into the features), then apply
# the fitted transform to every row.
amount_scaler = StandardScaler().fit(df[['Amount']].iloc[train_pos])
time_scaler = StandardScaler().fit(df[['Time']].iloc[train_pos])
# Keep the original `scaler` name bound (it previously ended up fit on
# 'Time') in case a later cell refers to it.
scaler = time_scaler

df_processed = df.copy()
df_processed['scaled_amount'] = amount_scaler.transform(df[['Amount']]).ravel()
df_processed['scaled_time'] = time_scaler.transform(df[['Time']]).ravel()
# Drop the raw columns now replaced by their scaled versions, plus the
# 'Hour' helper column.
df_processed = df_processed.drop(['Amount', 'Time', 'Hour'], axis=1)

X = df_processed.drop('Class', axis=1)
y = df_processed['Class']
X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

# Persist the processed frame and the index labels of each split so the
# same partition can be reconstructed from the CSV later.
df_processed.to_csv('../data/preprocessed_creditcard.csv', index=False)
np.save('../data/train_indices.npy', X_train.index.to_numpy())
np.save('../data/test_indices.npy', X_test.index.to_numpy())

# Cell 6: Summary

In [None]:
# Report headline figures gathered by the earlier cells, plus wall-clock
# time since the setup cell started the timer.
elapsed = time.time() - start_time

rule = '=' * 60
# Leading and trailing empty entries reproduce the blank lines that frame
# the report.
summary_lines = [
    "",
    rule,
    "DATASET OVERVIEW",
    rule,
    f"Total Transactions: {len(df):,}",
    f"Fraud Rate: {fraud_rate:.3f}%",
    f"Legitimate Median: ${legit_median:.2f}",
    f"Fraud Median: ${fraud_median:.2f}",
    f"Peak Fraud Hours: {peak_hours}",
    f"Train Size: {len(X_train):,}",
    f"Test Size: {len(X_test):,}",
    f"Features: {X_train.shape[1]}",
    f"Time: {elapsed:.2f}s",
    rule,
    "✅ Segment 1 Complete!",
    "",
]
print("\n".join(summary_lines))