# Exploratory Data Analysis - Fraud Detection

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Palette reused by the charts below.
GREEN = '#2ecc71'
RED = '#e74c3c'
sns.set_style('whitegrid')

# Read the processed transactions file and convert the timestamp column to
# datetime so the `.dt` accessors used in later cells work.
df = pd.read_csv('data/processed/transactions_8k.csv').assign(
    trans_date_trans_time=lambda frame: pd.to_datetime(frame['trans_date_trans_time'])
)

## 1. Dataset Overview

In [None]:
# Quick structural summary: shape, column dtypes, memory footprint and
# per-column missing-value counts.
print('Shape:', df.shape)
print('\nDtypes:', df.dtypes, sep='\n')
mem_mb = df.memory_usage(deep=True).sum() / 1024**2
print('\nMemory usage (MB):', mem_mb)
print('\nMissing values per column:', df.isnull().sum(), sep='\n')

# Rich (HTML) tables for the summary statistics and the first rows.
for title, table in (('\ndf.describe():', df.describe()), ('\ndf.head():', df.head())):
    print(title)
    display(table)

In [None]:
# Class balance of the target: absolute count and percentage per is_fraud value.
counts = df['is_fraud'].value_counts().sort_index()
total = len(df)
labels = ['Non-fraud', 'Fraud']
colors = [GREEN, RED]
# .get(..., 0) keeps the chart working even if one class is absent.
class_counts = [counts.get(cls, 0) for cls in (0, 1)]
pcts = [100 * n / total for n in class_counts]

fig, ax = plt.subplots(figsize=(6, 4))
bars = ax.bar(labels, class_counts, color=colors)
ax.set_ylabel('Count')
ax.set_title('Target distribution (is_fraud)')
# Write "count (share%)" centred on top of each bar.
for bar, cnt, pct in zip(bars, class_counts, pcts):
    bar_mid = bar.get_x() + bar.get_width() / 2
    ax.annotate(f'{cnt}\n({pct:.1f}%)', xy=(bar_mid, bar.get_height()),
                ha='center', va='bottom', fontsize=11)
plt.tight_layout()
plt.show()

## 2. Transaction Analysis

In [None]:
fig, ax = plt.subplots(figsize=(8, 4))
fraud_amt = df.loc[df['is_fraud'] == 1, 'amt']
non_fraud_amt = df.loc[df['is_fraud'] == 0, 'amt']

# The x-axis is logarithmic, so the bin edges must be log-spaced as well:
# equal-width linear bins are drawn progressively narrower on a log axis and
# squeeze most of the data into the first bar. Both classes share one set of
# edges so their bars are directly comparable.
positive_amt = df.loc[df['amt'] > 0, 'amt']  # a log axis cannot show values <= 0
amt_lo, amt_hi = positive_amt.min(), positive_amt.max()
# 51 edges -> 50 bins; fall back to 50 automatic bins when there is no
# usable positive range (empty data or a single distinct value).
amt_bins = np.geomspace(amt_lo, amt_hi, 51) if amt_lo < amt_hi else 50

ax.hist(non_fraud_amt, bins=amt_bins, alpha=0.5, color=GREEN, label='Non-fraud')
ax.hist(fraud_amt, bins=amt_bins, alpha=0.5, color=RED, label='Fraud')
ax.set_xscale('log')
ax.set_xlabel('Amount')
ax.set_ylabel('Count')
ax.set_title('Distribution of transaction amount (log scale)')
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# Spread of transaction amounts within each category.
fig, ax = plt.subplots(figsize=(10, 5))
df.boxplot(column='amt', by='category', ax=ax)
# Blank out the figure-level title and set our own axes title and labels.
plt.suptitle('')
ax.set_title('Transaction Amount by Category')
ax.set_ylabel('Amount')
ax.set_xlabel('Category')
# Slant the category names so they do not overlap.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Hour of day (0-23) extracted from the transaction timestamp; kept on df
# as a new column.
df['hour'] = df['trans_date_trans_time'].dt.hour

# Per hour: number of transactions, number of frauds, and fraud share in %.
fraud_by_hour = df.groupby('hour')['is_fraud']
hourly = fraud_by_hour.agg(total='count', fraud='sum').reset_index()
hourly['fraud_rate'] = hourly['fraud'].mul(100).div(hourly['total'])

fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(hourly['hour'], hourly['fraud_rate'], marker='o')
ax.set_xticks(range(24))  # one tick per hour
ax.set_title('Fraud Rate by Hour of Day')
ax.set_xlabel('Hour of day')
ax.set_ylabel('Fraud rate (%)')
plt.tight_layout()
plt.show()

In [None]:
# Weekday index from the transaction timestamp (0=Monday, 6=Sunday); kept on
# df as a new column.
df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Per weekday: number of transactions, number of frauds, fraud share in %.
daily = (
    df.groupby('day_of_week')['is_fraud']
    .agg(total='count', fraud='sum')
    .reset_index()
)
daily['fraud_rate'] = daily['fraud'].mul(100).div(daily['total'])
# Translate the numeric weekday into its name for the axis labels.
daily['day_name'] = [day_names[idx] for idx in daily['day_of_week']]

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(daily['day_name'], daily['fraud_rate'])
ax.set_title('Fraud Rate by Day of Week')
ax.set_xlabel('Day of week')
ax.set_ylabel('Fraud rate (%)')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

## 3. Categorical Features

In [None]:
# Per category: number of transactions, number of frauds, fraud share in %,
# ordered from lowest to highest rate.
cat_rate = (
    df.groupby('category')['is_fraud']
    .agg(['count', 'sum'])
    .rename(columns={'count': 'total', 'sum': 'fraud'})
    .reset_index()
)
cat_rate['fraud_rate'] = cat_rate['fraud'].mul(100).div(cat_rate['total'])
cat_rate = cat_rate.sort_values('fraud_rate')

# Colour each bar by its rate relative to the highest rate, using the
# reversed red-yellow-green colormap.
relative_rate = cat_rate['fraud_rate'] / cat_rate['fraud_rate'].max()
bar_colors = plt.cm.RdYlGn_r(relative_rate)

fig, ax = plt.subplots(figsize=(8, 6))
bars = ax.barh(cat_rate['category'], cat_rate['fraud_rate'], color=bar_colors)
ax.set_title('Fraud Rate by Transaction Category')
ax.set_xlabel('Fraud rate (%)')
plt.tight_layout()
plt.show()

In [None]:
# Per gender value: number of transactions, number of frauds, fraud share in %.
gender_rate = (
    df.groupby('gender')['is_fraud']
    .agg(total='count', fraud='sum')
    .reset_index()
)
gender_rate['fraud_rate'] = gender_rate['fraud'].mul(100).div(gender_rate['total'])

# Take only as many palette entries as there are bars.
palette = [GREEN, RED][:len(gender_rate)]

fig, ax = plt.subplots(figsize=(5, 4))
bars = ax.bar(gender_rate['gender'], gender_rate['fraud_rate'], color=palette)
ax.set_title('Fraud Rate by Gender')
ax.set_ylabel('Fraud rate (%)')
# Print the rate centred on top of each bar.
for bar, pct in zip(bars, gender_rate['fraud_rate']):
    bar_mid = bar.get_x() + bar.get_width() / 2
    ax.annotate(f'{pct:.1f}%', xy=(bar_mid, bar.get_height()),
                ha='center', va='bottom', fontsize=11)
plt.tight_layout()
plt.show()

In [None]:
# Per merchant: number of transactions, number of frauds, fraud share in %.
merch_rate = df.groupby('merchant').agg(total=('is_fraud', 'count'), fraud=('is_fraud', 'sum')).reset_index()
# Compute the rate on the full frame *before* filtering: assigning a new
# column onto a boolean-filtered slice triggers pandas' SettingWithCopyWarning.
merch_rate['fraud_rate'] = 100 * merch_rate['fraud'] / merch_rate['total']
# Ignore merchants with fewer than 5 transactions, whose rates are too noisy
# to rank.
merch_rate = merch_rate[merch_rate['total'] >= 5]
# Keep the 10 highest rates, ordered ascending so the largest bar is on top.
merch_rate = merch_rate.nlargest(10, 'fraud_rate').sort_values('fraud_rate', ascending=True)

fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(merch_rate['merchant'], merch_rate['fraud_rate'], color=RED, alpha=0.8)
ax.set_xlabel('Fraud rate (%)')
ax.set_title('Top 10 Highest Fraud Rate Merchants')
plt.tight_layout()
plt.show()

In [None]:
# Per state: number of transactions, number of frauds, fraud share in %.
# NOTE(review): unlike the merchant ranking, no minimum transaction count is
# applied here, so states with very few rows can rank highly - confirm intent.
state_rate = (
    df.groupby('state')['is_fraud']
    .agg(total='count', fraud='sum')
    .reset_index()
)
state_rate['fraud_rate'] = state_rate['fraud'].mul(100).div(state_rate['total'])
# Keep the 10 highest rates, ordered ascending so the largest bar is on top.
top_states = state_rate.nlargest(10, 'fraud_rate')
state_rate = top_states.sort_values('fraud_rate')

fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(state_rate['state'], state_rate['fraud_rate'], color=RED, alpha=0.8)
ax.set_title('Top 10 States by Fraud Rate')
ax.set_xlabel('Fraud rate (%)')
plt.tight_layout()
plt.show()

## 4. Geospatial Analysis

In [None]:
def haversine(lat1, lon1, lat2, lon2):
    """Calculate great-circle distance in km between two (lat, long) points.

    Applies the haversine formula on a sphere of radius 6371 km.

    Each argument is converted to radians on its own and the results are
    combined with NumPy broadcasting, so scalars and arrays may be mixed
    (e.g. one fixed origin against an array of destinations).

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in degrees.

    Returns
    -------
    float or numpy.ndarray
        Distance(s) in kilometres, with the broadcast shape of the inputs.
    """
    R = 6371  # Earth radius in km
    # Convert argument by argument rather than stacking them into one array,
    # which would force all four inputs to have identical shapes.
    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(angle, dtype=float))
        for angle in (lat1, lon1, lat2, lon2)
    )
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Clip guards against rounding pushing `a` marginally outside [0, 1],
    # which would make sqrt/arcsin return NaN.
    c = 2 * np.arcsin(np.sqrt(np.clip(a, 0, 1)))
    return R * c

# Distance in km between the two coordinate pairs on each row:
# (lat, long) and (merch_lat, merch_long).
coord_columns = ['lat', 'long', 'merch_lat', 'merch_long']
df['distance_km'] = haversine(*(df[col].values for col in coord_columns))

In [None]:
fig, ax = plt.subplots(figsize=(8, 4))
non_fraud_dist = df.loc[df['is_fraud'] == 0, 'distance_km']
fraud_dist = df.loc[df['is_fraud'] == 1, 'distance_km']

# The x-axis is logarithmic, so the bin edges must be log-spaced as well:
# equal-width linear bins are drawn progressively narrower on a log axis and
# distort the shape. Both classes share one set of edges so their bars are
# directly comparable.
positive_dist = df.loc[df['distance_km'] > 0, 'distance_km']  # a log axis cannot show values <= 0
dist_lo, dist_hi = positive_dist.min(), positive_dist.max()
# 51 edges -> 50 bins; fall back to 50 automatic bins when there is no
# usable positive range (empty data or a single distinct value).
dist_bins = np.geomspace(dist_lo, dist_hi, 51) if dist_lo < dist_hi else 50

ax.hist(non_fraud_dist, bins=dist_bins, alpha=0.5, color=GREEN, label='Non-fraud')
ax.hist(fraud_dist, bins=dist_bins, alpha=0.5, color=RED, label='Fraud')
ax.set_xscale('log')
ax.set_xlabel('Distance to merchant (km)')
ax.set_ylabel('Count')
ax.set_title('Distance to Merchant: Fraud vs Non-Fraud')
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# Scatter of the (long, lat) columns, one layer per class. Non-fraud is drawn
# first with small faint markers; fraud is drawn on top, larger and more opaque.
fig, ax = plt.subplots(figsize=(10, 6))
layers = (
    (0, GREEN, 0.3, 10, 'Non-fraud'),
    (1, RED, 0.8, 20, 'Fraud'),
)
for flag, colour, opacity, size, name in layers:
    points = df.loc[df['is_fraud'] == flag, ['long', 'lat']]
    ax.scatter(points['long'], points['lat'], c=colour, alpha=opacity, s=size, label=name)
ax.set_title('Transaction Locations (Red = Fraud)')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.legend()
plt.tight_layout()
plt.show()

## 5. Demographic Analysis

In [None]:
# Age in years at the time of each transaction: whole days between date of
# birth and transaction timestamp, divided by the mean year length (365.25).
df['dob'] = pd.to_datetime(df['dob'])
elapsed = df['trans_date_trans_time'] - df['dob']
df['age'] = elapsed.dt.days / 365.25

# Mean age per class.
for caption, flag in (('Mean age - Non-fraud:', 0), ('Mean age - Fraud:', 1)):
    print(caption, df.loc[df['is_fraud'] == flag, 'age'].mean())

In [None]:
# Overlaid age histograms, one per class (30 bins each).
fig, ax = plt.subplots(figsize=(8, 4))
for flag, colour, name in ((0, GREEN, 'Non-fraud'), (1, RED, 'Fraud')):
    ages = df.loc[df['is_fraud'] == flag, 'age']
    ax.hist(ages, bins=30, alpha=0.5, color=colour, label=name)
ax.set_title('Age Distribution: Fraud vs Non-Fraud')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
labels_age = ['18-30', '31-45', '46-60', '60+']
# Buckets are right-inclusive intervals over the fractional age:
# (17, 30], (30, 45], (45, 60], (60, 150]. Ages outside (17, 150] become NaN
# and are left out of the grouping below.
df['age_group'] = pd.cut(df['age'], bins=[17, 30, 45, 60, 150], labels=labels_age)

# observed=False keeps every bucket in the result, even an empty one, so the
# four bars always line up with the four colours below. Stating it explicitly
# also avoids the FutureWarning recent pandas versions emit when grouping by
# a categorical column without it.
age_rate = (
    df.groupby('age_group', observed=False)
    .agg(total=('is_fraud', 'count'), fraud=('is_fraud', 'sum'))
    .reset_index()
)
age_rate['fraud_rate'] = 100 * age_rate['fraud'] / age_rate['total']

fig, ax = plt.subplots(figsize=(6, 4))
bars = ax.bar(age_rate['age_group'].astype(str), age_rate['fraud_rate'], color=[GREEN, '#f1c40f', '#e67e22', RED], alpha=0.8)
ax.set_ylabel('Fraud rate (%)')
ax.set_title('Fraud Rate by Age Group')
for bar, pct in zip(bars, age_rate['fraud_rate']):
    # An empty bucket has a 0/0 = NaN rate; skip it rather than print "nan%".
    if pd.notna(pct):
        ax.annotate(f'{pct:.1f}%', xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
                    ha='center', va='bottom', fontsize=11)
plt.tight_layout()
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(df.loc[df['is_fraud'] == 0, 'city_pop'], bins=50, alpha=0.5, color=GREEN, label='Non-fraud')
ax.hist(df.loc[df['is_fraud'] == 1, 'city_pop'], bins=50, alpha=0.5, color=RED, label='Fraud')
ax.set_xscale('log')
ax.set_xlabel('City population')
ax.set_ylabel('Count')
ax.set_title('City Population: Fraud vs Non-Fraud')
ax.legend()
plt.tight_layout()
plt.show()

## 6. Correlation Analysis

In [None]:
# Pairwise correlation between all numeric columns of df, including the
# columns added by earlier cells.
numeric_df = df.select_dtypes(include=[np.number])
corr = numeric_df.corr()

# Diverging palette centred on zero, with each cell annotated to 2 decimals.
heatmap_options = {
    'annot': True,
    'fmt': '.2f',
    'cmap': 'coolwarm',
    'center': 0,
    'square': False,
    'linewidths': 0.5,
}
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr, ax=ax, **heatmap_options)
ax.set_title('Correlation Heatmap - Numeric Features')
plt.tight_layout()
plt.show()

In [None]:
BLUE = '#3498db'
# Correlation of every numeric column with the target, excluding the target
# itself. Columns with an undefined (NaN) correlation - e.g. constant columns -
# are dropped first: sort_values places NaN last, so they would otherwise
# end up in tail(10) and be shown as "top" features.
fraud_corr = (
    numeric_df.corr()['is_fraud']
    .drop('is_fraud', errors='ignore')
    .dropna()
    .sort_values(key=abs, ascending=True)
)
# Ascending sort by absolute value -> the last 10 entries are the strongest,
# with the strongest one drawn at the top of the horizontal bar chart.
top10 = fraud_corr.tail(10)

# Red for positive correlations, blue for negative ones.
colors = [RED if x >= 0 else BLUE for x in top10.values]
fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(top10.index, top10.values, color=colors)
ax.axvline(0, color='black', linewidth=0.5)  # zero reference line
ax.set_xlabel('Correlation with is_fraud')
ax.set_title('Top 10 Features Correlated with Fraud')
plt.tight_layout()
plt.show()

## 7. Key Insights Summary

1. **Class Imbalance**: The dataset has a fraud rate of only ~1.7%, so it requires careful handling: stratified sampling and metrics suited to imbalanced classes (PR-AUC rather than accuracy).

2. **Transaction Amount**: Fraudulent transactions tend to have higher amounts than legitimate ones. Log-transformed amount will be a strong feature.

3. **Time Patterns**: Fraud is more common during certain hours (late night/early morning). The hour of day and an `is_night` flag will be useful features.

4. **Category Risk**: Certain merchant categories have significantly higher fraud rates. Category-level fraud rate encoding will be valuable.

5. **Geospatial Signal**: Distance between cardholder and merchant location differs for fraud vs non-fraud. Distance features will be important predictors.

6. **Age Factor**: Certain age groups are more targeted by fraud. Age and age-group features should be included.

7. **Next Steps**: Feature engineering should focus on: log_amount, hour_of_day, distance_to_merchant, category_fraud_rate, age, and velocity features (transaction frequency per card).