In [1]:
"""
Fraud Detection - Exploratory Data Analysis
===========================================

This notebook provides comprehensive exploratory data analysis 
for the fraud detection dataset.
"""

# Cell 1: Import Libraries and Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Statistical libraries
from scipy import stats
from scipy.stats import chi2_contingency
import warnings
warnings.filterwarnings('ignore')

# Configure plotting
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("📊 Fraud Detection - Exploratory Data Analysis")
print("=" * 50)

ModuleNotFoundError: No module named 'plotly'

In [2]:
# Cell 2: Load and Overview Data
def load_fraud_data():
    """Return the fraud dataset, reading the first candidate file that exists.

    Candidate paths are tried in order; ``.parquet`` files go through
    ``pd.read_parquet`` and everything else through ``pd.read_csv``. A missing
    file just moves on to the next candidate. When no candidate exists, or
    when any other exception is raised along the way, the result of
    ``generate_sample_data()`` is returned instead.

    Returns:
        pandas.DataFrame: the loaded (or generated) data.
    """
    candidate_files = (
        '../data/raw/credit_card_transaction_train.csv',
        '../data/processed/train.parquet',
        '../data/raw/fraud_data.csv',
    )

    try:
        for path in candidate_files:
            # Pick the reader from the file extension
            reader = pd.read_parquet if path.endswith('.parquet') else pd.read_csv
            try:
                df = reader(path)
            except FileNotFoundError:
                # Not there -- try the next location
                continue
            else:
                print(f"✅ Data loaded from: {path}")
                return df

        # Nothing on disk: fall back to generated data
        print("⚠️ No data files found. Generating sample data...")
        return generate_sample_data()

    except Exception as e:
        # Any non-"file missing" failure also ends in generated data
        print(f"❌ Error loading data: {e}")
        return generate_sample_data()

def generate_sample_data(n_samples=5000):
    """Generate a synthetic fraud dataset for analysis.

    The global NumPy random state is re-seeded with 42 on every call, so the
    same ``n_samples`` always yields the same frame.

    Args:
        n_samples: number of rows to generate.

    Returns:
        pandas.DataFrame with one row per synthetic transaction: hourly
        timestamps starting at 2023-01-01, card/merchant/customer attributes,
        customer and merchant coordinates, and a 0/1 ``is_fraud`` label.
    """
    np.random.seed(42)

    # Hourly timestamps built from a timedelta offset rather than
    # date_range(freq='H'): the 'H' alias is deprecated since pandas 2.2,
    # while unit='h' is accepted by old and new versions alike.
    timestamps = pd.Timestamp('2023-01-01') + pd.to_timedelta(np.arange(n_samples), unit='h')

    # Basic transaction data
    data = {
        'trans_date_trans_time': timestamps,
        'cc_num': [f"{np.random.randint(1000, 9999)}{np.random.randint(1000, 9999)}{np.random.randint(1000, 9999)}{np.random.randint(1000, 9999)}" for _ in range(n_samples)],
        'merchant': [f"merchant_{np.random.randint(1, 1000)}" for _ in range(n_samples)],
        'category': np.random.choice(['grocery_pos', 'gas_transport', 'misc_net', 'grocery_net', 'entertainment', 'misc_pos'], n_samples),
        'amt': np.random.lognormal(3, 1, n_samples),  # Log-normal distribution for amounts
        'first': np.random.choice(['John', 'Jane', 'Michael', 'Sarah', 'David', 'Lisa'], n_samples),
        'last': np.random.choice(['Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia'], n_samples),
        'gender': np.random.choice(['M', 'F'], n_samples),
        'street': [f"{np.random.randint(1, 9999)} {np.random.choice(['Main', 'Oak', 'Pine', 'Elm'])} St" for _ in range(n_samples)],
        'city': [f"City_{np.random.randint(1, 100)}" for _ in range(n_samples)],
        'state': np.random.choice(['CA', 'TX', 'FL', 'NY', 'PA', 'IL', 'OH', 'GA', 'NC', 'MI'], n_samples),
        'zip': [f"{np.random.randint(10000, 99999)}" for _ in range(n_samples)],
        'lat': np.random.uniform(25, 49, n_samples),
        'long': np.random.uniform(-125, -66, n_samples),
        'city_pop': np.random.randint(1000, 500000, n_samples),
        'job': np.random.choice(['Engineer', 'Teacher', 'Doctor', 'Lawyer', 'Artist', 'Manager'], n_samples),
        # randint's upper bound is exclusive: 13 -> months 1..12, 29 -> days 1..28
        # (day 28 exists in every month, so every generated date is valid).
        'dob': [f"{np.random.randint(1950, 2000)}-{np.random.randint(1, 13):02d}-{np.random.randint(1, 29):02d}" for _ in range(n_samples)],
        'merch_lat': np.random.uniform(25, 49, n_samples),
        'merch_long': np.random.uniform(-125, -66, n_samples),
        'merch_zipcode': [f"{np.random.randint(10000, 99999)}" for _ in range(n_samples)]
    }

    df = pd.DataFrame(data)

    # Fraud score: three boolean risk signals plus uniform noise. The weights
    # sum to at most 1.0 and a row is labelled fraud when its score exceeds 0.6.
    fraud_probability = (
        (df['amt'] > df['amt'].quantile(0.95)) * 0.3 +  # High amounts
        (np.abs(df['lat'] - df['merch_lat']) > 5) * 0.2 +  # Far from merchant
        (pd.to_datetime(df['trans_date_trans_time']).dt.hour < 6) * 0.2 +  # Late night
        np.random.random(n_samples) * 0.3  # Random component
    )

    df['is_fraud'] = (fraud_probability > 0.6).astype(int)

    return df

# Load data. load_fraud_data() returns generated sample data when none of its
# candidate files is found, so check the "Data loaded from" message before
# interpreting the figures printed below.
df = load_fraud_data()

# Headline figures: frame dimensions, mean of the is_fraud column, and the
# min/max of the raw trans_date_trans_time column.
print("📈 Dataset Overview:")
print(f"   Shape: {df.shape}")
print(f"   Fraud Rate: {df['is_fraud'].mean():.2%}")
print(f"   Date Range: {df['trans_date_trans_time'].min()} to {df['trans_date_trans_time'].max()}")

✅ Data loaded from: ../data/raw/credit_card_transaction_train.csv
📈 Dataset Overview:
   Shape: (1296675, 24)
   Fraud Rate: 0.58%
   Date Range: 2019-01-01 00:00:18 to 2020-06-21 12:13:37


In [3]:
# Cell 3: Basic Data Information
def display_basic_info(df):
    """Print a structural overview of ``df``.

    Reports, in order: row/column counts and deep memory usage, how many
    columns there are of each dtype, a table of columns containing nulls
    (or a note that there are none), and the class balance of ``is_fraud``.

    Args:
        df: DataFrame that contains an ``is_fraud`` column.
    """
    row_count, column_count = df.shape

    print("🔍 Basic Dataset Information")
    print("-" * 40)

    # Size of the frame
    size_mb = df.memory_usage(deep=True).sum() / 1024**2
    print(f"Rows: {row_count:,}")
    print(f"Columns: {column_count}")
    print(f"Memory Usage: {size_mb:.2f} MB")

    # Column count per dtype
    print("\n📋 Data Types:")
    for dtype, n_columns in df.dtypes.value_counts().items():
        print(f"   {dtype}: {n_columns} columns")

    # Null counts, reported only for columns that actually have nulls
    null_counts = df.isnull().sum()
    null_share = (null_counts / len(df)) * 100
    has_nulls = null_counts > 0

    if not has_nulls.any():
        print("\n✅ No missing values detected")
    else:
        print("\n⚠️ Missing Values:")
        null_report = pd.DataFrame({
            'Missing Count': null_counts[has_nulls],
            'Percentage': null_share[has_nulls]
        }).round(2)
        print(null_report)

    # Class balance of the target
    print("\n🎯 Target Variable (is_fraud):")
    target = df['is_fraud']
    class_counts = target.value_counts()
    class_share = target.value_counts(normalize=True) * 100

    for value, count in class_counts.items():
        if value == 1:
            label = "Fraud"
        else:
            label = "Legitimate"
        print(f"   {label}: {count:,} ({class_share[value]:.2f}%)")

display_basic_info(df)

🔍 Basic Dataset Information
----------------------------------------
Rows: 1,296,675
Columns: 24
Memory Usage: 1044.88 MB

📋 Data Types:
   object: 12 columns
   int64: 6 columns
   float64: 6 columns

⚠️ Missing Values:
               Missing Count  Percentage
merch_zipcode         195973       15.11

🎯 Target Variable (is_fraud):
   Legitimate: 1,289,169 (99.42%)
   Fraud: 7,506 (0.58%)


In [None]:
# Cell 4: Transaction Amount Analysis
def analyze_transaction_amounts(df):
    """Report and plot how transaction amounts relate to fraud.

    Prints ``describe()`` statistics for ``df['amt']``, shows a 2x2 Plotly
    figure (overall histogram, per-class histograms, per-class box plots and
    fraud rate per amount bin), runs a two-sided Mann-Whitney U test between
    legitimate and fraudulent amounts, and prints per-class mean and median.

    Side effect: writes an ``amount_bin`` column to ``df`` holding the index
    of the equal-width bin (20 bins) each amount falls into.

    Args:
        df: DataFrame with ``amt`` and ``is_fraud`` columns.
    """

    print("💰 Transaction Amount Analysis")
    print("-" * 40)

    # Basic statistics
    amount_stats = df['amt'].describe()
    print("📊 Amount Statistics:")
    for stat, value in amount_stats.items():
        if stat == 'count':
            # The row count is not a monetary value: no "$" and no decimals.
            print(f"   {stat.title()}: {int(value):,}")
        else:
            print(f"   {stat.title()}: ${value:.2f}")

    # Create visualizations
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'Amount Distribution (All Transactions)',
            'Amount Distribution by Fraud Status',
            'Box Plot by Fraud Status',
            'Amount vs Fraud Rate by Bins'
        ),
        specs=[[{"secondary_y": False}, {"secondary_y": False}],
               [{"secondary_y": False}, {"secondary_y": False}]]
    )

    # Overall distribution
    fig.add_trace(
        go.Histogram(x=df['amt'], nbinsx=50, name='All Transactions', opacity=0.7),
        row=1, col=1
    )

    # Per-class amounts, reused by the plots, the test and the insights below
    legitimate_amounts = df[df['is_fraud'] == 0]['amt']
    fraud_amounts = df[df['is_fraud'] == 1]['amt']

    # Distribution by fraud status
    fig.add_trace(
        go.Histogram(x=legitimate_amounts, nbinsx=30, name='Legitimate', opacity=0.7),
        row=1, col=2
    )
    fig.add_trace(
        go.Histogram(x=fraud_amounts, nbinsx=30, name='Fraud', opacity=0.7),
        row=1, col=2
    )

    # Box plots
    fig.add_trace(
        go.Box(y=legitimate_amounts, name='Legitimate', boxpoints='outliers'),
        row=2, col=1
    )
    fig.add_trace(
        go.Box(y=fraud_amounts, name='Fraud', boxpoints='outliers'),
        row=2, col=1
    )

    # Fraud rate by amount bins (labels=False stores the integer bin index)
    df['amount_bin'] = pd.cut(df['amt'], bins=20, labels=False)
    fraud_by_bin = df.groupby('amount_bin')['is_fraud'].agg(['count', 'sum', 'mean']).reset_index()
    fraud_by_bin['fraud_rate'] = fraud_by_bin['mean']

    fig.add_trace(
        go.Scatter(x=fraud_by_bin['amount_bin'], y=fraud_by_bin['fraud_rate'],
                  mode='lines+markers', name='Fraud Rate'),
        row=2, col=2
    )

    fig.update_layout(height=800, showlegend=True, title_text="Transaction Amount Analysis")
    fig.show()

    # Mann-Whitney U test (non-parametric)
    statistic, p_value = stats.mannwhitneyu(legitimate_amounts, fraud_amounts, alternative='two-sided')

    print("\n🧮 Statistical Tests:")
    print(f"   Mann-Whitney U Test p-value: {p_value:.6f}")
    if p_value < 0.05:
        print("   ✅ Significant difference in amounts between fraud and legitimate transactions")
    else:
        print("   ❌ No significant difference in amounts")

    print("\n💡 Key Insights:")
    print(f"   - Average legitimate amount: ${legitimate_amounts.mean():.2f}")
    print(f"   - Average fraud amount: ${fraud_amounts.mean():.2f}")
    print(f"   - Median legitimate amount: ${legitimate_amounts.median():.2f}")
    print(f"   - Median fraud amount: ${fraud_amounts.median():.2f}")

analyze_transaction_amounts(df)

💰 Transaction Amount Analysis
----------------------------------------
📊 Amount Statistics:
   Count: $1296675.00
   Mean: $70.35
   Std: $160.32
   Min: $1.00
   25%: $9.65
   50%: $47.52
   75%: $83.14
   Max: $28948.90


NameError: name 'make_subplots' is not defined

In [None]:
# Cell 5: Temporal Pattern Analysis
def analyze_temporal_patterns(df):
    """Plot and summarise fraud rates along several time dimensions.

    Parses ``trans_date_trans_time`` and writes four derived columns to
    ``df``: ``datetime``, ``hour``, ``day_of_week`` (day name) and ``month``.
    It then shows a 2x2 Plotly figure (fraud rate by hour, fraud rate by
    weekday, transaction count by hour, fraud rate by month number) and
    prints the peak/safest hour, the highest/lowest weekday and the fraud
    rate inside versus outside hours 9-17 (both ends inclusive).

    Args:
        df: DataFrame with ``trans_date_trans_time`` and ``is_fraud`` columns.
    """
    print("⏰ Temporal Pattern Analysis")
    print("-" * 40)

    # Derived time columns (written back onto df)
    df['datetime'] = pd.to_datetime(df['trans_date_trans_time'])
    parsed = df['datetime'].dt
    df['hour'] = parsed.hour
    df['day_of_week'] = parsed.day_name()
    df['month'] = parsed.month

    def fraud_stats_by(key):
        """count / sum / mean of is_fraud per value of ``key``, as a flat frame."""
        return df.groupby(key)['is_fraud'].agg(['count', 'sum', 'mean']).reset_index()

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'Fraud Rate by Hour of Day',
            'Fraud Rate by Day of Week',
            'Transaction Volume by Hour',
            'Monthly Fraud Trends'
        )
    )

    # Panel (1, 1): fraud rate by hour
    by_hour = fraud_stats_by('hour')
    fig.add_trace(
        go.Scatter(x=by_hour['hour'], y=by_hour['mean'],
                  mode='lines+markers', name='Fraud Rate by Hour'),
        row=1, col=1
    )

    # Panel (1, 2): fraud rate by weekday, ordered Monday..Sunday via an
    # ordered categorical rather than alphabetically
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    by_day = fraud_stats_by('day_of_week')
    by_day['day_of_week'] = pd.Categorical(by_day['day_of_week'], categories=weekday_order, ordered=True)
    by_day = by_day.sort_values('day_of_week')
    fig.add_trace(
        go.Bar(x=by_day['day_of_week'], y=by_day['mean'], name='Fraud Rate by Day'),
        row=1, col=2
    )

    # Panel (2, 1): transaction volume by hour
    fig.add_trace(
        go.Bar(x=by_hour['hour'], y=by_hour['count'], name='Transaction Count'),
        row=2, col=1
    )

    # Panel (2, 2): fraud rate by month number
    by_month = fraud_stats_by('month')
    fig.add_trace(
        go.Scatter(x=by_month['month'], y=by_month['mean'],
                  mode='lines+markers', name='Monthly Fraud Rate'),
        row=2, col=2
    )

    fig.update_layout(height=800, showlegend=True, title_text="Temporal Pattern Analysis")
    fig.show()

    print("⏰ Temporal Insights:")

    # Hours with the highest and lowest fraud rate
    hourly_rate = by_hour['mean']
    worst_hour = by_hour.loc[hourly_rate.idxmax(), 'hour']
    print(f"   - Peak fraud hour: {worst_hour}:00 ({hourly_rate.max():.2%} fraud rate)")

    best_hour = by_hour.loc[hourly_rate.idxmin(), 'hour']
    print(f"   - Safest hour: {best_hour}:00 ({hourly_rate.min():.2%} fraud rate)")

    # Weekdays with the highest and lowest fraud rate
    daily_rate = by_day['mean']
    worst_day = by_day.loc[daily_rate.idxmax(), 'day_of_week']
    best_day = by_day.loc[daily_rate.idxmin(), 'day_of_week']
    print(f"   - Highest fraud day: {worst_day}")
    print(f"   - Lowest fraud day: {best_day}")

    # Inside vs outside business hours
    in_business_hours = df['hour'].between(9, 17)
    business_rate = df[in_business_hours]['is_fraud'].mean()
    after_hours_rate = df[~in_business_hours]['is_fraud'].mean()

    print(f"   - Business hours fraud rate: {business_rate:.2%}")
    print(f"   - After hours fraud rate: {after_hours_rate:.2%}")

analyze_temporal_patterns(df)

In [None]:
# Cell 6: Geographic Analysis
def analyze_geographic_patterns(df):
    """Plot and summarise fraud against location and customer-merchant distance.

    Computes the great-circle distance between the customer coordinates
    (``lat``/``long``) and the merchant coordinates
    (``merch_lat``/``merch_long``), shows a 2x2 Plotly figure (customer
    locations, distance histograms, fraud rate by state, fraud rate by
    distance bin) and prints distance-related fraud statistics.

    Side effects: writes ``distance_km`` and ``distance_bin`` (index of 20
    equal-width distance bins) to ``df``.

    Args:
        df: DataFrame with ``lat``, ``long``, ``merch_lat``, ``merch_long``,
            ``state`` and ``is_fraud`` columns.
    """

    print("🗺️ Geographic Pattern Analysis")
    print("-" * 40)

    # Calculate distance between customer and merchant
    def haversine_distance(lat1, lon1, lat2, lon2):
        """Great-circle distance in km (Haversine formula), element-wise."""
        lat1, lon1, lat2, lon2 = (
            np.radians(np.asarray(values, dtype=float))
            for values in (lat1, lon1, lat2, lon2)
        )
        dlat = lat2 - lat1
        dlon = lon2 - lon1
        a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
        # Rounding can push `a` marginally outside [0, 1]; clip so that
        # arcsin(sqrt(a)) stays defined for near-antipodal points.
        a = np.clip(a, 0.0, 1.0)
        return 2 * np.arcsin(np.sqrt(a)) * 6371  # Earth radius in km

    # Whole-column computation instead of a Python call per row
    df['distance_km'] = haversine_distance(
        df['lat'], df['long'], df['merch_lat'], df['merch_long']
    )

    # Create geographic visualizations
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'Customer Locations (Fraud vs Legitimate)',
            'Distance Distribution',
            'Fraud Rate by State',
            'Distance vs Fraud Rate'
        ),
        specs=[[{"type": "scattergeo"}, {"type": "histogram"}],
               [{"type": "bar"}, {"type": "scatter"}]]
    )

    # Geographic scatter plot: all fraud rows plus at most 1000 legitimate
    # rows. The sample is seeded so the map is identical on every run.
    is_fraud = df['is_fraud'] == 1
    is_legitimate = df['is_fraud'] == 0
    fraud_transactions = df[is_fraud]
    legitimate_rows = df[is_legitimate]
    legitimate_transactions = legitimate_rows.sample(
        n=min(1000, len(legitimate_rows)), random_state=42
    )

    fig.add_trace(
        go.Scattergeo(
            lon=legitimate_transactions['long'],
            lat=legitimate_transactions['lat'],
            mode='markers',
            marker=dict(size=4, color='blue', opacity=0.6),
            name='Legitimate'
        ),
        row=1, col=1
    )

    fig.add_trace(
        go.Scattergeo(
            lon=fraud_transactions['long'],
            lat=fraud_transactions['lat'],
            mode='markers',
            marker=dict(size=6, color='red', opacity=0.8),
            name='Fraud'
        ),
        row=1, col=1
    )

    # Distance distribution
    legitimate_distance = df[is_legitimate]['distance_km']
    fraud_distance = df[is_fraud]['distance_km']
    fig.add_trace(
        go.Histogram(x=legitimate_distance, name='Legitimate', opacity=0.7, nbinsx=50),
        row=1, col=2
    )
    fig.add_trace(
        go.Histogram(x=fraud_distance, name='Fraud', opacity=0.7, nbinsx=50),
        row=1, col=2
    )

    # Fraud rate by state
    state_fraud = df.groupby('state')['is_fraud'].agg(['count', 'sum', 'mean']).reset_index()
    state_fraud = state_fraud[state_fraud['count'] >= 10]  # Filter states with enough data
    state_fraud = state_fraud.sort_values('mean', ascending=False)

    fig.add_trace(
        go.Bar(x=state_fraud['state'], y=state_fraud['mean'], name='Fraud Rate by State'),
        row=2, col=1
    )

    # Distance vs fraud rate (bins with fewer than 10 rows are dropped)
    df['distance_bin'] = pd.cut(df['distance_km'], bins=20, labels=False)
    distance_fraud = df.groupby('distance_bin')['is_fraud'].agg(['count', 'mean']).reset_index()
    distance_fraud = distance_fraud[distance_fraud['count'] >= 10]

    fig.add_trace(
        go.Scatter(x=distance_fraud['distance_bin'], y=distance_fraud['mean'],
                  mode='lines+markers', name='Fraud Rate vs Distance'),
        row=2, col=2
    )

    fig.update_geos(scope="usa")
    fig.update_layout(height=800, showlegend=True, title_text="Geographic Pattern Analysis")
    fig.show()

    # Geographic insights
    print("🗺️ Geographic Insights:")
    print(f"   - Average distance (legitimate): {legitimate_distance.mean():.2f} km")
    print(f"   - Average distance (fraud): {fraud_distance.mean():.2f} km")
    print(f"   - Max distance: {df['distance_km'].max():.2f} km")

    # Distance analysis
    short_distance = df[df['distance_km'] < 10]
    long_distance = df[df['distance_km'] > 100]

    print(f"   - Short distance (<10km) fraud rate: {short_distance['is_fraud'].mean():.2%}")
    print(f"   - Long distance (>100km) fraud rate: {long_distance['is_fraud'].mean():.2%}")

    # Top fraud states (state_fraud is sorted by fraud rate, highest first)
    if len(state_fraud) > 0:
        top_fraud_state = state_fraud.iloc[0]
        print(f"   - Highest fraud state: {top_fraud_state['state']} ({top_fraud_state['mean']:.2%})")

analyze_geographic_patterns(df)

In [None]:
# Cell 7: Category and Merchant Analysis
def analyze_categories_merchants(df):
    """Break fraud down by transaction category and by merchant.

    Prints a per-category table (transaction count, fraud count, fraud rate,
    mean and median amount) sorted by fraud rate, shows a 2x2 Plotly figure
    (fraud rate by category, volume by category, the 20 merchants with the
    highest fraud rate, amount box plot per category), runs a chi-square test
    on the category x ``is_fraud`` contingency table and prints merchant-level
    figures. Merchant statistics only cover merchants with at least 5
    transactions.

    Args:
        df: DataFrame with ``category``, ``merchant``, ``amt`` and
            ``is_fraud`` columns.
    """
    print("🏪 Category and Merchant Analysis")
    print("-" * 40)

    # Per-category statistics, rounded to 4 decimals, riskiest first
    per_category = (
        df.groupby('category')
        .agg(
            transaction_count=('is_fraud', 'count'),
            fraud_count=('is_fraud', 'sum'),
            fraud_rate=('is_fraud', 'mean'),
            avg_amount=('amt', 'mean'),
            median_amount=('amt', 'median'),
        )
        .round(4)
        .sort_values('fraud_rate', ascending=False)
    )

    print("📊 Category Analysis:")
    print(per_category)

    # Per-merchant statistics
    per_merchant = (
        df.groupby('merchant')
        .agg(
            transaction_count=('is_fraud', 'count'),
            fraud_count=('is_fraud', 'sum'),
            fraud_rate=('is_fraud', 'mean'),
            avg_amount=('amt', 'mean'),
        )
        .round(4)
    )
    enough_volume = per_merchant['transaction_count'] >= 5  # Filter for reliability
    per_merchant = per_merchant[enough_volume].sort_values('fraud_rate', ascending=False)

    # Figure with four panels
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'Fraud Rate by Category',
            'Transaction Volume by Category',
            'Top 20 Riskiest Merchants',
            'Amount Distribution by Category'
        )
    )

    category_names = per_category.index

    # Panel (1, 1): fraud rate by category
    fig.add_trace(
        go.Bar(x=category_names, y=per_category['fraud_rate'],
               name='Fraud Rate by Category'),
        row=1, col=1
    )

    # Panel (1, 2): transaction volume by category
    fig.add_trace(
        go.Bar(x=category_names, y=per_category['transaction_count'],
               name='Transaction Count'),
        row=1, col=2
    )

    # Panel (2, 1): the 20 merchants with the highest fraud rate
    riskiest = per_merchant.head(20)
    fig.add_trace(
        go.Bar(x=riskiest.index, y=riskiest['fraud_rate'],
               name='Top Risky Merchants'),
        row=2, col=1
    )

    # Panel (2, 2): one amount box per category, in order of first appearance
    for category in df['category'].unique():
        amounts = df[df['category'] == category]['amt']
        fig.add_trace(
            go.Box(y=amounts, name=category, showlegend=False),
            row=2, col=2
        )

    fig.update_layout(height=800, showlegend=True, title_text="Category and Merchant Analysis")
    fig.update_xaxes(tickangle=45)
    fig.show()

    print("\n📈 Category Insights:")

    # per_category is sorted by fraud rate, so the extremes sit at both ends
    riskiest_category = category_names[0]
    safest_category = category_names[-1]
    riskiest_rate = per_category.loc[riskiest_category, 'fraud_rate']
    safest_rate = per_category.loc[safest_category, 'fraud_rate']

    print(f"   - Highest risk category: {riskiest_category} ({riskiest_rate:.2%})")
    print(f"   - Lowest risk category: {safest_category} ({safest_rate:.2%})")

    # Chi-square test of independence between category and fraud label
    contingency = pd.crosstab(df['category'], df['is_fraud'])
    chi2, p_value, dof, expected = chi2_contingency(contingency)

    print("\n🧮 Statistical Tests:")
    print(f"   - Chi-square test p-value: {p_value:.6f}")
    if not p_value < 0.05:
        print("   ❌ No significant association between category and fraud")
    else:
        print("   ✅ Significant association between category and fraud")

    # Merchant-level figures
    merchants_with_fraud = per_merchant[per_merchant['fraud_count'] > 0]
    print("\n🏪 Merchant Insights:")
    print(f"   - Total unique merchants: {df['merchant'].nunique()}")
    print(f"   - Merchants with fraud: {merchants_with_fraud.shape[0]}")

    if len(per_merchant) > 0:
        mean_merchant_rate = per_merchant['fraud_rate'].mean()
        print(f"   - Average merchant fraud rate: {mean_merchant_rate:.2%}")

analyze_categories_merchants(df)

In [None]:
# Cell 8: Customer Demographics Analysis
def analyze_demographics(df):
    """Break fraud down by customer gender, age group and job.

    Prints per-gender, per-age-group and per-job fraud tables, shows a 2x2
    Plotly figure (fraud rate by gender, fraud rate by age group, age
    histograms per class, the 10 jobs with the highest fraud rate) and prints
    summary insights.

    Age is measured at the time of each transaction
    (``trans_date_trans_time`` minus ``dob``), so results do not depend on
    when the notebook is run. If ``trans_date_trans_time`` is absent, the
    current timestamp is used as the reference instead. Unparseable dates
    give a missing age, which is filled with the median age.

    Side effects: writes ``dob_date``, ``age`` and ``age_group`` to ``df``.

    Args:
        df: DataFrame with ``dob``, ``gender``, ``job`` and ``is_fraud``
            columns, and normally ``trans_date_trans_time``.
    """

    print("👥 Customer Demographics Analysis")
    print("-" * 40)

    # Calculate age from date of birth, relative to the transaction time
    df['dob_date'] = pd.to_datetime(df['dob'], errors='coerce')
    if 'trans_date_trans_time' in df.columns:
        reference_time = pd.to_datetime(df['trans_date_trans_time'], errors='coerce')
    else:
        reference_time = pd.Timestamp.now()
    df['age'] = (reference_time - df['dob_date']).dt.days / 365.25
    df['age'] = df['age'].fillna(df['age'].median())  # Fill missing ages

    # Create age bins. The last edge is open-ended to match the '65+' label;
    # a finite cap would leave older customers without any age group.
    df['age_group'] = pd.cut(df['age'],
                            bins=[0, 25, 35, 50, 65, np.inf],
                            labels=['18-25', '26-35', '36-50', '51-65', '65+'])

    # Gender analysis
    gender_analysis = df.groupby('gender')['is_fraud'].agg(['count', 'sum', 'mean']).round(4)
    gender_analysis.columns = ['transaction_count', 'fraud_count', 'fraud_rate']

    # Age group analysis. observed=False is explicit so that empty age groups
    # stay in the table regardless of the installed pandas version's default.
    age_analysis = df.groupby('age_group', observed=False)['is_fraud'].agg(['count', 'sum', 'mean']).round(4)
    age_analysis.columns = ['transaction_count', 'fraud_count', 'fraud_rate']

    # Job analysis
    job_analysis = df.groupby('job')['is_fraud'].agg(['count', 'sum', 'mean']).round(4)
    job_analysis.columns = ['transaction_count', 'fraud_count', 'fraud_rate']
    job_analysis = job_analysis.sort_values('fraud_rate', ascending=False)

    print("👤 Demographic Analysis Results:")
    print("\n📊 Gender Analysis:")
    print(gender_analysis)

    print("\n📊 Age Group Analysis:")
    print(age_analysis)

    print("\n📊 Top 10 Jobs by Fraud Rate:")
    print(job_analysis.head(10))

    # Create visualizations
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'Fraud Rate by Gender',
            'Fraud Rate by Age Group',
            'Age Distribution (Fraud vs Legitimate)',
            'Top 10 Jobs by Fraud Rate'
        )
    )

    # Gender analysis
    fig.add_trace(
        go.Bar(x=gender_analysis.index, y=gender_analysis['fraud_rate'], name='Gender'),
        row=1, col=1
    )

    # Age group analysis
    fig.add_trace(
        go.Bar(x=age_analysis.index, y=age_analysis['fraud_rate'], name='Age Group'),
        row=1, col=2
    )

    # Age distribution
    fraud_ages = df[df['is_fraud'] == 1]['age']
    legitimate_ages = df[df['is_fraud'] == 0]['age']
    fig.add_trace(
        go.Histogram(x=legitimate_ages, name='Legitimate', opacity=0.7, nbinsx=30),
        row=2, col=1
    )
    fig.add_trace(
        go.Histogram(x=fraud_ages, name='Fraud', opacity=0.7, nbinsx=30),
        row=2, col=1
    )

    # Top jobs by fraud rate
    top_jobs = job_analysis.head(10)
    fig.add_trace(
        go.Bar(x=top_jobs.index, y=top_jobs['fraud_rate'], name='Job Risk'),
        row=2, col=2
    )

    fig.update_layout(height=800, showlegend=True, title_text="Customer Demographics Analysis")
    fig.update_xaxes(tickangle=45, row=2, col=2)
    fig.show()

    # Demographic insights
    print("\n💡 Demographic Insights:")

    # Age insights
    print(f"   - Average fraud customer age: {fraud_ages.mean():.1f} years")
    print(f"   - Average legitimate customer age: {legitimate_ages.mean():.1f} years")

    # Gender insights
    if len(gender_analysis) > 1:
        high_risk_gender = gender_analysis['fraud_rate'].idxmax()
        print(f"   - Higher risk gender: {high_risk_gender} ({gender_analysis.loc[high_risk_gender, 'fraud_rate']:.2%})")

    # Age group insights (idxmax/idxmin skip age groups with no rows)
    high_risk_age = age_analysis['fraud_rate'].idxmax()
    low_risk_age = age_analysis['fraud_rate'].idxmin()
    print(f"   - Highest risk age group: {high_risk_age} ({age_analysis.loc[high_risk_age, 'fraud_rate']:.2%})")
    print(f"   - Lowest risk age group: {low_risk_age} ({age_analysis.loc[low_risk_age, 'fraud_rate']:.2%})")

analyze_demographics(df)

In [None]:
# Cell 9: Correlation Analysis
def analyze_correlations(df):
    """Correlate the numeric columns of ``df`` with each other and with fraud.

    Takes every numeric column except ``cc_num``, ``merch_zipcode`` and
    ``unix_time``, shows the correlation matrix as a Plotly heatmap, prints
    the ten columns with the largest absolute correlation to ``is_fraud`` and
    up to five column pairs whose absolute correlation exceeds 0.7.

    Args:
        df: DataFrame containing a numeric ``is_fraud`` column.

    Returns:
        pandas.DataFrame: the correlation matrix of the selected columns.
    """
    print("🔗 Correlation Analysis")
    print("-" * 40)

    # Numeric columns, minus identifier-like ones that carry no meaningful
    # correlation
    skipped = ('cc_num', 'merch_zipcode', 'unix_time')
    feature_names = [
        name
        for name in df.select_dtypes(include=[np.number]).columns.tolist()
        if name not in skipped
    ]

    correlation_matrix = df[feature_names].corr()
    labels = correlation_matrix.columns

    # Heatmap with the rounded coefficient written in each cell
    heatmap = go.Heatmap(
        z=correlation_matrix.values,
        x=labels,
        y=labels,
        colorscale='RdBu',
        zmid=0,
        text=correlation_matrix.round(3).values,
        texttemplate="%{text}",
        textfont={"size": 8},
        hoverongaps=False
    )
    fig = go.Figure(data=heatmap)
    fig.update_layout(
        title="Feature Correlation Matrix",
        height=600,
        width=800
    )
    fig.show()

    # Absolute correlation with the target, largest first, target itself removed
    by_strength = correlation_matrix['is_fraud'].abs().sort_values(ascending=False)
    by_strength = by_strength[by_strength.index != 'is_fraud']

    print("🎯 Features Most Correlated with Fraud:")
    for feature, strength in by_strength.head(10).items():
        print(f"   {feature}: {strength:.4f}")

    print("\n🔍 Key Relationships:")

    # Keep only the strict upper triangle so each pair is visited once;
    # masked cells become NaN and never pass the threshold test
    pair_mask = np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
    upper_triangle = correlation_matrix.where(pair_mask)

    strong_pairs = [
        (row_name, col_name, upper_triangle.loc[row_name, col_name])
        for col_name in upper_triangle.columns
        for row_name in upper_triangle.index
        if abs(upper_triangle.loc[row_name, col_name]) > 0.7
    ]

    if strong_pairs:
        print("   Strong correlations (>0.7):")
        for first, second, value in strong_pairs[:5]:
            print(f"     {first} ↔ {second}: {value:.3f}")

    return correlation_matrix

correlation_matrix = analyze_correlations(df)

In [None]:
# Cell 10: Summary and Key Findings
def generate_summary(df):
    """Print a written summary of the fraud analysis.

    Sections, in order: dataset overview, key fraud patterns (amount, peak
    hour, distance, category), data quality, recommended features, modeling
    considerations and business insights. The distance figures and the
    recommended-features list are only printed when ``df`` already has a
    ``distance_km`` column.

    Side effect: writes an ``hour`` column (hour of
    ``trans_date_trans_time``) to ``df``.

    Args:
        df: DataFrame with ``is_fraud``, ``amt``, ``category`` and
            ``trans_date_trans_time`` columns.
    """
    print("📋 FRAUD DETECTION - DATA ANALYSIS SUMMARY")
    print("=" * 60)

    # --- Dataset overview ---
    timestamps = df['trans_date_trans_time']
    print("📊 Dataset Overview:")
    print(f"   • Total transactions: {len(df):,}")
    print(f"   • Fraud transactions: {df['is_fraud'].sum():,}")
    print(f"   • Fraud rate: {df['is_fraud'].mean():.2%}")
    print(f"   • Time period: {timestamps.min()} to {timestamps.max()}")

    # --- Key fraud patterns ---
    print("\n🎯 Key Fraud Patterns Identified:")

    # Amounts: only reported when fraud is the more expensive class on average
    fraud_amounts = df.loc[df['is_fraud'] == 1, 'amt']
    legitimate_amounts = df.loc[df['is_fraud'] == 0, 'amt']
    fraud_mean = fraud_amounts.mean()
    legitimate_mean = legitimate_amounts.mean()

    if fraud_mean > legitimate_mean:
        print("   • Fraud transactions have higher average amounts")
        print(f"     - Fraud avg: ${fraud_mean:.2f}")
        print(f"     - Legitimate avg: ${legitimate_mean:.2f}")

    # Hour of day with the highest fraud rate
    df['hour'] = pd.to_datetime(df['trans_date_trans_time']).dt.hour
    rate_by_hour = df.groupby('hour')['is_fraud'].mean()
    peak_hour = rate_by_hour.idxmax()
    print(f"   • Peak fraud hour: {peak_hour}:00 ({rate_by_hour[peak_hour]:.2%} fraud rate)")

    # Distance: needs the column produced by the geographic analysis
    has_distance = 'distance_km' in df.columns
    if has_distance:
        distance = df['distance_km']
        far_rate = df[distance > 100]['is_fraud'].mean()
        near_rate = df[distance <= 10]['is_fraud'].mean()
        print(f"   • Long distance transactions (>100km) have {far_rate:.2%} fraud rate")
        print(f"   • Short distance transactions (≤10km) have {near_rate:.2%} fraud rate")

    # Category with the highest fraud rate
    category_fraud = df.groupby('category')['is_fraud'].mean().sort_values(ascending=False)
    highest_risk_category = category_fraud.index[0]
    print(f"   • Highest risk category: {highest_risk_category} ({category_fraud.iloc[0]:.2%})")

    # --- Data quality ---
    print("\n✅ Data Quality:")
    total_cells = len(df) * len(df.columns)
    missing_percentage = (df.isnull().sum().sum() / total_cells) * 100
    print(f"   • Missing data: {missing_percentage:.2f}%")
    print(f"   • Duplicate transactions: {df.duplicated().sum()}")

    # --- Feature recommendations ---
    print("\n🔧 Recommended Features for Modeling:")
    if has_distance:
        for recommendation in (
            "   • Transaction amount (continuous and log-transformed)",
            "   • Customer-merchant distance",
            "   • Time-based features (hour, day of week)",
            "   • Geographic features (state, city population)",
            "   • Category encoding",
            "   • Customer demographics (age, gender)",
        ):
            print(recommendation)

    # --- Modeling considerations ---
    print("\n🤖 Modeling Considerations:")
    for consideration in (
        "   • Class imbalance: Consider SMOTE, class weights, or threshold tuning",
        "   • Feature scaling: Standardize numerical features",
        "   • Categorical encoding: Use target encoding for high-cardinality features",
        "   • Cross-validation: Use time-based splits for temporal data",
        "   • Evaluation metrics: Focus on precision, recall, and AUC",
    ):
        print(consideration)

    # --- Business insights ---
    print("\n💼 Business Insights:")
    total_fraud_amount = fraud_amounts.sum()
    print(f"   • Total fraud amount in dataset: ${total_fraud_amount:,.2f}")
    print(f"   • Average fraud loss per transaction: ${fraud_amounts.mean():.2f}")

    if len(category_fraud) > 1:
        print(f"   • Focus fraud prevention on {highest_risk_category} category")

    print("   • Implement real-time distance checking for transactions >100km")
    print(f"   • Enhanced monitoring during peak fraud hours ({peak_hour}:00)")

    print("\n🎉 Analysis Complete! Dataset is ready for feature engineering and modeling.")

generate_summary(df)

In [None]:
# Cell 11: Export Processed Data
def _haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between paired coordinates.

    All four arguments are array-likes (or scalars) of coordinates in degrees.
    The computation is vectorised with NumPy so entire columns are processed
    at once instead of one row at a time.
    """
    earth_radius_km = 6371
    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(values, dtype=float))
        for values in (lat1, lon1, lat2, lon2)
    )
    a = (
        np.sin((lat2 - lat1) / 2) ** 2
        + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2
    )
    # Floating-point rounding can push `a` marginally outside [0, 1], which
    # would make arcsin yield NaN for (near-)antipodal points.
    a = np.clip(a, 0.0, 1.0)
    return 2 * np.arcsin(np.sqrt(a)) * earth_radius_km


def export_processed_data(df, output_dir='../data/processed/'):
    """Export the analysed dataset, train/validation/test splits and metadata.

    The input frame is NOT modified: engineered columns are added to an
    internal copy, which is what gets written to disk.

    Files written to ``output_dir``:
        * fraud_data_analyzed.parquet / fraud_data_analyzed.csv - full dataset
        * train_analyzed.parquet, validation_analyzed.parquet,
          test_analyzed.parquet - stratified 60/20/20 split
        * feature_summary.json - column lists, fraud rate, size, date range

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``is_fraud`` and ``trans_date_trans_time``. When
        ``distance_km`` is absent, ``lat``, ``long``, ``merch_lat`` and
        ``merch_long`` are also required so the distance can be derived.
    output_dir : str, optional
        Destination directory (created if missing). Defaults to the
        notebook's ``../data/processed/`` folder.

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    import json
    import os

    # Imported up front so a missing dependency fails before any file is written.
    from sklearn.model_selection import train_test_split

    print("💾 Exporting Processed Data")
    print("-" * 30)

    # Validate inputs before touching the filesystem
    required = {'is_fraud', 'trans_date_trans_time'}
    if 'distance_km' not in df.columns:
        required |= {'lat', 'long', 'merch_lat', 'merch_long'}
    missing = sorted(required - set(df.columns))
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    def out_path(file_name):
        return os.path.join(output_dir, file_name)

    # Work on a copy so the caller's DataFrame is left untouched
    export_df = df.copy()

    # Add engineered features used in analysis
    if 'distance_km' not in export_df.columns:
        export_df['distance_km'] = _haversine_km(
            export_df['lat'], export_df['long'],
            export_df['merch_lat'], export_df['merch_long'],
        )

    # Add time features
    export_df['datetime'] = pd.to_datetime(export_df['trans_date_trans_time'])
    export_df['hour'] = export_df['datetime'].dt.hour
    export_df['day_of_week'] = export_df['datetime'].dt.dayofweek
    export_df['is_weekend'] = (export_df['day_of_week'] >= 5).astype(int)

    # 1. Full dataset as parquet (efficient storage)
    export_df.to_parquet(out_path('fraud_data_analyzed.parquet'), index=False)
    print("✅ Exported full dataset to: fraud_data_analyzed.parquet")

    # 2. CSV for compatibility
    export_df.to_csv(out_path('fraud_data_analyzed.csv'), index=False)
    print("✅ Exported full dataset to: fraud_data_analyzed.csv")

    # 3. Split into train/validation/test sets
    # NOTE(review): this is a random stratified split; it ignores transaction
    # time, so later transactions can end up in train. Confirm that is intended
    # before using these files for temporal evaluation.

    # First split: train+val vs test (80-20)
    train_val, test = train_test_split(
        export_df, test_size=0.2, stratify=export_df['is_fraud'], random_state=42
    )

    # Second split: train vs val (75-25 of the remainder -> 60/20/20 overall)
    train, val = train_test_split(
        train_val, test_size=0.25, stratify=train_val['is_fraud'], random_state=42
    )

    # Export splits
    train.to_parquet(out_path('train_analyzed.parquet'), index=False)
    val.to_parquet(out_path('validation_analyzed.parquet'), index=False)
    test.to_parquet(out_path('test_analyzed.parquet'), index=False)

    print(f"✅ Exported train set: {len(train):,} samples ({train['is_fraud'].mean():.2%} fraud)")
    print(f"✅ Exported validation set: {len(val):,} samples ({val['is_fraud'].mean():.2%} fraud)")
    print(f"✅ Exported test set: {len(test):,} samples ({test['is_fraud'].mean():.2%} fraud)")

    # 4. Export feature summary
    feature_summary = {
        'total_features': len(export_df.columns),
        'numerical_features': list(export_df.select_dtypes(include=[np.number]).columns),
        'categorical_features': list(export_df.select_dtypes(include=['object', 'category']).columns),
        'target_variable': 'is_fraud',
        'fraud_rate': float(export_df['is_fraud'].mean()),
        'sample_size': len(export_df),
        'date_range': {
            # Use the parsed timestamps: min/max on raw strings would compare
            # lexicographically rather than chronologically.
            'start': str(export_df['datetime'].min()),
            'end': str(export_df['datetime'].max())
        }
    }

    with open(out_path('feature_summary.json'), 'w', encoding='utf-8') as f:
        json.dump(feature_summary, f, indent=2)

    print("✅ Exported feature summary to: feature_summary.json")
    print(f"\n🎯 Ready for model training with {len(export_df)} samples and {len(export_df.columns)} features!")

# Write the processed dataset, its train/validation/test splits and the feature summary to disk
export_processed_data(df)

In [None]:
# Cell 12: Closing recap - lists the exported files, suggested next steps and key findings
print("""
🎉 EXPLORATORY DATA ANALYSIS COMPLETE!
=====================================

📁 Files Created:
   • fraud_data_analyzed.parquet/csv - Full processed dataset
   • train_analyzed.parquet - Training set
   • validation_analyzed.parquet - Validation set  
   • test_analyzed.parquet - Test set
   • feature_summary.json - Feature metadata

🚀 Next Steps:
   1. Run feature engineering notebook (02_feature_engineering.ipynb)
   2. Train models using processed data (03_model_training.ipynb)
   3. Evaluate model performance (04_model_evaluation.ipynb)
   4. Deploy best model to production API

💡 Key Findings:
   • Fraud patterns identified in amounts, time, and geography
   • Data quality is suitable for machine learning
   • Clear feature candidates for model training
   • Business insights for fraud prevention strategy
""")