# Exploratory Data Analysis: Anomaly Detection

This notebook visualizes the panel data to spot anomalies in FHTC (Functional Household Tap Connection) coverage before econometric modeling.

## Objectives:
1. Load the final panel dataset
2. Trend Analysis: Visualize average FHTC Coverage over time
3. Correlation Check: Examine relationship between FHTC Coverage and health outcomes
4. Bias Inspection: Identify and visualize suspicious spikes in the data


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys
import warnings
import random

# Suppress warnings for cleaner output.
# NOTE: this silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Set seaborn styling
sns.set_style("whitegrid")
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 10

# Project root: the parent directory when the kernel runs inside `notebooks/`,
# otherwise the current working directory.
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == 'notebooks' else Path.cwd()

# Add the project root to the import path so `config` can be found
sys.path.insert(0, str(PROJECT_ROOT))

try:
    from config import FILE_PATHS
    print("✓ Config imported successfully")
except ImportError:
    # Fallback paths if config not available. They are anchored to the project
    # root so they resolve to the same location whether the notebook is started
    # from the repository root or from `notebooks/`.
    FILE_PATHS = {
        "data": {
            "raw": str(PROJECT_ROOT / "data" / "raw"),
            "processed": str(PROJECT_ROOT / "data" / "processed")
        }
    }
    print("⚠ Using fallback paths")

print("✓ Libraries imported successfully")


## Step 1: Load Data

Load the final panel dataset from the processed data directory.


In [None]:
# Read final_panel.csv from the configured processed-data directory into `df`
panel_file = Path(FILE_PATHS["data"]["processed"]) / "final_panel.csv"
print(f"Loading panel data from: {panel_file}")

try:
    df = pd.read_csv(panel_file)
except FileNotFoundError:
    # Point the reader at the upstream notebook, then let the error propagate
    print(f"⚠ Error: File not found at {panel_file}")
    print("Please run the 01_Data_Preparation notebook first to create the panel dataset.")
    raise
except Exception as e:
    # Any other read failure is reported and re-raised unchanged
    print(f"⚠ Error loading data: {e}")
    raise
else:
    row_count, column_count = df.shape
    print("✓ Panel data loaded successfully!")
    print(f"  Shape: {row_count:,} rows × {column_count} columns")
    print("\nColumn names:")
    for position, column_name in enumerate(df.columns, start=1):
        print(f"  {position:2d}. {column_name}")

# Preview the first rows with the rich notebook display
print("\nFirst few rows:")
display(df.head())


In [None]:
# Identify key columns for analysis.
# Sets four notebook-level names used by later cells: fhtc_col, date_col,
# district_col and spike_col (each is None when no matching column exists).
print("="*60)
print("IDENTIFYING KEY COLUMNS")
print("="*60)


def _first_column_matching(columns, keywords):
    """Return the first column whose lowercase name contains a keyword, else None.

    Keywords are tried in the order given, so an earlier (more specific) keyword
    wins over a later (more generic) one regardless of column order.
    """
    for keyword in keywords:
        for col in columns:
            if keyword in str(col).lower():
                return col
    return None


# Find FHTC coverage column (most specific keyword first, so a generic
# "...coverage..." column cannot shadow the real FHTC column)
fhtc_keywords = ['fhtc_coverage', 'fhtc', 'coverage']
fhtc_col = _first_column_matching(df.columns, fhtc_keywords)

if fhtc_col:
    print(f"✓ FHTC Coverage column found: '{fhtc_col}'")
else:
    print("⚠ FHTC Coverage column not found. Please check column names.")
    print("Available columns:", list(df.columns))

# Find date column
date_col = None
date_keywords = ['date', 'time', 'period', 'month']
date_candidates = [col for col in df.columns
                   if any(keyword in str(col).lower() for keyword in date_keywords)]

# First pass: columns that are already datetime or are explicitly named "date"
for col in date_candidates:
    if pd.api.types.is_datetime64_any_dtype(df[col]) or 'date' in str(col).lower():
        date_col = col
        break

# Second pass: text columns named time/period/month whose non-missing values all
# parse as dates. Numeric columns are skipped because pd.to_datetime would read
# plain numbers as epoch offsets rather than calendar dates.
if date_col is None:
    for col in date_candidates:
        if pd.api.types.is_numeric_dtype(df[col]):
            continue
        non_missing = df[col].notna().sum()
        parsed_candidate = pd.to_datetime(df[col], errors='coerce')
        if non_missing > 0 and parsed_candidate.notna().sum() == non_missing:
            date_col = col
            break

if date_col:
    print(f"✓ Date column found: '{date_col}'")
    # Convert to datetime if not already
    if not pd.api.types.is_datetime64_any_dtype(df[date_col]):
        missing_before = df[date_col].isna().sum()
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
        # errors='coerce' turns unparseable values into NaT; report how many
        unparsed = df[date_col].isna().sum() - missing_before
        if unparsed > 0:
            print(f"⚠ {unparsed} value(s) in '{date_col}' could not be parsed and were set to NaT")
else:
    print("⚠ Date column not found. Please check column names.")

# Find district column ('district' is tried before the looser 'dist')
district_keywords = ['district', 'dist']
district_col = _first_column_matching(df.columns, district_keywords)

if district_col:
    print(f"✓ District column found: '{district_col}'")
else:
    print("⚠ District column not found.")

# Find suspicious_spike column
spike_col = None
if 'suspicious_spike' in df.columns:
    spike_col = 'suspicious_spike'
    print(f"✓ Suspicious spike column found: '{spike_col}'")
else:
    print("⚠ Suspicious spike column not found. Bias inspection may not work.")


## Step 2: Trend Analysis

Plot the average FHTC Coverage over time across all districts.


In [None]:
# Trend Analysis: Average FHTC Coverage over time
if fhtc_col and date_col:
    print("="*60)
    print("TREND ANALYSIS: Average FHTC Coverage Over Time")
    print("="*60)

    # Mean and standard deviation per date from a single groupby, so both
    # series share the same date ordering.
    per_date = df.groupby(date_col)[fhtc_col].agg(['mean', 'std']).sort_index()
    trend_data = per_date['mean'].rename(fhtc_col).reset_index()
    std_by_date = per_date['std'].to_numpy()

    if trend_data.empty:
        # Nothing to plot or summarise (e.g. every date is missing)
        print("⚠ No dated FHTC observations available; skipping trend plot")
    else:
        fig, ax = plt.subplots(figsize=(14, 7))

        # Plot the trend line (labelled so the legend always has an entry)
        sns.lineplot(data=trend_data, x=date_col, y=fhtc_col,
                     marker='o', linewidth=2.5, markersize=8, color='#2E86AB',
                     label='Mean coverage', ax=ax)

        # Shaded band of ±1 standard deviation across observations on each date
        # (a dispersion band, not a confidence interval)
        if len(trend_data) > 1:
            mean_values = trend_data[fhtc_col].to_numpy()
            ax.fill_between(trend_data[date_col],
                            mean_values - std_by_date,
                            mean_values + std_by_date,
                            alpha=0.2, color='#2E86AB', label='±1 Std Dev')

        ax.set_title('Average FHTC Coverage Over Time (All Districts)',
                     fontsize=16, fontweight='bold', pad=20)
        ax.set_xlabel('Date', fontsize=12, fontweight='bold')
        ax.set_ylabel('Average FHTC Coverage (%)', fontsize=12, fontweight='bold')
        ax.grid(True, alpha=0.3, linestyle='--')
        ax.legend()
        fig.tight_layout()
        plt.show()

        # Print summary statistics
        print(f"\nSummary Statistics:")
        print(f"  Mean Coverage: {trend_data[fhtc_col].mean():.2f}%")
        print(f"  Min Coverage: {trend_data[fhtc_col].min():.2f}%")
        print(f"  Max Coverage: {trend_data[fhtc_col].max():.2f}%")

        # Relative change between the first and last date; undefined when the
        # baseline mean is zero or missing
        first_value = trend_data[fhtc_col].iloc[0]
        last_value = trend_data[fhtc_col].iloc[-1]
        if pd.isna(first_value) or first_value == 0:
            print("  Overall Trend: n/a (baseline coverage is zero or missing)")
        else:
            print(f"  Overall Trend: {((last_value - first_value) / first_value * 100):.2f}% change")

else:
    print("⚠ Cannot perform trend analysis: Missing required columns (FHTC or Date)")


## Step 3: Correlation Check

Create a scatter plot to examine the relationship between FHTC Coverage and health outcomes.


In [None]:
# Identify health outcome columns.
# Sets `health_outcome_col` (first auto-detected candidate, or None).
print("="*60)
print("IDENTIFYING HEALTH OUTCOME COLUMNS")
print("="*60)

# Common health outcome column names
health_keywords = ['disease', 'mortality', 'morbidity', 'health', 'cases', 'rate', 'outcome']
# Numeric columns whose name contains a health keyword. Columns derived from
# FHTC itself (name contains 'fhtc', e.g. a growth *rate*) are excluded so the
# treatment variable is never correlated with its own transformation.
health_cols = [col for col in df.columns
               if any(keyword in col.lower() for keyword in health_keywords)
               and col != fhtc_col
               and 'fhtc' not in col.lower()
               and pd.api.types.is_numeric_dtype(df[col])]

if health_cols:
    print(f"Found {len(health_cols)} potential health outcome columns:")
    for i, col in enumerate(health_cols, 1):
        print(f"  {i}. {col}")
    health_outcome_col = health_cols[0]  # Use first found
    print(f"\nUsing '{health_outcome_col}' for correlation analysis")
else:
    print("⚠ No health outcome columns automatically detected.")
    print("\nPlease specify the health outcome column name.")
    print("Available numerical columns:")
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    # Drop the FHTC column before numbering so the printed list has no gaps
    candidate_cols = [col for col in numeric_cols if col != fhtc_col]
    for i, col in enumerate(candidate_cols, 1):
        print(f"  {i}. {col}")
    health_outcome_col = None


In [None]:
# Correlation Check: Scatter plot of FHTC coverage against the chosen health outcome
if fhtc_col and health_outcome_col:
    print("="*60)
    print("CORRELATION CHECK: FHTC Coverage vs Health Outcomes")
    print("="*60)

    # Remove rows with missing values in either variable
    plot_data = df[[fhtc_col, health_outcome_col]].dropna()

    if len(plot_data) < 2:
        # A regression line and a correlation both need at least two points
        print("⚠ Not enough complete observations to plot or compute a correlation")
    else:
        fig, ax = plt.subplots(figsize=(12, 8))

        # Scatter points plus a fitted regression line on the same axes
        sns.scatterplot(data=plot_data, x=fhtc_col, y=health_outcome_col,
                        alpha=0.6, s=100, color='#A23B72', ax=ax)
        sns.regplot(data=plot_data, x=fhtc_col, y=health_outcome_col,
                    scatter=False, color='#F18F01', line_kws={'linewidth': 2.5}, ax=ax)

        ax.set_title(f'FHTC Coverage vs {health_outcome_col}',
                     fontsize=16, fontweight='bold', pad=20)
        ax.set_xlabel('FHTC Coverage (%)', fontsize=12, fontweight='bold')
        ax.set_ylabel(health_outcome_col.replace('_', ' ').title(), fontsize=12, fontweight='bold')
        ax.grid(True, alpha=0.3, linestyle='--')
        fig.tight_layout()
        plt.show()

        # Calculate correlation
        correlation = plot_data[fhtc_col].corr(plot_data[health_outcome_col])

        if pd.isna(correlation):
            # NaN compares False against every threshold below, so without this
            # branch it would be reported as "Very strong correlation".
            print("\nCorrelation Coefficient: undefined")
            print("  Interpretation: Cannot be computed (one of the variables has no variation)")
        else:
            print(f"\nCorrelation Coefficient: {correlation:.4f}")
            strength = abs(correlation)
            if strength < 0.1:
                print("  Interpretation: Very weak correlation")
            elif strength < 0.3:
                print("  Interpretation: Weak correlation")
            elif strength < 0.5:
                print("  Interpretation: Moderate correlation")
            elif strength < 0.7:
                print("  Interpretation: Strong correlation")
            else:
                print("  Interpretation: Very strong correlation")

elif fhtc_col and not health_outcome_col:
    print("\n⚠ Please define the health outcome column name.")
    print("You can set it manually in the next cell or modify the code above.")

    # Instructions for manual specification
    print("\nTo manually specify, run:")
    print("  health_outcome_col = 'YOUR_COLUMN_NAME'")
    print("Then re-run the correlation check cell.")

else:
    print("⚠ Cannot perform correlation analysis: Missing required columns")


## Step 4: Bias Inspection

Filter data for suspicious spikes and visualize time series for districts with anomalies.


In [None]:
# Bias Inspection: Identify districts with suspicious spikes and plot a sample of them
SPIKE_SAMPLE_SEED = 42        # fixed seed so the same districts are drawn on every run
MAX_DISTRICTS_TO_PLOT = 3     # number of districts visualised

if spike_col and fhtc_col and date_col and district_col:
    print("="*60)
    print("BIAS INSPECTION: Suspicious Spikes Analysis")
    print("="*60)

    # Filter rows with suspicious spikes
    suspicious_data = df[df[spike_col] == True].copy()

    if len(suspicious_data) > 0:
        print(f"\nFound {len(suspicious_data)} rows with suspicious spikes")
        print(f"Affected districts: {suspicious_data[district_col].nunique()}")

        # Unique districts with spikes; missing district labels are dropped
        # because they cannot be matched back to a time series
        districts_with_spikes = suspicious_data[district_col].dropna().unique().tolist()
        print(f"\nDistricts with suspicious spikes: {len(districts_with_spikes)}")

        # Draw a reproducible sample (or take all when there are few). A local
        # Random instance keeps the draw independent of global RNG state, so
        # re-running this cell gives the same districts.
        if len(districts_with_spikes) >= MAX_DISTRICTS_TO_PLOT:
            selected_districts = random.Random(SPIKE_SAMPLE_SEED).sample(
                districts_with_spikes, MAX_DISTRICTS_TO_PLOT)
        else:
            selected_districts = districts_with_spikes

        if selected_districts:
            print(f"\nVisualizing {len(selected_districts)} district(s): {selected_districts}")

            # squeeze=False always returns a 2-D array of axes, so a single
            # district needs no special casing
            fig, axes_grid = plt.subplots(len(selected_districts), 1,
                                          figsize=(14, 5*len(selected_districts)),
                                          squeeze=False)
            axes = axes_grid[:, 0]

            for idx, district in enumerate(selected_districts):
                # Get all data for this district in date order
                district_data = df[df[district_col] == district].copy()
                district_data = district_data.sort_values(date_col)

                # Plot the time series
                ax = axes[idx]
                sns.lineplot(data=district_data, x=date_col, y=fhtc_col,
                             marker='o', linewidth=2, markersize=6,
                             color='#2E86AB', ax=ax, label='FHTC Coverage')

                # Highlight suspicious spikes
                spike_points = district_data[district_data[spike_col] == True]
                if len(spike_points) > 0:
                    ax.scatter(spike_points[date_col], spike_points[fhtc_col],
                               color='red', s=200, marker='X', zorder=5,
                               label='Suspicious Spike', linewidths=2)

                ax.set_title(f'FHTC Coverage Over Time: {district}',
                             fontsize=14, fontweight='bold', pad=15)
                ax.set_xlabel('Date', fontsize=11, fontweight='bold')
                ax.set_ylabel('FHTC Coverage (%)', fontsize=11, fontweight='bold')
                ax.grid(True, alpha=0.3, linestyle='--')
                ax.legend(loc='best')

                # Annotate each flagged point with its coverage value
                for _, row in spike_points.iterrows():
                    ax.annotate(f'Spike: {row[fhtc_col]:.1f}%',
                                xy=(row[date_col], row[fhtc_col]),
                                xytext=(10, 10), textcoords='offset points',
                                bbox=dict(boxstyle='round,pad=0.5', facecolor='yellow', alpha=0.7),
                                arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))

            fig.tight_layout()
            plt.show()
        else:
            print("\n⚠ Spike rows have no district label; skipping district plots")

        # Summary statistics for suspicious spikes
        print("\n" + "="*60)
        print("SUSPICIOUS SPIKE SUMMARY")
        print("="*60)
        print(f"\nTotal suspicious spikes: {len(suspicious_data)}")
        print(f"\nCoverage statistics for spike points:")
        print(suspicious_data[fhtc_col].describe())

    else:
        print("\n✓ No suspicious spikes found in the dataset!")

else:
    print("⚠ Cannot perform bias inspection: Missing required columns")
    if not spike_col:
        print("  - Missing: suspicious_spike column")
    if not fhtc_col:
        print("  - Missing: FHTC coverage column")
    if not date_col:
        print("  - Missing: Date column")
    if not district_col:
        print("  - Missing: District column")


## Summary

This notebook has visualized key patterns and anomalies in the data:

1. **Trend Analysis**: Shows the overall trajectory of FHTC coverage over time
2. **Correlation Check**: Examines the relationship between coverage and health outcomes
3. **Bias Inspection**: Identifies and visualizes suspicious data spikes

These visualizations help identify data quality issues before proceeding with econometric modeling.


In [None]:
# Wrap-up for the anomaly-detection steps: dataset size, FHTC statistics and
# the share of rows flagged as suspicious spikes
banner = "="*60
print(banner)
print("EXPLORATORY ANALYSIS COMPLETE")
print(banner)
print(f"\nDataset shape: {df.shape[0]:,} rows × {df.shape[1]} columns")

if fhtc_col:
    print("\nFHTC Coverage Statistics:")
    print(df[fhtc_col].describe())

if spike_col and spike_col in df.columns:
    # Count flagged rows straight from the boolean mask
    spike_count = int((df[spike_col] == True).sum())
    spike_share = spike_count/len(df)*100
    print(f"\nSuspicious Spikes: {spike_count} ({spike_share:.2f}% of data)")

print("\n✓ Anomaly detection visualizations complete!")
print("Review the plots above to identify data quality issues before modeling.")


In [None]:
# Missing-value report: per-column counts and percentages, then dataset totals
print("="*60)
print("MISSING VALUES ANALYSIS")
print("="*60)

missing_data = df.isnull().sum()
missing_percent = (missing_data / len(df)) * 100

# Keep only columns that actually have gaps, worst first
missing_df = (
    pd.DataFrame({
        'Column': missing_data.index,
        'Missing Count': missing_data.values,
        'Missing Percentage': missing_percent.values
    })
    .loc[lambda table: table['Missing Count'] > 0]
    .sort_values('Missing Count', ascending=False)
)

if len(missing_df) == 0:
    print("\n✓ No missing values found in the dataset!")
else:
    print(f"\nColumns with missing values: {len(missing_df)}")
    display(missing_df)

# Dataset-wide totals reuse the per-column counts computed above
total_missing = missing_data.sum()
total_cells = df.shape[0] * df.shape[1]
print(f"\nTotal missing values: {total_missing}")
print(f"Percentage of missing data: {(total_missing / total_cells) * 100:.2f}%")


In [None]:
# Descriptive statistics for every numeric column.
# `numeric_cols` is reused by the outlier, distribution and correlation cells.
print("="*60)
print("DESCRIPTIVE STATISTICS")
print("="*60)

numeric_frame = df.select_dtypes(include=[np.number])
numeric_cols = numeric_frame.columns.tolist()
print(f"\nNumerical columns ({len(numeric_cols)}): {numeric_cols}")

if not numeric_cols:
    print("\n⚠ No numerical columns found in the dataset")
else:
    print("\nSummary Statistics:")
    display(df[numeric_cols].describe())


In [None]:
# Overview of text/categorical columns: cardinality plus a sample of values.
# `categorical_cols` is reused by the closing summary cell.
print("="*60)
print("CATEGORICAL VARIABLES")
print("="*60)

categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"\nCategorical columns ({len(categorical_cols)}): {categorical_cols}")

# An empty list simply skips the loop
for col in categorical_cols:
    unique_count = df[col].nunique()
    print(f"\n{col}:")
    print(f"  Unique values: {unique_count}")
    distinct_values = df[col].unique()
    if unique_count > 20:
        # High-cardinality column: show only the first ten distinct values
        print(f"  First 10 values: {distinct_values[:10].tolist()}")
    else:
        print(f"  Values: {distinct_values.tolist()}")


In [None]:
# Check date columns and time range.
# Date-like columns are parsed into local Series only: `df` is left untouched,
# so columns that merely have "time"/"period" in their name are not overwritten
# with coerced datetimes.
print("="*60)
print("TEMPORAL ANALYSIS")
print("="*60)

# Try to identify date columns by name
date_cols = []
parsed_dates = {}   # column name -> datetime Series used for reporting
skipped_dates = {}  # column name -> reason the column was not parsed
for col in df.columns:
    lowered = col.lower()
    if not ('date' in lowered or 'time' in lowered or 'period' in lowered):
        continue
    date_cols.append(col)

    if pd.api.types.is_datetime64_any_dtype(df[col]):
        parsed_dates[col] = df[col]
    elif pd.api.types.is_numeric_dtype(df[col]):
        # pd.to_datetime would read plain numbers as epoch offsets, not dates
        skipped_dates[col] = "numeric column, not interpreted as dates"
    else:
        try:
            parsed_dates[col] = pd.to_datetime(df[col], errors='coerce')
        except (ValueError, TypeError, OverflowError) as exc:
            skipped_dates[col] = f"could not be parsed ({exc})"

if len(date_cols) > 0:
    print(f"\nDate columns found: {date_cols}")
    for col in date_cols:
        print(f"\n{col}:")
        if col in skipped_dates:
            print(f"  Skipped: {skipped_dates[col]}")
            continue
        parsed = parsed_dates[col]
        print(f"  Date range: {parsed.min()} to {parsed.max()}")
        print(f"  Unique dates: {parsed.nunique()}")
        print(f"  Missing dates: {parsed.isna().sum()}")
else:
    print("\n⚠ No date columns identified")


In [None]:
# Overview of geographic identifier columns (matched by name)
print("="*60)
print("GEOGRAPHIC ANALYSIS")
print("="*60)

# A column counts as geographic when its lowercase name contains any keyword
geo_keywords = ['district', 'state', 'region', 'zone', 'area']
geo_cols = []
for col in df.columns:
    lowered_name = col.lower()
    if any(keyword in lowered_name for keyword in geo_keywords):
        geo_cols.append(col)

if not geo_cols:
    print("\n⚠ No geographic columns identified")
else:
    print(f"\nGeographic columns found: {geo_cols}")
    for col in geo_cols:
        unique_count = df[col].nunique()
        print(f"\n{col}:")
        print(f"  Unique values: {unique_count}")
        # Sorted distinct non-missing values; truncated for large sets
        ordered_values = sorted(df[col].dropna().unique().tolist())
        if unique_count > 30:
            print(f"  First 20 values: {ordered_values[:20]}")
        else:
            print(f"  Values: {ordered_values}")


## Additional Data Quality Checks

Perform initial data quality assessments.


In [None]:
# Count fully duplicated rows and show a sample when any exist
print("="*60)
print("DUPLICATE RECORDS CHECK")
print("="*60)

# Boolean mask marking every repeat of an earlier identical row
duplicate_mask = df.duplicated()
duplicate_count = duplicate_mask.sum()
print(f"\nTotal duplicate rows: {duplicate_count}")

if duplicate_count == 0:
    print("✓ No duplicate rows found!")
else:
    print(f"Percentage of duplicates: {(duplicate_count / len(df)) * 100:.2f}%")
    print("\nSample duplicate rows:")
    display(df[duplicate_mask].head())


In [None]:
# Flag outliers per numeric column with the 1.5×IQR rule.
# Relies on `numeric_cols` from the descriptive-statistics cell.
print("="*60)
print("OUTLIER DETECTION (IQR Method)")
print("="*60)

if len(numeric_cols) == 0:
    print("\n⚠ No numerical columns to check for outliers")
else:
    outlier_summary = []

    for col in numeric_cols:
        values = df[col]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Rows strictly outside the fences; missing values are never flagged
        outside_fences = (values < lower_bound) | (values > upper_bound)
        outlier_count = int(outside_fences.sum())
        if outlier_count == 0:
            continue

        outlier_summary.append({
            'Column': col,
            'Outliers': outlier_count,
            'Percentage': (outlier_count / len(df)) * 100,
            'Lower Bound': lower_bound,
            'Upper Bound': upper_bound
        })

    if outlier_summary:
        outlier_df = pd.DataFrame(outlier_summary)
        print(f"\nColumns with outliers: {len(outlier_df)}")
        display(outlier_df)
    else:
        print("\n✓ No outliers detected using IQR method!")


## Additional Visualizations: Distributions and Correlations

Create initial visualizations to understand the data distribution and patterns.


In [None]:
# Histogram grid: one panel per numeric column, at most three panels per row
if len(numeric_cols) == 0:
    print("⚠ No numerical columns available for distribution plots")
else:
    n_cols = min(3, len(numeric_cols))
    n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

    # squeeze=False always yields a 2-D array, which flattens to a 1-D sequence
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)
    axes = axes.flatten()

    for panel, col in zip(axes, numeric_cols):
        df[col].hist(bins=30, ax=panel, edgecolor='black')
        panel.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
        panel.set_xlabel(col)
        panel.set_ylabel('Frequency')
        panel.grid(True, alpha=0.3)

    # Hide the unused panels at the end of the grid
    for panel in axes[len(numeric_cols):]:
        panel.set_visible(False)

    plt.tight_layout()
    plt.show()


In [None]:
# Pairwise correlation heatmap for the numeric columns, followed by a list of
# pairs whose absolute correlation exceeds 0.7
if len(numeric_cols) <= 1:
    print("⚠ Need at least 2 numerical columns for correlation analysis")
else:
    print("="*60)
    print("CORRELATION MATRIX")
    print("="*60)

    corr_matrix = df[numeric_cols].corr()

    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
                center=0, square=True, linewidths=1, cbar_kws={"shrink": 0.8})
    plt.title('Correlation Matrix of Numerical Variables', fontsize=14, fontweight='bold', pad=20)
    plt.tight_layout()
    plt.show()

    print("\nStrong correlations (|r| > 0.7):")
    # Walk the upper triangle so each pair is reported once
    labels = corr_matrix.columns
    for i, left_label in enumerate(labels):
        for j in range(i+1, len(labels)):
            corr_val = corr_matrix.iloc[i, j]
            if abs(corr_val) > 0.7:
                print(f"  {left_label} ↔ {labels[j]}: {corr_val:.3f}")


## Overall Summary

This notebook provides an initial exploration of the final panel dataset. Key findings and next steps can be documented here.


In [None]:
# Closing recap: headline dataset figures and suggested follow-up work.
# Relies on `numeric_cols` and `categorical_cols` from earlier cells.
banner = "="*60
print(banner)
print("EXPLORATORY ANALYSIS SUMMARY")
print(banner)

total_rows, total_columns = df.shape
dataset_facts = [
    f"  - Total rows: {total_rows:,}",
    f"  - Total columns: {total_columns}",
    f"  - Numerical columns: {len(numeric_cols)}",
    f"  - Categorical columns: {len(categorical_cols)}",
    f"  - Missing values: {df.isnull().sum().sum()}",
    f"  - Duplicate rows: {df.duplicated().sum()}",
]
print("\nDataset loaded successfully:")
for fact in dataset_facts:
    print(fact)

print("\n✓ Exploratory analysis complete!")

next_steps = [
    "  - Perform deeper analysis on key variables",
    "  - Create time series visualizations if date data is available",
    "  - Analyze relationships between JJM coverage and health outcomes",
    "  - Prepare data for econometric modeling",
]
print("\nNext steps:")
for step in next_steps:
    print(step)
