# Indian Road Traffic Accident Data Analysis

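This notebook provides an exploratory data analysis (EDA) of Indian road traffic accident patterns based on official MoRTH (Ministry of Road Transport and Highways) data and real-time weather information. Note that the accident records analysed below are synthetic: they are generated in Section 1 by `AccidentDataProcessor.generate_synthetic_accident_data`.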

In [None]:
# Import required libraries
import warnings
from datetime import datetime, timedelta
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# NOTE(review): this silences every warning for the whole session, including
# deprecation notices from pandas/matplotlib; narrow the filter if those matter.
warnings.filterwarnings('ignore')

# Set plotting style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases removed the old names, so a bare plt.style.use('seaborn')
# fails there. Use whichever spelling this installation actually provides;
# if neither exists, Matplotlib's default style is left in place.
for style_name in ('seaborn-v0_8', 'seaborn'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
sns.set_palette('viridis')

print('Libraries imported successfully!')

In [None]:
# Load data collection modules
import sys

# Make the project's src/ directory importable. The path is relative to the
# notebook's working directory. The membership check keeps re-runs of this
# cell from piling duplicate entries onto sys.path.
SRC_DIR = '../src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from data_collection.accident_processor import AccidentDataProcessor
from data_collection.weather_collector import WeatherDataCollector

print('Modules loaded successfully!')

## 1. Data Generation and Loading

In [None]:
# Tunable inputs for this run: how many synthetic records to create and the
# seed that makes the run repeatable.
N_RECORDS = 5000
RANDOM_SEED = 42

# Seed NumPy's global RNG so that Restart & Run All reproduces the same data.
# NOTE(review): assumes the generator draws from NumPy's global random state;
# if it uses its own RNG this call has no effect — confirm against the module.
np.random.seed(RANDOM_SEED)

# Initialize data processor
processor = AccidentDataProcessor()

# Generate synthetic accident data
print('Generating synthetic accident data...')
accident_df = processor.generate_synthetic_accident_data(N_RECORDS)

print(f'Generated {len(accident_df)} accident records')
print(f'Data shape: {accident_df.shape}')
# Last expression of the cell: rendered as a table preview of the first rows.
accident_df.head()

## 2. Basic Data Statistics

In [None]:
# Compute the aggregate statistics once; the plotting cells below read from `stats`.
stats = processor.get_accident_statistics(accident_df)

# The three headline counts share one format (thousands separator), so they
# are driven from a small (label, key) table instead of repeated print calls.
count_rows = (
    ('Total Accidents', 'total_accidents'),
    ('Total Deaths', 'total_deaths'),
    ('Total Injuries', 'total_injuries'),
)

print('=== ACCIDENT STATISTICS SUMMARY ===')
for label, key in count_rows:
    print(f'{label}: {stats[key]:,}')
# The rate is printed with two decimals followed by a literal percent sign.
print(f'Fatality Rate: {stats["fatality_rate"]:.2f}%')

## 3. Visualizations

In [None]:
# Four-panel overview: state, severity, hour-of-day and month breakdowns,
# all read from the precomputed `stats` dictionary.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Accidents by state
# nlargest(10) selects the ten highest counts regardless of the order in which
# the states appear in `stats`; a plain head(10) only matches the "Top 10"
# title when the mapping happens to be pre-sorted by count.
state_counts = pd.Series(stats['accident_by_state']).nlargest(10)
state_counts.plot(kind='bar', ax=axes[0,0], color='skyblue')
axes[0,0].set_title('Top 10 States by Accident Count')
axes[0,0].set_xlabel('State')
axes[0,0].set_ylabel('Number of Accidents')
axes[0,0].tick_params(axis='x', rotation=45)

# 2. Accidents by severity
severity_counts = pd.Series(stats['accident_by_severity'])
axes[0,1].pie(severity_counts.values, labels=severity_counts.index, autopct='%1.1f%%')
axes[0,1].set_title('Accidents by Severity')

# 3. Hourly distribution
# Sort by hour so the line is drawn left-to-right in time order even if the
# mapping is ordered by count.
# NOTE(review): assumes the keys are numeric hours (the 'hour' column is
# compared numerically elsewhere in this notebook) — confirm.
hourly_counts = pd.Series(stats['accident_by_hour']).sort_index()
hourly_counts.plot(kind='line', ax=axes[1,0], marker='o', color='red')
axes[1,0].set_title('Accidents by Hour of Day')
axes[1,0].set_xlabel('Hour')
axes[1,0].set_ylabel('Number of Accidents')
axes[1,0].grid(True)

# 4. Monthly distribution
# Same reasoning as above: bars should appear in calendar order.
# NOTE(review): assumes numeric month keys — confirm.
monthly_counts = pd.Series(stats['accident_by_month']).sort_index()
monthly_counts.plot(kind='bar', ax=axes[1,1], color='green')
axes[1,1].set_title('Accidents by Month')
axes[1,1].set_xlabel('Month')
axes[1,1].set_ylabel('Number of Accidents')

plt.tight_layout()
plt.show()

## 4. Weather Impact Analysis

In [None]:
# Bar chart of the per-condition values stored under stats['weather_impact'],
# followed by the same numbers as text.
weather_impact = pd.Series(stats['weather_impact'])

# Explicit figure/axes handles instead of the implicit pyplot "current" figure.
fig, ax = plt.subplots(figsize=(10, 6))
weather_impact.plot.bar(ax=ax, color=['lightblue', 'orange', 'gray', 'red'])
ax.set_title('Average Casualties by Weather Condition')
ax.set_xlabel('Weather Condition')
ax.set_ylabel('Average Casualties per Accident')
# Tilt the category labels so longer condition names do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

print('Weather Impact Summary:')
summary_lines = [
    f'{condition}: {mean_casualties:.2f} average casualties per accident'
    for condition, mean_casualties in weather_impact.items()
]
for line in summary_lines:
    print(line)

## 5. Time Pattern Analysis

In [None]:
# Heatmap of accident counts by day of week (rows) and hour of day (columns).
# First count records for every (day_of_week, hour) pair, then pivot the hour
# level into columns; pairs with no records are filled with 0.
pair_counts = accident_df.groupby(['day_of_week', 'hour']).size()
time_data = pair_counts.unstack(fill_value=0)

fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(
    time_data,
    cmap='YlOrRd',
    annot=False,
    fmt='d',
    cbar_kws={'label': 'Number of Accidents'},
    ax=ax,
)
ax.set_title('Accident Heatmap: Day of Week vs Hour of Day')
# Set after the heatmap call so these labels replace the ones seaborn derives
# from the index/column names.
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Day of Week (0=Monday, 6=Sunday)')
fig.tight_layout()
plt.show()

## 6. Road Type and Traffic Violation Analysis

In [None]:
# Side-by-side view: share of accidents per road type (pie) and the most
# frequent traffic violations (horizontal bars).
fig, (pie_ax, bar_ax) = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: road type shares
road_type_counts = accident_df['road_type'].value_counts()
road_type_counts.plot.pie(ax=pie_ax, autopct='%1.1f%%')
pie_ax.set_title('Accidents by Road Type')
# pandas labels the y-axis of a pie with the series name; blank it out.
pie_ax.set_ylabel('')

# Right panel: value_counts() is sorted by frequency, so the first eight rows
# are the eight most common violations.
violation_counts = accident_df['traffic_violation'].value_counts().head(8)
violation_counts.plot.barh(ax=bar_ax, color='coral')
bar_ax.set_title('Top Traffic Violations Leading to Accidents')
bar_ax.set_xlabel('Number of Accidents')

fig.tight_layout()
plt.show()

## 7. Correlation Analysis

In [None]:
# Columns that enter the correlation matrix: weather readings, time-of-event
# fields, and the casualty counts.
numeric_cols = [
    'temperature', 'humidity', 'rainfall', 'wind_speed', 'visibility',
    'hour', 'day_of_week', 'month',
    'deaths', 'injuries', 'total_casualties',
]

# Pairwise correlation using DataFrame.corr() with its default settings.
correlation_matrix = accident_df[numeric_cols].corr()

# Annotated heatmap with the colour scale centred on zero.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    fmt='.2f',
    cbar_kws={'label': 'Correlation Coefficient'},
    ax=ax,
)
ax.set_title('Correlation Matrix: Environmental and Accident Factors')
fig.tight_layout()
plt.show()

## 8. Data Export for Tableau

In [None]:
# Prepare data for Tableau: work on a copy so accident_df itself is not modified.
tableau_df = accident_df.copy()

# Severity label -> risk weight. Any severity value not listed here (including
# missing values) contributes nothing and therefore scores 0.
SEVERITY_WEIGHTS = {
    'Fatal': 4,
    'Grievous Injury': 3,
    'Minor Injury': 2,
    'Non-Injury': 1,
}
severity = tableau_df['severity']
# Each term is a 0/1 indicator column scaled by its weight; since a row has a
# single severity label, at most one term is non-zero per row.
tableau_df['Risk_Score'] = sum(
    (severity == label).astype(int) * weight
    for label, weight in SEVERITY_WEIGHTS.items()
)

# Bucket the hour into four day parts. np.select takes the first matching
# condition, which mirrors an if/elif chain; anything that matches none of the
# conditions (18 and above, or a missing hour) falls through to the default.
hour = tableau_df['hour']
tableau_df['Hour_Range'] = np.select(
    [hour < 6, hour < 12, hour < 18],
    ['Night (0-6)', 'Morning (6-12)', 'Afternoon (12-18)'],
    default='Evening (18-24)',
)

# Save for Tableau
output_file = '../data/processed/tableau_ready_data.csv'
# to_csv does not create missing directories, so make sure the target exists
# (e.g. on a fresh checkout where data/processed has not been created yet).
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
tableau_df.to_csv(output_file, index=False)

print(f'Data exported for Tableau: {output_file}')
print(f'Total records: {len(tableau_df)}')
print(f'Columns: {list(tableau_df.columns)}')

## 9. Summary and Next Steps

In [None]:
# Closing summary. The time and weather findings are computed from the data
# rather than written as fixed text, so they stay correct when the (randomly
# generated) dataset changes.
hour_counts = accident_df['hour'].value_counts()
# Same per-condition values that the weather section plots as average
# casualties per accident.
weather_means = pd.Series(stats['weather_impact'])

print('=== ANALYSIS SUMMARY ===')
print(f'• Analyzed {len(accident_df):,} accident records')
print('• Identified key patterns in time, weather, and geography')
if not hour_counts.empty:
    # NOTE(review): assumes 'hour' holds whole-number hours — confirm.
    peak_hour = int(hour_counts.idxmax())
    print(f'• Peak accident hour: {peak_hour:02d}:00 ({hour_counts.max():,} accidents)')
if not weather_means.empty:
    print(
        f'• Highest average casualties per accident: '
        f'{weather_means.idxmax()} ({weather_means.max():.2f})'
    )
print('• Data prepared for Tableau visualization')
print()
print('Next Steps:')
print('1. Train machine learning models')
print('2. Create Tableau dashboard')
print('3. Implement real-time prediction system')
print('4. Deploy TabPy functions for interactive analysis')