In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Load data
df = pd.read_csv('../data/raw/transport_delays.csv')

# Display basic information
print("Dataset Shape:", df.shape)
print("\nColumn Names and Types:")
print(df.dtypes)
print("\nFirst Few Rows:")
display(df.head())

# Statistical summary
print("\nStatistical Summary:")
display(df.describe())

# Check for missing values
print("\nMissing Values:")
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100
missing_df = pd.DataFrame({
    'Missing_Count': missing,
    'Percentage': missing_pct
}).sort_values('Percentage', ascending=False)
print(missing_df[missing_df['Missing_Count'] > 0])

# Target variable analysis
print("\nTarget Variable Analysis (delayed):")
print(df['delayed'].value_counts())
print("\nDelay Statistics:")
print(f"Average Departure Delay: {df['actual_departure_delay_min'].mean():.2f} minutes")
print(f"Average Arrival Delay: {df['actual_arrival_delay_min'].mean():.2f} minutes")
print(f"Max Departure Delay: {df['actual_departure_delay_min'].max():.2f} minutes")
print(f"Max Arrival Delay: {df['actual_arrival_delay_min'].max():.2f} minutes")

# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# 1. Delay distribution
axes[0, 0].hist(df['actual_arrival_delay_min'], bins=50, edgecolor='black')
axes[0, 0].set_title('Distribution of Arrival Delays')
axes[0, 0].set_xlabel('Delay (minutes)')
axes[0, 0].set_ylabel('Frequency')

# 2. Delays by weather condition
weather_delays = df.groupby('weather_condition')['actual_arrival_delay_min'].mean().sort_values()
axes[0, 1].barh(weather_delays.index, weather_delays.values)
axes[0, 1].set_title('Average Delay by Weather Condition')
axes[0, 1].set_xlabel('Average Delay (minutes)')

# 3. Delays by time of day (peak hour)
peak_delays = df.groupby('peak_hour')['actual_arrival_delay_min'].mean()
axes[1, 0].bar(['Non-Peak', 'Peak'], peak_delays.values, color=['green', 'red'])
axes[1, 0].set_title('Average Delay: Peak vs Non-Peak Hours')
axes[1, 0].set_ylabel('Average Delay (minutes)')

# 4. Delays by season
season_delays = df.groupby('season')['actual_arrival_delay_min'].mean()
axes[1, 1].bar(season_delays.index, season_delays.values, color=['skyblue', 'orange', 'green', 'gray'])
axes[1, 1].set_title('Average Delay by Season')
axes[1, 1].set_ylabel('Average Delay (minutes)')

plt.tight_layout()
plt.savefig('../docs/initial_exploration.png', dpi=300, bbox_inches='tight')
plt.show()

# Correlation analysis
numeric_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df[numeric_cols].corr()

plt.figure(figsize=(14, 10))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0)
plt.title('Correlation Matrix of Numeric Features')
plt.tight_layout()
plt.savefig('../docs/correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

# Interactive visualization with Plotly
fig = px.scatter(df, 
                 x='temperature_C', 
                 y='actual_arrival_delay_min',
                 color='weather_condition',
                 size='precipitation_mm',
                 hover_data=['route_id', 'event_type'],
                 title='Temperature vs Delay (colored by weather, sized by precipitation)')
fig.write_html('../docs/interactive_delay_analysis.html')
fig.show()

# Event impact analysis
print("\nEvent Impact on Delays:")
event_impact = df.groupby('event_type').agg({
    'actual_arrival_delay_min': ['mean', 'median', 'std'],
    'trip_id': 'count'
}).round(2)
event_impact.columns = ['Mean_Delay', 'Median_Delay', 'Std_Delay', 'Trip_Count']
print(event_impact.sort_values('Mean_Delay', ascending=False))

FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/transport_delays.csv'

In [4]:
pip install pandas numpy matplotlib seaborn plotly

Collecting matplotlib
  Using cached matplotlib-3.10.8-cp311-cp311-win_amd64.whl.metadata (52 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting plotly
  Using cached plotly-6.5.0-py3-none-any.whl.metadata (8.5 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.3.3-cp311-cp311-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Using cached fonttools-4.61.1-cp311-cp311-win_amd64.whl.metadata (116 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Using cached kiwisolver-1.4.9-cp311-cp311-win_amd64.whl.metadata (6.4 kB)
Collecting pillow>=8 (from matplotlib)
  Using cached pillow-12.0.0-cp311-cp311-win_amd64.whl.metadata (9.0 kB)
Collecting pyparsing>=3 (from matplotlib)
  Using cached pyparsing-3.3.1-py3-none-any.whl.metadata (5.6 kB)
Collecting narwhals>=1.15.1 (from plotly)
  