In [None]:
# Import libraries needed to execute the code
import os
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import scipy.cluster.hierarchy as shc
from sklearn.decomposition import PCA
from IPython.display import display, HTML
from scipy.cluster.hierarchy import dendrogram, linkage
from pandas.plotting import andrews_curves, parallel_coordinates, lag_plot, autocorrelation_plot, radviz

In [None]:
# Import the clean data
# NOTE(review): the path is relative, so this presumably expects the notebook to be
# run from the directory that contains `source/` — confirm.
data = pd.read_csv('source/data.csv')

In [None]:
# Identifier and coded-category columns that should be stored with object dtype
columns_to_convert = [
    'AccID', 'accident_situation', 'atm_condition', 'collision_type',
    'com_code', 'dep_code', 'fixed_obstacle', 'gender', 'gravity', 'infra',
    'initial_impact_point', 'int', 'location', 'longitudinal_profile', 'lum',
    'manv', 'mobile_obstacle', 'motor', 'num_veh_x', 'num_veh_y', 'plan',
    'reason_travel', 'reserved_lane_code', 'route_category',
    'safety_equipment1', 'seat', 'surface_condition', 'traffic_direction',
    'traffic_regime', 'user_category', 'vehicle_category', 'vehicleID_x',
    'vehicleID_y',
]

# Cast every listed column to object in one pass; all other columns keep their dtype
data = data.astype({column: 'object' for column in columns_to_convert})

In [None]:
# Correlation of each remaining column with the target variable 'gravity'.
# This cell has to run while 'gravity' still holds its numeric codes: a later cell
# replaces them with text labels, which pd.to_numeric would coerce to NaN.

# Columns currently stored with object dtype
non_numeric_columns = data.select_dtypes(include=['object']).columns

# Numeric copy of the dataframe: every object column is parsed as a number and
# values that cannot be parsed become NaN (errors='coerce')
data_numeric = data.assign(
    **{col: pd.to_numeric(data[col], errors='coerce') for col in non_numeric_columns}
)

# Columns excluded from the correlation analysis
problematic_columns = ['time', 'vehicleID_x', 'num_veh_x', 'vehicleID_y', 'num_veh_y', 'year']
data_cleaned = data_numeric.drop(problematic_columns, axis=1)

# Pairwise correlation matrix of the remaining columns
correlation_matrix = data_cleaned.corr()

# Keep only the 'gravity' column of the matrix, strongest positive correlation first
correlation_with_gravity = correlation_matrix['gravity'].sort_values(ascending=False)

correlation_with_gravity

<font size="6">  
    Visualization Preparation
</font> 

In [None]:
# Apply seaborn's "darkgrid" style to the plots that follow.
# (No figure size is set here; each plotting cell chooses its own figsize.)
sns.set_style("darkgrid")

In [None]:
# Ignore specific warnings:
# only FutureWarning messages attributed to a module whose name starts with
# "seaborn" are silenced; FutureWarnings attributed to other modules are still shown.
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn")

In [None]:
# Text label for each numeric gravity code
gravity_labels = {
    1: '1 - Uninjured',
    2: '2 - Fatal',
    3: '3 - Hospitalized',
    4: '4 - Minor injury',
}

# Swap the codes in the gravity field for their labels (other values are left untouched)
data['gravity'] = data['gravity'].replace(gravity_labels)

# Colour used for each gravity label in the plots
custom_colors = {
    '1 - Uninjured': '#66B2FF',
    '2 - Fatal': '#FF9999',
    '3 - Hospitalized': '#FFCC99',
    '4 - Minor injury': '#99FF99',
}

# Gravity labels in alphabetical order, used as the hue order of the plots
gravity_order = sorted(custom_colors)

In [None]:
# Function to calculate and display percentages in a separated table
def calculate_and_display_percentages(data, group_by_col):
    """Display, for each value of ``group_by_col``, the share of every gravity level.

    Rows of the rendered table are the values of ``group_by_col`` and columns are
    the values of the ``gravity`` column. Each cell is the number of rows with that
    (group, gravity) pair divided by the number of rows in the group, as a
    percentage; pairs that do not occur are shown as 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``group_by_col`` and a ``gravity`` column.
    group_by_col : str
        Name of the column whose values form the rows of the table.

    Returns
    -------
    None
        The styled table is rendered through IPython's ``display``.
    """
    # Number of rows for every (group, gravity) pair
    pair_counts = data.groupby([group_by_col, 'gravity']).size().reset_index(name='count')

    # Number of rows for every group; the columns are renamed explicitly so the
    # merge key has the expected name
    group_totals = data[group_by_col].value_counts().reset_index(name='total')
    group_totals.columns = [group_by_col, 'total']

    shares = pair_counts.merge(group_totals, on=group_by_col)
    shares['percentage'] = shares['count'] / shares['total'] * 100

    # Wide layout: one row per group, one column per gravity level, percentages only
    percentage_table = shares.pivot(
        index=group_by_col, columns='gravity', values='percentage'
    ).fillna(0)

    table_styles = [
        {'selector': 'th', 'props': [('font-size', '12pt'), ('font-weight', 'bold'), ('text-align', 'center')]},
        {'selector': 'td', 'props': [('font-size', '10pt'), ('text-align', 'center')]},
        {'selector': 'caption', 'props': [('caption-side', 'top')]},
    ]
    caption = f'Percentages of Accident Gravity by {group_by_col.capitalize()}'

    styled_table = percentage_table.style.format("{:.1f}%")
    styled_table = styled_table.set_table_styles(table_styles)
    styled_table = styled_table.set_caption(caption)
    # axis=None: one colour scale shared by the whole table
    styled_table = styled_table.background_gradient(cmap='Blues', axis=None)

    # Render the styled table as HTML in the notebook output
    display(HTML(styled_table.to_html()))

<font size="6">  
Graph Visualization
</font>

In [None]:
# 1. Number of accidents by Severity Category

# Accidents per severity label (largest first) and each label's share of the total
counts = data['gravity'].value_counts().sort_values(ascending=False)
total = counts.sum()
percentages = counts / total * 100

fig, ax = plt.subplots(figsize=(18, 12))
sns.countplot(data=data, x='gravity', order=counts.index, palette=custom_colors, ax=ax)

# Write each share just above its bar. Bars are requested in the order of
# counts.index, so they are paired positionally with the percentages.
for bar, share in zip(ax.patches, percentages):
    bar_top_center = (bar.get_x() + bar.get_width() / 2., bar.get_height())
    ax.annotate(
        f'{share:.1f}%',
        bar_top_center,
        ha='center',
        va='center',
        xytext=(0, 8),
        textcoords='offset points',
    )

ax.set_title('Number of accidents by Severity Category')
ax.set_xlabel('Severity')
ax.set_ylabel('Number of Accidents')
plt.show()

In [None]:
# 2. Accident counts per year, split by severity
fig, ax = plt.subplots(figsize=(18, 12))
sns.countplot(
    data=data,
    x='year',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
    ax=ax,
)
ax.set(title='Severity vs. Year of Accident', xlabel='Year', ylabel='Count')
ax.legend(title='Severity')
plt.show()

# Percentage table of severity for each year
calculate_and_display_percentages(data, 'year')

In [None]:
# 2.1 Severity distribution per year (one pie chart per year)

# Years in ascending order so the pies read chronologically from left to right
years = sorted(data['year'].dropna().unique())
n_years = len(years)

# squeeze=False keeps `axes` a 2-D array even when there is a single year,
# so the row can always be iterated
fig, axes = plt.subplots(1, n_years, figsize=(18, 6), squeeze=False)
fig.suptitle('Severity Distribution per Year', fontsize=16)

for pie_ax, year in zip(axes[0], years):
    yearly_data = data.loc[data['year'] == year, 'gravity'].value_counts()
    # Colour every slice by its severity label (same mapping as the other plots)
    # instead of by frequency rank, so a category keeps its colour in every pie.
    # Labels missing from custom_colors fall back to a neutral grey.
    colors = [custom_colors.get(label, '#CCCCCC') for label in yearly_data.index]
    pie_ax.pie(yearly_data, labels=yearly_data.index, autopct='%1.1f%%', colors=colors)
    pie_ax.set_title(f'Year {year}')

# Leave room at the top of the figure for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

In [None]:
# 3. Accident counts per month, split by severity
fig, ax = plt.subplots(figsize=(18, 12))
sns.countplot(
    data=data,
    x='month',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
    ax=ax,
)
ax.set(title='Severity vs. Month', xlabel='Month', ylabel='Count')
ax.legend(title='Severity')
plt.show()

# Percentage table of severity for each month
calculate_and_display_percentages(data, 'month')

In [None]:
# 4. Accident counts per day, one line per severity level

# One row per (day, severity) pair with the number of matching records
aggregated_data = (
    data.groupby(['day', 'gravity'])
    .size()
    .reset_index(name='count')
)

fig, ax = plt.subplots(figsize=(18, 12))
sns.lineplot(
    data=aggregated_data,
    x='day',
    y='count',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
    ax=ax,
)
ax.set(title='Number of Accidents by Severity vs. Day', xlabel='Day', ylabel='Count')
ax.legend(title='Severity')
plt.show()

# Percentage table of severity for each day
calculate_and_display_percentages(data, 'day')

In [None]:
# 5. Accident counts per weather condition, split by severity
fig, ax = plt.subplots(figsize=(18, 12))
sns.countplot(
    data=data,
    x='atm_condition',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
    ax=ax,
)
ax.set(title='Severity vs. Weather Conditions', xlabel='Weather Condition', ylabel='Count')
ax.legend(title='Severity')
# Tilt the category labels on the x axis
ax.tick_params(axis='x', labelrotation=45)
plt.show()

# Percentage table of severity for each weather condition
calculate_and_display_percentages(data, 'atm_condition')

In [None]:
# 6. Accident counts per lighting condition, split by severity
fig, ax = plt.subplots(figsize=(18, 12))
sns.countplot(
    data=data,
    x='lum',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
    ax=ax,
)
ax.set(title='Gravity vs. Lighting Conditions', xlabel='Lighting Condition', ylabel='Count')
ax.legend(title='Gravity')
# Tilt the category labels on the x axis
ax.tick_params(axis='x', labelrotation=45)
plt.show()

# Percentage table of severity for each lighting condition
calculate_and_display_percentages(data, 'lum')

In [None]:
# 7. Accident counts per vehicle category, split by severity
fig, ax = plt.subplots(figsize=(18, 12))
sns.countplot(
    data=data,
    x='vehicle_category',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
    ax=ax,
)
ax.set(title='Gravity vs. Vehicle Category', xlabel='Vehicle Category', ylabel='Count')
ax.legend(title='Gravity')
# Tilt the category labels on the x axis
ax.tick_params(axis='x', labelrotation=45)
plt.show()

# Percentage table of severity for each vehicle category
calculate_and_display_percentages(data, 'vehicle_category')

In [None]:
# 7.1 Severity distribution per gender (one pie chart per gender value)

# Missing gender values are skipped: they would only produce an empty pie
genders = sorted(data['gender'].dropna().unique())
n_genders = len(genders)

# squeeze=False keeps `axes` a 2-D array even when there is a single gender value,
# so the row can always be iterated
fig, axes = plt.subplots(1, n_genders, figsize=(18, 6), squeeze=False)
fig.suptitle('Severity Distribution per gender', fontsize=16)

for pie_ax, gender in zip(axes[0], genders):
    genderly_data = data.loc[data['gender'] == gender, 'gravity'].value_counts()
    # Colour every slice by its severity label (same mapping as the other plots)
    # instead of by frequency rank, so a category keeps its colour in every pie.
    # Labels missing from custom_colors fall back to a neutral grey.
    colors = [custom_colors.get(label, '#CCCCCC') for label in genderly_data.index]
    pie_ax.pie(genderly_data, labels=genderly_data.index, autopct='%1.1f%%', colors=colors)
    pie_ax.set_title(f'gender {gender}')

# Leave room at the top of the figure for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

In [None]:
# 8. Accident counts per gender, split by severity
fig, ax = plt.subplots(figsize=(18, 12))
sns.countplot(
    data=data,
    x='gender',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
    ax=ax,
)
ax.set(title='Gravity vs. Gender', xlabel='Gender', ylabel='Count')
ax.legend(title='Gravity')
plt.show()

# Percentage table of severity for each gender
calculate_and_display_percentages(data, 'gender')

In [None]:
# 9. Accident counts per road surface condition, split by severity
fig, ax = plt.subplots(figsize=(18, 12))
sns.countplot(
    data=data,
    x='surface_condition',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
    ax=ax,
)
ax.set(title='Gravity vs. Surface Condition', xlabel='Surface Condition', ylabel='Count')
ax.legend(title='Gravity')
# Tilt the category labels on the x axis
ax.tick_params(axis='x', labelrotation=45)
plt.show()

# Percentage table of severity for each surface condition
calculate_and_display_percentages(data, 'surface_condition')

In [None]:
# 10. Number of accidents by Severity vs. Maximum Speed

# Bin edges and their labels. pd.cut is called with right=False, so every bin is
# closed on the left and open on the right: [0, 50), [50, 60), ..., [110, inf).
# The edges are therefore the first value of each labelled range, and the last,
# open-ended bin is labelled '110+'.
bins = [0, 50, 60, 70, 80, 90, 100, 110, float('inf')]
labels = ['0-49', '50-59', '60-69', '70-79', '80-89', '90-99', '100-109', '110+']

# Create a new column for the binned speeds
data['speed_group'] = pd.cut(data['maximum_speed'], bins=bins, labels=labels, right=False)

plt.figure(figsize=(10, 6))
sns.countplot(data=data, x='speed_group', hue='gravity', hue_order=gravity_order, palette=custom_colors)
plt.title('Gravity vs. Maximum Speed')
plt.xlabel('Maximum Speed Group')
# The y axis of a count plot is the number of records, not the gravity
plt.ylabel('Count')
plt.legend(title='Gravity')
plt.xticks(rotation=45)
plt.show()

# Percentage table on the same speed groups as the plot (not on every raw speed value)
calculate_and_display_percentages(data, 'speed_group')

In [None]:
# 11. Age distribution for each severity level (box plot)
plt.figure(figsize=(10, 6))
# `order` (not `hue_order`) controls the box order when no `hue` is given; this
# puts the boxes in the same severity order as the other plots
sns.boxplot(data=data, x='gravity', y='age', order=gravity_order, palette=custom_colors)
plt.title('Age Distribution by Accident Gravity Category')
plt.xlabel('Gravity')
plt.ylabel('Age')
plt.show()

# Percentage table of severity for each individual age value
calculate_and_display_percentages(data, 'age')

In [None]:
# 12. Density of the age distribution for each severity level
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(
    data=data,
    x='age',
    hue='gravity',
    fill=True,
    hue_order=gravity_order,
    palette=custom_colors,
    ax=ax,
)
ax.set_title('Density of Age Distribution by Accident Gravity Category')
plt.show()

In [None]:
# 13. Maximum speed against age, one line per severity level
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(
    data=data,
    x='age',
    y='maximum_speed',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
    ax=ax,
)
ax.set_title('Age vs. Maximum Speed by Accident Gravity')
plt.show()

In [None]:
# 14. Accident counts per value of the 'motor' column, split by severity
fig, ax = plt.subplots(figsize=(18, 12))
sns.countplot(
    data=data,
    x='motor',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
    ax=ax,
)
ax.set(title='Severity vs. motor', xlabel='motor', ylabel='Count')
ax.legend(title='Severity')
plt.show()

# Percentage table of severity for each 'motor' value
calculate_and_display_percentages(data, 'motor')

In [None]:
# 15. Accident counts per initial impact point, split by severity
fig, ax = plt.subplots(figsize=(18, 12))
sns.countplot(
    data=data,
    x='initial_impact_point',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
    ax=ax,
)
ax.set(title='Severity vs. Impact point', xlabel='Impact point', ylabel='Count')
ax.legend(title='Severity')
plt.show()

# Percentage table of severity for each initial impact point
calculate_and_display_percentages(data, 'initial_impact_point')

In [None]:
# 16. Accident counts per value of 'safety_equipment1', split by severity
fig, ax = plt.subplots(figsize=(18, 12))
sns.countplot(
    data=data,
    x='safety_equipment1',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
    ax=ax,
)
ax.set(title='Severity vs. Safety equipment', xlabel='Safety equipment', ylabel='Count')
ax.legend(title='Severity')
plt.show()

# Percentage table of severity for each 'safety_equipment1' value
calculate_and_display_percentages(data, 'safety_equipment1')

In [None]:
# 17. Accident counts per value of the 'manv' column, split by severity
fig, ax = plt.subplots(figsize=(18, 12))
sns.countplot(
    data=data,
    x='manv',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
    ax=ax,
)
ax.set(title='Severity vs. manv', xlabel='manv', ylabel='Count')
ax.legend(title='Severity')
plt.show()

# Percentage table of severity for each 'manv' value
calculate_and_display_percentages(data, 'manv')

In [None]:
# 18. Accident counts per user category, split by severity
fig, ax = plt.subplots(figsize=(18, 12))
sns.countplot(
    data=data,
    x='user_category',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
    ax=ax,
)
ax.set(title='Severity vs. user_category', xlabel='user_category', ylabel='Count')
ax.legend(title='Severity')
plt.show()

# Percentage table of severity for each user category
calculate_and_display_percentages(data, 'user_category')

In [None]:
# 19. Accident counts per value of the 'seat' column, split by severity
fig, ax = plt.subplots(figsize=(18, 12))
sns.countplot(
    data=data,
    x='seat',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
    ax=ax,
)
ax.set(title='Severity vs. seat', xlabel='seat', ylabel='Count')
ax.legend(title='Severity')
plt.show()

# Percentage table of severity for each 'seat' value
calculate_and_display_percentages(data, 'seat')

In [None]:
# 20. Accidents Severity vs. geographical location
# 'gravity' holds text labels at this point, so the colouring is discrete: each
# label gets the colour it has in the other plots (color_discrete_map) and the
# legend follows gravity_order. A continuous colour scale would be ignored.
fig = px.scatter_mapbox(
    data,
    lat="lat",
    lon="long",
    color="gravity",
    color_discrete_map=custom_colors,
    category_orders={"gravity": gravity_order},
    custom_data=["gravity"],  # makes the severity label available to the hover template
    zoom=3,
    height=1000,
    width=2000,
    labels={"gravity": "Severity"}
)

# Update the layout for mapbox style; the map is centred on the mean coordinates
fig.update_layout(
    mapbox_style="open-street-map",
    mapbox_center={"lat": data["lat"].mean(), "lon": data["long"].mean()},
    title={'text': 'Accident Severity by Geographical Location', 'x': 0.5}
)

# Add hover data for more information on each point.
# The severity is read from customdata: %{marker.color} would show the trace's
# colour value rather than the severity label.
fig.update_traces(marker=dict(size=8),
                  hovertemplate="<br>".join([
                      "Latitude: %{lat}",
                      "Longitude: %{lon}",
                      "Severity: %{customdata[0]}"
                  ])
)

# Save the map as an HTML file
fig.write_html("accidents_severity_map.html")
