In [None]:
import os
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
# Standard-library date/time type.
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import scipy.cluster.hierarchy as shc
from sklearn.decomposition import PCA
from IPython.display import display, HTML
from scipy.cluster.hierarchy import dendrogram, linkage
# Plotting helpers that ship with pandas.
from pandas.plotting import andrews_curves, parallel_coordinates, lag_plot, autocorrelation_plot, radviz

In [None]:
# Read the accident table from a CSV path relative to the working directory.
data = pd.read_csv('source/data.csv')

In [None]:
# Identifier and coded columns that are cast to the generic object dtype
# (presumably so they are handled as labels rather than as quantities).
columns_to_convert = [
    'AccID', 'accident_situation', 'atm_condition', 'collision_type',
    'com_code', 'dep_code',
    'fixed_obstacle', 'gender', 'gravity', 'infra',
    'initial_impact_point', 'int', 'location',
    'longitudinal_profile', 'lum', 'manv', 'mobile_obstacle',
    'motor', 'num_veh_x', 'num_veh_y',
    'pedestrian_action', 'pedestrian_location', 'plan', 'reason_travel',
    'reserved_lane_code',
    'route_category', 'safety_equipment1', 'safety_equipment2',
    'safety_equipment3', 'seat',
    'surface_condition', 'time', 'traffic_direction', 'traffic_regime',
    'upstream_terminal_number',
    'user_category', 'vehicle_category', 'vehicleID_x', 'vehicleID_y',
]

data[columns_to_convert] = data[columns_to_convert].astype('object')

# Parse the 'HH:MM:SS' values and keep only the time-of-day component.
parsed_time = pd.to_datetime(data['time'], format='%H:%M:%S')
data['time'] = parsed_time.dt.time

<font size="6">  
    Exploratory analysis
</font> 

In [None]:
# Summary statistics of the frame (pandas picks the columns it can summarise by default).
data.describe()

In [None]:
# Column names, dtypes and non-null counts.
data.info()

In [None]:
# Preview of the first rows.
data.head()

In [None]:
# Number of rows that are exact repeats of an earlier row.
duplicate_count = data.duplicated().sum()
duplicate_count

In [None]:
# Missing-value count for every column.
nan_count_per_column = data.isna().sum()
# Display the per-column counts as the cell output.
nan_count_per_column

In [None]:
# Missing-value count over the whole frame (sum of the per-column counts).
total_nan_count = data.isna().sum().sum()
total_nan_count

<font size="6">  
    Visualizations
</font> 

In [None]:
# Use seaborn's dark grid theme for every figure drawn below.
# No figure is opened here: each plotting cell creates its own figure with an
# explicit figsize, so a figure created in this cell would only be rendered
# as an empty output.
sns.set_style("darkgrid")

In [None]:
# Silence FutureWarning messages raised from seaborn modules; all other warnings still show.
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn")

In [None]:
# Readable label for each gravity code. The leading digit keeps the labels
# in code order when they are sorted alphabetically.
gravity_labels = {
    1: '1 - Unharmed',
    2: '2 - Killed',
    3: '3 - Hospitalized',
    4: '4 - Slightly injured',
}
data['gravity'] = data['gravity'].replace(gravity_labels)

# One fixed colour per gravity label, shared by the plots below.
custom_colors = {
    '1 - Unharmed': '#66B2FF',
    '2 - Killed': '#FF9999',
    '3 - Hospitalized': '#FFCC99',
    '4 - Slightly injured': '#99FF99',
}

# Label order used for hue ordering in the plots.
gravity_order = sorted(custom_colors)

In [None]:
def calculate_and_display_percentages(data, group_by_col):
    """Render a table of gravity percentages within each value of a column.

    For every value of ``group_by_col`` the table shows the share of that
    value's rows that falls into each ``gravity`` category. Combinations that
    never occur are shown as 0. The styled table is displayed in the notebook
    output; nothing is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``group_by_col`` and a ``gravity`` column.
    group_by_col : str
        Name of the column whose values become the rows of the table.
    """
    # Long-form row counts for every (group value, gravity) pair.
    pair_counts = (
        data.groupby([group_by_col, 'gravity'])
        .size()
        .reset_index(name='count')
    )

    # Row counts per group value; the column names are set explicitly so the
    # merge key is always called `group_by_col`.
    group_totals = data[group_by_col].value_counts().reset_index(name='total')
    group_totals.columns = [group_by_col, 'total']

    merged = pair_counts.merge(group_totals, on=group_by_col)
    merged['percentage'] = merged['count'] / merged['total'] * 100

    # Wide form: one row per group value, one column per gravity category.
    percentage_table = merged.pivot(
        index=group_by_col, columns='gravity', values='percentage'
    ).fillna(0)

    header_style = {
        'selector': 'th',
        'props': [('font-size', '12pt'), ('font-weight', 'bold'), ('text-align', 'center')],
    }
    cell_style = {
        'selector': 'td',
        'props': [('font-size', '10pt'), ('text-align', 'center')],
    }
    caption_style = {
        'selector': 'caption',
        'props': [('caption-side', 'top')],
    }

    styled_table = percentage_table.style.format("{:.1f}%")
    styled_table = styled_table.set_table_styles([header_style, cell_style, caption_style])
    styled_table = styled_table.set_caption(
        f'Percentages of Accident Gravity by {group_by_col.capitalize()}'
    )
    # axis=None shades all cells on one common scale.
    styled_table = styled_table.background_gradient(cmap='Blues', axis=None)

    display(HTML(styled_table.to_html()))

In [None]:
# Rows per gravity label, most frequent first, and each label's share of all rows.
counts = data['gravity'].value_counts().sort_values(ascending=False)
total = counts.sum()
percentages = counts / total * 100

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=data, x='gravity', order=counts.index, palette=custom_colors, ax=ax)

# The bars are requested in the order of `counts.index`, which is also the
# order of `percentages`, so the two are paired by position.
for p, percentage in zip(ax.patches, percentages):
    bar_top_center = (p.get_x() + p.get_width() / 2., p.get_height())
    ax.annotate(
        f'{percentage:.1f}%',
        bar_top_center,
        ha='center',
        va='center',
        xytext=(0, 8),
        textcoords='offset points',
    )

ax.set_title('Number of accidents by Severity Category')
ax.set_xlabel('Severity')
ax.set_ylabel('Number of Accidents')
plt.show()

In [None]:
# Row counts per year, one bar per gravity label.
fig, ax = plt.subplots(figsize=(18, 12))
sns.countplot(
    data=data,
    x='year',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
    ax=ax,
)
ax.set_title('Severity vs. Year of Accident')
ax.set_xlabel('Year')
ax.set_ylabel('Count')
ax.legend(title='Severity')
plt.show()

# Companion table: share of each gravity label within every year.
calculate_and_display_percentages(data, 'year')

In [None]:
# Years present in the data, ascending, so the panels read chronologically.
years = np.sort(data['year'].dropna().unique())
n_years = len(years)

# squeeze=False always returns a 2-D array of axes, so the indexing below
# also works when the data covers a single year.
fig, axes = plt.subplots(1, n_years, figsize=(18, 6), squeeze=False)
axes = axes.ravel()
fig.suptitle('Severity Distribution per Year', fontsize=16)

# Positional colour list, kept in case another cell references it. The pies
# below do not use it; they look colours up by label instead.
custom_colors_mat = ['#66B2FF', '#99FF99', '#FFCC99', '#FF9999']

for i, year in enumerate(years):
    # Rows per gravity label for this year, most frequent first.
    yearly_data = data[data['year'] == year]['gravity'].value_counts()
    # Colour each wedge by its label so a label keeps the same colour in every
    # year and matches the other plots; grey marks a label with no entry in
    # custom_colors.
    colors = [custom_colors.get(label, '#CCCCCC') for label in yearly_data.index]
    axes[i].pie(yearly_data, labels=yearly_data.index, autopct='%1.1f%%', colors=colors)
    axes[i].set_title(f'Year {year}')

# Leave the top 4% of the figure free for the overall title.
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

In [None]:
# Row counts per month, one bar per gravity label.
fig, ax = plt.subplots(figsize=(18, 12))
sns.countplot(
    data=data,
    x='month',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
    ax=ax,
)
ax.set_title('Severity vs. Month')
ax.set_xlabel('Month')
ax.set_ylabel('Count')
ax.legend(title='Severity')
plt.show()

# Companion table: share of each gravity label within every month.
calculate_and_display_percentages(data, 'month')

In [None]:
# Rows per (day, gravity) pair in long form, ready for one line per gravity label.
aggregated_data = (
    data.groupby(['day', 'gravity'])
    .size()
    .reset_index(name='count')
)

fig, ax = plt.subplots(figsize=(18, 12))
sns.lineplot(
    data=aggregated_data,
    x='day',
    y='count',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
    ax=ax,
)
ax.set_title('Number of Accidents by Severity vs. Day')
ax.set_xlabel('Day')
ax.set_ylabel('Count')
ax.legend(title='Severity')
plt.show()

# Companion table: share of each gravity label within every day value.
calculate_and_display_percentages(data, 'day')

In [None]:
# Row counts per atmospheric-condition code, one bar per gravity label.
fig, ax = plt.subplots(figsize=(18, 12))
sns.countplot(
    data=data,
    x='atm_condition',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
    ax=ax,
)
ax.set_title('Severity vs. Weather Conditions')
ax.set_xlabel('Weather Condition')
ax.set_ylabel('Count')
ax.legend(title='Severity')
# Tilt the category labels so they do not overlap.
ax.tick_params(axis='x', labelrotation=45)
plt.show()

# Companion table: share of each gravity label within every condition code.
calculate_and_display_percentages(data, 'atm_condition')

In [None]:
# Row counts per lighting-condition code ('lum'), one bar per gravity label.
fig, ax = plt.subplots(figsize=(18, 12))
sns.countplot(
    data=data,
    x='lum',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
    ax=ax,
)
ax.set_title('Gravity vs. Lighting Conditions')
ax.set_xlabel('Lighting Condition')
ax.set_ylabel('Count')
ax.legend(title='Gravity')
# Tilt the category labels so they do not overlap.
ax.tick_params(axis='x', labelrotation=45)
plt.show()

# Companion table: share of each gravity label within every lighting code.
calculate_and_display_percentages(data, 'lum')

In [None]:
# Row counts per vehicle category, one bar per gravity label.
fig, ax = plt.subplots(figsize=(18, 12))
sns.countplot(
    data=data,
    x='vehicle_category',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
    ax=ax,
)
ax.set_title('Gravity vs. Vehicle Category')
ax.set_xlabel('Vehicle Category')
ax.set_ylabel('Count')
ax.legend(title='Gravity')
# Tilt the category labels so they do not overlap.
ax.tick_params(axis='x', labelrotation=45)
plt.show()

# Companion table: share of each gravity label within every vehicle category.
calculate_and_display_percentages(data, 'vehicle_category')

In [None]:
# Row counts per gender code, one bar per gravity label.
fig, ax = plt.subplots(figsize=(18, 12))
sns.countplot(
    data=data,
    x='gender',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
    ax=ax,
)
ax.set_title('Gravity vs. Gender')
ax.set_xlabel('Gender')
ax.set_ylabel('Count')
ax.legend(title='Gravity')
plt.show()

# Companion table: share of each gravity label within every gender code.
calculate_and_display_percentages(data, 'gender')

In [None]:
# Row counts per surface-condition code, one bar per gravity label.
fig, ax = plt.subplots(figsize=(18, 12))
sns.countplot(
    data=data,
    x='surface_condition',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
    ax=ax,
)
ax.set_title('Gravity vs. Surface Condition')
ax.set_xlabel('Surface Condition')
ax.set_ylabel('Count')
ax.legend(title='Gravity')
# Tilt the category labels so they do not overlap.
ax.tick_params(axis='x', labelrotation=45)
plt.show()

# Companion table: share of each gravity label within every surface code.
calculate_and_display_percentages(data, 'surface_condition')

In [None]:
# Left-closed bins: with right=False each interval is [low, high), so the
# edges must be the first value of the next label (50, 60, ...) for a speed
# of 49 to land in '0-49' and 59 in '50-59'.
bins = [0, 50, 60, 70, 80, 90, 100, 110, float('inf')]
# The last bin is open-ended, so it is labelled '110+' rather than '110-119'.
labels = ['0-49', '50-59', '60-69', '70-79', '80-89', '90-99', '100-109', '110+']

# Bucket the speed limit; values outside every bin (negative or missing) become NaN.
data['speed_group'] = pd.cut(data['maximum_speed'], bins=bins, labels=labels, right=False)

plt.figure(figsize=(10, 6))
sns.countplot(data=data, x='speed_group', hue='gravity', hue_order=gravity_order, palette=custom_colors)

plt.title('Gravity vs. Maximum Speed')
plt.xlabel('Maximum Speed Group')
# The bar height is a row count, not a gravity value.
plt.ylabel('Count')
plt.legend(title='Gravity')
plt.xticks(rotation=45)
plt.show()

# Tabulate the same groups that are plotted above, so table and chart agree.
calculate_and_display_percentages(data, 'speed_group')

In [None]:
plt.figure(figsize=(10, 6))
# One box per gravity label. The boxes sit on the x axis, so their order is
# set with `order`; `hue_order` has no effect when no `hue` is given.
sns.boxplot(data=data, x='gravity', y='age', order=gravity_order, palette=custom_colors)
plt.title('Age Distribution by Accident Gravity Category')
plt.show()

# Companion table: share of each gravity label within every age value.
calculate_and_display_percentages(data, 'age')

In [None]:
# Filled kernel-density curves of age, one curve per gravity label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(
    data=data,
    x='age',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
    fill=True,
    ax=ax,
)
ax.set_title('Density of Age Distribution by Accident Gravity Category')
plt.show()

In [None]:
# Maximum speed against age, one line per gravity label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(
    data=data,
    x='age',
    y='maximum_speed',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
    ax=ax,
)
ax.set_title('Age vs. Maximum Speed by Accident Gravity')
plt.show()