In [None]:
# Imports: numerical/data handling (numpy, pandas), plotting (matplotlib, seaborn,
# pandas.plotting), notebook display helpers (IPython), and scikit-learn / SciPy
# utilities for dimensionality reduction (PCA, t-SNE) and hierarchical clustering.
import os
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import scipy.cluster.hierarchy as shc
from sklearn.decomposition import PCA
from IPython.display import display, HTML
from scipy.cluster.hierarchy import dendrogram, linkage
from pandas.plotting import andrews_curves, parallel_coordinates, lag_plot, autocorrelation_plot, radviz

In [None]:
# Load the dataset from a CSV file into the `data` DataFrame.
# The path is relative, so it is resolved against the current working directory.
data = pd.read_csv('source/data.csv')

In [None]:
# Cast the listed identifier / coded columns to the generic `object` dtype,
# then turn the `time` column into `datetime.time` values.
columns_to_convert = [
    'AccID',
    'accident_situation', 'atm_condition',
    'collision_type', 'com_code',
    'dep_code',
    'fixed_obstacle',
    'gender', 'gravity',
    'infra', 'initial_impact_point', 'int',
    'location', 'longitudinal_profile', 'lum',
    'manv', 'mobile_obstacle', 'motor',
    'num_veh_x', 'num_veh_y',
    'pedestrian_action', 'pedestrian_location', 'plan',
    'reason_travel', 'reserved_lane_code', 'route_category',
    'safety_equipment1', 'safety_equipment2', 'safety_equipment3', 'seat',
    'surface_condition',
    'time', 'traffic_direction', 'traffic_regime',
    'upstream_terminal_number', 'user_category',
    'vehicle_category', 'vehicleID_x', 'vehicleID_y',
]

# Select all listed columns at once (a missing name raises before anything is
# modified) and write the object-typed copies back.
data[columns_to_convert] = data.loc[:, columns_to_convert].astype(object)

# Parse HH:MM:SS strings, then keep only the time-of-day component.
time_as_datetime = pd.to_datetime(data['time'], format='%H:%M:%S')
data['time'] = time_as_datetime.dt.time

<font size="6">  
    Exploratory analysis
</font> 

In [None]:
# Summary statistics produced by DataFrame.describe(); as the last expression
# of the cell, the resulting table is rendered as the cell output.
data.describe()

In [None]:
# Print a concise overview of the DataFrame: column names, non-null counts,
# dtypes and memory usage.
data.info()

In [None]:
# Preview the first rows (head() defaults to five) to check that the columns
# look as expected after loading and conversion.
data.head()

In [None]:
# Count rows that are exact duplicates of an earlier row (all columns equal);
# the first occurrence of each duplicated row is not counted.
duplicate_count = data.duplicated().sum()
duplicate_count

In [None]:
# Number of missing values in each column (one entry per column).
nan_count_per_column = data.isna().sum()
nan_count_per_column

In [None]:
# Total number of missing values in the whole DataFrame: the per-column
# counts are summed a second time to get a single number.
total_nan_count = data.isna().sum().sum()
total_nan_count

<font size="6">  
    Visualizations
</font> 

In [None]:
# Global plot styling for the visualisation section.
# The default figure size is set through rcParams rather than by calling
# plt.figure(): plt.figure() only creates one (empty) figure, which shows up
# as a blank output and does not affect figures created afterwards.
sns.set_style("darkgrid")
plt.rcParams['figure.figsize'] = (12, 8)

In [None]:
# Silence FutureWarning messages issued from modules whose name starts with
# "seaborn" so they do not clutter the plot outputs below.
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn")

In [None]:
# Replace the numeric gravity codes with readable labels, and define the
# colour assigned to each label plus a fixed label order for the plots below.
gravity_labels = {
    1: '1 - Unharmed',
    2: '2 - Killed',
    3: '3 - Hospitalized',
    4: '4 - Slightly injured',
}
# replace() leaves any value that is not a key of the mapping untouched.
data['gravity'] = data['gravity'].replace(gravity_labels)

custom_colors = dict([
    ('1 - Unharmed', '#66B2FF'),
    ('2 - Killed', '#FF9999'),
    ('3 - Hospitalized', '#FFCC99'),
    ('4 - Slightly injured', '#99FF99'),
])

# Labels start with their code, so alphabetical order equals code order.
gravity_order = sorted(custom_colors)

In [None]:
# Helper used by the plotting cells to show a percentage table under each chart.
def calculate_and_display_percentages(data, group_by_col):
    """Render a styled table of the gravity split (in %) per value of a column.

    Each row of the table is one value of ``group_by_col``; each column is one
    ``gravity`` value; a cell holds the share of that row's accidents having
    that gravity. Rows where either ``group_by_col`` or ``gravity`` is missing
    are left out of both the counts and the row totals, so every row adds up
    to 100%.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``group_by_col`` and a ``gravity`` column.
    group_by_col : str
        Name of the column whose values form the table rows.

    Returns
    -------
    None
        The table is displayed as HTML; nothing is returned, so calling this
        as the last statement of a cell does not produce a second output.

    Raises
    ------
    KeyError
        If ``group_by_col`` or ``gravity`` is not a column of ``data``.
    """
    # Row-normalised contingency table (fractions per row), scaled to percent.
    # Combinations that never occur are 0 rather than missing.
    percentage_table = pd.crosstab(
        data[group_by_col], data['gravity'], normalize='index'
    ) * 100

    header_style = {
        'selector': 'th',
        'props': [('font-size', '12pt'), ('font-weight', 'bold'), ('text-align', 'center')],
    }
    cell_style = {
        'selector': 'td',
        'props': [('font-size', '10pt'), ('text-align', 'center')],
    }
    caption_style = {'selector': 'caption', 'props': [('caption-side', 'top')]}

    styled_table = (
        percentage_table.style
        .format("{:.1f}%")
        .set_table_styles([header_style, cell_style, caption_style])
        .set_caption(f'Percentages of Accident Gravity by {group_by_col.capitalize()}')
        # axis=None: one colour scale across the whole table, not per row/column.
        .background_gradient(cmap='Blues', axis=None)
    )

    display(HTML(styled_table.to_html()))

In [None]:
# Overall number of accidents per severity category, most frequent first,
# with each bar annotated with its share of all accidents.
counts = data['gravity'].value_counts().sort_values(ascending=False)

total = counts.sum()
percentages = counts / total * 100

plt.figure(figsize=(10, 6))
ax = sns.countplot(data=data, x='gravity', order=counts.index, palette=custom_colors)

# Bars and percentages are paired by position; this assumes ax.patches follows
# the category order passed via `order`.
for bar, share in zip(ax.patches, percentages):
    bar_top_center = (bar.get_x() + bar.get_width() / 2., bar.get_height())
    ax.annotate(
        f'{share:.1f}%',
        bar_top_center,
        ha='center',
        va='center',
        xytext=(0, 8),  # shift the text 8 points above the bar top
        textcoords='offset points',
    )

ax.set_title('Number of accidents by Severity Category')
ax.set_xlabel('Severity')
ax.set_ylabel('Number of Accidents')
plt.show()

In [None]:
# Accident counts per year, split by severity, followed by the matching
# percentage table.
plt.figure(figsize=(18, 12))
year_ax = sns.countplot(
    data=data,
    x='year',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
)
year_ax.set_title('Severity vs. Year of Accident')
year_ax.set_xlabel('Year')
year_ax.set_ylabel('Count')
year_ax.legend(title='Severity')
plt.show()

calculate_and_display_percentages(data, 'year')

In [None]:
# One pie chart per year showing how that year's accidents split by severity.
# Years are sorted so the panels read chronologically; missing years are skipped.
years = np.sort(data['year'].dropna().unique())
n_years = len(years)

# squeeze=False always yields a 2-D array of Axes, so axes[i] also works when
# the data contains a single year.
fig, axes = plt.subplots(1, n_years, figsize=(18, 6), squeeze=False)
axes = axes.ravel()
fig.suptitle('Severity Distribution per Year', fontsize=16)

# Positional palette from the original version of this cell. It is no longer
# used for the pies (colours are looked up per label below); retained unchanged
# in case cells outside this section reference it.
custom_colors_mat = ['#66B2FF', '#99FF99', '#FFCC99', '#FF9999']

for i, year in enumerate(years):
    yearly_data = data[data['year'] == year]['gravity'].value_counts()
    # Look the colour up by label so every severity keeps the same colour as in
    # the other charts, whatever its frequency rank in this year. Labels that
    # are not in custom_colors fall back to grey.
    colors = [custom_colors.get(label, '#CCCCCC') for label in yearly_data.index]
    axes[i].pie(yearly_data, labels=yearly_data.index, autopct='%1.1f%%', colors=colors)
    axes[i].set_title(f'Year {year}')

# Leave room at the top of the figure for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

In [None]:
# Accident counts per month, split by severity, followed by the matching
# percentage table.
plt.figure(figsize=(18, 12))
month_ax = sns.countplot(
    data=data,
    x='month',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
)
month_ax.set_title('Severity vs. Month')
month_ax.set_xlabel('Month')
month_ax.set_ylabel('Count')
month_ax.legend(title='Severity')
plt.show()

calculate_and_display_percentages(data, 'month')

In [None]:
# Number of accidents per `day` value and severity, drawn as one line per
# severity, followed by the matching percentage table.
aggregated_data = (
    data.groupby(['day', 'gravity'])
    .size()
    .rename('count')
    .reset_index()
)

plt.figure(figsize=(18, 12))
day_ax = sns.lineplot(
    data=aggregated_data,
    x='day',
    y='count',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
)
day_ax.set_title('Number of Accidents by Severity vs. Day')
day_ax.set_xlabel('Day')
day_ax.set_ylabel('Count')
day_ax.legend(title='Severity')
plt.show()

calculate_and_display_percentages(data, 'day')

In [None]:
# Accident counts per weather condition (`atm_condition`), split by severity,
# followed by the matching percentage table.
plt.figure(figsize=(18, 12))
weather_ax = sns.countplot(
    data=data,
    x='atm_condition',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
)
weather_ax.set_title('Severity vs. Weather Conditions')
weather_ax.set_xlabel('Weather Condition')
weather_ax.set_ylabel('Count')
weather_ax.legend(title='Severity')
plt.xticks(rotation=45)  # tilt the category labels
plt.show()

calculate_and_display_percentages(data, 'atm_condition')

In [None]:
# Accident counts per lighting condition (`lum`), split by gravity, followed
# by the matching percentage table.
plt.figure(figsize=(18, 12))
lighting_ax = sns.countplot(
    data=data,
    x='lum',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
)
lighting_ax.set_title('Gravity vs. Lighting Conditions')
lighting_ax.set_xlabel('Lighting Condition')
lighting_ax.set_ylabel('Count')
lighting_ax.legend(title='Gravity')
plt.xticks(rotation=45)  # tilt the category labels
plt.show()

calculate_and_display_percentages(data, 'lum')

In [None]:
# Accident counts per vehicle category, split by gravity, followed by the
# matching percentage table.
plt.figure(figsize=(18, 12))
vehicle_ax = sns.countplot(
    data=data,
    x='vehicle_category',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
)
vehicle_ax.set_title('Gravity vs. Vehicle Category')
vehicle_ax.set_xlabel('Vehicle Category')
vehicle_ax.set_ylabel('Count')
vehicle_ax.legend(title='Gravity')
plt.xticks(rotation=45)  # tilt the category labels
plt.show()

calculate_and_display_percentages(data, 'vehicle_category')

In [None]:
# Accident counts per gender, split by gravity, followed by the matching
# percentage table.
plt.figure(figsize=(18, 12))
gender_ax = sns.countplot(
    data=data,
    x='gender',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
)
gender_ax.set_title('Gravity vs. Gender')
gender_ax.set_xlabel('Gender')
gender_ax.set_ylabel('Count')
gender_ax.legend(title='Gravity')
plt.show()

calculate_and_display_percentages(data, 'gender')

In [None]:
# Accident counts per road surface condition, split by gravity, followed by
# the matching percentage table.
plt.figure(figsize=(18, 12))
surface_ax = sns.countplot(
    data=data,
    x='surface_condition',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
)
surface_ax.set_title('Gravity vs. Surface Condition')
surface_ax.set_xlabel('Surface Condition')
surface_ax.set_ylabel('Count')
surface_ax.legend(title='Gravity')
plt.xticks(rotation=45)  # tilt the category labels
plt.show()

calculate_and_display_percentages(data, 'surface_condition')

In [None]:
# Bin `maximum_speed` into ranges and plot accident counts per range, split by
# gravity.
# With right=False every bin is [lower, upper), so the edges must be 50, 60, ...
# for the label '50-59' to cover exactly 50..59. The last bin is open-ended and
# is labelled accordingly. Values below 0 fall outside all bins and become NaN.
bins = [0, 50, 60, 70, 80, 90, 100, 110, float('inf')]
labels = ['0-49', '50-59', '60-69', '70-79', '80-89', '90-99', '100-109', '110+']

data['speed_group'] = pd.cut(data['maximum_speed'], bins=bins, labels=labels, right=False)

plt.figure(figsize=(10, 6))
sns.countplot(data=data, x='speed_group', hue='gravity', hue_order=gravity_order, palette=custom_colors)

plt.title('Gravity vs. Maximum Speed')
plt.xlabel('Maximum Speed Group')
plt.ylabel('Count')  # bar heights are accident counts
plt.legend(title='Gravity')
plt.xticks(rotation=45)
plt.show()

# NOTE(review): the table is grouped by the raw `maximum_speed` values while
# the chart above uses `speed_group` — confirm which grouping is intended.
calculate_and_display_percentages(data, 'maximum_speed')

In [None]:
# Age distribution within each gravity category, followed by the gravity
# percentage table per individual age value.
plt.figure(figsize=(10, 6))
# `order` (not `hue_order`) controls the order of the boxes along the x axis;
# no `hue` is assigned here, so `hue_order` would not order anything.
sns.boxplot(data=data, x='gravity', y='age', order=gravity_order, palette=custom_colors)
plt.title('Age Distribution by Accident Gravity Category')
plt.xlabel('Gravity')
plt.ylabel('Age')
plt.show()

calculate_and_display_percentages(data, 'age')

In [None]:
# Kernel density estimate of age, one filled curve per gravity category.
plt.figure(figsize=(10, 6))
kde_ax = sns.kdeplot(
    data=data,
    x='age',
    hue='gravity',
    fill=True,
    hue_order=gravity_order,
    palette=custom_colors,
)
kde_ax.set_title('Density of Age Distribution by Accident Gravity Category')
plt.show()

In [None]:
# Line plot of `maximum_speed` against `age`, one line per gravity category.
plt.figure(figsize=(10, 6))
age_speed_ax = sns.lineplot(
    data=data,
    x='age',
    y='maximum_speed',
    hue='gravity',
    hue_order=gravity_order,
    palette=custom_colors,
)
age_speed_ax.set_title('Age vs. Maximum Speed by Accident Gravity')
plt.show()