# ROAD ACCIDENTS

### Import required libraries

In [7]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt

### Read the road_accident data set

In [8]:
df = pd.read_csv(r'road_accident.csv') # you

In [9]:
df

Unnamed: 0,accident_index,accident_date,day_of_week,junction_control,junction_detail,accident_severity,light_conditions,local_authority,carriageway_hazards,number_of_casualties,number_of_vehicles,police_force,road_surface_conditions,road_type,speed_limit,time,urban_or_rural_area,weather_conditions,vehicle_type
0,BS0000001,01-01-2021,Thursday,Give way or uncontrolled,T or staggered junction,Serious,Daylight,Kensington and Chelsea,,1,2,Metropolitan Police,Dry,One way street,30,15:11,Urban,Fine no high winds,Car
1,BS0000002,05-01-2021,Monday,Give way or uncontrolled,Crossroads,Serious,Daylight,Kensington and Chelsea,,11,2,Metropolitan Police,Wet or damp,Single carriageway,30,10:59,Urban,Fine no high winds,Taxi/Private hire car
2,BS0000003,04-01-2021,Sunday,Give way or uncontrolled,T or staggered junction,Slight,Daylight,Kensington and Chelsea,,1,2,Metropolitan Police,Dry,Single carriageway,30,14:19,Urban,Fine no high winds,Taxi/Private hire car
3,BS0000004,05-01-2021,Monday,Auto traffic signal,T or staggered junction,Serious,Daylight,Kensington and Chelsea,,1,2,Metropolitan Police,Frost or ice,Single carriageway,30,8:10,Urban,Other,Motorcycle over 500cc
4,BS0000005,06-01-2021,Tuesday,Auto traffic signal,Crossroads,Serious,Darkness - lights lit,Kensington and Chelsea,,1,2,Metropolitan Police,Dry,Single carriageway,30,17:25,Urban,Fine no high winds,Car
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307968,BS0307969,18-02-2022,Thursday,Data missing or out of range,Not at junction or within 20 metres,Slight,Daylight,Highland,,2,1,Northern,Dry,Single carriageway,60,7:00,Rural,Fine no high winds,Car
307969,BS0307970,21-02-2022,Sunday,Data missing or out of range,Not at junction or within 20 metres,Slight,Darkness - no lighting,Highland,,1,1,Northern,Frost or ice,Single carriageway,60,3:00,Rural,Fine no high winds,Car
307970,BS0307971,23-02-2022,Tuesday,Give way or uncontrolled,T or staggered junction,Slight,Daylight,Highland,,1,3,Northern,Frost or ice,Single carriageway,30,9:38,Rural,Fine no high winds,Car
307971,BS0307972,23-02-2022,Tuesday,Give way or uncontrolled,T or staggered junction,Serious,Darkness - no lighting,Highland,,1,2,Northern,Wet or damp,Single carriageway,60,18:25,Rural,Fine no high winds,Motorcycle over 500cc


In [10]:
type(df)

pandas.core.frame.DataFrame

### Shape and Size

In [11]:
# rows and colums

df.shape

(307973, 19)

In [12]:
# size = rows * columns

df.size

5851487

### Data types of the columns

In [13]:
df.dtypes

accident_index             object
accident_date              object
day_of_week                object
junction_control           object
junction_detail            object
accident_severity          object
light_conditions           object
local_authority            object
carriageway_hazards        object
number_of_casualties        int64
number_of_vehicles          int64
police_force               object
road_surface_conditions    object
road_type                  object
speed_limit                 int64
time                       object
urban_or_rural_area        object
weather_conditions         object
vehicle_type               object
dtype: object

In [14]:
# the date is in object format
# change the object to datetime format

df['accident_date'] = pd.to_datetime(df['accident_date'],format='%d-%m-%Y')

In [15]:
# the time is in object format
# change the object to time format

df['time'] = pd.to_datetime(df['time']).dt.time

In [16]:
df['time']

0         15:11:00
1         10:59:00
2         14:19:00
3         08:10:00
4         17:25:00
            ...   
307968    07:00:00
307969    03:00:00
307970    09:38:00
307971    18:25:00
307972    15:45:00
Name: time, Length: 307973, dtype: object

### Data Preprocessing / cleaning

In [17]:
# duplicated() - used to identify duplicates in the dataset
# sum() - calculate the total of the values

df.duplicated().sum()

0

* There are no duplicates.

### Handling missing values.

In [18]:
df.isnull().sum()

accident_index                  0
accident_date                   0
day_of_week                     0
junction_control                0
junction_detail                 0
accident_severity               0
light_conditions                0
local_authority                 0
carriageway_hazards        302549
number_of_casualties            0
number_of_vehicles              0
police_force                    0
road_surface_conditions         0
road_type                       0
speed_limit                     0
time                           17
urban_or_rural_area             0
weather_conditions              0
vehicle_type                    0
dtype: int64

In [19]:
df['carriageway_hazards'].value_counts()

carriageway_hazards
Other object on road                               2243
Any animal in carriageway (except ridden horse)    1620
Pedestrian in carriageway - not injured             715
Previous accident                                   511
Vehicle load on road                                335
Name: count, dtype: int64

In [20]:
missing_carriageway_hazards = df['carriageway_hazards'].isnull().sum()
missing_carriageway_hazards_per = (missing_carriageway_hazards/df['accident_index'].nunique()) * 100
print(f"The missing values of carriageway_hazards are {round(missing_carriageway_hazards_per,2)}%")

The missing values of carriageway_hazards are 98.24%


* Nearly 98% of carriageway_hazards values are missing, so the row can be removed.

In [21]:
df.drop(['carriageway_hazards'],axis=1,inplace=True)

In [22]:
df['time'].isnull().sum()

17

In [23]:
time_mode = df['time'].mode()[0]

In [24]:
df['time'] = df['time'].fillna(time_mode)

In [25]:
df.isnull().sum()

accident_index             0
accident_date              0
day_of_week                0
junction_control           0
junction_detail            0
accident_severity          0
light_conditions           0
local_authority            0
number_of_casualties       0
number_of_vehicles         0
police_force               0
road_surface_conditions    0
road_type                  0
speed_limit                0
time                       0
urban_or_rural_area        0
weather_conditions         0
vehicle_type               0
dtype: int64

#### 1. CY_Casualties (Current Year Casualties)

In [26]:
df['year'] = pd.to_datetime(df['accident_date']).dt.year

In [33]:
current_year_casulaties = df[df['year']==2025]
current_year_casulaties

Unnamed: 0,accident_index,accident_date,day_of_week,junction_control,junction_detail,accident_severity,light_conditions,local_authority,number_of_casualties,number_of_vehicles,police_force,road_surface_conditions,road_type,speed_limit,time,urban_or_rural_area,weather_conditions,vehicle_type,year


#### 2. CY – Fatal Casualties - 2022

In [39]:
year2022_fatal_casulaties = df[(df['year']==2022) & (df['accident_severity']=='Fatal')]
year2022_fatal_casulaties

Unnamed: 0,accident_index,accident_date,day_of_week,junction_control,junction_detail,accident_severity,light_conditions,local_authority,number_of_casualties,number_of_vehicles,police_force,road_surface_conditions,road_type,speed_limit,time,urban_or_rural_area,weather_conditions,vehicle_type,year
164563,BS0164564,2022-08-17,Tuesday,Data missing or out of range,Not at junction or within 20 metres,Fatal,Daylight,City of London,1,1,City of London,Dry,Single carriageway,30,14:45:00,Urban,Fine no high winds,Car,2022
179443,BS0179444,2022-01-08,Friday,Give way or uncontrolled,T or staggered junction,Fatal,Daylight,Islington,1,2,Metropolitan Police,Frost or ice,Single carriageway,30,09:00:00,Urban,Fine no high winds,Car,2022
179455,BS0179456,2022-02-21,Sunday,Auto traffic signal,T or staggered junction,Fatal,Darkness - lights lit,Westminster,1,1,Metropolitan Police,Dry,Single carriageway,30,01:50:00,Urban,Fine no high winds,Car,2022
179467,BS0179468,2022-04-03,Saturday,Data missing or out of range,Not at junction or within 20 metres,Fatal,Daylight,Newham,1,1,Metropolitan Police,Dry,Dual carriageway,30,16:34:00,Urban,Fine no high winds,Car,2022
179475,BS0179476,2022-04-14,Wednesday,Data missing or out of range,Not at junction or within 20 metres,Fatal,Daylight,Westminster,1,2,Metropolitan Police,Dry,Single carriageway,30,13:00:00,Urban,Fine no high winds,Car,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307764,BS0307765,2022-01-29,Friday,Data missing or out of range,Not at junction or within 20 metres,Fatal,Daylight,Western Isles,1,1,Northern,Wet or damp,Single carriageway,40,14:58:00,Rural,Fine + high winds,Car,2022
307813,BS0307814,2022-02-02,Tuesday,Data missing or out of range,Not at junction or within 20 metres,Fatal,Daylight,Highland,1,1,Northern,Snow,Single carriageway,60,10:00:00,Rural,Snowing no high winds,Car,2022
307859,BS0307860,2022-08-15,Sunday,Data missing or out of range,Not at junction or within 20 metres,Fatal,Daylight,Highland,1,1,Northern,Dry,Single carriageway,60,16:15:00,Rural,Fine no high winds,Goods 7.5 tonnes mgw and over,2022
307875,BS0307876,2022-10-21,Thursday,Give way or uncontrolled,T or staggered junction,Fatal,Darkness - lights lit,Highland,1,1,Northern,Wet or damp,Single carriageway,30,20:53:00,Rural,Fine no high winds,Car,2022


#### 3. CY – Serious Casualties -2022

In [None]:
year2022_serious_casulaties = df[(df['year']==2022) & (df['accident_severity']=='Serious')]
year2022_serious_casulaties

#### 4. CY – Slight Casualties – 2022

In [None]:
year2022_serious_casulaties = df[(df['year']==2022) & (df['accident_severity']=='Slight')]
year2022_serious_casulaties

#### 5. Total Number of [Slight, Fatal, Serious] Casualties


In [None]:
df['accident_severity'].value_counts().to_frame()

#### 6. Percentage(%) of Accidents that got Severity – Slight

In [None]:
total_num_of_acc = len(df['accident_severity'])
num_of_slight_acc = len(df[df['accident_severity']=='Slight'])

per_slight_acc = (num_of_slight_acc / total_num_of_acc) * 100

In [None]:
round(per_slight_acc,2)

#### 7. Percentage(%) of Accidents that got Severity – Fatal

In [None]:
total_num_of_acc = len(df['accident_severity'])
num_of_fatal_acc = len(df[df['accident_severity']=='Fatal'])

per_fatal_acc = (num_of_fatal_acc / total_num_of_acc) * 100

In [None]:
round(per_fatal_acc,2)

#### 8. Percentage(%) of Accidents that got Severity – Serious

In [None]:
total_num_of_acc = len(df['accident_severity'])
num_of_serious_acc = len(df[df['accident_severity']=='Serious'])

per_serious_acc = (num_of_serious_acc / total_num_of_acc) * 100

In [None]:
round(per_serious_acc,2)

#### 9. Vehicle Group – Total Number of Casualties

In [None]:
df.groupby('vehicle_type')['number_of_casualties'].sum().sort_values(ascending=False).to_frame()

#### 10. CY – Casualties Monthly Trend

In [None]:
monthly_accidents = (df
                     .groupby(df['accident_date'].dt.month)
                     .size()
                     .reset_index(name='total_accidents')
                     .rename(columns={'accident_date': 'month'})
                    )
monthly_accidents

In [None]:
plt.figure(figsize=(12, 6))
plt.plot(monthly_accidents['month'], monthly_accidents['total_accidents'], 
         marker='o', label='2021', color='blue')


plt.title('Monthly Accidents Trends')
plt.xlabel('Month')
plt.ylabel('Total Accidents')
plt.xticks(ticks=range(1, 13), labels=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 
                                        'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.legend()
plt.grid()
plt.show()

In [None]:
df_2021 = df[df['year']==2021]
df_2022 = df[df['year']==2022]

In [None]:
monthly_accidents_2021 = (df_2021
                     .groupby(df['accident_date'].dt.month)
                     .size()
                     .reset_index(name='total_accidents')
                     .rename(columns={'accident_date': 'month'})
                    )
monthly_accidents_2021

In [None]:
monthly_accidents_2022 = (df_2022
                     .groupby(df['accident_date'].dt.month)
                     .size()
                     .reset_index(name='total_accidents')
                     .rename(columns={'accident_date': 'month'})
                    )
monthly_accidents_2022

In [None]:

plt.figure(figsize=(12, 6))
plt.plot(monthly_accidents_2021['month'], monthly_accidents_2021['total_accidents'], 
         marker='o', label='2021', color='blue')
plt.plot(monthly_accidents_2022['month'], monthly_accidents_2022['total_accidents'], 
         marker='o', label='2022', color='green')


plt.title('Monthly Accidents Trends (2021 vs 2022)')
plt.xlabel('Month')
plt.ylabel('Total Accidents')
plt.xticks(ticks=range(1, 13), labels=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 
                                        'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.legend()
plt.grid()
plt.show()


#### 11. Types of Road – Total Number of Casualties:

In [None]:
df.groupby('road_type')['number_of_casualties'].sum().sort_values(ascending=False).to_frame()

#### 12. Area – wise Percentage(%) and Total Number of Casualties

In [None]:
total_casualties_area = df.groupby('urban_or_rural_area')['number_of_casualties'].sum()
total_casualties_area

In [None]:
area_casualties = (
    df.groupby('urban_or_rural_area')['number_of_casualties']
    .sum()
    .reset_index()
    .assign(percentage=lambda x: round((x['number_of_casualties'] / x['number_of_casualties'].sum()) * 100, 2))
)
area_casualties

#### 13. Count of Casualties By Light Conditions

In [None]:
light_casualties = (
    df.groupby('light_conditions')['number_of_casualties']
    .sum()
    .sort_values(ascending = False))


light_casualties.to_frame()

#### 14. Percentage (%) and Segregation of Casualties by Different Light Conditions

In [None]:
light_casualties_per = (
    df.groupby('light_conditions')['number_of_casualties']
    .sum()
    .reset_index()
    .assign(percentage=lambda x: round((x['number_of_casualties'] / x['number_of_casualties'].sum()) * 100, 2))
    .sort_values(by = 'percentage', ascending = False)
)
light_casualties_per

#### 15. Top 10 Local Authority with Highest Total Number of Casualties

In [None]:
top10_authority = ( df.groupby('local_authority')['number_of_casualties']
                   .sum()
                   .to_frame()
                   .sort_values(by ='number_of_casualties', ascending=False)
                   )

top10_authority.head(10)