# EDA: Handle missing and Unknown Values

In [1]:
import pandas as pd
import numpy as np

In [2]:
df_collision = pd.read_csv("../0_preprocessing/data/collisions3.csv")
df_collision.head().T

Unnamed: 0,0,1,2,3,4
police_force,metropolitan_police,metropolitan_police,metropolitan_police,metropolitan_police,metropolitan_police
number_of_vehicles,1,3,2,2,2
day_of_week,sunday,sunday,sunday,sunday,sunday
time,01:00,02:00,04:00,02:00,02:00
first_road_class,c,unclassified,a,a,a
road_type,one_way_street,single_carriageway,roundabout,single_carriageway,single_carriageway
speed_limit,20,30,30,30,30
junction_detail,other_junction,t_or_staggered_junction,roundabout,t_or_staggered_junction,private_drive_or_entrance
pedestrian_crossing_human_control,control_by_other_authorised_person,none_within_50_metres,none_within_50_metres,none_within_50_metres,none_within_50_metres
pedestrian_crossing_physical_facilities,pedestrian_phase_at_traffic_signal_junction,zebra,no_physical_crossing_facilities_within_50_metres,no_physical_crossing_facilities_within_50_metres,no_physical_crossing_facilities_within_50_metres


In [3]:
df_collision.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71480 entries, 0 to 71479
Data columns (total 20 columns):
 #   Column                                   Non-Null Count  Dtype 
---  ------                                   --------------  ----- 
 0   police_force                             71480 non-null  object
 1   number_of_vehicles                       71480 non-null  int64 
 2   day_of_week                              71480 non-null  object
 3   time                                     71480 non-null  object
 4   first_road_class                         71480 non-null  object
 5   road_type                                71480 non-null  object
 6   speed_limit                              71480 non-null  int64 
 7   junction_detail                          71480 non-null  object
 8   pedestrian_crossing_human_control        71480 non-null  object
 9   pedestrian_crossing_physical_facilities  71480 non-null  object
 10  light_conditions                         71480 non-null  o

## Missing Values

In this dataset, missing numerical values are encoded as `-1`, whereas missing categorical values are encoded as `data_missing_or_out_of_range`.


In [4]:
def evaluate_missing_values(df=None) -> pd.DataFrame:
    """Count the sentinel-encoded missing values in every column of a frame.

    A cell counts as missing when it equals ``-1`` or the string
    ``"data_missing_or_out_of_range"``. Real ``NaN`` cells are not counted.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``df_collision``
        is used, so existing ``evaluate_missing_values()`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per input column with the fields ``column_name``,
        ``number_missing`` and ``percentage``. ``percentage`` is the fraction
        of rows (0-1) rounded to two decimals, and 0.0 for an empty frame.
    """
    if df is None:
        df = df_collision

    total_rows = len(df)
    number_categorised_as_missing = []

    for col in df.columns:
        # Summing the boolean mask avoids materialising a filtered copy of the frame.
        is_missing = (df[col] == "data_missing_or_out_of_range") | (df[col] == -1)
        num_missing = int(is_missing.sum())
        # Guard against ZeroDivisionError when the frame has no rows.
        percentage = round(num_missing / total_rows, 2) if total_rows else 0.0
        number_categorised_as_missing.append((col, num_missing, percentage))

    return pd.DataFrame(
        number_categorised_as_missing,
        columns=["column_name", "number_missing", "percentage"],
    )

In [5]:
evaluate_missing_values()

Unnamed: 0,column_name,number_missing,percentage
0,police_force,0,0.0
1,number_of_vehicles,0,0.0
2,day_of_week,0,0.0
3,time,0,0.0
4,first_road_class,0,0.0
5,road_type,0,0.0
6,speed_limit,0,0.0
7,junction_detail,0,0.0
8,pedestrian_crossing_human_control,1976,0.03
9,pedestrian_crossing_physical_facilities,1976,0.03


### 1. Trunk Road Flag

The category with the most missing values is `trunk_road_flag`

`trunk_road_flag` identifies whether a road is managed by Highways England. Highways England are a government company in charge of operating and maintaining motorways and major A roads. 

**Strategy: Impute values**

We know that Highways England does not manage B, C or unclassified roads so where we have missing data, we can categorise these as "non_trunk"

Looking at the classification of the other first_road_classes, we see that

- over 80% of the A roads are non trunk so we can classify the missing values as non_trunk
- over 70% of the A(M) roads are trunk so we can classify the missing values as trunk
- almost 90% of the motorway roads are classified as trunk so we can classify the missing values as trunk

In [6]:
df_collision.groupby("first_road_class")["trunk_road_flag"].value_counts(normalize=True).to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
first_road_class,trunk_road_flag,Unnamed: 2_level_1
a,non_trunk,0.812902
a,data_missing_or_out_of_range,0.095464
a,trunk_(roads_managed_by_highways_england),0.091633
a(m),trunk_(roads_managed_by_highways_england),0.744898
a(m),non_trunk,0.153061
a(m),data_missing_or_out_of_range,0.102041
b,non_trunk,0.891776
b,data_missing_or_out_of_range,0.108224
c,non_trunk,0.993204
c,data_missing_or_out_of_range,0.006796


In [7]:
# Rows whose trunk flag was never recorded, evaluated once before any value is overwritten.
trunk_flag_is_missing = df_collision["trunk_road_flag"].eq("data_missing_or_out_of_range")
road_class = df_collision["first_road_class"]

# B, C and unclassified roads: every row is selected, not only the missing ones.
non_major_roads_filter = road_class.isin(["b", "c", "unclassified"])
# A roads: only the rows with a missing flag.
a_roads_filter = road_class.eq("a") & trunk_flag_is_missing
# A(M) roads and motorways: only the rows with a missing flag.
motorways_and_big_a_roads_filter = road_class.isin(["a(m)", "motorway"]) & trunk_flag_is_missing

df_collision.loc[non_major_roads_filter | a_roads_filter, "trunk_road_flag"] = "non_trunk"
df_collision.loc[
    motorways_and_big_a_roads_filter, "trunk_road_flag"
] = "trunk_(roads_managed_by_highways_england)"

# Show the per-road-class split after imputation.
df_collision.groupby("first_road_class")["trunk_road_flag"].value_counts(normalize=True).to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
first_road_class,trunk_road_flag,Unnamed: 2_level_1
a,non_trunk,0.908367
a,trunk_(roads_managed_by_highways_england),0.091633
a(m),trunk_(roads_managed_by_highways_england),0.846939
a(m),non_trunk,0.153061
b,non_trunk,1.0
c,non_trunk,1.0
motorway,trunk_(roads_managed_by_highways_england),0.986971
motorway,non_trunk,0.013029
unclassified,non_trunk,1.0


In [8]:
df_collision["trunk_road_flag"].value_counts(normalize=True)

trunk_road_flag
non_trunk                                    0.921936
trunk_(roads_managed_by_highways_england)    0.078064
Name: proportion, dtype: float64

### Create is_trunk flag from `trunk_road_flag`

`trunk_road_flag` now has only two values, so we can encode it as 0/1 and rename it to `is_trunk` to be more readable.

In [9]:
# Encode the two remaining labels as 1 (trunk) / 0 (non-trunk).
# Assigning integers into the original string column would leave it with
# object dtype; mapping and casting gives a real integer column.
# Any label outside the mapping becomes NaN and makes the cast raise.
trunk_flag_codes = {
    "trunk_(roads_managed_by_highways_england)": 1,
    "non_trunk": 0,
}
df_collision["trunk_road_flag"] = df_collision["trunk_road_flag"].map(trunk_flag_codes).astype(int)

df_collision["trunk_road_flag"].value_counts(normalize=True)

trunk_road_flag
0    0.921936
1    0.078064
Name: proportion, dtype: float64

In [10]:
# pop() removes the old column and returns it; assigning it back under the
# new name appends `is_trunk` as the last column.
df_collision["is_trunk"] = df_collision.pop("trunk_road_flag")

In [11]:
evaluate_missing_values()

Unnamed: 0,column_name,number_missing,percentage
0,police_force,0,0.0
1,number_of_vehicles,0,0.0
2,day_of_week,0,0.0
3,time,0,0.0
4,first_road_class,0,0.0
5,road_type,0,0.0
6,speed_limit,0,0.0
7,junction_detail,0,0.0
8,pedestrian_crossing_human_control,1976,0.03
9,pedestrian_crossing_physical_facilities,1976,0.03


### 2. Pedestrian crossing

The columns with the next most missing values are the two that describe pedestrian crossings.

`pedestrian_crossing_human_control`: this field captures if there was a pedestrian crossing facilitated by some kind of official within 50 metres of the accident e.g. "lollipop person" (school crossing patrol) or another "authorised person" (police officer or a traffic warden in uniform). In about 95% of records there was no pedestrian crossing operated by a human.

`pedestrian_crossing_physical_facilities`: this field captures if there was any (non human facilitated) pedestrian crossing within 50 metres of the accident. This could be a zebra crossing, a foot bridge, a space in the middle of the road (central refuge), a crossing at a traffic light (toucan, pelican, puffin). Note that the crossings at traffic lights must contain a specific indicator light (a "green man") and time for pedestrians to cross. Not all traffic lights have a pedestrian crossing so this field does not mean traffic lights were nearby. 

**Strategy: Drop rows with missing values**

Since we have a large dataset, as long as we do not make the `police_force` category incomplete in removing these rows, it should not be an issue.

In [12]:
df_collision["pedestrian_crossing_human_control"].value_counts(normalize=True)

pedestrian_crossing_human_control
none_within_50_metres                 0.952840
data_missing_or_out_of_range          0.027644
control_by_other_authorised_person    0.015403
control_by_school_crossing_patrol     0.004113
Name: proportion, dtype: float64

In [13]:
df_collision["pedestrian_crossing_physical_facilities"].value_counts(normalize=True)

pedestrian_crossing_physical_facilities
no_physical_crossing_facilities_within_50_metres                             0.774217
pedestrian_phase_at_traffic_signal_junction                                  0.072034
pelican,_puffin,_toucan_or_similar_non_junction_pedestrian_light_crossing    0.060982
zebra                                                                        0.036738
data_missing_or_out_of_range                                                 0.027644
central_refuge                                                               0.026105
footbridge_or_subway                                                         0.002280
Name: proportion, dtype: float64

In [14]:
missing_label = "data_missing_or_out_of_range"
filter_missing_pedestrian_human_control = df_collision["pedestrian_crossing_human_control"].eq(missing_label)
filter_missing_pedestrian_physical_control = df_collision["pedestrian_crossing_physical_facilities"].eq(missing_label)

# (rows where both pedestrian fields are recorded, total rows)
rows_fully_recorded = ~(filter_missing_pedestrian_human_control | filter_missing_pedestrian_physical_control)
int(rows_fully_recorded.sum()), len(df_collision)

(69503, 71480)

In [15]:
# Keep only rows where neither pedestrian-crossing field is missing.
# .copy() makes the result an independent frame that later cells can add columns to.
rows_to_discard = filter_missing_pedestrian_human_control | filter_missing_pedestrian_physical_control
df_collision = df_collision.loc[~rows_to_discard].copy()
len(df_collision)

69503

In [16]:
evaluate_missing_values()

Unnamed: 0,column_name,number_missing,percentage
0,police_force,0,0.0
1,number_of_vehicles,0,0.0
2,day_of_week,0,0.0
3,time,0,0.0
4,first_road_class,0,0.0
5,road_type,0,0.0
6,speed_limit,0,0.0
7,junction_detail,0,0.0
8,pedestrian_crossing_human_control,0,0.0
9,pedestrian_crossing_physical_facilities,0,0.0


### Convert pedestrian_crossing into simpler variable

We saw that the majority of collisions did not occur near a pedestrian crossing: rather than having a set of categories for a small subset of data, it might be more useful initially to have a simple `is_near_pedestrian_crossing` flag.

We can then drop `pedestrian_crossing_human_control` and `pedestrian_crossing_physical_facilities`

In [17]:
df_collision["pedestrian_crossing_human_control"].value_counts()

pedestrian_crossing_human_control
none_within_50_metres                 68108
control_by_other_authorised_person     1101
control_by_school_crossing_patrol       294
Name: count, dtype: int64

In [18]:
# True where some human-controlled crossing was recorded within 50 metres.
is_human_control_pc = df_collision["pedestrian_crossing_human_control"].ne("none_within_50_metres")
df_collision.loc[is_human_control_pc, "pedestrian_crossing_human_control"].value_counts()

pedestrian_crossing_human_control
control_by_other_authorised_person    1101
control_by_school_crossing_patrol      294
Name: count, dtype: int64

In [19]:
df_collision["pedestrian_crossing_physical_facilities"].value_counts()

pedestrian_crossing_physical_facilities
no_physical_crossing_facilities_within_50_metres                             55340
pedestrian_phase_at_traffic_signal_junction                                   5149
pelican,_puffin,_toucan_or_similar_non_junction_pedestrian_light_crossing     4359
zebra                                                                         2626
central_refuge                                                                1866
footbridge_or_subway                                                           163
Name: count, dtype: int64

In [20]:
# True where some physical crossing facility was recorded within 50 metres.
is_physical_facilities = df_collision["pedestrian_crossing_physical_facilities"].ne(
    "no_physical_crossing_facilities_within_50_metres"
)
df_collision.loc[is_physical_facilities, "pedestrian_crossing_physical_facilities"].value_counts()

pedestrian_crossing_physical_facilities
pedestrian_phase_at_traffic_signal_junction                                  5149
pelican,_puffin,_toucan_or_similar_non_junction_pedestrian_light_crossing    4359
zebra                                                                        2626
central_refuge                                                               1866
footbridge_or_subway                                                          163
Name: count, dtype: int64

In [21]:
# A collision is near a crossing if either kind of crossing was recorded.
is_near_pedestrian_crossing_filter = is_human_control_pc | is_physical_facilities
int(is_near_pedestrian_crossing_filter.sum())

14434

In [22]:
# Store the boolean filter as a 1/0 flag column.
df_collision["is_near_pedestrian_crossing"] = is_near_pedestrian_crossing_filter.astype(int)

# Inspect the flagged rows next to the two source columns.
pedestrian_columns = [
    "pedestrian_crossing_human_control",
    "pedestrian_crossing_physical_facilities",
    "is_near_pedestrian_crossing",
]
df_collision.loc[is_near_pedestrian_crossing_filter, pedestrian_columns]

Unnamed: 0,pedestrian_crossing_human_control,pedestrian_crossing_physical_facilities,is_near_pedestrian_crossing
0,control_by_other_authorised_person,pedestrian_phase_at_traffic_signal_junction,1
1,none_within_50_metres,zebra,1
5,control_by_other_authorised_person,pedestrian_phase_at_traffic_signal_junction,1
8,none_within_50_metres,"pelican,_puffin,_toucan_or_similar_non_junctio...",1
9,control_by_school_crossing_patrol,"pelican,_puffin,_toucan_or_similar_non_junctio...",1
...,...,...,...
71444,none_within_50_metres,pedestrian_phase_at_traffic_signal_junction,1
71450,control_by_school_crossing_patrol,no_physical_crossing_facilities_within_50_metres,1
71457,none_within_50_metres,"pelican,_puffin,_toucan_or_similar_non_junctio...",1
71459,none_within_50_metres,"pelican,_puffin,_toucan_or_similar_non_junctio...",1


In [23]:
df_collision[~is_near_pedestrian_crossing_filter][["pedestrian_crossing_human_control","pedestrian_crossing_physical_facilities","is_near_pedestrian_crossing"]]

Unnamed: 0,pedestrian_crossing_human_control,pedestrian_crossing_physical_facilities,is_near_pedestrian_crossing
2,none_within_50_metres,no_physical_crossing_facilities_within_50_metres,0
3,none_within_50_metres,no_physical_crossing_facilities_within_50_metres,0
4,none_within_50_metres,no_physical_crossing_facilities_within_50_metres,0
6,none_within_50_metres,no_physical_crossing_facilities_within_50_metres,0
7,none_within_50_metres,no_physical_crossing_facilities_within_50_metres,0
...,...,...,...
71467,none_within_50_metres,no_physical_crossing_facilities_within_50_metres,0
71470,none_within_50_metres,no_physical_crossing_facilities_within_50_metres,0
71471,none_within_50_metres,no_physical_crossing_facilities_within_50_metres,0
71475,none_within_50_metres,no_physical_crossing_facilities_within_50_metres,0


In [24]:
df_collision["is_near_pedestrian_crossing"].value_counts()

is_near_pedestrian_crossing
0    55069
1    14434
Name: count, dtype: int64

In [25]:
# The two detailed columns are replaced by `is_near_pedestrian_crossing`.
df_collision = df_collision.drop(
    columns=["pedestrian_crossing_physical_facilities", "pedestrian_crossing_human_control"]
)

In [26]:
df_collision.head().T

Unnamed: 0,0,1,2,3,4
police_force,metropolitan_police,metropolitan_police,metropolitan_police,metropolitan_police,metropolitan_police
number_of_vehicles,1,3,2,2,2
day_of_week,sunday,sunday,sunday,sunday,sunday
time,01:00,02:00,04:00,02:00,02:00
first_road_class,c,unclassified,a,a,a
road_type,one_way_street,single_carriageway,roundabout,single_carriageway,single_carriageway
speed_limit,20,30,30,30,30
junction_detail,other_junction,t_or_staggered_junction,roundabout,t_or_staggered_junction,private_drive_or_entrance
light_conditions,darkness___lights_lit,darkness___lights_lit,darkness___lights_lit,darkness___lights_lit,darkness___lights_lit
weather_conditions,other,fine_no_high_winds,fine_no_high_winds,unknown,fine_no_high_winds


In [27]:
evaluate_missing_values()

Unnamed: 0,column_name,number_missing,percentage
0,police_force,0,0.0
1,number_of_vehicles,0,0.0
2,day_of_week,0,0.0
3,time,0,0.0
4,first_road_class,0,0.0
5,road_type,0,0.0
6,speed_limit,0,0.0
7,junction_detail,0,0.0
8,light_conditions,0,0.0
9,weather_conditions,0,0.0


### 3. Urban or rural area


`urban_or_rural_area`: this indicates whether the accident occurred in an urban area or rural area. There is not a lot of detail regarding this classification. We assume it is based on an [ONS classification](https://www.ons.gov.uk/methodology/geography/geographicalproducts/ruralurbanclassifications/2011ruralurbanclassification): "OAs are treated as ‘urban’ if they were allocated to a 2011 built-up area with a population of 10,000 people or more, while all remaining OAs are classed as ‘rural’."

**Strategy: Drop rows with missing values**

Since we have a large dataset, as long as we do not make the `police_force` category incomplete in removing these rows, it should not be an issue to drop rows with values "Data missing or out of range".

There are 3 rows with unallocated data. Since such a small percentage of the total data has this value, we will drop these rows too.

In [28]:
df_collision["urban_or_rural_area"].value_counts(normalize=True)

urban_or_rural_area
urban                           0.610564
rural                           0.389293
data_missing_or_out_of_range    0.000101
unallocated                     0.000043
Name: proportion, dtype: float64

In [29]:
urban_rural = df_collision["urban_or_rural_area"]
filter_missing_urban_rural = urban_rural.eq("data_missing_or_out_of_range")
filter_unallocated_urban_rural = urban_rural.eq("unallocated")

# Keep the rows that are neither missing nor unallocated.
rows_without_area = filter_missing_urban_rural | filter_unallocated_urban_rural
df_collision = df_collision.loc[~rows_without_area].copy()

In [30]:
evaluate_missing_values()

Unnamed: 0,column_name,number_missing,percentage
0,police_force,0,0.0
1,number_of_vehicles,0,0.0
2,day_of_week,0,0.0
3,time,0,0.0
4,first_road_class,0,0.0
5,road_type,0,0.0
6,speed_limit,0,0.0
7,junction_detail,0,0.0
8,light_conditions,0,0.0
9,weather_conditions,0,0.0


### Change `urban_or_rural_area` into `is_urban`


In [31]:
df_collision["urban_or_rural_area"].value_counts()

urban_or_rural_area
urban    42436
rural    27057
Name: count, dtype: int64

In [32]:
# 1 for urban, 0 for every other value.
df_collision["is_urban"] = df_collision["urban_or_rural_area"].eq("urban").astype(int)
df_collision.loc[:, ["urban_or_rural_area", "is_urban"]].tail()

Unnamed: 0,urban_or_rural_area,is_urban
71467,urban,1
71470,urban,1
71471,rural,0
71475,urban,1
71479,rural,0


In [33]:
# `is_urban` now carries this information.
df_collision = df_collision.drop(columns=["urban_or_rural_area"])

### 4. Road Surface Conditions

`road_surface_conditions`: This refers to the road surface condition at the time of the accident.

**Strategy: Drop rows with missing values**

Since we have a large dataset, as long as we do not make the `police_force` category incomplete in removing these rows, it should not be an issue.

In [34]:
df_collision["road_surface_conditions"].value_counts(normalize=True)

road_surface_conditions
dry                             0.693595
wet_or_damp                     0.285252
frost_or_ice                    0.016448
snow                            0.002461
flood_over_3cm._deep            0.002173
data_missing_or_out_of_range    0.000072
Name: proportion, dtype: float64

In [35]:
filter_missing_road_surface = df_collision["road_surface_conditions"].eq("data_missing_or_out_of_range")

# Keep only rows with a recorded road surface condition.
df_collision = df_collision.loc[~filter_missing_road_surface].copy()

In [36]:
evaluate_missing_values()

Unnamed: 0,column_name,number_missing,percentage
0,police_force,0,0.0
1,number_of_vehicles,0,0.0
2,day_of_week,0,0.0
3,time,0,0.0
4,first_road_class,0,0.0
5,road_type,0,0.0
6,speed_limit,0,0.0
7,junction_detail,0,0.0
8,light_conditions,0,0.0
9,weather_conditions,0,0.0


### 5. Special conditions at site

`special_conditions_at_site`: this field is used to identify anything particularly defective at the site, e.g. traffic signals aren't working or there are roadworks.

When we converted the categorical columns from numbers to strings, the value for No special conditions was "None" which is why we see so many NaN. 

Similar to pedestrian crossing, this field has a lot of categories but only applies to a small subset of the data. For our first algorithm, it would be better to simplify this. 

**Strategy: Drop rows with real missing values and create new category**

We will delete real missing data i.e. that categorised as "Data missing or out of range". Since we have a large dataset, as long as we do not make the `police_force` category incomplete in removing these rows, it should not be an issue.

In [37]:
df_collision["special_conditions_at_site"].value_counts(dropna=False, normalize=True)

special_conditions_at_site
NaN                                           0.974168
roadworks                                     0.012966
auto_traffic_signal___out                     0.003339
mud                                           0.003281
road_surface_defective                        0.002648
road_sign_or_marking_defective_or_obscured    0.001669
oil_or_diesel                                 0.001468
auto_signal_part_defective                    0.000446
data_missing_or_out_of_range                  0.000014
Name: proportion, dtype: float64

In [38]:
filter_missing_special_conditions = df_collision["special_conditions_at_site"].eq("data_missing_or_out_of_range")

# Remove only the explicitly missing rows; NaN rows are kept.
df_collision = df_collision.loc[~filter_missing_special_conditions].copy()
df_collision["special_conditions_at_site"].value_counts(normalize=True, dropna=False)

special_conditions_at_site
NaN                                           0.974182
roadworks                                     0.012966
auto_traffic_signal___out                     0.003339
mud                                           0.003281
road_surface_defective                        0.002648
road_sign_or_marking_defective_or_obscured    0.001669
oil_or_diesel                                 0.001468
auto_signal_part_defective                    0.000446
Name: proportion, dtype: float64

In [39]:
# 1 where any special condition is recorded, 0 where the field is NaN.
special_condition_recorded = df_collision["special_conditions_at_site"].notna()
df_collision["has_special_conditions_at_site"] = special_condition_recorded.astype(int)

df_collision["has_special_conditions_at_site"].value_counts(normalize=True)

has_special_conditions_at_site
0    0.974182
1    0.025818
Name: proportion, dtype: float64

In [40]:
# `has_special_conditions_at_site` replaces the detailed column.
df_collision = df_collision.drop(columns=["special_conditions_at_site"])

In [41]:
evaluate_missing_values()

Unnamed: 0,column_name,number_missing,percentage
0,police_force,0,0.0
1,number_of_vehicles,0,0.0
2,day_of_week,0,0.0
3,time,0,0.0
4,first_road_class,0,0.0
5,road_type,0,0.0
6,speed_limit,0,0.0
7,junction_detail,0,0.0
8,light_conditions,0,0.0
9,weather_conditions,0,0.0


In [42]:
# We know from previous work that the police force category has 44 unique values. Check here that this is still the case
len(df_collision["police_force"].unique())

44

In [43]:
df_collision.info()

<class 'pandas.core.frame.DataFrame'>
Index: 69487 entries, 0 to 71479
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   police_force                    69487 non-null  object
 1   number_of_vehicles              69487 non-null  int64 
 2   day_of_week                     69487 non-null  object
 3   time                            69487 non-null  object
 4   first_road_class                69487 non-null  object
 5   road_type                       69487 non-null  object
 6   speed_limit                     69487 non-null  int64 
 7   junction_detail                 69487 non-null  object
 8   light_conditions                69487 non-null  object
 9   weather_conditions              69487 non-null  object
 10  road_surface_conditions         69487 non-null  object
 11  carriageway_hazards             1696 non-null   object
 12  is_severe                       69487 non-null  int

### 6. Carriageway Hazards

`carriageway_hazards`: these are to record objects that are not expected to be found in the road e.g. an animal, something that has fallen off a lorry.

Similar to pedestrian crossing and special_conditions_at_site, this field has a lot of categories but only applies to a small subset of the data. For our first algorithm, it would be better to simplify this. 

**Strategy: create new category**


In [44]:
df_collision["carriageway_hazards"].value_counts(dropna=False)

carriageway_hazards
NaN                                                67791
other_object_on_road                                 905
any_animal_in_carriageway_(except_ridden_horse)      237
vehicle_load_on_road                                 220
previous_accident                                    179
pedestrian_in_carriageway___not_injured              155
Name: count, dtype: int64

In [45]:
# 1 where a hazard is recorded, 0 where the field is NaN.
hazard_recorded = df_collision["carriageway_hazards"].notna()
df_collision["is_carriageway_hazard"] = hazard_recorded.astype(int)
df_collision.loc[:, ["carriageway_hazards", "is_carriageway_hazard"]].tail()

Unnamed: 0,carriageway_hazards,is_carriageway_hazard
71467,vehicle_load_on_road,1
71470,,0
71471,,0
71475,,0
71479,,0


In [46]:
# `is_carriageway_hazard` replaces the detailed column.
df_collision = df_collision.drop(columns=["carriageway_hazards"])

In [47]:
df_collision.head().T

Unnamed: 0,0,1,2,3,4
police_force,metropolitan_police,metropolitan_police,metropolitan_police,metropolitan_police,metropolitan_police
number_of_vehicles,1,3,2,2,2
day_of_week,sunday,sunday,sunday,sunday,sunday
time,01:00,02:00,04:00,02:00,02:00
first_road_class,c,unclassified,a,a,a
road_type,one_way_street,single_carriageway,roundabout,single_carriageway,single_carriageway
speed_limit,20,30,30,30,30
junction_detail,other_junction,t_or_staggered_junction,roundabout,t_or_staggered_junction,private_drive_or_entrance
light_conditions,darkness___lights_lit,darkness___lights_lit,darkness___lights_lit,darkness___lights_lit,darkness___lights_lit
weather_conditions,other,fine_no_high_winds,fine_no_high_winds,unknown,fine_no_high_winds


## Unknown and other 

### 1. weather conditions

`weather_conditions` refers to weather conditions at the time and location of the accident.

'Fine without high winds' means any weather condition which does not have an adverse effect on
driving.

'Raining' includes drizzle, hail and sleet not tending to build up a deposit. 

'Snowing' includes sleet building up a deposit. 

'Fog' does not include light mists which did not constitute a driving hazard on the road where the
accident occurred. 

The combinations of certain conditions with high winds (codes 4 - 6) should only be coded if the
winds are deemed to have adversely affected driving conditions for one or more of the vehicles in
the accident. The use of these codes does not imply that the high winds were a cause of the
accident. 

'Other' should be used to indicate any other adverse weather condition which is not separately
specified by the codes above. This code should be avoided if at all possible. 

**Strategy: drop unknown and recategorise other for clarity**

It is unclear what `unknown` means so we will drop data with this category. `Other` can be reinterpreted as `other_adverse_weather_condition` for clarity.



In [48]:
df_collision["weather_conditions"].value_counts(normalize=True)

weather_conditions
fine_no_high_winds       0.803172
raining_no_high_winds    0.131032
other                    0.025847
raining__high_winds      0.014089
fine__high_winds         0.011326
unknown                  0.006390
snowing_no_high_winds    0.004030
fog_or_mist              0.003612
snowing__high_winds      0.000504
Name: proportion, dtype: float64

In [49]:
filter_unknown_weather_conditions = df_collision["weather_conditions"].eq("unknown")

# Keep only rows whose weather condition is not "unknown".
df_collision = df_collision.loc[~filter_unknown_weather_conditions].copy()
df_collision["weather_conditions"].value_counts(normalize=True, dropna=False)

weather_conditions
fine_no_high_winds       0.808337
raining_no_high_winds    0.131874
other                    0.026013
raining__high_winds      0.014180
fine__high_winds         0.011399
snowing_no_high_winds    0.004055
fog_or_mist              0.003635
snowing__high_winds      0.000507
Name: proportion, dtype: float64

In [50]:
len(df_collision["police_force"].unique())

44

In [51]:
# Give the "other" category a more descriptive label; all other values are left as they are.
weather = df_collision["weather_conditions"]
df_collision["weather_conditions"] = weather.mask(weather.eq("other"), "other_adverse_weather_condition")

df_collision["weather_conditions"].value_counts(normalize=True, dropna=False)

weather_conditions
fine_no_high_winds                 0.808337
raining_no_high_winds              0.131874
other_adverse_weather_condition    0.026013
raining__high_winds                0.014180
fine__high_winds                   0.011399
snowing_no_high_winds              0.004055
fog_or_mist                        0.003635
snowing__high_winds                0.000507
Name: proportion, dtype: float64

## Recategorise `junction_detail` to `is_near_junction`

In [52]:
df_collision["junction_detail"].value_counts()

junction_detail
not_at_junction_or_within_20_metres    30530
t_or_staggered_junction                18688
crossroads                              6546
roundabout                              4412
other_junction                          4380
private_drive_or_entrance               1540
more_than_4_arms_(not_roundabout)       1256
mini_roundabout                          926
slip_road                                765
Name: count, dtype: int64

In [53]:
# 1 for every junction type, 0 for "not_at_junction_or_within_20_metres".
away_from_junction_label = "not_at_junction_or_within_20_metres"
df_collision["is_near_junction"] = df_collision["junction_detail"].ne(away_from_junction_label).astype(int)
df_collision.loc[:, ["is_near_junction", "junction_detail"]]

Unnamed: 0,is_near_junction,junction_detail
0,1,other_junction
1,1,t_or_staggered_junction
2,1,roundabout
4,1,private_drive_or_entrance
5,0,not_at_junction_or_within_20_metres
...,...,...
71467,0,not_at_junction_or_within_20_metres
71470,0,not_at_junction_or_within_20_metres
71471,0,not_at_junction_or_within_20_metres
71475,0,not_at_junction_or_within_20_metres


In [54]:
# `is_near_junction` replaces the detailed column.
df_collision = df_collision.drop(columns=["junction_detail"])

In [55]:
df_collision.head().T

Unnamed: 0,0,1,2,4,5
police_force,metropolitan_police,metropolitan_police,metropolitan_police,metropolitan_police,metropolitan_police
number_of_vehicles,1,3,2,2,1
day_of_week,sunday,sunday,sunday,sunday,sunday
time,01:00,02:00,04:00,02:00,02:00
first_road_class,c,unclassified,a,a,a
road_type,one_way_street,single_carriageway,roundabout,single_carriageway,single_carriageway
speed_limit,20,30,30,30,30
light_conditions,darkness___lights_lit,darkness___lights_lit,darkness___lights_lit,darkness___lights_lit,darkness___lights_lit
weather_conditions,other_adverse_weather_condition,fine_no_high_winds,fine_no_high_winds,fine_no_high_winds,fine_no_high_winds
road_surface_conditions,wet_or_damp,dry,dry,dry,dry


## Check remaining categories

In [56]:
df_collision["police_force"].unique()

array(['metropolitan_police', 'cumbria', 'lancashire', 'merseyside',
       'greater_manchester', 'cheshire', 'northumbria', 'durham',
       'north_yorkshire', 'west_yorkshire', 'south_yorkshire',
       'humberside', 'cleveland', 'west_midlands', 'staffordshire',
       'west_mercia', 'warwickshire', 'derbyshire', 'nottinghamshire',
       'lincolnshire', 'leicestershire', 'northamptonshire',
       'cambridgeshire', 'norfolk', 'suffolk', 'bedfordshire',
       'hertfordshire', 'essex', 'thames_valley', 'hampshire', 'surrey',
       'kent', 'sussex', 'city_of_london', 'devon_and_cornwall',
       'avon_and_somerset', 'gloucestershire', 'wiltshire', 'dorset',
       'north_wales', 'gwent', 'south_wales', 'dyfed_powys',
       'police_scotland'], dtype=object)

In [57]:
df_collision["number_of_vehicles"].unique()

array([ 1,  3,  2,  5,  6,  4,  7,  8, 12,  9, 17, 14, 11, 13, 10, 15])

In [58]:
df_collision["day_of_week"].unique()

array(['sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday',
       'saturday'], dtype=object)

In [59]:
# Distinct time values in ascending order.
distinct_times = df_collision["time"].unique().tolist()
distinct_times.sort()
distinct_times

['00:00',
 '01:00',
 '02:00',
 '03:00',
 '04:00',
 '05:00',
 '06:00',
 '07:00',
 '08:00',
 '09:00',
 '10:00',
 '11:00',
 '12:00',
 '13:00',
 '14:00',
 '15:00',
 '16:00',
 '17:00',
 '18:00',
 '19:00',
 '20:00',
 '21:00',
 '22:00',
 '23:00']

In [60]:
df_collision["first_road_class"].unique()

array(['c', 'unclassified', 'a', 'b', 'motorway', 'a(m)'], dtype=object)

In [61]:
df_collision["speed_limit"].unique()

array([20, 30, 50, 40, 70, 60])

In [62]:
df_collision["is_near_junction"].value_counts(normalize=True)

is_near_junction
1    0.557812
0    0.442188
Name: proportion, dtype: float64

In [63]:
df_collision["light_conditions"].unique()

array(['darkness___lights_lit', 'daylight', 'darkness___no_lighting',
       'darkness___lighting_unknown', 'darkness___lights_unlit'],
      dtype=object)

In [64]:
df_collision["weather_conditions"].unique()

array(['other_adverse_weather_condition', 'fine_no_high_winds',
       'raining_no_high_winds', 'fine__high_winds', 'raining__high_winds',
       'snowing_no_high_winds', 'fog_or_mist', 'snowing__high_winds'],
      dtype=object)

In [65]:
df_collision["road_surface_conditions"].value_counts(normalize=True, dropna=False)

road_surface_conditions
dry                     0.693075
wet_or_damp             0.285807
frost_or_ice            0.016454
snow                    0.002477
flood_over_3cm._deep    0.002187
Name: proportion, dtype: float64

## Save data


In [66]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
df_collision.to_csv("./data/collisions4.csv", index=False)