In [27]:
import pandas as pd
import numpy as np
import statistics as st

In [28]:
df = pd.read_csv('../EDA_values.csv')
df.shape

  df = pd.read_csv('../EDA_values.csv')


(1048575, 55)

In [29]:
df['severity'].value_counts()

property_damage    575655
minor_injury       335148
serious_injury     126325
fatality            11447
Name: severity, dtype: int64

# Remove speed-limit=777, 888, 999, Blanks

In [30]:
df = df.dropna(subset=['speed_limit'])
df['speed_limit'].unique()

array([60, 40, 100, 50, 80, 70, 90, 25, 110, 15, 10, 20, 30, 5, 75, 999,
       888, 777, '50', '100', '80', '90', '60', '40', '70', '30', '20',
       '110', '10', '100 - 110 km/h', '60 km/h', '0 - 50 km/h', '70 km/h',
       '80 - 90 km/h'], dtype=object)

# Change string speeds to numeric

In [31]:
df.loc[df['speed_limit'] == '0 - 50 km/h', 'speed_limit'] = 50
df.loc[df['speed_limit'] == '60 km/h', 'speed_limit'] = 60
df.loc[df['speed_limit'] == '70 km/h', 'speed_limit'] = 70
df.loc[df['speed_limit'] == '80 - 90 km/h', 'speed_limit'] = 90
df.loc[df['speed_limit'] == '100 - 110 km/h', 'speed_limit'] = 100
df['speed_limit']=pd.to_numeric(df.speed_limit).astype(np.int64)
df['speed_limit'].unique()

array([ 60,  40, 100,  50,  80,  70,  90,  25, 110,  15,  10,  20,  30,
         5,  75, 999, 888, 777], dtype=int64)

# Change speed limit to categorical using new speed_labels column
### 0-50 km/h, 50-80 km/h, 80-100 km/h, 100+ km/h

In [32]:
speed_labels = pd.cut(x=df['speed_limit'], bins=[0, 50, 80, 90, 999],
                    labels=['0-50 km/h', '50-80 km/h', '80-100 km/h',
                             '100+ km/h'])
df.insert(loc = 2, column = 'speed_labels', value = speed_labels)
df['speed_labels'].unique()

['50-80 km/h', '0-50 km/h', '100+ km/h', '80-100 km/h']
Categories (4, object): ['0-50 km/h' < '50-80 km/h' < '80-100 km/h' < '100+ km/h']

# Remove road_position horizontal and vertical, road_sealed, road_wet

In [33]:
df.drop(['road_position_horizontal', 'road_position_vertical', 'road_sealed', 'road_wet'], axis=1, inplace=True)
df.columns

Index(['description_id', 'severity', 'speed_labels', 'speed_limit', 'midblock',
       'intersection', 'weather', 'crash_type', 'lighting', 'traffic_controls',
       'drugs_alcohol', 'DCA_code', 'comment', 'vehicles_id', 'animals',
       'car_sedan', 'car_utility', 'car_van', 'car_4x4', 'car_station_wagon',
       'motor_cycle', 'truck_small', 'truck_large', 'bus', 'taxi', 'bicycle',
       'scooter', 'pedestrian', 'inanimate', 'train', 'tram', 'vehicle_other',
       'date_time_id', 'year', 'month', 'day_of_week', 'day_of_month', 'hour',
       'approximate', 'lat_long', 'latitude', 'longitude', 'country', 'state',
       'local_government_area', 'statistical_area', 'suburb', 'casualties_id',
       'casualties', 'fatalities', 'serious_injuries', 'minor_injuries'],
      dtype='object')

# Remove weather=unknown

In [34]:
weather_outliers = ['unknown']
df['weather'] = df['weather'].replace(weather_outliers, st.mode(df["weather"]))
df['weather'].unique()

array(['fine', 'rain', 'smoke_dust', 'high_wind', 'fog', 'snow', 'mist'],
      dtype=object)

In [35]:
df['severity'].value_counts()

property_damage    575647
minor_injury       335147
serious_injury     126324
fatality            11446
Name: severity, dtype: int64

# Remove lighting=unknown, other

In [36]:
lighting_outliers = ['unknown', 'other']
df = df[df.lighting.isin(lighting_outliers) == False]
df['lighting'].unique()

array(['daylight', 'darkness_not_lit', 'darkness_lit', 'dawn_dusk'],
      dtype=object)

# Remove traffic_controls=other

In [37]:
traffic_outliers = ['other']
df = df[df.lighting.isin(traffic_outliers) == False]
df.traffic_controls.fillna('none', inplace=True)
df['traffic_controls'].value_counts()

none                   751919
giveway_sign           140078
traffic_lights         100733
stop_sign               48168
pedestrian_crossing       489
school_crossing           339
railway_crossing          331
manual_control            255
other                      43
Name: traffic_controls, dtype: int64

In [38]:
df['severity'].value_counts()

property_damage    574598
minor_injury       331137
serious_injury     125225
fatality            11395
Name: severity, dtype: int64

# Drug_alcohol change blank to 0 and 'Y' to 1

In [39]:
df['drugs_alcohol'] = df['drugs_alcohol'].fillna(False)
df.loc[df['drugs_alcohol'] == 'Y', 'drugs_alcohol'] = True
df['drugs_alcohol'] = df['drugs_alcohol'].astype(bool)
df['drugs_alcohol'].unique()

array([False,  True])

# DCA code important, unsure how to continue
### Removing column for now

In [40]:
df.drop(['DCA_code'], axis=1, inplace=True)
df.columns

Index(['description_id', 'severity', 'speed_labels', 'speed_limit', 'midblock',
       'intersection', 'weather', 'crash_type', 'lighting', 'traffic_controls',
       'drugs_alcohol', 'comment', 'vehicles_id', 'animals', 'car_sedan',
       'car_utility', 'car_van', 'car_4x4', 'car_station_wagon', 'motor_cycle',
       'truck_small', 'truck_large', 'bus', 'taxi', 'bicycle', 'scooter',
       'pedestrian', 'inanimate', 'train', 'tram', 'vehicle_other',
       'date_time_id', 'year', 'month', 'day_of_week', 'day_of_month', 'hour',
       'approximate', 'lat_long', 'latitude', 'longitude', 'country', 'state',
       'local_government_area', 'statistical_area', 'suburb', 'casualties_id',
       'casualties', 'fatalities', 'serious_injuries', 'minor_injuries'],
      dtype='object')

# Remove comments, vehicle_id, vehicle_other

In [41]:
df.drop(['comment', 'vehicles_id', 'vehicle_other'], axis=1, inplace=True)
df.columns

Index(['description_id', 'severity', 'speed_labels', 'speed_limit', 'midblock',
       'intersection', 'weather', 'crash_type', 'lighting', 'traffic_controls',
       'drugs_alcohol', 'animals', 'car_sedan', 'car_utility', 'car_van',
       'car_4x4', 'car_station_wagon', 'motor_cycle', 'truck_small',
       'truck_large', 'bus', 'taxi', 'bicycle', 'scooter', 'pedestrian',
       'inanimate', 'train', 'tram', 'date_time_id', 'year', 'month',
       'day_of_week', 'day_of_month', 'hour', 'approximate', 'lat_long',
       'latitude', 'longitude', 'country', 'state', 'local_government_area',
       'statistical_area', 'suburb', 'casualties_id', 'casualties',
       'fatalities', 'serious_injuries', 'minor_injuries'],
      dtype='object')

# Categorize severity

In [42]:
df['severity'] = df['severity'] = pd.Categorical(df.severity)
df['severity'].unique()

['property_damage', 'minor_injury', 'serious_injury', 'fatality']
Categories (4, object): ['fatality', 'minor_injury', 'property_damage', 'serious_injury']

# Categorize weather, lighting, and traffic controls

In [43]:
df['weather'] = df['weather'] = pd.Categorical(df.weather)
df['lighting'] = df['lighting'] = pd.Categorical(df.lighting)
df['traffic_controls'] = df['traffic_controls'] = pd.Categorical(df.traffic_controls)
print(df['weather'].unique())
print(df['lighting'].unique())
print(df['traffic_controls'].unique())

['fine', 'rain', 'smoke_dust', 'high_wind', 'fog', 'snow', 'mist']
Categories (7, object): ['fine', 'fog', 'high_wind', 'mist', 'rain', 'smoke_dust', 'snow']
['daylight', 'darkness_not_lit', 'darkness_lit', 'dawn_dusk']
Categories (4, object): ['darkness_lit', 'darkness_not_lit', 'dawn_dusk', 'daylight']
['none', 'stop_sign', 'traffic_lights', 'giveway_sign', 'railway_crossing', 'other', 'manual_control', 'school_crossing', 'pedestrian_crossing']
Categories (9, object): ['giveway_sign', 'manual_control', 'none', 'other', ..., 'railway_crossing', 'school_crossing', 'stop_sign', 'traffic_lights']


# Compare severity vs datetime columns and decide what to do with those columns
### Unsure of severity correlation with date time so those columns were removed for now

In [44]:
df.drop(['year', 'month','day_of_week', 'day_of_month', 'hour'], axis=1, inplace=True)
df.columns

Index(['description_id', 'severity', 'speed_labels', 'speed_limit', 'midblock',
       'intersection', 'weather', 'crash_type', 'lighting', 'traffic_controls',
       'drugs_alcohol', 'animals', 'car_sedan', 'car_utility', 'car_van',
       'car_4x4', 'car_station_wagon', 'motor_cycle', 'truck_small',
       'truck_large', 'bus', 'taxi', 'bicycle', 'scooter', 'pedestrian',
       'inanimate', 'train', 'tram', 'date_time_id', 'approximate', 'lat_long',
       'latitude', 'longitude', 'country', 'state', 'local_government_area',
       'statistical_area', 'suburb', 'casualties_id', 'casualties',
       'fatalities', 'serious_injuries', 'minor_injuries'],
      dtype='object')

# Remove description_id, date_time_id, crash_type, approximate to minor_injuries columns

In [45]:
df.drop(['description_id', 'date_time_id', 'crash_type','approximate', 'lat_long',
         'latitude', 'longitude', 'country', 'state', 'local_government_area',
         'statistical_area', 'suburb', 'casualties_id', 'casualties',
         'fatalities', 'serious_injuries', 'minor_injuries'], axis=1, inplace=True)
df.columns

Index(['severity', 'speed_labels', 'speed_limit', 'midblock', 'intersection',
       'weather', 'lighting', 'traffic_controls', 'drugs_alcohol', 'animals',
       'car_sedan', 'car_utility', 'car_van', 'car_4x4', 'car_station_wagon',
       'motor_cycle', 'truck_small', 'truck_large', 'bus', 'taxi', 'bicycle',
       'scooter', 'pedestrian', 'inanimate', 'train', 'tram'],
      dtype='object')

# Final shape and data types

In [46]:
print(df.shape)
print(df.dtypes)

(1042355, 26)
severity             category
speed_labels         category
speed_limit             int64
midblock                 bool
intersection             bool
weather              category
lighting             category
traffic_controls     category
drugs_alcohol            bool
animals                 int64
car_sedan               int64
car_utility             int64
car_van                 int64
car_4x4                 int64
car_station_wagon       int64
motor_cycle             int64
truck_small             int64
truck_large             int64
bus                     int64
taxi                    int64
bicycle                 int64
scooter                 int64
pedestrian              int64
inanimate               int64
train                   int64
tram                    int64
dtype: object


In [47]:
df['severity'].value_counts()

property_damage    574598
minor_injury       331137
serious_injury     125225
fatality            11395
Name: severity, dtype: int64

In [48]:
df.isna().sum()

severity             0
speed_labels         0
speed_limit          0
midblock             0
intersection         0
weather              0
lighting             0
traffic_controls     0
drugs_alcohol        0
animals              0
car_sedan            0
car_utility          0
car_van              0
car_4x4              0
car_station_wagon    0
motor_cycle          0
truck_small          0
truck_large          0
bus                  0
taxi                 0
bicycle              0
scooter              0
pedestrian           0
inanimate            0
train                0
tram                 0
dtype: int64

In [49]:
df.dtypes

severity             category
speed_labels         category
speed_limit             int64
midblock                 bool
intersection             bool
weather              category
lighting             category
traffic_controls     category
drugs_alcohol            bool
animals                 int64
car_sedan               int64
car_utility             int64
car_van                 int64
car_4x4                 int64
car_station_wagon       int64
motor_cycle             int64
truck_small             int64
truck_large             int64
bus                     int64
taxi                    int64
bicycle                 int64
scooter                 int64
pedestrian              int64
inanimate               int64
train                   int64
tram                    int64
dtype: object

In [50]:
# drop speed limit in favor of speed_labels
df.drop(['speed_limit'], axis=1, inplace=True)

# Convert categories and bool to numeric

In [51]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

df['drugs_alcohol'] = df['drugs_alcohol'].replace(False, 0)
df['drugs_alcohol'] = df['drugs_alcohol'].replace(True, 1)

df['midblock'] = df['midblock'].replace(False, 0)
df['midblock'] = df['midblock'].replace(True, 1)

df['intersection'] = df['intersection'].replace(False, 0)
df['intersection'] = df['intersection'].replace(True, 1)

categories = ['severity', 'speed_labels', 'weather', 'lighting', 'traffic_controls', ]
label_encoder = LabelEncoder()
for category in categories:
    df[category] = label_encoder.fit_transform(df[category])

numeric_columns = ['animals', 'car_sedan', 'car_utility', 'car_van', 'car_4x4', 'car_station_wagon',
                   'motor_cycle', 'truck_small', 'truck_large', 'bus', 'taxi', 'bicycle',
                   'scooter', 'pedestrian', 'inanimate', 'train', 'tram' ]
sc = StandardScaler()
df[numeric_columns] = sc.fit_transform(df[numeric_columns])

In [52]:
df.dtypes

severity               int32
speed_labels           int32
midblock               int64
intersection           int64
weather                int32
lighting               int32
traffic_controls       int32
drugs_alcohol          int64
animals              float64
car_sedan            float64
car_utility          float64
car_van              float64
car_4x4              float64
car_station_wagon    float64
motor_cycle          float64
truck_small          float64
truck_large          float64
bus                  float64
taxi                 float64
bicycle              float64
scooter              float64
pedestrian           float64
inanimate            float64
train                float64
tram                 float64
dtype: object

# Export to CSV

In [53]:
df.to_csv('./Data_Cleaning_output.csv')