In [2]:
import pandas as pd

# Load the raw traffic-accident records and report the frame's dimensions.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which looks
# like a row index written out by an earlier to_csv call. It is carried through
# every later step as if it were a feature; consider read_csv(..., index_col=0)
# after confirming nothing downstream expects that column.
df = pd.read_csv('classification/data/traffic_accidents.csv')
# The closing parenthesis used to sit inside the string literal, printing "...24))".
print(f"Shape: {df.shape}")
df.head()

Shape: (209306, 24))


Unnamed: 0,crash_date,traffic_control_device,weather_condition,lighting_condition,first_crash_type,trafficway_type,alignment,roadway_surface_cond,road_defect,crash_type,...,most_severe_injury,injuries_total,injuries_fatal,injuries_incapacitating,injuries_non_incapacitating,injuries_reported_not_evident,injuries_no_indication,crash_hour,crash_day_of_week,crash_month
0,07/29/2023 01:00:00 PM,TRAFFIC SIGNAL,CLEAR,DAYLIGHT,TURNING,NOT DIVIDED,STRAIGHT AND LEVEL,UNKNOWN,UNKNOWN,NO INJURY / DRIVE AWAY,...,NO INDICATION OF INJURY,0.0,0.0,0.0,0.0,0.0,3.0,13,7,7
1,08/13/2023 12:11:00 AM,TRAFFIC SIGNAL,CLEAR,"DARKNESS, LIGHTED ROAD",TURNING,FOUR WAY,STRAIGHT AND LEVEL,DRY,NO DEFECTS,NO INJURY / DRIVE AWAY,...,NO INDICATION OF INJURY,0.0,0.0,0.0,0.0,0.0,2.0,0,1,8
2,12/09/2021 10:30:00 AM,TRAFFIC SIGNAL,CLEAR,DAYLIGHT,REAR END,T-INTERSECTION,STRAIGHT AND LEVEL,DRY,NO DEFECTS,NO INJURY / DRIVE AWAY,...,NO INDICATION OF INJURY,0.0,0.0,0.0,0.0,0.0,3.0,10,5,12
3,08/09/2023 07:55:00 PM,TRAFFIC SIGNAL,CLEAR,DAYLIGHT,ANGLE,FOUR WAY,STRAIGHT AND LEVEL,DRY,NO DEFECTS,INJURY AND / OR TOW DUE TO CRASH,...,NONINCAPACITATING INJURY,5.0,0.0,0.0,5.0,0.0,0.0,19,4,8
4,08/19/2023 02:55:00 PM,TRAFFIC SIGNAL,CLEAR,DAYLIGHT,REAR END,T-INTERSECTION,STRAIGHT AND LEVEL,UNKNOWN,UNKNOWN,NO INJURY / DRIVE AWAY,...,NO INDICATION OF INJURY,0.0,0.0,0.0,0.0,0.0,3.0,14,7,8


In [3]:
# Work on a copy so the raw frame `df` stays untouched; starting from a fresh
# copy also means this cell can be re-run without double-encoding anything.
df_encoded = df.copy()

# 1. ORDINAL ENCODING - Define ordering for ordinal variables
# NOTE(review): OrdinalEncoder is not used in this cell; the encoding below is
# done with plain dict lookups.
from sklearn.preprocessing import OrdinalEncoder

# Each list enumerates a column's categories from lowest to highest rank; a
# category's position in its list becomes its integer code.
# NOTE(review): 'UNKNOWN' (and 'OTHER') sit at the end of their lists, so they
# receive the highest codes and rank above 'DARKNESS' / 'ICE' — confirm that
# this is intended.

# Most severe injury - ordered by severity
injury_order = ['NO INDICATION OF INJURY', 'REPORTED, NOT EVIDENT',
                'NONINCAPACITATING INJURY', 'INCAPACITATING INJURY', 'FATAL']

# Damage - ordered by cost
damage_order = ['$500 OR LESS', '$501 - $1,500', 'OVER $1,500']

# Lighting condition - ordered by visibility
lighting_order = ['DAYLIGHT', 'DAWN', 'DUSK', 'DARKNESS, LIGHTED ROAD', 'DARKNESS', 'UNKNOWN']

# Roadway surface - ordered by risk/severity
surface_order = ['DRY', 'WET', 'SAND, MUD, DIRT', 'SNOW OR SLUSH', 'ICE', 'OTHER', 'UNKNOWN']

# Column name -> {category: integer code}
ordinal_mappings = {
    'most_severe_injury': {val: idx for idx, val in enumerate(injury_order)},
    'damage': {val: idx for idx, val in enumerate(damage_order)},
    'lighting_condition': {val: idx for idx, val in enumerate(lighting_order)},
    'roadway_surface_cond': {val: idx for idx, val in enumerate(surface_order)}
}

# Apply ordinal encoding
for col, mapping in ordinal_mappings.items():
    # Series.map turns every value that is missing from `mapping` into NaN
    # without any warning, so fail loudly if the data holds a category that is
    # not listed above. Values that are already missing are left as NaN.
    unmapped = set(df_encoded[col].dropna().unique()) - set(mapping)
    if unmapped:
        raise ValueError(
            f"Column '{col}' contains categories with no ordinal code: "
            f"{sorted(unmapped, key=str)}"
        )
    df_encoded[col] = df_encoded[col].map(mapping)

print("Ordinal encoding completed for:")
print(list(ordinal_mappings.keys()))

Ordinal encoding completed for:
['most_severe_injury', 'damage', 'lighting_condition', 'roadway_surface_cond']


In [4]:
# 2. BINARY ENCODING - Convert yes/no and binary variables
# Column name -> the raw string value that is encoded as 1; every other value
# (including a missing one) is encoded as 0.
binary_positive_values = {
    'intersection_related_i': 'Y',
    'crash_type': 'INJURY AND / OR TOW DUE TO CRASH',
}

# The flags are derived from the raw frame `df`, not from `df_encoded`, so the
# cell gives the same answer every time it is run. Comparing the
# already-encoded 0/1 integers against these strings on a second run would
# silently set both columns to all zeros.
for col, positive_value in binary_positive_values.items():
    df_encoded[col] = (df[col] == positive_value).astype(int)

print("Binary encoding completed for:")
print(list(binary_positive_values))

Binary encoding completed for:
['intersection_related_i', 'crash_type']


In [5]:
# 3. ONE-HOT ENCODING - For nominal categories
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Nominal (unordered) categorical columns that get expanded into indicator columns
onehot_cols = [
    'alignment',
    'road_defect',
    'weather_condition',
    'first_crash_type',
    'traffic_control_device',
    'trafficway_type',
]

# pandas get_dummies replaces each listed column with one indicator column per
# category, prefixed with the source column name; drop_first=True omits the
# first category of every column.
df_encoded = pd.get_dummies(
    df_encoded,
    columns=onehot_cols,
    prefix=onehot_cols,
    drop_first=True,
)

# Indicator columns created = net change in width versus the raw frame, plus
# the source columns that get_dummies removed.
n_indicator_cols = df_encoded.shape[1] - df.shape[1] + len(onehot_cols)

print(f"One-hot encoding completed. New shape: {df_encoded.shape}")
print(f"Added {n_indicator_cols} new columns")

One-hot encoding completed. New shape: (209306, 94)
Added 76 new columns


In [6]:
# 4. FREQUENCY ENCODING - For high-cardinality categorical variable
# Replace each prim_contributory_cause value with the share of all rows that
# carry that value (count of the value divided by the total row count).
cause_column = df_encoded['prim_contributory_cause']
total_rows = len(df_encoded)

cause_freq = cause_column.value_counts() / total_rows
df_encoded['prim_contributory_cause_freq'] = cause_column.map(cause_freq)

print(f"Frequency encoding completed for prim_contributory_cause")
print(f"Sample frequencies: {cause_freq.head()}")

Frequency encoding completed for prim_contributory_cause
Sample frequencies: prim_contributory_cause
UNABLE TO DETERMINE              0.278616
FAILING TO YIELD RIGHT-OF-WAY    0.205030
FOLLOWING TOO CLOSELY            0.091178
DISREGARDING TRAFFIC SIGNALS     0.069711
IMPROPER TURNING/NO SIGNAL       0.060404
Name: count, dtype: float64


In [7]:
# 5. DROP irrelevant columns
#   - crash_date: high cardinality, already parsed into hour/day/month
#   - prim_contributory_cause: replaced with its frequency encoding
cols_to_drop = ['crash_date', 'prim_contributory_cause']
df_encoded = df_encoded.drop(columns=cols_to_drop)

# Summarise how the column count changed relative to the raw frame
n_cols_original = df.shape[1]
n_cols_encoded = df_encoded.shape[1]

print(f"\nFinal encoded dataset shape: {df_encoded.shape}")
print(f"Original shape: {df.shape}")
print(f"Columns increased from {n_cols_original} to {n_cols_encoded}")

# Display sample of encoded data
df_encoded.head()


Final encoded dataset shape: (209306, 93)
Original shape: (209306, 24)
Columns increased from 24 to 93


Unnamed: 0,lighting_condition,roadway_surface_cond,crash_type,intersection_related_i,damage,num_units,most_severe_injury,injuries_total,injuries_fatal,injuries_incapacitating,...,trafficway_type_OTHER,trafficway_type_PARKING LOT,trafficway_type_RAMP,trafficway_type_ROUNDABOUT,trafficway_type_T-INTERSECTION,trafficway_type_TRAFFIC ROUTE,trafficway_type_UNKNOWN,trafficway_type_UNKNOWN INTERSECTION TYPE,trafficway_type_Y-INTERSECTION,prim_contributory_cause_freq
0,0,6,0,1,1,2,0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,0.278616
1,3,0,0,1,2,2,0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,0.060404
2,0,0,0,1,1,3,0,0.0,0.0,0.0,...,False,False,False,False,True,False,False,False,False,0.091178
3,0,0,1,1,2,2,2,5.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,0.278616
4,0,6,0,1,1,2,0,0.0,0.0,0.0,...,False,False,False,False,True,False,False,False,False,0.024118


In [12]:
import os
# Persist the encoded frame for the next stage; the output directory is
# created first if it does not exist yet.
filepath_dir = 'classification/outputs'
os.makedirs(filepath_dir, exist_ok=True)

# index=False keeps the DataFrame index out of the written file
output_path = os.path.join(filepath_dir, 'preparation_1_output.csv')
df_encoded.to_csv(output_path, index=False)
