In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import bamboolib as bam


df = pd.read_csv("C:/Users/Zeeshan/Downloads/Bike-Severity-Application-main/Bike-Severity-Application-main/bike_crash-B-PF307G4M.csv")



In [2]:
df

     $1000 Damage to Any One Person's Property Active School Zone Flag  \
0                                           No                      No   
1                                           No                      No   
2                                           No                      No   
3                                           No                      No   
4                                           No                      No   
...                                        ...                     ...   
2458                                        No                      No   
2459                                        No                      No   
2460                                        No                      No   
2461                                       Yes                      No   
2462                                       Yes                      No   

      At Intersection Flag Average Daily Traffic Amount  \
0                    False                        15

In [3]:
severity_mapping = {
    'Non-Incapacitating Injury': 0,  # Low severity
    'Not Injured': 0,                # Low severity
    'Incapacitating Injury': 1,      # Medium severity
    'Possible Injury': 1,            # Medium severity
    'Killed': 2                      # High severity
}

# Apply the mapping to the 'Crash Severity' column
df['Crash Severity'] = df['Crash Severity'].map(severity_mapping)

# Optional: Check the transformation has been applied correctly
print(df['Crash Severity'].value_counts())

0    1637
1     810
2      16
Name: Crash Severity, dtype: int64


In [4]:
df['target'] = -1  
df.loc[(df['$1000 Damage to Any One Person\'s Property'] == 1) & (df['Crash Severity'] == 1), 'target'] = 1
df.loc[(df['$1000 Damage to Any One Person\'s Property'] == 0) & (df['Crash Severity'] == 0), 'target'] = 0


In [5]:
# Convert 'Crash Time' to strings
df['Crash Time'] = df['Crash Time'].astype(str)

# Pad the strings with leading zeros to ensure they are 4 characters long
df['Crash Time'] = df['Crash Time'].apply(lambda x: x.zfill(4))

# Now, convert the formatted strings to datetime.time objects
df['Crash Time'] = pd.to_datetime(df['Crash Time'], format='%H%M').dt.time

# Check the first few entries to verify the conversion
print(df['Crash Time'].head())

0    02:39:00
1    03:10:00
2    03:10:00
3    03:10:00
4    23:00:00
Name: Crash Time, dtype: object


Morning Rush Hour (6 AM to 9 AM): Traffic volume increases as people commute to work or school, potentially leading to more crashes.

Midday (9 AM to 3 PM): Traffic might be lighter, but this period can include a mix of commercial traffic, lunchtime traffic, and off-peak travel.

Afternoon Rush Hour (3 PM to 7 PM): Like the morning rush hour, the afternoon/early evening sees high traffic volumes as people return from work or school. Increased traffic, along with potentially lower light levels in the evening, can contribute to higher crash rates.

Evening (7 PM to Midnight): Traffic volume decreases, but there might be a higher risk of crashes involving impaired drivers or reduced visibility.

Late Night to Early Morning (Midnight to 6 AM): While traffic volumes are low, this period can have a higher proportion of crashes involving fatigue or impairment.


In [6]:
# Define a function to categorize and convert crash times to numerical codes directly
def categorize_and_convert_crash_time(time_obj):
    if pd.isnull(time_obj):
        return 5  # 'Unknown'
    hours = time_obj.hour
    if 6 <= hours < 9:
        return 0  # 'Morning Rush Hour'
    elif 9 <= hours < 15:
        return 1  # 'Midday'
    elif 15 <= hours < 19:
        return 2  # 'Afternoon Rush Hour'
    elif 19 <= hours <= 23:
        return 3  # 'Evening'
    else:
        return 4  # 'Late Night to Early Morning'

# Assuming 'Crash Time' is already a datetime.time column
df['Crash Time Category'] = df['Crash Time'].apply(categorize_and_convert_crash_time)

# Mapping for reference
category_to_numerical = {
    'Morning Rush Hour': 0,
    'Midday': 1,
    'Afternoon Rush Hour': 2,
    'Evening': 3,
    'Late Night to Early Morning': 4,
    'Unknown': 5
}

# Print the mapping for reference
print("Category to Numerical Mapping:")
for category, numerical in category_to_numerical.items():
    print(f"'{category}': {numerical}")

# Optional: Verify the conversion by viewing the first few entries
print(df['Crash Time Category'].head())

Category to Numerical Mapping:
'Morning Rush Hour': 0
'Midday': 1
'Afternoon Rush Hour': 2
'Evening': 3
'Late Night to Early Morning': 4
'Unknown': 5
0    4
1    4
2    4
3    4
4    3
Name: Crash Time Category, dtype: int64


In [7]:

# Assuming df is your DataFrame and it has a column named 'Speed Limit'

# Function to categorize speed limits
def categorize_speed_limit(speed):
    if -1 <= speed <= 20:
        return 0  # Stop/Slow
    #elif 1 <= speed <= 20:
     #   return 1  # Slow
    elif 21 <= speed <= 40:
        return 2  # Medium
    else:
        return 3  # High

# Apply the function to the 'Speed Limit' column to create a new 'Speed Category' column
df['Speed Category'] = df['Speed Limit'].apply(categorize_speed_limit)

# Mapping for reference
category_to_numerical_speed = {
    'Stop': 0,
    'Slow Speed': 1,
    'Medium Speed': 2,
    'High Speed': 3,
    'Unknown': 4
}

# Print the mapping for reference
print("Category to Numerical Mapping:")
for category, numerical in category_to_numerical_speed.items():
    print(f"'{category}': {numerical}")

# Optional: Verify the conversion by viewing the first few entries
print(df['Speed Category'].head())


Category to Numerical Mapping:
'Stop': 0
'Slow Speed': 1
'Medium Speed': 2
'High Speed': 3
'Unknown': 4
0    3
1    2
2    2
3    2
4    3
Name: Speed Category, dtype: int64


In [8]:
#df.rename(columns={'$1000 Damage to Any One Person\'s Property': 'Damage to Property'}, inplace=True)

# Identify categorical columns (You might need to adjust this list based on your dataset)
categorical_columns = ['$1000 Damage to Any One Person\'s Property','Active School Zone Flag','At Intersection Flag','Construction Zone Flag',  'Day of Week','Intersection Related', 'Roadway Part', 'Person Helmet', 'Surface Condition','Traffic Control Type', 'Crash Time Category','Speed Category']

# Initialize LabelEncoders and apply them to each categorical column
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Print out mappings for each categorical variable
for col, le in label_encoders.items():
    mappings = {index: label for index, label in enumerate(le.classes_)}
    print(f'Mappings for {col}: {mappings}\n')

Mappings for $1000 Damage to Any One Person's Property: {0: 'No', 1: 'Yes'}

Mappings for Active School Zone Flag: {0: 'No', 1: 'Yes'}

Mappings for At Intersection Flag: {0: False, 1: True}

Mappings for Construction Zone Flag: {0: 'No', 1: 'Yes'}

Mappings for Day of Week: {0: 'Friday', 1: 'Monday', 2: 'Saturday', 3: 'Sunday', 4: 'Thursday', 5: 'Tuesday', 6: 'Wednesday'}

Mappings for Intersection Related: {0: 'Driveway Access', 1: 'Intersection', 2: 'Intersection Related', 3: 'Non Intersection', 4: 'Not Reported'}

Mappings for Roadway Part: {0: 'Entrance/On Ramp', 1: 'Main/Proper Lane', 2: 'Other (Explain In Narrative)', 3: 'Service/Frontage Road'}

Mappings for Person Helmet: {0: 'Not Worn', 1: 'Unknown If Worn', 2: 'Worn, Damaged', 3: 'Worn, Not Damaged', 4: 'Worn, Unk Damage'}

Mappings for Surface Condition: {0: 'Dry', 1: 'Ice', 2: 'Other (Explain In Narrative)', 3: 'Sand, Mud, Dirt', 4: 'Standing Water', 5: 'Unknown', 6: 'Wet'}


Mappings for Crash Time Category: {0: 0, 1: 1, 

In [9]:
df

      $1000 Damage to Any One Person's Property  Active School Zone Flag  \
0                                             0                        0   
1                                             0                        0   
2                                             0                        0   
3                                             0                        0   
4                                             0                        0   
...                                         ...                      ...   
2458                                          0                        0   
2459                                          0                        0   
2460                                          0                        0   
2461                                          1                        0   
2462                                          1                        0   

      At Intersection Flag Average Daily Traffic Amount  \
0                        0  

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from joblib import dump

# Assuming df is your DataFrame and is already preprocessed
# Define the list of features you want to use for modeling
selected_features = [
    'Day of Week',
    'Active School Zone Flag',
    'Speed Category',
    'Crash Time Category',
    'Surface Condition',
    'Person Helmet',
    'Intersection Related',
    'Construction Zone Flag',
    'Roadway Part',
    'Traffic Control Type'
]

# Selecting only the specified features for X
X = df[selected_features]

# Assuming 'Crash Severity' is your target variable
y = df['Crash Severity']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating a pipeline with SMOTE and Random Forest Classifier
pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Fitting the pipeline to the training data
pipeline.fit(X_train, y_train)

# Making predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluating the model
print("Random Forest with SMOTE Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Saving the pipeline for later use
dump(pipeline, 'random_forest_with_smote.joblib')


Random Forest with SMOTE Accuracy: 0.5456389452332657
Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.69      0.67       328
           1       0.31      0.26      0.28       162
           2       0.00      0.00      0.00         3

    accuracy                           0.55       493
   macro avg       0.32      0.32      0.32       493
weighted avg       0.54      0.55      0.54       493

Confusion Matrix:
 [[227  94   7]
 [118  42   2]
 [  2   1   0]]


['random_forest_with_smote.joblib']