In [1]:
pip install gradio

Collecting gradio
  Downloading gradio-5.29.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.1 (from gradio)
  Downloading gradio_client-1.10.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6

In [4]:
# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import gradio as gr
from sklearn.preprocessing import LabelEncoder
# 2. Load Dataset
df = pd.read_csv("https://github.com/Joshva07/Enhancing-road-safety-with-AI-driven-traffic-accident-analysis-and-prediction/raw/main/Traffic%20Accidents.zip")  # Replace with actual file name
print("Data loaded. Shape:cidents_March23.csv", df.shape)

Data loaded. Shape:cidents_March23.csv (209306, 24)


In [6]:
# 3. Initial Exploration
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209306 entries, 0 to 209305
Data columns (total 24 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   crash_date                     209306 non-null  object 
 1   traffic_control_device         209306 non-null  object 
 2   weather_condition              209306 non-null  object 
 3   lighting_condition             209306 non-null  object 
 4   first_crash_type               209306 non-null  object 
 5   trafficway_type                209306 non-null  object 
 6   alignment                      209306 non-null  object 
 7   roadway_surface_cond           209306 non-null  object 
 8   road_defect                    209306 non-null  object 
 9   crash_type                     209306 non-null  object 
 10  intersection_related_i         209306 non-null  object 
 11  damage                         209306 non-null  object 
 12  prim_contributory_cause       

In [7]:
df.describe()

Unnamed: 0,num_units,injuries_total,injuries_fatal,injuries_incapacitating,injuries_non_incapacitating,injuries_reported_not_evident,injuries_no_indication,crash_hour,crash_day_of_week,crash_month
count,209306.0,209306.0,209306.0,209306.0,209306.0,209306.0,209306.0,209306.0,209306.0,209306.0
mean,2.0633,0.382717,0.001859,0.038102,0.221241,0.121516,2.244002,13.373047,4.144024,6.771822
std,0.396012,0.79972,0.047502,0.233964,0.61496,0.450865,1.241175,5.60383,1.966864,3.427593
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25%,2.0,0.0,0.0,0.0,0.0,0.0,2.0,9.0,2.0,4.0
50%,2.0,0.0,0.0,0.0,0.0,0.0,2.0,14.0,4.0,7.0
75%,2.0,1.0,0.0,0.0,0.0,0.0,3.0,17.0,6.0,10.0
max,11.0,21.0,3.0,7.0,21.0,15.0,49.0,23.0,7.0,12.0


In [8]:
df.isnull().sum()

Unnamed: 0,0
crash_date,0
traffic_control_device,0
weather_condition,0
lighting_condition,0
first_crash_type,0
trafficway_type,0
alignment,0
roadway_surface_cond,0
road_defect,0
crash_type,0


In [12]:
# 4. Drop irrelevant or mostly null columns
df = df.drop(columns=['ID', 'Source', 'End_Lat', 'End_Lng'], errors='ignore')
print(df.columns)


Index(['crash_date', 'traffic_control_device', 'weather_condition',
       'lighting_condition', 'first_crash_type', 'trafficway_type',
       'alignment', 'roadway_surface_cond', 'road_defect', 'crash_type',
       'intersection_related_i', 'damage', 'prim_contributory_cause',
       'num_units', 'most_severe_injury', 'injuries_total', 'injuries_fatal',
       'injuries_incapacitating', 'injuries_non_incapacitating',
       'injuries_reported_not_evident', 'injuries_no_indication', 'crash_hour',
       'crash_day_of_week', 'crash_month'],
      dtype='object')


In [43]:
# 5. Handle missing values

df = df.dropna()

In [24]:
# 6. Convert time columns safely
# Add a check to see if 'Start_Time' and 'End_Time' exist before processing
if 'Start_Time' in df.columns and 'End_Time' in df.columns:
    df['Start_Time'] = pd.to_datetime(df['Start_Time'], format='mixed', errors='coerce')
    df['End_Time'] = pd.to_datetime(df['End_Time'], format='mixed', errors='coerce')

    # Remove rows where conversion failed
    df = df.dropna(subset=['Start_Time', 'End_Time'])

    # Calculate Duration in minutes
    df['Duration'] = (df['End_Time'] - df['Start_Time']).dt.total_seconds() / 60

In [27]:
# 7. Feature Engineering: Extracting hour, day, etc.
# These lines should be inside the if block
if 'Start_Time' in df.columns and 'End_Time' in df.columns: # This check should encompass the feature engineering as well
    df['Hour'] = df['Start_Time'].dt.hour
    df['Weekday'] = df['Start_Time'].dt.weekday
    df['Month'] = df['Start_Time'].dt.month
else:
    print("Error: 'Start_Time' or 'End_Time' column not found in the DataFrame.")
    print("Available columns:", df.columns.tolist())
    # This else block should handle the case where time columns are missing,
    # and therefore time-based features cannot be created.
    print("Error: 'Start_Time' or 'End_Time' column not found in the DataFrame. Time-based features will not be created.")
    print("Available columns:", df.columns.tolist())

Error: 'Start_Time' or 'End_Time' column not found in the DataFrame.
Available columns: ['crash_date', 'traffic_control_device', 'weather_condition', 'lighting_condition', 'first_crash_type', 'trafficway_type', 'alignment', 'roadway_surface_cond', 'road_defect', 'crash_type', 'intersection_related_i', 'damage', 'prim_contributory_cause', 'num_units', 'most_severe_injury', 'injuries_total', 'injuries_fatal', 'injuries_incapacitating', 'injuries_non_incapacitating', 'injuries_reported_not_evident', 'injuries_no_indication', 'crash_hour', 'crash_day_of_week', 'crash_month']
Error: 'Start_Time' or 'End_Time' column not found in the DataFrame. Time-based features will not be created.
Available columns: ['crash_date', 'traffic_control_device', 'weather_condition', 'lighting_condition', 'first_crash_type', 'trafficway_type', 'alignment', 'roadway_surface_cond', 'road_defect', 'crash_type', 'intersection_related_i', 'damage', 'prim_contributory_cause', 'num_units', 'most_severe_injury', 'injur

In [28]:
# 8. Encode categorical features safely

# Convert boolean columns to integers (0/1) if present
bool_cols = ['Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit',
             'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming',
             'Traffic_Signal', 'Turning_Loop']

for col in bool_cols:
    if col in df.columns:
        df[col] = df[col].astype(int)

# One-hot encode twilight-related time features if they exist
time_cols = ['Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight']
available_time_cols = [col for col in time_cols if col in df.columns]

if available_time_cols:
    df = pd.get_dummies(df, columns=available_time_cols, drop_first=True)
else:
    print("No twilight-related time columns found.")

No twilight-related time columns found.


In [29]:
print(df.columns.tolist())

['crash_date', 'traffic_control_device', 'weather_condition', 'lighting_condition', 'first_crash_type', 'trafficway_type', 'alignment', 'roadway_surface_cond', 'road_defect', 'crash_type', 'intersection_related_i', 'damage', 'prim_contributory_cause', 'num_units', 'most_severe_injury', 'injuries_total', 'injuries_fatal', 'injuries_incapacitating', 'injuries_non_incapacitating', 'injuries_reported_not_evident', 'injuries_no_indication', 'crash_hour', 'crash_day_of_week', 'crash_month']


In [30]:
# Select non-numeric columns (typically object or bool types)
cat_cols = df.select_dtypes(exclude='number').columns

# Initialize LabelEncoder
le = LabelEncoder()

# Apply label encoding to each categorical column
for col in cat_cols:
    df[col] = le.fit_transform(df[col].astype(str))

In [35]:
# 9. Define features and target

# List of columns to potentially drop for feature set X
# Updated cols_to_drop_X based on potential column name differences
# Check the actual column names in your dataframe to replace 'Severity' if needed
# For example, if the severity column is named 'crash_severity', change 'Severity' to 'crash_severity'
cols_to_drop_X = ['Severity', 'Start_Time', 'End_Time', 'Duration'] # Add Duration as it's derived from Start/End

# Filter the list to include only columns that are present in df
# We should also remove the actual target column from the list of columns to drop from X
# The target column name is now dynamic, based on the 'target_column' variable.
cols_to_drop_X_updated = [col for col in cols_to_drop_X if col != target_column]
existing_cols_to_drop_X = [col for col in cols_to_drop_X_updated if col in df.columns]


# Define features by dropping the existing columns from the list
print(f"\nDefining feature set X by dropping: {existing_cols_to_drop_X}")

# Check if X can be created
if len(existing_cols_to_drop_X) < len(df.columns): # Ensure we are not dropping ALL columns
    # Drop columns from X, handling potential absence of cols in existing_cols_to_drop_X
    X = df.drop(columns=existing_cols_to_drop_X, errors='ignore')
    print("Feature set X created successfully.")
    print("Shape of X:", X.shape)
else:
    print("Error: All columns would be dropped when creating feature set X. Check 'cols_to_drop_X'.")
    # Handle this error appropriately, e.g., exit or raise an exception
    raise ValueError("Cannot create feature set X: all columns selected for dropping.")


# Define the target variable, ensuring it exists
# *** IMPORTANT: Replace 'Severity' with the actual name of your severity column ***
# Based on the global variable info, 'most_severe_injury' appears to be the likely target column name.
# Replace 'Severity' below with the correct column name from your dataset.
target_column = 'most_severe_injury' # <--- CHANGE THIS TO YOUR ACTUAL SEVERITY COLUMN NAME


if target_column in df.columns:
    print(f"\nDefining target variable y using column '{target_column}'...")
    y = df[target_column]
    print("Target variable y created successfully.")
    print("Shape of y:", y.shape)
else:
    print(f"\nError: '{target_column}' column not found in the DataFrame. Cannot define target variable.")
    print("Available columns:", df.columns.tolist()) # Print available columns for inspection
    # Handle this error appropriately, e.g., exit or raise an exception
    raise KeyError(f"'{target_column}' column not found")

# You can add train_test_split, scaling, model training, etc. here
# based on your subsequent steps.


Defining feature set X by dropping: []
Feature set X created successfully.
Shape of X: (209306, 24)

Defining target variable y using column 'most_severe_injury'...
Target variable y created successfully.
Shape of y: (209306,)


In [36]:
# 10. Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [37]:
# 11. Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [38]:
# 12. Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [39]:
# 13. Evaluate
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[   56     0     0     0     0]
 [    0  1338     0     0     0]
 [    0     0 30831     0     0]
 [    0     0     0  6384     0]
 [    0     0     0     0  3253]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        56
           1       1.00      1.00      1.00      1338
           2       1.00      1.00      1.00     30831
           3       1.00      1.00      1.00      6384
           4       1.00      1.00      1.00      3253

    accuracy                           1.00     41862
   macro avg       1.00      1.00      1.00     41862
weighted avg       1.00      1.00      1.00     41862



In [40]:
# 14. Prediction Function for Gradio
def predict_severity(Distance, Duration, Hour, Weekday, Month,
                     Amenity, Bump, Crossing, Give_Way, Junction,
                     No_Exit, Railway, Roundabout, Station, Stop,
                     Traffic_Calming, Traffic_Signal, Turning_Loop,
                     SS_Night, CT_Night, NT_Night, AT_Night):
    input_data = np.array([[Distance, Duration, Hour, Weekday, Month,
                            Amenity, Bump, Crossing, Give_Way, Junction,
                            No_Exit, Railway, Roundabout, Station, Stop,
                            Traffic_Calming, Traffic_Signal, Turning_Loop,
                            SS_Night, CT_Night, NT_Night, AT_Night]])
    input_scaled = scaler.transform(input_data)
    prediction = model.predict(input_scaled)
    return f"🚨 Predicted Severity Level: {int(prediction[0])}"

In [41]:
# 15. Gradio Interface
inputs = [
    gr.Number(label="Distance (miles)"),
    gr.Number(label="Duration (minutes)"),
    gr.Slider(0, 23, label="Hour of Day"),
    gr.Slider(0, 6, label="Day of Week (0=Monday)"),
    gr.Slider(1, 12, label="Month"),
    *[gr.Checkbox(label=col) for col in [
        "Amenity", "Bump", "Crossing", "Give_Way", "Junction",
        "No_Exit", "Railway", "Roundabout", "Station", "Stop",
        "Traffic_Calming", "Traffic_Signal", "Turning_Loop"
    ]],
    *[gr.Checkbox(label=col) for col in [
        "SS_Night", "CT_Night", "NT_Night", "AT_Night"
    ]]
]

output = gr.Textbox(label="Prediction")

gr.Interface(fn=predict_severity, inputs=inputs, outputs=output,
             title="🧠 AI Traffic Accident Severity Predictor",
             description="Enter traffic accident data to predict its severity using AI."
).launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://94be7cc2408c548462.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


