# Predicting Traffic Crashes: A Data-Driven Approach to Road Safety

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Read the crash dataset from disk (point this at your own CSV file if needed)
df = pd.read_csv("Enhanced_Crash_Prediction_Data.csv")

# Alternative: reuse the prepared DataFrame when it is already in memory
#df = df_enhanced.copy()

# ------------------------------------
# Step 1: Define features and target
# ------------------------------------
# Target: the 'Crash_Occurred' column.
y = df['Crash_Occurred']

# Features: everything except the target and the non-feature columns
# 'Crash_ID' and 'Date_Time'. drop() returns a new frame; df is not modified.
X = df.drop(['Crash_Occurred', 'Crash_ID', 'Date_Time'], axis=1)

# ------------------------------------
# Step 2: Define column types
# ------------------------------------
# Feature columns that are treated as categorical.
categorical_cols = [
    'Weather_Condition',
    'Road_Condition',
    'Road_Geometry',
    'Law_Enforcement_Activity',
    'Special_Event',
    'Holiday',
    'Day_Name',
    'Time_Bin',
    'Near_Holiday',
    'Spatial_Zone',
]

# Feature columns that are treated as numeric.
numeric_cols = [
    'Traffic_Volume',
    'Truck_Volume',
    'Population_Density',
    'Hour',
    'DayOfWeek',
    'Is_Weekend',
]

# ------------------------------------
# Step 3: Preprocessing pipeline
# ------------------------------------
# Two branches, each applied to its own column list:
#   'cat' -> one-hot encoding; handle_unknown='ignore' means a category not
#            seen during fit does not raise at transform time
#   'num' -> standardization with StandardScaler
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numeric_cols),
    ]
)

# ------------------------------------
# Step 4: Modeling pipeline
# ------------------------------------
# Logistic-regression alternative, kept for reference:
# pipeline = Pipeline([
#     ('preprocess', preprocessor),
#     ('classifier', LogisticRegression(max_iter=1000))
# ])

from sklearn.ensemble import RandomForestClassifier

# Active model: the preprocessing step followed by a random forest
# (100 trees, fixed random_state so repeated runs give the same model).
pipeline = Pipeline(
    steps=[
        ('preprocess', preprocessor),
        (
            'classifier',
            RandomForestClassifier(random_state=42, n_estimators=100),
        ),
    ]
)

# Note: tree-based models such as RandomForestClassifier are insensitive to feature scaling, so the StandardScaler step is optional for them; leaving it in is harmless.

# XGBClassifier
# from xgboost import XGBClassifier

# pipeline = Pipeline([
#     ('preprocess', preprocessor),
#     ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
# ])





# ------------------------------------
# Step 5: Train-test split
# ------------------------------------
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so
# the split is reproducible.
# stratify=y keeps the proportion of each target class the same in the train
# and test sets. Without it, a random split can leave the hold-out set with a
# class mix that differs from the full data, which distorts the per-class
# metrics reported during evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ------------------------------------
# Step 6: Train the model
# ------------------------------------
# Fits every pipeline step on the training rows only.
pipeline.fit(X_train, y_train)

# ------------------------------------
# Step 7: Predict and evaluate
# ------------------------------------
# Predict labels for the held-out rows.
y_pred = pipeline.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
# The report is printed in its own call: passing it as a second print()
# argument inserts the default separator space in front of it, which pushes
# the report's header row one column out of line with the rows beneath it.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
