# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# 📂 Step 2: Load Data
# Read the CAS crash export from the working directory and report its dimensions.
csv_path = 'Crash_Analysis_System_(CAS)_data.csv'
df = pd.read_csv(csv_path)
print("Shape of dataset:", df.shape)
# Preview of the first rows; a notebook only renders this when it is the
# last expression of the cell.
df.head()

# 🧹 Step 3: Preprocessing
# Columns with too many nulls or judged irrelevant as model features.
columns_to_drop = [
    'X', 'Y', 'OBJECTID', 'advisorySpeed', 'areaUnitID', 'bicycle',
    'bus', 'cliffBank', 'crashDirectionDescription', 'crashLocation1', 'crashLocation2',
    'crashRoadSideRoad', 'crashSHDescription', 'debris', 'directionRoleDescription',
    'ditch', 'fence', 'flatHill', 'guardRail', 'holiday', 'houseOrBuilding',
    'intersection', 'kerb', 'light', 'meshblockId', 'moped', 'motorcycle',
    'NumberOfLanes', 'objectThrownOrDropped', 'otherObject', 'otherVehicleType',
    'overBank', 'parkedVehicle', 'phoneBoxEtc', 'postOrPole', 'region', 'roadCharacter',
    'roadLane', 'roadSurface', 'roadworks', 'schoolBus', 'slipOrFlood', 'strayAnimal',
    'streetLight', 'suv', 'taxi', 'temporarySpeedLimit', 'tlaId', 'tlaName',
    'trafficControl', 'trafficIsland', 'trafficSign', 'train', 'tree', 'truck',
    'unknownVehicleType', 'urban', 'vanOrUtility', 'vehicle', 'waterRiver',
]
# errors='ignore' keeps this working when the export lacks some of these columns.
df = df.drop(columns=columns_to_drop, errors='ignore')

# Rows without a target label cannot be used for supervised training.
df = df.dropna(subset=['crashSeverity'])

# Fill the remaining gaps: column mean for numeric features, most frequent
# value for everything else. numeric_only=True is required because
# DataFrame.mean() raises TypeError on non-numeric columns in pandas >= 2.0.
df = df.fillna(df.mean(numeric_only=True))
for col in df.select_dtypes(exclude='number').columns:
    most_frequent = df[col].mode(dropna=True)
    if not most_frequent.empty:  # an all-NaN column has no mode; leave it as is
        df[col] = df[col].fillna(most_frequent.iloc[0])

# Encode categorical variables as integer codes.
categorical_cols = ['weatherA', 'weatherB']  # Add other categorical columns as needed
# One fitted encoder per column so each mapping can be inverted later via
# label_encoders[col].inverse_transform(...). Refitting a single shared
# encoder would keep only the classes of the last column.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()  # `le` ends up bound to the last column's encoder
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# 🎯 Step 4: Define Features & Target
X = df.drop(columns='crashSeverity')  # every remaining column except the target
y = df['crashSeverity']  # target variable
# NOTE(review): X is passed to the classifier as-is. Any column that is still
# non-numeric at this point will make clf.fit() fail; confirm that every
# remaining text column has been encoded or dropped.
# NOTE(review): check that X holds no columns recorded as a consequence of the
# outcome (e.g. casualty counts), which would leak the target — TODO confirm
# against the dataset schema.

# Hold out 20% for testing. stratify=y keeps the class proportions of
# crashSeverity the same in both splits, so rare classes are not
# under-represented in the test set by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 🔍 Step 5: Model Training
# n_jobs=-1 builds the trees on all available cores; with a fixed
# random_state the fitted forest does not depend on the number of jobs.
clf = RandomForestClassifier(random_state=42, n_jobs=-1)
clf.fit(X_train, y_train)

# 📊 Step 6: Evaluation
y_pred = clf.predict(X_test)
# Pin the row/column order of the matrix to the classifier's class order.
print(confusion_matrix(y_test, y_pred, labels=clf.classes_))
print(classification_report(y_test, y_pred))

# Output: Shape of dataset: (870753, 72)
