In [None]:
# Import the necessary packages
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, precision_recall_curve
import numpy as np
from imblearn.over_sampling import SMOTE

In [None]:
# Silence FutureWarning messages only; every other warning category is left alone
warnings.simplefilter("ignore", category=FutureWarning)

In [None]:
# Load the cleaned dataset from disk into a DataFrame
data = pd.read_csv(
    'source/data.csv',
    low_memory=False,
)

In [None]:
# Work on an independent (deep) copy so the loaded frame `data` is never modified
# by the feature engineering and preprocessing steps
data_processed = data.copy(deep=True)

In [None]:
# Discard the identifier-style columns that are not part of the feature list
data_processed = data_processed.drop(
    columns=['AccID', 'birth_year', 'vehicleID', 'num_veh'],
)

In [None]:
# Cast the four calendar/time columns to float64 in a single astype call
data_processed = data_processed.astype(
    dict.fromkeys(('time', 'day', 'month', 'year'), 'float64')
)

In [None]:
# Model inputs (order is significant: it fixes the column order of X) and the label column
features = [
    'lum',
    'atm_condition',
    'collision_type',
    'route_category',
    'traffic_regime',
    'reserved_lane_code',
    'longitudinal_profile',
    'upstream_terminal_number',
    'plan',
    'surface_condition',
    'infra',
    'accident_situation',
    'traffic_direction',
    'vehicle_category',
    'fixed_obstacle',
    'mobile_obstacle',
    'initial_impact_point',
    'manv',
    'motor',
    'seat',
    'user_category',
    'gender',
    'reason_travel',
    'safety_equipment1',
    'maximum_speed',
    'age',
    'lat',
    'long',
    'distance_upstream_terminal',
    'total_number_lanes',
    'day',
    'time',
    'month',
    'year',
]
target = 'gravity'

In [None]:
# Label vector first, then the design matrix: get_dummies one-hot encodes the
# categorical columns among the selected features, dropping the first level of each
y = data_processed[target]
X = pd.get_dummies(data=data_processed[features], drop_first=True)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardization: fit the scaler on the training split only, then apply the same
# fitted transform to the test split so no test-set statistics leak into training.
scaler = StandardScaler()
numerical_columns = X.select_dtypes(include=['float64']).columns

# train_test_split returns row selections of X; take explicit copies before
# assigning columns so the writes cannot raise SettingWithCopyWarning or land on
# a temporary slice instead of the frame used for training.
X_train = X_train.copy()
X_test = X_test.copy()

X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

In [None]:
# Sanity check: report (rows, columns) of both splits, one per line
print(
    f"Shape of X_train: {X_train.shape}",
    f"Shape of X_test: {X_test.shape}",
    sep="\n",
)

In [None]:
# Search space handed to GridSearchCV (every combination of the lists below is tried)
param_grid = dict(
    # Number of trees in the forest
    n_estimators=[100, 200],
    # Maximum depth of each tree
    max_depth=[20, 30],
    # Minimum number of samples required to split a node
    min_samples_split=[5, 10],
    # Minimum number of samples required at a leaf
    min_samples_leaf=[2, 4],
    # Number of features considered when looking for a split
    max_features=['sqrt'],
    # Single weighting scheme: class 2 (fatalities) counts 20x, all other classes 1x
    class_weight=[{1: 1, 2: 20, 3: 1, 4: 1}],
)

Apply ML model v1---->

In [None]:
# Initialize the base RandomForest estimator that the grid search below clones and tunes;
# random_state is fixed so repeated runs build the same forests.
rf = RandomForestClassifier(random_state=42)

In [None]:
# Exhaustive search over param_grid: 3-fold cross-validation scored by macro F1,
# running on all available cores with per-fit progress output
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the cross-validated search on the training split
grid_search.fit(X=X_train, y=y_train)

In [None]:
# Show the hyper-parameter combination that achieved the best cross-validation score
print(
    "Best Parameters found: ",
    grid_search.best_params_,
)

In [None]:
# Predict the test split; GridSearchCV.predict forwards to the refit best estimator
y_pred_grid = grid_search.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the tuned model on the test split
print(
    "Initial classification report after hyperparameter tuning:",
    classification_report(y_test, y_pred_grid),
    sep="\n",
)

In [None]:
# Threshold tuning for higher precision on fatalities:
# start from the per-class probabilities of the refit best estimator
y_probs = grid_search.predict_proba(X_test)

In [None]:
# Inspect the label order stored on the fitted model (classes_)
class_labels = grid_search.best_estimator_.classes_
print("Class labels in the model:", class_labels)

# Position of label 2 (Fatal) inside that ordering
fatal_class_index = np.nonzero(class_labels == 2)[0][0]
print(f"Index for Class 2 (Fatal): {fatal_class_index}")


In [None]:
# Use probabilities for Class 2 (Fatal).
# The column is selected with the index looked up from classes_ rather than a
# hard-coded 1: predict_proba columns follow classes_ order, so a fixed position
# silently returns another class's probabilities if label 2 is not second.
fatal_probs = y_probs[:, fatal_class_index]

In [None]:
# One-vs-rest precision/recall curve: "is the true label Fatal (2)?" against the Fatal probability
precision, recall, thresholds = precision_recall_curve(y_test.eq(2), fatal_probs)

In [None]:
# Find the threshold that balances both precision and recall (maximum F1).
# precision/recall contain one more entry than thresholds (the final point has no
# associated threshold), so drop it to keep the arrays index-aligned.
pr_sum = precision[:-1] + recall[:-1]
# Where precision + recall == 0 the F1 score is defined as 0. A plain division
# would yield NaN there, and np.argmax would then return the NaN position.
f1_scores = np.divide(
    2 * precision[:-1] * recall[:-1],
    pr_sum,
    out=np.zeros_like(pr_sum),
    where=pr_sum > 0,
)
best_threshold_index = np.argmax(f1_scores)  # Index of the highest F1-score
best_threshold = thresholds[best_threshold_index]


In [None]:
# Binary fatal / not-fatal prediction: 1 where the Fatal probability reaches the chosen threshold
y_pred_adjusted = np.where(fatal_probs >= best_threshold, 1, 0)

In [None]:
# Report the selected threshold and the binary (Fatal vs. rest) metrics it produces
print(
    f"Best Threshold: {best_threshold}",
    "Classification report after further threshold tuning:",
    classification_report(y_test == 2, y_pred_adjusted),
    sep="\n",
)

Apply ML model v2---->

In [None]:
# Cost-sensitive forest: mistakes on Class 2 (Fatal) are weighted 20x, every other class 1x
class_weights = {label: (20 if label == 2 else 1) for label in (1, 2, 3, 4)}

# Build and train in one step (fit returns the estimator itself)
rf_model = RandomForestClassifier(
    class_weight=class_weights,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out split
y_pred = rf_model.predict(X_test)

# Per-class metrics for the weighted model
print(
    "Cost-Sensitive Random Forest Classification Report:",
    classification_report(y_test, y_pred),
    sep="\n",
)


In [None]:
# Apply ML model v3---->

In [None]:
# Create interaction features for more complex relationships

# Interaction between lighting conditions and time of accident (e.g., night-time accidents may be more fatal)
data_processed['lighting_time_interaction'] = data_processed['lum'] * data_processed['time']

# Interaction between weather conditions and location (e.g., certain locations may be more dangerous in bad weather)
# NOTE(review): this multiplies the atm_condition value by lat + long; if atm_condition
# is a nominal code rather than an ordered quantity the product has no natural meaning — confirm.
data_processed['weather_location_interaction'] = data_processed['atm_condition'] * (data_processed['lat'] + data_processed['long'])

# Register the new columns in the shared feature list idempotently: a plain
# extend() appends them again on every re-run of this cell, which would
# duplicate the columns in X.
for interaction_col in ('lighting_time_interaction', 'weather_location_interaction'):
    if interaction_col not in features:
        features.append(interaction_col)

# Rebuild the design matrix with the same one-hot encoding used for the first
# model, so categorical columns are never passed to SMOTE / the forest as raw values.
X = pd.get_dummies(data_processed[features], drop_first=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to the training split only (the test split keeps its original class
# distribution); the seed makes the synthetic samples reproducible.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

# Re-train the existing rf_model on the resampled data with the new features.
# NOTE(review): rf_model still carries the 20x class weight for Class 2 on top of
# the SMOTE balancing — confirm this double emphasis is intended.
rf_model.fit(X_res, y_res)

# Predict on the test set
y_pred = rf_model.predict(X_test)

# Generate classification report
print("Random Forest Classification Report with Feature Engineering:")
print(classification_report(y_test, y_pred))


In [None]:
# Apply ML model v4---->

In [None]:
# Threshold tuning for the re-trained rf_model (np and precision_recall_curve
# come from the import cell at the top of the notebook).

# Get the prediction probabilities for the test set
y_probs = rf_model.predict_proba(X_test)

# Extract the probabilities for Class 2 (Fatal), locating its column through classes_
fatal_probs = y_probs[:, np.where(rf_model.classes_ == 2)[0][0]]

# Use precision-recall curve to evaluate different thresholds
precision, recall, thresholds = precision_recall_curve(y_test == 2, fatal_probs)

# Find the best threshold for an optimal balance of precision and recall (based on F1 score).
# precision/recall hold one more entry than thresholds (the final point has no
# threshold), so drop it to keep the arrays index-aligned.
pr_sum = precision[:-1] + recall[:-1]
# F1 is taken as 0 where precision + recall == 0. A plain division would give
# NaN there, and np.argmax would then return the NaN position.
f1_scores = np.divide(
    2 * precision[:-1] * recall[:-1],
    pr_sum,
    out=np.zeros_like(pr_sum),
    where=pr_sum > 0,
)
best_threshold_index = np.argmax(f1_scores)
best_threshold = thresholds[best_threshold_index]

# Apply the best threshold
# NOTE(review): the threshold is selected and evaluated on the same test split,
# so the report below is optimistic — consider tuning on a validation split.
y_pred_threshold = (fatal_probs >= best_threshold).astype(int)

# Generate classification report for the adjusted threshold
print(f"Best Threshold: {best_threshold}")
print("Classification report after further threshold adjustment:")
print(classification_report(y_test == 2, y_pred_threshold))
