In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')


Model Training


In [17]:
# Load the cleaned accident dataset that the rest of this notebook trains on.
cleaned_csv_path = 'Data/Accident_Data_Cleaned.csv'
df_model = pd.read_csv(cleaned_csv_path)

In [18]:
# Preview the first five rows to sanity-check the loaded columns.
df_model.head(n=5)


Unnamed: 0,Accident_Severity,Number_of_Casualties,Number_of_Vehicles,Light_Conditions_Darkness_Lights_Off,Light_Conditions_Darkness_Lights_On,Light_Conditions_Darkness_Lights_Unknown,Light_Conditions_Darkness_No_Lights,Light_Conditions_Daylight,Road_Surface_Conditions_Dry,Road_Surface_Conditions_Flood over 3cm. deep,...,Urban_or_Rural_Area_Rural,Urban_or_Rural_Area_Urban,Vehicle_Type_Agricultural vehicle,Vehicle_Type_Bus,Vehicle_Type_Car,Vehicle_Type_Goods Carrier,Vehicle_Type_MotorCycle,Vehicle_Type_Other vehicle,Vehicle_Type_Pedal cycle,Vehicle_Type_Ridden horse
0,2,1,2,False,True,False,False,False,True,False,...,False,True,False,False,True,False,False,False,False,False
1,2,1,2,False,False,False,False,True,False,False,...,False,True,False,False,True,False,False,False,False,False
2,2,1,4,False,False,False,False,True,True,False,...,False,True,False,True,False,False,False,False,False,False
3,2,2,3,False,False,False,False,True,True,False,...,False,True,False,False,True,False,False,False,False,False
4,2,1,2,False,False,False,False,True,True,False,...,False,True,False,False,False,True,False,False,False,False


In [27]:
# Separate the predictors from the target column.
# 'Unnamed: 0' looks like a leftover row index (0, 1, 2, ... in the preview
# above); it says nothing about an accident, so it is excluded from the
# features. errors='ignore' keeps the cell working if that column is absent.
x = df_model.drop(columns=['Accident_Severity', 'Unnamed: 0'], errors='ignore')
y = df_model['Accident_Severity']

In [29]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_trains, X_tests, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=44,
)

In [30]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so the test rows never influence it.
stdscaler = StandardScaler()
stdscaler.fit(X_trains)
x_train = stdscaler.transform(X_trains)
x_test = stdscaler.transform(X_tests)

In [23]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,f1_score,precision_score,recall_score,roc_auc_score

Initial Training

In [None]:
# Baseline comparison: fit every classifier with its default hyperparameters
# and print accuracy and weighted F1 for the training and the test split.
models = {
    "Logistic": LogisticRegression(),
    "Xgboost": XGBClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boost": GradientBoostingClassifier(),
    "Adaboost": AdaBoostClassifier(),
}

# Iterate over (label, estimator) pairs directly instead of rebuilding
# list(models.values()) / list(models.keys()) on every pass.
for model_name, model in models.items():
    model.fit(x_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred)
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

    # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred)
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

    print(model_name)

    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print('- F1 score: {:.4f}'.format(model_train_f1))

    print('----------------------------------')

    print('Model performance for Test set')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy))
    print('- F1 score: {:.4f}'.format(model_test_f1))

    print('='*35)
    print('\n')

Logistic
Model performance for Training set
- Accuracy: 0.4537
- F1 score: 0.4479
----------------------------------
Model performance for Test set
- Accuracy: 0.4539
- F1 score: 0.4479


Decision Tree
Model performance for Training set
- Accuracy: 0.4996
- F1 score: 0.4926
----------------------------------
Model performance for Test set
- Accuracy: 0.4476
- F1 score: 0.4384


Random Forest
Model performance for Training set
- Accuracy: 0.4996
- F1 score: 0.4918
----------------------------------
Model performance for Test set
- Accuracy: 0.4511
- F1 score: 0.4413


Gradient Boost
Model performance for Training set
- Accuracy: 0.4723
- F1 score: 0.4633
----------------------------------
Model performance for Test set
- Accuracy: 0.4612
- F1 score: 0.4505


Adaboost
Model performance for Training set
- Accuracy: 0.4640
- F1 score: 0.4575
----------------------------------
Model performance for Test set
- Accuracy: 0.4651
- F1 score: 0.4581




Hyperparameter tuning

In [38]:
## Hyperparameter search spaces
# One dict per estimator, mapping a constructor argument to its candidate values.
# Only currently supported option names are listed. The deprecated aliases
# "auto" (max_features), 'deviance' (loss) and 'mse' (criterion) were removed
# in scikit-learn 1.2/1.3, and any candidate that samples them fails to fit.
rf_params = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [5, 8, 15, None, 10],
    # "sqrt" is what "auto" resolved to for RandomForestClassifier.
    "max_features": [5, 7, "sqrt", 8],
    "min_samples_split": [2, 8, 15, 20],
    "n_estimators": [100, 200, 500, 1000],
}

xgboost_params = {
    # NOTE(review): XGBoost documents eta/learning_rate as lying in [0, 1];
    # confirm that 10 and 100 are intended candidates.
    "learning_rate": [0.1, 0.01, 1.0, 10, 100],
    "max_depth": [5, 8, 12, 20, 30, 40, 50],
    "n_estimators": [100, 200, 300, 400, 500],
    "colsample_bytree": [0.5, 0.8, 1, 0.3, 0.4],
}

adaboost_param = {
    "learning_rate": [0.1, 0.01, 1.0, 10, 100],
    "n_estimators": [50, 60, 70, 80, 90],
    # NOTE(review): 'SAMME.R' is deprecated in newer scikit-learn releases;
    # confirm it is still accepted by the installed version.
    "algorithm": ["SAMME", "SAMME.R"],
}

gradient_params = {
    # 'deviance' was only an older name for 'log_loss', so dropping it loses nothing.
    # NOTE(review): 'exponential' is documented for binary targets only;
    # confirm Accident_Severity has exactly two classes.
    "loss": ["log_loss", "exponential"],
    # 'mse' was only an older name for 'squared_error'.
    "criterion": ["friedman_mse", "squared_error"],
    "min_samples_split": [2, 8, 15, 20],
    "n_estimators": [100, 200, 500],
    "max_depth": [5, 8, 15, None, 10],
    "learning_rate": [0.1, 0.01, 1.0, 10, 100],
}



In [39]:
# Estimators to tune, as (label, unfitted estimator, search space) triples.
# Only XGBoost is active; the commented entries are alternatives that are
# currently switched off.
cv_models = [
    # ("RF", RandomForestClassifier(), rf_params),
    ("Xgboost", XGBClassifier(), xgboost_params),
    # ("GradientBoost", GradientBoostingClassifier(), gradient_params),
    # ("Adaboost", AdaBoostClassifier(), adaboost_param),
    # ("DT", DecisionTreeClassifier(), dtparam),
]

Random CV

In [40]:
from sklearn.model_selection import RandomizedSearchCV

# Randomised hyperparameter search: for each entry in cv_models, sample 100
# candidates from its search space, score each with 3-fold cross-validation on
# the scaled training split, and keep the best parameter set per model.
model_param = {}
for name, model, params in cv_models:
    # Named random_search rather than `random`, which reads like the stdlib module.
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=100,
        cv=3,
        verbose=2,
        n_jobs=-1,
        # Fixed seed (same value as the train/test split) so the sampled
        # candidates, and therefore the reported best params, are repeatable.
        random_state=44,
    )
    random_search.fit(x_train, y_train)
    model_param[name] = random_search.best_params_

for model_name, best_params in model_param.items():
    print(f"---------------- Best Params for {model_name} -------------------")
    print(best_params)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
---------------- Best Params for Xgboost -------------------
{'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.01, 'colsample_bytree': 0.8}


In [26]:
# Random CV: refit the classifiers with hand-entered tuned hyperparameters and
# print accuracy and weighted F1 for the training and the test split.
models = {
    "Xgboost": XGBClassifier(n_estimators=500, max_depth=5, learning_rate=0.01, colsample_bytree=0.8),
    "Adaboost": AdaBoostClassifier(n_estimators=50, algorithm='SAMME', learning_rate=0.1),
}

# Iterate over (label, estimator) pairs directly instead of rebuilding
# list(models.values()) / list(models.keys()) on every pass.
for model_name, model in models.items():
    model.fit(x_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred)
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

    # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred)
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

    print(model_name)

    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print('- F1 score: {:.4f}'.format(model_train_f1))

    print('----------------------------------')

    print('Model performance for Test set')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy))
    print('- F1 score: {:.4f}'.format(model_test_f1))

    print('='*35)
    print('\n')

Adaboost
Model performance for Training set
- Accuracy: 0.8525
- F1 score: 0.7847
----------------------------------
Model performance for Test set
- Accuracy: 0.8510
- F1 score: 0.7825




Grid CV

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive hyperparameter search: for each entry in cv_models, try every
# combination in its grid with 3-fold cross-validation on the scaled training
# split, and keep the best parameter set per model.
model_param = {}
for name, model, params in cv_models:
    # Named grid_search: the object is a GridSearchCV, not a randomised search.
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=params,
        cv=3,
        verbose=2,
        n_jobs=-1,
    )
    grid_search.fit(x_train, y_train)
    model_param[name] = grid_search.best_params_

for model_name, best_params in model_param.items():
    print(f"---------------- Best Params for {model_name} -------------------")
    print(best_params)

In [154]:
# Grid CV: refit XGBoost with hand-entered tuned hyperparameters and print
# accuracy and weighted F1 for the training and the test split.
models = {
    "Xgboost": XGBClassifier(n_estimators=500, max_depth=12, learning_rate=0.1, colsample_bytree=0.5),
    # "Adaboost": AdaBoostClassifier(n_estimators= 50, algorithm= 'SAMME',learning_rate=0.1)
}

# Iterate over (label, estimator) pairs directly instead of rebuilding
# list(models.values()) / list(models.keys()) on every pass.
for model_name, model in models.items():
    model.fit(x_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred)
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

    # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred)
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

    print(model_name)

    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print('- F1 score: {:.4f}'.format(model_train_f1))

    print('----------------------------------')

    print('Model performance for Test set')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy))
    print('- F1 score: {:.4f}'.format(model_test_f1))

    print('='*35)
    print('\n')

Xgboost
Model performance for Training set
- Accuracy: 0.8547
- F1 score: 0.7908
----------------------------------
Model performance for Test set
- Accuracy: 0.8502
- F1 score: 0.7837




In [153]:
# Final model that gets persisted. It is fitted on the standardised training
# matrix (x_train), the same representation used by every evaluated model in
# this notebook and the one the saved StandardScaler produces. Fitting on the
# raw X_trains frame would leave the model and the scaler inconsistent.
# NOTE(review): n_estimators=400 here, while the evaluated Grid CV model used
# 500 — confirm which value is intended for the saved model.
XGB_Classifier = XGBClassifier(n_estimators=400, max_depth=12, learning_rate=0.1, colsample_bytree=0.5)
XGB_Classifier.fit(x_train, y_train)

    

In [155]:
import pickle

# Persist the fitted classifier (XGB_Classifier) and the fitted scaler
# (stdscaler) to the working directory. The `with` blocks make sure each file
# is flushed and closed even if pickling raises.
with open('XBGC_Model_Accident_Severity_Prediction.pkl', 'wb') as model_file:
    pickle.dump(XGB_Classifier, model_file)

with open('Scalar_Model_Accident_Severity_Prediction.pkl', 'wb') as scaler_file:
    pickle.dump(stdscaler, scaler_file)
