In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [97]:
# Read CSV file

data=pd.read_csv('preprocessed_UK_Accidents_2009_updated.csv',index_col='accident_index')
data=data.drop('seasons_ranges',axis=1)
x = data.drop('accident_severity', axis=1) 
y = data['accident_severity']

In [99]:
# Scalling data
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [100]:
## PCA
priciple_component_analayzer = PCA(n_components=15) # based on domain knowledge from feature extraction notebook
x_pca = priciple_component_analayzer.fit_transform(x_scaled)

In [31]:
## Data Imbalance
class_counts = y.value_counts()
class_counts

1    134714
2     21475
3      2003
Name: accident_severity, dtype: int64

In [None]:
## As shown above the data suffer from sever data imbalance
## Solutions:
## 1-class weights (not available in NB, but prior probabilities can be adjusted to treate each class according to 
## its occurrence)
## 2- Resampling (random resampling will be used in our case )

In [None]:
## Gaussian NB
## This model assumes that all features are normally distributed, which could be a problem in our case. So, we will be testing
## first on a base model 

In [44]:
## base model with data before pca
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=42)

# Create a Gaussian Naive Bayes model
gnb = GaussianNB()

# Train the model on the training data
gnb.fit(X_train, y_train)

# Evaluate the performance of the model on the training and testing data
y_train_pred = gnb.predict(X_train)
y_test_pred = gnb.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

train_f1 = f1_score(y_train, y_train_pred, average='micro')
test_f1 = f1_score(y_test, y_test_pred, average='micro')

train_precision = precision_score(y_train, y_train_pred, average='micro')
test_precision = precision_score(y_test, y_test_pred, average='micro')

train_recall = recall_score(y_train, y_train_pred, average='micro')
test_recall = recall_score(y_test, y_test_pred, average='micro')

# Print the evaluation metrics
print("Training accuracy:", train_accuracy)
print("Testing accuracy:", test_accuracy)

print("Training F1 score:", train_f1)
print("Testing F1 score:", test_f1)

print("Training precision:", train_precision)
print("Testing precision:", test_precision)

print("Training recall:", train_recall)
print("Testing recall:", test_recall)

Training accuracy: 0.8519741490224076
Testing accuracy: 0.8493826119937629
Training F1 score: 0.8519741490224076
Testing F1 score: 0.8493826119937629
Training precision: 0.8519741490224076
Testing precision: 0.8493826119937629
Training recall: 0.8519741490224076
Testing recall: 0.8493826119937629


In [46]:
## base model with data after pca
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x_pca, y, test_size=0.15, random_state=42)

# Create a Gaussian Naive Bayes model
gnb = GaussianNB()

# Train the model on the training data
gnb.fit(X_train, y_train)

# Evaluate the performance of the model on the training and testing data
y_train_pred = gnb.predict(X_train)
y_test_pred = gnb.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

train_f1 = f1_score(y_train, y_train_pred, average='micro')
test_f1 = f1_score(y_test, y_test_pred, average='micro')

train_precision = precision_score(y_train, y_train_pred, average='micro')
test_precision = precision_score(y_test, y_test_pred, average='micro')

train_recall = recall_score(y_train, y_train_pred, average='micro')
test_recall = recall_score(y_test, y_test_pred, average='micro')

# Print the evaluation metrics
print("Training accuracy:", train_accuracy)
print("Testing accuracy:", test_accuracy)

print("Training F1 score:", train_f1)
print("Testing F1 score:", test_f1)

print("Training precision:", train_precision)
print("Testing precision:", test_precision)

print("Training recall:", train_recall)
print("Testing recall:", test_recall)

Training accuracy: 0.8457865732580709
Testing accuracy: 0.8432719457204265
Training F1 score: 0.8457865732580709
Testing F1 score: 0.8432719457204265
Training precision: 0.8457865732580709
Testing precision: 0.8432719457204265
Training recall: 0.8457865732580709
Testing recall: 0.8432719457204265


In [47]:
## base model with data after pca and normalization
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.15, random_state=42)

# Create a Gaussian Naive Bayes model
gnb = GaussianNB()

# Train the model on the training data
gnb.fit(X_train, y_train)

# Evaluate the performance of the model on the training and testing data
y_train_pred = gnb.predict(X_train)
y_test_pred = gnb.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

train_f1 = f1_score(y_train, y_train_pred, average='micro')
test_f1 = f1_score(y_test, y_test_pred, average='micro')

train_precision = precision_score(y_train, y_train_pred, average='micro')
test_precision = precision_score(y_test, y_test_pred, average='micro')

train_recall = recall_score(y_train, y_train_pred, average='micro')
test_recall = recall_score(y_test, y_test_pred, average='micro')

# Print the evaluation metrics
print("Training accuracy:", train_accuracy)
print("Testing accuracy:", test_accuracy)

print("Training F1 score:", train_f1)
print("Testing F1 score:", test_f1)

print("Training precision:", train_precision)
print("Testing precision:", test_precision)

print("Training recall:", train_recall)
print("Testing recall:", test_recall)

Training accuracy: 0.20061280798435258
Testing accuracy: 0.2025369800665852
Training F1 score: 0.2006128079843526
Testing F1 score: 0.2025369800665852
Training precision: 0.20061280798435258
Testing precision: 0.2025369800665852
Training recall: 0.20061280798435258
Testing recall: 0.2025369800665852


In [48]:
## From the results above scalling really degraded the performance of the model. Moreover, the PCA reduced 
## the performance by a very small percentage. Therefore, PCA data will be used to reduce 
## features in order to save computation time

In [52]:
oversampler = RandomOverSampler(random_state=10)
x_resampled, y_resampled = oversampler.fit_resample(x_pca, y)
x_resampled=pd.DataFrame(x_resampled)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size=0.15, random_state=42)

# Create a Gaussian Naive Bayes model
gnb = GaussianNB()

# Train the model on the training data
gnb.fit(X_train, y_train)

# Evaluate the performance of the model on the training and testing data
y_train_pred = gnb.predict(X_train)
y_test_pred = gnb.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

train_f1 = f1_score(y_train, y_train_pred, average='micro')
test_f1 = f1_score(y_test, y_test_pred, average='micro')

train_precision = precision_score(y_train, y_train_pred, average='micro')
test_precision = precision_score(y_test, y_test_pred, average='micro')

train_recall = recall_score(y_train, y_train_pred, average='micro')
test_recall = recall_score(y_test, y_test_pred, average='micro')

# Print the evaluation metrics
print("Training accuracy:", train_accuracy)
print("Testing accuracy:", test_accuracy)

print("Training F1 score:", train_f1)
print("Testing F1 score:", test_f1)

print("Training precision:", train_precision)
print("Testing precision:", test_precision)

print("Training recall:", train_recall)
print("Testing recall:", test_recall)

Training accuracy: 0.4489724033535165
Testing accuracy: 0.4470489261324272
Training F1 score: 0.44897240335351657
Testing F1 score: 0.4470489261324272
Training precision: 0.4489724033535165
Testing precision: 0.4470489261324272
Training recall: 0.4489724033535165
Testing recall: 0.4470489261324272


In [None]:
## resampling
oversampler = RandomOverSampler(random_state=10)
x_resampled, y_resampled = oversampler.fit_resample(x_pca, y)
x_resampled=pd.DataFrame(x_resampled)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size=0.15, random_state=42)

# Create a Gaussian Naive Bayes model
gnb = GaussianNB()

# Train the model on the training data
gnb.fit(X_train, y_train)

# Evaluate the performance of the model on the training and testing data
y_train_pred = gnb.predict(X_train)
y_test_pred = gnb.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

train_f1 = f1_score(y_train, y_train_pred, average='micro')
test_f1 = f1_score(y_test, y_test_pred, average='micro')

train_precision = precision_score(y_train, y_train_pred, average='micro')
test_precision = precision_score(y_test, y_test_pred, average='micro')

train_recall = recall_score(y_train, y_train_pred, average='micro')
test_recall = recall_score(y_test, y_test_pred, average='micro')

# Print the evaluation metrics
print("Training accuracy:", train_accuracy)
print("Testing accuracy:", test_accuracy)

print("Training F1 score:", train_f1)
print("Testing F1 score:", test_f1)

print("Training precision:", train_precision)
print("Testing precision:", test_precision)

print("Training recall:", train_recall)
print("Testing recall:", test_recall)

In [56]:
## Prriors
# Compute the number of samples for each class
class_counts = np.bincount(y)

# Compute the prior probabilities of each class
priors = class_counts / len(y)



[0.         0.85158542 0.13575276 0.01266183]


In [60]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size=0.15, random_state=42)
# Create a Gaussian Naive Bayes model
prior_prob = np.array([0.85158542 , 0.13575276, 0.01266183])
gnb = GaussianNB(priors=prior_prob)

# Train the model on the training data
gnb.fit(X_train, y_train)

# Evaluate the performance of the model on the training and testing data
y_train_pred = gnb.predict(X_train)
y_test_pred = gnb.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

train_f1 = f1_score(y_train, y_train_pred, average='micro')
test_f1 = f1_score(y_test, y_test_pred, average='micro')

train_precision = precision_score(y_train, y_train_pred, average='micro')
test_precision = precision_score(y_test, y_test_pred, average='micro')

train_recall = recall_score(y_train, y_train_pred, average='micro')
test_recall = recall_score(y_test, y_test_pred, average='micro')

# Print the evaluation metrics
print("Training accuracy:", train_accuracy)
print("Testing accuracy:", test_accuracy)

print("Training F1 score:", train_f1)
print("Testing F1 score:", test_f1)

print("Training precision:", train_precision)
print("Testing precision:", test_precision)

print("Training recall:", train_recall)
print("Testing recall:", test_recall)

Training accuracy: 0.34161911970190967
Testing accuracy: 0.34108079575071754
Training F1 score: 0.34161911970190967
Testing F1 score: 0.34108079575071754
Training precision: 0.34161911970190967
Testing precision: 0.34108079575071754
Training recall: 0.34161911970190967
Testing recall: 0.34108079575071754


In [61]:
## as the results show the priors made the performance of the model worse. Therefore we will work with the imbalanced data, 
## although it's expected that this model wont yield good results as it assumes that the features are normally distributed

In [62]:
## anothe model is Multinomial Naive Bayes, but we wont use it as, Multinomial Naive Bayes assumes that the features 
## are counts of occurrences of different events. Our data is not suitable as it contains numerical data and not categorical
## data only

In [91]:
## Train/Test split
X_train, X_test, y_train, y_test = train_test_split(x_pca, y, test_size=0.15, random_state=10,stratify=y)## stratify=y used to have the same ration of classes in each split

In [92]:
param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]}
# Create a Gaussian Naive Bayes model
gnb = GaussianNB()

# Define custom scoring functions to calculate accuracy, precision, recall, and F1 score with average='macro'
accuracy_scorer = make_scorer(accuracy_score)
precision_scorer = make_scorer(precision_score, average='micro', zero_division=0)
recall_scorer = make_scorer(recall_score, average='micro')
f1_scorer = make_scorer(f1_score, average='micro')

# Perform grid search and obtain performance metrics for each combination of hyperparameters
grid_search = GridSearchCV(gnb, param_grid=param_grid, cv=StratifiedKFold(n_splits=2, random_state=10, shuffle=True),
                           scoring={'accuracy': accuracy_scorer, 'precision': precision_scorer, 'recall': recall_scorer, 'f1': f1_scorer},
                           refit='precision', verbose=3, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and corresponding performance metrics
print("Best hyperparameters:", grid_search.best_params_)
print("Best accuracy:", grid_search.best_score_)
print("Best precision:", grid_search.cv_results_['mean_test_precision'][grid_search.best_index_])
print("Best recall:", grid_search.cv_results_['mean_test_recall'][grid_search.best_index_])
print("Best F1 score:", grid_search.cv_results_['mean_test_f1'][grid_search.best_index_])

Fitting 2 folds for each of 9 candidates, totalling 18 fits
Best hyperparameters: {'var_smoothing': 0.1}
Best accuracy: 0.8497727881080065
Best precision: 0.8497727881080065
Best recall: 0.8497727881080065
Best F1 score: 0.8497727881080065


In [93]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Create a Gaussian Naive Bayes model with the best var_smoothing parameter
gnb = GaussianNB(var_smoothing=0.1)

# Train the model on the training set
gnb.fit(X_train, y_train)

# Make predictions on the training set and calculate performance metrics
y_train_pred = gnb.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
train_precision = precision_score(y_train, y_train_pred, average='micro', zero_division=0)
train_recall = recall_score(y_train, y_train_pred, average='micro')
train_f1 = f1_score(y_train, y_train_pred, average='micro')

# Make predictions on the testing set and calculate performance metrics
y_test_pred = gnb.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred, average='micro', zero_division=0)
test_recall = recall_score(y_test, y_test_pred, average='micro')
test_f1 = f1_score(y_test, y_test_pred, average='micro')

# Print the performance metrics
print("Training accuracy:", train_accuracy)
print("Training precision:", train_precision)
print("Training recall:", train_recall)
print("Training F1 score:", train_f1)
print("Testing accuracy:", test_accuracy)
print("Testing precision:", test_precision)
print("Testing recall:", test_recall)
print("Testing F1 score:", test_f1)

Training accuracy: 0.8496538081107814
Training precision: 0.8496538081107814
Training recall: 0.8496538081107814
Training F1 score: 0.8496538081107814
Testing accuracy: 0.8491297568376248
Testing precision: 0.8491297568376248
Testing recall: 0.8491297568376248
Testing F1 score: 0.8491297568376248


In [None]:
## The reults for the best parameters shows that the model is not overfitting (but may be underfitting), which indicates that this is the best possible
## results for this data.
## The fact that the GNB assumes that all features are normally distributed, affect the model alot. Other NB variations 
## are not suitable for our data 
## Another approach that we could have done, but we couldn't due to tigh project time, is to normalize the distribution 
## the features that are not normally distributed