# LR

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.utils.class_weight import compute_class_weight


In [2]:
# Read CSV file

data=pd.read_csv('preprocessed_UK_Accidents_2009_updated.csv',index_col='accident_index')
data=data.drop('seasons_ranges',axis=1)
x = data.drop('accident_severity', axis=1) 
y = data['accident_severity']

In [3]:
# Scalling data
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [4]:
## PCA
priciple_component_analayzer = PCA(n_components=15) # based on domain knowledge from feature extraction notebook
x_pca = priciple_component_analayzer.fit_transform(x_scaled)

In [5]:
## Data Imbalance
class_counts = y.value_counts()
class_counts

1    134714
2     21475
3      2003
Name: accident_severity, dtype: int64

In [7]:
## As shown above the data suffer from sever data imbalance
## Solutions:
## 1-class weights (not available in KNN)
## 2- Resampling (random resampling will be used in our case )

In [22]:
## base model before class weights and resampling
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.15, random_state=42)

# Create a logistic regression model
log_reg = LogisticRegression(max_iter=10000,solver='liblinear',C=0.02,penalty='l1')

# Train the model on the training data
log_reg.fit(X_train, y_train)

# Evaluate the performance of the model on the training and testing data
y_train_pred = log_reg.predict(X_train)
y_test_pred = log_reg.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

train_f1 = f1_score(y_train, y_train_pred, average='micro')
test_f1 = f1_score(y_test, y_test_pred, average='micro')

train_precision = precision_score(y_train, y_train_pred, average='micro')
test_precision = precision_score(y_test, y_test_pred, average='micro')

train_recall = recall_score(y_train, y_train_pred, average='micro')
test_recall = recall_score(y_test, y_test_pred, average='micro')

# Print the evaluation metrics
print("Training accuracy:", train_accuracy)
print("Testing accuracy:", test_accuracy)

print("Training F1 score:", train_f1)
print("Testing F1 score:", test_f1)

print("Training precision:", train_precision)
print("Testing precision:", test_precision)

print("Training recall:", train_recall)
print("Testing recall:", test_recall)

Training accuracy: 0.8519741490224076
Testing accuracy: 0.8493826119937629
Training F1 score: 0.8519741490224076
Testing F1 score: 0.8493826119937629
Training precision: 0.8519741490224076
Testing precision: 0.8493826119937629
Training recall: 0.8519741490224076
Testing recall: 0.8493826119937629


In [14]:
## base model after resampling
oversampler = RandomOverSampler(random_state=10)
x_resampled, y_resampled = oversampler.fit_resample(x_pca, y)
x_resampled=pd.DataFrame(x_resampled)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size=0.15, random_state=42)

# Create a logistic regression model
log_reg = LogisticRegression(max_iter=10000,solver='liblinear',C=0.02,penalty='l1')

# Train the model on the training data
log_reg.fit(X_train, y_train)

# Evaluate the performance of the model on the training and testing data
y_train_pred = log_reg.predict(X_train)
y_test_pred = log_reg.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

train_f1 = f1_score(y_train, y_train_pred, average='micro')
test_f1 = f1_score(y_test, y_test_pred, average='micro')

train_precision = precision_score(y_train, y_train_pred, average='micro')
test_precision = precision_score(y_test, y_test_pred, average='micro')

train_recall = recall_score(y_train, y_train_pred, average='micro')
test_recall = recall_score(y_test, y_test_pred, average='micro')

# Print the evaluation metrics
print("Training accuracy:", train_accuracy)
print("Testing accuracy:", test_accuracy)

print("Training F1 score:", train_f1)
print("Testing F1 score:", test_f1)

print("Training precision:", train_precision)
print("Testing precision:", test_precision)

print("Training recall:", train_recall)
print("Testing recall:", test_recall)

Training accuracy: 0.4867867955286446
Testing accuracy: 0.4890633763320247
Training F1 score: 0.4867867955286446
Testing F1 score: 0.4890633763320247
Training precision: 0.4867867955286446
Testing precision: 0.4890633763320247
Training recall: 0.4867867955286446
Testing recall: 0.4890633763320247


In [9]:
## base model after class weights
class_labels = np.unique(y)
class_weights = compute_class_weight(class_weight='balanced', classes=class_labels, y=y)
class_weight_value = {class_labels[i]: weight for i, weight in enumerate(class_weights)}

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x_pca, y, test_size=0.15, random_state=42)

# Create a logistic regression model
log_reg = LogisticRegression(max_iter=10000,class_weight=class_weight_value,solver='liblinear',C=0.02,penalty='l1')
# Train the model on the training data
log_reg.fit(X_train, y_train)

# Evaluate the performance of the model on the training and testing data
y_train_pred = log_reg.predict(X_train)
y_test_pred = log_reg.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

train_f1 = f1_score(y_train, y_train_pred, average='micro')
test_f1 = f1_score(y_test, y_test_pred, average='micro')

train_precision = precision_score(y_train, y_train_pred, average='micro')
test_precision = precision_score(y_test, y_test_pred, average='micro')

train_recall = recall_score(y_train, y_train_pred, average='micro')
test_recall = recall_score(y_test, y_test_pred, average='micro')

# Print the evaluation metrics
print("Training accuracy:", train_accuracy)
print("Testing accuracy:", test_accuracy)

print("Training F1 score:", train_f1)
print("Testing F1 score:", test_f1)

print("Training precision:", train_precision)
print("Testing precision:", test_precision)

print("Training recall:", train_recall)
print("Testing recall:", test_recall)

Training accuracy: 0.8258256918260042
Testing accuracy: 0.8238442412238189
Training F1 score: 0.8258256918260042
Testing F1 score: 0.8238442412238189
Training precision: 0.8258256918260042
Testing precision: 0.8238442412238189
Training recall: 0.8258256918260042
Testing recall: 0.8238442412238189


In [16]:
## As the results show, the resampling is not a good technique to use in our case. Although the model without any
## data imbalance technique gave the highest scores, class weights will be used as a safe practice (difference is negligible
## and class weights will make model evaluation more fair)

In [10]:
## Train/Test split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=10,stratify=y)## stratify=y used to have the same ration of classes in each split


In [18]:
## Hyperparameter tuning using search grid

In [19]:
## before runing the grid, a smaller grid will be used to determine the range of C that we should be using to 
## reduce the computational time of the main grid 

In [57]:
# create a logistic regression model
log_reg = LogisticRegression(max_iter=10000,class_weight=class_weight_value,solver='liblinear',penalty='l1')

# define the hyperparameters to search over
param_grid = {'C': [0.00001,0.0001,0.001, 0.01, 0.1, 1, 10]}

# create custom scoring functions to calculate accuracy, precision, recall, and F1 score with average='macro'
accuracy_scorer = make_scorer(accuracy_score)
precision_scorer = make_scorer(precision_score, average='micro', zero_division=0)
recall_scorer = make_scorer(recall_score, average='micro')
f1_scorer = make_scorer(f1_score, average='micro')

# perform grid search and obtain performance metrics for each combination of hyperparameters
grid_search = GridSearchCV(log_reg, param_grid=param_grid, cv=StratifiedKFold(n_splits=2, random_state=10, shuffle=True), scoring={'accuracy': accuracy_scorer, 'precision': precision_scorer, 'recall': recall_scorer, 'f1': f1_scorer}, refit='precision', verbose=3)

# fit the grid search to the data
grid_search.fit(X_train, y_train)

# print the best hyperparameters and corresponding performance metrics
print("Best hyperparameters:", grid_search.best_params_)
print("Best accuracy:", grid_search.best_score_)
print("Best precision:", grid_search.cv_results_['mean_test_precision'][grid_search.best_index_])
print("Best recall:", grid_search.cv_results_['mean_test_recall'][grid_search.best_index_])
print("Best F1 score:", grid_search.cv_results_['mean_test_f1'][grid_search.best_index_])

Fitting 2 folds for each of 7 candidates, totalling 14 fits
[CV 1/2] END C=1e-05; accuracy: (test=0.852) f1: (test=0.852) precision: (test=0.852) recall: (test=0.852) total time=   0.8s
[CV 2/2] END C=1e-05; accuracy: (test=0.852) f1: (test=0.852) precision: (test=0.852) recall: (test=0.852) total time=   1.0s
[CV 1/2] END C=0.0001; accuracy: (test=0.849) f1: (test=0.849) precision: (test=0.849) recall: (test=0.849) total time=   1.3s
[CV 2/2] END C=0.0001; accuracy: (test=0.847) f1: (test=0.847) precision: (test=0.847) recall: (test=0.847) total time=   2.4s
[CV 1/2] END C=0.001; accuracy: (test=0.834) f1: (test=0.834) precision: (test=0.834) recall: (test=0.834) total time=  10.7s
[CV 2/2] END C=0.001; accuracy: (test=0.832) f1: (test=0.832) precision: (test=0.832) recall: (test=0.832) total time=   8.6s
[CV 1/2] END C=0.01; accuracy: (test=0.821) f1: (test=0.821) precision: (test=0.821) recall: (test=0.821) total time=  12.3s
[CV 2/2] END C=0.01; accuracy: (test=0.818) f1: (test=0.8

In [56]:
## based on the results, small values of Cs are better so, we will be looping over small values 

In [11]:
# create a logistic regression model
log_reg = LogisticRegression(max_iter=10000,class_weight=class_weight_value,C=0.00000001)

# define the hyperparameters to search over
param_grid = {'penalty': ['l1', 'l2'],
              'solver': ['liblinear', 'saga']}
## liblinear and saga are good for multiclass data and large datasets

# create custom scoring functions to calculate accuracy, precision, recall, and F1 score with average='macro'
accuracy_scorer = make_scorer(accuracy_score)
precision_scorer = make_scorer(precision_score, average='micro', zero_division=0)
recall_scorer = make_scorer(recall_score, average='micro')
f1_scorer = make_scorer(f1_score, average='micro')

# perform grid search and obtain performance metrics for each combination of hyperparameters
grid_search = GridSearchCV(log_reg, param_grid=param_grid, cv=StratifiedKFold(n_splits=2, random_state=10, shuffle=True), scoring={'accuracy': accuracy_scorer, 'precision': precision_scorer, 'recall': recall_scorer, 'f1': f1_scorer}, refit='precision', verbose=3)

# fit the grid search to the data
grid_search.fit(X_train, y_train)

# print the best hyperparameters and corresponding performance metrics
print("Best hyperparameters:", grid_search.best_params_)
print("Best accuracy:", grid_search.best_score_)
print("Best precision:", grid_search.cv_results_['mean_test_precision'][grid_search.best_index_])
print("Best recall:", grid_search.cv_results_['mean_test_recall'][grid_search.best_index_])
print("Best F1 score:", grid_search.cv_results_['mean_test_f1'][grid_search.best_index_])

Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV 1/2] END penalty=l1, solver=liblinear; accuracy: (test=0.852) f1: (test=0.852) precision: (test=0.852) recall: (test=0.852) total time=   0.8s
[CV 2/2] END penalty=l1, solver=liblinear; accuracy: (test=0.852) f1: (test=0.852) precision: (test=0.852) recall: (test=0.852) total time=   0.8s




[CV 1/2] END penalty=l1, solver=saga; accuracy: (test=0.086) f1: (test=0.086) precision: (test=0.086) recall: (test=0.086) total time=19.6min




[CV 2/2] END penalty=l1, solver=saga; accuracy: (test=0.136) f1: (test=0.136) precision: (test=0.136) recall: (test=0.136) total time=19.9min
[CV 1/2] END penalty=l2, solver=liblinear; accuracy: (test=0.852) f1: (test=0.852) precision: (test=0.852) recall: (test=0.852) total time=   3.0s
[CV 2/2] END penalty=l2, solver=liblinear; accuracy: (test=0.852) f1: (test=0.852) precision: (test=0.852) recall: (test=0.852) total time=   3.4s




[CV 1/2] END penalty=l2, solver=saga; accuracy: (test=0.246) f1: (test=0.246) precision: (test=0.246) recall: (test=0.246) total time=17.2min




[CV 2/2] END penalty=l2, solver=saga; accuracy: (test=0.346) f1: (test=0.346) precision: (test=0.346) recall: (test=0.346) total time=17.2min
Best hyperparameters: {'penalty': 'l1', 'solver': 'liblinear'}
Best accuracy: 0.8515874255289281
Best precision: 0.8515874255289281
Best recall: 0.8515874255289281
Best F1 score: 0.851587425528928


In [12]:
## base model after class weights
class_labels = np.unique(y)
class_weights = compute_class_weight(class_weight='balanced', classes=class_labels, y=y)
class_weight_value = {class_labels[i]: weight for i, weight in enumerate(class_weights)}

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x_pca, y, test_size=0.15, random_state=42)

# Create a logistic regression model
log_reg = LogisticRegression(max_iter=10000,class_weight=class_weight_value,solver='liblinear',C=0.00000001,penalty='l1')
# Train the model on the training data
log_reg.fit(X_train, y_train)

# Evaluate the performance of the model on the training and testing data
y_train_pred = log_reg.predict(X_train)
y_test_pred = log_reg.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

train_f1 = f1_score(y_train, y_train_pred, average='micro')
test_f1 = f1_score(y_test, y_test_pred, average='micro')

train_precision = precision_score(y_train, y_train_pred, average='micro')
test_precision = precision_score(y_test, y_test_pred, average='micro')

train_recall = recall_score(y_train, y_train_pred, average='micro')
test_recall = recall_score(y_test, y_test_pred, average='micro')

# Print the evaluation metrics
print("Training accuracy:", train_accuracy)
print("Testing accuracy:", test_accuracy)

print("Training F1 score:", train_f1)
print("Testing F1 score:", test_f1)

print("Training precision:", train_precision)
print("Testing precision:", test_precision)

print("Training recall:", train_recall)
print("Testing recall:", test_recall)

Training accuracy: 0.8519741490224076
Testing accuracy: 0.8493826119937629
Training F1 score: 0.8519741490224076
Testing F1 score: 0.8493826119937629
Training precision: 0.8519741490224076
Testing precision: 0.8493826119937629
Training recall: 0.8519741490224076
Testing recall: 0.8493826119937629


In [None]:
## This model is not suitable for our data or maybe the data needs more study and preproccessing
## Data is: 1-Non-linearly separable data
##          2-High-dimensional data
##          3-Non-Gaussian distribution (LR assumes that data is gaussian)
## all of these reasons may be the reason behind LR's poor performance 