In [1]:
# Import our dependencies
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Read the file for the model.
# The CSV location can be overridden with the ACCIDENTS_MODEL_CSV environment
# variable so the notebook is not tied to a single machine. When the variable is
# unset (or empty) the original local path below is used unchanged.
DEFAULT_MODEL_CSV = r'C:\Users\pola_\OneDrive\Documents\Cesar\Data-bootcamp-UoT\Lessons\week 23\final project\Final_Project_Machine_Learning\Resources\clean_sampled_US_Accidents_for_model.csv'
model_csv_path = os.environ.get('ACCIDENTS_MODEL_CSV') or DEFAULT_MODEL_CSV
model_df = pd.read_csv(model_csv_path)


In [2]:
# Keep a random 10% of the rows, drawn with a fixed seed.
# This rebinds model_df, so running the cell again samples the already-sampled frame.
SAMPLE_FRACTION = 0.1
SAMPLE_SEED = 78
model_df = model_df.sample(random_state=SAMPLE_SEED, frac=SAMPLE_FRACTION)

In [3]:
# Remove the 'ID' and 'County' columns from the frame
columns_to_drop = ['ID', 'County']
model_df = model_df.drop(columns=columns_to_drop)


In [4]:
# Replace each categorical variable with one indicator column per category
categorical_columns = [
    'State',
    'Weather_Condition',
    'Astronomical_Twilight',
]
model_df = pd.get_dummies(data=model_df, columns=categorical_columns)

In [5]:
# Remove the 'Astronomical_Twilight_Day' indicator, judged redundant in the correlation analysis
model_df = model_df.drop(columns='Astronomical_Twilight_Day')

###  I tried eliminating the states to see if the models fit better, but that was not the case.  Leaving the code for reference. 

#### List all columns that start with 'State_'
#### state_columns = [col for col in model_df.columns if col.startswith('State_')]
#### model_df = model_df.drop(state_columns, axis=1)

###  I tried eliminating columns that might not have an impact on accidents.  Leaving the code for reference.

#### model_df = model_df.drop(['Amenity', 'No_Exit', 'Railway', 'Station'], axis=1)

In [6]:
# Parse 'Start_Time' once, store the parsed values back, and derive the hour of day as 'Hour'
start_times = pd.to_datetime(model_df['Start_Time'])
model_df['Start_Time'] = start_times
model_df['Hour'] = start_times.dt.hour

In [7]:
# 'Time' and 'Start_Time' are no longer needed once the hour has been extracted;
# remove them and print the frame summary.
time_columns = ['Time', 'Start_Time']
model_df = model_df.drop(columns=time_columns)
model_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 353510 entries, 3030768 to 30760
Data columns (total 90 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Severity                           353510 non-null  int64  
 1   Start_Lat                          353510 non-null  float64
 2   Start_Lng                          353510 non-null  float64
 3   Distance(mi)                       353510 non-null  float64
 4   Temperature(F)                     353510 non-null  float64
 5   Humidity(%)                        353510 non-null  float64
 6   Pressure(in)                       353510 non-null  float64
 7   Visibility(mi)                     353510 non-null  float64
 8   Wind_Speed(mph)                    353510 non-null  float64
 9   Precipitation(in)                  353510 non-null  float64
 10  Amenity                            353510 non-null  bool   
 11  Bump                              

In [8]:
# Separate the target column from the feature columns
target_column = 'Severity'
y = model_df[target_column]  # Target variable
X = model_df.drop(columns=target_column)  # Features

In [9]:
# Hold out 25% of the rows as the test set, using a fixed seed for the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
    test_size=0.25,
)


### I tried applying the SMOTE method to deal with the imbalance of the sample (on the training data only), but it didn't work.  Leaving code for reference.
#### smote = SMOTE()
#### X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [10]:
# Reference only: scaling of the SMOTE-resampled training data and the testing data.
#scaler = StandardScaler()
#X_train_resampled_scaled = scaler.fit_transform(X_train_resampled)
#X_test_scaled = scaler.transform(X_test)

# Fit the scaler on the training features only, then apply that same
# transform to both the training and the testing features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Try 3 different models from simpler to more complex.  Will start with a Logistic Regression, then a Decision Tree, and finally a Deep Neural Net.

In [11]:
# Logistic Regression analysis

# Reference only: the same model fitted on the SMOTE-resampled training set.
#lr_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
#lr_model.fit(X_train_resampled_scaled, y_train_resampled)

# `multi_class` is deprecated in recent scikit-learn releases. With the 'lbfgs'
# solver a multinomial model is already what is fitted for a multi-class
# target, so the argument is omitted to stay compatible with newer versions.
lr_model = LogisticRegression(solver='lbfgs', max_iter=1000)
lr_model.fit(X_train_scaled, y_train)  # Use the original training data
y_pred = lr_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.20      0.00      0.00       856
           2       0.82      0.97      0.89     70658
           3       0.48      0.17      0.25     14592
           4       0.23      0.00      0.01      2272

    accuracy                           0.80     88378
   macro avg       0.43      0.28      0.29     88378
weighted avg       0.74      0.80      0.75     88378

[[    1   854     1     0]
 [    4 68308  2332    14]
 [    0 12128  2447    17]
 [    0  1983   280     9]]


## add analysis for the Logistic Regression model

In [12]:
# Decision Tree model

# Reference only: the variant trained on the resampled set.
#dt_model = DecisionTreeClassifier(random_state=78)
#dt_model.fit(X_train_resampled_scaled, y_train_resampled)

# Fit on the original (non-resampled) scaled training data, then score the test set
dt_model = DecisionTreeClassifier(random_state=78).fit(X_train_scaled, y_train)
y_pred = dt_model.predict(X_test_scaled)

dt_report = classification_report(y_test, y_pred)
dt_confusion = confusion_matrix(y_test, y_pred)
print(dt_report)
print(dt_confusion)

              precision    recall  f1-score   support

           1       0.43      0.41      0.42       856
           2       0.89      0.88      0.89     70658
           3       0.54      0.56      0.55     14592
           4       0.19      0.21      0.20      2272

    accuracy                           0.81     88378
   macro avg       0.51      0.52      0.52     88378
weighted avg       0.81      0.81      0.81     88378

[[  355   378   120     3]
 [  365 62386  6333  1574]
 [   98  5881  8207   406]
 [   10  1361   426   475]]


## add analysis for the decision tree model

In [13]:
# Deep Neural Model

# Seed Python, NumPy and TensorFlow so the network's random operations are
# repeatable, in line with the fixed random_state used by the other models.
tf.keras.utils.set_random_seed(78)

# Convert labels to categorical one-hot encoding
y_one_hot = tf.keras.utils.to_categorical(y - 1)  # -1 because to_categorical assumes classes start at 0

# Pick the one-hot rows that belong to the existing train/test split by row
# label instead of running a second train_test_split. This guarantees that
# y_train / y_test line up with X_train_scaled / X_test_scaled, which were built
# from X_train / X_test. get_indexer raises if y's index is not unique, and
# returns -1 for labels it cannot find, which is rejected explicitly below.
train_rows = y.index.get_indexer(X_train.index)
test_rows = y.index.get_indexer(X_test.index)
if (train_rows < 0).any() or (test_rows < 0).any():
    raise ValueError("X_train/X_test contain rows that are not present in y")
y_train = y_one_hot[train_rows]
y_test = y_one_hot[test_rows]

# Define the model
number_input_features = X_train_scaled.shape[1]
hidden_nodes_layer1 = 100
hidden_nodes_layer2 = 100

model = tf.keras.models.Sequential([
    # Explicit input specification (replaces `input_dim` on the first Dense layer)
    tf.keras.Input(shape=(number_input_features,)),
    # First hidden layer
    tf.keras.layers.Dense(units=hidden_nodes_layer1, activation='relu'),
    # Second hidden layer
    tf.keras.layers.Dense(units=hidden_nodes_layer2, activation='relu'),
    # Output layer: one softmax unit per one-hot class column
    tf.keras.layers.Dense(units=y_train.shape[1], activation='softmax'),
])

# Check the structure of the model
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 100)               9000      
                                                                 
 dense_1 (Dense)             (None, 100)               10100     
                                                                 
 dense_2 (Dense)             (None, 4)                 404       
                                                                 
Total params: 19504 (76.19 KB)
Trainable params: 19504 (76.19 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [14]:
# Configure training: optimizer, loss function and the metric reported during fit/evaluate
compile_settings = {
    'optimizer': 'adam',
    'loss': 'categorical_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_settings)


In [18]:
# Fit the network on the scaled training features and keep what fit() returns
fit_model = model.fit(
    x=X_train_scaled,
    y=y_train,
    batch_size=50,
    epochs=20,
)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [20]:
# Score the trained network on the held-out data; the result unpacks into
# the loss followed by the accuracy metric.
evaluation = model.evaluate(x=X_test_scaled, y=y_test, verbose=2)
test_loss, test_accuracy = evaluation

print(f"Test accuracy: {test_accuracy}")

2762/2762 - 7s - loss: 0.4906 - accuracy: 0.8279 - 7s/epoch - 3ms/step
Test accuracy: 0.8279436230659485


In [21]:

# Predict the class probabilities for the test set
y_pred = model.predict(X_test_scaled)

# Convert the predictions from per-class probabilities to 0-based class indices
y_pred_classes = np.argmax(y_pred, axis=1)

# Convert the true test set labels from one-hot encoded vectors to 0-based class indices
y_true_classes = np.argmax(y_test, axis=1)

# The labels were shifted by -1 before one-hot encoding, so class index i stands
# for Severity i + 1. Name the report rows by Severity so this report reads the
# same way as the Logistic Regression and Decision Tree reports.
class_indices = list(range(y_test.shape[1]))
severity_names = [str(index + 1) for index in class_indices]

# Generate the classification report
clf_report = classification_report(
    y_true_classes,
    y_pred_classes,
    labels=class_indices,
    target_names=severity_names,
)

# Generate the confusion matrix (rows/columns ordered by Severity 1..N)
conf_matrix = confusion_matrix(y_true_classes, y_pred_classes, labels=class_indices)

print(clf_report)
print(conf_matrix)

              precision    recall  f1-score   support

           0       0.51      0.35      0.42       856
           1       0.87      0.94      0.90     70658
           2       0.59      0.44      0.50     14592
           3       0.37      0.09      0.15      2272

    accuracy                           0.83     88378
   macro avg       0.58      0.46      0.49     88378
weighted avg       0.81      0.83      0.81     88378

[[  303   493    57     3]
 [  224 66275  3963   196]
 [   64  7987  6388   153]
 [    8  1715   343   206]]
