In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/us-accidents/US_Accidents_March23.csv


In [6]:
##### Import necessary libraries
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

# Step 1: Load the dataset
df = pd.read_csv('/kaggle/input/us-accidents/US_Accidents_March23.csv')  # Update with your file path
print("Dataset loaded successfully. Columns are:\n", df.columns)

# Step 2: Convert `Severity` to binary classification
# Convert Severity to binary classification: 1 for Severe (3 and 4), 0 for Non-Severe (1 and 2)
df['Severity'] = df['Severity'].apply(lambda x: 1 if x >= 3 else 0)
print("Unique values in Severity column after conversion:", df['Severity'].unique())

# Step 3: Select relevant columns for the prediction task
df = df[['Severity', 'Weather_Condition', 'Sunrise_Sunset', 'Civil_Twilight', 'Traffic_Signal']]
print("Selected columns:\n", df.head())

# Step 4: Drop rows with missing values
df = df.dropna()
print("Data after dropping missing values. Shape:", df.shape)

# Step 5: Check initial class distribution to understand imbalance
print("Initial class distribution:\n", df['Severity'].value_counts())

# Step 6: Undersample the majority class to 70% of its original size
# Separate majority and minority classes
df_majority = df[df['Severity'] == 0]  # Non-Severe
df_minority = df[df['Severity'] == 1]  # Severe

# Set the target majority size to 70% of its original size
target_majority_size = int(0.85 * len(df_majority))

# Perform undersampling on the majority class
df_majority_undersampled = resample(df_majority, 
                                    replace=False,                # Sample without replacement
                                    n_samples=target_majority_size, # Reduce to 70% of majority class size
                                    random_state=42)               # For reproducibility

# Combine undersampled majority class with the minority class
df_balanced = pd.concat([df_majority_undersampled, df_minority])

# Shuffle the balanced dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Display new class distribution
print("Balanced class distribution:\n", df_balanced['Severity'].value_counts())

# Step 7: Encode categorical columns to numeric values
# Define features and target from the balanced dataset
X = df_balanced.drop(columns=['Severity'])
y = df_balanced['Severity']

# Encode categorical columns
label_encoder = LabelEncoder()
for column in X.columns:
    if X[column].dtype == 'object':
        X[column] = label_encoder.fit_transform(X[column])

print("Encoding completed. Sample data:\n", X.head())

# Step 8: Split the data into training and testing sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Display the class distribution in the training and testing sets
print("Training set class distribution:\n", y_train.value_counts())
print("Testing set class distribution:\n", y_test.value_counts())

# Step 9: Train Multinomial, Bernoulli, and Gaussian Naive Bayes models

# Multinomial Naive Bayes
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
y_pred_mnb = mnb.predict(X_test)
print("\nMultinomial Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_mnb))
print("Multinomial Naive Bayes Confusion Matrix:\n", confusion_matrix(y_test, y_pred_mnb))
print("Multinomial Naive Bayes Classification Report:\n", classification_report(y_test, y_pred_mnb))

# Bernoulli Naive Bayes
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
y_pred_bnb = bnb.predict(X_test)
print("\nBernoulli Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_bnb))
print("Bernoulli Naive Bayes Confusion Matrix:\n", confusion_matrix(y_test, y_pred_bnb))
print("Bernoulli Naive Bayes Classification Report:\n", classification_report(y_test, y_pred_bnb))

# Gaussian Naive Bayes
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)
print("\nGaussian Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_gnb))
print("Gaussian Naive Bayes Confusion Matrix:\n", confusion_matrix(y_test, y_pred_gnb))
print("Gaussian Naive Bayes Classification Report:\n", classification_report(y_test, y_pred_gnb))


Dataset loaded successfully. Columns are:
 Index(['ID', 'Source', 'Severity', 'Start_Time', 'End_Time', 'Start_Lat',
       'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)', 'Description',
       'Street', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone',
       'Airport_Code', 'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)',
       'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction',
       'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity',
       'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
       'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
       'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight'],
      dtype='object')
Unique values in Severity column after conversion: [1 0]
Selected columns:
    Severity Weather_Condition Sunrise_Sunset Civil_Twilight  Traffic_Signal
0         1        Light Rain          Night          Night    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Multinomial Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.78      1.00      0.88   1548100
           1       0.00      0.00      0.00    439090

    accuracy                           0.78   1987190
   macro avg       0.39      0.50      0.44   1987190
weighted avg       0.61      0.78      0.68   1987190


Bernoulli Naive Bayes Accuracy: 0.7790397495961634
Bernoulli Naive Bayes Confusion Matrix:
 [[1548100       0]
 [ 439090       0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Bernoulli Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.78      1.00      0.88   1548100
           1       0.00      0.00      0.00    439090

    accuracy                           0.78   1987190
   macro avg       0.39      0.50      0.44   1987190
weighted avg       0.61      0.78      0.68   1987190


Gaussian Naive Bayes Accuracy: 0.7784786557903371
Gaussian Naive Bayes Confusion Matrix:
 [[1546700    1400]
 [ 438805     285]]
Gaussian Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.78      1.00      0.88   1548100
           1       0.17      0.00      0.00    439090

    accuracy                           0.78   1987190
   macro avg       0.47      0.50      0.44   1987190
weighted avg       0.64      0.78      0.68   1987190

