In [14]:
import pickle

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier

def preprocessing(df, encoder_path='label_encoders.pkl'):
    """Turn the raw accidents frame into a feature matrix and a target frame.

    Rows of ``df`` with a missing value in any column are discarded, the
    modelling columns are kept, and each categorical/boolean column is
    integer-encoded with its own ``LabelEncoder``. The fitted encoders are
    pickled to ``encoder_path`` as a ``{column name: encoder}`` dict so the
    same mapping can be re-applied later (``loader`` reads that file).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Must contain ``'Severity'`` and every column named in
        ``numeric_columns`` / ``encoded_columns`` below. Columns that are not
        used by the model are no longer required. ``df`` is not modified.
    encoder_path : str or path-like, default ``'label_encoders.pkl'``
        Where the dict of fitted encoders is written (overwritten if present).

    Returns
    -------
    X : pandas.DataFrame
        Numeric columns followed by the integer-encoded columns.
    y : pandas.DataFrame
        Single-column frame holding ``'Severity'``.

    Raises
    ------
    KeyError
        If a required column is absent from ``df``.
    """
    numeric_columns = [
        'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)',
        'Visibility(mi)', 'Wind_Speed(mph)', 'Precipitation(in)',
    ]
    encoded_columns = [
        'Weather_Condition', 'Amenity', 'Bump', 'Crossing',
        'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station',
        'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
        'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
        'Astronomical_Twilight',
    ]

    # NOTE(review): this drops a row when *any* column of df is missing, even
    # one the model never sees. Kept as-is so the training population does not
    # change; consider dropna(subset=...) on the modelling columns instead.
    complete_rows = df.dropna(axis=0)

    # Select the modelling columns directly (same column order as before) and
    # copy, so the encoding below writes into a frame this function owns.
    input_df = complete_rows[['Severity'] + numeric_columns + encoded_columns].copy()

    encoders = {}
    for column in encoded_columns:
        le = LabelEncoder()
        input_df[column] = le.fit_transform(input_df[column])
        encoders[column] = le

    with open(encoder_path, 'wb') as f:
        pickle.dump(encoders, f)

    # y stays a one-column DataFrame for backward compatibility; flatten it
    # (e.g. np.ravel) before handing it to an estimator's fit().
    y = input_df[["Severity"]]
    X = input_df.drop(['Severity'], axis=1)
    return X, y


In [15]:
def loader(df, encoder_path='label_encoders.pkl'):
    '''
    Apply the saved per-column encoders to a frame of raw feature values.

    The pickle at ``encoder_path`` is expected to hold a dict mapping column
    name -> fitted encoder (this is what ``preprocessing`` writes). Every
    column that has a saved encoder must be present in ``df``; its values are
    replaced by ``encoder.transform(values)``. Columns without an encoder are
    passed through untouched.

    df should have the following features
    -float:
    'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)',
    'Visibility(mi)', 'Wind_Speed(mph)', 'Precipitation(in)'
    -String(total have 140+ choices):
    'Weather_Condition'
    -Boolean(True or False):
    'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
    'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
    'Turning_Loop'
    -String(Day or Night):
    'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight'
    NOTE(review): 'Sunrise_Sunset' was previously listed with the booleans; it
    is presumably Day/Night like the twilight columns -- confirm against the data.

    Parameters:
        df: pandas DataFrame of raw feature values. It is NOT modified.
        encoder_path: path of the pickled encoder dict
            (default 'label_encoders.pkl').

    Returns:
        A new DataFrame with the encoded columns replaced.

    Raises:
        KeyError: if a column with a saved encoder is missing from ``df``.
        Whatever ``encoder.transform`` raises for values it cannot map
        (presumably ValueError for sklearn LabelEncoder -- verify).
    '''
    # SECURITY: pickle.load can execute arbitrary code; only point
    # encoder_path at a file produced by this project.
    with open(encoder_path, 'rb') as f:
        loaded_encoders = pickle.load(f)

    # Work on a copy so the caller's frame keeps its raw values.
    encoded_df = df.copy()
    for column, encoder in loaded_encoders.items():
        encoded_df[column] = encoder.transform(encoded_df[column])
    return encoded_df


In [17]:
# Raw accidents CSV, resolved relative to the notebook's working directory.
data_path = 'US_Accidents_March23.csv'

# Read the whole file into memory; later cells treat `df` as the raw input.
df = pd.read_csv(filepath_or_buffer=data_path)

In [18]:
# Encode the raw frame into the feature matrix X and the Severity target y
# (this also writes the fitted label encoders to disk).
X, y = preprocessing(df)

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)


In [19]:
# Train Random Forest model
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
# The target comes out of preprocessing as a one-column frame; scikit-learn
# expects shape (n_samples,) and warns about a column vector otherwise, so
# flatten it before fitting.
random_forest.fit(X_train, np.ravel(y_train))
y_pred_rf = random_forest.predict(X_test)

# Evaluate Random Forest model
# Macro averages weight every severity class equally. zero_division=0 is set on
# all three scores so a class that is never predicted (or never present)
# contributes 0 without raising a warning.
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf, average='macro', zero_division=0)
recall_rf = recall_score(y_test, y_pred_rf, average='macro', zero_division=0)
f1_rf = f1_score(y_test, y_pred_rf, average='macro', zero_division=0)
print("Predicted Severity for Random Forest:", y_pred_rf)
print(f"Random Forest Results: Accuracy={accuracy_rf:.4f}, Precision={precision_rf:.4f}, Recall={recall_rf:.4f}, F1 Score={f1_rf:.4f}")


  random_forest.fit(X_train, y_train)


Random Forest Results: Accuracy=0.9426, Precision=0.6890, Recall=0.4351, F1 Score=0.5051


In [21]:
# Train AdaBoost model
adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
# Flatten the one-column target frame to shape (n_samples,); passing the
# column vector directly makes scikit-learn emit a conversion warning.
adaboost.fit(X_train, np.ravel(y_train))
y_pred_ab = adaboost.predict(X_test)

# Evaluate AdaBoost model
# zero_division=0 on every macro score: a class with no predictions or no
# true samples counts as 0 instead of triggering a warning.
accuracy_ab = accuracy_score(y_test, y_pred_ab)
precision_ab = precision_score(y_test, y_pred_ab, average='macro', zero_division=0)
recall_ab = recall_score(y_test, y_pred_ab, average='macro', zero_division=0)
f1_ab = f1_score(y_test, y_pred_ab, average='macro', zero_division=0)
print("Predicted Severity for AdaBoost:", y_pred_ab)
print(f"AdaBoost Results: Accuracy={accuracy_ab:.4f}, Precision={precision_ab:.4f}, Recall={recall_ab:.4f}, F1 Score={f1_ab:.4f}")


  y = column_or_1d(y, warn=True)


AdaBoost Results: Accuracy=0.9420, Precision=0.4389, Recall=0.2514, F1 Score=0.2453


In [24]:
# Train XGBoost model
# Map the severity values onto consecutive integers starting at 0 (the label
# layout XGBoost's multi-class objective works with). The target is flattened
# first because LabelEncoder expects a 1-D array.
# NOTE: this rebinds `y` to the encoded 1-D array; the class-weighted cell
# below relies on that. Re-running this cell re-fits `le` on already-encoded
# labels, so run it once per kernel session.
le = LabelEncoder()
y = le.fit_transform(np.ravel(y))

# Split the data (same seed and test size as the earlier split, now with the
# encoded target)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

xgboost = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgboost.fit(X_train, y_train)
y_pred_xgb = xgboost.predict(X_test)

# Evaluate XGBoost model
# Both y_test and y_pred_xgb are in encoded space, so the scores are
# consistent; zero_division=0 keeps all macro scores warning-free.
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb, average='macro', zero_division=0)
recall_xgb = recall_score(y_test, y_pred_xgb, average='macro', zero_division=0)
f1_xgb = f1_score(y_test, y_pred_xgb, average='macro', zero_division=0)
# Decode back to the original severity values before printing: the raw
# predictions are encoder indices, not severities.
print("Predicted Severity for XGBoost:", le.inverse_transform(y_pred_xgb))
print(f"XGBoost Results: Accuracy={accuracy_xgb:.4f}, Precision={precision_xgb:.4f}, Recall={recall_xgb:.4f}, F1 Score={f1_xgb:.4f}")


XGBoost Results: Accuracy=0.9429, Precision=0.7795, Recall=0.2906, F1 Score=0.3119


In [26]:
# Class-weighted XGBoost: give rare severity classes more influence.
# 'balanced' weights are inversely proportional to class frequency in `y`
# (the label-encoded target produced by the XGBoost cell above).
classes = np.unique(y)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y)
class_weights = dict(zip(classes, weights))

# Incorporate weights into DMatrix (used by XGBoost).
# `classes` is sorted, so searchsorted gives each label's position in it; the
# fancy-index then yields the same per-row weight as a dict lookup, without a
# Python-level loop over every training row.
train_labels = np.ravel(y_train)
sample_weights = weights[np.searchsorted(classes, train_labels)]
dtrain = xgb.DMatrix(X_train, label=y_train, weight=sample_weights)

# Setting up parameters for XGBoost
params = {
    'max_depth': 3,
    'eta': 0.1,
    'objective': 'multi:softprob',
    'num_class': len(classes),
    'eval_metric': 'mlogloss'
}

# Training the model
xgb_model = xgb.train(params, dtrain, num_boost_round=100)

# Making predictions (need to convert X_test to DMatrix).
# multi:softprob returns one probability per class per row; argmax picks the
# most likely (encoded) class.
dtest = xgb.DMatrix(X_test)
y_pred = np.argmax(xgb_model.predict(dtest), axis=1)

# Calculate evaluation metrics (zero_division=0 to match the other model cells)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

# Print the metrics
print("Accuracy: {:.2f}".format(accuracy))
print("Precision: {:.2f}".format(precision))
print("Recall: {:.2f}".format(recall))
print("F1 Score: {:.2f}".format(f1))


Accuracy: 0.45
Precision: 0.28
Recall: 0.46
F1 Score: 0.22


In [29]:
# Persist each fitted classifier to its own pickle file, in the same order as
# they were trained.
for filename, fitted_model in (
    ('random_forest_model.pkl', random_forest),
    ('adaboost_model.pkl', adaboost),
    ('xgboost_model.pkl', xgboost),
):
    with open(filename, 'wb') as file:
        pickle.dump(fitted_model, file)