In [None]:
from clean_data import clean_data
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
import xgboost as xgb
import pandas as pd
import numpy as np

In [None]:
# Single seed shared by everything that relies on randomness,
# so that our tests stay consistent and repeatable
RANDOM_STATE = 0

# Label handed to down_sample() when balancing the training set
DOWNSAMPLE_LABEL = 1

# Number of features the feature-importance cells work with
NUM_FEAT_IMPORTANCES = 10

# Scorer names passed to the grid searches
METRICS = [
    "accuracy",
    "f1_macro",
    "f1_micro",
]

### Read in Data and Format it

In [None]:
def read_data_and_format(filepath: str = "./data/utah_county_accidents.csv",
                         test_perc: float = .3,
                         index_column = 'ID',
                         label_column = 'Severity',
                         columns_to_drop: list = ['Unnamed: 0', 'End_Time',
                                                  'County', 'State', 'City',
                                                  'Country', 'Timezone',
                                                  'Airport_Code', 'Street',
                                                  'Zipcode', 'Source',
                                                  'Description', 'Weather_Timestamp',
                                                  'Wind_Direction', 'Nautical_Twilight',
                                                  'Astronomical_Twilight'],
                         dummy_columns: list = ['Month', 'Day', 'Civil_Twilight', 'Sunrise_Sunset'],
                         rand_state=RANDOM_STATE,
                         stratify=True,
                         ):
    """
    Read the csv, clean it, one-hot encode the categorical columns and split
    it into train and test sets.

    Params
    -------
    filepath (str): the path to the data
    test_perc (float): the fraction of the rows put in the test set
    index_column (str): csv column used as the dataframe index
    label_column (str): column holding the labels
    columns_to_drop (list): columns handed to clean_data as `to_drop`
    dummy_columns (list): columns to one-hot encode; any that also appear in
                          columns_to_drop are skipped
    rand_state (int): seed forwarded to train_test_split
                      (Don't change unless the group all agrees)
    stratify (bool): Should be True (that way each class keeps the same
                     proportion in the test and train set)

    Returns
    -------
    X_train (pd.DataFrame): the training data
    y_train (pd.Series): the training labels
    X_test (pd.DataFrame): the testing data
    y_test (pd.Series): the testing labels
    """
    df = pd.read_csv(filepath, index_col=index_column)

    # NOTE(review): clean_data lives in a separate module; presumably it drops
    # `to_drop` and tidies the remaining columns -- confirm there.
    df = clean_data(df, to_drop=columns_to_drop)

    # Keep the caller's column order (and drop duplicates) instead of going
    # through a set difference: iterating a set of strings can give a different
    # order in every Python process, which would reorder the dummy columns and
    # make runs differ even with a fixed seed.
    dropped = set(columns_to_drop)
    to_encode = list(dict.fromkeys(col for col in dummy_columns if col not in dropped))
    df = pd.get_dummies(df, columns=to_encode)

    df_y = df[label_column].copy()
    df_X = df.drop(columns=[label_column])

    # stratify makes sure each class is equally represented percentage wise in the split;
    # passing None is the same as not stratifying
    df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(
        df_X,
        df_y,
        test_size=test_perc,
        random_state=rand_state,
        stratify=df_y if stratify else None,
    )

    # Shift all the labels down by one when no training label is 0, so the
    # labels work with all the models (this assumes the labels then start at 1)
    if not (0 in list(df_y_train.unique())):
        df_y_train = df_y_train - 1
        df_y_test = df_y_test - 1

    return df_X_train, df_y_train, df_X_test, df_y_test

In [None]:
def down_sample(X: pd.DataFrame, y: pd.Series, label: int, rand_state=None) -> tuple:
    """
    This function downsamples all labels to the specified label

    Every class that has more rows than `label` has randomly chosen rows
    removed until it has exactly as many rows as `label`. Classes with the
    same number of rows or fewer are left untouched. The inputs are not
    modified.

    Params
    -------
    X (pd.DataFrame): dataframe of training data
    y (pd.Series): training labels (rows are removed by index label, so X and y
                   must share the same index)
    label (int): label to downsample to
    rand_state (None, int or np.random.RandomState): source of randomness.
                   None (the default) draws from NumPy's global random state,
                   exactly as before; pass an int seed or a RandomState to get
                   the same subset on every call.

    Returns
    -------
    X_train (pd.DataFrame): down sampled training data
    y_train (pd.Series): down sampled training labels
    """
    if rand_state is None:
        # module-level functions -> NumPy's global random state
        rng = np.random
    elif isinstance(rand_state, np.random.RandomState):
        rng = rand_state
    else:
        rng = np.random.RandomState(rand_state)

    num_keep = int((y == label).sum())

    # Collect the index labels to remove for every over-represented class,
    # then drop them all at once from copies of the inputs
    drop_indices = []
    for class_type, count in y.value_counts().items():
        num_drop = count - num_keep
        if num_drop > 0:
            class_index = y[y == class_type].index
            drop_indices.extend(rng.choice(class_index, num_drop, replace=False))

    return X.drop(index=drop_indices), y.drop(index=drop_indices)

In [None]:
# CHANGE THE PATH FOR YOUR DATASET
file = 'utah_county_accidents.csv'
filepath = f'./data/{file}'

# Load, clean and split the data in one go
(X_train,
 y_train,
 X_test,
 y_test) = read_data_and_format(filepath)

In [None]:
# perform the downsample on the training data
# down_sample draws from NumPy's global random state, so seed it first;
# otherwise a different subset of rows is kept on every run
np.random.seed(RANDOM_STATE)
X_train, y_train = down_sample(X_train, y_train, DOWNSAMPLE_LABEL)
y_train.value_counts()

##### Quick Note About Our Grid Search:
You may notice that we used a different search size for each model. We originally made this decision because of time constraints and because of which parameters we felt were worth exploring. We recognize that this most likely biased our results. In future experiments, we would create more uniform grid searches so as not to bias our results.

### Train Plain Decision Tree Classifier

In [None]:
# Hyper-parameter grid explored for the plain decision tree
param_grid = dict(
    criterion=['gini', 'entropy', 'log_loss'],
    splitter=['best', 'random'],
    min_samples_leaf=[3, 6, 9, 12, 17, 20, 25, 28, 33],
    max_features=['sqrt', 'log2'],
)

# Build the tree and wrap it in an exhaustive search that scores every
# metric in METRICS and refits the winner on macro F1
model = DecisionTreeClassifier(random_state=RANDOM_STATE)
model_gs = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=METRICS,
    refit='f1_macro',
    n_jobs=-1,
)

# Run the search on the training data
model_gs.fit(X_train, y_train)

# Best settings, their cross-validated score, then the test-set report
print(model_gs.best_params_)
print(model_gs.best_score_)
y_pred = model_gs.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=1))

In [None]:
# Same report on the training data, for comparison with the test scores
y_pred = model_gs.predict(X_train)
train_report = classification_report(y_train, y_pred, zero_division=1)
print(train_report)

In [None]:
# Which classes got predicted and how frequently.
# y_pred is whatever the most recently run prediction cell left behind.
pred_summary = np.unique(y_pred, return_counts=True)
print(pred_summary)

In [None]:
# Rank the features of the best estimator by importance.
# argsort() returns feature positions ordered from least to most important,
# so slicing that ordering gives the bottom / top features directly.
# (Comparing the argsort output against a threshold compares feature
# positions, not ranks, and picks the wrong features.)
feat_importances = model_gs.best_estimator_.feature_importances_
importance_order = feat_importances.argsort()

# Variable names kept from the original cell; each holds
# NUM_FEAT_IMPORTANCES feature positions (best ones listed best-first)
three_worst_feats = importance_order[:NUM_FEAT_IMPORTANCES]
three_best_feats = importance_order[::-1][:NUM_FEAT_IMPORTANCES]

feature_names = np.array(X_train.columns)
print(f"The {NUM_FEAT_IMPORTANCES} best features are: ", feature_names[three_best_feats])
print(f"The {NUM_FEAT_IMPORTANCES} worst features are: ", feature_names[three_worst_feats])

In [None]:
# plot a confusion matrix for visual analysis
y_pred = model_gs.predict(X_test)
# from_predictions expects (y_true, y_pred); with normalize='true' every
# true-class row is normalized to sum to 1
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, normalize='true')

### Train Random Forest Classifier

In [None]:
# Hyper-parameter grid explored for the random forest.
# max_features is tried at 10%, 30%, 50%, 70% and 90% of the feature count.
n_features = len(X_train.columns)
param_grid = dict(
    n_estimators=[50, 100, 150, 200, 250, 300],
    criterion=['gini', 'entropy', 'log_loss'],
    class_weight=['balanced', 'balanced_subsample'],
    min_samples_leaf=[3, 6, 9, 12, 17, 20, 25, 28, 33],
    max_features=[round(n_features*perc) for perc in np.arange(.1, 1, .2)],
)

# Build the forest and wrap it in an exhaustive search that scores every
# metric in METRICS and refits the winner on macro F1
model = RandomForestClassifier(random_state=RANDOM_STATE, warm_start=False)
model_gs = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=METRICS,
    refit='f1_macro',
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training data
model_gs.fit(X_train, y_train)

# Best settings, their cross-validated score, then the test-set report
print(model_gs.best_params_)
print(model_gs.best_score_)
y_pred = model_gs.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=1))

In [None]:
# Same report on the training data, for comparison with the test scores
y_pred = model_gs.predict(X_train)
train_report = classification_report(y_train, y_pred, zero_division=1)
print(train_report)

In [None]:
# Which classes got predicted and how frequently.
# y_pred is whatever the most recently run prediction cell left behind.
pred_summary = np.unique(y_pred, return_counts=True)
print(pred_summary)

In [None]:
# Rank the features of the best estimator by importance.
# argsort() returns feature positions ordered from least to most important,
# so slicing that ordering gives the bottom / top features directly.
# (Comparing the argsort output against a threshold compares feature
# positions, not ranks, and picks the wrong features.)
feat_importances = model_gs.best_estimator_.feature_importances_
importance_order = feat_importances.argsort()

# Variable names kept from the original cell; each holds
# NUM_FEAT_IMPORTANCES feature positions (best ones listed best-first)
three_worst_feats = importance_order[:NUM_FEAT_IMPORTANCES]
three_best_feats = importance_order[::-1][:NUM_FEAT_IMPORTANCES]

feature_names = np.array(X_train.columns)
print(f"The {NUM_FEAT_IMPORTANCES} best features are: ", feature_names[three_best_feats])
print(f"The {NUM_FEAT_IMPORTANCES} worst features are: ", feature_names[three_worst_feats])

In [None]:
# plot a confusion matrix for visual analysis
y_pred = model_gs.predict(X_test)
# from_predictions expects (y_true, y_pred); with normalize='true' every
# true-class row is normalized to sum to 1
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, normalize='true')

### Train Gradient Boosted Classifier

In [None]:
# The parameters search
# Actual Params
# loss='exponential' is only supported for binary targets: with more than two
# classes every fit that uses it fails, so half of the grid would be spent on
# candidates that can never be selected. Only offer it when it can work.
candidate_losses = ['log_loss', 'exponential'] if y_train.nunique() <= 2 else ['log_loss']
param_grid = {"loss": candidate_losses,
              "learning_rate": [0.1, .2, .3, .4],
              "n_estimators": [100, 200, 300],
              "criterion": ['friedman_mse', 'squared_error'],
              "min_samples_leaf":  [3, 6, 9, 12, 17, 20, 25, 28, 33],
              "max_features": ['sqrt', 'log2'],
              # NOTE(review): per the scikit-learn docs this is only used when
              # n_iter_no_change is set, which it is not here
              "validation_fraction": [0.1],
             }

# Make the trees
model = GradientBoostingClassifier(random_state=RANDOM_STATE, warm_start=False)
model_gs = GridSearchCV(model, param_grid, refit='f1_macro', verbose=2, scoring=METRICS, n_jobs=-1)

# Fit the trees
model_gs.fit(X_train, y_train)

# Print the best params and the report
print(model_gs.best_params_, model_gs.best_score_, sep='\n')
y_pred = model_gs.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=1))

In [None]:
# Same report on the training data, for comparison with the test scores
y_pred = model_gs.predict(X_train)
train_report = classification_report(y_train, y_pred, zero_division=1)
print(train_report)

In [None]:
# Which classes got predicted and how frequently.
# y_pred is whatever the most recently run prediction cell left behind.
pred_summary = np.unique(y_pred, return_counts=True)
print(pred_summary)

In [None]:
# Rank the features of the best estimator by importance.
# argsort() returns feature positions ordered from least to most important,
# so slicing that ordering gives the bottom / top features directly.
# (Comparing the argsort output against a threshold compares feature
# positions, not ranks, and picks the wrong features.)
feat_importances = model_gs.best_estimator_.feature_importances_
importance_order = feat_importances.argsort()

# Variable names kept from the original cell; each holds
# NUM_FEAT_IMPORTANCES feature positions (best ones listed best-first)
three_worst_feats = importance_order[:NUM_FEAT_IMPORTANCES]
three_best_feats = importance_order[::-1][:NUM_FEAT_IMPORTANCES]

feature_names = np.array(X_train.columns)
print(f"The {NUM_FEAT_IMPORTANCES} best features are: ", feature_names[three_best_feats])
print(f"The {NUM_FEAT_IMPORTANCES} worst features are: ", feature_names[three_worst_feats])

### Train XGBoosted Classifier

In [None]:
# The parameters search
# Actual Params
param_grid = {"gamma":[0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1],
              "alpha": [0.5, 3, 5, 10, 30, 50],
              "lambda": [0, 5, 10, 25, 50, 100, 250, 500, 1000],
              "eta": [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1],
              "tree_method": ['exact', 'approx', 'hist'],
              }

# Make the trees
model = xgb.XGBClassifier(objective="multi:softmax", random_state=RANDOM_STATE)
# random_state pins which candidate settings get sampled, so the search is
# repeatable. Scoring and refit match the grid searches used for the other
# models, so best_score_ is the macro F1 here as well and can be compared.
model_gs = RandomizedSearchCV(model, param_grid, n_iter=2500, scoring=METRICS, refit='f1_macro',
                              random_state=RANDOM_STATE, verbose=2, n_jobs=-1)

# Fit the trees
model_gs.fit(X_train, y_train)

# Print the best params and the report
print(model_gs.best_params_, model_gs.best_score_, sep='\n')
y_pred = model_gs.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=1))

In [None]:
# Same report on the training data, for comparison with the test scores
y_pred = model_gs.predict(X_train)
train_report = classification_report(y_train, y_pred, zero_division=1)
print(train_report)

In [None]:
# Which classes got predicted and how frequently.
# y_pred is whatever the most recently run prediction cell left behind.
pred_summary = np.unique(y_pred, return_counts=True)
print(pred_summary)

In [None]:
# Rank the features of the best estimator by importance.
# argsort() returns feature positions ordered from least to most important,
# so slicing that ordering gives the bottom / top features directly.
# (Comparing the argsort output against a threshold compares feature
# positions, not ranks, and picks the wrong features.)
feat_importances = model_gs.best_estimator_.feature_importances_
importance_order = feat_importances.argsort()

# Variable names kept from the original cell; each holds
# NUM_FEAT_IMPORTANCES feature positions (best ones listed best-first)
three_worst_feats = importance_order[:NUM_FEAT_IMPORTANCES]
three_best_feats = importance_order[::-1][:NUM_FEAT_IMPORTANCES]

feature_names = np.array(X_train.columns)
print(f"The {NUM_FEAT_IMPORTANCES} best features are: ", feature_names[three_best_feats])
print(f"The {NUM_FEAT_IMPORTANCES} worst features are: ", feature_names[three_worst_feats])