In [76]:
import pandas as pd
import numpy as np
import sklearn
from IPython.display import display
from datetime import datetime

# Data Cleaning

Create variables from the available data such as:
    - AM/PM
    - Season
    - Dark/Bright

In [None]:
df = pd.read_csv('US_Accidents_Dec20_updated.csv') #csv file with Tempearture and Visibility values filled
df.head()

In [78]:
df = df.drop(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1'], axis =1)

In [None]:
pd.options.display.max_columns = None
df.head()

In [80]:
# create a night binary variable
df['night'] = np.where(df['Sunrise_Sunset'] == 'Day', 0, 1)
df['Precipitation(in)'] = df['Precipitation(in)'].fillna(0)
df['Wind_Speed(mph)'] = df['Wind_Speed(mph)'].fillna(0)


In [89]:
df['night'] = df['night'].dropna()
df['Visibility(mi)'] = df['Visibility(mi)'].dropna()
df['Temperature(F)'] = df['Temperature(F)'].dropna()

In [None]:
df.isna().sum()

In [83]:
# remove tailing decimal from timestamp
def clean_timestamp(timestamp):
    if '.' in timestamp:
        return timestamp.split('.')[0]
    else:
        return timestamp
    
def get_season(timestamp_str):
    timestamp = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S")
    # get the current day of the year
    doy = timestamp.timetuple().tm_yday

    # "day of year" ranges for the northern hemisphere
    spring = range(80, 172)
    summer = range(172, 264)
    fall = range(264, 355)
    # winter = everything else

    if doy in spring:
        season = 'spring'
    elif doy in summer:
        season = 'summer'
    elif doy in fall:
        season = 'fall'
    else:
        season = 'winter'
    return season

In [84]:
df['Start_Time'] = df['Start_Time'].apply(clean_timestamp)
df['season'] = df.apply(lambda row: get_season(row['Start_Time']), axis =1)

In [None]:
df.info()

In [None]:
df = pd.get_dummies(df, columns = ['season'])
df.head()

In [94]:
final_col_list = ['season_fall', 'season_spring', 'season_summer', 'season_winter', 'night', 'Wind_Speed(mph)', 'Temperature(F)', 'Visibility(mi)', 'Precipitation(in)', 'Severity']
model_df = df[final_col_list].dropna()
X = model_df.drop('Severity', axis = 1)
y = model_df['Severity']

In [None]:
X.isna().sum()

# Lasso Regression

Fit a Lasso Regression model to predict the accident severity score. This model will perform variable selection on the available variables so we can see which are the most important for predicting accident severity and weight these more heavily.

In [121]:
from sklearn.linear_model import Lasso

reg = Lasso(normalize = True, alpha = 0.5, random_state = 42)
reg.fit(X, y)

Lasso(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=1000, normalize=True,
      positive=False, precompute=False, random_state=42, selection='cyclic',
      tol=0.0001, warm_start=False)

In [122]:
zip_coef_lasso = zip(X.columns, reg.coef_)
for col, coef in zip_coef_lasso:
    print('{}: {}'.format(col, round(coef,3)))

season_fall: -0.0
season_spring: 0.0
season_summer: 0.0
season_winter: 0.0
night: -0.0
Wind_Speed(mph): 0.0
Temperature(F): -0.0
Visibility(mi): 0.0
Precipitation(in): 0.0


# Linear Regression

In [111]:
from sklearn.linear_model import LinearRegression

reg2 = LinearRegression(normalize = True)
reg2.fit(X, y)

print(X.columns)
print(reg2.coef_)

Index(['season_fall', 'season_spring', 'season_summer', 'season_winter',
       'night', 'Wind_Speed(mph)', 'Temperature(F)', 'Visibility(mi)',
       'Precipitation(in)'],
      dtype='object')
[-1.52413881e+09 -1.52413881e+09 -1.52413881e+09 -1.52413881e+09
  2.33602829e-02  2.23215469e-03 -2.47993557e-03  2.20300558e-03
  1.29316884e-02]


In [112]:
zip_coef_lin_reg = zip(X.columns, reg2.coef_)
for col, coef in zip_coef_lin_reg:
    print('{}: {}'.format(col, round(coef,3)))

season_fall: -1524138810.665
season_spring: -1524138810.485
season_summer: -1524138810.399
season_winter: -1524138810.592
night: 0.023
Wind_Speed(mph): 0.002
Temperature(F): -0.002
Visibility(mi): 0.002
Precipitation(in): 0.013


# Random Forest

In [104]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state = 42)
rf.fit(X, y)

print(X.columns)
print(rf.feature_importances_)

Index(['season_fall', 'season_spring', 'season_summer', 'season_winter',
       'night', 'Wind_Speed(mph)', 'Temperature(F)', 'Visibility(mi)',
       'Precipitation(in)'],
      dtype='object')
[0.04584467 0.00615073 0.00642469 0.00727482 0.04055295 0.21360162
 0.39205639 0.20610566 0.08198846]


In [107]:
zipped_feature_importance = zip(X.columns, rf.feature_importances_)
for col, importance in zipped_feature_importance:
    print('{}: {}'.format(col, round(importance,3)))

season_fall: 0.046
season_spring: 0.006
season_summer: 0.006
season_winter: 0.007
night: 0.041
Wind_Speed(mph): 0.214
Temperature(F): 0.392
Visibility(mi): 0.206
Precipitation(in): 0.082
