# <center> Predicting severity of car accidents in Seattle </center>

# Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Load data set

In [314]:
df = pd.read_csv('https://s3.us.cloud-object-storage.appdomain.cloud/cf-courses-data/CognitiveClass/DP0701EN/version-2/Data-Collisions.csv')

# used REPORTNO to filter duplicate samples
df = df.sort_values('REPORTNO').drop_duplicates(['REPORTNO'])

In [315]:
# check null values as a ratio of the number of observations
(df.isnull().sum() / df.shape[0])[df.isnull().sum() / df.shape[0]>0.05]

INTKEY            0.665742
EXCEPTRSNCODE     0.564335
EXCEPTRSNDESC     0.971038
INATTENTIONIND    0.846900
PEDROWNOTGRNT     0.976026
SDOTCOLNUM        0.409601
SPEEDING          0.952057
dtype: float64

# Data cleaning

(i) Correct INATTENTIONIND, PEDROWNOTGRNT, SPEEDING, UNDERINFL for which there shouldn't be missing values.

(ii) Correct LIGHTCOND, ROADCOND, JUNCTIONTYPE, WEATHER for which "Unknown" is a missing value.

In [316]:
df['INATTENTIONIND'] = df['INATTENTIONIND'].apply(lambda x: 1 if x=='Y' else 0)
df['PEDROWNOTGRNT'] = df['PEDROWNOTGRNT'].apply(lambda x: 1 if x=='Y' else 0)
df['SPEEDING'] = df['SPEEDING'].apply(lambda x: 1 if x=='Y' else 0)
df['UNDERINFL'] = df['UNDERINFL'].replace('Y',1).replace('N',0).replace('1',1).replace('0',0)

df['LIGHTCOND'] = df['LIGHTCOND'].replace('Unknown', np.nan)
df['ROADCOND'] = df['ROADCOND'].replace('Unknown', np.nan)
df['JUNCTIONTYPE'] = df['JUNCTIONTYPE'].replace('Unknown', np.nan)
df['WEATHER'] = df['WEATHER'].replace('Unknown', np.nan)

#### Drop the following features as they are redundant or irrelevant

In [317]:
todrop = ['SEVERITYCODE.1', 'SEVERITYDESC', 
          'REPORTNO', 'OBJECTID', 'INCKEY', 'COLDETKEY', 'SDOTCOLNUM', 
          'ST_COLDESC', 'ST_COLCODE', 'SDOT_COLDESC', 'SDOT_COLCODE', 'HITPARKEDCAR',
          'LOCATION', 'EXCEPTRSNCODE', 'EXCEPTRSNDESC', 
          'INCDATE',
          'INTKEY', 
          'STATUS', 'CROSSWALKKEY', 'SEGLANEKEY']

df.drop(columns=todrop, inplace=True) # drop redundant columns

In [318]:
(df.isnull().sum() / df.shape[0])[df.isnull().sum() / df.shape[0]>0.05]

WEATHER      0.103616
ROADCOND     0.103195
LIGHTCOND    0.095762
dtype: float64

Convert INCDTTM into a pandas datetime type.

In [320]:
df['INCDTTM'] = pd.to_datetime(df['INCDTTM'])

In [330]:
# drop rows with missing values
df.dropna(inplace=True)

# Exploratory data analysis

In [331]:
# load libraries
# since we have categorical variables, the best statistic to use is contingency table and chi-squared test
from scipy.stats import chi2_contingency 

## SEVERITY vs ADDRTYPE

In [368]:
print('Pearson correlation between SEVERITYCODE and ADDRTYPE:', 
      df.groupby(['SEVERITYCODE','ADDRTYPE'])['SEVERITYCODE'].count().unstack().T
      .corr(method='pearson').iloc[0,1].round(3)
     )

table = (pd.crosstab(df['SEVERITYCODE'], df['ADDRTYPE']) 
 #/ df.groupby('ADDRTYPE')['ADDRTYPE'].count()
)
table

Pearson correlation between SEVERITYCODE and ADDRTYPE: 1.0


ADDRTYPE,Block,Intersection
SEVERITYCODE,Unnamed: 1_level_1,Unnamed: 2_level_1
1,76229,34279
2,27936,26740


In [369]:
stat, p, dof, expected = chi2_contingency(table)
print('stat={}, degrees of freedom={}, p={}'.format(round(stat,2), dof, p))
expected = pd.DataFrame(expected.round(2), columns=table.columns, index=[1,2])
expected

stat=5023.09, degrees of freedom=1, p=0.0


ADDRTYPE,Block,Intersection
1,69686.32,40821.68
2,34478.68,20197.32


In [372]:
# the categorical value where there are more injuries than expected
diff_df = (table-expected).iloc[1][(table-expected).iloc[1]>=0].index.tolist()
diff_df

['Intersection']

In [374]:
# create a dummy variable that takes value 1 if above ADDRTYPE takes the above categorical value and 0 otherwise
df['INTERSECTION'] = df['ADDRTYPE'].apply(lambda x: 1 if x in diff_df else 0)

## SEVERITYCODE vs COLLISIONTYPE

In [375]:
print('Pearson correlation between SEVERITYCODE and COLLISIONTYPE:', 
      df.groupby(['SEVERITYCODE','COLLISIONTYPE'])['SEVERITYCODE'].count().unstack().T
      .corr(method='pearson').iloc[0,1].round(3)) 

table = (pd.crosstab(df['SEVERITYCODE'], df['COLLISIONTYPE']) 
 #/ df.groupby('COLLISIONTYPE')['COLLISIONTYPE'].count()
)#.round(2)

table

Pearson correlation between SEVERITYCODE and COLLISIONTYPE: 0.381


COLLISIONTYPE,Angles,Cycles,Head On,Left Turn,Other,Parked Car,Pedestrian,Rear Ended,Right Turn,Sideswipe
SEVERITYCODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,20282,604,1084,7984,15414,29816,629,17805,2186,14704
2,13362,4566,847,5319,5630,2574,5612,13821,589,2356


In [376]:
# calculate the contingency table
stat, p, dof, expected = chi2_contingency(table)
print('stat={}, degrees of freedom={}, p={}'.format(round(stat,2), dof, p))
expected = pd.DataFrame(expected.round(2), columns=table.columns, index=[1,2])
expected

stat=31554.34, degrees of freedom=9, p=0.0


COLLISIONTYPE,Angles,Cycles,Head On,Left Turn,Other,Parked Car,Pedestrian,Rear Ended,Right Turn,Sideswipe
1,22507.82,3458.73,1291.84,8899.7,14078.42,21668.89,4175.23,21157.78,1856.47,11413.13
2,11136.18,1711.27,639.16,4403.3,6965.58,10721.11,2065.77,10468.22,918.53,5646.87


In [377]:
table.index.name=None
table-expected

COLLISIONTYPE,Angles,Cycles,Head On,Left Turn,Other,Parked Car,Pedestrian,Rear Ended,Right Turn,Sideswipe
1,-2225.82,-2854.73,-207.84,-915.7,1335.58,8147.11,-3546.23,-3352.78,329.53,3290.87
2,2225.82,2854.73,207.84,915.7,-1335.58,-8147.11,3546.23,3352.78,-329.53,-3290.87


In [378]:
# the categorical value where there are more injuries than expected
diff_df = (table-expected).iloc[1][(table-expected).iloc[1]>=0].index.tolist()
diff_df

['Angles', 'Cycles', 'Head On', 'Left Turn', 'Pedestrian', 'Rear Ended']

In [382]:
# create a dummy variable that takes value 1 if above COLLISIONTYPE takes the above categorical values and 0 otherwise
df['COLTYPEDIV'] = df['COLLISIONTYPE'].apply(lambda x: 1 if (x in diff_df) else 0)

table = pd.crosstab(df['SEVERITYCODE'], df['COLTYPEDIV'])
table.columns = ['parked car, right turn, sideswipe or other',
                 'Angles, cycles, head on, left turn, pedestrian, or rear ended']
table.columns.name = 'SEVERITYCODE'
table.index.name = None
table

SEVERITYCODE,"parked car, right turn, sideswipe or other","Angles, cycles, head on, left turn, pedestrian, or rear ended"
1,62120,48388
2,11149,43527


In [383]:
# calculate contingency table for new categoritcal variable vs the label
stat, p, dof, expected = chi2_contingency(table)
print('stat={}, degrees of freedom={}, p={}'.format(round(stat,2), dof, p))
pd.DataFrame(expected.round(2), columns=table.columns, index=[1,2])

stat=19016.06, degrees of freedom=1, p=0.0


SEVERITYCODE,"parked car, right turn, sideswipe or other","Angles, cycles, head on, left turn, pedestrian, or rear ended"
1,49016.92,61491.08
2,24252.08,30423.92


## SEVERITYCODE vs JUNCTIONTYPE

In [384]:
table = pd.crosstab(df['SEVERITYCODE'], df['JUNCTIONTYPE'])
table

JUNCTIONTYPE,At Intersection (but not related to intersection),At Intersection (intersection related),Driveway Junction,Mid-Block (but intersection related),Mid-Block (not related to intersection),Ramp Junction
SEVERITYCODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1184,33079,6786,14030,55339,90
2,595,26135,3119,6894,17888,45


In [385]:
stat, p, dof, expected = chi2_contingency(table)
print('stat={}, degrees of freedom={}, p={}'.format(round(stat,2), dof, p))
expected = pd.DataFrame(expected.round(2), columns=table.columns, index=[1,2])
expected

stat=5755.85, degrees of freedom=5, p=0.0


JUNCTIONTYPE,At Intersection (but not related to intersection),At Intersection (intersection related),Driveway Junction,Mid-Block (but intersection related),Mid-Block (not related to intersection),Ramp Junction
1,1190.15,39614.13,6626.44,13998.14,48988.82,90.31
2,588.85,19599.87,3278.56,6925.86,24238.18,44.69


In [386]:
table.index.name=None
table-expected

JUNCTIONTYPE,At Intersection (but not related to intersection),At Intersection (intersection related),Driveway Junction,Mid-Block (but intersection related),Mid-Block (not related to intersection),Ramp Junction
1,-6.15,-6535.13,159.56,31.86,6350.18,-0.31
2,6.15,6535.13,-159.56,-31.86,-6350.18,0.31


In [387]:
# the categorical values where there are more injuries than expected
diff_df = (table-expected).iloc[1][(table-expected).iloc[1]>=0].index.tolist()
diff_df

['At Intersection (but not related to intersection)',
 'At Intersection (intersection related)',
 'Ramp Junction']

## SEVERITYCODE vs WEATHER

In [388]:
table = pd.crosstab(df['SEVERITYCODE'], df['WEATHER'])
table

WEATHER,Blowing Sand/Dirt,Clear,Fog/Smog/Smoke,Other,Overcast,Partly Cloudy,Raining,Severe Crosswind,Sleet/Hail/Freezing Rain,Snowing
SEVERITYCODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,30,70892,355,164,17705,2,20612,17,83,648
2,13,34977,180,76,8477,3,10753,7,27,163


In [389]:
stat, p, dof, expected = chi2_contingency(table)
print('stat={}, degrees of freedom={}, p={}'.format(round(stat,2), dof, p))
expected = pd.DataFrame(expected.round(2), columns=table.columns, index=[1,2])
expected

stat=94.0, degrees of freedom=9, p=2.5610763467779737e-16


WEATHER,Blowing Sand/Dirt,Clear,Fog/Smog/Smoke,Other,Overcast,Partly Cloudy,Raining,Severe Crosswind,Sleet/Hail/Freezing Rain,Snowing
1,28.77,70826.3,357.91,160.56,17515.74,3.34,20983.17,16.06,73.59,542.56
2,14.23,35042.7,177.09,79.44,8666.26,1.66,10381.83,7.94,36.41,268.44


In [390]:
table.index.name=None
table-expected

WEATHER,Blowing Sand/Dirt,Clear,Fog/Smog/Smoke,Other,Overcast,Partly Cloudy,Raining,Severe Crosswind,Sleet/Hail/Freezing Rain,Snowing
1,1.23,65.7,-2.91,3.44,189.26,-1.34,-371.17,0.94,9.41,105.44
2,-1.23,-65.7,2.91,-3.44,-189.26,1.34,371.17,-0.94,-9.41,-105.44


In [391]:
# the categorical values where there are more injuries than expected
diff_df = (table-expected).iloc[1][(table-expected).iloc[1]>=0].index.tolist()
diff_df

['Fog/Smog/Smoke', 'Partly Cloudy', 'Raining']

In [392]:
# Define a new categorical value based on the results from the analysis in the above cell.
df['BADWEATHER'] = df['WEATHER'].apply(lambda x: 1 if (x in diff_df) else 0)

## SEVERITYCODE vs ROADCOND

In [402]:
table = pd.crosstab(df['SEVERITYCODE'], df['ROADCOND'])
table.index.name = None
table

ROADCOND,Dry,Ice,Oil,Other,Sand/Mud/Dirt,Snow/Slush,Standing Water,Wet
1,79287,805,27,59,35,672,69,29554
2,39002,259,20,39,21,156,26,15153


In [403]:
stat, p, dof, expected = chi2_contingency(table)
print('stat={}, degrees of freedom={}, p={}'.format(round(stat,2), dof, p))
expected = pd.DataFrame(expected.round(2), columns=table.columns, index=[1,2])
expected

stat=132.27, degrees of freedom=7, p=2.1082890184547544e-25


ROADCOND,Dry,Ice,Oil,Other,Sand/Mud/Dirt,Snow/Slush,Standing Water,Wet
1,79135.27,711.82,31.44,65.56,37.46,553.93,63.55,29908.96
2,39153.73,352.18,15.56,32.44,18.54,274.07,31.45,14798.04


In [404]:
table.index.name=None
table-expected

ROADCOND,Dry,Ice,Oil,Other,Sand/Mud/Dirt,Snow/Slush,Standing Water,Wet
1,151.73,93.18,-4.44,-6.56,-2.46,118.07,5.45,-354.96
2,-151.73,-93.18,4.44,6.56,2.46,-118.07,-5.45,354.96


In [405]:
# the categorical values where there are more injuries than expected

diff_df = (table-expected).iloc[1][(table-expected).iloc[1]>=0].index.tolist()
diff_df

['Oil', 'Other', 'Sand/Mud/Dirt', 'Wet']

In [406]:
# Define a new categorical value based on the results from the analysis in the above cell.
df['BADROAD'] = df['ROADCOND'].apply(lambda x: 1 if (x in diff_df) else 0)

## SEVERITYCODE vs LIGHTCOND

In [409]:
table = pd.crosstab(df['SEVERITYCODE'], df['LIGHTCOND'])
table.index.name = None
table

LIGHTCOND,Dark - No Street Lights,Dark - Street Lights Off,Dark - Street Lights On,Dark - Unknown Lighting,Dawn,Daylight,Dusk,Other
1,1006,764,31293,5,1525,72166,3635,114
2,309,298,13962,4,793,37404,1870,36


In [410]:
stat, p, dof, expected = chi2_contingency(table)
print('stat={}, degrees of freedom={}, p={}'.format(round(stat,2), dof, p))
expected = pd.DataFrame(expected.round(2), columns=table.columns, index=[1,2])
expected

stat=232.74, degrees of freedom=7, p=1.29677344313145e-46


LIGHTCOND,Dark - No Street Lights,Dark - Street Lights Off,Dark - Street Lights On,Dark - Unknown Lighting,Dawn,Daylight,Dusk,Other
1,879.73,710.48,30275.57,6.02,1550.74,73302.27,3682.84,100.35
2,435.27,351.52,14979.43,2.98,767.26,36267.73,1822.16,49.65


In [411]:
table.index.name=None
table-expected

LIGHTCOND,Dark - No Street Lights,Dark - Street Lights Off,Dark - Street Lights On,Dark - Unknown Lighting,Dawn,Daylight,Dusk,Other
1,126.27,53.52,1017.43,-1.02,-25.74,-1136.27,-47.84,13.65
2,-126.27,-53.52,-1017.43,1.02,25.74,1136.27,47.84,-13.65


In [412]:
# the categorical values where there are more injuries than expected
diff_df = (table-expected).iloc[1][(table-expected).iloc[1]>=0].index.tolist()
diff_df

['Dark - Unknown Lighting', 'Dawn', 'Daylight', 'Dusk']

## Create a new variable DAYOFWEEK and explore its relationship with SEVERITYCODE similar to the analyses above

In [417]:
df['DAYOFWEEK'] = df['INCDTTM'].dt.dayofweek+1

(df.groupby(['SEVERITYCODE','DAYOFWEEK'])['SEVERITYCODE'].count().unstack()
 / df.groupby('DAYOFWEEK')['DAYOFWEEK'].count()
).round(2)

table = pd.crosstab(df['SEVERITYCODE'], df['DAYOFWEEK'])
table.index.name = None
table


stat, p, dof, expected = chi2_contingency(table)
print('stat={}, degrees of freedom={}, p={}'.format(round(stat,2), dof, p))
expected = pd.DataFrame(expected.round(2), columns=table.columns, index=[1,2])
expected

# the days where we see more injuries than expected
diff_df = (table-expected).iloc[1][(table-expected).iloc[1]>=0].index.tolist()
diff_df

stat=74.77, degrees of freedom=6, p=4.271171033439675e-14


[1, 2, 3, 4]

In [419]:
# based on the result above, we define a new variable WEEKDAY that is True if Mon, Tue, Wed or Thu.
df['WEEKDAY'] = df['DAYOFWEEK'].apply(lambda x: 1 if (x in diff_df) else 0)

## Create a new variable MONTH and explore its relationship with SEVERITYCODE similar to the analyses above

In [415]:
df['MONTH'] = df['INCDTTM'].dt.month

print('Pearson correlation between SEVERITYCODE and MONTH:', 
      df.groupby(['SEVERITYCODE','MONTH'])['SEVERITYCODE'].count().unstack().T
      .corr(method='pearson').iloc[0,1].round(3)
     )

(df.groupby(['SEVERITYCODE','MONTH'])['SEVERITYCODE'].count().unstack()
 / df.groupby('MONTH')['MONTH'].count()
).round(2)


table = pd.crosstab(df['SEVERITYCODE'], df['MONTH'])
table.index.name = None
table


stat, p, dof, expected = chi2_contingency(table)
print('stat={}, degrees of freedom={}, p={}'.format(round(stat,2), dof, p))
expected = pd.DataFrame(expected.round(2), columns=table.columns, index=[1,2])
expected

diff_df = (table-expected).iloc[1][(table-expected).iloc[1]>=0].index.tolist()
diff_df

Pearson correlation between SEVERITYCODE and MONTH: 0.845
stat=92.29, degrees of freedom=11, p=5.9220218201255965e-15


[5, 7, 8, 9, 10]

In [416]:
# define a new variable based on above analysis which is True if it is May, Jul, Aug, Sep or Oct and False otherwise
df['SUMMER'] = df['MONTH'].apply(lambda x: 1 if (x in diff_df) else 0)

## Create a new variable HOUR and explore its relationship with SEVERITYCODE similar to the analyses above

In [420]:
df['HOUR'] = df['INCDTTM'].dt.hour

print('Pearson correlation between SEVERITYCODE and HOUR:', 
      df.groupby(['SEVERITYCODE','HOUR'])['SEVERITYCODE'].count().unstack().T
      .corr(method='pearson').iloc[0,1].round(3)
     )

(df.groupby(['SEVERITYCODE','HOUR'])['SEVERITYCODE'].count().unstack()
 / df.groupby('HOUR')['HOUR'].count()
).round(2)



table = pd.crosstab(df['SEVERITYCODE'], df['HOUR'])
table.index.name = None
table


stat, p, dof, expected = chi2_contingency(table)
print('stat={}, degrees of freedom={}, p={}'.format(round(stat,2), dof, p))
expected = pd.DataFrame(expected.round(2), columns=table.columns, index=[1,2])
expected

diff_df = (table-expected).iloc[1][(table-expected).iloc[1]>=0].index.tolist()
diff_df

Pearson correlation between SEVERITYCODE and HOUR: 0.981
stat=621.2, degrees of freedom=23, p=1.6433260083835907e-116


[6, 7, 8, 14, 15, 16, 17, 18, 19]

In [421]:
# we see there is a marked increase in injuries during rush hours: 6, 7, 8, 14, 15, 16, 17, 18, 19
# Define a new variable RUSHHOUR
df['RUSHHOUR'] = df['HOUR'].apply(lambda x: 1 if x in diff_df else 0)

table = pd.crosstab(df['SEVERITYCODE'], df['RUSHHOUR'])
table.index.name = None
table


stat, p, dof, expected = chi2_contingency(table)
print('stat={}, degrees of freedom={}, p={}'.format(round(stat,2), dof, p))
expected = pd.DataFrame(expected.round(2), columns=table.columns, index=[1,2])
expected

(table-expected)

stat=396.06, degrees of freedom=1, p=3.967447062971418e-88


RUSHHOUR,0,1
1,1890.04,-1890.04
2,-1890.04,1890.04


In [438]:
df['DAYOFWEEK'] = df['DAYOFWEEK'].astype(str)
df['MONTH'] = df['MONTH'].astype(str)
df['HOUR'] = df['HOUR'].astype(str)

In [439]:
df.columns.values

array(['SEVERITYCODE', 'X', 'Y', 'ADDRTYPE', 'COLLISIONTYPE',
       'PERSONCOUNT', 'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT', 'INCDTTM',
       'JUNCTIONTYPE', 'INATTENTIONIND', 'UNDERINFL', 'WEATHER',
       'ROADCOND', 'LIGHTCOND', 'PEDROWNOTGRNT', 'SPEEDING', 'COLTYPEDIV',
       'DAYOFWEEK', 'MONTH', 'HOUR', 'RUSHHOUR', 'INTERSECTION',
       'BADWEATHER', 'BADROAD', 'SUMMER', 'WEEKDAY', 'RUSHHOURWEEKDAY'],
      dtype=object)

## Feature selection

We select the newly defined features contructed from the categorical variables and drop the old features.

In [494]:
to_dummies = ['ADDRTYPE',    #we have INTERSECTION
              'COLLISIONTYPE',  # we have COLTYPEDIV
              'JUNCTIONTYPE', #we have INTERSECTION
              'WEATHER', # we have BADWEATHER
              'ROADCOND', # we have BADROAD
              'LIGHTCOND', # drop
              'DAYOFWEEK', 'MONTH', 'HOUR'
             ]

df['RUSHHOURWEEKDAY'] = df['RUSHHOUR'] * df['WEEKDAY']

as_is = ['SEVERITYCODE', 
         #'X', 'Y', 
         'PERSONCOUNT', 'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT',
         'INATTENTIONIND', 'UNDERINFL', 'PEDROWNOTGRNT', 'SPEEDING', 
         'COLTYPEDIV', 'RUSHHOUR', 'INTERSECTION','BADWEATHER', 'BADROAD', 'SUMMER', 'WEEKDAY', 'RUSHHOURWEEKDAY'
        ]

#df1 = pd.merge(df[as_is], pd.get_dummies(df[to_dummies]), right_index=True, left_index=True)
df1 = df[as_is].copy()

We split the samples into two to train our ML models and test it later.

In [495]:
target = 'SEVERITYCODE'
features = df1.columns[df1.columns!=target]

from sklearn.model_selection import train_test_split
X, X_test, y, y_test = train_test_split(df1[features], df1[target], test_size=0.2, random_state=0)

We further split the data to train the model and validate our results to learn more about the structure of the data.

In [160]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0)

#from imblearn.over_sampling import SMOTE
#os = SMOTE()
#X_train, y_train = os.fit_sample(X_train, y_train)

len(y_train[y_train==1]), len(y_train[y_train==2])

(66332, 66332)

In [485]:
X_train.shape

(99110, 16)

## Dummy classifier

In [518]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import roc_auc_score

dummy_clf = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
dummy_clf.score(X_test, y_test).round(3), roc_auc_score(y_test, dummy_clf.predict_proba(X_test)[:,1])

(0.668, 0.5)

## Logistic Regression

In [487]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

from imblearn.over_sampling import SMOTE
os = SMOTE()
    
from imblearn.pipeline import Pipeline
estimators = [('scaler', scaler), 
              ('os', os), 
              ('clf', clf)]
pipeline = Pipeline(steps=estimators)
    
from sklearn.model_selection import cross_val_score
scores = cross_val_score(pipeline, X, y, cv=3, scoring = 'accuracy', n_jobs=-1)
print('The accuracy score: ', np.mean(scores))

The accuracy score:  0.7280679848956085


In [537]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(C=1, solver='liblinear').fit(X_train, y_train)
clf.score(X_valid, y_valid).round(3), roc_auc_score(y_valid, clf.predict_proba(X_valid)[:,1]).round(3)

(0.728, 0.753)

Used cross validation to obtain accuracy scores (used the samples split from any data used in training).

In [538]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, X_test, y_test, cv=3, scoring = 'accuracy', n_jobs=-1)
print('The accuracy score: ', np.mean(scores))

The accuracy score:  0.7278203377450202


## Gradient Boosting Classifier

A benchmark outcome:

In [530]:
gb_clf = GradientBoostingClassifier(learning_rate=0.1, max_depth=2, max_features=15).fit(X_train, y_train)
gb_clf.score(X_valid, y_valid).round(3), roc_auc_score(y_valid, gb_clf.predict_proba(X_valid)[:,1]).round(3)

(0.735, 0.766)

We used GridSearch to search for the optimal parameter values.

In [524]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(learning_rate=0.4).fit(X_train, y_train)

grid_values = {'learning_rate':np.linspace(0.1,0.4,4),
               'max_features': [14,15], 'max_depth':range(2,5)}#, 'min_samples_split':range(200,1001,400)}
# {'learning_rate':np.linspace(0.3,0.6,4)}

from sklearn.model_selection import GridSearchCV
grid_clf = GridSearchCV(clf, param_grid = grid_values, scoring = 'accuracy', n_jobs=-1).fit(X_train, y_train)

print('Train set accuracy: ', grid_clf.score(X_train, y_train))
print('Test set accuracy: ', grid_clf.score(X_valid, y_valid))
print('Grid best parameter (max. accu): ', grid_clf.best_params_)
print('Grid best parameter (max. accu): ', grid_clf.best_estimator_)

Train set accuracy:  0.7360004035919685
Test set accuracy:  0.7346914066047159
Grid best parameter (max. accu):  {'learning_rate': 0.4, 'max_depth': 3, 'max_features': 14}
Grid best parameter (max. accu):  GradientBoostingClassifier(learning_rate=0.4, max_features=14)


Used cross validation to obtain accuracy scores (used the samples split from any data used in training).

In [543]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(learning_rate=0.1, max_depth=2, max_features=15)
#clf.score(X_valid, y_valid), roc_auc_score(y_valid, gb_clf.predict_proba(X_valid)[:,1])

# (0.7362956684929018, 0.7660217425505267)

scores = cross_val_score(clf, X_test, y_test, cv=3, scoring = 'accuracy', n_jobs=-1)
print('The accuracy score: ', np.mean(scores))

The accuracy score:  0.735024349642285


## Random Forest Classifier

A benchmark result:

In [545]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier().fit(X_train, y_train)
rf_clf.score(X_valid, y_valid).round(3), roc_auc_score(y_valid, rf_clf.predict_proba(X_valid)[:,1]).round(3)

(0.727, 0.751)

Used GridSearch to search for optimal parameter values

In [547]:
clf = RandomForestClassifier().fit(X_train, y_train)

grid_values = {'max_features': range(7,17,3), 'max_depth':range(2,5)}

from sklearn.model_selection import GridSearchCV
grid_clf = GridSearchCV(clf, param_grid = grid_values, scoring = 'accuracy', n_jobs=-1).fit(X_train, y_train)

print('Train set accuracy: ', grid_clf.score(X_train, y_train))
print('Test set accuracy: ', grid_clf.score(X_valid, y_valid))
print('Grid best parameter (max. accu): ', grid_clf.best_params_)
print('Grid best parameter (max. accu): ', grid_clf.best_estimator_)

Train set accuracy:  0.7319543941075572
Test set accuracy:  0.7320579955807125
Grid best parameter (max. accu):  {'max_depth': 4, 'max_features': 13}
Grid best parameter (max. accu):  RandomForestClassifier(max_depth=4, max_features=13)


Used cross validation to obtain accuracy scores (used the samples split from any data used in training).

In [548]:
clf = RandomForestClassifier(max_depth=4, max_features=13)
scores = cross_val_score(clf, X_test, y_test, cv=3, scoring = 'accuracy', n_jobs=-1)
print('The accuracy score: ', np.mean(scores))

The accuracy score:  0.7317553041668869


In [551]:
#clf = RandomForestClassifier(max_depth=4, max_features=13).fit(X_train, y_train)
#roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])

0.7642313827223294

## Some other algorithms tried but was not pursued more

(i) MLPClassifier

(ii) K-Nearest Neighbor

(iii) Gaussian Naive Bayes

In [136]:
from sklearn.neural_network import MLPClassifier

nn_clf = MLPClassifier(hidden_layer_sizes = [100], random_state = 0, max_iter=1000).fit(X_train, y_train)
nn_clf.score(X_test, y_test).round(2), roc_auc_score(y_test, nn_clf.predict_proba(X_test)[:,1]).round(2)

(0.66, 0.76)

In [27]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier().fit(X_train, y_train)
knn_clf.score(X_test, y_test).round(2), roc_auc_score(y_test, knn_clf.predict_proba(X_test)[:,1]).round(2)

(0.7, 0.71)

In [28]:
from sklearn.naive_bayes import GaussianNB
nb_clf = GaussianNB().fit(X_train, y_train)
knn_clf.score(X_test, y_test).round(2), roc_auc_score(y_test, knn_clf.predict_proba(X_test)[:,1]).round(2)

(0.7, 0.71)