In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

In [41]:
def _read_indexed(csv_path):
    """Read one processed CSV, using its first column as the row index."""
    return pd.read_csv(csv_path, index_col=[0])


# Negative sample: main table plus its visibility / temperature / precipitation tables
negative_sample = _read_indexed('../data/processed/negative_sample.csv')
neg_sample_visibility = _read_indexed('../data/processed/neg_sample_visibility.csv')
neg_sample_temperature = _read_indexed('../data/processed/neg_sample_temperature.csv')
neg_sample_precipitation = _read_indexed('../data/processed/neg_sample_precipitation.csv')

# Positive sample: same layout as the negative sample
positive_sample = _read_indexed('../data/processed/positive_sample.csv')
pos_sample_visibility = _read_indexed('../data/processed/pos_sample_visibility.csv')
pos_sample_temperature = _read_indexed('../data/processed/pos_sample_temperature.csv')
pos_sample_precipitation = _read_indexed('../data/processed/pos_sample_precipitation.csv')


In [42]:
# Merge: attach the weather features to each sample, then stack both samples.
# Column assignment aligns on the row index, so each side table has to share
# the index of the sample it is attached to.
# NOTE: the 'temparature' spelling is kept as-is because the column is
# selected under that exact name when the modelling frame is built.
negative_sample['humidity'] = neg_sample_temperature['humidity']
negative_sample['temparature'] = neg_sample_temperature['temperature']
negative_sample['visibility'] = neg_sample_visibility['visibility']

positive_sample['humidity'] = pos_sample_temperature['humidity']
positive_sample['temparature'] = pos_sample_temperature['temperature']
positive_sample['visibility'] = pos_sample_visibility['visibility']

# DataFrame.append is deprecated (removed in pandas 2.0); pd.concat is the
# equivalent call. The original row labels of both samples are kept.
full_data = pd.concat([negative_sample, positive_sample])
full_data.columns

Index(['LINREFX', 'LINREFY', 'LOR', 'LOR_ab_2021', 'XGCSWGS84', 'YGCSWGS84',
       'acc_cat', 'acc_type1', 'acc_type2', 'bike', 'car', 'col_id',
       'collision', 'collision_cnt', 'district', 'foot', 'geometry', 'hour',
       'hour_cos', 'hour_sin', 'humidity', 'land', 'length_m', 'lightratio',
       'mid_lat', 'mid_lon', 'month', 'month_cos', 'month_sin', 'motor',
       'objectid', 'other', 'road_con', 'segment_id', 'side_strt', 'street',
       'sun_elevation_angle', 'temparature', 'vehicle', 'visibility',
       'weekday', 'year'],
      dtype='object')

In [43]:
# Full data: keep only the modelling features and the target column.
MODEL_COLUMNS = [
    'segment_id',
    'year',
    'month_cos',
    'month_sin',
    'weekday',
    'hour_cos',
    'hour_sin',
    'collision_cnt',
    'side_strt',
    'sun_elevation_angle',
    'humidity',
    'temparature',
    'visibility',
    'collision',
]

# Select and fill in a single chained expression: .assign returns a new frame,
# whereas assigning into the column subset afterwards can raise
# SettingWithCopyWarning. Missing collision counts are replaced by 0.
full_data = full_data[MODEL_COLUMNS].assign(
    collision_cnt=lambda frame: frame['collision_cnt'].fillna(0)
)

full_data

Unnamed: 0,segment_id,year,month_cos,month_sin,weekday,hour_cos,hour_sin,collision_cnt,side_strt,sun_elevation_angle,humidity,temparature,visibility,collision
0,42796.0,2019,8.660254e-01,-5.000000e-01,6,0.962917,0.269797,0.0,1.0,-35.861124,86.186559,4.081452,20049.462366,0
1,34322.0,2020,8.660254e-01,5.000000e-01,6,0.460065,0.887885,0.0,1.0,0.470744,88.225806,3.458065,23919.354839,0
2,29497.0,2018,-1.000000e+00,1.224647e-16,2,-0.775711,0.631088,1.0,1.0,66.775839,55.533333,20.693333,34946.666667,0
3,39002.0,2020,-1.836970e-16,-1.000000e+00,6,-0.334880,0.942261,0.0,1.0,38.164666,76.000000,14.226667,36333.333333,0
4,32881.0,2019,-8.660254e-01,-5.000000e-01,2,1.000000,0.000000,0.0,1.0,-49.330139,72.806452,17.170968,40709.677419,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52113,21737.0,2020,1.000000e+00,-2.449294e-16,6,-0.990686,-0.136167,6.0,0.0,37.531107,77.193548,5.045161,24145.161290,1
52114,2782.0,2020,1.000000e+00,-2.449294e-16,2,-0.068242,-0.997669,20.0,0.0,-27.019352,82.354839,4.119355,24951.612903,1
52115,2781.0,2020,1.000000e+00,-2.449294e-16,5,-0.917211,0.398401,21.0,0.0,52.571240,81.161290,3.970968,22723.333333,1
52116,19720.0,2020,1.000000e+00,-2.449294e-16,3,-0.576680,-0.816970,14.0,0.0,0.730020,79.516129,4.648387,22467.741935,1


In [45]:
# Sanity check: column dtypes and non-null counts of the modelling frame.
full_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 312820 entries, 0 to 52117
Data columns (total 14 columns):
segment_id             312820 non-null float64
year                   312820 non-null int64
month_cos              312820 non-null float64
month_sin              312820 non-null float64
weekday                312820 non-null int64
hour_cos               312820 non-null float64
hour_sin               312820 non-null float64
collision_cnt          312820 non-null float64
side_strt              312820 non-null float64
sun_elevation_angle    312820 non-null float64
humidity               312820 non-null float64
temparature            312820 non-null float64
visibility             312820 non-null float64
collision              312820 non-null int64
dtypes: float64(11), int64(3)
memory usage: 35.8 MB


In [56]:
# Split by time: years before 2020 form the training set, 2020 the test set.
is_train_year = full_data['year'] < 2020
is_test_year = full_data['year'] == 2020

train_rows = full_data[is_train_year]
test_rows = full_data[is_test_year]

# Features are every column except the target
X_train = train_rows.drop(columns=['collision'])
X_test = test_rows.drop(columns=['collision'])

# Target is the binary collision flag
y_train = train_rows['collision']
y_test = test_rows['collision']

In [65]:
# Random Forest
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector

# Numeric columns: mean-impute and append missing-value indicator columns.
num_pipe = SimpleImputer(strategy="mean", add_indicator=True)
# Categorical columns: fill gaps with the label "missing", then integer-encode;
# categories not seen during fit are encoded as -1.
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
)

# Route columns to the matching sub-pipeline by dtype.
preprocessor_tree = make_column_transformer(
    (num_pipe, selector(dtype_include="number")),
    (cat_pipe, selector(dtype_include="category")),
    n_jobs=2,
)

# The previous version passed the undefined name `n_est`, which raised
# NameError. 100 trees is scikit-learn's default forest size; the fixed seed
# makes cross-validation results repeatable.
rf_clf = make_pipeline(
    preprocessor_tree,
    RandomForestClassifier(n_estimators=100, random_state=42),
)

In [None]:
# Imported in this cell so that a fresh "Restart & Run All" does not fail with
# NameError: no earlier cell brings cross_validate into scope.
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of the random-forest pipeline, scored by ROC AUC.
cv_result = cross_validate(rf_clf, X_train, y_train, scoring="roc_auc", cv=5)

In [None]:
# Mean ROC AUC over the folds. cross_validate stores the per-fold scores under
# 'test_score' (singular); 'test_scores' raises KeyError.
cv_result['test_score'].mean()

In [471]:
## PARAMETER GRID FOR THE RANDOM FOREST HYPER-PARAMETER SEARCH
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import numpy as np

# Forest sizes: ten evenly spaced values between 300 and 350, truncated to int
n_estimators = np.linspace(300, 350, 10).astype(int).tolist()
# Rules for how many features are considered at each split
max_features = ['log2', 'sqrt']
# Depth limits: five evenly spaced values between 30 and 50, plus None (no limit)
max_depth = np.linspace(30, 50, 5).astype(int).tolist() + [None]

# Split-quality measures
criterion = ['gini', 'entropy']
# Minimum number of samples needed to split a node
min_samples_split = list(range(2, 6))
# Minimum number of samples allowed in a leaf
min_samples_leaf = list(range(1, 6))
# Whether each tree is trained on a bootstrap sample
bootstrap = [True, False]

# Assemble the search space (key order is kept so the printout is unchanged)
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
    criterion=criterion,
)

print(random_grid)

{'n_estimators': [250, 261, 272, 283, 294, 305, 316, 327, 338, 350], 'max_features': ['log2', 'sqrt'], 'max_depth': [30, 35, 40, 45, 50, None], 'min_samples_leaf': [1, 2, 3, 4, 5], 'bootstrap': [True, False], 'criterion': ['gini', 'entropy']}


In [254]:
# Exhaustive 3-fold search over `random_grid` for a plain random forest,
# scored by ROC AUC and run on all available cores.
rf_search_options = dict(
    scoring='roc_auc',
    cv=3,
    verbose=2,
    n_jobs=-1,
    return_train_score=True,
)
rf_random = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=random_grid,
    **rf_search_options,
)

In [255]:
# Fit every candidate of the grid on the training split (long-running cell).
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 1920 candidates, totalling 5760 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'max_depth': [30, 35, 40, 45, 50, None],
                         'max_features': ['auto'],
                         'min_samples_leaf': [1, 6, 11, 16],
                         'min_samples_split': [2, 5, 8, 12],
                         'n_estimators': [250, 261, 272, 283, 294, 305, 316,
                                          327, 338, 350]},
             return_train_score=True, scoring='roc_auc', verbose=2)

In [256]:
# Mean cross-validated ROC AUC of the best random-forest candidate.
rf_random.best_score_

0.8956388086414308

In [257]:
# Parameter combination that achieved the best score.
rf_random.best_params_

{'bootstrap': False,
 'max_depth': 45,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 327}

In [None]:
# Same exhaustive search as for the plain forest, but with the
# class-balancing random forest from imbalanced-learn as the estimator.
brf_search_options = dict(
    scoring='roc_auc',
    cv=3,
    verbose=2,
    n_jobs=-1,
    return_train_score=True,
)
brf_random = GridSearchCV(
    estimator=BalancedRandomForestClassifier(),
    param_grid=random_grid,
    **brf_search_options,
)

In [None]:
# Fit every candidate of the grid for the balanced random forest (long-running cell).
brf_random.fit(X_train, y_train)

In [None]:
# Mean cross-validated ROC AUC of the best balanced-random-forest candidate.
brf_random.best_score_

In [None]:
# Parameter combination that achieved the best score.
brf_random.best_params_

In [None]:
# Random Forest with oversampling using SMOTE and undersampling using RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# NOTE: this rebinds `make_pipeline` to the imbalanced-learn version, which
# accepts resamplers as pipeline steps.
from imblearn.pipeline import Pipeline, make_pipeline

from collections import Counter

# summarize class distribution - highly imbalanced dataset
counter = Counter(y_train)
print(counter)


## PARAMETER GRID FOR THE RESAMPLING + RANDOM FOREST PIPELINE
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import numpy as np

# The ratios are written out explicitly: np.arange with a float step yields
# rounding noise (e.g. 0.15000000000000002) and, depending on rounding, can
# also emit a value at the excluded stop.
# SMOTE target ratios, 0.10 up to (excluding) 0.35 in steps of 0.05
over_n = [0.1, 0.15, 0.2, 0.25, 0.3]
# RandomUnderSampler target ratios, 0.35 up to (excluding) 0.50 in steps of 0.05
under_n = [0.35, 0.4, 0.45]

# Search space; the key prefixes are the step names make_pipeline derives from
# the lower-cased class names.
# NOTE(review): this is a full Cartesian product and therefore very large
# when passed to GridSearchCV - consider RandomizedSearchCV.
random_grid = {'randomforestclassifier__n_estimators': n_estimators,
               'randomforestclassifier__max_features': max_features,
               'randomforestclassifier__max_depth': max_depth,
               'randomforestclassifier__min_samples_split': min_samples_split,
               'randomforestclassifier__min_samples_leaf': min_samples_leaf,
               'randomforestclassifier__bootstrap': bootstrap,
               'randomforestclassifier__criterion': criterion,
               'smote__sampling_strategy': over_n,
               'randomundersampler__sampling_strategy': under_n}

print(random_grid)

In [None]:
# define pipeline
# SMOTE: synthesise minority (positive) samples until minority / majority = 0.1
over = SMOTE(sampling_strategy=0.1)
# RandomUnderSampler: drop majority (negative) samples until
# minority / majority = 0.25
under = RandomUnderSampler(sampling_strategy=0.25)

# max_features='sqrt' is what the former 'auto' alias meant for classifiers;
# 'auto' is deprecated and no longer accepted by recent scikit-learn releases.
pipeline = make_pipeline(
    over,
    under,
    RandomForestClassifier(
        n_estimators=260,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features='sqrt',
        max_depth=90,
        bootstrap=False,
    ),
)

# Exhaustive 3-fold search over `random_grid`, scored by ROC AUC.
rf_sampling_random = GridSearchCV(pipeline,
                                  param_grid=random_grid,
                                  scoring='roc_auc',
                                  cv=3,
                                  verbose=2,
                                  n_jobs=-1,
                                  return_train_score=True)

In [None]:
# Fit every candidate of the resampling + random-forest grid, then show the
# best mean cross-validated ROC AUC.
rf_sampling_random.fit(X_train,y_train)
rf_sampling_random.best_score_


In [None]:
# Parameter combination that achieved the best score.
rf_sampling_random.best_params_

In [472]:
# BALANCED BAGGING
# Neither estimator is imported by any earlier cell, so they are imported here
# to keep the cell runnable on a fresh kernel.
# NOTE(review): on scikit-learn < 1.0 HistGradientBoostingClassifier also needs
# `from sklearn.experimental import enable_hist_gradient_boosting` first -
# confirm against the target environment.
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

# Number of base estimators in the bagging ensemble
n_estimators = [int(x) for x in np.linspace(start = 150, stop = 300, num = 5)]

# Method of selecting samples for training each base estimator
bootstrap = [True, False]

# The ratios are written out explicitly: np.arange with a float step yields
# rounding noise (e.g. 0.15000000000000002) and, depending on rounding, can
# also emit a value at the excluded stop.
# SMOTE target ratios, 0.10 up to (excluding) 0.35 in steps of 0.05
over_n = [0.1, 0.15, 0.2, 0.25, 0.3]
# RandomUnderSampler target ratios, 0.35 up to (excluding) 0.50 in steps of 0.05
under_n = [0.35, 0.4, 0.45]

# Search space; the key prefixes are the step names make_pipeline derives from
# the lower-cased class names.
bb_random_grid = {'balancedbaggingclassifier__n_estimators': n_estimators,
                  'balancedbaggingclassifier__bootstrap': bootstrap,
                  'smote__sampling_strategy': over_n,
                  'randomundersampler__sampling_strategy': under_n}


# define pipeline
# SMOTE: synthesise minority (positive) samples until minority / majority = 0.1
over = SMOTE(sampling_strategy = 0.1)
# RandomUnderSampler: drop majority (negative) samples until
# minority / majority = 0.25
under = RandomUnderSampler(sampling_strategy = 0.25)

pipeline = make_pipeline(over, under, BalancedBaggingClassifier(base_estimator=HistGradientBoostingClassifier(random_state=42)))

In [473]:
# Exhaustive 3-fold search over `bb_random_grid` for the balanced-bagging
# pipeline, scored by ROC AUC and run on all available cores.
bb_search_options = dict(
    scoring='roc_auc',
    cv=3,
    verbose=2,
    n_jobs=-1,
    return_train_score=True,
)
bb_random = GridSearchCV(
    estimator=pipeline,
    param_grid=bb_random_grid,
    **bb_search_options,
)

In [250]:
# Fit every candidate of the balanced-bagging grid, then show the best mean
# cross-validated ROC AUC.
bb_random.fit(X_train, y_train)
bb_random.best_score_

Fitting 3 folds for each of 10 candidates, totalling 30 fits


0.8569596662369879

In [234]:
# Parameter combination that achieved the best score.
bb_random.best_params_

{'n_estimators': 207, 'bootstrap': True}

In [57]:
# LOGISTIC REGRESSION
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

# Column selectors by dtype
numeric_columns = selector(dtype_include="number")
categorical_columns = selector(dtype_include="category")

# Numeric columns: standardise, then mean-impute with missing-value indicators
num_pipe = make_pipeline(
    StandardScaler(),
    SimpleImputer(strategy="mean", add_indicator=True),
)
# Categorical columns: one-hot encode, ignoring categories unseen during fit
cat_pipe = make_pipeline(OneHotEncoder(handle_unknown="ignore"))

preprocessor_linear = make_column_transformer(
    (num_pipe, numeric_columns),
    (cat_pipe, categorical_columns),
    n_jobs=2,
)

# Class-weighted logistic regression on top of the preprocessing step
lr_clf = make_pipeline(
    preprocessor_linear,
    LogisticRegression(max_iter=1000, class_weight="balanced"),
)
lr_clf

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(n_jobs=2,
                                   transformers=[('pipeline-1',
                                                  Pipeline(steps=[('standardscaler',
                                                                   StandardScaler()),
                                                                  ('simpleimputer',
                                                                   SimpleImputer(add_indicator=True))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x0000017D07436A88>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  <sklearn.compose._column_transformer.make_col

In [62]:
# 5-fold cross-validation of the logistic-regression pipeline, scored by ROC AUC.
cv_result = cross_validate(lr_clf, X_train, y_train, scoring="roc_auc", cv=5)

In [63]:
# Mean ROC AUC over the folds for logistic regression.
cv_result['test_score'].mean()

0.893687113212304

In [477]:
# Support Vector Classification with SMOTE oversampling and random undersampling
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline, make_pipeline

# Numeric columns are rescaled to the [0, 1] range
num_pipe = make_pipeline(MinMaxScaler(feature_range=(0, 1)))
# Categorical columns are one-hot encoded, ignoring categories unseen during fit
cat_pipe = make_pipeline(OneHotEncoder(handle_unknown="ignore"))

# Route columns to the matching sub-pipeline by dtype
preprocessor_svc = make_column_transformer(
    (num_pipe, selector(dtype_include="number")),
    (cat_pipe, selector(dtype_include="category")),
    n_jobs=2,
)

# Resampling steps applied to the training folds before the classifier
smote = SMOTE(sampling_strategy=0.1)
rus = RandomUnderSampler(sampling_strategy=0.2)

# Preprocess -> oversample -> undersample -> RBF support vector classifier
svc_steps = [preprocessor_svc, smote, rus, SVC(kernel='rbf', C=1)]
svc_clf = make_pipeline(*svc_steps)

In [478]:
# 5-fold cross-validation of the SVC pipeline, scored by ROC AUC.
cv_result = cross_validate(svc_clf, X_train, y_train, scoring="roc_auc", cv=5)

In [479]:
# Mean ROC AUC over the folds for the SVC pipeline.
cv_result['test_score'].mean()

0.8430180789451789

In [482]:
# Hyper-parameter search for the SVC pipeline.
# The resampling ratios are written out explicitly: np.arange with a float
# step yields rounding noise and can emit a value at the excluded stop.
sampler_grid = {
    'smote__sampling_strategy': [0.1, 0.15, 0.2, 0.25, 0.3],
    'randomundersampler__sampling_strategy': [0.35, 0.4, 0.45],
}
svc_c_values = [1, 10, 100, 1000]
svc_gamma_values = [1, 0.1, 0.001, 0.0001]

# One sub-grid per kernel, listing only the parameters that kernel uses:
# gamma has no effect on the linear kernel and degree only affects 'poly'.
# Crossing everything in a single dict refits identical models many times.
parameters = [
    {**sampler_grid,
     'svc__kernel': ['linear'],
     'svc__C': svc_c_values},
    {**sampler_grid,
     'svc__kernel': ['rbf'],
     'svc__C': svc_c_values,
     'svc__gamma': svc_gamma_values},
    {**sampler_grid,
     'svc__kernel': ['poly'],
     'svc__C': svc_c_values,
     'svc__gamma': svc_gamma_values,
     'svc__degree': [1, 2]},
]

svc_grid = GridSearchCV(svc_clf, parameters, scoring="roc_auc", cv=3)

In [483]:
# Fit every candidate of the SVC grid, then show the best mean cross-validated
# ROC AUC. NOTE: the recorded run of this cell ended in a KeyboardInterrupt,
# so no score was produced.
svc_grid.fit(X_train, y_train)
svc_grid.best_score_

KeyboardInterrupt: 

In [445]:
# Parameter combination that achieved the best score.
# NOTE(review): the recorded output appears to come from an earlier run (its
# keys differ from the current grid) - re-check after a complete search.
svc_grid.best_params_

{'rus__sampling_strategy': 0.2,
 'smote__sampling_strategy': 0.1,
 'svc__C': 1,
 'svc__class_weight': None,
 'svc__degree': 2,
 'svc__kernel': 'rbf'}