In [1]:
# import libraries

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score
import pickle
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
# set pandas visualization options

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [3]:
# read train preprocessed

base_path=os.path.abspath(os.getcwd())

train=pd.read_csv(base_path+"\\data-scientist-technical-test-main\\data\\auto-insurance-fall-2017\\train_auto_preproc.csv", sep=",", header=0)

In [4]:
#drop extra column

drop_col = ["Unnamed: 0"]
target_col=["TARGET_FLAG"]

train.drop(columns=drop_col, axis=1, inplace=True)

# divide X,y
X=train.drop(columns=target_col, axis=1)
y=train[target_col].values

In [5]:
# create 5-folds

n_splits = 5

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=18, stratify=y)

kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=18)

## logistic regression

In [6]:
# create pipeline object to standardize data for LR

pipe = Pipeline(steps=[('standard_scaler', StandardScaler()),
                       ('LR', LogisticRegression(random_state=18,
                                                 max_iter=200))])

In [7]:
param_grid = dict(LR__penalty = ['l1', 'l2', 'elasticnet'],
                  LR__C = np.logspace(-2, 3, 6))

grid = GridSearchCV(pipe,
                    param_grid=param_grid,
                    cv=kf, 
                    verbose=1,
                    scoring='f1',
                    return_train_score=True)
grid.fit(X_train, y_train)
print(grid.best_score_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
0.5029242287860396


In [8]:
#show cv results
cv_results = pd.DataFrame(grid.cv_results_)
cv_results = cv_results.sort_values("mean_test_score", ascending=False)
cv_results[['param_LR__C',
            'param_LR__penalty', 
            'mean_test_score',
            'std_test_score',
            'rank_test_score']].head(5)

Unnamed: 0,param_LR__C,param_LR__penalty,mean_test_score,std_test_score,rank_test_score
13,100.0,l2,0.502924,0.023478,1
16,1000.0,l2,0.502924,0.023478,1
10,10.0,l2,0.502392,0.023222,3
7,1.0,l2,0.502035,0.023958,4
4,0.1,l2,0.498298,0.023587,5


In [9]:
C=cv_results[['param_LR__C', 'param_LR__penalty']].iloc[0,:].values[0]
penalty=cv_results[['param_LR__C', 'param_LR__penalty']].iloc[0,:].values[1]
print(C, penalty)

100.0 l2


In [10]:
ss=StandardScaler()
X_transf=ss.fit_transform(X_train)
lr=LogisticRegression(random_state=18,
                      C=C,
                      penalty=penalty).fit(X_transf,y_train)

In [11]:
# f1 prediction on validation set
ss=StandardScaler()
X_val_transf=ss.fit_transform(X_val)
y_pred = lr.predict(X_val_transf)
f1_score(y_val, y_pred)

0.49715909090909105

In [12]:
# save LR model
pickle.dump(lr, open(base_path+"\\data-scientist-technical-test-main\\data\\auto-insurance-fall-2017\\models\\LR.pkl", 'wb'))

## random forest

In [13]:
# create pipeline object to standardize data for RF

pipe = Pipeline(steps=[('RFC', RandomForestClassifier())])

param_grid = dict(RFC__max_depth = [5,7,9],
                  RFC__n_estimators = [25,50,75,100])

grid = GridSearchCV(pipe,
                    param_grid=param_grid,
                    cv=kf, 
                    verbose=1,
                    scoring='f1',
                    return_train_score=True)
grid.fit(X_train, y_train)
print(grid.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
0.39877290971217794


In [14]:
#show cv results
cv_results = pd.DataFrame(grid.cv_results_)
cv_results = cv_results.sort_values("mean_test_score", ascending=False)
cv_results[['param_RFC__max_depth',
            'param_RFC__n_estimators', 
            'mean_test_score',
            'std_test_score',
            'rank_test_score']].head(5)

Unnamed: 0,param_RFC__max_depth,param_RFC__n_estimators,mean_test_score,std_test_score,rank_test_score
9,9,50,0.398773,0.032228,1
8,9,25,0.392902,0.014698,2
10,9,75,0.389545,0.020672,3
11,9,100,0.382,0.012735,4
4,7,25,0.331225,0.02467,5


In [15]:
max_depth=cv_results[['param_RFC__max_depth', 'param_RFC__n_estimators']].iloc[0,:].values[0]
n_estimators=cv_results[['param_RFC__max_depth', 'param_RFC__n_estimators']].iloc[0,:].values[1]
print(max_depth, n_estimators)

9 50


In [16]:
rf=RandomForestClassifier(max_depth=max_depth,
                          n_estimators=n_estimators,
                          random_state=18).fit(X_train, y_train)

In [17]:
# f1 prediction on validation set

y_pred = rf.predict(X_val)
f1_score(y_val, y_pred)

0.3793103448275862

In [18]:
# save RF model
pickle.dump(rf, open(base_path+"\\data-scientist-technical-test-main\\data\\auto-insurance-fall-2017\\models\\RF.pkl", 'wb'))

## XGBoost

In [19]:
# create pipeline object to standardize data for XGB

pipe = Pipeline(steps=[('XGB', XGBClassifier())])

param_grid = dict(XGB__max_depth = [5,7],
                  XGB__n_estimators = [100,150,200],
                  XGB__eta = [0.1,0.2],
                  XGB__subsample = [0.9],
                  XGB__colsample_bytree = [0.9],
                  XGB__seed = [18])

grid = GridSearchCV(pipe,
                    param_grid=param_grid,
                    cv=kf, 
                    verbose=0,
                    scoring='f1',
                    return_train_score=True,
                    n_jobs=-2)
grid.fit(X_train, y_train)
print(grid.best_score_)

0.5305252438289872


In [20]:
#show cv results
cv_results = pd.DataFrame(grid.cv_results_)
cv_results = cv_results.sort_values("mean_test_score", ascending=False)
cv_results[['param_XGB__max_depth',
            'param_XGB__n_estimators',
            'param_XGB__eta',
            'mean_test_score',
            'std_test_score',
            'rank_test_score']].head(5)

Unnamed: 0,param_XGB__max_depth,param_XGB__n_estimators,param_XGB__eta,mean_test_score,std_test_score,rank_test_score
0,5,100,0.1,0.530525,0.009204,1
1,5,150,0.1,0.529288,0.00516,2
11,7,200,0.2,0.528039,0.009338,3
7,5,150,0.2,0.526407,0.023277,4
6,5,100,0.2,0.526218,0.015879,5


In [21]:
max_depth=cv_results[['param_XGB__max_depth', 'param_XGB__n_estimators', 'param_XGB__eta']].iloc[0,:].values[0]
n_estimators=cv_results[['param_XGB__max_depth', 'param_XGB__n_estimators', 'param_XGB__eta']].iloc[0,:].values[1]
eta=cv_results[['param_XGB__max_depth', 'param_XGB__n_estimators', 'param_XGB__eta']].iloc[0,:].values[2]
print(max_depth, n_estimators, eta)

5 100 0.1


In [22]:
xgb = XGBClassifier(objective="binary:logistic",
                    max_depth=max_depth,
                    n_estimators=n_estimators,
                    eta=eta,
                    subsample=0.9,
                    colsample_bytree=0.9,
                    seed=18).fit(X_train,y_train)



In [23]:
# f1 prediction on validation set

y_pred = xgb.predict(X_val)
f1_score(y_val, y_pred)

0.5092460881934567

In [24]:
# save XGB model
pickle.dump(xgb, open(base_path+"\\data-scientist-technical-test-main\\data\\auto-insurance-fall-2017\\models\\XGB.pkl", 'wb'))

## catboost

In [25]:
# create pipeline object to standardize data for catboost

pipe = Pipeline(steps=[('CBC', CatBoostClassifier())])

param_grid = dict(CBC__iterations=[50,100,200],
                  CBC__learning_rate=[0.1,0.2],
                  CBC__silent=[True],
                  CBC__depth=[5,7])

grid = GridSearchCV(pipe,
                    param_grid=param_grid,
                    cv=kf, 
                    verbose=0,
                    scoring='f1',
                    return_train_score=True,
                    n_jobs=-1)
grid.fit(X_train, y_train)
print(grid.best_score_)

0.5325305466588336


In [26]:
#show cv results
cv_results = pd.DataFrame(grid.cv_results_)
cv_results = cv_results.sort_values("mean_test_score", ascending=False)
cv_results[['param_CBC__iterations',
            'param_CBC__learning_rate',
            'param_CBC__depth',
            'mean_test_score',
            'std_test_score',
            'rank_test_score']].head(5)

Unnamed: 0,param_CBC__iterations,param_CBC__learning_rate,param_CBC__depth,mean_test_score,std_test_score,rank_test_score
3,100,0.2,5,0.532531,0.016446,1
11,200,0.2,7,0.530823,0.008889,2
9,100,0.2,7,0.528333,0.008278,3
5,200,0.2,5,0.5264,0.021094,4
10,200,0.1,7,0.523968,0.009599,5


In [27]:
iterations=cv_results[['param_CBC__iterations', 'param_CBC__learning_rate', 'param_CBC__depth']].iloc[0,:].values[0]
learning_rate=cv_results[['param_CBC__iterations', 'param_CBC__learning_rate', 'param_CBC__depth']].iloc[0,:].values[1]
depth=cv_results[['param_CBC__iterations', 'param_CBC__learning_rate', 'param_CBC__depth']].iloc[0,:].values[2]
print(iterations, learning_rate, depth)

100 0.2 5


In [28]:
cbc = CatBoostClassifier(iterations=iterations,
                         learning_rate=learning_rate,
                         silent=True,
                         depth=depth).fit(X_train,y_train)

In [29]:
# f1 prediction on validation set

y_pred = cbc.predict(X_val)
f1_score(y_val, y_pred)

0.5276595744680851

In [30]:
# save XGB model
pickle.dump(cbc, open(base_path+"\\data-scientist-technical-test-main\\data\\auto-insurance-fall-2017\\models\\CBC.pkl", 'wb'))