In [279]:
import pandas as pd
import matplotlib.pyplot as plt

from tpot import TPOTClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFECV, RFE
import xgboost
from rgf.sklearn import RGFClassifier
import catboost as cb

import numpy as np

In [177]:
df_train = pd.read_csv('dataset/df_train_nonan.csv')
df_test = pd.read_csv('dataset/df_test_nonan.csv')

In [157]:
df_train.head()

Unnamed: 0,id,perc_premium_paid_by_cash_credit,age_in_days,Income,Count_3-6_months_late,Count_6-12_months_late,Count_more_than_12_months_late,application_underwriting_score,no_of_premiums_paid,premium,...,sourcing_channel_A,sourcing_channel_B,sourcing_channel_C,sourcing_channel_D,sourcing_channel_E,residence_area_type_Rural,residence_area_type_Urban,incentives,efforts,improvement
0,110936,0.429,12058,355060,0.0,0.0,0.0,99.02,13,3300,...,0,0,1,0,0,0,1,1650,9.838365,17.204365
1,41492,0.01,21546,315150,0.0,0.0,0.0,99.89,21,18000,...,1,0,0,0,0,0,1,1650,9.838365,17.204365
2,31300,0.917,17531,84140,2.0,3.0,1.0,98.69,7,3300,...,0,0,1,0,0,1,0,1650,9.838365,17.204365
3,19415,0.049,15341,250510,0.0,0.0,0.0,99.57,9,9600,...,1,0,0,0,0,0,1,1650,9.838365,17.204365
4,99379,0.052,31400,198680,0.0,0.0,0.0,99.87,12,9600,...,0,1,0,0,0,0,1,1650,9.838365,17.204365


In [158]:
df_test.head()

Unnamed: 0,id,perc_premium_paid_by_cash_credit,age_in_days,Income,Count_3-6_months_late,Count_6-12_months_late,Count_more_than_12_months_late,application_underwriting_score,no_of_premiums_paid,premium,sourcing_channel_A,sourcing_channel_B,sourcing_channel_C,sourcing_channel_D,sourcing_channel_E,residence_area_type_Rural,residence_area_type_Urban,incentives,efforts,improvement
0,649,0.001,27384,51150,0.0,0.0,0.0,99.89,7,3300,1,0,0,0,0,1,0,1650,9.838365,17.204365
1,81136,0.124,23735,285140,0.0,0.0,0.0,98.93,19,11700,1,0,0,0,0,0,1,1650,9.838365,17.204365
2,70762,1.0,17170,186030,0.0,0.0,0.0,99.067291,2,11700,0,1,0,0,0,0,1,1650,9.838365,17.204365
3,53935,0.198,16068,123540,0.0,0.0,0.0,99.0,11,5400,0,1,0,0,0,1,0,1650,9.838365,17.204365
4,15476,0.041,10591,200020,1.0,0.0,0.0,99.17,14,9600,1,0,0,0,0,1,0,1650,9.838365,17.204365


## Scaling dataset using StandardScaler

In [193]:
X, y = df_train.drop(['id', 'renewal', 'incentives', 'efforts', 'improvement'], axis=1), df_train['renewal']
X_test = df_test.drop(['id', 'incentives', 'efforts', 'improvement'], axis=1)

X_train, X_dev, y_train, y_dev = train_test_split(X,y, random_state=42, test_size=0.1)

In [194]:
scaler = StandardScaler()
X_train_scaler = scaler.fit_transform(X_train)
X_dev_scaler = scaler.transform(X_dev)
X_test_scaler = scaler.transform(X_test)

### Random Forest

In [195]:
rf = RandomForestClassifier(random_state=42)

rf.fit(X_train_scaler, y_train)
y_predict = rf.predict(X_dev_scaler)

print(accuracy_score(y_dev, y_predict))
print(classification_report(y_dev, y_predict))

y_predict = rf.predict_proba(X_dev_scaler)

mse = mean_squared_error(y_dev, y_predict[:,1])
print(np.sqrt(mse))

0.9312546957175056
             precision    recall  f1-score   support

          0       0.48      0.18      0.27       540
          1       0.94      0.99      0.96      7446

avg / total       0.91      0.93      0.92      7986

0.23997558102670768


In [196]:
sorted_index = sorted(range(len(rf.feature_importances_)),key=rf.feature_importances_.__getitem__, reverse=True)

for index in sorted_index:
    print(X.columns[index],":", rf.feature_importances_[index])

perc_premium_paid_by_cash_credit : 0.15461694534543358
Income : 0.1523338757232252
age_in_days : 0.15028427580457987
application_underwriting_score : 0.1347304675407108
no_of_premiums_paid : 0.09134963416677513
Count_6-12_months_late : 0.0808515286179405
premium : 0.07578823979325625
Count_3-6_months_late : 0.04884915191858964
Count_more_than_12_months_late : 0.03914109619479943
sourcing_channel_A : 0.012959639743103019
residence_area_type_Urban : 0.01294335100365233
residence_area_type_Rural : 0.012187245980634285
sourcing_channel_B : 0.011989485412100034
sourcing_channel_C : 0.010328312894777758
sourcing_channel_D : 0.009226642749092382
sourcing_channel_E : 0.0024201071113297487


In [197]:
y_predict_test = rf.predict_proba(X_test_scaler)

y_predict_test

array([[0. , 1. ],
       [0. , 1. ],
       [0. , 1. ],
       ...,
       [0.1, 0.9],
       [0.3, 0.7],
       [0. , 1. ]])

### Decision Tree

In [198]:
rf = DecisionTreeClassifier(random_state=42)

rf.fit(X_train_scaler, y_train)
y_predict = rf.predict(X_dev_scaler)

print(accuracy_score(y_dev, y_predict))
print(classification_report(y_dev, y_predict))

y_predict = rf.predict_proba(X_dev_scaler)

mse = mean_squared_error(y_dev, y_predict[:,1])
print(np.sqrt(mse))

0.8906836964688204
             precision    recall  f1-score   support

          0       0.22      0.24      0.23       540
          1       0.94      0.94      0.94      7446

avg / total       0.89      0.89      0.89      7986

0.3306301612545044


In [199]:
sorted_index = sorted(range(len(rf.feature_importances_)),key=rf.feature_importances_.__getitem__, reverse=True)

for index in sorted_index:
    print(X.columns[index],":", rf.feature_importances_[index])

Income : 0.17363045424721057
age_in_days : 0.15583855946158334
application_underwriting_score : 0.14730706876565489
perc_premium_paid_by_cash_credit : 0.14200178710150888
Count_6-12_months_late : 0.10603330606233485
no_of_premiums_paid : 0.07382480308119793
premium : 0.0685380876867335
Count_3-6_months_late : 0.03425744252332534
Count_more_than_12_months_late : 0.022973320072213397
sourcing_channel_C : 0.013467876886418715
sourcing_channel_B : 0.012953988483990039
sourcing_channel_A : 0.012828780947552382
residence_area_type_Rural : 0.012160532809471868
sourcing_channel_D : 0.011450603715473668
residence_area_type_Urban : 0.009973239154475313
sourcing_channel_E : 0.0027601490008552737


In [200]:
y_predict_test = rf.predict_proba(X_test_scaler)

y_predict_test

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]])

### Logistic Regression 

In [201]:
rf = LogisticRegression(random_state=42)

rf.fit(X_train_scaler, y_train)
y_predict = rf.predict(X_dev_scaler)

print(accuracy_score(y_dev, y_predict))
print(classification_report(y_dev, y_predict))

y_predict = rf.predict_proba(X_dev_scaler)

mse = mean_squared_error(y_dev, y_predict[:,1])
print(np.sqrt(mse))

0.9350112697220135
             precision    recall  f1-score   support

          0       0.59      0.12      0.20       540
          1       0.94      0.99      0.97      7446

avg / total       0.92      0.94      0.91      7986

0.2305120202509368


In [202]:
y_predict_test = rf.predict_proba(X_test_scaler)

y_predict_test

array([[0.0103498 , 0.9896502 ],
       [0.02099453, 0.97900547],
       [0.08842922, 0.91157078],
       ...,
       [0.03859125, 0.96140875],
       [0.25244397, 0.74755603],
       [0.0265359 , 0.9734641 ]])

## Balanced dataset

In [203]:
smote = SMOTE(random_state=42)

In [204]:
X, y = df_train.drop(['id', 'renewal', 'incentives', 'efforts', 'improvement'], axis=1), df_train['renewal']
X_test = df_test.drop(['id', 'incentives', 'efforts', 'improvement'], axis=1)

X_smote, y_smote = smote.fit_sample(X, y)
X_train, X_dev, y_train, y_dev = train_test_split(X_smote,y_smote, random_state=42, test_size=0.05)

In [205]:
scaler = StandardScaler()
X_train_scaler = scaler.fit_transform(X_train)
X_dev_scaler = scaler.transform(X_dev)
X_test_scaler = scaler.transform(X_test)

### Random Forest

In [206]:
rf = RandomForestClassifier(random_state=42)

rf.fit(X_train_scaler, y_train)
y_predict = rf.predict(X_dev_scaler)

print(accuracy_score(y_dev, y_predict))
print(classification_report(y_dev, y_predict))

y_predict = rf.predict_proba(X_dev_scaler)

mse = mean_squared_error(y_dev, y_predict[:,1])
print(np.sqrt(mse))

0.9607266898209992
             precision    recall  f1-score   support

          0       0.97      0.94      0.96      3707
          1       0.95      0.98      0.96      3779

avg / total       0.96      0.96      0.96      7486

0.18100042584044268


In [207]:
sorted_index = sorted(range(len(rf.feature_importances_)),key=rf.feature_importances_.__getitem__, reverse=True)

for index in sorted_index:
    print(X.columns[index],":", rf.feature_importances_[index])

Count_3-6_months_late : 0.20073795825259305
Count_6-12_months_late : 0.15918675930069415
perc_premium_paid_by_cash_credit : 0.09885676625476643
residence_area_type_Rural : 0.09842906989909034
Count_more_than_12_months_late : 0.08235379287001911
sourcing_channel_A : 0.06568018669554018
residence_area_type_Urban : 0.042411486421633976
sourcing_channel_B : 0.041951303659967666
Income : 0.03874153416335981
age_in_days : 0.03465803922773699
application_underwriting_score : 0.033387863007065935
sourcing_channel_C : 0.02860839807367637
no_of_premiums_paid : 0.02804778396839468
sourcing_channel_D : 0.02309160178478569
premium : 0.022615356650687042
sourcing_channel_E : 0.0012420997699885946


In [208]:
y_predict_test = rf.predict_proba(X_test_scaler)

y_predict_test

array([[0. , 1. ],
       [0. , 1. ],
       [0.3, 0.7],
       ...,
       [0.1, 0.9],
       [0.5, 0.5],
       [0. , 1. ]])

### Decision Tree 

In [209]:
rf = DecisionTreeClassifier(random_state=42)

rf.fit(X_train_scaler, y_train)
y_predict = rf.predict(X_dev_scaler)

print(accuracy_score(y_dev, y_predict))
print(classification_report(y_dev, y_predict))

y_predict = rf.predict_proba(X_dev_scaler)

mse = mean_squared_error(y_dev, y_predict[:,1])
print(np.sqrt(mse))

0.9366818060379375
             precision    recall  f1-score   support

          0       0.93      0.94      0.94      3707
          1       0.94      0.93      0.94      3779

avg / total       0.94      0.94      0.94      7486

0.25163106716393846


In [210]:
sorted_index = sorted(range(len(rf.feature_importances_)),key=rf.feature_importances_.__getitem__, reverse=True)

for index in sorted_index:
    print(X.columns[index],":", rf.feature_importances_[index])

Count_3-6_months_late : 0.4389375547664826
Count_6-12_months_late : 0.15832952751188906
residence_area_type_Urban : 0.06807729887658891
Count_more_than_12_months_late : 0.06142526496994757
perc_premium_paid_by_cash_credit : 0.04914845649453826
Income : 0.03847468995870054
age_in_days : 0.034474147834892485
sourcing_channel_A : 0.026875621345528188
application_underwriting_score : 0.026755382362474458
no_of_premiums_paid : 0.025139337324714295
sourcing_channel_C : 0.018563829943100362
premium : 0.01683227318272162
sourcing_channel_B : 0.014746647480688609
residence_area_type_Rural : 0.013111078809117943
sourcing_channel_D : 0.007965217300705861
sourcing_channel_E : 0.0011436718379093027


In [211]:
y_predict_test = rf.predict_proba(X_test_scaler)

y_predict_test

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]])

### Logistic Regression 

In [212]:
rf = LogisticRegression(random_state=42)

rf.fit(X_train_scaler, y_train)
y_predict = rf.predict(X_dev_scaler)

print(accuracy_score(y_dev, y_predict))
print(classification_report(y_dev, y_predict))

y_predict = rf.predict_proba(X_dev_scaler)

mse = mean_squared_error(y_dev, y_predict[:,1])
print(np.sqrt(mse))

0.7934811648410366
             precision    recall  f1-score   support

          0       0.81      0.77      0.79      3707
          1       0.78      0.82      0.80      3779

avg / total       0.79      0.79      0.79      7486

0.3802477409037834


In [213]:
y_predict_test = rf.predict_proba(X_test_scaler)

y_predict_test

array([[0.07035195, 0.92964805],
       [0.1831942 , 0.8168058 ],
       [0.50935089, 0.49064911],
       ...,
       [0.29100037, 0.70899963],
       [0.91857247, 0.08142753],
       [0.244946  , 0.755054  ]])

### Xgboost

In [343]:
rf = xgboost.XGBClassifier(seed=42, max_depth=4, n_estimators=500, learning_rate=0.1, )

rf.fit(X_train_scaler, y_train)
y_predict = rf.predict(X_dev_scaler)

print(accuracy_score(y_dev, y_predict))
print(classification_report(y_dev, y_predict))

y_predict = rf.predict_proba(X_dev_scaler)

mse = mean_squared_error(y_dev, y_predict[:,1])
print(np.sqrt(mse))

  if diff:


0.9632647608869891
             precision    recall  f1-score   support

          0       0.99      0.94      0.96      3707
          1       0.94      0.99      0.96      3779

avg / total       0.96      0.96      0.96      7486

0.16942533395627962


In [332]:
rf = xgboost.XGBClassifier(seed=42, max_depth=5, n_estimators=500, learning_rate=0.1)

rf.fit(X_train_scaler, y_train)
y_predict = rf.predict(X_dev_scaler)

print(accuracy_score(y_dev, y_predict))
print(classification_report(y_dev, y_predict))

y_predict = rf.predict_proba(X_dev_scaler)

mse = mean_squared_error(y_dev, y_predict[:,1])
print(np.sqrt(mse))

  if diff:


0.9647341704515094
             precision    recall  f1-score   support

          0       0.99      0.94      0.96      3707
          1       0.94      0.99      0.97      3779

avg / total       0.97      0.96      0.96      7486

0.16858519993064844


In [345]:
rf = xgboost.XGBClassifier(seed=42, max_depth=6, n_estimators=600, learning_rate=0.1)

rf.fit(X_train_scaler, y_train)
y_predict = rf.predict(X_dev_scaler)

print(accuracy_score(y_dev, y_predict))
print(classification_report(y_dev, y_predict))

y_predict = rf.predict_proba(X_dev_scaler)

mse = mean_squared_error(y_dev, y_predict[:,1])
print(np.sqrt(mse))

  if diff:


0.9640662570130911
             precision    recall  f1-score   support

          0       0.99      0.94      0.96      3707
          1       0.94      0.99      0.97      3779

avg / total       0.97      0.96      0.96      7486

0.16812981152806328


In [297]:
y_predict_test = rf.predict_proba(X_test_scaler)

y_predict_test

array([[7.6580048e-04, 9.9923420e-01],
       [1.3990164e-02, 9.8600984e-01],
       [1.7932320e-01, 8.2067680e-01],
       ...,
       [2.2345960e-02, 9.7765404e-01],
       [2.4842042e-01, 7.5157958e-01],
       [3.3690870e-02, 9.6630913e-01]], dtype=float32)

### Catboost

In [285]:
rf = cb.CatBoostClassifier(random_state=42)
rf.fit(X_train_scaler, y_train)
y_predict = rf.predict(X_dev_scaler)

print(accuracy_score(y_dev, y_predict))
print(classification_report(y_dev, y_predict))

y_predict = rf.predict_proba(X_dev_scaler)

mse = mean_squared_error(y_dev, y_predict[:,1])
print(np.sqrt(mse))

Learning rate set to 0.068142
0:	learn: 0.6131903	total: 51.3ms	remaining: 51.2s
1:	learn: 0.5515006	total: 117ms	remaining: 58.5s
2:	learn: 0.4962048	total: 184ms	remaining: 1m 1s
3:	learn: 0.4484910	total: 252ms	remaining: 1m 2s
4:	learn: 0.3990070	total: 326ms	remaining: 1m 4s
5:	learn: 0.3604275	total: 389ms	remaining: 1m 4s
6:	learn: 0.3420374	total: 449ms	remaining: 1m 3s
7:	learn: 0.3163682	total: 511ms	remaining: 1m 3s
8:	learn: 0.2929434	total: 574ms	remaining: 1m 3s
9:	learn: 0.2808848	total: 635ms	remaining: 1m 2s
10:	learn: 0.2649584	total: 700ms	remaining: 1m 2s
11:	learn: 0.2517639	total: 761ms	remaining: 1m 2s
12:	learn: 0.2444943	total: 820ms	remaining: 1m 2s
13:	learn: 0.2361893	total: 882ms	remaining: 1m 2s
14:	learn: 0.2264388	total: 945ms	remaining: 1m 2s
15:	learn: 0.2196374	total: 1.01s	remaining: 1m 2s
16:	learn: 0.2122357	total: 1.07s	remaining: 1m 1s
17:	learn: 0.2059515	total: 1.13s	remaining: 1m 1s
18:	learn: 0.2002790	total: 1.19s	remaining: 1m 1s
19:	learn:

163:	learn: 0.1148200	total: 11s	remaining: 56.2s
164:	learn: 0.1144359	total: 11.1s	remaining: 56.2s
165:	learn: 0.1142313	total: 11.2s	remaining: 56.2s
166:	learn: 0.1141831	total: 11.2s	remaining: 56s
167:	learn: 0.1140985	total: 11.3s	remaining: 55.9s
168:	learn: 0.1140453	total: 11.4s	remaining: 55.8s
169:	learn: 0.1137805	total: 11.4s	remaining: 55.7s
170:	learn: 0.1135167	total: 11.5s	remaining: 55.6s
171:	learn: 0.1133925	total: 11.5s	remaining: 55.4s
172:	learn: 0.1132892	total: 11.6s	remaining: 55.3s
173:	learn: 0.1132628	total: 11.6s	remaining: 55.2s
174:	learn: 0.1132329	total: 11.7s	remaining: 55s
175:	learn: 0.1131905	total: 11.7s	remaining: 54.9s
176:	learn: 0.1131606	total: 11.8s	remaining: 54.8s
177:	learn: 0.1130544	total: 11.8s	remaining: 54.7s
178:	learn: 0.1128875	total: 11.9s	remaining: 54.6s
179:	learn: 0.1128388	total: 11.9s	remaining: 54.4s
180:	learn: 0.1127175	total: 12s	remaining: 54.3s
181:	learn: 0.1126665	total: 12.1s	remaining: 54.2s
182:	learn: 0.112641

322:	learn: 0.1053263	total: 21.2s	remaining: 44.4s
323:	learn: 0.1052869	total: 21.2s	remaining: 44.3s
324:	learn: 0.1052606	total: 21.3s	remaining: 44.2s
325:	learn: 0.1052335	total: 21.4s	remaining: 44.1s
326:	learn: 0.1052116	total: 21.4s	remaining: 44.1s
327:	learn: 0.1050868	total: 21.5s	remaining: 44s
328:	learn: 0.1050565	total: 21.6s	remaining: 44s
329:	learn: 0.1050308	total: 21.6s	remaining: 43.9s
330:	learn: 0.1049981	total: 21.7s	remaining: 43.9s
331:	learn: 0.1049789	total: 21.8s	remaining: 43.8s
332:	learn: 0.1049531	total: 21.8s	remaining: 43.7s
333:	learn: 0.1049399	total: 21.9s	remaining: 43.7s
334:	learn: 0.1049149	total: 22s	remaining: 43.6s
335:	learn: 0.1048970	total: 22s	remaining: 43.5s
336:	learn: 0.1048777	total: 22.1s	remaining: 43.4s
337:	learn: 0.1048629	total: 22.1s	remaining: 43.3s
338:	learn: 0.1048403	total: 22.2s	remaining: 43.3s
339:	learn: 0.1047846	total: 22.2s	remaining: 43.2s
340:	learn: 0.1047661	total: 22.3s	remaining: 43.1s
341:	learn: 0.104740

481:	learn: 0.1002693	total: 30.4s	remaining: 32.7s
482:	learn: 0.1002483	total: 30.5s	remaining: 32.6s
483:	learn: 0.1002355	total: 30.5s	remaining: 32.5s
484:	learn: 0.1002172	total: 30.6s	remaining: 32.5s
485:	learn: 0.1001712	total: 30.6s	remaining: 32.4s
486:	learn: 0.1001125	total: 30.7s	remaining: 32.3s
487:	learn: 0.0999849	total: 30.7s	remaining: 32.3s
488:	learn: 0.0998814	total: 30.8s	remaining: 32.2s
489:	learn: 0.0998491	total: 30.8s	remaining: 32.1s
490:	learn: 0.0998238	total: 30.9s	remaining: 32s
491:	learn: 0.0997963	total: 31s	remaining: 32s
492:	learn: 0.0997687	total: 31s	remaining: 31.9s
493:	learn: 0.0997535	total: 31.1s	remaining: 31.8s
494:	learn: 0.0997341	total: 31.1s	remaining: 31.8s
495:	learn: 0.0997145	total: 31.2s	remaining: 31.7s
496:	learn: 0.0996959	total: 31.2s	remaining: 31.6s
497:	learn: 0.0996774	total: 31.3s	remaining: 31.5s
498:	learn: 0.0996458	total: 31.3s	remaining: 31.5s
499:	learn: 0.0996302	total: 31.4s	remaining: 31.4s
500:	learn: 0.099543

640:	learn: 0.0960156	total: 39.4s	remaining: 22.1s
641:	learn: 0.0960063	total: 39.5s	remaining: 22s
642:	learn: 0.0959910	total: 39.5s	remaining: 22s
643:	learn: 0.0959803	total: 39.6s	remaining: 21.9s
644:	learn: 0.0959606	total: 39.7s	remaining: 21.8s
645:	learn: 0.0959381	total: 39.7s	remaining: 21.8s
646:	learn: 0.0959169	total: 39.8s	remaining: 21.7s
647:	learn: 0.0958996	total: 39.8s	remaining: 21.6s
648:	learn: 0.0958734	total: 39.9s	remaining: 21.6s
649:	learn: 0.0958612	total: 39.9s	remaining: 21.5s
650:	learn: 0.0958461	total: 40s	remaining: 21.4s
651:	learn: 0.0957654	total: 40s	remaining: 21.4s
652:	learn: 0.0957511	total: 40.1s	remaining: 21.3s
653:	learn: 0.0957375	total: 40.2s	remaining: 21.2s
654:	learn: 0.0957231	total: 40.2s	remaining: 21.2s
655:	learn: 0.0956881	total: 40.3s	remaining: 21.1s
656:	learn: 0.0956707	total: 40.3s	remaining: 21s
657:	learn: 0.0955755	total: 40.4s	remaining: 21s
658:	learn: 0.0955475	total: 40.4s	remaining: 20.9s
659:	learn: 0.0955362	to

799:	learn: 0.0930476	total: 49s	remaining: 12.3s
800:	learn: 0.0930391	total: 49.1s	remaining: 12.2s
801:	learn: 0.0930199	total: 49.2s	remaining: 12.1s
802:	learn: 0.0929975	total: 49.3s	remaining: 12.1s
803:	learn: 0.0929888	total: 49.5s	remaining: 12.1s
804:	learn: 0.0928936	total: 49.6s	remaining: 12s
805:	learn: 0.0928786	total: 49.7s	remaining: 12s
806:	learn: 0.0928672	total: 49.8s	remaining: 11.9s
807:	learn: 0.0928423	total: 49.8s	remaining: 11.8s
808:	learn: 0.0928302	total: 49.9s	remaining: 11.8s
809:	learn: 0.0928094	total: 50s	remaining: 11.7s
810:	learn: 0.0927944	total: 50s	remaining: 11.7s
811:	learn: 0.0927847	total: 50.1s	remaining: 11.6s
812:	learn: 0.0927650	total: 50.1s	remaining: 11.5s
813:	learn: 0.0927563	total: 50.2s	remaining: 11.5s
814:	learn: 0.0927426	total: 50.3s	remaining: 11.4s
815:	learn: 0.0927321	total: 50.3s	remaining: 11.4s
816:	learn: 0.0927133	total: 50.4s	remaining: 11.3s
817:	learn: 0.0927042	total: 50.5s	remaining: 11.2s
818:	learn: 0.0926824	

958:	learn: 0.0902674	total: 58.4s	remaining: 2.5s
959:	learn: 0.0902602	total: 58.5s	remaining: 2.44s
960:	learn: 0.0902404	total: 58.5s	remaining: 2.38s
961:	learn: 0.0902238	total: 58.6s	remaining: 2.31s
962:	learn: 0.0902153	total: 58.6s	remaining: 2.25s
963:	learn: 0.0902028	total: 58.7s	remaining: 2.19s
964:	learn: 0.0901904	total: 58.7s	remaining: 2.13s
965:	learn: 0.0901734	total: 58.8s	remaining: 2.07s
966:	learn: 0.0901520	total: 58.8s	remaining: 2.01s
967:	learn: 0.0901406	total: 58.9s	remaining: 1.95s
968:	learn: 0.0901216	total: 58.9s	remaining: 1.89s
969:	learn: 0.0901137	total: 59s	remaining: 1.82s
970:	learn: 0.0901051	total: 59s	remaining: 1.76s
971:	learn: 0.0900902	total: 59.1s	remaining: 1.7s
972:	learn: 0.0900685	total: 59.1s	remaining: 1.64s
973:	learn: 0.0900578	total: 59.2s	remaining: 1.58s
974:	learn: 0.0900483	total: 59.2s	remaining: 1.52s
975:	learn: 0.0900261	total: 59.3s	remaining: 1.46s
976:	learn: 0.0900132	total: 59.4s	remaining: 1.4s
977:	learn: 0.09000

### Regularized Greedy Forest

In [246]:
parameters = {'max_leaf':[1000,1200,1300,1400,1500,1600,1700,1800,1900,2000],
              'l2':[0.1,0.2,0.3],
              'min_samples_leaf':[5,10]}

rgf = RGFClassifier(verbose=0)
rf = GridSearchCV(estimator=rgf,
                   param_grid=parameters,
                   scoring='neg_mean_squared_error',
                   n_jobs = -1,
                   cv = 3)


rf.fit(X_train_scaler, y_train)
y_predict = rf.predict(X_dev_scaler)

print(accuracy_score(y_dev, y_predict))
print(classification_report(y_dev, y_predict))

y_predict = rf.predict_proba(X_dev_scaler)

mse = mean_squared_error(y_dev, y_predict[:,1])
print(np.sqrt(mse))

0.9460325941757948
             precision    recall  f1-score   support

          0       0.98      0.91      0.94      3707
          1       0.92      0.98      0.95      3779

avg / total       0.95      0.95      0.95      7486

0.20726172588169914


In [247]:
y_predict_test = rf.predict_proba(X_test_scaler)

y_predict_test

array([[0.02294054, 0.97705946],
       [0.0669331 , 0.9330669 ],
       [0.09891968, 0.90108032],
       ...,
       [0.08980986, 0.91019014],
       [0.40677583, 0.59322417],
       [0.08928054, 0.91071946]])

## Recursive Feature Elimination

In [356]:
clf = xgboost.XGBClassifier(seed=42, max_depth=5, n_estimators=500, learning_rate=0.1)
rf = RFE(clf, verbose=2)

rf.fit(X_train_scaler, y_train)
y_predict = rf.predict(X_dev_scaler)

print(accuracy_score(y_dev, y_predict))
print(classification_report(y_dev, y_predict))

y_predict = rf.predict_proba(X_dev_scaler)

mse = mean_squared_error(y_dev, y_predict[:,1])
print(np.sqrt(mse))

Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.


  if diff:


0.9612610205717339
             precision    recall  f1-score   support

          0       0.99      0.93      0.96      3707
          1       0.94      0.99      0.96      3779

avg / total       0.96      0.96      0.96      7486

0.1747671716955661


In [47]:
rf = RandomForestClassifier(random_state=42)
rfe = RFE(rf, verbose=2)
rfe.fit(X_train_scaler, y_train)


y_predict = rfe.predict(X_dev_scaler)

print(accuracy_score(y_dev, y_predict))
print(classification_report(y_dev, y_predict))

y_predict = rfe.predict_proba(X_dev_scaler)

mse = mean_squared_error(y_dev, y_predict[:,1])
print(np.sqrt(mse))

Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
0.9511054705764478
             precision    recall  f1-score   support

          0       0.96      0.94      0.95      7460
          1       0.94      0.96      0.95      7511

avg / total       0.95      0.95      0.95     14971

0.19645682532340594


In [52]:
X.columns

Index(['perc_premium_paid_by_cash_credit', 'age_in_days', 'Income',
       'Count_3-6_months_late', 'Count_6-12_months_late',
       'Count_more_than_12_months_late', 'application_underwriting_score',
       'no_of_premiums_paid', 'premium', 'sourcing_channel_A',
       'sourcing_channel_B', 'sourcing_channel_C', 'sourcing_channel_D',
       'sourcing_channel_E', 'residence_area_type_Rural',
       'residence_area_type_Urban'],
      dtype='object')

In [54]:
best_features = []
for index, i in enumerate(rfe.ranking_):
    if i == 1:
        best_features.append(X.columns[index])

In [55]:
X, y = df_train[best_features], df_train['renewal']
X_test = df_test[best_features]

X_smote, y_smote = smote.fit_sample(X, y)
X_train, X_dev, y_train, y_dev = train_test_split(X_smote,y_smote, random_state=42, test_size=0.1)

In [56]:
scaler = StandardScaler()
X_train_scaler = scaler.fit_transform(X_train)
X_dev_scaler = scaler.transform(X_dev)
X_test_scaler = scaler.transform(X_test)

In [153]:
rf = RandomForestClassifier(random_state=42, max_depth=20)

rf.fit(X_train_scaler, y_train)
y_predict = rf.predict(X_dev_scaler)

print(accuracy_score(y_dev, y_predict))
print(classification_report(y_dev, y_predict))

y_predict = rf.predict_proba(X_dev_scaler)

mse = mean_squared_error(y_dev, y_predict[:,1])
print(np.sqrt(mse))

0.9564520438151216
             precision    recall  f1-score   support

          0       0.98      0.93      0.95      3707
          1       0.94      0.98      0.96      3779

avg / total       0.96      0.96      0.96      7486

0.18413902886108097


In [58]:
y_predict_test = rf.predict_proba(X_test_scaler)

y_predict_test

array([[0. , 1. ],
       [0. , 1. ],
       [0.2, 0.8],
       ...,
       [0. , 1. ],
       [0.3, 0.7],
       [0. , 1. ]])

## Total net worth

In [357]:
X_scaler = scaler.transform(X)
y_predict = rf.predict_proba(X_scaler)
df_train['renewal_proba'] = y_predict[:,1]

In [358]:
y_predict_test = rf.predict_proba(X_test_scaler)
df_test['renewal'] = y_predict_test[:,1]

In [359]:
def rel_incentive_renewal_proba(renewal_proba, premium):
    if renewal_proba > 0:
        return premium * 0.0005
    else:
        return 0

def rel_efforts_incentive(incentive):
    return 10 * (1 - np.exp(-incentive / 400))

def rel_improvement_efforts(efforts):
    return 20 * (1 - np.exp(-efforts / 5))

df_train['incentives'] = df_train.apply(lambda x: rel_incentive_renewal_proba(x['renewal_proba'], x['premium']), axis=1)
df_train['efforts'] = df_train['incentives'].apply(rel_efforts_incentive)
df_train['improvement'] = df_train['efforts'].apply(rel_improvement_efforts)


df_test['incentives'] = df_test.apply(lambda x: rel_incentive_renewal_proba(x['renewal'], x['premium']),axis=1)
df_test['efforts'] = df_test['incentives'].apply(rel_efforts_incentive)
df_test['improvement'] = df_test['efforts'].apply(rel_improvement_efforts)

In [360]:
# total_net_revenue = np.sum((pBenchmark + pImprovement) * premium - incentives)
 
np.sum((df_train['renewal_proba'] - (df_train['improvement'] * df_train['renewal_proba'])) * df_train['premium'] - df_train['incentives'])

75171148.7343645

In [361]:
# total_net_revenue = np.sum((pBenchmark + pImprovement) * premium - incentives)
 
np.sum((df_test['renewal'] - (df_test['improvement'] * df_test['renewal'])) * df_test['premium'] - df_test['incentives'])

35611265.32165004

## Save to submission

In [362]:
df_submission = pd.read_csv('dataset/sample_submission_sLex1ul.csv')

In [363]:
df_submission['renewal'] = df_test['renewal']

In [364]:
df_submission['incentives'] = df_test['incentives']

In [365]:
df_submission

Unnamed: 0,id,renewal,incentives
0,649,0.997305,1.65
1,81136,0.958626,5.85
2,70762,0.776334,5.85
3,53935,0.966062,2.70
4,15476,0.959334,4.80
5,64797,0.989407,5.85
6,67412,0.748790,1.65
7,44241,0.695362,2.70
8,5069,0.992151,6.90
9,16615,0.992386,14.25


In [366]:
df_submission.to_csv('dataset/submission19.csv', index=False)