In [None]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/96/3b/bb419654adcf7efff42ed8a3f84e50c8f236424b7ed1cc8ccd290852e003/catboost-0.24.4-cp37-none-manylinux1_x86_64.whl (65.7MB)
[K     |████████████████████████████████| 65.7MB 44kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.4


In [None]:
# Colab-only: prompt for a browser upload of the competition CSVs that the
# later cells read from /content (train, test and sample submission).
from google.colab import files
uploaded = files.upload()

Saving sample_submission_QrCyCoT.csv to sample_submission_QrCyCoT.csv
Saving test_YCcRUnU.csv to test_YCcRUnU.csv
Saving train_Df64byy.csv to train_Df64byy.csv


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier, Pool, cv
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import *
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

In [None]:
# Load the raw train and test tables from the uploaded CSV files.
# These raw frames are kept untouched; preprocessing goes through make_df_ready.
train_df = pd.read_csv('/content/train_Df64byy.csv')
test_df = pd.read_csv('/content/test_YCcRUnU.csv')

In [None]:
def make_df_ready(df):
  """Return a model-ready copy of a raw train/test frame, indexed by ``ID``.

  Steps, applied to a new frame (the input is not modified):
    * ``ID`` becomes the index.
    * ``Holding_Policy_Duration``: the open-ended bucket ``'14+'`` is replaced
      by ``'15.0'`` and the column is cast to float.
    * ``Is_Spouse``, ``Accomodation_Type`` and ``Reco_Insurance_Type`` are
      mapped to 0/1 codes; values outside the mapping become NaN.
    * ``Holding_Policy_Type``, ``Reco_Policy_Cat``, ``Region_Code`` and
      ``Health Indicator`` are cast to ``str``.

  Args:
    df: raw frame containing ``ID`` and all the columns listed above.

  Returns:
    The transformed DataFrame.
  """
  binary_encodings = {
      'Is_Spouse': {'No': 1, 'Yes': 0},
      'Accomodation_Type': {'Owned': 1, 'Rented': 0},
      'Reco_Insurance_Type': {'Individual': 1, 'Joint': 0},
  }
  string_columns = (
      'Holding_Policy_Type',
      'Reco_Policy_Cat',
      'Region_Code',
      'Health Indicator',
  )

  # set_index returns a new frame, so the assignments below never touch `df`.
  prepared = df.set_index('ID')

  duration = prepared['Holding_Policy_Duration'].replace('14+', '15.0')
  prepared['Holding_Policy_Duration'] = duration.astype(float)

  for column, encoding in binary_encodings.items():
    prepared[column] = prepared[column].map(encoding)

  for column in string_columns:
    prepared[column] = prepared[column].astype(str)

  return prepared

In [None]:
# Preprocessed training frame (indexed by ID); train_df itself stays raw.
df_ready_train = make_df_ready(train_df)

In [None]:
# Split the prepared frame into the target and the feature matrix.
y = df_ready_train['Response']
X = df_ready_train.drop(columns='Response')

# Disabled experiment: restrict X to a previously selected feature subset.
# X = X[imp_features_96]

# Columns handed to CatBoost as categorical features.
# NOTE(review): make_df_ready also casts 'Holding_Policy_Type' to str, yet it is
# not listed here — confirm whether it is meant to be treated as categorical.
cat_features = [
    'City_Code',
    'Region_Code',
    'Health Indicator',
    'Reco_Policy_Cat',
]

In [None]:
# Base CatBoost settings: used to build `model` and as the starting point for
# `params_to_use` once the grid search has picked learning_rate / depth.
params = {
    'task_type': 'GPU',
    'iterations': 1500,
    'objective': 'Logloss',
    'random_seed': 21,
    'learning_rate': 1,
    'eval_metric': 'AUC:hints=skip_train~false',
}

# Hold out 20% of the labelled rows for a final check; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
    shuffle=True,
)

# The tuning Pool is built from the training split only, so X_test / y_test
# stay unseen by the grid search.
cv_data = X_train.copy()
labels = y_train.values

cv_dataset = Pool(
    data=cv_data,
    label=labels,
    cat_features=cat_features,
)

# Disabled: stand-alone 10-fold cross-validation with the base params.
# scores = cv(cv_dataset,
#             params,
#             fold_count=10, seed=21, partition_random_seed=21,
#             early_stopping_rounds=20)

In [None]:
# Classifier built from the base `params`; the grid search below is run on this instance.
model = CatBoostClassifier(**params)

In [None]:
# Hyper-parameter grid explored by model.grid_search: 4 learning rates x 4 tree
# depths = 16 candidate combinations.
grid = dict(
    learning_rate=[0.1, 0.3, 0.5, 0.8],
    depth=[4, 6, 8, 10],
)

In [None]:
# Run the grid search over `grid` on the training Pool. The returned dict is kept
# in `gs_res`; later cells read its 'cv_results' and 'params' entries.
# NOTE(review): per the CatBoost docs, train_size sets the internal train/eval
# split used to rank candidates and cv sets the fold count for the statistics
# reported for the winner — confirm against the installed version.
gs_res = model.grid_search(param_grid = grid,
                  X=cv_dataset,
                  cv=10,
                  partition_random_seed=21,
                  train_size=0.9
        )

bestTest = 0.7905160189
bestIteration = 1367
0:	loss: 0.7905160	best: 0.7905160 (0)	total: 53.8s	remaining: 13m 27s
bestTest = 0.7895562947
bestIteration = 300
1:	loss: 0.7895563	best: 0.7905160 (0)	total: 1m 47s	remaining: 12m 29s
bestTest = 0.787479043
bestIteration = 24
2:	loss: 0.7874790	best: 0.7905160 (0)	total: 2m 40s	remaining: 11m 36s
bestTest = 0.7858130634
bestIteration = 21
3:	loss: 0.7858131	best: 0.7905160 (0)	total: 3m 33s	remaining: 10m 39s
bestTest = 0.7913115323
bestIteration = 881
4:	loss: 0.7913115	best: 0.7913115 (4)	total: 4m 40s	remaining: 10m 17s
bestTest = 0.7904568911
bestIteration = 231
5:	loss: 0.7904569	best: 0.7913115 (4)	total: 5m 47s	remaining: 9m 38s
bestTest = 0.7882281244
bestIteration = 46
6:	loss: 0.7882281	best: 0.7913115 (4)	total: 6m 54s	remaining: 8m 52s
bestTest = 0.7854394019
bestIteration = 29
7:	loss: 0.7854394	best: 0.7913115 (4)	total: 8m 1s	remaining: 8m 1s
bestTest = 0.7910027504
bestIteration = 355
8:	loss: 0.7910028	best: 0.7913115 (4)

In [None]:
import pickle

In [None]:
import os

# Persist the estimator the grid search was run on.
# The target directory does not exist on a fresh runtime, and open(..., 'wb')
# does not create parent directories, so make it first to avoid FileNotFoundError.
model_path = './saved_model/pickled_model_0.80rocauc.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, 'wb') as f:
  pickle.dump(model, f)

In [None]:
# Show the grid search's 'cv_results' entry as a table for inspection.
pd.DataFrame(gs_res['cv_results'])

Unnamed: 0,iterations,test-AUC-mean,test-AUC-std,train-AUC-mean,train-AUC-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
0,0,0.719504,0.016743,0.715964,0.007486,0.649524,0.003779,0.646936,0.000847
1,1,0.747699,0.012422,0.738478,0.008653,0.612411,0.003911,0.610367,0.001273
2,2,0.753450,0.011398,0.743125,0.006450,0.583244,0.004472,0.581465,0.001364
3,3,0.754887,0.009829,0.744388,0.004111,0.560972,0.003831,0.559756,0.001535
4,4,0.758509,0.008939,0.748781,0.002276,0.543662,0.002314,0.541820,0.001114
...,...,...,...,...,...,...,...,...,...
1495,1495,0.791897,0.008483,0.806508,0.001248,0.446927,0.006761,0.433308,0.001075
1496,1496,0.791895,0.008479,0.806510,0.001250,0.446928,0.006757,0.433307,0.001076
1497,1497,0.791889,0.008477,0.806512,0.001251,0.446935,0.006754,0.433305,0.001077
1498,1498,0.791889,0.008477,0.806513,0.001251,0.446935,0.006754,0.433304,0.001078


In [None]:
# Final configuration: the base settings with the grid-search winners layered on
# top (keys present in both take the tuned value). `params` itself is left untouched.
params_to_use = {**params, **gs_res['params']}

In [None]:
# Fresh classifier configured with the tuned parameters; this is the model used for the submission.
model_2 = CatBoostClassifier(**params_to_use)

In [None]:
# Fit on the training split only (X_test / y_test are not passed), declaring the categorical columns.
model_2.fit(X_train, y_train, cat_features=cat_features)

0:	learn: 0.7277870	total: 95.2ms	remaining: 2m 22s
1:	learn: 0.7420458	total: 146ms	remaining: 1m 49s
2:	learn: 0.7471281	total: 253ms	remaining: 2m 6s
3:	learn: 0.7490038	total: 328ms	remaining: 2m 2s
4:	learn: 0.7532080	total: 366ms	remaining: 1m 49s
5:	learn: 0.7528553	total: 430ms	remaining: 1m 47s
6:	learn: 0.7540019	total: 489ms	remaining: 1m 44s
7:	learn: 0.7559309	total: 541ms	remaining: 1m 40s
8:	learn: 0.7567599	total: 602ms	remaining: 1m 39s
9:	learn: 0.7573599	total: 655ms	remaining: 1m 37s
10:	learn: 0.7579081	total: 709ms	remaining: 1m 35s
11:	learn: 0.7589989	total: 754ms	remaining: 1m 33s
12:	learn: 0.7596897	total: 795ms	remaining: 1m 30s
13:	learn: 0.7601647	total: 836ms	remaining: 1m 28s
14:	learn: 0.7634798	total: 880ms	remaining: 1m 27s
15:	learn: 0.7633603	total: 938ms	remaining: 1m 26s
16:	learn: 0.7672204	total: 992ms	remaining: 1m 26s
17:	learn: 0.7693096	total: 1.05s	remaining: 1m 26s
18:	learn: 0.7708866	total: 1.09s	remaining: 1m 24s
19:	learn: 0.7725716	to

<catboost.core.CatBoostClassifier at 0x7fcf4a231790>

In [None]:
# Hard class predictions on both splits.
# NOTE(review): these come from `model` (the estimator grid_search was called on),
# not from `model_2`, which was fitted just above and produces the submission —
# confirm which model is meant to be evaluated. train_pred / test_pred are not
# referenced by the later cells visible in this notebook.
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

In [None]:
# Class probabilities on both splits; column 1 is used by the metric cells below.
# NOTE(review): computed with `model`, while the submission is produced by
# `model_2` — confirm the evaluated model is the intended one.
train_pred_p = model.predict_proba(X_train)
test_pred_p = model.predict_proba(X_test)

In [None]:
# Decision threshold on the class-1 probability; the classification reports in
# the following cell binarise predictions with it.
thresh = 0.30

# ROC AUC is a ranking metric and must be given the continuous scores. Scoring the
# 0/1 labels obtained at `thresh` collapses the ROC curve to a single operating
# point and reports (TPR + TNR) / 2 at that threshold rather than the AUC.
print(f'Train: {roc_auc_score(y_train, train_pred_p[:, 1])}')
print(f'Test: {roc_auc_score(y_test, test_pred_p[:, 1])}')

Train: 0.797291956568362
Test: 0.743824526955958


In [None]:
# Binarise the class-1 probabilities at `thresh`, then print the per-class
# precision / recall / F1 report for the training split followed by the hold-out split.
train_pred_at_thresh = (train_pred_p[:, 1] > thresh).astype(int)
test_pred_at_thresh = (test_pred_p[:, 1] > thresh).astype(int)

print(classification_report(y_train, train_pred_at_thresh))
print()
print(classification_report(y_test, test_pred_at_thresh))

              precision    recall  f1-score   support

           0       0.93      0.80      0.86     31005
           1       0.56      0.79      0.65      9700

    accuracy                           0.80     40705
   macro avg       0.74      0.80      0.76     40705
weighted avg       0.84      0.80      0.81     40705


              precision    recall  f1-score   support

           0       0.90      0.72      0.80      7668
           1       0.47      0.77      0.59      2509

    accuracy                           0.73     10177
   macro avg       0.69      0.74      0.69     10177
weighted avg       0.80      0.73      0.75     10177



In [None]:
# Apply the same preprocessing to the competition test set as was applied to the training data.
ready_test = make_df_ready(test_df)

In [None]:
# Load the sample submission; its 'Response' column is overwritten with model output below.
submission_df = pd.read_csv('/content/sample_submission_QrCyCoT.csv')

In [None]:
# Fill 'Response' with column 1 of model_2's predict_proba output (presumably P(Response=1)).
# The array is assigned positionally — assumes the sample-submission rows are in the
# same order as test_df; TODO confirm (e.g. by comparing the ID columns).
submission_df['Response'] = ((model_2.predict_proba(ready_test))[:, 1])

In [None]:
# Class balance of the training target, as fractions of all rows.
train_df['Response'].value_counts(normalize=True)

0    0.760053
1    0.239947
Name: Response, dtype: float64

In [None]:
# Relative frequency of each distinct submitted value, for comparison with the training balance.
# NOTE(review): with raw probabilities in 'Response' this lists near-unique floats;
# a 0/1 split like the training one only appears if the values are thresholded first.
submission_df['Response'].value_counts(normalize=True)

0    0.593121
1    0.406879
Name: Response, dtype: float64

In [None]:
# Write the submission file to the working directory, without the DataFrame index column.
submission_df.to_csv('submission next day 517 pm, 80-10, prob only.csv', index=False)