In [1]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/96/3b/bb419654adcf7efff42ed8a3f84e50c8f236424b7ed1cc8ccd290852e003/catboost-0.24.4-cp37-none-manylinux1_x86_64.whl (65.7MB)
[K     |████████████████████████████████| 65.7MB 44kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.4


In [2]:
from google.colab import files
uploaded = files.upload()

Saving sample_submission_QrCyCoT.csv to sample_submission_QrCyCoT.csv
Saving test_YCcRUnU.csv to test_YCcRUnU.csv
Saving train_Df64byy.csv to train_Df64byy.csv


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier, Pool, cv
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import *
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

In [4]:
train_df = pd.read_csv('/content/train_Df64byy.csv')
test_df = pd.read_csv('/content/test_YCcRUnU.csv')

In [35]:
def make_df_ready(df, us=None, ls=None, rps=None, train=True):
  df = df.set_index('ID')
  df['Holding_Policy_Duration'] = df['Holding_Policy_Duration'].replace('14+', '15.0').astype(float)
  df['Is_Spouse'] = df['Is_Spouse'].map({'No': 1, 'Yes': 0})
  df['Accomodation_Type'] = df['Accomodation_Type'].map({'Owned': 1, 'Rented': 0})
  df['Reco_Insurance_Type'] = df['Reco_Insurance_Type'].map({'Individual': 1, 'Joint': 0})
  df['Holding_Policy_Type'] = df['Holding_Policy_Type'].astype(str)
  df['Reco_Policy_Cat'] = df['Reco_Policy_Cat'].astype(str)
  df['Region_Code'] = df['Region_Code'].astype(str)
  df['Health Indicator'] = df['Health Indicator'].astype(str)

  if train:
    upper_age_scaler = MinMaxScaler()
    df['Upper_Age'] = upper_age_scaler.fit_transform(df['Upper_Age'].values.reshape(-1, 1))

    lower_age_scaler = MinMaxScaler()
    df['Lower_Age'] = lower_age_scaler.fit_transform(df['Lower_Age'].values.reshape(-1, 1))

    reco_premium_scaler = MinMaxScaler()
    df['Reco_Policy_Premium'] = reco_premium_scaler.fit_transform(df['Reco_Policy_Premium'].values.reshape(-1, 1))

    return df, upper_age_scaler, lower_age_scaler, reco_premium_scaler
  elif not train:
    df['Upper_Age'] = us.fit_transform(df['Upper_Age'].values.reshape(-1, 1))
    df['Lower_Age'] = ls.fit_transform(df['Lower_Age'].values.reshape(-1, 1))
    df['Reco_Policy_Premium'] = rps.fit_transform(df['Reco_Policy_Premium'].values.reshape(-1, 1))

    return df

In [13]:
df_ready_train, upper_scaler, lower_scaler, premium_scaler = make_df_ready(train_df, train=True)

In [15]:
df_ready_train.head()

Unnamed: 0_level_0,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium,Response
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,C3,3213,0,1,0.315789,0.338983,1,X1,15.0,3.0,22,0.227609,0
2,C5,1117,1,0,1.0,0.101695,1,X2,,,22,0.687356,0
3,C5,3732,1,1,0.245614,0.271186,1,,1.0,1.0,19,0.125881,1
4,C24,4378,1,0,0.596491,0.542373,1,X1,15.0,3.0,19,0.377401,0
5,C8,2190,0,1,0.45614,0.474576,1,X2,3.0,1.0,16,0.197807,0


In [16]:
X = df_ready_train.loc[:, df_ready_train.columns != 'Response']
y = df_ready_train.loc[:, 'Response']
# X = X[imp_features_96]
cat_features=['City_Code', 'Region_Code', 'Health Indicator', 'Reco_Policy_Cat']

In [17]:
params = dict(task_type='GPU',  
iterations=1500,
objective='Logloss', 
random_seed=21,
learning_rate=1,
eval_metric='AUC:hints=skip_train~false')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=21, shuffle=True)

cv_data = X_train.copy()
labels = y_train.values

cv_dataset = Pool(data=cv_data, 
                  label=labels, 
                  cat_features=cat_features)

# scores = cv(cv_dataset,
#             params,
#             fold_count=10, seed=21, partition_random_seed=21,
#             early_stopping_rounds=20)

In [18]:
model = CatBoostClassifier(**params)

In [19]:
grid = {
        'learning_rate': [0.1, 0.3, 0.5, 0.8],
        'depth': [4, 6, 8, 10]
        }

In [20]:
gs_res = model.grid_search(param_grid = grid,
                  X=cv_dataset,
                  cv=10,
                  partition_random_seed=21,
                  train_size=0.9
        )

bestTest = 0.7752071619
bestIteration = 408
0:	loss: 0.7752072	best: 0.7752072 (0)	total: 48.2s	remaining: 12m 3s
bestTest = 0.7749140263
bestIteration = 154
1:	loss: 0.7749140	best: 0.7752072 (0)	total: 1m 36s	remaining: 11m 17s
bestTest = 0.7762673199
bestIteration = 210
2:	loss: 0.7762673	best: 0.7762673 (2)	total: 2m 25s	remaining: 10m 28s
bestTest = 0.7709593475
bestIteration = 55
3:	loss: 0.7709593	best: 0.7762673 (2)	total: 3m 13s	remaining: 9m 39s
bestTest = 0.775190711
bestIteration = 693
4:	loss: 0.7751907	best: 0.7762673 (2)	total: 4m 20s	remaining: 9m 32s
bestTest = 0.7755963206
bestIteration = 59
5:	loss: 0.7755963	best: 0.7762673 (2)	total: 5m 25s	remaining: 9m 2s
bestTest = 0.7736510634
bestIteration = 29
6:	loss: 0.7736511	best: 0.7762673 (2)	total: 6m 30s	remaining: 8m 22s
bestTest = 0.7685105503
bestIteration = 11
7:	loss: 0.7685106	best: 0.7762673 (2)	total: 7m 36s	remaining: 7m 36s
bestTest = 0.775652051
bestIteration = 341
8:	loss: 0.7756521	best: 0.7762673 (2)	tot

In [21]:
pd.DataFrame(gs_res['cv_results'])

Unnamed: 0,iterations,test-AUC-mean,test-AUC-std,train-AUC-mean,train-AUC-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
0,0,0.740122,0.011107,0.733511,0.004529,0.517842,0.004987,0.520812,0.001197
1,1,0.757491,0.010243,0.749295,0.002613,0.482573,0.006307,0.484287,0.001686
2,2,0.769966,0.010096,0.762584,0.003149,0.469101,0.005985,0.470288,0.001519
3,3,0.774579,0.010206,0.768635,0.002302,0.462746,0.006539,0.463905,0.001429
4,4,0.779135,0.009854,0.773933,0.001919,0.457853,0.007134,0.459532,0.001097
...,...,...,...,...,...,...,...,...,...
1495,1495,0.781294,0.010316,0.833825,0.004201,0.466354,0.011572,0.408460,0.003890
1496,1496,0.781298,0.010314,0.833826,0.004201,0.466363,0.011571,0.408459,0.003891
1497,1497,0.781224,0.010328,0.833847,0.004203,0.466468,0.011555,0.408447,0.003894
1498,1498,0.781236,0.010318,0.833859,0.004203,0.466481,0.011536,0.408437,0.003895


In [22]:
params_to_use = params.copy()
params_to_use.update(gs_res['params'])

In [23]:
model_2 = CatBoostClassifier(**params_to_use)

In [24]:
model_2.fit(X_train, y_train, cat_features=cat_features)

0:	learn: 0.7355411	total: 255ms	remaining: 6m 22s
1:	learn: 0.7459025	total: 357ms	remaining: 4m 27s
2:	learn: 0.7542903	total: 492ms	remaining: 4m 5s
3:	learn: 0.7656934	total: 551ms	remaining: 3m 25s
4:	learn: 0.7732602	total: 594ms	remaining: 2m 57s
5:	learn: 0.7766255	total: 834ms	remaining: 3m 27s
6:	learn: 0.7779630	total: 900ms	remaining: 3m 11s
7:	learn: 0.7819407	total: 941ms	remaining: 2m 55s
8:	learn: 0.7831771	total: 1.08s	remaining: 2m 58s
9:	learn: 0.7840120	total: 1.16s	remaining: 2m 52s
10:	learn: 0.7858280	total: 1.55s	remaining: 3m 29s
11:	learn: 0.7861367	total: 1.63s	remaining: 3m 21s
12:	learn: 0.7873271	total: 1.71s	remaining: 3m 15s
13:	learn: 0.7876070	total: 1.75s	remaining: 3m 6s
14:	learn: 0.7879245	total: 1.84s	remaining: 3m 1s
15:	learn: 0.7883965	total: 2.25s	remaining: 3m 28s
16:	learn: 0.7886587	total: 2.29s	remaining: 3m 19s
17:	learn: 0.7894473	total: 2.32s	remaining: 3m 11s
18:	learn: 0.7897798	total: 2.37s	remaining: 3m 5s
19:	learn: 0.7899873	total

<catboost.core.CatBoostClassifier at 0x7fa70f4456d0>

In [25]:
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

In [26]:
train_pred_p = model.predict_proba(X_train)
test_pred_p = model.predict_proba(X_test)

In [31]:
thresh = 0.42

print(f'Train: {roc_auc_score(y_train, (train_pred_p[:, 1] > thresh).astype(int))}')
print(f'Test: {roc_auc_score(y_test, (test_pred_p[:, 1] > thresh).astype(int))}')

Train: 0.7360711145568012
Test: 0.6807517967314255


In [32]:
print(classification_report(y_train, (train_pred_p[:, 1] > thresh).astype(int)))

print()

print(classification_report(y_test, (test_pred_p[:, 1] > thresh).astype(int)))

              precision    recall  f1-score   support

           0       0.87      0.91      0.89     34864
           1       0.66      0.56      0.61     10929

    accuracy                           0.83     45793
   macro avg       0.76      0.74      0.75     45793
weighted avg       0.82      0.83      0.82     45793


              precision    recall  f1-score   support

           0       0.84      0.84      0.84      3809
           1       0.53      0.52      0.52      1280

    accuracy                           0.76      5089
   macro avg       0.68      0.68      0.68      5089
weighted avg       0.76      0.76      0.76      5089



In [36]:
ready_test = make_df_ready(test_df, upper_scaler, lower_scaler, premium_scaler, train=False)

In [39]:
submission_df = pd.read_csv('/content/sample_submission_QrCyCoT.csv')

In [40]:
submission_df['Response'] = ((model_2.predict_proba(ready_test))[:, 1] > thresh).astype(int)

In [41]:
train_df['Response'].value_counts(normalize=True)

0    0.760053
1    0.239947
Name: Response, dtype: float64

In [42]:
submission_df['Response'].value_counts(normalize=True)

0    0.75391
1    0.24609
Name: Response, dtype: float64

In [43]:
submission_df.to_csv('submission 1023pm, 90-10, thresh point42, features normalized.csv', index=False)