# A simple application of Catboost module with grid search

In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier,CatBoostRegressor, Pool
import random
import datetime
from sklearn import *
from multiprocessing import *



In [2]:
test_data = pd.read_csv('test.csv')
train_data = pd.read_csv('train.csv')
gender_subm = pd.read_csv('gender_submission.csv')

In [3]:
train_data['Cabin_letter'] = train_data.Cabin.str[0:1]
test_data['Cabin_letter'] = test_data.Cabin.str[0:1]

In [4]:
train_data.shape, test_data.shape

((891, 13), (418, 12))

In [5]:
msk = np.random.rand(len(train_data)) < 0.6
train = train_data[msk]
temp = train_data[~msk]

msk2 = np.random.rand(len(temp)) < 0.5
test = temp[msk2]
val = temp[~msk2]

In [6]:
train_data.shape, train.shape, test.shape, val.shape

((891, 13), (516, 13), (181, 13), (194, 13))

In [7]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_letter
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,C
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,E
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,


In [8]:
X_train = train.drop(['PassengerId','Name', 'Ticket', 'Cabin','Survived'], axis=1)
y_train = train['Survived']
X_val = val.drop(['PassengerId','Name', 'Ticket', 'Cabin','Survived'], axis=1)
y_val = val['Survived']
X_test = test.drop(['PassengerId','Name', 'Ticket', 'Cabin','Survived'], axis=1)
y_test = test['Survived']

# Get indices of categorical variables
d_t = X_train.dtypes

list_cat = [x for x in d_t[d_t == object].axes[0]]

print(list_cat)

cat_indices = [X_train.columns.tolist().index(col) for col in list_cat]
print(cat_indices)


['Sex', 'Embarked', 'Cabin_letter']
[1, 6, 7]


In [9]:
# Filling null values with 0

X_train = X_train.fillna(0)
X_test = X_test.fillna(0)
X_val = X_val.fillna(0)

## Grid Search

In [38]:
def catboost_run(X_train,y_train,X_val,y_val,X_test,y_test,cat_indices,n_tr,rsm,lrn_rt,dep,l2_reg,num_split,cat_split,bag_temp):
    
    print("model train start")
    
    t1 = datetime.datetime.now()
    print(t1)
    
    model = CatBoostClassifier(iterations=n_tr,
                               rsm=rsm,
                               learning_rate=lrn_rt, 
                               depth=dep,
                               l2_leaf_reg=l2_reg,
                               bagging_temperature = bag_temp,
                               random_seed=2)
    
    model.fit(X_train, y_train,cat_indices, use_best_model=True, eval_set=(X_val, y_val), verbose=True)
    # Predicitng and calculating performance on test data
    predict_prob = model.predict_proba(X_test)[:,1]

    pred_list = [1 if i > 0.5 else 0 for i in predict_prob.tolist()]

    y_list = y_test.tolist()

    counter = 0
    counter_1 = 0
    for i in range(len(pred_list)):
        if pred_list[i] == y_list[i]:
            counter = counter+1
            if y_list[i] == 1:
                counter_1 = counter_1+1
                
    accuracy = counter/len(pred_list)
    
    return accuracy

In [53]:
# Lists of paramenter values

rsm_pv = [0.5,0.6,0.7,0.8,0.9,1]
print(len(rsm_pv))
lrn_rt_pv = [0.025,0.05,0.075,0.1,0.15]
print(len(lrn_rt_pv))
dep_pv = [4,5,6,7,8]
print(len(dep_pv))
l2_reg_pv = [1,5,10,15,50]
print(len(l2_reg_pv))

6
5
5
5


In [54]:
result_col_list = ['rsm',
                   'learning_rate',
                   'depth',
                   'l2_regularization',
                   'accuracy']

results_df = pd.DataFrame(data=None,columns=result_col_list)

In [55]:
for rsm in rsm_pv:
    for lrn_rt in lrn_rt_pv:
        for dep in dep_pv:
            for l2_reg in l2_reg_pv:
                
                t1 = datetime.datetime.now()
                model = CatBoostClassifier(iterations=100,
                                           rsm=rsm,
                                           learning_rate=lrn_rt, 
                                           depth=dep,
                                           l2_leaf_reg=l2_reg,
                                           random_seed=2)
    
                model.fit(X_train, y_train,cat_indices, use_best_model=True, eval_set=(X_val, y_val),logging_level='Silent')
                # Predicitng and calculating performance on test data
                predict_prob = model.predict_proba(X_test)[:,1]

                pred_list = [1 if i > 0.5 else 0 for i in predict_prob.tolist()]

                y_list = y_test.tolist()

                counter = 0
                for i in range(len(pred_list)):
                    if pred_list[i] == y_list[i]:
                        counter = counter+1
                
                accuracy = counter/len(pred_list)
                
                result_df_temp = pd.DataFrame(data=None,columns=result_col_list)
                
                result_df_temp.loc[0,'rsm'] = rsm
                result_df_temp.loc[0,'learning_rate'] = lrn_rt
                result_df_temp.loc[0,'depth'] = dep
                result_df_temp.loc[0,'l2_regularization'] = l2_reg
                
                result_df_temp.loc[0,'accuracy'] = accuracy

                results_df = results_df.append(result_df_temp)
                
                t2 = datetime.datetime.now()
                
                itr_tm = t2-t1
                
                print("itr_tm = " + str(itr_tm) + ",rsm = " + str(rsm) + ",learning_rate = " + str(lrn_rt) +
                      ",depth = " + str(dep) + ",l2_reg = " + str(l2_reg) + " -> accuracy = " + str(accuracy))

itr_tm = 0:00:03.355946,rsm = 0.5,learning_rate = 0.025,depth = 4,l2_reg = 1 -> accuracy = 0.7624309392265194
itr_tm = 0:00:03.479276,rsm = 0.5,learning_rate = 0.025,depth = 4,l2_reg = 5 -> accuracy = 0.7624309392265194
itr_tm = 0:00:02.782420,rsm = 0.5,learning_rate = 0.025,depth = 4,l2_reg = 10 -> accuracy = 0.7624309392265194
itr_tm = 0:00:02.432484,rsm = 0.5,learning_rate = 0.025,depth = 4,l2_reg = 15 -> accuracy = 0.7679558011049724
itr_tm = 0:00:02.121656,rsm = 0.5,learning_rate = 0.025,depth = 4,l2_reg = 50 -> accuracy = 0.7624309392265194
itr_tm = 0:00:02.737303,rsm = 0.5,learning_rate = 0.025,depth = 5,l2_reg = 1 -> accuracy = 0.7624309392265194
itr_tm = 0:00:02.752341,rsm = 0.5,learning_rate = 0.025,depth = 5,l2_reg = 5 -> accuracy = 0.7679558011049724
itr_tm = 0:00:02.670119,rsm = 0.5,learning_rate = 0.025,depth = 5,l2_reg = 10 -> accuracy = 0.7679558011049724
itr_tm = 0:00:03.089232,rsm = 0.5,learning_rate = 0.025,depth = 5,l2_reg = 15 -> accuracy = 0.7679558011049724
itr_t

itr_tm = 0:00:02.133689,rsm = 0.5,learning_rate = 0.1,depth = 4,l2_reg = 1 -> accuracy = 0.7955801104972375
itr_tm = 0:00:02.428477,rsm = 0.5,learning_rate = 0.1,depth = 4,l2_reg = 5 -> accuracy = 0.7845303867403315
itr_tm = 0:00:02.462565,rsm = 0.5,learning_rate = 0.1,depth = 4,l2_reg = 10 -> accuracy = 0.7900552486187845
itr_tm = 0:00:02.977939,rsm = 0.5,learning_rate = 0.1,depth = 4,l2_reg = 15 -> accuracy = 0.7790055248618785
itr_tm = 0:00:03.016043,rsm = 0.5,learning_rate = 0.1,depth = 4,l2_reg = 50 -> accuracy = 0.7734806629834254
itr_tm = 0:00:02.531750,rsm = 0.5,learning_rate = 0.1,depth = 5,l2_reg = 1 -> accuracy = 0.7900552486187845
itr_tm = 0:00:02.571858,rsm = 0.5,learning_rate = 0.1,depth = 5,l2_reg = 5 -> accuracy = 0.7734806629834254
itr_tm = 0:00:02.848595,rsm = 0.5,learning_rate = 0.1,depth = 5,l2_reg = 10 -> accuracy = 0.7955801104972375
itr_tm = 0:00:02.566844,rsm = 0.5,learning_rate = 0.1,depth = 5,l2_reg = 15 -> accuracy = 0.7900552486187845
itr_tm = 0:00:03.205547

itr_tm = 0:00:03.062144,rsm = 0.6,learning_rate = 0.05,depth = 4,l2_reg = 1 -> accuracy = 0.7790055248618785
itr_tm = 0:00:02.486615,rsm = 0.6,learning_rate = 0.05,depth = 4,l2_reg = 5 -> accuracy = 0.7624309392265194
itr_tm = 0:00:02.571841,rsm = 0.6,learning_rate = 0.05,depth = 4,l2_reg = 10 -> accuracy = 0.7679558011049724
itr_tm = 0:00:02.514689,rsm = 0.6,learning_rate = 0.05,depth = 4,l2_reg = 15 -> accuracy = 0.7679558011049724
itr_tm = 0:00:03.263681,rsm = 0.6,learning_rate = 0.05,depth = 4,l2_reg = 50 -> accuracy = 0.7513812154696132
itr_tm = 0:00:03.543424,rsm = 0.6,learning_rate = 0.05,depth = 5,l2_reg = 1 -> accuracy = 0.7734806629834254
itr_tm = 0:00:04.158059,rsm = 0.6,learning_rate = 0.05,depth = 5,l2_reg = 5 -> accuracy = 0.7679558011049724
itr_tm = 0:00:03.531382,rsm = 0.6,learning_rate = 0.05,depth = 5,l2_reg = 10 -> accuracy = 0.7734806629834254
itr_tm = 0:00:03.496298,rsm = 0.6,learning_rate = 0.05,depth = 5,l2_reg = 15 -> accuracy = 0.7624309392265194
itr_tm = 0:00:

itr_tm = 0:00:02.113618,rsm = 0.6,learning_rate = 0.15,depth = 4,l2_reg = 1 -> accuracy = 0.7734806629834254
itr_tm = 0:00:02.497644,rsm = 0.6,learning_rate = 0.15,depth = 4,l2_reg = 5 -> accuracy = 0.7734806629834254
itr_tm = 0:00:02.221910,rsm = 0.6,learning_rate = 0.15,depth = 4,l2_reg = 10 -> accuracy = 0.7845303867403315
itr_tm = 0:00:01.807891,rsm = 0.6,learning_rate = 0.15,depth = 4,l2_reg = 15 -> accuracy = 0.7845303867403315
itr_tm = 0:00:02.576852,rsm = 0.6,learning_rate = 0.15,depth = 4,l2_reg = 50 -> accuracy = 0.7679558011049724
itr_tm = 0:00:02.607937,rsm = 0.6,learning_rate = 0.15,depth = 5,l2_reg = 1 -> accuracy = 0.8011049723756906
itr_tm = 0:00:02.202859,rsm = 0.6,learning_rate = 0.15,depth = 5,l2_reg = 5 -> accuracy = 0.8011049723756906
itr_tm = 0:00:02.305323,rsm = 0.6,learning_rate = 0.15,depth = 5,l2_reg = 10 -> accuracy = 0.7955801104972375
itr_tm = 0:00:02.220904,rsm = 0.6,learning_rate = 0.15,depth = 5,l2_reg = 15 -> accuracy = 0.7900552486187845
itr_tm = 0:00:

itr_tm = 0:00:02.619969,rsm = 0.7,learning_rate = 0.075,depth = 4,l2_reg = 1 -> accuracy = 0.7790055248618785
itr_tm = 0:00:02.674123,rsm = 0.7,learning_rate = 0.075,depth = 4,l2_reg = 5 -> accuracy = 0.7679558011049724
itr_tm = 0:00:02.808471,rsm = 0.7,learning_rate = 0.075,depth = 4,l2_reg = 10 -> accuracy = 0.7790055248618785
itr_tm = 0:00:02.148768,rsm = 0.7,learning_rate = 0.075,depth = 4,l2_reg = 15 -> accuracy = 0.7679558011049724
itr_tm = 0:00:03.097238,rsm = 0.7,learning_rate = 0.075,depth = 4,l2_reg = 50 -> accuracy = 0.7679558011049724
itr_tm = 0:00:02.809477,rsm = 0.7,learning_rate = 0.075,depth = 5,l2_reg = 1 -> accuracy = 0.7955801104972375
itr_tm = 0:00:03.215548,rsm = 0.7,learning_rate = 0.075,depth = 5,l2_reg = 5 -> accuracy = 0.7845303867403315
itr_tm = 0:00:02.649045,rsm = 0.7,learning_rate = 0.075,depth = 5,l2_reg = 10 -> accuracy = 0.7734806629834254
itr_tm = 0:00:02.380337,rsm = 0.7,learning_rate = 0.075,depth = 5,l2_reg = 15 -> accuracy = 0.7679558011049724
itr_t

itr_tm = 0:00:02.098595,rsm = 0.8,learning_rate = 0.025,depth = 4,l2_reg = 1 -> accuracy = 0.7679558011049724
itr_tm = 0:00:02.297126,rsm = 0.8,learning_rate = 0.025,depth = 4,l2_reg = 5 -> accuracy = 0.7624309392265194
itr_tm = 0:00:02.288099,rsm = 0.8,learning_rate = 0.025,depth = 4,l2_reg = 10 -> accuracy = 0.7624309392265194
itr_tm = 0:00:01.885025,rsm = 0.8,learning_rate = 0.025,depth = 4,l2_reg = 15 -> accuracy = 0.7624309392265194
itr_tm = 0:00:02.088574,rsm = 0.8,learning_rate = 0.025,depth = 4,l2_reg = 50 -> accuracy = 0.7624309392265194
itr_tm = 0:00:02.200896,rsm = 0.8,learning_rate = 0.025,depth = 5,l2_reg = 1 -> accuracy = 0.7679558011049724
itr_tm = 0:00:02.303138,rsm = 0.8,learning_rate = 0.025,depth = 5,l2_reg = 5 -> accuracy = 0.7624309392265194
itr_tm = 0:00:02.504676,rsm = 0.8,learning_rate = 0.025,depth = 5,l2_reg = 10 -> accuracy = 0.7624309392265194
itr_tm = 0:00:02.593917,rsm = 0.8,learning_rate = 0.025,depth = 5,l2_reg = 15 -> accuracy = 0.7679558011049724
itr_t

itr_tm = 0:00:01.934142,rsm = 0.8,learning_rate = 0.1,depth = 4,l2_reg = 1 -> accuracy = 0.7734806629834254
itr_tm = 0:00:01.992299,rsm = 0.8,learning_rate = 0.1,depth = 4,l2_reg = 5 -> accuracy = 0.7734806629834254
itr_tm = 0:00:01.882509,rsm = 0.8,learning_rate = 0.1,depth = 4,l2_reg = 10 -> accuracy = 0.7569060773480663
itr_tm = 0:00:02.077530,rsm = 0.8,learning_rate = 0.1,depth = 4,l2_reg = 15 -> accuracy = 0.7734806629834254
itr_tm = 0:00:02.674109,rsm = 0.8,learning_rate = 0.1,depth = 4,l2_reg = 50 -> accuracy = 0.7679558011049724
itr_tm = 0:00:02.252991,rsm = 0.8,learning_rate = 0.1,depth = 5,l2_reg = 1 -> accuracy = 0.7734806629834254
itr_tm = 0:00:02.343235,rsm = 0.8,learning_rate = 0.1,depth = 5,l2_reg = 5 -> accuracy = 0.7679558011049724
itr_tm = 0:00:02.496640,rsm = 0.8,learning_rate = 0.1,depth = 5,l2_reg = 10 -> accuracy = 0.7624309392265194
itr_tm = 0:00:02.403392,rsm = 0.8,learning_rate = 0.1,depth = 5,l2_reg = 15 -> accuracy = 0.7845303867403315
itr_tm = 0:00:02.268031

itr_tm = 0:00:02.026872,rsm = 0.9,learning_rate = 0.05,depth = 4,l2_reg = 1 -> accuracy = 0.7679558011049724
itr_tm = 0:00:02.722242,rsm = 0.9,learning_rate = 0.05,depth = 4,l2_reg = 5 -> accuracy = 0.7624309392265194
itr_tm = 0:00:03.569497,rsm = 0.9,learning_rate = 0.05,depth = 4,l2_reg = 10 -> accuracy = 0.7679558011049724
itr_tm = 0:00:02.475619,rsm = 0.9,learning_rate = 0.05,depth = 4,l2_reg = 15 -> accuracy = 0.7624309392265194
itr_tm = 0:00:01.865929,rsm = 0.9,learning_rate = 0.05,depth = 4,l2_reg = 50 -> accuracy = 0.7569060773480663
itr_tm = 0:00:02.180801,rsm = 0.9,learning_rate = 0.05,depth = 5,l2_reg = 1 -> accuracy = 0.7513812154696132
itr_tm = 0:00:02.699179,rsm = 0.9,learning_rate = 0.05,depth = 5,l2_reg = 5 -> accuracy = 0.7734806629834254
itr_tm = 0:00:02.316161,rsm = 0.9,learning_rate = 0.05,depth = 5,l2_reg = 10 -> accuracy = 0.7679558011049724
itr_tm = 0:00:02.997974,rsm = 0.9,learning_rate = 0.05,depth = 5,l2_reg = 15 -> accuracy = 0.7734806629834254
itr_tm = 0:00:

itr_tm = 0:00:02.804478,rsm = 0.9,learning_rate = 0.15,depth = 4,l2_reg = 1 -> accuracy = 0.7734806629834254
itr_tm = 0:00:02.800468,rsm = 0.9,learning_rate = 0.15,depth = 4,l2_reg = 5 -> accuracy = 0.7790055248618785
itr_tm = 0:00:02.613971,rsm = 0.9,learning_rate = 0.15,depth = 4,l2_reg = 10 -> accuracy = 0.7955801104972375
itr_tm = 0:00:02.169782,rsm = 0.9,learning_rate = 0.15,depth = 4,l2_reg = 15 -> accuracy = 0.7955801104972375
itr_tm = 0:00:02.094584,rsm = 0.9,learning_rate = 0.15,depth = 4,l2_reg = 50 -> accuracy = 0.7734806629834254
itr_tm = 0:00:02.619988,rsm = 0.9,learning_rate = 0.15,depth = 5,l2_reg = 1 -> accuracy = 0.7734806629834254
itr_tm = 0:00:02.857667,rsm = 0.9,learning_rate = 0.15,depth = 5,l2_reg = 5 -> accuracy = 0.7955801104972375
itr_tm = 0:00:03.768046,rsm = 0.9,learning_rate = 0.15,depth = 5,l2_reg = 10 -> accuracy = 0.7624309392265194
itr_tm = 0:00:03.473263,rsm = 0.9,learning_rate = 0.15,depth = 5,l2_reg = 15 -> accuracy = 0.8121546961325967
itr_tm = 0:00:

itr_tm = 0:00:01.996001,rsm = 1,learning_rate = 0.075,depth = 4,l2_reg = 5 -> accuracy = 0.7624309392265194
itr_tm = 0:00:01.941922,rsm = 1,learning_rate = 0.075,depth = 4,l2_reg = 10 -> accuracy = 0.7734806629834254
itr_tm = 0:00:02.359547,rsm = 1,learning_rate = 0.075,depth = 4,l2_reg = 15 -> accuracy = 0.7734806629834254
itr_tm = 0:00:02.762153,rsm = 1,learning_rate = 0.075,depth = 4,l2_reg = 50 -> accuracy = 0.7679558011049724
itr_tm = 0:00:02.631957,rsm = 1,learning_rate = 0.075,depth = 5,l2_reg = 1 -> accuracy = 0.8011049723756906
itr_tm = 0:00:02.884336,rsm = 1,learning_rate = 0.075,depth = 5,l2_reg = 5 -> accuracy = 0.7734806629834254
itr_tm = 0:00:03.191800,rsm = 1,learning_rate = 0.075,depth = 5,l2_reg = 10 -> accuracy = 0.7734806629834254
itr_tm = 0:00:02.966462,rsm = 1,learning_rate = 0.075,depth = 5,l2_reg = 15 -> accuracy = 0.7624309392265194
itr_tm = 0:00:03.077627,rsm = 1,learning_rate = 0.075,depth = 5,l2_reg = 50 -> accuracy = 0.7679558011049724
itr_tm = 0:00:03.86180

In [58]:
final_result = results_df[results_df.accuracy == max(results_df['accuracy'])]

In [65]:
final_result

Unnamed: 0,rsm,learning_rate,depth,l2_regularization,accuracy
0,0.7,0.15,6,5,0.834254


In [66]:
rsm_f = final_result['rsm'][0]
lrn_rt_f = final_result['learning_rate'][0]
dep_f = final_result['depth'][0]
l2_reg_f = final_result['l2_regularization'][0]

In [67]:
n_tree = 1000
print(str(n_tree) + " TREES")

model = CatBoostClassifier(iterations=n_tree,
                           rsm=rsm_f,
                           learning_rate=lrn_rt_f, 
                           depth=dep_f,
                           l2_leaf_reg=l2_reg_f,
                           random_seed=2)

1000 TREES


In [68]:
t1 = datetime.datetime.now()
print(t1)
model.fit(X_train, y_train,cat_indices, use_best_model=True, eval_set=(X_val, y_val), verbose=True)

t2 = datetime.datetime.now()
print(t2)
print(t2-t1)

2018-06-22 16:11:58.131527
0:	learn: 0.6110867	test: 0.6069245	best: 0.6069245 (0)	total: 40.7ms	remaining: 40.7s
1:	learn: 0.5730936	test: 0.5654800	best: 0.5654800 (1)	total: 65.5ms	remaining: 32.7s
2:	learn: 0.5303060	test: 0.5259213	best: 0.5259213 (2)	total: 107ms	remaining: 35.5s
3:	learn: 0.5087897	test: 0.5053954	best: 0.5053954 (3)	total: 124ms	remaining: 30.9s
4:	learn: 0.4936567	test: 0.4917438	best: 0.4917438 (4)	total: 150ms	remaining: 29.8s
5:	learn: 0.4761769	test: 0.4700884	best: 0.4700884 (5)	total: 168ms	remaining: 27.8s
6:	learn: 0.4643923	test: 0.4546623	best: 0.4546623 (6)	total: 190ms	remaining: 26.9s
7:	learn: 0.4511551	test: 0.4416447	best: 0.4416447 (7)	total: 228ms	remaining: 28.2s
8:	learn: 0.4419853	test: 0.4283042	best: 0.4283042 (8)	total: 257ms	remaining: 28.3s
9:	learn: 0.4382985	test: 0.4223583	best: 0.4223583 (9)	total: 277ms	remaining: 27.4s
10:	learn: 0.4360325	test: 0.4202298	best: 0.4202298 (10)	total: 314ms	remaining: 28.3s
11:	learn: 0.4342776	te

98:	learn: 0.2487021	test: 0.3945139	best: 0.3912945 (87)	total: 3.21s	remaining: 29.2s
99:	learn: 0.2482419	test: 0.3955419	best: 0.3912945 (87)	total: 3.24s	remaining: 29.2s
100:	learn: 0.2467751	test: 0.3935726	best: 0.3912945 (87)	total: 3.27s	remaining: 29.2s
101:	learn: 0.2434702	test: 0.3937780	best: 0.3912945 (87)	total: 3.31s	remaining: 29.1s
102:	learn: 0.2422055	test: 0.3942201	best: 0.3912945 (87)	total: 3.34s	remaining: 29.1s
103:	learn: 0.2411378	test: 0.3940056	best: 0.3912945 (87)	total: 3.39s	remaining: 29.2s
104:	learn: 0.2378540	test: 0.3980913	best: 0.3912945 (87)	total: 3.47s	remaining: 29.6s
105:	learn: 0.2364343	test: 0.4001064	best: 0.3912945 (87)	total: 3.51s	remaining: 29.6s
106:	learn: 0.2356355	test: 0.4017842	best: 0.3912945 (87)	total: 3.54s	remaining: 29.6s
107:	learn: 0.2332864	test: 0.4014579	best: 0.3912945 (87)	total: 3.57s	remaining: 29.5s
108:	learn: 0.2320900	test: 0.4023020	best: 0.3912945 (87)	total: 3.6s	remaining: 29.5s
109:	learn: 0.2320035	te

197:	learn: 0.1623366	test: 0.4143492	best: 0.3912945 (87)	total: 6.97s	remaining: 28.3s
198:	learn: 0.1618571	test: 0.4144152	best: 0.3912945 (87)	total: 7.01s	remaining: 28.2s
199:	learn: 0.1615623	test: 0.4144693	best: 0.3912945 (87)	total: 7.05s	remaining: 28.2s
200:	learn: 0.1614674	test: 0.4142927	best: 0.3912945 (87)	total: 7.08s	remaining: 28.1s
201:	learn: 0.1610773	test: 0.4150821	best: 0.3912945 (87)	total: 7.1s	remaining: 28.1s
202:	learn: 0.1609999	test: 0.4151691	best: 0.3912945 (87)	total: 7.13s	remaining: 28s
203:	learn: 0.1607736	test: 0.4150931	best: 0.3912945 (87)	total: 7.15s	remaining: 27.9s
204:	learn: 0.1605724	test: 0.4156608	best: 0.3912945 (87)	total: 7.18s	remaining: 27.9s
205:	learn: 0.1595550	test: 0.4154483	best: 0.3912945 (87)	total: 7.21s	remaining: 27.8s
206:	learn: 0.1592222	test: 0.4147195	best: 0.3912945 (87)	total: 7.24s	remaining: 27.7s
207:	learn: 0.1583544	test: 0.4151614	best: 0.3912945 (87)	total: 7.27s	remaining: 27.7s
208:	learn: 0.1582147	te

293:	learn: 0.1230908	test: 0.4228567	best: 0.3912945 (87)	total: 10.1s	remaining: 24.3s
294:	learn: 0.1226330	test: 0.4207658	best: 0.3912945 (87)	total: 10.2s	remaining: 24.3s
295:	learn: 0.1223382	test: 0.4199273	best: 0.3912945 (87)	total: 10.2s	remaining: 24.2s
296:	learn: 0.1222950	test: 0.4199523	best: 0.3912945 (87)	total: 10.2s	remaining: 24.2s
297:	learn: 0.1220331	test: 0.4195731	best: 0.3912945 (87)	total: 10.3s	remaining: 24.2s
298:	learn: 0.1219636	test: 0.4195653	best: 0.3912945 (87)	total: 10.3s	remaining: 24.2s
299:	learn: 0.1216897	test: 0.4200505	best: 0.3912945 (87)	total: 10.3s	remaining: 24.1s
300:	learn: 0.1216132	test: 0.4201722	best: 0.3912945 (87)	total: 10.4s	remaining: 24.1s
301:	learn: 0.1213268	test: 0.4197226	best: 0.3912945 (87)	total: 10.4s	remaining: 24s
302:	learn: 0.1211919	test: 0.4198594	best: 0.3912945 (87)	total: 10.4s	remaining: 24s
303:	learn: 0.1211274	test: 0.4196107	best: 0.3912945 (87)	total: 10.5s	remaining: 24s
304:	learn: 0.1204069	test:

390:	learn: 0.0988371	test: 0.4264646	best: 0.3912945 (87)	total: 13.8s	remaining: 21.5s
391:	learn: 0.0986725	test: 0.4269579	best: 0.3912945 (87)	total: 13.9s	remaining: 21.5s
392:	learn: 0.0984541	test: 0.4264451	best: 0.3912945 (87)	total: 13.9s	remaining: 21.5s
393:	learn: 0.0983136	test: 0.4270549	best: 0.3912945 (87)	total: 13.9s	remaining: 21.4s
394:	learn: 0.0981624	test: 0.4268710	best: 0.3912945 (87)	total: 13.9s	remaining: 21.4s
395:	learn: 0.0981248	test: 0.4265250	best: 0.3912945 (87)	total: 14s	remaining: 21.3s
396:	learn: 0.0978899	test: 0.4271349	best: 0.3912945 (87)	total: 14s	remaining: 21.3s
397:	learn: 0.0977645	test: 0.4265609	best: 0.3912945 (87)	total: 14.1s	remaining: 21.3s
398:	learn: 0.0973435	test: 0.4235319	best: 0.3912945 (87)	total: 14.1s	remaining: 21.3s
399:	learn: 0.0969166	test: 0.4234022	best: 0.3912945 (87)	total: 14.2s	remaining: 21.3s
400:	learn: 0.0968670	test: 0.4235154	best: 0.3912945 (87)	total: 14.2s	remaining: 21.2s
401:	learn: 0.0967002	tes

486:	learn: 0.0826468	test: 0.4366055	best: 0.3912945 (87)	total: 17.7s	remaining: 18.6s
487:	learn: 0.0826316	test: 0.4365879	best: 0.3912945 (87)	total: 17.8s	remaining: 18.6s
488:	learn: 0.0825713	test: 0.4364535	best: 0.3912945 (87)	total: 17.8s	remaining: 18.6s
489:	learn: 0.0822003	test: 0.4359430	best: 0.3912945 (87)	total: 17.8s	remaining: 18.6s
490:	learn: 0.0821850	test: 0.4359963	best: 0.3912945 (87)	total: 17.9s	remaining: 18.5s
491:	learn: 0.0821713	test: 0.4362460	best: 0.3912945 (87)	total: 17.9s	remaining: 18.5s
492:	learn: 0.0820986	test: 0.4367132	best: 0.3912945 (87)	total: 17.9s	remaining: 18.4s
493:	learn: 0.0820910	test: 0.4366729	best: 0.3912945 (87)	total: 18s	remaining: 18.4s
494:	learn: 0.0819052	test: 0.4358896	best: 0.3912945 (87)	total: 18s	remaining: 18.3s
495:	learn: 0.0818303	test: 0.4360558	best: 0.3912945 (87)	total: 18s	remaining: 18.3s
496:	learn: 0.0816748	test: 0.4354462	best: 0.3912945 (87)	total: 18s	remaining: 18.3s
497:	learn: 0.0814893	test: 0

581:	learn: 0.0711638	test: 0.4416412	best: 0.3912945 (87)	total: 21.4s	remaining: 15.4s
582:	learn: 0.0711407	test: 0.4414636	best: 0.3912945 (87)	total: 21.5s	remaining: 15.4s
583:	learn: 0.0710490	test: 0.4417311	best: 0.3912945 (87)	total: 21.5s	remaining: 15.3s
584:	learn: 0.0708707	test: 0.4427639	best: 0.3912945 (87)	total: 21.5s	remaining: 15.3s
585:	learn: 0.0697883	test: 0.4398313	best: 0.3912945 (87)	total: 21.6s	remaining: 15.2s
586:	learn: 0.0697192	test: 0.4401072	best: 0.3912945 (87)	total: 21.6s	remaining: 15.2s
587:	learn: 0.0696137	test: 0.4402124	best: 0.3912945 (87)	total: 21.6s	remaining: 15.1s
588:	learn: 0.0693775	test: 0.4408001	best: 0.3912945 (87)	total: 21.6s	remaining: 15.1s
589:	learn: 0.0692492	test: 0.4410793	best: 0.3912945 (87)	total: 21.7s	remaining: 15.1s
590:	learn: 0.0689668	test: 0.4424532	best: 0.3912945 (87)	total: 21.7s	remaining: 15s
591:	learn: 0.0688960	test: 0.4423710	best: 0.3912945 (87)	total: 21.8s	remaining: 15s
592:	learn: 0.0685918	tes

678:	learn: 0.0600102	test: 0.4461180	best: 0.3912945 (87)	total: 24.5s	remaining: 11.6s
679:	learn: 0.0597128	test: 0.4467152	best: 0.3912945 (87)	total: 24.5s	remaining: 11.5s
680:	learn: 0.0596870	test: 0.4466839	best: 0.3912945 (87)	total: 24.6s	remaining: 11.5s
681:	learn: 0.0596423	test: 0.4462555	best: 0.3912945 (87)	total: 24.6s	remaining: 11.5s
682:	learn: 0.0595897	test: 0.4463494	best: 0.3912945 (87)	total: 24.7s	remaining: 11.4s
683:	learn: 0.0594903	test: 0.4468657	best: 0.3912945 (87)	total: 24.7s	remaining: 11.4s
684:	learn: 0.0594505	test: 0.4470407	best: 0.3912945 (87)	total: 24.7s	remaining: 11.4s
685:	learn: 0.0593905	test: 0.4464509	best: 0.3912945 (87)	total: 24.8s	remaining: 11.3s
686:	learn: 0.0593764	test: 0.4463089	best: 0.3912945 (87)	total: 24.8s	remaining: 11.3s
687:	learn: 0.0593528	test: 0.4459164	best: 0.3912945 (87)	total: 24.9s	remaining: 11.3s
688:	learn: 0.0592891	test: 0.4455350	best: 0.3912945 (87)	total: 25s	remaining: 11.3s
689:	learn: 0.0592621	t

772:	learn: 0.0537023	test: 0.4544735	best: 0.3912945 (87)	total: 28s	remaining: 8.23s
773:	learn: 0.0536951	test: 0.4542967	best: 0.3912945 (87)	total: 28.1s	remaining: 8.2s
774:	learn: 0.0536916	test: 0.4543751	best: 0.3912945 (87)	total: 28.1s	remaining: 8.16s
775:	learn: 0.0536479	test: 0.4546475	best: 0.3912945 (87)	total: 28.2s	remaining: 8.13s
776:	learn: 0.0534103	test: 0.4533991	best: 0.3912945 (87)	total: 28.2s	remaining: 8.1s
777:	learn: 0.0533323	test: 0.4536589	best: 0.3912945 (87)	total: 28.3s	remaining: 8.07s
778:	learn: 0.0533248	test: 0.4536637	best: 0.3912945 (87)	total: 28.3s	remaining: 8.03s
779:	learn: 0.0533175	test: 0.4535289	best: 0.3912945 (87)	total: 28.3s	remaining: 8s
780:	learn: 0.0532590	test: 0.4537487	best: 0.3912945 (87)	total: 28.4s	remaining: 7.96s
781:	learn: 0.0532207	test: 0.4535289	best: 0.3912945 (87)	total: 28.4s	remaining: 7.92s
782:	learn: 0.0531807	test: 0.4535230	best: 0.3912945 (87)	total: 28.5s	remaining: 7.89s
783:	learn: 0.0531412	test: 

865:	learn: 0.0483674	test: 0.4610074	best: 0.3912945 (87)	total: 32.1s	remaining: 4.97s
866:	learn: 0.0483454	test: 0.4614055	best: 0.3912945 (87)	total: 32.1s	remaining: 4.93s
867:	learn: 0.0483383	test: 0.4614320	best: 0.3912945 (87)	total: 32.2s	remaining: 4.89s
868:	learn: 0.0483300	test: 0.4613714	best: 0.3912945 (87)	total: 32.2s	remaining: 4.86s
869:	learn: 0.0482932	test: 0.4620619	best: 0.3912945 (87)	total: 32.3s	remaining: 4.82s
870:	learn: 0.0481398	test: 0.4621822	best: 0.3912945 (87)	total: 32.3s	remaining: 4.78s
871:	learn: 0.0481176	test: 0.4626493	best: 0.3912945 (87)	total: 32.3s	remaining: 4.75s
872:	learn: 0.0480578	test: 0.4627898	best: 0.3912945 (87)	total: 32.4s	remaining: 4.71s
873:	learn: 0.0480486	test: 0.4629178	best: 0.3912945 (87)	total: 32.4s	remaining: 4.67s
874:	learn: 0.0480301	test: 0.4625758	best: 0.3912945 (87)	total: 32.5s	remaining: 4.64s
875:	learn: 0.0478578	test: 0.4635222	best: 0.3912945 (87)	total: 32.5s	remaining: 4.6s
876:	learn: 0.0478112	

962:	learn: 0.0424611	test: 0.4708055	best: 0.3912945 (87)	total: 36.6s	remaining: 1.41s
963:	learn: 0.0424032	test: 0.4711484	best: 0.3912945 (87)	total: 36.6s	remaining: 1.37s
964:	learn: 0.0423497	test: 0.4713154	best: 0.3912945 (87)	total: 36.7s	remaining: 1.33s
965:	learn: 0.0422093	test: 0.4714659	best: 0.3912945 (87)	total: 36.7s	remaining: 1.29s
966:	learn: 0.0421754	test: 0.4717885	best: 0.3912945 (87)	total: 36.7s	remaining: 1.25s
967:	learn: 0.0421649	test: 0.4717727	best: 0.3912945 (87)	total: 36.8s	remaining: 1.22s
968:	learn: 0.0421559	test: 0.4718002	best: 0.3912945 (87)	total: 36.8s	remaining: 1.18s
969:	learn: 0.0421264	test: 0.4723802	best: 0.3912945 (87)	total: 36.8s	remaining: 1.14s
970:	learn: 0.0420860	test: 0.4718250	best: 0.3912945 (87)	total: 36.9s	remaining: 1.1s
971:	learn: 0.0419298	test: 0.4712531	best: 0.3912945 (87)	total: 36.9s	remaining: 1.06s
972:	learn: 0.0418477	test: 0.4714365	best: 0.3912945 (87)	total: 37s	remaining: 1.03s
973:	learn: 0.0418100	te

In [70]:
# Predicitng and calculating performance on test data
predict_prob = model.predict_proba(X_test)[:,1]

pred_list = [1 if i > 0.5 else 0 for i in predict_prob.tolist()]

y_list = y_test.tolist()

counter = 0
counter_1 = 0
for i in range(len(pred_list)):
    if pred_list[i] == y_list[i]:
        counter = counter+1
        if y_list[i] == 1:
            counter_1 = counter_1+1

print("Result total = "+str(counter/len(pred_list)))

print("Result survived = "+str(counter_1/sum(pred_list)))

print("Result dead = "+str((counter - counter_1)/(len(pred_list) - sum(pred_list))))

Result total = 0.8342541436464088
Result survived = 0.7868852459016393
Result dead = 0.8583333333333333


In [71]:
test_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_letter
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.2250,,S,
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q,
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0000,,S,
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C,
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.1500,,S,


In [72]:
X_test_data = test_data.drop(['PassengerId','Name', 'Ticket', 'Cabin'], axis=1).fillna(0)

In [73]:
predict_prob_test = model.predict_proba(X_test_data)[:,1]

pred_list_test = [1 if i > 0.5 else 0 for i in predict_prob_test.tolist()]

In [74]:
gender_subm

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [75]:
len(test_data['PassengerId'].tolist()), len(pred_list_test)

(418, 418)

In [76]:
submission = pd.DataFrame({'PassengerID':test_data['PassengerId'].tolist(), 'Survived':pred_list_test})
submission

Unnamed: 0,PassengerID,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
5,897,0
6,898,0
7,899,0
8,900,1
9,901,0


In [77]:
submission.to_csv("submission_aakash.csv",index=False)