# Catboost with parallel gridsearch

In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier,CatBoostRegressor, Pool
import random
import datetime
from sklearn import *
from multiprocessing import *
from sklearn.grid_search import GridSearchCV



In [2]:
test_data = pd.read_csv('test.csv')
train_data = pd.read_csv('train.csv')
gender_subm = pd.read_csv('gender_submission.csv')

In [3]:
train_data['Cabin_letter'] = train_data.Cabin.str[0:1]
test_data['Cabin_letter'] = test_data.Cabin.str[0:1]

In [4]:
train_data.shape, test_data.shape

((891, 13), (418, 12))

In [5]:
msk = np.random.rand(len(train_data)) < 0.6
train = train_data[msk]
temp = train_data[~msk]

msk2 = np.random.rand(len(temp)) < 0.5
test = temp[msk2]
val = temp[~msk2]

In [6]:
X_train = train.drop(['PassengerId','Name', 'Ticket', 'Cabin','Survived'], axis=1)
y_train = train['Survived']
X_val = val.drop(['PassengerId','Name', 'Ticket', 'Cabin','Survived'], axis=1)
y_val = val['Survived']
X_test = test.drop(['PassengerId','Name', 'Ticket', 'Cabin','Survived'], axis=1)
y_test = test['Survived']

# Get indices of categorical variables
d_t = X_train.dtypes

list_cat = [x for x in d_t[d_t == object].axes[0]]

print(list_cat)

cat_indices = [X_train.columns.tolist().index(col) for col in list_cat]
print(cat_indices)


['Sex', 'Embarked', 'Cabin_letter']
[1, 6, 7]


In [7]:
# Filling null values with 0

X_train = X_train.fillna(0)
X_test = X_test.fillna(0)
X_val = X_val.fillna(0)

## Grid Search

In [8]:
def catboost_run(X_train,y_train,X_val,y_val,X_test,y_test,cat_indices,n_tr,rsm,lrn_rt,dep,l2_reg,num_split,cat_split,bag_temp):
    
    print("model train start")
    
    t1 = datetime.datetime.now()
    print(t1)
    
    model = CatBoostClassifier(iterations=n_tr,
                               rsm=rsm,
                               learning_rate=lrn_rt, 
                               depth=dep,
                               l2_leaf_reg=l2_reg,
                               bagging_temperature = bag_temp,
                               random_seed=2)
    
    model.fit(X_train, y_train,cat_indices, use_best_model=True, eval_set=(X_val, y_val), verbose=True)
    # Predicitng and calculating performance on test data
    predict_prob = model.predict_proba(X_test)[:,1]

    pred_list = [1 if i > 0.5 else 0 for i in predict_prob.tolist()]

    y_list = y_test.tolist()

    counter = 0
    counter_1 = 0
    for i in range(len(pred_list)):
        if pred_list[i] == y_list[i]:
            counter = counter+1
            if y_list[i] == 1:
                counter_1 = counter_1+1
                
    accuracy = counter/len(pred_list)
    
    return accuracy

In [9]:
# Lists of paramenter values

rsm_pv = [0.7,0.8,0.9]
print(len(rsm_pv))
lrn_rt_pv = [0.075,0.1,0.15]
print(len(lrn_rt_pv))
dep_pv = [6,7,8]
print(len(dep_pv))
l2_reg_pv = [10,15,50]
print(len(l2_reg_pv))

3
3
3
3


In [10]:
result_col_list = ['rsm',
                   'learning_rate',
                   'depth',
                   'l2_regularization',
                   'accuracy']

results_df = pd.DataFrame(data=None,columns=result_col_list)

In [11]:
t_a = datetime.datetime.now()

cntr_prog = 0
for rsm in rsm_pv:
    for lrn_rt in lrn_rt_pv:
        for dep in dep_pv:
            for l2_reg in l2_reg_pv:
                
                t1 = datetime.datetime.now()
                model = CatBoostClassifier(iterations=100,
                                           rsm=rsm,
                                           learning_rate=lrn_rt, 
                                           depth=dep,
                                           l2_leaf_reg=l2_reg,
                                           random_seed=2)
    
                model.fit(X_train, y_train,cat_indices, use_best_model=True, eval_set=(X_val, y_val),logging_level='Silent')
                # Predicitng and calculating performance on test data
                predict_prob = model.predict_proba(X_test)[:,1]

                pred_list = [1 if i > 0.5 else 0 for i in predict_prob.tolist()]

                y_list = y_test.tolist()

                counter = 0
                for i in range(len(pred_list)):
                    if pred_list[i] == y_list[i]:
                        counter = counter+1
                
                accuracy = counter/len(pred_list)
                
                result_df_temp = pd.DataFrame(data=None,columns=result_col_list)
                
                result_df_temp.loc[0,'rsm'] = rsm
                result_df_temp.loc[0,'learning_rate'] = lrn_rt
                result_df_temp.loc[0,'depth'] = dep
                result_df_temp.loc[0,'l2_regularization'] = l2_reg
                
                result_df_temp.loc[0,'accuracy'] = accuracy

                results_df = results_df.append(result_df_temp)
                
                t2 = datetime.datetime.now()
                
                itr_tm = t2-t1
                
                cntr_prog = cntr_prog+1
                print(str(itr_tm) + " -> " + str(cntr_prog) + "/" + str(3*3*3*3))
                
t_b = datetime.datetime.now()


1/81
2/81
3/81
4/81
5/81
6/81
7/81
8/81
9/81
10/81
11/81
12/81
13/81
14/81
15/81
16/81
17/81
18/81
19/81
20/81
21/81
22/81
23/81
24/81
25/81
26/81
27/81
28/81
29/81
30/81
31/81
32/81
33/81
34/81
35/81
36/81
37/81
38/81
39/81
40/81
41/81
42/81
43/81
44/81
45/81
46/81
47/81
48/81
49/81
50/81
51/81
52/81
53/81
54/81
55/81
56/81
57/81
58/81
59/81
60/81
61/81
62/81
63/81
64/81
65/81
66/81
67/81
68/81
69/81
70/81
71/81
72/81
73/81
74/81
75/81
76/81
77/81
78/81
79/81
80/81
81/81


In [12]:
print("total time taken = " + str(t_b - t_a))

total time taken = 0:05:30.001852


In [13]:
results_df2 = results_df

In [14]:
results_df2 = results_df2.reset_index().drop(columns = ["index"])

final_result = results_df2[results_df2.accuracy == max(results_df2['accuracy'])].reset_index().drop(columns = ["index"]).head(1)

In [15]:
final_result

Unnamed: 0,rsm,learning_rate,depth,l2_regularization,accuracy
0,0.9,0.15,8,15,0.840426


In [16]:
rsm_f = final_result['rsm'][0]
lrn_rt_f = final_result['learning_rate'][0]
dep_f = final_result['depth'][0]
l2_reg_f = final_result['l2_regularization'][0]

In [17]:
n_tree = 1000
print(str(n_tree) + " TREES")

model = CatBoostClassifier(iterations=n_tree,
                           rsm=rsm_f,
                           learning_rate=lrn_rt_f, 
                           depth=dep_f,
                           l2_leaf_reg=l2_reg_f,
                           random_seed=2)

1000 TREES


In [18]:
t1 = datetime.datetime.now()
print(t1)
model.fit(X_train, y_train,cat_indices, use_best_model=True, eval_set=(X_val, y_val), verbose=True)

t2 = datetime.datetime.now()
print(t2)
print(t2-t1)

2018-06-26 08:37:23.603236
0:	learn: 0.6147534	test: 0.6221980	best: 0.6221980 (0)	total: 34.1ms	remaining: 34.1s
1:	learn: 0.5584261	test: 0.5738130	best: 0.5738130 (1)	total: 75.1ms	remaining: 37.5s
2:	learn: 0.5285309	test: 0.5397860	best: 0.5397860 (2)	total: 119ms	remaining: 39.6s
3:	learn: 0.5016046	test: 0.5139010	best: 0.5139010 (3)	total: 137ms	remaining: 34s
4:	learn: 0.4874590	test: 0.5026726	best: 0.5026726 (4)	total: 168ms	remaining: 33.4s
5:	learn: 0.4708541	test: 0.4891581	best: 0.4891581 (5)	total: 212ms	remaining: 35.1s
6:	learn: 0.4580869	test: 0.4825049	best: 0.4825049 (6)	total: 264ms	remaining: 37.4s
7:	learn: 0.4475356	test: 0.4761809	best: 0.4761809 (7)	total: 299ms	remaining: 37s
8:	learn: 0.4428902	test: 0.4713500	best: 0.4713500 (8)	total: 323ms	remaining: 35.5s
9:	learn: 0.4395933	test: 0.4680359	best: 0.4680359 (9)	total: 342ms	remaining: 33.9s
10:	learn: 0.4232031	test: 0.4611367	best: 0.4611367 (10)	total: 395ms	remaining: 35.5s
11:	learn: 0.4156026	test: 

97:	learn: 0.2102334	test: 0.4548989	best: 0.4341052 (54)	total: 4.8s	remaining: 44.2s
98:	learn: 0.2089694	test: 0.4541311	best: 0.4341052 (54)	total: 4.86s	remaining: 44.2s
99:	learn: 0.2076329	test: 0.4531146	best: 0.4341052 (54)	total: 4.91s	remaining: 44.2s
100:	learn: 0.2064497	test: 0.4542185	best: 0.4341052 (54)	total: 4.96s	remaining: 44.1s
101:	learn: 0.2056172	test: 0.4541220	best: 0.4341052 (54)	total: 5.01s	remaining: 44.1s
102:	learn: 0.2043871	test: 0.4558864	best: 0.4341052 (54)	total: 5.06s	remaining: 44s
103:	learn: 0.2038841	test: 0.4560694	best: 0.4341052 (54)	total: 5.1s	remaining: 43.9s
104:	learn: 0.2027739	test: 0.4560589	best: 0.4341052 (54)	total: 5.15s	remaining: 43.9s
105:	learn: 0.2024298	test: 0.4554831	best: 0.4341052 (54)	total: 5.2s	remaining: 43.8s
106:	learn: 0.2017840	test: 0.4554575	best: 0.4341052 (54)	total: 5.26s	remaining: 43.9s
107:	learn: 0.2012183	test: 0.4556191	best: 0.4341052 (54)	total: 5.33s	remaining: 44.1s
108:	learn: 0.1998186	test: 0

191:	learn: 0.1417855	test: 0.4846345	best: 0.4341052 (54)	total: 10.2s	remaining: 42.8s
192:	learn: 0.1409286	test: 0.4853932	best: 0.4341052 (54)	total: 10.2s	remaining: 42.8s
193:	learn: 0.1404674	test: 0.4861688	best: 0.4341052 (54)	total: 10.3s	remaining: 42.8s
194:	learn: 0.1396616	test: 0.4852492	best: 0.4341052 (54)	total: 10.4s	remaining: 42.8s
195:	learn: 0.1389948	test: 0.4853798	best: 0.4341052 (54)	total: 10.4s	remaining: 42.8s
196:	learn: 0.1387214	test: 0.4854065	best: 0.4341052 (54)	total: 10.5s	remaining: 42.7s
197:	learn: 0.1385219	test: 0.4852172	best: 0.4341052 (54)	total: 10.5s	remaining: 42.6s
198:	learn: 0.1381568	test: 0.4864219	best: 0.4341052 (54)	total: 10.6s	remaining: 42.7s
199:	learn: 0.1370426	test: 0.4874183	best: 0.4341052 (54)	total: 10.7s	remaining: 42.7s
200:	learn: 0.1361940	test: 0.4875252	best: 0.4341052 (54)	total: 10.7s	remaining: 42.6s
201:	learn: 0.1359461	test: 0.4873312	best: 0.4341052 (54)	total: 10.8s	remaining: 42.5s
202:	learn: 0.1356032

284:	learn: 0.1084358	test: 0.4967806	best: 0.4341052 (54)	total: 16s	remaining: 40.1s
285:	learn: 0.1082003	test: 0.4972882	best: 0.4341052 (54)	total: 16s	remaining: 40s
286:	learn: 0.1078992	test: 0.4969787	best: 0.4341052 (54)	total: 16.1s	remaining: 39.9s
287:	learn: 0.1077690	test: 0.4966896	best: 0.4341052 (54)	total: 16.1s	remaining: 39.8s
288:	learn: 0.1074734	test: 0.4964409	best: 0.4341052 (54)	total: 16.1s	remaining: 39.7s
289:	learn: 0.1073478	test: 0.4962661	best: 0.4341052 (54)	total: 16.2s	remaining: 39.7s
290:	learn: 0.1072705	test: 0.4963516	best: 0.4341052 (54)	total: 16.2s	remaining: 39.6s
291:	learn: 0.1071086	test: 0.4973484	best: 0.4341052 (54)	total: 16.3s	remaining: 39.5s
292:	learn: 0.1070097	test: 0.4977968	best: 0.4341052 (54)	total: 16.3s	remaining: 39.4s
293:	learn: 0.1068372	test: 0.4978764	best: 0.4341052 (54)	total: 16.4s	remaining: 39.3s
294:	learn: 0.1066864	test: 0.4984438	best: 0.4341052 (54)	total: 16.4s	remaining: 39.3s
295:	learn: 0.1065627	test:

377:	learn: 0.0850488	test: 0.5226243	best: 0.4341052 (54)	total: 21s	remaining: 34.5s
378:	learn: 0.0847768	test: 0.5227351	best: 0.4341052 (54)	total: 21.1s	remaining: 34.5s
379:	learn: 0.0847197	test: 0.5224118	best: 0.4341052 (54)	total: 21.2s	remaining: 34.5s
380:	learn: 0.0844711	test: 0.5226813	best: 0.4341052 (54)	total: 21.2s	remaining: 34.5s
381:	learn: 0.0843831	test: 0.5228165	best: 0.4341052 (54)	total: 21.3s	remaining: 34.5s
382:	learn: 0.0842317	test: 0.5231087	best: 0.4341052 (54)	total: 21.4s	remaining: 34.4s
383:	learn: 0.0840885	test: 0.5224235	best: 0.4341052 (54)	total: 21.4s	remaining: 34.4s
384:	learn: 0.0837470	test: 0.5230421	best: 0.4341052 (54)	total: 21.5s	remaining: 34.3s
385:	learn: 0.0835289	test: 0.5236437	best: 0.4341052 (54)	total: 21.5s	remaining: 34.2s
386:	learn: 0.0834877	test: 0.5237407	best: 0.4341052 (54)	total: 21.5s	remaining: 34.1s
387:	learn: 0.0831136	test: 0.5242520	best: 0.4341052 (54)	total: 21.6s	remaining: 34.1s
388:	learn: 0.0827400	t

472:	learn: 0.0695959	test: 0.5412206	best: 0.4341052 (54)	total: 26.7s	remaining: 29.8s
473:	learn: 0.0692026	test: 0.5413020	best: 0.4341052 (54)	total: 26.8s	remaining: 29.7s
474:	learn: 0.0690329	test: 0.5422561	best: 0.4341052 (54)	total: 26.9s	remaining: 29.7s
475:	learn: 0.0686869	test: 0.5428069	best: 0.4341052 (54)	total: 26.9s	remaining: 29.6s
476:	learn: 0.0684256	test: 0.5427839	best: 0.4341052 (54)	total: 27s	remaining: 29.6s
477:	learn: 0.0681070	test: 0.5429619	best: 0.4341052 (54)	total: 27.1s	remaining: 29.6s
478:	learn: 0.0678903	test: 0.5433504	best: 0.4341052 (54)	total: 27.1s	remaining: 29.5s
479:	learn: 0.0674829	test: 0.5451051	best: 0.4341052 (54)	total: 27.2s	remaining: 29.5s
480:	learn: 0.0673073	test: 0.5452452	best: 0.4341052 (54)	total: 27.3s	remaining: 29.5s
481:	learn: 0.0672395	test: 0.5453317	best: 0.4341052 (54)	total: 27.4s	remaining: 29.4s
482:	learn: 0.0671620	test: 0.5456409	best: 0.4341052 (54)	total: 27.5s	remaining: 29.4s
483:	learn: 0.0669325	t

566:	learn: 0.0569777	test: 0.5700831	best: 0.4341052 (54)	total: 32.7s	remaining: 25s
567:	learn: 0.0569293	test: 0.5699829	best: 0.4341052 (54)	total: 32.8s	remaining: 24.9s
568:	learn: 0.0569111	test: 0.5700731	best: 0.4341052 (54)	total: 32.8s	remaining: 24.9s
569:	learn: 0.0568856	test: 0.5703241	best: 0.4341052 (54)	total: 32.9s	remaining: 24.8s
570:	learn: 0.0568632	test: 0.5704249	best: 0.4341052 (54)	total: 33s	remaining: 24.8s
571:	learn: 0.0567106	test: 0.5700980	best: 0.4341052 (54)	total: 33s	remaining: 24.7s
572:	learn: 0.0566606	test: 0.5708474	best: 0.4341052 (54)	total: 33.1s	remaining: 24.7s
573:	learn: 0.0565454	test: 0.5712815	best: 0.4341052 (54)	total: 33.2s	remaining: 24.6s
574:	learn: 0.0565256	test: 0.5710747	best: 0.4341052 (54)	total: 33.2s	remaining: 24.6s
575:	learn: 0.0565028	test: 0.5711883	best: 0.4341052 (54)	total: 33.3s	remaining: 24.5s
576:	learn: 0.0561642	test: 0.5712607	best: 0.4341052 (54)	total: 33.3s	remaining: 24.4s
577:	learn: 0.0561516	test:

660:	learn: 0.0487891	test: 0.5914508	best: 0.4341052 (54)	total: 38.1s	remaining: 19.6s
661:	learn: 0.0487300	test: 0.5917148	best: 0.4341052 (54)	total: 38.2s	remaining: 19.5s
662:	learn: 0.0482713	test: 0.5949172	best: 0.4341052 (54)	total: 38.2s	remaining: 19.4s
663:	learn: 0.0482549	test: 0.5949962	best: 0.4341052 (54)	total: 38.3s	remaining: 19.4s
664:	learn: 0.0482353	test: 0.5951748	best: 0.4341052 (54)	total: 38.3s	remaining: 19.3s
665:	learn: 0.0482120	test: 0.5952253	best: 0.4341052 (54)	total: 38.4s	remaining: 19.3s
666:	learn: 0.0481824	test: 0.5952109	best: 0.4341052 (54)	total: 38.4s	remaining: 19.2s
667:	learn: 0.0481283	test: 0.5951543	best: 0.4341052 (54)	total: 38.5s	remaining: 19.1s
668:	learn: 0.0480001	test: 0.5951902	best: 0.4341052 (54)	total: 38.5s	remaining: 19.1s
669:	learn: 0.0479418	test: 0.5955748	best: 0.4341052 (54)	total: 38.6s	remaining: 19s
670:	learn: 0.0477536	test: 0.5972556	best: 0.4341052 (54)	total: 38.6s	remaining: 18.9s
671:	learn: 0.0477018	t

755:	learn: 0.0423864	test: 0.6117047	best: 0.4341052 (54)	total: 43.8s	remaining: 14.1s
756:	learn: 0.0423579	test: 0.6117628	best: 0.4341052 (54)	total: 43.9s	remaining: 14.1s
757:	learn: 0.0422235	test: 0.6118759	best: 0.4341052 (54)	total: 44s	remaining: 14s
758:	learn: 0.0421490	test: 0.6133061	best: 0.4341052 (54)	total: 44.1s	remaining: 14s
759:	learn: 0.0420479	test: 0.6129836	best: 0.4341052 (54)	total: 44.1s	remaining: 13.9s
760:	learn: 0.0420352	test: 0.6128479	best: 0.4341052 (54)	total: 44.2s	remaining: 13.9s
761:	learn: 0.0420125	test: 0.6128397	best: 0.4341052 (54)	total: 44.3s	remaining: 13.8s
762:	learn: 0.0419044	test: 0.6139764	best: 0.4341052 (54)	total: 44.4s	remaining: 13.8s
763:	learn: 0.0417511	test: 0.6135502	best: 0.4341052 (54)	total: 44.4s	remaining: 13.7s
764:	learn: 0.0417368	test: 0.6133951	best: 0.4341052 (54)	total: 44.5s	remaining: 13.7s
765:	learn: 0.0416963	test: 0.6139111	best: 0.4341052 (54)	total: 44.5s	remaining: 13.6s
766:	learn: 0.0416651	test:

850:	learn: 0.0375171	test: 0.6279003	best: 0.4341052 (54)	total: 49.5s	remaining: 8.66s
851:	learn: 0.0375128	test: 0.6279099	best: 0.4341052 (54)	total: 49.5s	remaining: 8.6s
852:	learn: 0.0374597	test: 0.6280085	best: 0.4341052 (54)	total: 49.6s	remaining: 8.54s
853:	learn: 0.0374290	test: 0.6281385	best: 0.4341052 (54)	total: 49.7s	remaining: 8.49s
854:	learn: 0.0374074	test: 0.6280148	best: 0.4341052 (54)	total: 49.7s	remaining: 8.43s
855:	learn: 0.0373929	test: 0.6283259	best: 0.4341052 (54)	total: 49.8s	remaining: 8.38s
856:	learn: 0.0373839	test: 0.6281128	best: 0.4341052 (54)	total: 49.9s	remaining: 8.32s
857:	learn: 0.0373468	test: 0.6284468	best: 0.4341052 (54)	total: 49.9s	remaining: 8.26s
858:	learn: 0.0373239	test: 0.6284930	best: 0.4341052 (54)	total: 50s	remaining: 8.2s
859:	learn: 0.0372381	test: 0.6285268	best: 0.4341052 (54)	total: 50s	remaining: 8.14s
860:	learn: 0.0371407	test: 0.6285767	best: 0.4341052 (54)	total: 50.1s	remaining: 8.08s
861:	learn: 0.0371260	test:

944:	learn: 0.0333708	test: 0.6386036	best: 0.4341052 (54)	total: 55s	remaining: 3.2s
945:	learn: 0.0333471	test: 0.6388107	best: 0.4341052 (54)	total: 55.1s	remaining: 3.14s
946:	learn: 0.0333411	test: 0.6387682	best: 0.4341052 (54)	total: 55.1s	remaining: 3.09s
947:	learn: 0.0332933	test: 0.6393761	best: 0.4341052 (54)	total: 55.2s	remaining: 3.03s
948:	learn: 0.0332673	test: 0.6394154	best: 0.4341052 (54)	total: 55.3s	remaining: 2.97s
949:	learn: 0.0332600	test: 0.6393605	best: 0.4341052 (54)	total: 55.4s	remaining: 2.91s
950:	learn: 0.0331184	test: 0.6402000	best: 0.4341052 (54)	total: 55.4s	remaining: 2.85s
951:	learn: 0.0331092	test: 0.6401617	best: 0.4341052 (54)	total: 55.5s	remaining: 2.8s
952:	learn: 0.0331040	test: 0.6400891	best: 0.4341052 (54)	total: 55.5s	remaining: 2.74s
953:	learn: 0.0330596	test: 0.6399129	best: 0.4341052 (54)	total: 55.5s	remaining: 2.68s
954:	learn: 0.0330302	test: 0.6401699	best: 0.4341052 (54)	total: 55.6s	remaining: 2.62s
955:	learn: 0.0330044	tes

In [19]:
# Predicitng and calculating performance on test data
predict_prob = model.predict_proba(X_test)[:,1]

pred_list = [1 if i > 0.5 else 0 for i in predict_prob.tolist()]

y_list = y_test.tolist()

counter = 0
counter_1 = 0
for i in range(len(pred_list)):
    if pred_list[i] == y_list[i]:
        counter = counter+1
        if y_list[i] == 1:
            counter_1 = counter_1+1

print("Result total = "+str(counter/len(pred_list)))

print("Result survived = "+str(counter_1/sum(pred_list)))

print("Result dead = "+str((counter - counter_1)/(len(pred_list) - sum(pred_list))))

Result total = 0.8404255319148937
Result survived = 0.85
Result dead = 0.8359375


In [20]:
from multiprocessing import Pool
from tqdm import tqdm

In [21]:
def catboost_paralllel(param_list):
    
    rsm = param_list[0]
    lrn_rt = param_list[1]
    dep = param_list[2]
    l2_reg = param_list[3]
    
    t1 = datetime.datetime.now()
    model = CatBoostClassifier(iterations=100,
                               rsm=rsm,
                               learning_rate=lrn_rt, 
                               depth=dep,
                               l2_leaf_reg=l2_reg,
                               random_seed=2)

    model.fit(X_train, y_train,cat_indices, use_best_model=True, eval_set=(X_val, y_val),logging_level='Silent')
    # Predicitng and calculating performance on test data
    predict_prob = model.predict_proba(X_test)[:,1]

    pred_list = [1 if i > 0.5 else 0 for i in predict_prob.tolist()]

    y_list = y_test.tolist()

    counter = 0
    for i in range(len(pred_list)):
        if pred_list[i] == y_list[i]:
            counter = counter+1

    accuracy = counter/len(pred_list)

    result_df_temp = pd.DataFrame(data=None,columns=result_col_list)

    result_df_temp.loc[0,'rsm'] = rsm
    result_df_temp.loc[0,'learning_rate'] = lrn_rt
    result_df_temp.loc[0,'depth'] = dep
    result_df_temp.loc[0,'l2_regularization'] = l2_reg

    result_df_temp.loc[0,'accuracy'] = accuracy

    results_df = results_df.append(result_df_temp)

    t2 = datetime.datetime.now()

    itr_tm = t2-t1

    cntr_prog = cntr_prog+1
    print(str(cntr_prog) + "/" + str(3*3*3*3))


In [22]:
param_lists = []
for rsm in rsm_pv:
    for lrn_rt in lrn_rt_pv:
        for dep in dep_pv:
            for l2_reg in l2_reg_pv:
                param_lists.append([rsm,lrn_rt,dep,l2_reg])
                   

In [None]:
pbar = tqdm(total=len(param_lists))
pool = Pool()

t1 = datetime.datetime.now()

pool.imap_unordered(catboost_paralllel, param_lists)
pool.close()
pool.join()
pbar.close()

t2 = datetime.datetime.now()

print(t2-t1)

  0%|          | 0/81 [00:00<?, ?it/s]

In [None]:
if __name__ == '__main__':
    pool = Pool()                        
    pool.map(catboost_paralllel, param_lists) 

In [None]:
import os
os.cpu_count()

In [None]:
results_df2 = results_df

In [None]:
results_df2 = results_df2.reset_index().drop(columns = ["index"])

final_result = results_df2[results_df2.accuracy == max(results_df2['accuracy'])].reset_index().drop(columns = ["index"]).head(1)

In [None]:
final_result

In [None]:
rsm_f = final_result['rsm'][0]
lrn_rt_f = final_result['learning_rate'][0]
dep_f = final_result['depth'][0]
l2_reg_f = final_result['l2_regularization'][0]

In [None]:
n_tree = 1000
print(str(n_tree) + " TREES")

model = CatBoostClassifier(iterations=n_tree,
                           rsm=rsm_f,
                           learning_rate=lrn_rt_f, 
                           depth=dep_f,
                           l2_leaf_reg=l2_reg_f,
                           random_seed=2)

In [None]:
t1 = datetime.datetime.now()
print(t1)
model.fit(X_train, y_train,cat_indices, use_best_model=True, eval_set=(X_val, y_val), verbose=True)

t2 = datetime.datetime.now()
print(t2)
print(t2-t1)

In [None]:
# Predicitng and calculating performance on test data
predict_prob = model.predict_proba(X_test)[:,1]

pred_list = [1 if i > 0.5 else 0 for i in predict_prob.tolist()]

y_list = y_test.tolist()

counter = 0
counter_1 = 0
for i in range(len(pred_list)):
    if pred_list[i] == y_list[i]:
        counter = counter+1
        if y_list[i] == 1:
            counter_1 = counter_1+1

print("Result total = "+str(counter/len(pred_list)))

print("Result survived = "+str(counter_1/sum(pred_list)))

print("Result dead = "+str((counter - counter_1)/(len(pred_list) - sum(pred_list))))

In [None]:
test_data

In [None]:
X_test_data = test_data.drop(['PassengerId','Name', 'Ticket', 'Cabin'], axis=1).fillna(0)

In [None]:
predict_prob_test = model.predict_proba(X_test_data)[:,1]

pred_list_test = [1 if i > 0.5 else 0 for i in predict_prob_test.tolist()]

In [None]:
gender_subm

In [None]:
len(test_data['PassengerId'].tolist()), len(pred_list_test)

In [None]:
submission = pd.DataFrame({'PassengerID':test_data['PassengerId'].tolist(), 'Survived':pred_list_test})
submission

In [None]:
submission.to_csv("submission_aakash.csv",index=False)