In [1]:
import math as math
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
import datetime
import random as random
from copy import deepcopy
from tqdm import tqdm_notebook as tqdm
from multiprocessing import Pool
import time

In [2]:
# data preparation
# Seed for the random train/test/val split so that Restart & Run All
# reproduces the same partitions (the original split was unseeded).
SPLIT_SEED = 42

# Columns removed from every feature matrix: identifiers / free text plus
# the 'Survived' target itself.
NON_FEATURE_COLS = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived']

test_data = pd.read_csv('test.csv')
train_data = pd.read_csv('train.csv')
gender_subm = pd.read_csv('gender_submission.csv')

# Keep only the first character of Cabin as a coarser categorical feature;
# rows with a missing Cabin stay missing here.
train_data['Cabin_letter'] = train_data.Cabin.str[0:1]
test_data['Cabin_letter'] = test_data.Cabin.str[0:1]

# Local generator: does not touch numpy's global random state.
split_rng = np.random.RandomState(SPLIT_SEED)

# Roughly 60% of the labelled rows go to `train`; the exact count varies
# because each row is assigned independently.
msk = split_rng.rand(len(train_data)) < 0.6
train = train_data[msk]
temp = train_data[~msk]

# The remaining rows are split roughly in half into `test` and `val`.
msk2 = split_rng.rand(len(temp)) < 0.5
test = temp[msk2]
val = temp[~msk2]

X_train = train.drop(NON_FEATURE_COLS, axis=1)
y_train = train['Survived']
X_val = val.drop(NON_FEATURE_COLS, axis=1)
y_val = val['Survived']
X_test = test.drop(NON_FEATURE_COLS, axis=1)
y_test = test['Survived']

In [3]:
# Find the object-dtype (categorical) feature columns and their positions
d_t = X_train.dtypes

# Names of every column whose dtype is `object`, in column order
list_cat = d_t.index[(d_t == object).values].tolist()
print(list_cat)

# Positional index of each categorical column within X_train
column_order = list(X_train.columns)
cat_indices = [column_order.index(name) for name in list_cat]
print(cat_indices)

['Sex', 'Embarked', 'Cabin_letter']
[1, 6, 7]


In [4]:
# Replace every missing value with 0 in all three feature frames

X_train, X_test, X_val = (
    frame.fillna(0) for frame in (X_train, X_test, X_val)
)

In [5]:
# Number of boosting iterations: passed as `iterations=` to every
# CatBoostClassifier built by random_search_catboost.
n_tree = 100

In [6]:
# Search space, one entry per hyper-parameter, in the format
# [parameter name, minimum value, maximum value, decimal precision].
# The last field is the number of decimal places the sampled value is
# rounded to (0 -> whole numbers); it is not a step size.
num_param_list = [['rsm',0.5,1,2],
                  ['depth',4,10,0],
                  ['learning_rate',0.01,0.5,2],
                  ['l2_leaf_reg',1,100,0]]

In [7]:
# Tabular form of the search space: one row per hyper-parameter
param_df = pd.DataFrame(
    data=num_param_list,
    columns=['parameter_name', 'min_val', 'max_val', 'decimal_precision'],
)

In [8]:
# Column layout of the result table: 'dna_str', then one column per
# hyper-parameter name, then 'accuracy'
hyper_param_list = param_df['parameter_name'].tolist()
result_col_list = ['dna_str', *hyper_param_list, 'accuracy']

In [9]:
t_a = datetime.datetime.now()

def random_search_catboost(param_df):
    """Sample one random hyper-parameter combination and score it.

    For every row of ``param_df`` a value is drawn uniformly between
    ``min_val`` and ``max_val`` and rounded to ``decimal_precision`` decimal
    places. If that exact combination is already present in the module-level
    ``result_df``, its stored fitness is reused; otherwise a
    ``CatBoostClassifier`` is trained on ``X_train``/``y_train`` and scored
    on ``X_test``/``y_test``.

    Relies on these module-level names: ``result_df``, ``n_tree``,
    ``X_train``, ``y_train``, ``X_test``, ``y_test``, ``cat_indices``.

    Parameters
    ----------
    param_df : pandas.DataFrame
        Search space with the columns ``parameter_name``, ``min_val``,
        ``max_val`` and ``decimal_precision``.

    Returns
    -------
    dict
        ``{parameter_name: sampled_value, ..., 'fitness': accuracy}`` where
        accuracy is the fraction of correct predictions at a 0.5 threshold.
    """
    res_dict = {}

    # itertuples does not depend on the frame having a default 0..n-1 index
    # (the previous param_df.loc[i] lookup did).
    for row in param_df.itertuples(index=False):
        precision = int(row.decimal_precision)
        param_val = round(random.uniform(row.min_val, row.max_val), precision)
        if precision <= 0:
            # round(x, 0) returns a float such as 7.0; whole-number
            # parameters (e.g. depth) are handed to CatBoost as ints.
            param_val = int(param_val)
        res_dict[row.parameter_name] = param_val

    parameter_list = list(res_dict)
    param_val_list = list(res_dict.values())

    # Boolean mask over previously scored rows matching this combination.
    already_scored = (result_df[parameter_list] == np.array(param_val_list)).all(axis=1)

    if already_scored.any():
        # Reuse the stored score instead of retraining. The old lookup built
        # a query string ending in " and " and indexed with a DataFrame, so
        # it failed whenever this branch was reached.
        accuracy = result_df.loc[already_scored, 'fitness'].tolist()[0]
    else:
        # Keyword arguments replace the former eval() on a concatenated string.
        model = CatBoostClassifier(iterations=n_tree,
                                   random_seed=2,
                                   logging_level='Silent',
                                   **res_dict)
        model.fit(X_train, y_train, cat_features=cat_indices)

        # Predicting and calculating performance on test data
        # NOTE(review): candidates are scored on X_test while the X_val/y_val
        # split prepared earlier is not used in the visible cells -- confirm
        # which hold-out set is meant for model selection.
        predict_prob = model.predict_proba(X_test)[:, 1]
        predicted = (predict_prob > 0.5).astype(int)
        actual = np.asarray(y_test)

        accuracy = int((predicted == actual).sum()) / len(predicted)

    res_dict['fitness'] = accuracy

    return res_dict

In [None]:
t1 = datetime.datetime.now()

# number of random guesses
num_rand = 100

# Seed Python's `random` module (used by random_search_catboost to draw
# parameter values) so the search is reproducible on re-run.
search_seed = 42
random.seed(search_seed)

result_col_list = list(param_df['parameter_name']) + ['fitness']

# random_search_catboost reads the module-level `result_df` to look up
# combinations that were already scored, so it must exist (with the final
# column layout) before the first call. This replaces the earlier warm-up
# call whose trained model and result were discarded.
result_df = pd.DataFrame(columns=result_col_list)

results = []

# The context manager closes the progress bar even if an iteration raises.
with tqdm(total=num_rand) as pbar1:
    for _ in range(num_rand):
        res_t = random_search_catboost(param_df)
        results.append(res_t)
        pbar1.update(1)

# Build the result table once from the collected dicts (DataFrame.append was
# removed in pandas 2.0), drop repeated rows and rank by fitness.
result_df = (
    pd.DataFrame(results, columns=result_col_list)
    .drop_duplicates()
    .sort_values(by='fitness', ascending=False)
    .reset_index(drop=True)
)

best_result = result_df.head(1)

print(best_result)

t2 = datetime.datetime.now()

print("total time -> " + str(t2-t1))
    

HBox(children=(IntProgress(value=0), HTML(value='')))