In [1]:
import math as math
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
import datetime
import random as random
from copy import deepcopy
from tqdm import tqdm_notebook as tqdm

In [2]:
# data preparation
test_data = pd.read_csv('test.csv')
train_data = pd.read_csv('train.csv')
gender_subm = pd.read_csv('gender_submission.csv')

train_data['Cabin_letter'] = train_data.Cabin.str[0:1]
test_data['Cabin_letter'] = test_data.Cabin.str[0:1]

train_data.shape, test_data.shape

msk = np.random.rand(len(train_data)) < 0.6
train = train_data[msk]
temp = train_data[~msk]

msk2 = np.random.rand(len(temp)) < 0.5
test = temp[msk2]
val = temp[~msk2]

X_train = train.drop(['PassengerId','Name', 'Ticket', 'Cabin','Survived'], axis=1)
y_train = train['Survived']
X_val = val.drop(['PassengerId','Name', 'Ticket', 'Cabin','Survived'], axis=1)
y_val = val['Survived']
X_test = test.drop(['PassengerId','Name', 'Ticket', 'Cabin','Survived'], axis=1)
y_test = test['Survived']

In [3]:
# Get indices of categorical variables
d_t = X_train.dtypes

list_cat = [x for x in d_t[d_t == object].axes[0]]

print(list_cat)

cat_indices = [X_train.columns.tolist().index(col) for col in list_cat]
print(cat_indices)

['Sex', 'Embarked', 'Cabin_letter']
[1, 6, 7]


In [4]:
# Filling null values with 0

X_train = X_train.fillna(0)
X_test = X_test.fillna(0)
X_val = X_val.fillna(0)

In [5]:
# function which gives the number of bits required for representing a parameter in the individual genome
def num_trait_bin_len(min_val,max_val,step):
    num_val = math.ceil((max_val-min_val)/step + 1)
#    print(num_val)
    num_val_bin = str(bin(num_val))[2:]
#    print(num_val_bin)
    num_val_bin_len = len(num_val_bin)
    return num_val_bin_len

# testing trait_bin_len function
#print(num_trait_bin_len(0.01,0.55,0.02))

In [6]:
# function for converting a binary string value from a given individual into real value
def bin2num(min_val,max_val,decimal_precision,bin_str):
    Nbit = len(bin_str)
    Dval = int(bin_str,2)
    
    real_val = round(min_val + ((max_val-min_val)/((2**Nbit)-1))*Dval,decimal_precision)
    return real_val

In [7]:
# population initialization function
def dna_initialization(param_df):
    dna_str = ""
    for i in range(param_df.shape[0]):
        dna_str = dna_str+bin(np.random.randint(2**param_df.loc[i]['nBits']))[2:].zfill(int(param_df.loc[i]['nBits']))
    return dna_str

In [8]:
# function for roulette wheel selection
def roullete_selection(pop_df, n_select):
    pop_df2 = deepcopy(pop_df)
    pop_df2 = pop_df2.sort_values(by='fitness', axis=0, ascending=False)    
    pop_df2['fit_cum'] = pop_df2['fitness'].cumsum()    
    fit_sum = pop_df2['fitness'].sum()    
    return_selection = pd.DataFrame(columns=pop_df.columns.tolist())
    
    for i in range(n_select):
        rand_num = random.uniform(0, fit_sum)
        selection = pop_df2[pop_df2['fit_cum'] >= rand_num].head(1)
        return_selection = return_selection.append(selection[return_selection.columns.tolist()])
    
    return return_selection

In [9]:
# function for ranking selection
def ranking_selection(pop_df, n_select):
    pop_df2 = deepcopy(pop_df)
    pop_df2['fitness_2'] = 1/pop_df2['fitness'].rank(ascending=False)
    pop_df2['fitness'] = pop_df2['fitness_2']
    
    return_selection = roullete_selection(pop_df2[pop_df.columns.tolist()],n_select)
    
    return return_selection

In [10]:
# function for tournament selection
def tournament_selection(pop_df, n_select, tournament_size):
    pop_df2 = deepcopy(pop_df)
    return_selection = pd.DataFrame(columns=pop_df.columns.tolist())
    for i in range(n_select):
        tournament_pool = pop_df2.sample(n=tournament_size)
        winner = tournament_pool.sort_values(by='fitness', axis=0, ascending=False).head(1) 
        return_selection = return_selection.append(winner)
    
    return return_selection

In [11]:
# random bit mutation function
def mutation(dna,p_mut):
    dna2 = ''
    for i in range(len(dna)):
        if random.random() < p_mut:
            if dna[i] == '1':
                dna2 = dna2 + '0'
            else:
                dna2 = dna2 + '1'
        else:
            dna2 = dna2 + dna[i]
    return dna

In [12]:
# function for uniform crossover
def uniform_crossover(par1,par2):
    child = ''
    for i in range(len(par1)):
        if random.random() < 0.5:
            child = child + par1[i]
        else:
            child = child + par2[i]
    return mutation(child,p_mut)
     

In [13]:
def fit_prop_crossover(par1,fit1,par2,fit2):
    prob_par1 = fit1/(fit1+fit2)
    child = ''
    for i in range(len(par1)):
        if random.random() < prob_par1:
            child = child + par1[i]
        else:
            child = child + par2[i]
    return mutation(child,p_mut)

In [14]:
# GA parameters
n_pop = 100             # population size
n_gen = 100              # number of generations
p_mut = 0.05           # mutation probability
p_cross = 0.5          # crossover probability
child_ratio = 0.5      # share of children to be included in each generation
tournament_size = 4    # size pool for tournament selection, needed only in case of tournament selection

In [15]:
# catboost parameter
n_tree = 100

In [16]:
# list of parameters in format [parameter name, minimum value, maximum value, step size]
num_param_list = [['rsm',0.5,1,0.05,2],
                  ['depth',4,10,1,0],
                  ['learning_rate',0.01,0.5,0.01,2],
                  ['l2_leaf_reg',1,100,1,0]]

In [17]:
param_df = pd.DataFrame(num_param_list, 
                        columns=['parameter_name', 
                                 'min_val',
                                 'max_val',
                                 'step_size',
                                 'decimal_precision'])

In [18]:
for i in range(param_df.shape[0]):
    nBits = num_trait_bin_len(param_df.loc[i]['min_val'],
                              param_df.loc[i]['max_val'],
                              param_df.loc[i]['step_size'])
    param_df.loc[i,'nBits'] = nBits

dna_len = int(param_df['nBits'].sum())

In [19]:
# result collection df
hyper_param_list = list(param_df['parameter_name'])
result_col_list = ['dna_str'] + hyper_param_list + ['accuracy']

#results_df = pd.DataFrame(data=None,columns=result_col_list)

In [20]:
t_a = datetime.datetime.now()

def ga_catboost(dna):  
    
    hyper_param_str = ''
    bit_cntr = 0
    
    res_dict = {"dna":dna}
    
    for i in range(param_df.shape[0]):
        parameter = param_df.loc[i]['parameter_name']
        param_min = param_df.loc[i]['min_val']
        param_max = param_df.loc[i]['max_val']
        decimal_precision = param_df.loc[i]['decimal_precision']
        bit_len = int(param_df.loc[i]['nBits'])
        
        bit_start = bit_cntr
        bit_end = bit_start+bit_len
        
        param_str = dna[bit_start:bit_end]        
        param_val = bin2num(param_min,param_max,decimal_precision,param_str)      
        hyper_param_str = hyper_param_str + parameter + "=" + str(param_val) + ","
        res_dict[parameter] = param_val
        
    if dna in list(result_df['dna']):
        accuracy = result_df[result_df['dna'] == dna]['fitness'].tolist()[0]
    else:
        model = eval("CatBoostClassifier(iterations=" + str(n_tree) + "," + hyper_param_str + "random_seed=2)")
        model.fit(X_train, y_train,cat_indices)
    
        # Predicitng and calculating performance on test data
        predict_prob = model.predict_proba(X_test)[:,1]
    
        pred_list = [1 if i > 0.5 else 0 for i in predict_prob.tolist()]
    
        y_list = y_test.tolist()
    
        counter = 0
        for i in range(len(pred_list)):
            if pred_list[i] == y_list[i]:
                counter = counter+1
    
        accuracy = counter/len(pred_list)
    
    res_dict['fitness'] = accuracy
       
    return res_dict

In [20]:
t1 = datetime.datetime.now()

In [21]:
# initialize population
pop_list = [dna_initialization(param_df) for i in range(n_pop)]

result_col_list = ['dna'] + list(param_df['parameter_name']) + ['fitness']
result_df = pd.DataFrame(columns=result_col_list)

print(len(pop_list))
n_child_needed = math.floor(child_ratio*n_pop)

# initial run
gen = 0
results = []

# initialize progress bar
pbar1 = tqdm(total=n_pop)

for pop_i in pop_list:
    
    res_t = ga_catboost(pop_i)
    results = results + [res_t]
    
    pbar1.update(1)

pbar1.close()

result_temp = pd.DataFrame(results,columns=result_col_list)

result_df = result_df.append(result_temp)
result_df = result_df.drop_duplicates()
result_df = result_df.reset_index(drop=True)

pop_df = result_temp[['dna','fitness']]

100


A Jupyter Widget




In [22]:
gen_best_performance = []
while gen < n_gen:
    
    tg_1 = datetime.datetime.now()
    
    gen = gen+1
    
#    par_pool1 = tournament_selection(pop_df,n_child_needed,tournament_size)
#    par_pool2 = tournament_selection(pop_df,n_child_needed,tournament_size)
    
    par_pool1 = ranking_selection(pop_df,n_child_needed)
    par_pool2 = ranking_selection(pop_df,n_child_needed)
    
    child_pool = [uniform_crossover(par1,par2) for par1,par2 in zip(par_pool1['dna'].tolist(),
                                    par_pool2['dna'].tolist())]
    
    # removing n lowest performing individuals where n is the number of children
    pop_df = pop_df.sort_values(by='fitness',ascending=False)
    pop_list = pop_df['dna'].tolist()[0:(n_pop-n_child_needed)]
    
    pop_list = pop_list + child_pool
      
    # Calculate fitness values
    results = []
    # initialize progress bar
    pbar2 = tqdm(total=n_pop)

    for pop_i in pop_list:

        res_t = ga_catboost(pop_i)
        results = results + [res_t]

        pbar2.update(1)

    pbar2.close()
   
    result_temp = pd.DataFrame(results,columns=result_col_list)
    
    result_df = result_df.append(result_temp)
    result_df = result_df.drop_duplicates()
    result_df = result_df.reset_index(drop=True)
    
    pop_df = result_temp[['dna','fitness']]
    
    tg_2 = datetime.datetime.now()
    
    print('Best fitness for generation ' + str(gen) + ' is ' + str(pop_df['fitness'].max()) + ' and truntime = ' + str(tg_2-tg_1))
    gen_best_performance = gen_best_performance + [[gen,pop_df['fitness'].max()]]
    

A Jupyter Widget


Best fitness for generation 1 is 0.827956989247 and truntime = 0:01:31.011203


A Jupyter Widget


Best fitness for generation 2 is 0.827956989247 and truntime = 0:01:07.198439


A Jupyter Widget


Best fitness for generation 3 is 0.827956989247 and truntime = 0:01:09.709383


A Jupyter Widget


Best fitness for generation 4 is 0.827956989247 and truntime = 0:00:59.273030


A Jupyter Widget


Best fitness for generation 5 is 0.827956989247 and truntime = 0:00:47.924313


A Jupyter Widget


Best fitness for generation 6 is 0.827956989247 and truntime = 0:00:47.369160


A Jupyter Widget


Best fitness for generation 7 is 0.827956989247 and truntime = 0:00:39.934937


A Jupyter Widget


Best fitness for generation 8 is 0.827956989247 and truntime = 0:00:41.448087


A Jupyter Widget


Best fitness for generation 9 is 0.827956989247 and truntime = 0:00:34.594750


A Jupyter Widget


Best fitness for generation 10 is 0.827956989247 and truntime = 0:00:30.777996


A Jupyter Widget


Best fitness for generation 11 is 0.827956989247 and truntime = 0:00:32.596206


A Jupyter Widget


Best fitness for generation 12 is 0.827956989247 and truntime = 0:00:34.138169


A Jupyter Widget


Best fitness for generation 13 is 0.827956989247 and truntime = 0:00:23.287508


A Jupyter Widget


Best fitness for generation 14 is 0.827956989247 and truntime = 0:00:34.003213


A Jupyter Widget


Best fitness for generation 15 is 0.827956989247 and truntime = 0:00:29.948696


A Jupyter Widget


Best fitness for generation 16 is 0.827956989247 and truntime = 0:00:27.923708


A Jupyter Widget


Best fitness for generation 17 is 0.827956989247 and truntime = 0:00:21.523463


A Jupyter Widget


Best fitness for generation 18 is 0.827956989247 and truntime = 0:00:17.274263


A Jupyter Widget


Best fitness for generation 19 is 0.827956989247 and truntime = 0:00:20.179121


A Jupyter Widget


Best fitness for generation 20 is 0.827956989247 and truntime = 0:00:11.173400


A Jupyter Widget


Best fitness for generation 21 is 0.827956989247 and truntime = 0:00:10.776593


A Jupyter Widget


Best fitness for generation 22 is 0.827956989247 and truntime = 0:00:11.401920


A Jupyter Widget


Best fitness for generation 23 is 0.827956989247 and truntime = 0:00:19.633954


A Jupyter Widget


Best fitness for generation 24 is 0.827956989247 and truntime = 0:00:12.720885


A Jupyter Widget


Best fitness for generation 25 is 0.827956989247 and truntime = 0:00:13.326172


A Jupyter Widget


Best fitness for generation 26 is 0.827956989247 and truntime = 0:00:06.603461


A Jupyter Widget


Best fitness for generation 27 is 0.827956989247 and truntime = 0:00:13.660124


A Jupyter Widget


Best fitness for generation 28 is 0.827956989247 and truntime = 0:00:08.851539


A Jupyter Widget


Best fitness for generation 29 is 0.827956989247 and truntime = 0:00:11.696091


A Jupyter Widget


Best fitness for generation 30 is 0.827956989247 and truntime = 0:00:08.169994


A Jupyter Widget


Best fitness for generation 31 is 0.827956989247 and truntime = 0:00:09.102050


A Jupyter Widget


Best fitness for generation 32 is 0.827956989247 and truntime = 0:00:04.435285


A Jupyter Widget


Best fitness for generation 33 is 0.827956989247 and truntime = 0:00:04.249403


A Jupyter Widget


Best fitness for generation 34 is 0.827956989247 and truntime = 0:00:02.950008


A Jupyter Widget


Best fitness for generation 35 is 0.827956989247 and truntime = 0:00:02.886076


A Jupyter Widget


Best fitness for generation 36 is 0.827956989247 and truntime = 0:00:05.245822


A Jupyter Widget


Best fitness for generation 37 is 0.827956989247 and truntime = 0:00:03.054843


A Jupyter Widget


Best fitness for generation 38 is 0.827956989247 and truntime = 0:00:05.658638


A Jupyter Widget


Best fitness for generation 39 is 0.827956989247 and truntime = 0:00:04.157413


A Jupyter Widget


Best fitness for generation 40 is 0.827956989247 and truntime = 0:00:01.991428


A Jupyter Widget


Best fitness for generation 41 is 0.827956989247 and truntime = 0:00:04.171249


A Jupyter Widget


Best fitness for generation 42 is 0.827956989247 and truntime = 0:00:03.262356


A Jupyter Widget


Best fitness for generation 43 is 0.827956989247 and truntime = 0:00:01.870320


A Jupyter Widget


Best fitness for generation 44 is 0.827956989247 and truntime = 0:00:04.032645


A Jupyter Widget


Best fitness for generation 45 is 0.827956989247 and truntime = 0:00:05.176819


A Jupyter Widget


Best fitness for generation 46 is 0.827956989247 and truntime = 0:00:03.891300


A Jupyter Widget


Best fitness for generation 47 is 0.827956989247 and truntime = 0:00:04.249862


A Jupyter Widget


Best fitness for generation 48 is 0.827956989247 and truntime = 0:00:03.020050


A Jupyter Widget


Best fitness for generation 49 is 0.827956989247 and truntime = 0:00:01.687966


A Jupyter Widget


Best fitness for generation 50 is 0.827956989247 and truntime = 0:00:01.928732


A Jupyter Widget


Best fitness for generation 51 is 0.827956989247 and truntime = 0:00:00.741084


A Jupyter Widget


Best fitness for generation 52 is 0.827956989247 and truntime = 0:00:01.878065


A Jupyter Widget


Best fitness for generation 53 is 0.827956989247 and truntime = 0:00:02.860998


A Jupyter Widget


Best fitness for generation 54 is 0.827956989247 and truntime = 0:00:01.761206


A Jupyter Widget


Best fitness for generation 55 is 0.827956989247 and truntime = 0:00:00.741757


A Jupyter Widget


Best fitness for generation 56 is 0.827956989247 and truntime = 0:00:00.741339


A Jupyter Widget


Best fitness for generation 57 is 0.827956989247 and truntime = 0:00:04.133460


A Jupyter Widget


Best fitness for generation 58 is 0.827956989247 and truntime = 0:00:01.877632


A Jupyter Widget


Best fitness for generation 59 is 0.827956989247 and truntime = 0:00:03.116015


A Jupyter Widget


Best fitness for generation 60 is 0.827956989247 and truntime = 0:00:02.003105


A Jupyter Widget


Best fitness for generation 61 is 0.827956989247 and truntime = 0:00:01.891417


A Jupyter Widget


Best fitness for generation 62 is 0.827956989247 and truntime = 0:00:00.740520


A Jupyter Widget


Best fitness for generation 63 is 0.827956989247 and truntime = 0:00:00.739043


A Jupyter Widget


Best fitness for generation 64 is 0.827956989247 and truntime = 0:00:00.739735


A Jupyter Widget


Best fitness for generation 65 is 0.827956989247 and truntime = 0:00:00.739859


A Jupyter Widget


Best fitness for generation 66 is 0.827956989247 and truntime = 0:00:00.739740


A Jupyter Widget


Best fitness for generation 67 is 0.827956989247 and truntime = 0:00:00.740962


A Jupyter Widget


Best fitness for generation 68 is 0.827956989247 and truntime = 0:00:00.740986


A Jupyter Widget


Best fitness for generation 69 is 0.827956989247 and truntime = 0:00:00.739708


A Jupyter Widget


Best fitness for generation 70 is 0.827956989247 and truntime = 0:00:01.875502


A Jupyter Widget


Best fitness for generation 71 is 0.827956989247 and truntime = 0:00:02.000041


A Jupyter Widget


Best fitness for generation 72 is 0.827956989247 and truntime = 0:00:00.742726


A Jupyter Widget


Best fitness for generation 73 is 0.827956989247 and truntime = 0:00:00.757878


A Jupyter Widget


Best fitness for generation 74 is 0.827956989247 and truntime = 0:00:00.741041


A Jupyter Widget


Best fitness for generation 75 is 0.827956989247 and truntime = 0:00:00.743344


A Jupyter Widget


Best fitness for generation 76 is 0.827956989247 and truntime = 0:00:00.743816


A Jupyter Widget


Best fitness for generation 77 is 0.827956989247 and truntime = 0:00:00.742245


A Jupyter Widget


Best fitness for generation 78 is 0.827956989247 and truntime = 0:00:00.742477


A Jupyter Widget


Best fitness for generation 79 is 0.827956989247 and truntime = 0:00:00.741543


A Jupyter Widget


Best fitness for generation 80 is 0.827956989247 and truntime = 0:00:00.742980


A Jupyter Widget


Best fitness for generation 81 is 0.827956989247 and truntime = 0:00:00.742332


A Jupyter Widget


Best fitness for generation 82 is 0.827956989247 and truntime = 0:00:00.738801


A Jupyter Widget


Best fitness for generation 83 is 0.827956989247 and truntime = 0:00:02.026116


A Jupyter Widget


Best fitness for generation 84 is 0.827956989247 and truntime = 0:00:00.742717


A Jupyter Widget


Best fitness for generation 85 is 0.827956989247 and truntime = 0:00:00.741404


A Jupyter Widget


Best fitness for generation 86 is 0.827956989247 and truntime = 0:00:00.741911


A Jupyter Widget


Best fitness for generation 87 is 0.827956989247 and truntime = 0:00:00.784375


A Jupyter Widget


Best fitness for generation 88 is 0.827956989247 and truntime = 0:00:00.740011


A Jupyter Widget


Best fitness for generation 89 is 0.827956989247 and truntime = 0:00:00.741225


A Jupyter Widget


Best fitness for generation 90 is 0.827956989247 and truntime = 0:00:00.740613


A Jupyter Widget


Best fitness for generation 91 is 0.827956989247 and truntime = 0:00:00.740388


A Jupyter Widget


Best fitness for generation 92 is 0.827956989247 and truntime = 0:00:00.743159


A Jupyter Widget


Best fitness for generation 93 is 0.827956989247 and truntime = 0:00:00.742857


A Jupyter Widget


Best fitness for generation 94 is 0.827956989247 and truntime = 0:00:00.741487


A Jupyter Widget


Best fitness for generation 95 is 0.827956989247 and truntime = 0:00:02.999030


A Jupyter Widget


Best fitness for generation 96 is 0.827956989247 and truntime = 0:00:00.742599


A Jupyter Widget


Best fitness for generation 97 is 0.827956989247 and truntime = 0:00:00.741542


A Jupyter Widget


Best fitness for generation 98 is 0.827956989247 and truntime = 0:00:00.742248


A Jupyter Widget


Best fitness for generation 99 is 0.827956989247 and truntime = 0:00:00.743296


A Jupyter Widget


Best fitness for generation 100 is 0.827956989247 and truntime = 0:00:00.741965


In [23]:
    
t2 = datetime.datetime.now()

print("total time -> " + str(t2-t1))

#    results = [pool.apply_async(ga_catboost, args=(x,)) for x in list(pop_df['population'])]
#    final_result = [result.get() for result in results]    
    

total time -> 0:19:45.973334
