In [278]:
import pandas as pd
import lightgbm as lgbm
import numpy as np
from sklearn.model_selection import train_test_split
import random
import os
import csv
import sys
import time

##### Preprocess data 

In [11]:
# Read the training table from disk
data = pd.read_csv('data/application_train.csv')
# Keep the numeric columns only, then draw a fixed-seed sample of 20,000 rows
data = data.select_dtypes('number').sample(n=20000, random_state=50)
# Pull the target column out as a NumPy array before it is removed below
labels = np.array(data['TARGET'])
# Remove the id and target columns so only features remain
data = data.drop(columns=['SK_ID_CURR', 'TARGET'])
# Hold out 5,000 rows for testing; the remaining rows form the training set
x_train, x_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=5000,
    random_state=50,
)

In [12]:
# Wrap the train and test splits in LightGBM Dataset objects
train_set = lgbm.Dataset(data=x_train, label=y_train)
test_set = lgbm.Dataset(data=x_test, label=y_test)

##### Specify Search Domain 

In [15]:
# Build a classifier with default arguments and display its parameters;
# `model` is reused later for the baseline cross-validation run.
model = lgbm.LGBMClassifier()
model.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

These are the parameters available to us for tuning. A description of each can be found at:
https://lightgbm.readthedocs.io/en/latest/Python-API.html#scikit-learn-api
https://lightgbm.readthedocs.io/en/latest/Parameters.html  
We pick the following hyper-parameters to tune:
- boosting_type: the type of boosting method to be used. available options: ['gbdt','dart','goss','rf']
    - we won't be including random forest since we used that as our benchmark
- colsample_bytree (float, optional, default=1.0): the fraction of features sampled when building each tree. We'll sample values from just below the default, between 0.5 and 1.0
- learning_rate: used to control the growth of the trees - we need to vary this on a log scale
- min_child_weight/min_child_samples: both determine the complexity of each tree. We'll pick one to tune: min_child_samples
- subsample: ratio for sampling used for building trees
- reg_alpha: L1 regularization constant  
- reg_lambda: L2 regularization constant
- is_unbalance: used to specify if the class of the sample is balanced or not

In [267]:
# Search domain: each key is a LightGBM hyper-parameter, each value the list of
# candidate settings that the random search draws from.
# NOTE(review): LightGBM documents that bagging is only applied when
# subsample_freq (bagging_freq) is non-zero -- confirm 'subsample' has an effect.
hyper_grid = dict(
    boosting_type=['gbdt', 'goss', 'dart'],
    num_leaves=list(range(20, 150)),
    # 1000 points spaced evenly on a log10 scale between 0.005 and 0.5
    learning_rate=list(
        np.logspace(np.log10(0.005), np.log10(0.5), num=1000, base=10)
    ),
    min_child_samples=list(range(10, 200, 5)),
    # np.linspace defaults to 50 evenly spaced points
    reg_alpha=list(np.linspace(0, 1)),
    reg_lambda=list(np.linspace(0, 1)),
    colsample_bytree=list(np.linspace(0.5, 1, 10)),
    subsample=list(np.linspace(0.6, 1, 100)),
    subsample_for_bin=list(range(20000, 300000, 20000)),
    is_unbalance=[True, False],
)

##### Objective Function 

The purpose of the **objective function** is to evaluate the performance of the model for a given set of hyperparameters. It takes the randomly sampled hyperparameters as input and returns the score of the model. We will use LightGBM's `cv` method along with early stopping to determine `n_estimators`. This method returns a dictionary containing the scores of the model. We can determine the number of estimators used by measuring the length of the score list.

In [261]:
# Baseline run: 5-fold cross-validation with the classifier's default
# parameters, 100 boosting rounds, scored by AUC.
cv_results = lgbm.cv(
    model.get_params(),
    train_set,
    num_boost_round=100,
    nfold=5,
    metrics='auc',
    seed=50,
)



In [268]:
def objective_func(params):
    """Cross-validate one hyper-parameter combination and report its score.

    Runs 5-fold ``lgbm.cv`` on the module-level ``train_set`` with AUC as the
    metric, up to 1000 boosting rounds and early stopping after 100 rounds.

    Parameters
    ----------
    params : dict
        Hyper-parameters to evaluate. Any ``'n_estimators'`` or ``'score'``
        entry (e.g. left over from a previous result) is ignored. The dict
        itself is not modified.

    Returns
    -------
    dict
        A new dict holding the evaluated hyper-parameters followed by
        ``'score'`` (the last entry of the ``'auc-mean'`` list) and
        ``'n_estimators'`` (the length of that list), in that order, which is
        the same order as the result columns ``cols``.
    """
    # Bookkeeping keys are results, not hyper-parameters: keep them out of the
    # parameters handed to LightGBM.
    cv_params = {key: value for key, value in params.items()
                 if key not in ('n_estimators', 'score')}

    # Pass a copy: depending on the LightGBM version, cv() may write extra
    # entries (such as 'metric') into the dict it receives.
    cv_results = lgbm.cv(dict(cv_params),train_set,num_boost_round=1000,\
                         early_stopping_rounds=100,nfold=5,metrics='auc',seed=50)

    auc_history = cv_results['auc-mean']

    # Build the result in a fixed key order so positional consumers
    # (list(result.values())) always line up with the result columns.
    result = {key: value for key, value in cv_params.items()
              if key not in ('metric', 'verbose')}
    result['score'] = auc_history[-1]
    result['n_estimators'] = len(auc_history)
    return result

##### Random Search Function

The purpose of this function is to randomly sample hyper-parameters from the search domain, pass them to the objective function, and then store the results in a data structure.

In [269]:
# Upper bound on the number of random-search iterations
search_iters = 10

# Empty frame to store the results: one column per tuned hyper-parameter plus
# the score and the number of estimators, one row per planned iteration
cols = [*hyper_grid.keys(), 'score', 'n_estimators']
results = pd.DataFrame(index=range(search_iters), columns=cols)

In [270]:
# Create (or overwrite) the CSV file that accumulates the search results and
# write its header row.
file_name = 'random_search_resutls.csv'
# newline='' hands line-ending control to the csv module; without it, platforms
# that translate '\n' on write end up with a blank line after every row.
with open(file_name, 'w', newline='') as csvfile:
    #create writer object
    writer = csv.writer(csvfile)
    writer.writerow(cols)

In [284]:
def random_search(search_grid,search_iters):
    """Randomly sample hyper-parameters and record the score of each sample.

    For every iteration one value per key is drawn from ``search_grid`` with
    ``random.choice``, scored with ``objective_func``, stored as row ``i`` of
    the module-level ``results`` frame and appended to the CSV file named by
    the module-level ``file_name``.

    Parameters
    ----------
    search_grid : dict
        Maps each hyper-parameter name to the list of candidate values. Its
        keys are expected to match the hyper-parameter columns of ``results``
        and of the CSV header.
    search_iters : int
        Number of random combinations to evaluate.

    Returns
    -------
    pandas.DataFrame
        The module-level ``results`` frame, updated in place.
    """
    # Rows are written by column name rather than by dict order, so 'score'
    # and 'n_estimators' always land in the right columns.
    result_cols = list(search_grid.keys()) + ['score', 'n_estimators']
    best_score = 0
    start = time.time()
    for i in range(search_iters):

        # Fresh dict per iteration so nothing from the previous evaluation
        # (e.g. its score) is carried into the next one.
        params = {key: random.choice(values) for key, values in search_grid.items()}

        # goss runs are always evaluated with subsample fixed at 1.0
        # (presumably because goss cannot be combined with row subsampling).
        if params.get('boosting_type') == 'goss': params['subsample'] = 1.0

        #call objective function with sampled params
        param_score = objective_func(params)
        best_score = max(best_score, param_score['score'])

        #store results
        row = [param_score[col] for col in result_cols]
        results.loc[i] = row

        print('\rIteration:{}, Score:{:.3f}, Best Score: {:.3f}'.format(i+1,param_score['score'],best_score),end='')
        sys.stdout.flush()
        #store results in a file; newline='' avoids blank rows on platforms
        #that translate line endings
        with open(file_name,'a',newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(row)
    end = time.time()
    print('\nSearch finished in %.2f minutes'%((end-start)/60))
    return results

##### Search for Hyper Parameters

In [285]:
# Run the random search over the full grid for a single iteration
search_results = random_search(hyper_grid,1)

Iteration:1, Score:0.684, Best Score: 0.684
Search finished in 0.22 minutes
