In [2]:
# Data manipulation
import pandas as pd
import numpy as np

# Modeling
import lightgbm as lgb

# Evaluation of the model
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

plt.rcParams['font.size'] = 18
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# Custom

from utils.preprocessing import one_hot_encoder
from utils.preprocessing import missing_values_table

### Set up training and test set

In [5]:
# Load the combined feature matrix. Rows are split into train/test further
# down using the TARGET sentinel (-999 marks the rows that go to test_df).
spec_feature = pd.read_csv('Feature/feature_matrix_spec.csv')
missing_table = missing_values_table(spec_feature)

# Drop every feature with more than 30% missing values. The index of the
# filtered table is used directly as the list of column names to drop.
dump_feats = missing_table[missing_table['% of Total Values'] > 30].index.tolist()

spec_feature = spec_feature.drop(dump_feats, axis = 1)

spec_feature, _ = one_hot_encoder(spec_feature)

from sklearn.preprocessing import MinMaxScaler
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22. Prefer its
# replacement (bound to the old name so the rest of the notebook keeps working)
# and only fall back to the legacy class on old installations.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

# Keep the label and the identifier aside so they are neither imputed nor scaled.
Target = spec_feature['TARGET']
ID = spec_feature['SK_ID_CURR']

dataset_temp = spec_feature.drop(['TARGET','SK_ID_CURR'], axis = 1)

imputer = Imputer(strategy = 'median')
scaler = MinMaxScaler(feature_range = (0,1))

# NOTE(review): both transformers are fitted on train and test rows together.
# Fitting on the training rows only would avoid leaking test statistics; left
# as-is here so results stay comparable with earlier runs.
imputed_values = imputer.fit_transform(dataset_temp)
scaled_values = scaler.fit_transform(imputed_values)

# Reuse the original index: the two column assignments below align on index,
# so a fresh 0..n-1 index would silently produce NaN labels/ids whenever
# `spec_feature` does not carry a default RangeIndex.
dataset_preprocessed = pd.DataFrame(scaled_values,
                                    columns = dataset_temp.columns,
                                    index = dataset_temp.index)

dataset_preprocessed['TARGET'] = Target
dataset_preprocessed['SK_ID_CURR'] = ID

# TARGET == -999 is the placeholder used for rows without a label (test set).
train_df = dataset_preprocessed[dataset_preprocessed['TARGET'] != -999]
test_df = dataset_preprocessed[dataset_preprocessed['TARGET'] == -999]

# NOTE(review): only TARGET is dropped, so SK_ID_CURR is still a column of
# `train_features` and is therefore seen by the model as a feature. Confirm
# this is intended before changing it (previously saved trials were scored
# with this exact feature set).
train_features = train_df.drop(['TARGET'], axis = 1)
train_labels = train_df['TARGET']

# Training set
train_set = lgb.Dataset(train_features, label = train_labels)

There are total 885 columns.
823 of them have missing values.


### Objective Function

In [6]:
import csv
from hyperopt import STATUS_OK
from timeit import default_timer as timer

def objective(hyperparameters):
    """Objective function for the hyperopt search.

    Runs LightGBM cross validation (`N_FOLDS` folds, AUC metric, early
    stopping) on the module-level `train_set` with the sampled
    hyperparameters, appends one row to `OUT_FILE` and returns the result
    dictionary expected by hyperopt.

    Parameters
    ----------
    hyperparameters : dict
        One sample from the search space. `boosting_type` is expected to be
        a nested dict holding the boosting type and, optionally, its
        `subsample` value. The dict is modified in place.

    Returns
    -------
    dict
        Keys `loss` (1 - best mean AUC), `hyperparameters`, `iteration`,
        `train_time` (seconds spent in cross validation) and `status`.
    """
    global ITERATION

    ITERATION += 1

    # The number of boosting rounds is derived from the cross validation
    # below, so any sampled value is discarded first.
    hyperparameters.pop('n_estimators', None)

    # Flatten the nested boosting_type choice into two top-level parameters.
    boosting_choice = hyperparameters['boosting_type']
    subsample = boosting_choice.get('subsample', 1.0)
    hyperparameters['boosting_type'] = boosting_choice['boosting_type']
    hyperparameters['subsample'] = subsample

    # These are sampled as floats (hp.quniform) and are cast to int here.
    for parameter_name in ['num_leaves', 'subsample_for_bin', 'min_child_samples']:
        hyperparameters[parameter_name] = int(hyperparameters[parameter_name])

    start = timer()

    # Perform n_fold cross validation
    cv_results = lgb.cv(hyperparameters, train_set, num_boost_round = 1500, nfold = N_FOLDS, early_stopping_rounds = 100, metrics = 'auc')

    run_time = timer() - start

    # Pick the best round explicitly instead of assuming the history ends at
    # the best one; both agree whenever the history was truncated at the best
    # iteration.
    auc_means = cv_results['auc-mean']
    best_round = int(np.argmax(auc_means))
    best_score = auc_means[best_round]

    # Loss must be minimized
    loss = 1 - best_score

    # Boosting rounds that returned the highest cv score
    n_estimators = best_round + 1

    hyperparameters['n_estimators'] = n_estimators

    # Append one row per evaluation. The with-block closes the file even if
    # writing fails; newline='' is the csv module's recommended open mode.
    with open(OUT_FILE, 'a', newline = '') as of_connection:
        writer = csv.writer(of_connection)
        writer.writerow([loss, hyperparameters, ITERATION, run_time, best_score])

    return {'loss': loss, 'hyperparameters': hyperparameters, 'iteration': ITERATION, 'train_time': run_time, 'status': STATUS_OK}


In [7]:
from hyperopt import hp
from hyperopt.pyll.stochastic import sample

### Specify the domain for hyperparameters

In [8]:
# Boosting strategies to choose from. Each option carries its own `subsample`
# entry: sampled in [0.5, 1] for gbdt and dart, pinned to 1.0 for goss.
# The label 'gdbt_subsample' is kept verbatim.
# NOTE(review): presumably labels must match those stored in previously
# pickled Trials - confirm before renaming any of them.
_boosting_options = [
    {'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.5, 1)},
    {'boosting_type': 'dart', 'subsample': hp.uniform('dart_subsample', 0.5, 1)},
    {'boosting_type': 'goss', 'subsample': 1.0},
]

# Search domain handed to fmin. The quniform entries yield floats; objective()
# casts num_leaves, subsample_for_bin and min_child_samples to int.
space = dict(
    boosting_type = hp.choice('boosting_type', _boosting_options),
    num_leaves = hp.quniform('num_leaves', 20, 150, 1),
    # Log-uniform between 0.01 and 0.5
    learning_rate = hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
    subsample_for_bin = hp.quniform('subsample_for_bin', 20000, 300000, 20000),
    min_child_samples = hp.quniform('min_child_samples', 20, 500, 5),
    reg_alpha = hp.uniform('reg_alpha', 0.0, 1.0),
    reg_lambda = hp.uniform('reg_lambda', 0.0, 1.0),
    # The hyperopt label ('colsample_by_tree') differs from the parameter key
    colsample_bytree = hp.uniform('colsample_by_tree', 0.6, 1.0),
    is_unbalance = hp.choice('is_unbalance', [True, False]),
)

### Optimization Algorithm

In [9]:
from hyperopt import tpe

# Alias for hyperopt's TPE suggestion function (the `algo` argument of fmin).
# NOTE(review): the visible fmin call passes `tpe.suggest` directly, so this
# alias looks unused - confirm before relying on it.
tpe_algorithm = tpe.suggest

In [7]:
from hyperopt import Trials

# Fresh Trials container; it is passed to fmin (trials=...) so that every
# evaluation of the objective is recorded on it.
trials = Trials()

In [8]:
# Create (or truncate) the results file and write the header row.
# objective() appends one row per evaluation to this same file.
OUT_FILE = 'bayes_test.csv'

# Write column names
headers = ['loss', 'hyperparameters', 'iteration', 'runtime', 'score']

# The with-block guarantees the handle is closed even if writing fails;
# newline='' is the open mode recommended by the csv module so that no
# extra blank lines appear on platforms that translate line endings.
with open(OUT_FILE, 'w', newline = '') as of_connection:
    writer = csv.writer(of_connection)
    writer.writerow(headers)

In [13]:
import pickle

# Load a previously saved Trials object and use it as `trials` from here on.
# SECURITY: pickle.load can execute arbitrary code - only load files that
# were produced by this notebook / a trusted source.
with open("Trial-56.pkl", "rb") as trials_file:
    trials = pickle.load(trials_file)

# Results keep being appended to the same CSV log.
OUT_FILE = 'bayes_test.csv'


### Automated Hyperparameter Optimization

In [11]:
from hyperopt import fmin

In [14]:
# Governing choices for search
N_FOLDS = 5
MAX_EVALS = 1000

# Counter incremented by objective() and written to the CSV log. Start from
# the number of evaluations already stored in `trials` so that a resumed
# search continues the numbering instead of repeating 1, 2, 3, ...
# (for a fresh Trials() this is 0, i.e. the previous behaviour).
# A module-level `global` statement is a no-op, so none is needed here.
ITERATION = len(trials.trials)

# NOTE(review): the progress output suggests max_evals counts evaluations
# already present in `trials` - confirm against the hyperopt documentation.
best = fmin(fn = objective, space = space, algo = tpe.suggest, trials = trials,
            max_evals = MAX_EVALS)

  9%|▉         | 88/943 [8:54:46<97:40:48, 411.28s/it, best loss: 0.22049434765094533] 


KeyboardInterrupt: 

In [15]:
import pickle
# Persist the current Trials object to disk (binary pickle) so it can be
# reloaded later with pickle.load.
with open('Trial-144.pkl', 'wb') as file:
    pickle.dump(trials, file)

In [12]:
# Show the value returned by fmin. `best` is only bound when the fmin call
# ran to completion; after an interrupted search this raises NameError.
best

NameError: name 'best' is not defined

In [None]:
# Load the per-evaluation log (one row per objective() call) from OUT_FILE.
results = pd.read_csv(OUT_FILE)

In [None]:
import ast

def Display(results, name):
    """Summarize a hyperparameter search, best cross validation score first.

    Prints the highest score together with the iteration it was found on and
    returns one row per evaluated hyperparameter set.

    Parameters
    ----------
    results : pandas.DataFrame
        Search log with at least the columns 'hyperparameters' (string
        representation of a dict), 'iteration' and 'score'. Not modified.
    name : str
        Label of the search method, only used in the printed message.

    Returns
    -------
    pandas.DataFrame
        One column per hyperparameter plus 'iteration' and 'score', sorted
        by 'score' in descending order.
    """
    new_results = results.copy()

    # String to dictionary
    new_results['hyperparameters'] = new_results['hyperparameters'].map(ast.literal_eval)

    # Sort
    new_results = new_results.sort_values('score', ascending = False).reset_index(drop = True)

    # Print out cross validation high score
    print('The highest cross validation score from {} was {:.5f} found on iteration {}.'.format(name, new_results.loc[0, 'score'], new_results.loc[0, 'iteration']))

    # Build the hyperparameter dataframe in a single step. DataFrame.append
    # was removed in pandas 2.0 and growing a frame row by row is quadratic.
    hyp_df = pd.DataFrame(new_results['hyperparameters'].tolist())

    # Put the iteration and score in the hyperparameter dataframe; both frames
    # share the same 0..n-1 index, so the rows line up.
    hyp_df['iteration'] = new_results['iteration']
    hyp_df['score'] = new_results['score']

    return hyp_df

In [None]:
from utils.Models import kfold_lightgbm