In [6]:
import numpy as np
import xgboost
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV

In [7]:
# Data file paths
train_raw_path, test_raw_path = r'../data/train_raw.csv', r'../data/test_raw.csv'

# Open csv files
train_raw_data = np.loadtxt(train_raw_path, dtype='str', delimiter=',', unpack=True).T
test_raw_data = np.loadtxt(test_raw_path, dtype='str', delimiter=',', unpack=True).T

N_train, N_test = train_raw_data.shape[0] - 1, test_raw_data.shape[0] - 1
num_features = train_raw_data.shape[1] - 2

# Initialize arrays for train and test data
train_X, train_Y = np.zeros((N_train, train_raw_data.shape[1] - 2)), np.zeros((N_train, 1))
test_X = np.zeros((N_test, test_raw_data.shape[1] - 1))

formats = \
{
    0:  {'Female': 0, 'Male': 1},
    2:  {'No': 0, 'Yes': 1},
    3:  {'No': 0, 'Yes': 1},
    5:  {'No': 0, 'Yes': 1},
    6:  {'No phone service': 0, 'No': 1, 'Yes': 2},
    7:  {'No': 0, 'DSL': 1, 'Fiber optic': 2},
    8:  {'No internet service': 0, 'No': 1, 'Yes': 2},
    9:  {'No internet service': 0, 'No': 1, 'Yes': 2},
    10: {'No internet service': 0, 'No': 1, 'Yes': 2},
    11: {'No internet service': 0, 'No': 1, 'Yes': 2},
    12: {'No internet service': 0, 'No': 1, 'Yes': 2},
    13: {'No internet service': 0, 'No': 1, 'Yes': 2},
    14: {'Month-to-month': 0, 'One year': 1, 'Two year': 2},
    15: {'No': 0, 'Yes': 1},
    16: {'Mailed check': 0, 'Bank transfer (automatic)': 1, 'Electronic check': 2, 'Credit card (automatic)': 3},
}

for i in range(N_train):
    train_raw_x, train_raw_y = train_raw_data[i+1, 1:-1], train_raw_data[i+1, -1]
    
    train_x = np.zeros((num_features))
    for j in range(num_features):
        if j in formats.keys():
            train_x[j] = formats[j][train_raw_x[j]]
        else:
            if j == 18 and not train_raw_x[j]:              # If 'Total Charges' missing, calculate from 'tenure' and 'Monthly Charges'
                train_x[j] = train_x[4] * train_x[17]  
            else:
                train_x[j] = eval(train_raw_x[j])
    
    train_y = 1 if train_raw_y == 'Yes' else 0
    
    
    train_X[i], train_Y[i] = train_x, train_y

for i in range(N_test):
    test_raw_x = test_raw_data[i+1, 1:]
    
    test_x = np.zeros((num_features))
    for j in range(num_features):
        if j in formats.keys():
            test_x[j] = formats[j][test_raw_x[j]]
        else:
            if j == 18 and not test_raw_x[j]:              # If 'Total Charges' missing, calculate from 'tenure' and 'Monthly Charges'
                test_x[j] = test_x[4] * test_x[17]  
            else:
                test_x[j] = eval(test_raw_x[j])
            
    test_X[i] = test_x

In [12]:
params = {}
params['min_child_weight'] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
params['gamma'] = [4,5,6,6.5,7,8,9,10,12]
params['subsample'] = [0.6, 0.7, 0.8, 0.9, 1.0]
params['colsample_bytree'] = [0.6, 0.7, 0.8, 0.9, 1.0]
params['n_estimators'] = [600, 700, 800, 900, 1000]
params['max_depth'] = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
params['learning_rate'] = [0.04, 0.05, 0.06, 0.07, 0.08, 0.09]
params['reg_alpha'] = [2,3,3.5,4,6,8,11]
params['reg_lambda'] = [4.5,5,5.5]

train_Y = train_Y.ravel()

xgb = xgboost.XGBRFClassifier(objective='binary:logistic', use_label_encoder=False, nthread=8)
skf = StratifiedKFold(n_splits=5, shuffle = True, random_state = 1001)
grid_search = GridSearchCV(xgb, param_grid=params, scoring='roc_auc', n_jobs=4, cv=skf.split(train_X,train_Y), verbose=3)
grid_search.fit(train_X, train_Y)

Fitting 5 folds for each of 17010000 candidates, totalling 85050000 fits
[CV 1/5] END colsample_bytree=0.6, gamma=4, learning_rate=0.04, max_depth=3, min_child_weight=1, n_estimators=600, reg_alpha=2, reg_lambda=4.5, subsample=0.6;, score=0.841 total time=   1.0s
[CV 2/5] END colsample_bytree=0.6, gamma=4, learning_rate=0.04, max_depth=3, min_child_weight=1, n_estimators=600, reg_alpha=2, reg_lambda=4.5, subsample=0.6;, score=0.864 total time=   1.1s
[CV 4/5] END colsample_bytree=0.6, gamma=4, learning_rate=0.04, max_depth=3, min_child_weight=1, n_estimators=600, reg_alpha=2, reg_lambda=4.5, subsample=0.6;, score=0.824 total time=   1.1s
[CV 3/5] END colsample_bytree=0.6, gamma=4, learning_rate=0.04, max_depth=3, min_child_weight=1, n_estimators=600, reg_alpha=2, reg_lambda=4.5, subsample=0.6;, score=0.829 total time=   1.1s
[CV 5/5] END colsample_bytree=0.6, gamma=4, learning_rate=0.04, max_depth=3, min_child_weight=1, n_estimators=600, reg_alpha=2, reg_lambda=4.5, subsample=0.6;, sco

In [11]:
print('\n All results:')
print(grid_search.cv_results_)
print('\n Best estimator:')
print(grid_search.best_estimator_)
print('\n Best normalized gini score for %d-fold search:' % (5))
print(grid_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(grid_search.best_params_)


 All results:
{'mean_fit_time': array([1.44652386]), 'std_fit_time': array([0.46916382]), 'mean_score_time': array([0.01289744]), 'std_score_time': array([0.01653784]), 'param_colsample_bytree': masked_array(data=[0.9],
             mask=[False],
       fill_value='?',
            dtype=object), 'param_gamma': masked_array(data=[7],
             mask=[False],
       fill_value='?',
            dtype=object), 'param_learning_rate': masked_array(data=[0.07],
             mask=[False],
       fill_value='?',
            dtype=object), 'param_max_depth': masked_array(data=[3],
             mask=[False],
       fill_value='?',
            dtype=object), 'param_min_child_weight': masked_array(data=[3],
             mask=[False],
       fill_value='?',
            dtype=object), 'param_n_estimators': masked_array(data=[600],
             mask=[False],
       fill_value='?',
            dtype=object), 'param_reg_alpha': masked_array(data=[2],
             mask=[False],
       fill_value='?',
