In [24]:
import numpy as np
from xgboost import XGBClassifier, XGBRegressor, XGBRFClassifier, XGBRFRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

In [25]:
# Data file paths
train_raw_path, test_raw_path = r'..\data\train_raw.csv', r'..\data\test_raw.csv'
train_path, test_path = r'..\data\train.csv', r'..\data\test.csv'

# Open csv files
train_raw_data = np.loadtxt(train_raw_path, dtype='str', delimiter=',', unpack=True).T
test_raw_data = np.loadtxt(test_raw_path, dtype='str', delimiter=',', unpack=True).T

N_train, N_test = train_raw_data.shape[0] - 1, test_raw_data.shape[0] - 1
num_features = train_raw_data.shape[1] - 2

# Initialize arrays for train and test data
train_X, train_Y = np.zeros((N_train, train_raw_data.shape[1] - 2)), np.zeros((N_train, 1))
test_X = np.zeros((N_test, test_raw_data.shape[1] - 1))

formats = \
{
    0:  {'Female': 0, 'Male': 1},
    2:  {'No': 0, 'Yes': 1},
    3:  {'No': 0, 'Yes': 1},
    5:  {'No': 0, 'Yes': 1},
    6:  {'No phone service': 0, 'No': 1, 'Yes': 2},
    7:  {'No': 0, 'DSL': 1, 'Fiber optic': 2},
    8:  {'No internet service': 0, 'No': 1, 'Yes': 2},
    9:  {'No internet service': 0, 'No': 1, 'Yes': 2},
    10: {'No internet service': 0, 'No': 1, 'Yes': 2},
    11: {'No internet service': 0, 'No': 1, 'Yes': 2},
    12: {'No internet service': 0, 'No': 1, 'Yes': 2},
    13: {'No internet service': 0, 'No': 1, 'Yes': 2},
    14: {'Month-to-month': 0, 'One year': 1, 'Two year': 2},
    15: {'No': 0, 'Yes': 1},
    16: {'Mailed check': 0, 'Bank transfer (automatic)': 1, 'Electronic check': 2, 'Credit card (automatic)': 3},
}

for i in range(N_train):
    train_raw_x, train_raw_y = train_raw_data[i+1, 1:-1], train_raw_data[i+1, -1]
    
    train_x = np.zeros((num_features))
    for j in range(num_features):
        if j in formats.keys():
            train_x[j] = formats[j][train_raw_x[j]]
        else:
            if j == 18 and not train_raw_x[j]:              # If 'Total Charges' missing, calculate from 'tenure' and 'Monthly Charges'
                train_x[j] = train_x[4] * train_x[17]  
            else:
                train_x[j] = eval(train_raw_x[j])
    
    train_y = 1 if train_raw_y == 'Yes' else 0
    
    
    train_X[i], train_Y[i] = train_x, train_y

for i in range(N_test):
    test_raw_x = test_raw_data[i+1, 1:]
    
    test_x = np.zeros((num_features))
    for j in range(num_features):
        if j in formats.keys():
            test_x[j] = formats[j][test_raw_x[j]]
        else:
            if j == 18 and not test_raw_x[j]:              # If 'Total Charges' missing, calculate from 'tenure' and 'Monthly Charges'
                test_x[j] = test_x[4] * test_x[17]  
            else:
                test_x[j] = eval(test_raw_x[j])
            
    test_X[i] = test_x

In [26]:
params = \
    {
        'gamma': range(0, 10),
        'min_child_weight': range(100, 120, 2),
        'max_delta_step': np.linspace(1, 3, 10),
        'max_depth': range(1, 5),
        'subsample': [.01] + list(np.linspace(.1, 1, 10)),
        'learning_rate': [1e-5, 1e-4, 5e-4, 1e-3, 2e-3, 3e-3],
        'reg_alpha': range(3, 7),
        'reg_lambda': range(3, 7),
        'top_k': range(0, 5)
    }

train_Y = train_Y.ravel()

model = XGBRFClassifier(objective='binary:logitraw', device='cuda')

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1001)
grid_search = GridSearchCV(model, param_grid=params, scoring='roc_auc', n_jobs=-1, cv=skf.split(train_X, train_Y), verbose=3)

from datetime import datetime
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))
        
start_time = timer(None) # timing starts from this point for "start_time" variable
grid_search.fit(train_X, train_Y)
timer(start_time) # timing ends here for "start_time" variable

Fitting 5 folds for each of 21120000 candidates, totalling 105600000 fits


KeyboardInterrupt: 

In [None]:
print('\n All results:')
print(grid_search.cv_results_)
print('\n Best estimator:')
print(grid_search.best_estimator_)
print('\n Best normalized gini score for %d-fold search:' % (5))
print(grid_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(grid_search.best_params_)