In [3]:
import numpy as np
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import KFold
from matplotlib import pyplot as plt
from tqdm import tqdm
import pandas as pd
from xgboost import XGBClassifier, XGBRegressor, XGBRFClassifier, XGBRFRegressor
from copy import deepcopy

import warnings
warnings.filterwarnings("ignore")

In [4]:
# Data file paths. Forward slashes are used because they resolve on Windows
# as well as on POSIX systems, unlike hard-coded backslashes.
train_raw_path, test_raw_path = '../data/train_raw.csv', '../data/test_raw.csv'
train_path, test_path = '../data/train.csv', '../data/test.csv'

# Read each CSV as a 2-D array of strings (rows = file lines, columns = fields).
# Row 0 is treated as a header row by everything below.
train_raw_data = np.loadtxt(train_raw_path, dtype='str', delimiter=',')
test_raw_data = np.loadtxt(test_raw_path, dtype='str', delimiter=',')

# Sample counts exclude the header row.
N_train, N_test = train_raw_data.shape[0] - 1, test_raw_data.shape[0] - 1
# The train file carries two non-feature columns (the first and the last one).
num_features = train_raw_data.shape[1] - 2

# Pre-allocate the numeric arrays that the encoding step fills in.
# train_Y is kept as a column vector of shape (N_train, 1).
train_X, train_Y = np.zeros((N_train, num_features)), np.zeros((N_train, 1))
# The test file carries one non-feature column (the first one).
test_X = np.zeros((N_test, test_raw_data.shape[1] - 1))

In [5]:
# Integer codes for the categorical columns, keyed by feature index
# (index 0 is the first column after the leading ID column is dropped).
formats = \
{ 
    0:  {'Female': 0, 'Male': 1},
    2:  {'No': 0, 'Yes': 1},
    3:  {'No': 0, 'Yes': 1},
    5:  {'No': 0, 'Yes': 1},
    6:  {'No phone service': 0, 'No': 1, 'Yes': 2},
    7:  {'No': 0, 'DSL': 1, 'Fiber optic': 2},
    8:  {'No internet service': 0, 'No': 1, 'Yes': 2},
    9:  {'No internet service': 0, 'No': 1, 'Yes': 2},
    10: {'No internet service': 0, 'No': 1, 'Yes': 2},
    11: {'No internet service': 0, 'No': 1, 'Yes': 2},
    12: {'No internet service': 0, 'No': 1, 'Yes': 2},
    13: {'No internet service': 0, 'No': 1, 'Yes': 2},
    14: {'Month-to-month': 0, 'One year': 1, 'Two year': 2},
    15: {'No': 0, 'Yes': 1},
    16: {'Mailed check': 0, 'Bank transfer (automatic)': 1, 'Electronic check': 2, 'Credit card (automatic)': 3},
}

# Feature indices needed to rebuild a missing 'Total Charges' value
# from 'tenure' and 'Monthly Charges'.
TENURE_IDX, MONTHLY_CHARGES_IDX, TOTAL_CHARGES_IDX = 4, 17, 18


def encode_row(raw_row):
    """Convert one row of raw string features into a numeric feature vector.

    Parameters
    ----------
    raw_row : sequence of str
        The feature cells of a single sample (ID and label columns removed).
        Only the first ``num_features`` entries are read.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``num_features``.

    Raises
    ------
    KeyError
        If a categorical cell holds a value that is not listed in ``formats``.
    ValueError
        If a numeric cell cannot be parsed as a float.
    """
    encoded = np.zeros(num_features)
    for j in range(num_features):
        cell = raw_row[j]
        if j in formats:
            encoded[j] = formats[j][cell]
        elif j == TOTAL_CHARGES_IDX and not cell.strip():
            # 'Total Charges' is missing (empty or whitespace-only): rebuild it
            # from the already-encoded 'tenure' and 'Monthly Charges' values.
            encoded[j] = encoded[TENURE_IDX] * encoded[MONTHLY_CHARGES_IDX]
        else:
            # float() parses the numeric text without executing it, unlike eval().
            encoded[j] = float(cell)
    return encoded


# Row 0 of each raw array is the header, hence the i + 1 offsets.
for i in range(N_train):
    # Train rows: first column is skipped, last column is the 'Yes'/'No' label.
    train_X[i] = encode_row(train_raw_data[i + 1, 1:-1])
    train_Y[i] = 1 if train_raw_data[i + 1, -1] == 'Yes' else 0

for i in range(N_test):
    # Test rows: first column is skipped and there is no label column.
    test_X[i] = encode_row(test_raw_data[i + 1, 1:])

In [6]:
def score(model, X, Y, threshold=.5):
    """Return the accuracy of ``model`` on ``(X, Y)`` after thresholding its predictions.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X : features passed unchanged to ``model.predict``.
    Y : reference labels passed unchanged to ``accuracy_score``.
    threshold : float
        Predictions strictly greater than this value become 1, all others 0.

    Returns
    -------
    The value returned by ``accuracy_score`` for the binarised predictions.
    """
    above_threshold = model.predict(X) > threshold
    return accuracy_score(Y, above_threshold.astype(int))

def cross_validation_xgb(X, Y, n_splits=5, params=None):
    """Run K-fold cross-validation of an ``XGBClassifier`` and average the accuracies.

    Parameters
    ----------
    X, Y : arrays that can be indexed with the integer index arrays produced
        by ``KFold.split`` (features and labels).
    n_splits : int
        Number of folds given to ``KFold``.
    params : dict or None
        Hyper-parameter overrides. Only the keys 'gamma', 'min_child_weight',
        'max_delta_step', 'subsample' and 'max_depth' are applied; any other
        key is ignored.

    Returns
    -------
    tuple
        ``(mean train accuracy, mean validation accuracy)`` over the folds,
        each accuracy being computed by ``score``.
    """
    tunable = ('gamma', 'min_child_weight', 'max_delta_step', 'subsample', 'max_depth')
    overrides = {}
    if params:
        overrides = {key: params[key] for key in params.keys() if key in tunable}

    fit_scores, holdout_scores = [], []
    for fit_idx, holdout_idx in KFold(n_splits=n_splits).split(X):
        X_fit, Y_fit = X[fit_idx], Y[fit_idx]
        X_holdout, Y_holdout = X[holdout_idx], Y[holdout_idx]

        # A fresh estimator per fold so that no state carries over between folds.
        estimator = XGBClassifier()
        if overrides:
            estimator.set_params(**overrides)
        estimator.fit(X_fit, Y_fit)

        fit_scores.append(score(estimator, X_fit, Y_fit))
        holdout_scores.append(score(estimator, X_holdout, Y_holdout))

    return np.mean(fit_scores), np.mean(holdout_scores)

def plot_data(vals, accs, param, model_type):
    """Plot train and validation accuracy against a swept parameter and print the best point.

    Parameters
    ----------
    vals : sliceable sequence of the swept parameter values (x axis).
    accs : dict with keys 'train' and 'val', each a sequence of accuracies
        aligned with ``vals``.
    param : str, name of the swept parameter (axis label and title).
    model_type : str, model name shown in the title.
    """
    plt.figure()

    # Train curve first, then validation, matching the legend order below.
    for split in ('train', 'val'):
        plt.plot(vals, accs[split], marker='.', linewidth=1)

    plt.title(f'Train and Validation Accuracy vs {param} for {model_type}')
    plt.xlabel(param)
    plt.ylabel('Accuracy')
    plt.legend(['Train Accuracy', 'Validation Accuracy'], loc='best')
    # Only every fifth swept value gets a tick.
    plt.xticks(vals[::5])

    plt.show()

    # Index of the first maximum of the validation curve.
    best = np.argmax(accs['val'])
    best_acc, best_val = round(accs['val'][best], 4), vals[best]
    print(f"Highest val_accuracy = {best_acc} with {param} = {best_val}")

def run_experiment(model_name, param, param_vals):
    """Sweep one hyper-parameter with 5-fold cross-validation and plot the outcome.

    Uses whichever ``cross_validation_xgb`` is currently defined in the
    notebook, evaluated on the module-level ``train_X`` / ``train_Y``.

    Parameters
    ----------
    model_name : str, label forwarded to ``plot_data`` for the figure title.
    param : str, name of the hyper-parameter being swept.
    param_vals : sequence of values to try for ``param``.
    """
    history = {'train': [], 'val': []}

    for candidate in tqdm(param_vals):
        fit_acc, holdout_acc = cross_validation_xgb(
            train_X, train_Y, n_splits=5, params={param: candidate}
        )
        history['train'].append(fit_acc)
        history['val'].append(holdout_acc)

    plot_data(param_vals, history, param, model_name)

In [7]:
# run_experiment('XGBClassifier', 'gamma', range(0, 30))
# run_experiment('XGBClassifier', 'min_child_weight', range(74, 150, 2))
# run_experiment('XGBClassifier', 'max_delta_step', np.linspace(1, 3, 40))
# run_experiment('XGBClassifier', 'max_depth', range(1, 20))
# run_experiment('XGBClassifier', 'subsample', [.01] + list(np.linspace(.1, 1, 10)))

In [8]:
def cross_validation_xgb(X, Y, n_splits=5, params=None):
    """Run K-fold cross-validation of an ``XGBRegressor`` and average the accuracies.

    The regressor's outputs are turned into class labels inside ``score``
    before the accuracy is computed.

    Parameters
    ----------
    X, Y : arrays that can be indexed with the integer index arrays produced
        by ``KFold.split`` (features and labels).
    n_splits : int
        Number of folds given to ``KFold``.
    params : dict or None
        Hyper-parameter overrides. Only the keys 'gamma', 'min_child_weight',
        'max_delta_step', 'subsample' and 'max_depth' are applied; any other
        key is ignored.

    Returns
    -------
    tuple
        ``(mean train accuracy, mean validation accuracy)`` over the folds.
    """
    tunable = ('gamma', 'min_child_weight', 'max_delta_step', 'subsample', 'max_depth')
    overrides = {}
    if params:
        overrides = {key: params[key] for key in params.keys() if key in tunable}

    fit_scores, holdout_scores = [], []
    for fit_idx, holdout_idx in KFold(n_splits=n_splits).split(X):
        X_fit, Y_fit = X[fit_idx], Y[fit_idx]
        X_holdout, Y_holdout = X[holdout_idx], Y[holdout_idx]

        # A fresh estimator per fold so that no state carries over between folds.
        estimator = XGBRegressor()
        if overrides:
            estimator.set_params(**overrides)
        estimator.fit(X_fit, Y_fit)

        fit_scores.append(score(estimator, X_fit, Y_fit))
        holdout_scores.append(score(estimator, X_holdout, Y_holdout))

    return np.mean(fit_scores), np.mean(holdout_scores)

# run_experiment('XGBRegressor', 'gamma', range(0, 30))
# run_experiment('XGBRegressor', 'min_child_weight', range(74, 150, 2))
# run_experiment('XGBRegressor', 'max_depth', range(1, 20))

In [9]:
def cross_validation_xgb(X, Y, n_splits=5, params=None):
    """Run K-fold cross-validation of an ``XGBRFClassifier`` and average the accuracies.

    Parameters
    ----------
    X, Y : arrays that can be indexed with the integer index arrays produced
        by ``KFold.split`` (features and labels).
    n_splits : int
        Number of folds given to ``KFold``.
    params : dict or None
        Hyper-parameter overrides. Only the keys 'gamma', 'min_child_weight',
        'max_delta_step', 'subsample' and 'max_depth' are applied; any other
        key is ignored.

    Returns
    -------
    tuple
        ``(mean train accuracy, mean validation accuracy)`` over the folds,
        each accuracy being computed by ``score``.
    """
    tunable = ('gamma', 'min_child_weight', 'max_delta_step', 'subsample', 'max_depth')
    overrides = {}
    if params:
        overrides = {key: params[key] for key in params.keys() if key in tunable}

    fit_scores, holdout_scores = [], []
    for fit_idx, holdout_idx in KFold(n_splits=n_splits).split(X):
        X_fit, Y_fit = X[fit_idx], Y[fit_idx]
        X_holdout, Y_holdout = X[holdout_idx], Y[holdout_idx]

        # A fresh estimator per fold so that no state carries over between folds.
        estimator = XGBRFClassifier()
        if overrides:
            estimator.set_params(**overrides)
        estimator.fit(X_fit, Y_fit)

        fit_scores.append(score(estimator, X_fit, Y_fit))
        holdout_scores.append(score(estimator, X_holdout, Y_holdout))

    return np.mean(fit_scores), np.mean(holdout_scores)

# run_experiment('XGBRFClassifier', 'gamma', range(0, 30))
# run_experiment('XGBRFClassifier', 'min_child_weight', range(74, 150, 2))
# run_experiment('XGBRFClassifier', 'max_delta_step', np.linspace(1, 3, 40))
# run_experiment('XGBRFClassifier', 'max_depth', range(1, 20))

In [10]:
def cross_validation_xgb(X, Y, n_splits=5, params=None):
    """Run K-fold cross-validation of an ``XGBRFRegressor`` and average the accuracies.

    The regressor's outputs are turned into class labels inside ``score``
    before the accuracy is computed.

    Parameters
    ----------
    X, Y : arrays that can be indexed with the integer index arrays produced
        by ``KFold.split`` (features and labels).
    n_splits : int
        Number of folds given to ``KFold``.
    params : dict or None
        Hyper-parameter overrides. Only the keys 'gamma', 'min_child_weight',
        'max_delta_step', 'subsample' and 'max_depth' are applied; any other
        key is ignored.

    Returns
    -------
    tuple
        ``(mean train accuracy, mean validation accuracy)`` over the folds.
    """
    tunable = ('gamma', 'min_child_weight', 'max_delta_step', 'subsample', 'max_depth')
    overrides = {}
    if params:
        overrides = {key: params[key] for key in params.keys() if key in tunable}

    fit_scores, holdout_scores = [], []
    for fit_idx, holdout_idx in KFold(n_splits=n_splits).split(X):
        X_fit, Y_fit = X[fit_idx], Y[fit_idx]
        X_holdout, Y_holdout = X[holdout_idx], Y[holdout_idx]

        # A fresh estimator per fold so that no state carries over between folds.
        estimator = XGBRFRegressor()
        if overrides:
            estimator.set_params(**overrides)
        estimator.fit(X_fit, Y_fit)

        fit_scores.append(score(estimator, X_fit, Y_fit))
        holdout_scores.append(score(estimator, X_holdout, Y_holdout))

    return np.mean(fit_scores), np.mean(holdout_scores)

# run_experiment('XGBRFRegressor', 'gamma', range(0, 30))
# run_experiment('XGBRFRegressor', 'min_child_weight', range(74, 150, 2))
# run_experiment('XGBRFRegressor', 'max_depth', range(1, 20))
# run_experiment('XGBClassifier', 'subsample', [.01] + list(np.linspace(.1, 1, 10)))

In [11]:
def score(model, X, Y, threshold=.5):
    """Return the accuracy of ``model`` on ``(X, Y)`` after thresholding its predictions.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X : features passed unchanged to ``model.predict``.
    Y : reference labels passed unchanged to ``accuracy_score``.
    threshold : float
        Predictions strictly greater than this value become 1, all others 0.

    Returns
    -------
    The value returned by ``accuracy_score`` for the binarised predictions.
    """
    above_threshold = model.predict(X) > threshold
    return accuracy_score(Y, above_threshold.astype(int))

def cross_validation_xgb(X, Y, n_splits, params=None):
    """Run K-fold cross-validation of an ``XGBClassifier`` and average the accuracies.

    Parameters
    ----------
    X, Y : arrays that can be indexed with the integer index arrays produced
        by ``KFold.split`` (features and labels).
    n_splits : int
        Number of folds given to ``KFold`` (required in this version).
    params : dict or None
        Hyper-parameter overrides. Only the keys 'gamma', 'min_child_weight',
        'max_delta_step', 'subsample' and 'max_depth' are applied; any other
        key is ignored.

    Returns
    -------
    tuple
        ``(mean train accuracy, mean validation accuracy)`` over the folds,
        each accuracy being computed by ``score``.
    """
    tunable = ('gamma', 'min_child_weight', 'max_delta_step', 'subsample', 'max_depth')
    overrides = {}
    if params:
        overrides = {key: params[key] for key in params.keys() if key in tunable}

    fit_scores, holdout_scores = [], []
    for fit_idx, holdout_idx in KFold(n_splits=n_splits).split(X):
        X_fit, Y_fit = X[fit_idx], Y[fit_idx]
        X_holdout, Y_holdout = X[holdout_idx], Y[holdout_idx]

        # A fresh estimator per fold so that no state carries over between folds.
        estimator = XGBClassifier()
        if overrides:
            estimator.set_params(**overrides)
        estimator.fit(X_fit, Y_fit)

        fit_scores.append(score(estimator, X_fit, Y_fit))
        holdout_scores.append(score(estimator, X_holdout, Y_holdout))

    return np.mean(fit_scores), np.mean(holdout_scores)

# Cross-validate an XGBClassifier with several hyper-parameters set at once.
# NOTE(review): these values presumably come from the single-parameter sweeps
# in the cells above; confirm before relying on them.
n_splits = 5
params = dict(
    gamma=6,
    min_child_weight=116,
    max_delta_step=2.0769,
    max_depth=4,
)

train_acc, val_acc = cross_validation_xgb(train_X, train_Y, n_splits, params=params)

print(train_acc, val_acc)

0.8083005470312813 0.8042277602015254


In [24]:
from sklearn.model_selection import StratifiedKFold
model = XGBRFClassifier(objective='binary:logistic', device='cuda')

# 5 folds repeated 3 times gives 15 scores; random_state pins the shuffling.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1001)

# scoring='roc_auc' means each entry of n_scores is a ROC AUC, not an accuracy.
n_scores = cross_val_score(model, train_X, train_Y, scoring='roc_auc', cv=cv, n_jobs=-1)
# Report mean and standard deviation under the metric's real name.
print('ROC AUC: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

Accuracy: 0.838 (0.009)


In [27]:
# Fit on every training row, then print the estimator's own score on those
# same rows (a training-set figure, not a held-out estimate).
model = XGBRFClassifier(objective='binary:logistic', device='cuda').fit(train_X, train_Y)
train_score = model.score(train_X, train_Y)
print(train_score)

0.8203256597417181


In [32]:
# Fit on every training row and inspect what the model outputs for them.
model = XGBRFClassifier(objective='binary:logistic', device='cuda').fit(train_X, train_Y)

# Estimator's own score on the training rows.
print(model.score(train_X, train_Y))

# Second column of predict_proba (presumably the probability of class 1 -- TODO confirm).
second_column_proba = model.predict_proba(train_X)[:, 1]
print(second_column_proba)

# Hard label predictions for the same rows.
predicted_labels = model.predict(train_X)
print(predicted_labels)

0.8203256597417181
[0.8319212  0.14630349 0.18248063 ... 0.5132345  0.0959857  0.77113104]
[1 0 0 ... 1 0 1]


def generate_submission(file_path, customer_IDs, test_Y):
    """Write predictions to ``file_path`` as a two-column comma-separated text file.

    The file starts with the header line ``ID, TARGET`` followed by one
    ``<id>, <prediction>`` line per customer. No newline is written after the
    last line.

    Parameters
    ----------
    file_path : str
        Destination path; an existing file is overwritten.
    customer_IDs : sequence
        Identifiers, one per output row.
    test_Y : indexable
        Predictions, read by position for each entry of ``customer_IDs``.

    Raises
    ------
    IndexError
        If ``test_Y`` has fewer entries than ``customer_IDs``. The rows are
        built before the file is opened, so no partial file is written then.
    """
    rows = [
        f"{customer_id}, {test_Y[i]}"
        for i, customer_id in enumerate(customer_IDs)
    ]

    # The context manager closes the handle even if a write fails.
    with open(file_path, 'w') as submission:
        submission.write('ID, TARGET\n')
        submission.write('\n'.join(rows))

# Train the final model on every training row.
model = XGBRFClassifier(objective='binary:logistic', device='cuda')

model.fit(train_X, train_Y)

# The submission carries the second predict_proba column, not hard labels.
test_Y = model.predict_proba(test_X)[:, 1]

# Forward slashes avoid the invalid '\s' escape sequence of the previous
# literal and resolve on Windows as well as POSIX systems.
# IDs are the first column of the raw test file, header row skipped.
generate_submission('../submissions/xgbrfclf_1.csv', test_raw_data[1:, 0], test_Y)