## Setup

In [17]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from matplotlib import pyplot as plt
import xgboost

import warnings
warnings.filterwarnings("ignore") 

# Locations of the raw and processed CSV files, relative to this notebook.
train_raw_path = r'../data/train_raw.csv'
test_raw_path = r'../data/test_raw.csv'
train_path = r'../data/train.csv'
test_path = r'../data/test.csv'

# Read both raw files as 2-D string arrays (one row per CSV line, one column per field).
train_raw_data = np.loadtxt(train_raw_path, dtype='str', delimiter=',')
test_raw_data = np.loadtxt(test_raw_path, dtype='str', delimiter=',')

# Sample counts: one less than the number of rows read, because the
# preprocessing cells start at row 1 (row 0 is presumably the CSV header).
N_train = train_raw_data.shape[0] - 1
N_test = test_raw_data.shape[0] - 1

# Category -> integer code tables. Keys are positions in a raw row AFTER its
# first column has been sliced off; any position without an entry is parsed
# as a number by the preprocessing cells.
_yes_no = {'No': 0, 'Yes': 1}
_internet_addon = {'No internet service': 0, 'No': 1, 'Yes': 2}

normalizations = {0: {'Female': 0, 'Male': 1}}
for _col in (2, 3, 5):
    normalizations[_col] = dict(_yes_no)
normalizations[6] = {'No phone service': 0, 'No': 1, 'Yes': 2}
normalizations[7] = {'No': 0, 'DSL': 1, 'Fiber optic': 2}
for _col in range(8, 14):
    normalizations[_col] = dict(_internet_addon)
normalizations[14] = {'Month-to-month': 0, 'One year': 1, 'Two year': 2}
normalizations[15] = dict(_yes_no)
normalizations[16] = {
    'Mailed check': 0,
    'Bank transfer (automatic)': 1,
    'Electronic check': 2,
    'Credit card (automatic)': 3,
}

## All Features

In [18]:
# The first raw column is skipped and the last training column is the label,
# so the feature count is the raw training width minus two.
num_features = train_raw_data.shape[1] - 2

# Initialize arrays for train and test data
train_X, train_Y = np.zeros((N_train, train_raw_data.shape[1] - 2)), np.zeros((N_train, 1))
test_X, test_Y = np.zeros((N_test, test_raw_data.shape[1] - 1)), np.zeros((N_test, 1))

# Feature positions used to fill in a missing 'Total Charges' value
# as 'tenure' * 'Monthly Charges'.
tenure_idx, monthly_idx, total_idx = 4, 17, 18

# Train and test rows are encoded by the same loop. Row 0 of each raw array is
# skipped (presumably the CSV header); the test file has no trailing label column.
feature_sets = (
    (train_raw_data[1:, 1:-1], train_X),
    (test_raw_data[1:, 1:], test_X),
)
for raw_rows, encoded in feature_sets:
    for i, raw_x in enumerate(raw_rows):
        x = np.zeros(num_features)
        for j in range(num_features):
            if j in normalizations:
                # Categorical column: look up its integer code.
                x[j] = normalizations[j][raw_x[j]]
            elif j == total_idx and not raw_x[j]:
                # If 'Total Charges' missing, calculate from 'tenure' and 'Monthly Charges'
                # (both already filled in because they sit at lower positions).
                x[j] = x[tenure_idx] * x[monthly_idx]
            else:
                # Numeric column. float() is used instead of eval() so that text
                # coming from the data file is never executed as Python code.
                x[j] = float(raw_x[j])
        encoded[i] = x

# Label is 1 for 'Yes' and 0 for anything else.
train_Y[:, 0] = train_raw_data[1:, -1] == 'Yes'

def cross_validation_lr(X, Y, n_splits, C=1):
    """Run K-fold cross-validation with an XGBoost random-forest classifier.

    Despite the ``_lr`` suffix, the model fitted on each fold is
    ``xgboost.XGBRFClassifier(objective='binary:logistic')``.

    Parameters
    ----------
    X : array indexable by integer index arrays
        Feature rows.
    Y : array indexable by integer index arrays
        Labels, row-aligned with ``X``.
    n_splits : int
        Number of folds handed to ``KFold`` (no shuffling is requested).
    C : optional
        Accepted but not read anywhere in this function.

    Returns
    -------
    tuple
        ``(mean training score, mean validation score)`` across the folds,
        where each score is whatever ``model.score`` returns
        (callers in this notebook store it as an accuracy).
    """
    splitter = KFold(n_splits=n_splits)

    train_scores = []
    val_scores = []
    for fit_idx, holdout_idx in splitter.split(X):
        X_fit, Y_fit = X[fit_idx], Y[fit_idx]
        X_holdout, Y_holdout = X[holdout_idx], Y[holdout_idx]

        # A fresh model per fold, so no state leaks between folds.
        clf = xgboost.XGBRFClassifier(objective='binary:logistic')
        clf.fit(X_fit, Y_fit)

        train_scores.append(clf.score(X_fit, Y_fit))
        val_scores.append(clf.score(X_holdout, Y_holdout))

    return np.mean(train_scores), np.mean(val_scores)

# 5-fold cross-validation on the full feature set.
# NOTE(review): C=400 is passed through, but cross_validation_lr never reads C.
E_train, E_val = cross_validation_lr(train_X, train_Y, C=400, n_splits=5)

# Remember these scores under the "default" names.
default_train_acc, default_val_acc = E_train, E_val

print(f"All Features:   \t ({round(E_train, 4)}, {round(E_val, 4)})")

All Features:   	 (0.821, 0.7958)


## Removing 3 Features

In [19]:
# Category -> integer code tables, re-keyed to the column positions that remain
# after the columns in `dropped_cols` (below) are removed from each raw row.
normalizations = \
{
    0:  {'Female': 0, 'Male': 1},
    1:  {'No': 0, 'Yes': 1},
    2:  {'No': 0, 'Yes': 1},
    4:  {'No phone service': 0, 'No': 1, 'Yes': 2},
    5:  {'No': 0, 'DSL': 1, 'Fiber optic': 2},
    6:  {'No internet service': 0, 'No': 1, 'Yes': 2},
    7:  {'No internet service': 0, 'No': 1, 'Yes': 2},
    8: {'No internet service': 0, 'No': 1, 'Yes': 2},
    9: {'No internet service': 0, 'No': 1, 'Yes': 2},
    10: {'No internet service': 0, 'No': 1, 'Yes': 2},
    11: {'Month-to-month': 0, 'One year': 1, 'Two year': 2},
    12: {'No': 0, 'Yes': 1},
    13: {'Mailed check': 0, 'Bank transfer (automatic)': 1, 'Electronic check': 2, 'Credit card (automatic)': 3},
}

# Raw training width minus the skipped first column, the label column and the
# three dropped features.
num_features = train_raw_data.shape[1] - 5

# Initialize arrays for train and test data
train_X, train_Y = np.zeros((N_train, train_raw_data.shape[1] - 5)), np.zeros((N_train, 1))
test_X, test_Y = np.zeros((N_test, test_raw_data.shape[1] - 4)), np.zeros((N_test, 1))

# Positions (within a raw row after its first column is sliced off) to discard.
dropped_cols = [1, 5, 13]

# Positions AFTER the deletion. In the full feature layout 'tenure',
# 'Monthly Charges' and 'Total Charges' sit at 4, 17 and 18; dropping
# position 1 moves 'tenure' to 3, and dropping 1, 5 and 13 moves the two
# charge columns to 14 and 15. (Reading position 4 here would pick up the
# encoded category at key 4 instead of 'tenure'.)
tenure_idx, monthly_idx, total_idx = 3, 14, 15

# Train and test rows are encoded by the same loop. Row 0 of each raw array is
# skipped (presumably the CSV header); the test file has no trailing label column.
feature_sets = (
    (np.delete(train_raw_data[1:, 1:-1], dropped_cols, axis=1), train_X),
    (np.delete(test_raw_data[1:, 1:], dropped_cols, axis=1), test_X),
)
for raw_rows, encoded in feature_sets:
    for i, raw_x in enumerate(raw_rows):
        x = np.zeros(num_features)
        for j in range(num_features):
            if j in normalizations:
                # Categorical column: look up its integer code.
                x[j] = normalizations[j][raw_x[j]]
            elif j == total_idx and not raw_x[j]:
                # If 'Total Charges' missing, calculate from 'tenure' and 'Monthly Charges'
                x[j] = x[tenure_idx] * x[monthly_idx]
            else:
                # Numeric column. float() is used instead of eval() so that text
                # coming from the data file is never executed as Python code.
                x[j] = float(raw_x[j])
        encoded[i] = x

# Label is 1 for 'Yes' and 0 for anything else.
train_Y[:, 0] = train_raw_data[1:, -1] == 'Yes'

def cross_validation_lr(X, Y, n_splits, C=1):
    """Run K-fold cross-validation with an XGBoost random-forest classifier.

    Despite the ``_lr`` suffix, the model fitted on each fold is
    ``xgboost.XGBRFClassifier(objective='binary:logistic')``.

    Parameters
    ----------
    X : array indexable by integer index arrays
        Feature rows.
    Y : array indexable by integer index arrays
        Labels, row-aligned with ``X``.
    n_splits : int
        Number of folds handed to ``KFold`` (no shuffling is requested).
    C : optional
        Accepted but not read anywhere in this function.

    Returns
    -------
    tuple
        ``(mean training score, mean validation score)`` across the folds,
        where each score is whatever ``model.score`` returns
        (callers in this notebook store it as an accuracy).
    """
    splitter = KFold(n_splits=n_splits)

    train_scores = []
    val_scores = []
    for fit_idx, holdout_idx in splitter.split(X):
        X_fit, Y_fit = X[fit_idx], Y[fit_idx]
        X_holdout, Y_holdout = X[holdout_idx], Y[holdout_idx]

        # A fresh model per fold, so no state leaks between folds.
        clf = xgboost.XGBRFClassifier(objective='binary:logistic')
        clf.fit(X_fit, Y_fit)

        train_scores.append(clf.score(X_fit, Y_fit))
        val_scores.append(clf.score(X_holdout, Y_holdout))

    return np.mean(train_scores), np.mean(val_scores)

# 5-fold cross-validation on the reduced (three columns dropped) feature set.
# NOTE(review): C=400 is passed through, but cross_validation_lr never reads C.
E_train, E_val = cross_validation_lr(train_X, train_Y, C=400, n_splits=5)

# Overwrite the "default" scores with this run's results.
default_train_acc, default_val_acc = E_train, E_val

print(f"Removing 3 Features:   \t ({round(E_train, 4)}, {round(E_val, 4)})")

Removing 3 Features:   	 (0.8204, 0.7977)


## Removing 8 Features

In [20]:
# Category -> integer code tables, re-keyed to the column positions that remain
# after the columns in `dropped_cols` (below) are removed from each raw row.
normalizations = \
{
    0:  {'Female': 0, 'Male': 1},
    1:  {'No': 0, 'Yes': 1},
    2:  {'No': 0, 'Yes': 1},
    3:  {'No': 0, 'DSL': 1, 'Fiber optic': 2},
    4:  {'No internet service': 0, 'No': 1, 'Yes': 2},
    5:  {'No internet service': 0, 'No': 1, 'Yes': 2},
    6: {'No internet service': 0, 'No': 1, 'Yes': 2},
    7: {'No internet service': 0, 'No': 1, 'Yes': 2},
    8: {'Mailed check': 0, 'Bank transfer (automatic)': 1, 'Electronic check': 2, 'Credit card (automatic)': 3},
}

# Raw training width minus the skipped first column, the label column and the
# eight dropped features.
num_features = train_raw_data.shape[1] - 10

# Initialize arrays for train and test data
train_X, train_Y = np.zeros((N_train, train_raw_data.shape[1] - 10)), np.zeros((N_train, 1))
test_X, test_Y = np.zeros((N_test, test_raw_data.shape[1] - 9)), np.zeros((N_test, 1))

# Positions (within a raw row after its first column is sliced off) to discard.
dropped_cols = [1, 4, 5, 6, 11, 13, 14, 15]

# Position of 'Total Charges' after the deletion.
total_idx = 10

# Train and test rows are encoded by the same loop. Row 0 of each raw array is
# skipped (presumably the CSV header); the test file has no trailing label column.
feature_sets = (
    (np.delete(train_raw_data[1:, 1:-1], dropped_cols, axis=1), train_X),
    (np.delete(test_raw_data[1:, 1:], dropped_cols, axis=1), test_X),
)
for raw_rows, encoded in feature_sets:
    for i, raw_x in enumerate(raw_rows):
        x = np.zeros(num_features)
        for j in range(num_features):
            if j in normalizations:
                # Categorical column: look up its integer code.
                x[j] = normalizations[j][raw_x[j]]
            elif j == total_idx and not raw_x[j]:
                # If 'Total Charges' missing, just set to 0
                x[j] = 0
            else:
                # Numeric column. float() is used instead of eval() so that text
                # coming from the data file is never executed as Python code.
                x[j] = float(raw_x[j])
        encoded[i] = x

# Label is 1 for 'Yes' and 0 for anything else.
train_Y[:, 0] = train_raw_data[1:, -1] == 'Yes'

def cross_validation_lr(X, Y, n_splits, C=1):
    """Run K-fold cross-validation with an XGBoost random-forest classifier.

    Despite the ``_lr`` suffix, the model fitted on each fold is
    ``xgboost.XGBRFClassifier(objective='binary:logistic')``.

    Parameters
    ----------
    X : array indexable by integer index arrays
        Feature rows.
    Y : array indexable by integer index arrays
        Labels, row-aligned with ``X``.
    n_splits : int
        Number of folds handed to ``KFold`` (no shuffling is requested).
    C : optional
        Accepted but not read anywhere in this function.

    Returns
    -------
    tuple
        ``(mean training score, mean validation score)`` across the folds,
        where each score is whatever ``model.score`` returns
        (callers in this notebook store it as an accuracy).
    """
    splitter = KFold(n_splits=n_splits)

    train_scores = []
    val_scores = []
    for fit_idx, holdout_idx in splitter.split(X):
        X_fit, Y_fit = X[fit_idx], Y[fit_idx]
        X_holdout, Y_holdout = X[holdout_idx], Y[holdout_idx]

        # A fresh model per fold, so no state leaks between folds.
        clf = xgboost.XGBRFClassifier(objective='binary:logistic')
        clf.fit(X_fit, Y_fit)

        train_scores.append(clf.score(X_fit, Y_fit))
        val_scores.append(clf.score(X_holdout, Y_holdout))

    return np.mean(train_scores), np.mean(val_scores)

# 5-fold cross-validation on the feature set with eight columns dropped.
# NOTE(review): C=400 is passed through, but cross_validation_lr never reads C.
E_train, E_val = cross_validation_lr(train_X, train_Y, n_splits=5, C=400)
default_train_acc = E_train
default_val_acc = E_val
# The label names this section's experiment (it previously repeated the
# "Removing 3 Features" label from the cell it was copied from).
print(f"Removing 8 Features:   \t ({round(E_train, 4)}, {round(E_val, 4)})")

Removing 3 Features:   	 (0.819, 0.7949)
