## Load and Normalize Data

In [20]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from matplotlib import pyplot as plt
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [7]:
# Data file paths, relative to the notebook's working directory.
# Forward slashes are accepted on Windows as well as Linux/macOS, whereas the
# previous backslash-separated paths could only be resolved on Windows.
train_raw_path, test_raw_path = '../data/train_raw.csv', '../data/test_raw.csv'
train_path, test_path = '../data/train.csv', '../data/test.csv'

# Read every cell of the csv files as text; the values are converted to numbers later.
# (The former `unpack=True` followed by `.T` transposed the array twice, which is a no-op.)
train_raw_data = np.loadtxt(train_raw_path, dtype='str', delimiter=',')
test_raw_data = np.loadtxt(test_raw_path, dtype='str', delimiter=',')

# One row of each file is not a sample (presumably the header row -- confirm against the csv).
N_train, N_test = train_raw_data.shape[0] - 1, test_raw_data.shape[0] - 1
# Train rows carry two non-feature columns (first and last); test rows carry one (first).
num_features = train_raw_data.shape[1] - 2

# Fail early if the two files disagree on the number of feature columns;
# the encoding step fills both matrices with rows of length `num_features`.
assert test_raw_data.shape[1] - 1 == num_features, \
    "train and test files must contain the same number of feature columns"

# Initialize arrays for train and test data
train_X, train_Y = np.zeros((N_train, num_features)), np.zeros((N_train, 1))
test_X, test_Y = np.zeros((N_test, num_features)), np.zeros((N_test, 1))

In [8]:
# Integer codes for the categorical columns, keyed by feature index
# (the position of the column within a row once the first csv column is dropped).
normalizations = \
{
    0:  {'Female': 0, 'Male': 1},
    2:  {'No': 0, 'Yes': 1},
    3:  {'No': 0, 'Yes': 1},
    5:  {'No': 0, 'Yes': 1},
    6:  {'No phone service': 0, 'No': 1, 'Yes': 2},
    7:  {'No': 0, 'DSL': 1, 'Fiber optic': 2},
    8:  {'No internet service': 0, 'No': 1, 'Yes': 2},
    9:  {'No internet service': 0, 'No': 1, 'Yes': 2},
    10: {'No internet service': 0, 'No': 1, 'Yes': 2},
    11: {'No internet service': 0, 'No': 1, 'Yes': 2},
    12: {'No internet service': 0, 'No': 1, 'Yes': 2},
    13: {'No internet service': 0, 'No': 1, 'Yes': 2},
    14: {'Month-to-month': 0, 'One year': 1, 'Two year': 2},
    15: {'No': 0, 'Yes': 1},
    16: {'Mailed check': 0, 'Bank transfer (automatic)': 1, 'Electronic check': 2, 'Credit card (automatic)': 3},
}

# Feature indices involved in imputing a missing 'Total Charges' value.
TENURE_COL, MONTHLY_CHARGES_COL, TOTAL_CHARGES_COL = 4, 17, 18


def encode_features(raw_row):
    """Convert one row of raw csv strings into a numeric feature vector.

    Parameters
    ----------
    raw_row : sequence of str
        The feature cells of a single sample, indexed like `normalizations`
        (i.e. with the first csv column, and for training rows the label
        column, already removed).

    Returns
    -------
    numpy.ndarray
        Float vector of length `num_features`.

    Raises
    ------
    KeyError
        If a categorical cell holds a value absent from `normalizations`.
    ValueError
        If a numeric cell cannot be parsed as a float.
    """
    encoded = np.zeros(num_features)
    for j in range(num_features):
        raw_value = raw_row[j]
        if j in normalizations:
            encoded[j] = normalizations[j][raw_value]
        elif j == TOTAL_CHARGES_COL and not raw_value.strip():
            # If 'Total Charges' missing, calculate from 'tenure' and 'Monthly Charges'.
            # Both source columns have a lower index, so they are already encoded.
            # `.strip()` makes a whitespace-only cell count as missing as well.
            encoded[j] = encoded[TENURE_COL] * encoded[MONTHLY_CHARGES_COL]
        else:
            # NOTE(review): this used to be eval(raw_value). eval() executes
            # arbitrary expressions found in the data file; float() parses the
            # same numeric literals without that risk.
            encoded[j] = float(raw_value)
    return encoded


# Row 0 of each raw array is skipped (hence i+1); column 0 is dropped from every row.
for i in range(N_train):
    # The last training column is the label; everything between is a feature.
    train_raw_x, train_raw_y = train_raw_data[i+1, 1:-1], train_raw_data[i+1, -1]

    train_x = encode_features(train_raw_x)
    train_y = 1 if train_raw_y == 'Yes' else 0

    train_X[i], train_Y[i] = train_x, train_y

for i in range(N_test):
    # Test rows have no label column.
    test_raw_x = test_raw_data[i+1, 1:]

    test_x = encode_features(test_raw_x)

    test_X[i] = test_x

## Train

In [23]:
def cross_validation_lr(X, Y, n_splits, C=1):
    """Evaluate a logistic-regression classifier with K-fold cross-validation.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one sample per row.
    Y : numpy.ndarray
        Labels, either 1-D or a single column of shape (n_samples, 1).
    n_splits : int
        Number of folds handed to `KFold` (no shuffling is requested).
    C : float, default 1
        Value passed to `LogisticRegression(C=...)`.

    Returns
    -------
    (avg_E_train, avg_E_val) : tuple of float
        Mean of `model.score(...)` over the folds for the training and the
        validation part respectively. Despite the `E_` prefix these are
        scores (higher is better), not error rates.
    """
    kf = KFold(n_splits=n_splits)

    # scikit-learn expects 1-D labels; a column vector only works through an
    # implicit conversion that raises a DataConversionWarning.
    Y = np.ravel(Y)

    E_trains, E_vals = [], []
    for train_index, val_index in kf.split(X):
        X_train, Y_train = X[train_index], Y[train_index]
        X_val, Y_val = X[val_index], Y[val_index]

        model = LogisticRegression(C=C)
        model.fit(X_train, Y_train)

        E_trains.append(model.score(X_train, Y_train))
        E_vals.append(model.score(X_val, Y_val))

    avg_E_train, avg_E_val = np.mean(E_trains), np.mean(E_vals)

    return avg_E_train, avg_E_val

# Results of the grid search over the number of folds (k) and the value of C.
#   k_errors[k] -> scores for every C tried with k folds
#   C_errors[C] -> scores for every k tried with that C
# Each entry has the form {'train': [...], 'val': [...]}.
k_errors = {}
C_errors = {}

# Same grid as range(0, 1001, 10), with the leading 0 replaced by 1 up front.
# Remapping C *before* it is used as a dictionary key avoids the KeyError the
# previous version hit (it created C_errors[0] and then appended to C_errors[1]).
C_values = [C if C != 0 else 1 for C in range(0, 1001, 10)]

for k in range(2, 20):
    k_errors[k] = {'train': [], 'val': []}
    print(f"k = {k}")
    for C in tqdm(C_values):
        # setdefault keeps the results of earlier k values instead of
        # resetting the entry on every pass of the outer loop.
        C_errors.setdefault(C, {'train': [], 'val': []})

        E_train, E_val = cross_validation_lr(train_X, train_Y, n_splits=k, C=C)

        k_errors[k]['train'].append(E_train)
        k_errors[k]['val'].append(E_val)
        C_errors[C]['train'].append(E_train)
        C_errors[C]['val'].append(E_val)

        print(f"k = {k}, C = {C}:   \t ({round(E_train, 4)}, {round(E_val, 4)})")

k = 2, C = 1:   	 (0.8042, 0.8027)
k = 2, C = 10:   	 (0.8048, 0.8031)
k = 2, C = 20:   	 (0.8046, 0.8042)
k = 2, C = 30:   	 (0.805, 0.8046)
k = 2, C = 40:   	 (0.8039, 0.8048)
k = 2, C = 50:   	 (0.8022, 0.8031)
k = 2, C = 60:   	 (0.8027, 0.8016)
k = 2, C = 70:   	 (0.8076, 0.8033)
k = 2, C = 80:   	 (0.804, 0.8025)
k = 2, C = 90:   	 (0.8035, 0.8024)
k = 2, C = 100:   	 (0.804, 0.8037)
k = 2, C = 110:   	 (0.8044, 0.8046)
k = 2, C = 120:   	 (0.8031, 0.8031)
k = 2, C = 130:   	 (0.8046, 0.8014)
k = 2, C = 140:   	 (0.8029, 0.8022)
k = 2, C = 150:   	 (0.8035, 0.8037)
k = 2, C = 160:   	 (0.805, 0.8037)
k = 2, C = 170:   	 (0.802, 0.802)
k = 2, C = 180:   	 (0.8029, 0.8024)
k = 2, C = 190:   	 (0.8063, 0.8035)
k = 2, C = 200:   	 (0.8037, 0.802)
k = 2, C = 210:   	 (0.8046, 0.804)
k = 2, C = 220:   	 (0.8061, 0.8037)
k = 2, C = 230:   	 (0.8057, 0.8042)
k = 2, C = 240:   	 (0.805, 0.8048)
k = 2, C = 250:   	 (0.805, 0.8033)
k = 2, C = 260:   	 (0.8031, 0.8031)
k = 2, C = 270:   	 (0

KeyboardInterrupt: 