## Load and Normalize Data

In [92]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold
from matplotlib import pyplot as plt
from tqdm import tqdm
from copy import deepcopy

import warnings
warnings.filterwarnings("ignore")

In [93]:
# Data file paths
train_raw_path, test_raw_path = r'..\data\train_raw.csv', r'..\data\test_raw.csv'
train_path, test_path = r'..\data\train.csv', r'..\data\test.csv'

# Open csv files
train_raw_data = np.loadtxt(train_raw_path, dtype='str', delimiter=',', unpack=True).T
test_raw_data = np.loadtxt(test_raw_path, dtype='str', delimiter=',', unpack=True).T

N_train, N_test = train_raw_data.shape[0] - 1, test_raw_data.shape[0] - 1
num_features = train_raw_data.shape[1] - 2

# Initialize arrays for train and test data
train_X, train_Y = np.zeros((N_train, train_raw_data.shape[1] - 2)), np.zeros((N_train, 1))
test_X, test_Y = np.zeros((N_test, test_raw_data.shape[1] - 1)), np.zeros((N_test, 1))

In [94]:
formats = \
{
    0:  {'Female': 0, 'Male': 1},
    2:  {'No': 0, 'Yes': 1},
    3:  {'No': 0, 'Yes': 1},
    5:  {'No': 0, 'Yes': 1},
    6:  {'No phone service': 0, 'No': 1, 'Yes': 2},
    7:  {'No': 0, 'DSL': 1, 'Fiber optic': 2},
    8:  {'No internet service': 0, 'No': 1, 'Yes': 2},
    9:  {'No internet service': 0, 'No': 1, 'Yes': 2},
    10: {'No internet service': 0, 'No': 1, 'Yes': 2},
    11: {'No internet service': 0, 'No': 1, 'Yes': 2},
    12: {'No internet service': 0, 'No': 1, 'Yes': 2},
    13: {'No internet service': 0, 'No': 1, 'Yes': 2},
    14: {'Month-to-month': 0, 'One year': 1, 'Two year': 2},
    15: {'No': 0, 'Yes': 1},
    16: {'Mailed check': 0, 'Bank transfer (automatic)': 1, 'Electronic check': 2, 'Credit card (automatic)': 3},
}

for i in range(N_train):
    train_raw_x, train_raw_y = train_raw_data[i+1, 1:-1], train_raw_data[i+1, -1]
    
    train_x = np.zeros((num_features))
    for j in range(num_features):
        if j in formats.keys():
            train_x[j] = formats[j][train_raw_x[j]]
        else:
            if j == 18 and not train_raw_x[j]:              # If 'Total Charges' missing, calculate from 'tenure' and 'Monthly Charges'
                train_x[j] = train_x[4] * train_x[17]  
            else:
                train_x[j] = eval(train_raw_x[j])
    
    train_y = 1 if train_raw_y == 'Yes' else 0
    
    
    train_X[i], train_Y[i] = train_x, train_y

for i in range(N_test):
    test_raw_x = test_raw_data[i+1, 1:]
    
    test_x = np.zeros((num_features))
    for j in range(num_features):
        if j in formats.keys():
            test_x[j] = formats[j][test_raw_x[j]]
        else:
            if j == 18 and not test_raw_x[j]:              # If 'Total Charges' missing, calculate from 'tenure' and 'Monthly Charges'
                test_x[j] = test_x[4] * test_x[17]  
            else:
                test_x[j] = eval(test_raw_x[j])
            
    test_X[i] = test_x

In [95]:
# for i in range(num_features):
#     min_ = np.min(train_X[:, i])
#     max_ = np.max(train_X[:, i])
    
#     train_X[:, i] = (train_X[:, i] - min_) / (max_ - min_)

# for i in range(num_features):
#     min_ = np.min(test_X[:, i])
#     max_ = np.max(test_X[:, i])
    
#     test_X[:, i] = (test_X[:, i] - min_) / (max_ - min_)

In [96]:
# for i in range(5):
#     print(train_X[i])
#     print(train_raw_data[i+1, 1:-1])

In [97]:
class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

def convert_to_dataloader(X, Y, batch_size):
    # Convert NumPy arrays to PyTorch tensors
        X_tensor = torch.tensor(X, dtype=torch.float32)
        Y_tensor = torch.tensor(Y, dtype=torch.float32)

        # Initialize your custom dataset with your data
        dataset = CustomDataset(X_tensor, Y_tensor)

        # Create DataLoader
        return DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [98]:
def cross_validation_nn(model_: nn.Sequential, X, Y, loss_fn, device, n_splits=5, n_epochs=10, batch_size=64):
    kf = KFold(n_splits=n_splits)
    
    k = 1
    train_accs, val_accs = [], []
    for train_index, val_index in kf.split(X):
        print(f"k = {k}")
        k += 1
        # K split
        X_train, Y_train = X[train_index], Y[train_index]
        X_val, Y_val = X[val_index], Y[val_index]

        # Create DataLoader
        train_loader = convert_to_dataloader(X_train, Y_train, batch_size)
        val_loader = convert_to_dataloader(X_val, Y_val, batch_size)
        
        model = nn.Sequential(
            nn.Flatten(),
            
            nn.Linear(num_features, 10),
            nn.ReLU(),
            nn.Dropout(.1),
            
            nn.Linear(10, 5),
            nn.ReLU(),
            nn.Dropout(.1),
            
            nn.Linear(5, 1),
            nn.Sigmoid()
        )
        model.to(device)
        
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

        # training_accuracy_history = np.zeros([n_epochs, 1])
        # training_loss_history = np.zeros([n_epochs, 1])
        # validation_accuracy_history = np.zeros([n_epochs, 1])
        # validation_loss_history = np.zeros([n_epochs, 1])
        
        # for epoch in range(n_epochs):
        #     print(f'Epoch {epoch+1}/10:', end='')
        #     train_total = 0
        #     train_correct = 0
            
        #     # Train
        #     model.train()
            
        #     for i, (data, target) in enumerate(train_loader):
        #         data, target = data.to(device), target.to(device)
                
        #         # Erase accumulated gradients
        #         optimizer.zero_grad()
        #         # Forward pass
        #         output = model(data)
        #         # Calculate loss
        #         loss = loss_fn(output, target)
        #         # Backward pass
        #         loss.backward()
        #         # Weight update
        #         optimizer.step()
                
        #         # track training accuracy
        #         _, predicted = torch.max(output.data, 1)
        #         train_total += target.size(0)
        #         train_correct += (predicted == target).sum().item()
        #         # track training loss
        #         training_loss_history[epoch] += loss.item()
        #         # progress update after 180 batches (~1/10 epoch for batch size 32)
        #         if i % 180 == 0: print('.',end='')
        #     training_loss_history[epoch] /= len(train_loader)
        #     training_accuracy_history[epoch] = train_correct / train_total
        #     print(f'\n\tloss: {training_loss_history[epoch,0]:0.4f}, acc: {training_accuracy_history[epoch,0]:0.4f}',end='')

        #     # Validate
        #     test_total = 0
        #     test_correct = 0
        #     with torch.no_grad():
        #         model.eval()
        #         for i, (data, target) in enumerate(val_loader):
        #             data, target = data.to(device), target.to(device)
        #             # Forward pass
        #             output = model(data)
        #             # Find accuracy
        #             _, predicted = torch.max(output.data, 1)
        #             test_total += target.size(0)
        #             test_correct += (predicted == target).sum().item()
        #             # Find loss
        #             loss = loss_fn(output, target)
        #             validation_loss_history[epoch] += loss.item()
        #         validation_loss_history[epoch] /= len(val_loader)
        #         validation_accuracy_history[epoch] = test_correct / test_total
        #     print(f', val loss: {validation_loss_history[epoch,0]:0.4f}, val acc: {validation_accuracy_history[epoch,0]:0.4f}')
        
        # train_acc = training_accuracy_history[n_epochs - 1, 0]
        # val_acc = validation_accuracy_history[n_epochs - 1, 0]
        
        model.train()
        for epoch in range(n_epochs):
            for batch_idx, (data, target) in enumerate(train_loader):
                data, target = data.to(device), target.to(device)
                # Erase accumulated gradients
                optimizer.zero_grad()
                # Forward pass
                output = model(data)
                # Calculate loss
                loss = loss_fn(output, target)
                # Backward pass
                loss.backward()
                # Weight update
                optimizer.step()
        
        model.eval()
        train_loss = 0
        train_correct = 0
        # Turning off automatic differentiation
        with torch.no_grad():
            for data, target in train_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                train_correct += round(output)
                
        val_loss = 0
        val_correct = 0
        # Turning off automatic differentiation
        with torch.no_grad():
            for data, target in val_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                val_correct += round(output)

        train_acc = train_correct / len(train_loader.dataset)
        val_acc = val_correct / len(val_loader.dataset)
        
        train_accs.append(train_acc)
        val_accs.append(val_acc)
        
    
    avg_train_acc, avg_val_acc = np.mean(train_accs), np.mean(val_accs)
    
    return avg_train_acc, avg_val_acc

In [99]:
dropout = 1

model = nn.Sequential(
    nn.Flatten(),
    
    nn.Linear(num_features, 10),
    nn.ReLU(),
    nn.Dropout(dropout),
    
    nn.Linear(10, 5),
    nn.ReLU(),
    nn.Dropout(dropout),
    
    nn.Linear(5, 1),
    nn.Sigmoid()
)

# Check if CUDA is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Training on GPU...")
else:
    device = torch.device("cpu")
    print("GPU not available, training on CPU...")

Training on GPU...


In [100]:
train_acc, val_acc = cross_validation_nn(model, train_X, train_Y, nn.BCELoss(), device)
print(train_acc, val_acc)

k = 1
k = 2
k = 3
k = 4
k = 5
0.7374133274224401 0.7374116661936846


In [101]:
train_acc, val_acc = cross_validation_nn(model, train_X, train_Y, optimizer, loss_fn, device)
print(train_acc, val_acc)

ValueError: The number of folds must be of Integral type. cuda of type <class 'torch.device'> was passed.