# Description of the Notebook

This Google Colab notebook builds a dense (fully connected) neural network with PyTorch to classify the UCI Arrhythmia dataset. It uses Optuna to search for the best hyperparameters (number of layers, units per layer, dropout rates, optimizer, and learning rate).

# Import Data and Metadata

In [None]:
# Import data acquisition libraries
import pandas as pd
import requests as r
#Import data processing libraries
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
from IPython import display
import os
from PIL import Image
from torch.utils.data.dataset import Dataset
from imageio import imread

In [None]:
# Data import: UCI arrhythmia dataset (raw values) and its accompanying
# description file. Both URLs use HTTPS so the two downloads are consistent
# and neither travels over an unencrypted connection.

arrhythmia_data = 'https://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/arrhythmia.data'
arrhythmia_classes = 'https://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/arrhythmia.names'
# header=None: the file is read without a header row, so columns are
# addressed by integer position (0, 1, 2, ...).
dataset = pd.read_csv(arrhythmia_data, header=None)

In [None]:
# Fetch the dataset description text. A timeout prevents the cell from hanging
# indefinitely, and raise_for_status() surfaces HTTP errors instead of silently
# decoding an error page as if it were the names file.
names_response = r.get(arrhythmia_classes, timeout=30)
names_response.raise_for_status()
names = names_response.content.decode('utf-8')

In [None]:
# Human-readable name for each diagnosis class code.
# Keys are zero-padded two-digit class codes ('01'..'16') stored as strings.
diagnose_classifications = {
    '01': 'Normal',
    '02': 'Ischemic changes (Coronary Artery Disease)',
    '03': 'Old Anterior Myocardial Infarction',
    '04': 'Old Inferior Myocardial Infarction',
    '05': 'Sinus tachycardy',
    '06': 'Sinus bradycardy',
    '07': 'Ventricular Premature Contraction (PVC)',
    '08': 'Supraventricular Premature Contraction',
    '09': 'Left bundle branch block',
    '10': 'Right bundle branch block',
    '11': '1. degree AtrioVentricular block',
    # The original value carried a trailing tab character inside the string.
    '12': '2. degree AV block',
    '13': '3. degree AV block',
    '14': 'Left ventricule hypertrophy',
    '15': 'Atrial Fibrillation or Flutter',
    '16': 'Others',
}

In [None]:
# Replace missing values with the median of the corresponding attribute.
# '?' is the missing-value marker. Columns that contain it are loaded as
# strings, so they must be converted to numeric first: otherwise median()
# cannot be computed for exactly the columns that need imputing, and their
# NaNs would be left in place (or median() would raise on newer pandas).
dataset = dataset.replace('?', np.nan).apply(pd.to_numeric, errors='coerce')
dataset = dataset.fillna(dataset.median())

# Column 279 holds the class label. Shift it down by one so the labels start
# at 0. The guard makes the cell safe to re-run: once the smallest label is 0
# the shift is not applied a second time.
if dataset[279].min() >= 1:
    dataset[279] = dataset[279] - 1

# Creation of the PyTorch Dataset and Model

In [None]:
!pip install -q optuna

[K     |████████████████████████████████| 308 kB 8.3 MB/s 
[K     |████████████████████████████████| 81 kB 4.7 MB/s 
[K     |████████████████████████████████| 209 kB 43.9 MB/s 
[K     |████████████████████████████████| 78 kB 3.2 MB/s 
[K     |████████████████████████████████| 49 kB 6.6 MB/s 
[K     |████████████████████████████████| 146 kB 33.4 MB/s 
[K     |████████████████████████████████| 112 kB 46.7 MB/s 
[?25h  Building wheel for pyperclip (setup.py) ... [?25l[?25hdone


In [None]:
# Import Neural Network Libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torch.utils.data import Dataset, random_split
import pickle

import optuna
from optuna.trial import TrialState
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

from google.colab import files


In [None]:
# Dataset wrapper converting a pandas DataFrame into PyTorch tensors
class pandas_dataset(Dataset):
  """Map-style PyTorch dataset built from a DataFrame.

  Every column except the last is treated as a feature; the last column is
  the integer class label.

  Attributes:
    x_train: float32 tensor with the robust-scaled features, one row per sample.
    y_train: int64 (long) tensor with the labels, kept as a column of shape
      (n_samples, 1), so indexing a single item yields a 1-element tensor.
  """

  def __init__(self,pd_dataframe):
    """Scale the features and store features/labels as tensors.

    Args:
      pd_dataframe: DataFrame whose last column is the label and whose
        remaining columns are numeric features.
    """
    features = pd_dataframe.iloc[:, :-1].values
    labels = pd_dataframe.iloc[:, -1:].values

    # NOTE: the scaler is fitted on every row passed in. If the dataset is
    # split afterwards, the held-out rows have influenced the scaling.
    transformer = preprocessing.RobustScaler().fit(features)
    features = transformer.transform(features)

    self.x_train = torch.tensor(features.astype(np.float32), dtype=torch.float32)
    # Cast through int64 (the width of torch.long) rather than int8 so that
    # label values above 127 cannot overflow during the conversion.
    self.y_train = torch.tensor(labels.astype(np.int64), dtype=torch.long)

  def __len__(self):
    """Return the number of samples."""
    return len(self.y_train)

  def __getitem__(self,idx):
    """Return the (features, label) pair stored at position ``idx``."""
    return self.x_train[idx],self.y_train[idx]

In [None]:
# Wrap the cleaned DataFrame in the PyTorch dataset class; feature scaling and
# tensor conversion happen inside its constructor.
df_dataset=pandas_dataset(dataset)
# (An unshuffled DataLoader over the whole dataset was tried here; the loaders
# are instead created from the train/test subsets after the split.)

In [None]:
# Hold out everything beyond the first 400 samples for evaluation. The test
# size is derived from the dataset length, so the split does not fail when
# the row count differs from 452. The generator is seeded so the partition
# is the same on every run.
SPLIT_SEED = 42
n_train_samples = min(400, len(df_dataset))
n_test_samples = len(df_dataset) - n_train_samples
train, test = random_split(
    df_dataset,
    [n_train_samples, n_test_samples],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Use the first GPU when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.backends.cudnn.benchmark = True
batch_size = 8
# Keyword arguments shared by both loaders.
params = {'batch_size': batch_size,
          'shuffle': True,
          'num_workers': 2}

train_loader = DataLoader(train, **params)
test_loader = DataLoader(test,**params)

In [None]:
# Every column except the trailing label column is an input feature.
n_features = dataset.shape[1] - 1
# Number of output classes of the network.
n_classes = 16
n_epochs = 25
n_neurons = [128, 64, 32]

# NOTE(review): the objective function in this notebook computes its loss with
# F.nll_loss; confirm whether `criterion`, `n_epochs` and `n_neurons` are
# still needed elsewhere.
criterion = nn.CrossEntropyLoss()


In [None]:
LOG_INTERVAL = 10
# Upper bounds on the number of samples consumed per epoch for training and
# for validation; both are expressed as 1000 batches.
N_TRAIN_EXAMPLES = N_VALID_EXAMPLES = batch_size * 1000
# Number of training epochs run inside each Optuna trial.
EPOCHS = 100

In [None]:
def optuna_model(trial, input_dim=None, output_dim=None):
    """Build a feed-forward classifier whose architecture is sampled by Optuna.

    The trial chooses the number of hidden layers (1-4) and, for each hidden
    layer, its width (4-128 units) and dropout probability (0.1-0.5). Each
    hidden layer is Linear -> ReLU -> Dropout. The network ends with a Linear
    layer followed by LogSoftmax, so its outputs are log-probabilities
    (suitable for ``F.nll_loss``).

    Args:
        trial: Object providing ``suggest_int`` and ``suggest_float``
            (an Optuna trial).
        input_dim: Number of input features. Defaults to the notebook-level
            ``n_features`` when omitted.
        output_dim: Number of output classes. Defaults to the notebook-level
            ``n_classes`` when omitted.

    Returns:
        An ``nn.Sequential`` containing the sampled layers.
    """
    # Fall back to the notebook globals only when no explicit size is given,
    # so existing callers that pass just `trial` behave as before.
    in_features = n_features if input_dim is None else input_dim
    num_outputs = n_classes if output_dim is None else output_dim

    # We optimize the number of layers, hidden units and dropout ratio in each layer.
    n_layers = trial.suggest_int("n_layers", 1, 4)
    layers = []

    for i in range(n_layers):
        out_features = trial.suggest_int("n_units_l{}".format(i), 4, 128)
        layers.append(nn.Linear(in_features, out_features))
        layers.append(nn.ReLU())
        p = trial.suggest_float("dropout_l{}".format(i), 0.1, 0.5)
        layers.append(nn.Dropout(p))

        # The next layer consumes what this one produces.
        in_features = out_features
    layers.append(nn.Linear(in_features, num_outputs))
    layers.append(nn.LogSoftmax(dim=1))

    return nn.Sequential(*layers)

In [None]:
def objective(trial):
    """Optuna objective: train a sampled network and return its accuracy.

    For each trial the function samples an architecture (via ``optuna_model``),
    an optimizer and a learning rate, then trains for ``EPOCHS`` epochs on
    ``train_loader``. After every epoch it measures accuracy on
    ``test_loader``, reports it to the trial, pickles the current model to
    ``"<trial number>.pickle"`` and lets Optuna prune the trial.

    Args:
        trial: The Optuna trial supplying hyperparameter suggestions.

    Returns:
        Accuracy measured after the final epoch.

    Raises:
        optuna.exceptions.TrialPruned: When the pruner stops the trial early.
    """
    # Sample the architecture and move the network to the compute device.
    net = optuna_model(trial).to(device)

    # Sample the optimizer class (looked up by name on torch.optim) and the
    # learning rate (log-uniform).
    chosen_optimizer = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    learning_rate = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    optimizer_cls = getattr(optim, chosen_optimizer)
    optimizer = optimizer_cls(net.parameters(), lr=learning_rate)

    for epoch in range(EPOCHS):
        # ---- training pass ----
        net.train()
        for step, (inputs, labels) in enumerate(train_loader):
            # Cap the amount of training data seen per epoch.
            if step * batch_size >= N_TRAIN_EXAMPLES:
                break

            inputs = inputs.view(inputs.size(0), -1).to(device)
            # Labels are flattened to one dimension before computing the loss.
            labels = labels.flatten().to(device)

            optimizer.zero_grad()
            log_probs = net(inputs)
            batch_loss = F.nll_loss(log_probs, labels)
            batch_loss.backward()
            optimizer.step()

        # ---- evaluation pass ----
        net.eval()
        n_correct = 0
        with torch.no_grad():
            for step, (inputs, labels) in enumerate(test_loader):
                # Cap the amount of validation data as well.
                if step * batch_size >= N_VALID_EXAMPLES:
                    break
                inputs = inputs.view(inputs.size(0), -1).to(device)
                labels = labels.to(device)
                # The predicted class is the index of the largest log-probability.
                predicted = net(inputs).argmax(dim=1, keepdim=True)
                n_correct += predicted.eq(labels.view_as(predicted)).sum().item()

        accuracy = n_correct / min(len(test_loader.dataset), N_VALID_EXAMPLES)

        trial.report(accuracy, epoch)

        # Overwrite this trial's checkpoint with the model as of this epoch.
        checkpoint_path = "{}.pickle".format(trial.number)
        with open(checkpoint_path, "wb") as fout:
            pickle.dump(net, fout)

        # Stop early if the pruner judges the intermediate value unpromising.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return accuracy

In [None]:
if __name__ == "__main__":
    # Search for the hyperparameters that maximise the objective's accuracy,
    # stopping after 100 trials or 60000 seconds, whichever comes first.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100, timeout=60000)

    # Partition the trials by how they ended.
    pruned_trials = study.get_trials(states=[TrialState.PRUNED], deepcopy=False)
    complete_trials = study.get_trials(states=[TrialState.COMPLETE], deepcopy=False)

    print("Study statistics: ")
    summary_rows = (
        ("  Number of finished trials: ", len(study.trials)),
        ("  Number of pruned trials: ", len(pruned_trials)),
        ("  Number of complete trials: ", len(complete_trials)),
    )
    for label, count in summary_rows:
        print(label, count)

    # Report the best trial and the hyperparameters it used.
    print("Best trial:")
    trial = study.best_trial
    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[32m[I 2022-05-25 08:29:27,690][0m A new study created in memory with name: no-name-3fc699ce-ecca-4e20-a7da-7f91ea2b516c[0m
[32m[I 2022-05-25 08:30:40,210][0m Trial 0 finished with value: 0.6730769230769231 and parameters: {'n_layers': 1, 'n_units_l0': 6, 'dropout_l0': 0.46703596315983953, 'optimizer': 'Adam', 'lr': 0.059902729132143985}. Best is trial 0 with value: 0.6730769230769231.[0m
[32m[I 2022-05-25 08:31:58,653][0m Trial 1 finished with value: 0.7692307692307693 and parameters: {'n_layers': 3, 'n_units_l0': 82, 'dropout_l0': 0.26857955631800867, 'n_units_l1': 73, 'dropout_l1': 0.17405541195685909, 'n_units_l2': 115, 'dropout_l2': 0.304103902981594, 'optimizer': 'Adam', 'lr': 1.9646249445731023e-05}. Best is trial 1 with value: 0.7692307692307693.[0m
[32m[I 2022-05-25 08:33:15,671][0m Trial 2 finished with value: 0.7307692307692307 and parameters: {'n_layers': 2, 'n_units_l0': 26, 'dropout_l0': 0.23553584750387718, 'n_units_l1': 33, 'dropout_l1': 0.26934232972739475, 

Study statistics: 
  Number of finished trials:  100
  Number of pruned trials:  70
  Number of complete trials:  30
Best trial:
  Value:  0.8461538461538461
  Params: 
    n_layers: 3
    n_units_l0: 96
    dropout_l0: 0.352744256630383
    n_units_l1: 115
    dropout_l1: 0.38236094441509805
    n_units_l2: 81
    dropout_l2: 0.1373959777347792
    optimizer: Adam
    lr: 0.004976536851408254


# Download of Neural Network and Train/Test Datasets for Reproduction

In [None]:
# Checkpoint file of the best trial, named "<trial number>.pickle".
best_model_path = "{}.pickle".format(study.best_trial.number)

# Send the checkpoint to the browser, then reload it into this session.
files.download(best_model_path)
# pickle.load executes arbitrary code from the file; only use it on
# checkpoints produced by this notebook.
with open(best_model_path, "rb") as fin:
    best_result = pickle.load(fin)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import numpy
# Save the train/test subsets so the exact split can be reproduced later.
# torch.save is used instead of numpy.save for two reasons:
#   * numpy.save appends ".npy" to the name, so the ".pt" files that the
#     following cells download would never be created;
#   * numpy.save first has to coerce the subset object into an array.
# The files contain pickled dataset objects, so loading them with torch.load
# requires the dataset class definition to be available.
torch.save(train, 'arrhythmia_train.pt')
torch.save(test, 'arrhythmia_test.pt')

  arr = np.asanyarray(arr)
  arr = np.asanyarray(arr)


In [None]:
# Download the saved training split; expects 'arrhythmia_train.pt' to exist in
# the current working directory.
files.download('arrhythmia_train.pt')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Download the saved test split; expects 'arrhythmia_test.pt' to exist in the
# current working directory.
files.download('arrhythmia_test.pt')