# Training and evaluating machine learning models
- Train-test split
- k-fold Cross-Validation

In [0]:
!pip3 install torch torchvision



In [2]:
import numpy as np
import pandas as pd
import torch, torchvision
torch.__version__

'0.4.1'

In [7]:
# to use GPU
device = torch.device("cuda")
device

device(type='cuda')

## 1. Train-test split
- Splitting train and test data in Pytorch

### Import data
- Import [epileptic seizure data](https://archive.ics.uci.edu/ml/datasets/Epileptic+Seizure+Recognition) from UCI ML repository
- Split train and test data using ```random_split()```
- Train logistic regression model with training data and evaluate results with test data

In [0]:
class SeizureDataset(torch.utils.data.Dataset):
  def __init__(self):
    # import and initialize dataset
    df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/00388/data.csv")
    df = df[df.columns[1:]]
        
    self.X = df[df.columns[:-1]].values
    self.Y = df["y"].astype("category").cat.codes.values.astype(np.int32)
    
  def __getitem__(self, idx):
    # get item by index
    return self.X[idx], self.Y[idx]
  
  def __len__(self):
    # returns length of data
    return len(self.X)

In [0]:
seizuredataset = SeizureDataset()

In [7]:
NUM_INSTANCES = len(seizuredataset)
TEST_RATIO = 0.3
TEST_SIZE = int(NUM_INSTANCES * 0.3)
TRAIN_SIZE = NUM_INSTANCES - TEST_SIZE

print(NUM_INSTANCES, TRAIN_SIZE, TEST_SIZE)

11500 8050 3450


In [8]:
train_data, test_data = torch.utils.data.random_split(seizuredataset, (TRAIN_SIZE, TEST_SIZE))

print(len(train_data), len(test_data))

8050 3450


In [0]:
# when splitting train and test sets, data loader for each dataset should be made separately
train_loader = torch.utils.data.DataLoader(train_data, batch_size = 64, shuffle = True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size = 64, shuffle = False)

In [8]:
# logistic regression model
model = torch.nn.Linear(178, 5).to(device)
criterion = torch.nn.CrossEntropyLoss()  
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)  
model

Linear(in_features=178, out_features=5, bias=True)

In [15]:
num_step = len(train_loader)

for epoch in range(100):
  for i, (x, y) in enumerate(train_loader):
    x, y = x.float().to(device), y.long().to(device)
    outputs = model(x)
    
    loss = criterion(outputs, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  if (epoch + 1) % 10 == 0:
    print("Epoch: {}, Loss: {:.5f}".format(epoch + 1, loss.item()))

Epoch: 10, Loss: 40.67493
Epoch: 20, Loss: 65.14543
Epoch: 30, Loss: 49.11377
Epoch: 40, Loss: 34.16365
Epoch: 50, Loss: 46.56900
Epoch: 60, Loss: 33.15329
Epoch: 70, Loss: 32.82365
Epoch: 80, Loss: 39.61089
Epoch: 90, Loss: 30.58812
Epoch: 100, Loss: 46.22044


In [0]:
y_true, y_pred, y_prob  = [], [], []
with torch.no_grad():
  for x, y in test_loader:
    # ground truth
    y = list(y.numpy())
    y_true += y
    
    x = x.float().to(device)
    outputs = model(x)

    # predicted label
    _, predicted = torch.max(outputs.data, 1)
    predicted = list(predicted.cpu().numpy())
    y_pred += predicted
    
    # probability for each label
    prob = list(outputs.cpu().numpy())
    y_prob += prob

In [28]:
# calculating overall accuracy
num_correct = 0

for i in range(len(y_true)):
  if y_true[i] == y_pred[i]:
    num_correct += 1

print("Accuracy: ", num_correct/len(y_true))

Accuracy:  0.22


## 2. k-fold Cross-Validation
- Perform k-fold cross validation in Pytorch
- Cross validation can be implemented using NumPy, but we rely on ```skorch``` and ```sklearn``` here for the facility of implementation

In [16]:
!pip install -U skorch

Collecting skorch
[?25l  Downloading https://files.pythonhosted.org/packages/49/61/d0949b994b8e1faa7c0218c45e94034d6ebf1c4fd87e99663eddfe761e95/skorch-0.3.0-py3-none-any.whl (89kB)
[K    100% |████████████████████████████████| 92kB 4.5MB/s 
Collecting tabulate (from skorch)
[?25l  Downloading https://files.pythonhosted.org/packages/12/c2/11d6845db5edf1295bc08b2f488cf5937806586afe42936c3f34c097ebdc/tabulate-0.8.2.tar.gz (45kB)
[K    100% |████████████████████████████████| 51kB 5.3MB/s 
Building wheels for collected packages: tabulate
  Running setup.py bdist_wheel for tabulate ... [?25l- \ done
[?25h  Stored in directory: /root/.cache/pip/wheels/2a/85/33/2f6da85d5f10614cbe5a625eab3b3aebfdf43e7b857f25f829
Successfully built tabulate
Installing collected packages: tabulate, skorch
Successfully installed skorch-0.3.0 tabulate-0.8.2


In [0]:
from skorch import NeuralNetClassifier
from sklearn.model_selection import cross_val_score

In [4]:
# import data
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/00388/data.csv")
df = df[df.columns[1:]]

X_data = df[df.columns[:-1]].values.astype(np.float32)
y_data = df["y"].astype("category").cat.codes.values.astype(np.int64)

print(X_data.shape, y_data.shape)

(11500, 178) (11500,)


In [9]:
# generate skorch high-level classifier and perform 5-fold cross validation using cross_val_score()
logistic = NeuralNetClassifier(model, max_epochs = 10, lr = 1e-2)
scores = cross_val_score(logistic, X_data, y_data, cv = 5, scoring = "accuracy")

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1           nan       [32m0.1933[0m           nan  0.1225
      2           nan       0.1846           nan  0.1125
      3           nan       0.1857           nan  0.1121
      4           nan       0.1906           nan  0.1101
      5           nan       0.1851           nan  0.1093
      6           nan       0.1873           nan  0.1124
      7           nan       0.1851           nan  0.1141
      8           nan       0.1884           nan  0.1102
      9           nan       0.1878           nan  0.1104
     10           nan       0.1878           nan  0.1118
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1           nan       [32m0.2205[0m           nan  0.1072
      2           nan       0.2184           nan  0.1111
      3           nan       0.2048           nan  0.1048
      4      

In [12]:
# print out results
print(scores)
print(scores.mean(), scores.std())

[0.19173913 0.1973913  0.21347826 0.20869565 0.23521739]
0.20930434782608698 0.01509791607948889
