# Training Networks
In this Notebook, we train networks on the HDF5 Database built. Each network takes in the noisy data, and attempts to predict the optimal circuit parameters that correspond. 

## Custom DataLoader

We need our own dataloader class to extract the correct signals/parameters. We then divide this randomly into training, test, and validation subgroups in an 80:10:10 ratio.

In [11]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as func
import h5py
import os

# Custom Dataloader for our NN from .h5 database
class HDF5Data(Dataset):
    def __init__(self, path_to_h5, ftype):
        self.path_to_h5 = path_to_h5
        with h5py.File(path_to_h5, 'r') as f:
            signaldata = f['NoisySignals'][()]
            
            ftypeValid = False
            if ftype=='lowpass':
                ftypeValid = True
                params = f['LowPass'][()]
            elif ftype=='highpass':
                ftypeValid = True
                params = f['HighPass'][()]
            elif ftype=='bandpass':
                ftypeValid = True
                params = f['BandPass'][()]
            elif ftype=='butterlowpass':
                ftypeValid = True
                params = f['ButterworthLowPass'][()]
            
            if ftypeValid==False:
                raise ValueError("Given filter name is not one of the options")
            datalen = len(signaldata)
            
        self.signaldata = signaldata
        self.params = params
        self.datalen = datalen

    def __len__(self):
        return self.datalen

    def __getitem__(self, idx):
        return self.signaldata[idx], self.params[idx]

In [12]:
# Splits full dataset into training, test, and validation sets
def loaderSplit(path_to_h5, ftype):
    dataset = HDF5Data(PATH_TO_H5, ftype)

    # Split --> training:test = 80:20
    train_set_size = int(len(dataset) * 0.8)
    test_set_size = len(dataset) - train_set_size
    train_set, test_set = torch.utils.data.random_split(dataset, [train_set_size, test_set_size])

    # Split test --> test:valid = 10:10
    test_set_size = int(len(test_set)*0.5)
    valid_set_size = len(test_set) - test_set_size
    valid_set, test_set = torch.utils.data.random_split(test_set, [valid_set_size, test_set_size])

    # Final Split --> training:test:valid = 80:10:10 
    print("Data Points in Training Set:", len(train_set))
    print("Data Points in Test Set:",len(test_set))
    print("Data Points in Validation Set:",len(valid_set))

    # Using PyTorch DataLoader
    test_loader = DataLoader(test_set, batch_size=64, shuffle=True)
    train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
    valid_loader = DataLoader(valid_set, batch_size=64, shuffle=True)
    
    return test_loader, train_loader, valid_loader

## Deep Models


In [13]:
# General Deep Network, 4 hiddlen layers, 1 Relu activation
class NN(nn.Module):
    def __init__(self, input_size, output_size):
        super(NN, self).__init__()
        self.l1 = nn.Linear(input_size, 32)
        self.l2 = nn.Linear(32, 24)
        self.l3 = nn.Linear(24, 32)
        self.l4 = nn.Linear(32, 8)
        self.l5 = nn.Linear(8, output_size)
    
    def forward(self, x):
        x = func.relu(self.l1(x))
        x = self.l2(x)
        x = self.l3(x)
        x = self.l4(x)
        x = self.l5(x)
        return x

In [21]:
# Handles training for one epoch
def train_epoch(epoch_idx, loader, model):
    running_loss = 0
    last_loss = 0
    
    for batch_idx, data in enumerate(loader):
        # Every data instance is an input + label pair
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(inputs)

        # Compute the loss and its gradients
        loss = mse_loss(outputs, labels)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        if batch_idx % 64 == 63:
            last_loss = running_loss / 64 # loss per batch
            print('  batch {} loss: {}'.format(batch_idx+1, last_loss))
            running_loss = 0
    return last_loss

In [22]:
PATH_TO_H5 = '/Users/aaronphilip/ScienceFair/projects/NanoporeSequencingFiltering/database/NanoporeFiltered.h5'
ftype = 'lowpass'
test_loader, train_loader, valid_loader = loaderSplit(PATH_TO_H5, ftype)

Data Points in Training Set: 7200
Data Points in Test Set: 900
Data Points in Validation Set: 900


In [23]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cpu device


In [24]:
# 100 long input vector --> 2 long output vector 
lowModel = NN(100, 2).to(device)

In [30]:
# Hyperparameters
epochs = 20
learning_rate = 0.01

# Optimizer and Loss Function
optimizer = optim.Adam(lowModel.parameters(), lr=learning_rate)
mse_loss = nn.MSELoss()

In [31]:
best_vloss = 1_000_000
for epoch in range(epochs):
    print('Epoch: %s' % (epoch+1))

    # Make sure gradient tracking is on, and do a pass over the data
    lowModel.train(True)
    avg_loss = train_epoch(epoch, train_loader, lowModel)

    # We don't need gradients on to do reporting
    lowModel.train(False)

    running_vloss = 0.0
    for i, vdata in enumerate(valid_loader):
        vinputs, vlabels = vdata
        voutputs = lowModel(vinputs)
        vloss = mse_loss(voutputs, vlabels)
        running_vloss += vloss

    avg_vloss = running_vloss / (i + 1)
    print('Loss  train {} Validation {}'.format(avg_loss, avg_vloss))

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss

        
print(best_vloss)
model_path = '../models/low_model.pt'
torch.save(lowModel.state_dict(), model_path)

Epoch: 1
  batch 64 loss: 0.20420456200372428
Loss  train 0.20420456200372428 Validation 0.19586306810379028
Epoch: 2
  batch 64 loss: 0.19739924801979214
Loss  train 0.19739924801979214 Validation 0.19454459846019745
Epoch: 3
  batch 64 loss: 0.20406357652973384
Loss  train 0.20406357652973384 Validation 0.2502115070819855
Epoch: 4
  batch 64 loss: 0.19689722266048193
Loss  train 0.19689722266048193 Validation 0.19213297963142395
Epoch: 5
  batch 64 loss: 0.20220009190961719
Loss  train 0.20220009190961719 Validation 0.21825146675109863
Epoch: 6
  batch 64 loss: 0.19707565032877028
Loss  train 0.19707565032877028 Validation 0.1933043897151947
Epoch: 7
  batch 64 loss: 0.19391681323759258
Loss  train 0.19391681323759258 Validation 0.19299575686454773
Epoch: 8
  batch 64 loss: 0.1931652156636119
Loss  train 0.1931652156636119 Validation 0.1965872347354889
Epoch: 9
  batch 64 loss: 0.19960960082244128
Loss  train 0.19960960082244128 Validation 0.19304046034812927
Epoch: 10
  batch 64 los