In [1]:
import numpy as np
import os
import math
import random
import torch
from torch.utils.tensorboard import SummaryWriter
from sklearn.model_selection import train_test_split
from Model import RNN, CNN
from tqdm import tqdm_notebook

### Prepare the data

In [2]:
EVAL_DATA_DIR = os.path.abspath(os.path.abspath('') + '/Aniyama_groundtruth')

In [3]:
data_types = [p for p in os.listdir(EVAL_DATA_DIR) if not p.startswith('.')]
dataset = []
for t in data_types:
    type_dir = f'{EVAL_DATA_DIR}/{t}'
    for file in os.listdir(type_dir):
        filename = f'{type_dir}/{file}'
        examples = []
        with open(filename, 'r') as f:
            for i, line in enumerate(f.readlines()):
                if i == 0:
                    continue
                tokens = line.rstrip().split(',')
                power, anomaly = float(tokens[1]), int(tokens[2])
                example = [power, anomaly]
                examples.append(example)
    examples = np.array(examples)
    dataset.append(examples)

In [4]:
data = []
labels = []
for chunk in dataset:
    size = chunk.shape[0]
    new_size = (math.ceil(size / 10) * 10)
    pad_size = new_size - size
    padding = np.zeros((pad_size, 2))
    new_chunk = np.vstack((chunk, padding)).reshape(new_size // 10, 10, 2)
    for i in range(new_chunk.shape[0]):
        data.append(new_chunk[i,:,0])
        label = new_chunk[i,:,1].sum() > 0
        labels.append(float(label))
data = np.array(data)
labels = np.array(labels)

In [5]:
data.shape, labels.shape

((3429, 10), (3429,))

In [6]:
labels.sum()

1698.0

In [7]:
X_train, X_test, y_train, y_text = train_test_split(data, labels, test_size = 0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

X_train, X_valid, X_test = torch.FloatTensor(X_train), torch.FloatTensor(X_train), torch.FloatTensor(X_train)
y_train, y_valid, y_test = torch.FloatTensor(y_train), torch.FloatTensor(y_train), torch.FloatTensor(y_train)

### Setup Data Loader

In [9]:
from torch.utils import data

class Dataset(data.Dataset):
  'Characterizes a dataset for PyTorch'
  def __init__(self, data, labels):
        'Initialization'
        self.labels = labels
        self.data = data

  def __len__(self):
        'Denotes the total number of samples'
        return len(self.data)

  def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        return self.data[index], self.labels[index]

In [10]:
config = {
    'batch_size': 32,
    'shuffle': True,
    'num_workers': 6,
    'drop_last': True,
    'num_epochs': 400,
    'encode_dim': 1,
    'hidden_dim': 64,
    'output_dim': 1,
    'num_layers': 3,
    'dropout': 0.3,
    'device': device
}

In [11]:
train_set = Dataset(X_train, y_train)
train_loader = data.DataLoader(
    train_set,
    batch_size=config['batch_size'],
    shuffle=config['shuffle'],
    num_workers=config['num_workers'],
    drop_last=config['drop_last']
)

validation_set = Dataset(X_valid, y_valid)
validation_loader = data.DataLoader(
    validation_set,
    batch_size=config['batch_size'],
    shuffle=config['shuffle'],
    num_workers=config['num_workers'],
    drop_last=config['drop_last']
)

test_set = Dataset(X_test, y_test)
test_loader = data.DataLoader(
    test_set,
    batch_size=config['batch_size'],
    shuffle=config['shuffle'],
    num_workers=config['num_workers'],
    drop_last=config['drop_last']
)

In [12]:
models = {}
models['baseline_rnn'] = RNN(
    config['encode_dim'],
    config['hidden_dim'],
    config['output_dim'],
    config['num_layers'],
    config['dropout']
)

In [13]:
model = models['baseline_rnn']

### TensorBoard Setup

In [14]:
writer = SummaryWriter('runs/baseline_rnn/')

### Train/Validate the Model

In [15]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 83,777 trainable parameters


In [16]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())
criterion = torch.nn.BCEWithLogitsLoss()

model = model.to(config['device'])
criterion = criterion.to(config['device'])

In [17]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [18]:
def train(model, loader, optimizer, criterion, device):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch_data, batch_labels in loader:
        
        batch_data, batch_labels = batch_data.to(device), batch_labels.to(device)
                
        optimizer.zero_grad()
        
        predictions = model(batch_data).squeeze(1)
        
        loss = criterion(predictions, batch_labels)
        
        acc = binary_accuracy(predictions, batch_labels)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(loader), epoch_acc / len(loader)

In [19]:
def evaluate(model, loader, criterion, device):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch_data, batch_labels in loader:
            batch_data, batch_labels = batch_data.to(device), batch_labels.to(device)
            
            predictions = model(batch_data).squeeze(1)
            
            loss = criterion(predictions, batch_labels)
            
            acc = binary_accuracy(predictions, batch_labels)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(loader), epoch_acc / len(loader)

In [20]:
def test(model, loader, device):
    epoch_acc = 0
    with torch.no_grad():
    
        for batch_data, batch_labels in loader:
            batch_data, batch_labels = batch_data.to(device), batch_labels.to(device)
            
            predictions = model(batch_data).squeeze(1)
            acc = binary_accuracy(predictions, batch_labels)

            epoch_acc += acc.item()
        
    return epoch_acc / len(loader)

In [21]:
best_valid_loss = float('inf')

for epoch in tqdm_notebook(range(config['num_epochs'])):
    
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, config['device'])
    valid_loss, valid_acc = evaluate(model, validation_loader, criterion, config['device'])
    
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'rnn-base-model.pt')
    
    # log the running loss
    writer.add_scalar('Loss/train', train_loss, epoch)
    writer.add_scalar('Loss/validation', valid_loss, epoch)
    writer.add_scalar('Accuracy/train', train_acc, epoch)
    writer.add_scalar('Accuracy/validation', valid_acc, epoch)

HBox(children=(IntProgress(value=0, max=400), HTML(value='')))




In [22]:
test(model, test_loader, config['device'])

0.89111328125

### CNN

In [23]:
writer = SummaryWriter('runs/baseline_cnn/')
models['baseline_cnn'] = CNN()
model = models['baseline_cnn']

In [24]:
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,074 trainable parameters


In [25]:
optimizer = optim.Adam(model.parameters())
criterion = torch.nn.BCEWithLogitsLoss()

model = model.to(config['device'])
criterion = criterion.to(config['device'])

In [26]:
best_valid_loss = float('inf')

for epoch in tqdm_notebook(range(config['num_epochs'])):
    
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, config['device'])
    valid_loss, valid_acc = evaluate(model, validation_loader, criterion, config['device'])
    
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'cnn-base-model.pt')
    
    # log the running loss
    writer.add_scalar('Loss/train', train_loss, epoch)
    writer.add_scalar('Loss/validation', valid_loss, epoch)
    writer.add_scalar('Accuracy/train', train_acc, epoch)
    writer.add_scalar('Accuracy/validation', valid_acc, epoch)

HBox(children=(IntProgress(value=0, max=400), HTML(value='')))




In [27]:
test(model, test_loader, config['device'])

0.87841796875