In [2]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch import optim
import torch.optim.lr_scheduler as lr_scheduler
import matplotlib.pyplot as plt
import seaborn as sns

import lib.torch_device as tdev

from lib.ds.torch_dataset import create_data_loader
from lib.model.attention_classifier import AttentionClassifier, AttentionClassifierHyperParameters
from lib.torch_generic_model_training import train_model_with_cv, train_model, evaluate_model
from lib.training_hyper_parameters import TrainingHyperParameters
from lib.ds.numpy_dataset import NumpyDataset
from lib.model.model_persistence import save_model, load_model
from lib.random import set_random_seed
from lib.metrics import calculate_average_metrics_for_final_epoch_of_folds, calculate_average_metrics_per_epoch, calculate_average_metrics

import lib.torch_device as tdev

%load_ext autoreload
%autoreload 2

In [3]:
# Resolve the torch device once; later cells move input tensors onto it and pass it to train_model.
device = tdev.get_torch_device()
device

device(type='cuda', index=0)

In [65]:
# Fixed seed so that draws from this generator are reproducible after a kernel restart.
rng = np.random.default_rng(seed=0)

def create_random_sequences(sequence_length: int, segments: list[list[int]], segment_end_labels: list[int]):
    """Placeholder for a random-sequence generator; not implemented yet.

    Only the signature had been written, which left the cell syntactically
    invalid (a ``def`` header without a body). This explicit stub keeps the
    cell runnable and fails loudly if the function is ever called.

    Args:
        sequence_length: Intended length of each generated sequence.
        segments: Lists of ints; presumably the segments to place into the
            generated sequences -- TODO confirm the intended semantics.
        segment_end_labels: Presumably one label per entry of ``segments``
            -- TODO confirm.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError('create_random_sequences is not implemented yet')

def create_sequences_with_segment(
    sequence_length: int,
    segment: list[int],
    segment_end_label: int,
) -> tuple[list[np.ndarray], list[np.ndarray]]:
    """Slide ``segment`` across an otherwise all-zero sequence, one offset at a time.

    For every offset at which ``segment`` fits completely, one sequence of
    length ``sequence_length`` is produced that is zero everywhere except for
    the segment, together with an integer label array that is zero everywhere
    except at the segment's last position, which holds ``segment_end_label``.

    Args:
        sequence_length: Length of every generated sequence and label array.
        segment: Values to embed; must contain at least one element.
        segment_end_label: Label written at the index of the segment's last element.

    Returns:
        ``(sequences, labels)``: two lists of equal length holding float and
        integer arrays respectively. Both are empty when ``segment`` is longer
        than ``sequence_length``.

    Raises:
        ValueError: If ``segment`` is empty.
    """
    segment_length = len(segment)
    if segment_length == 0:
        # Without this guard the "last position" index becomes -1 for the first
        # offset and the label silently lands on the final element.
        raise ValueError('segment must contain at least one element')

    segment_values = np.array(segment)
    sequences: list[np.ndarray] = []
    labels: list[np.ndarray] = []

    for start in range(sequence_length - segment_length + 1):
        end = start + segment_length

        seq = np.zeros(sequence_length)
        seq[start:end] = segment_values

        label = np.zeros(sequence_length, dtype=int)
        label[end - 1] = segment_end_label

        sequences.append(seq)
        labels.append(label)

    return sequences, labels

# Each entry pairs a run of ones with the label written at the run's last position:
# runs of length 1 and 3 are marked with 1, runs of length 2 and 4 stay all-zero.
segment_specs = [
    ([1], 1),
    ([1, 1], 0),
    ([1, 1, 1], 1),
    ([1, 1, 1, 1], 0),
]

all_sequences = []
all_labels = []
for segment, end_label in segment_specs:
    # The generated sequences do not depend on the label, so one call yields both parts.
    segment_sequences, segment_labels = create_sequences_with_segment(8, segment, segment_end_label=end_label)
    all_sequences.extend(segment_sequences)
    all_labels.extend(segment_labels)

# Append a trailing feature axis of size 1 to the stacked sequences.
data = np.array(all_sequences)[:, :, np.newaxis]
labels = np.array(all_labels).astype(int)

ds = NumpyDataset(data, labels)

print(f'{data.shape   = }')
print(f'{labels.shape = }')

for sequence, label_row in zip(data, labels):
    print(sequence, label_row)

data.shape   = (26, 8, 1)
labels.shape = (26, 8)
[[1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]] [1 0 0 0 0 0 0 0]
[[0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]] [0 1 0 0 0 0 0 0]
[[0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]] [0 0 1 0 0 0 0 0]
[[0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]] [0 0 0 1 0 0 0 0]
[[0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]] [0 0 0 0 1 0 0 0]
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]] [0 0 0 0 0 1 0 0]
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]] [0 0 0 0 0 0 1 0]
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]] [0 0 0 0 0 0 0 1]
[[1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]] [0 0 0 0 0 0 0 0]
[[0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]] [0 0 0 0 0 0 0 0]
[[0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]] [0 0 0 0 0 0 0 0]
[[0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]] [0 0 0 0 0 0 0 0]
[[0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]] [0 0 0 0 0 0 0 0]
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]] [0 0 0 0 0 0 0 0]
[[0.]
 [0.]
 

# Hyper Parameters

In [66]:
# Architecture settings for the AttentionClassifier built in the training cell.
ac_hyper_parameters = AttentionClassifierHyperParameters(
    # Size of the trailing feature axis of `data` (the data cell adds it via np.newaxis).
    in_features=data.shape[-1],
    # The label arrays only contain the values 0 and 1.
    out_features=2,
    
    # Equal to the sequence length (8) used when generating `data`.
    attention_window_size=8,
    
    # NOTE(review): presumably 64 / 16 = 4 dimensions per head -- confirm against AttentionClassifier.
    d_model=64,
    num_heads=16,
    attention_stack_size=2,
    # Providers are zero-argument callables that return a new nn.LeakyReLU on each call.
    attention_stack_activation_provider=lambda: nn.LeakyReLU(),
    attention_dropout=0.45,
    
    # NOTE(review): presumably the hidden layer widths of the input / output feed-forward parts -- confirm.
    in_linear_hidden_out_features=[128, 64, 64],
    out_linear_hidden_out_features=[32, 16],
    linear_activation_provider=lambda: nn.LeakyReLU(),
    linear_dropout=0.45,
)

In [75]:
# Optimisation settings passed to train_model together with the model and datasets.
training_hyper_parameters = TrainingHyperParameters(
    batch_size=16,
    loss_weight_factors=None,
    optimizer_provider=lambda model, lr: optim.Adamax(
        model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-9
    ),
    num_epochs=275,
    lr=1e-3,
    lr_scheduler_milestones=[30, 100, 250],
    lr_scheduler_gamma=0.75,
    # Scheduling is switched off: this provider ignores its arguments and returns None.
    # To enable step decay, return
    # lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=gamma) instead.
    lr_scheduler_provider=lambda optimizer, milestones, gamma: None,
)

# Training

In [76]:
# Seed torch before the model is built so that weight initialisation and other
# torch-driven randomness during training start from the same state on every run
# of this cell (bit-exact results on CUDA are still not guaranteed).
torch.manual_seed(0)

ac = AttentionClassifier(ac_hyper_parameters, batch_first=True)

# NOTE: `ds` is passed twice, so the second dataset is the same data the model is
# trained on -- presumably the evaluation set; metrics reported for it then
# describe how well the training data is fitted, not generalisation.
train_model(
    ac,
    training_hyper_parameters,
    ds,
    ds,
    device
)

torch.Size([1024, 1, 64])


#### Training ####
##################
AttentionClassifier with 52754 parameters, in_fnn: 16832, attention_stack: 33280, out_fnn: 2642
##################

train label counts = [  194,    14]
eval label counts  = [  194,    14]

loss weights                    = [ 1.00, 13.86]
eval loss weights (theoretical) = [ 1.00, 13.86]


Training Epoch   1/275: lr = 1.00E-03, epoch =     1, avg_loss = 0.006668, num_samples =   208, num_correct =   164, acc = 0.788462, bacc = 0.455817, score = 0.576923
Evaluation Epoch   1/275: epoch =     1, avg_loss = 0.006431, num_samples =   208, num_correct =   194, acc = 0.932692, bacc = 0.500000, score = 0.865385
Training Epoch   2/275: lr = 1.00E-03, epoch =     2, avg_loss = 0.006673, num_samples =   208, num_correct =   194, acc = 0.932692, bacc = 0.500000, score = 0.865385
Evaluation Epoch   2/275: epoch =     2, avg_loss = 0.006463, num_samples =   208, num_correct =   194, acc = 0.932692, bacc = 0.500000, score = 0.865385
Tra

(AttentionClassifier(
   (in_fnn): FNN(
     (layers): Sequential(
       (0): Dropout(p=0.45, inplace=False)
       (1): Linear(in_features=1, out_features=128, bias=True)
       (2): LeakyReLU(negative_slope=0.01)
       (3): Dropout(p=0.45, inplace=False)
       (4): Linear(in_features=128, out_features=64, bias=True)
       (5): LeakyReLU(negative_slope=0.01)
       (6): Dropout(p=0.45, inplace=False)
       (7): Linear(in_features=64, out_features=64, bias=True)
       (8): LeakyReLU(negative_slope=0.01)
       (9): Dropout(p=0.45, inplace=False)
       (10): Linear(in_features=64, out_features=64, bias=True)
     )
   )
   (positional_encoder): PositionalEncoding()
   (attention_stack): Sequential(
     (0): MultiheadSelfAttention(
       (multihead_attention): MultiheadAttention(
         (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
       )
     )
     (1): LeakyReLU(negative_slope=0.01)
     (2): MultiheadSelfAttention(
       (multih

In [72]:
def predict_for_sequence(seq: list[int]):
    """Return the predicted class index for every position of ``seq``.

    The sequence is turned into a float tensor of shape ``(1, len(seq), 1)``
    on ``device`` and fed through the global model ``ac``. Inference runs in
    eval mode with gradient tracking disabled, so dropout layers cannot
    randomise the prediction; the model's previous train/eval mode is restored
    afterwards.

    Args:
        seq: Input values, one per sequence position.

    Returns:
        Tensor holding the argmax over dimension 2 of the model output
        (one class index per position for the single batch entry).
    """
    inputs = torch.tensor(
        [[[float(e)] for e in seq]],
        dtype=torch.float32,
        device=device,
    )

    was_training = ac.training
    ac.eval()
    try:
        with torch.no_grad():
            # Call the module itself rather than .forward so registered hooks run.
            logits = ac(inputs)
    finally:
        # Leave the model in whatever mode the caller had it in.
        ac.train(was_training)

    return torch.argmax(logits, dim=2)

In [86]:
# Spot-check the trained model on inputs that occur in the generated dataset:
# one contiguous run of ones at the start, the middle and the end of the sequence.
print('Train')
# Run of length 1 -- labelled 1 at the run's position in the dataset.
print(f'{predict_for_sequence([1, 0, 0, 0, 0, 0, 0, 0]) = }')
print(f'{predict_for_sequence([0, 0, 0, 1, 0, 0, 0, 0]) = }')
print(f'{predict_for_sequence([0, 0, 0, 0, 0, 0, 0, 1]) = }')
print()
# Run of length 2 -- labelled all-zero in the dataset.
print(f'{predict_for_sequence([ 1, 1, 0, 0, 0, 0,0, 0]) = }')
print(f'{predict_for_sequence([0, 0, 0, 0, 1, 1, 0, 0]) = }')
print(f'{predict_for_sequence([0, 0, 0, 0, 0, 0, 1, 1]) = }')
print()
# Run of length 3 -- labelled 1 only at the run's last position in the dataset.
print(f'{predict_for_sequence([ 1, 1, 1, 0, 0, 0,0, 0]) = }')
print(f'{predict_for_sequence([0, 0, 0, 1, 1, 1, 0, 0]) = }')
print(f'{predict_for_sequence([0, 0, 0, 0, 0, 1, 1, 1]) = }')
print()
# Run of length 4 -- labelled all-zero in the dataset.
print(f'{predict_for_sequence([1, 1, 1, 1, 0, 0, 0, 0]) = }')
print(f'{predict_for_sequence([0, 0, 1, 1, 1, 1, 0, 0]) = }')
print(f'{predict_for_sequence([0, 0, 0, 0, 1, 1, 1, 1]) = }')

print()
print()

# Inputs with several separated ones; the data-generation cell only produces a
# single contiguous run per sequence, so these patterns are not in the dataset.
print('Test')

print(f'{predict_for_sequence([0, 0, 0, 1, 0, 1, 0, 1]) = }')
print(f'{predict_for_sequence([1, 0, 0, 0, 0, 1, 0, 1]) = }')

Train
predict_for_sequence([1, 0, 0, 0, 0, 0, 0, 0]) = tensor([[1, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0')
predict_for_sequence([0, 0, 0, 1, 0, 0, 0, 0]) = tensor([[0, 0, 0, 1, 0, 0, 0, 0]], device='cuda:0')
predict_for_sequence([0, 0, 0, 0, 0, 0, 0, 1]) = tensor([[0, 0, 0, 0, 0, 0, 0, 1]], device='cuda:0')

predict_for_sequence([ 1, 1, 0, 0, 0, 0,0, 0]) = tensor([[0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0')
predict_for_sequence([0, 0, 0, 0, 1, 1, 0, 0]) = tensor([[0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0')
predict_for_sequence([0, 0, 0, 0, 0, 0, 1, 1]) = tensor([[0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0')

predict_for_sequence([ 1, 1, 1, 0, 0, 0,0, 0]) = tensor([[1, 1, 1, 0, 0, 0, 0, 0]], device='cuda:0')
predict_for_sequence([0, 0, 0, 1, 1, 1, 0, 0]) = tensor([[0, 0, 0, 1, 1, 1, 0, 0]], device='cuda:0')
predict_for_sequence([0, 0, 0, 0, 0, 1, 1, 1]) = tensor([[0, 0, 0, 0, 0, 1, 1, 1]], device='cuda:0')

predict_for_sequence([1, 1, 1, 1, 0, 0, 0, 0]) = tensor([[0, 0, 0, 0, 0, 0, 0, 0]]