In [1]:
# force the notebook to auto reload external python modules | useful for development
# `%autoreload 2` re-imports all modules before each cell executes, so edits to the
# project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# imports
import os
import sys

# Put the parent directory on the import path so the `src` package imported below
# can be resolved. A bare ".." names the parent directory on every OS, whereas the
# Windows-style "..\\" is not a valid directory reference on POSIX systems.
sys.path.append("..")
from src.data import MyDataset, AudioTripletDataset, LandmarkTripletDataset
from src.features import TripletGenerator
# from src.models import main

import torch
from torch.utils.data import random_split
from torch.utils.data import DataLoader

# seed passed to the torch.Generator used by random_split for the train/valid/test split
random_seed = 42

In [4]:
# paths
# Built with os.path.join so the separators match the host OS. The final ""
# component keeps the trailing separator that the original literals carried,
# so on Windows the values are unchanged ("..\\data\\raw\\").
raw_data_path = os.path.join("..", "data", "raw", "")
processed_data_path = os.path.join("..", "data", "processed", "")

---

In [5]:
# dataset object bound to the raw data directory; create_dataset() / load_dataset()
# are called on it in the following cells
my_dataset = MyDataset(location=raw_data_path)

In [5]:
# create & pickle dataset | RUN THIS CELL ONLY ONCE TO CREATE THE DATASET
# On later sessions skip this cell and run the load cell instead.
# NOTE(review): presumably create_dataset() reads from the `location` given to
# MyDataset and save_dataset() writes into processed_data_path — confirm in src.data.
my_dataset.create_dataset()
my_dataset.save_dataset(processed_data_path)



In [25]:
# load dataset | RUN THIS CELL TO LOAD THE SAVED DATASET
# Restore the saved dataset from the processed-data directory, then report how
# many samples it holds (sample_size is reused by the split cell).
my_dataset.load_dataset(processed_data_path)
sample_size = len(my_dataset)
print("Loaded dataset with {} samples.".format(sample_size))

Loaded dataset with 323 samples.


In [28]:
# train-valid-test split
# The samples are split BEFORE any triplets are built, so that triplets for one
# subset are only ever generated from that subset.

# 80% / 10% of the samples; the test subset takes whatever remains
train_size = int(0.8 * sample_size)
val_size = int(0.1 * sample_size)
test_size = sample_size - (train_size + val_size)

# a dedicated, explicitly seeded generator drives the split
split_generator = torch.Generator().manual_seed(random_seed)
train_set, val_set, test_set = random_split(
    my_dataset,
    lengths=[train_size, val_size, test_size],
    generator=split_generator,
)

# re-read the sizes from the subsets themselves (note: `valid_size` is the name used downstream)
train_size = len(train_set)
valid_size = len(val_set)
test_size = len(test_set)
print(
    f"Train dataset size: {train_size} \n"
    f"Valid dataset size: {valid_size} \n"
    f"Test dataset size: {test_size}"
)

Train dataset size: 258 
Valid dataset size: 32 
Test dataset size: 33


In [31]:
# CREATE & PICKLE triplets of indices | RUN THIS CELL ONLY ONCE TO CREATE THE TRIPLETS
# ensure diversity of triplets by setting num_triplets to a large number
# essentially 10 to 20 triplets per sample


def _build_and_save_triplets(subset, subset_size, prefix):
    """Request subset_size * 10 triplets from `subset`, save them under `prefix`, and return the generator."""
    generator = TripletGenerator(subset, num_triplets=subset_size*10, prefix=prefix)
    generator.save_triplets(processed_data_path)
    return generator


# one generator per subset, each created and saved before the next one is built
train_triplets = _build_and_save_triplets(train_set, train_size, "train")
valid_triplets = _build_and_save_triplets(val_set, valid_size, "valid")
test_triplets = _build_and_save_triplets(test_set, test_size, "test")

print(f"Created and saved: \
      \n {len(train_triplets.triplets)} train triplets \
      \n {len(valid_triplets.triplets)} valid triplets \
      \n {len(test_triplets.triplets)} test triplets"
      )

Created and saved:       
 2580 train triplets       
 320 valid triplets       
 330 test triplets


---

In [33]:
# LOAD triplets | RUN THIS CELL TO LOAD THE SAVED TRIPLETS


def _load_triplet_datasets(prefix):
    """Load the triplets saved under `prefix` and wrap them in an audio and a landmark triplet dataset."""
    generator = TripletGenerator(load=True, root_path=processed_data_path, prefix=prefix)
    # NOTE(review): the triplets are applied to my_dataset.data (the full dataset), while
    # they were generated from the random_split subsets — confirm that TripletGenerator
    # stores indices into the full dataset rather than subset-relative positions.
    audio_dataset = AudioTripletDataset(my_dataset.data, generator.triplets)
    landmark_dataset = LandmarkTripletDataset(my_dataset.data, generator.triplets)
    return generator, audio_dataset, landmark_dataset


(train_triplets,
 train_audio_triplet_dataset,
 train_landmark_triplet_dataset) = _load_triplet_datasets("train")

(valid_triplets,
 valid_audio_triplet_dataset,
 valid_landmark_triplet_dataset) = _load_triplet_datasets("valid")

(test_triplets,
 test_audio_triplet_dataset,
 test_landmark_triplet_dataset) = _load_triplet_datasets("test")

print(f"Loaded: \
      \n {len(train_triplets.triplets)} train triplets \
      \n {len(valid_triplets.triplets)} valid triplets \
      \n {len(test_triplets.triplets)} test triplets"
      )

Loaded:       
 2580 train triplets       
 320 valid triplets       
 330 test triplets


In [34]:
# create dataloaders
# Only the audio triplet datasets are batched here. Training data is shuffled,
# validation data is served in dataset order.
train_loader = DataLoader(
    dataset=train_audio_triplet_dataset,
    shuffle=True,
    batch_size=32,
)
valid_loader = DataLoader(
    dataset=valid_audio_triplet_dataset,
    shuffle=False,
    batch_size=32,
)

In [44]:
# imports
from src.config import SystemConfig, TrainingConfig
from src.logging import setup_log_directory
from src.models import MyModel, main
from src.visualization import plot_loss_accuracy

from torch.utils.tensorboard import SummaryWriter

In [57]:
# train model
# Build the network and echo its layer structure.
model = MyModel((250, 400))
print(model)

# Model checkpoint log dir setup: a fresh TrainingConfig is handed to
# setup_log_directory, which returns the config to use plus the version name.
training_config, current_version_name = setup_log_directory(TrainingConfig())

# Tensorboard log dir setup: the writer logs into the directory held by the config.
summary_writer = SummaryWriter(log_dir=training_config.log_dir)

SiameseModel(
  (siamese_network): LSTM(
    (lstm): LSTM(400, 64, batch_first=True)
    (attention): Linear(in_features=64, out_features=1, bias=True)
    (flatten): Flatten(start_dim=1, end_dim=-1)
    (dense1): Linear(in_features=64, out_features=128, bias=True)
    (dense2): Linear(in_features=128, out_features=64, bias=True)
    (dense3): Linear(in_features=64, out_features=32, bias=True)
  )
)
Logging at: ..\output\Logs_Checkpoints\Model_logs\version_5
Model Checkpoint at: ..\output\Logs_Checkpoints\Model_checkpoints\version_5


In [58]:
# Train and Validate
# main() receives the model, the (train, valid) loader pair and the configs, and
# returns the four training/validation loss and accuracy results unpacked below.
# NOTE(review): the output recorded for this cell is a CUDA "launch timed out"
# RuntimeError, so these four names were not assigned by that run.
train_loss, train_acc, val_loss, val_acc = main(
    model,
    (train_loader, valid_loader),
    system_config=SystemConfig(),
    training_config=training_config,
    summary_writer=summary_writer,
    scheduler=None,
    data_augmentation=False,
)

RuntimeError: CUDA error: the launch timed out and was terminated
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Plot the loss and accuracy results returned by main(); each one is wrapped in a
# one-element list and paired with a single colour.
plot_loss_accuracy(
    train_loss=[train_loss],
    train_acc=[train_acc],
    val_loss=[val_loss],
    val_acc=[val_acc],
    colors=["blue"],
    loss_legend_loc="upper center",
    acc_legend_loc="upper left",
)