In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import random
import torch

from src.data import CymruFluencyDataset
from src.features import get_mouth_landmarks

LOAD_DIR = 'data/processed/'
SAVE_MODEL_DIR = 'registery/'


  torch.utils._pytree._register_pytree_node(


In [3]:
# Set seed for reproducibility
seed = 42 

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # If using multiple GPUs

#torch.backends.cudnn.deterministic = True
#torch.backends.cudnn.benchmark = False

In [4]:
audio_static_feat = np.load(LOAD_DIR + 'audio_static_feat.npy')
audio_dynamic_feat = np.load(LOAD_DIR + 'audio_dynamic_feat.npy')
landmarks_feat = np.load(LOAD_DIR + 'landmarks.npy')
labels = np.load(LOAD_DIR + 'labels.npy')
landmarks_mouth_feat = get_mouth_landmarks(landmarks_feat)

In [5]:
from src.models import AudioGRUNet, LandmarkSTGCNNet, AudioLSTM, AudioLSTMAttn, LandmarkLSTM

In [6]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
def train_model(model, data_type, dataloaders, criterion, optimizer, num_epochs=50, device='cpu'):
    """
    Train the model and return the best model.

    Parameters:
        model: PyTorch model
        dataloaders: dict with keys 'train' and 'val' and values DataLoader objects
        criterion: loss function
        optimizer: optimization algorithm
        num_epochs: number of epochs
        device: 'cpu' or 'cuda'
    
    Returns:
        best_model: PyTorch model with best validation accuracy
    """
    best_model_wts = model.state_dict()
    best_acc = 0.0

    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs}")

        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # set model to training mode
            else:
                model.eval()   # set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            for audio_inputs, landmark_inputs, labels in dataloaders[phase]:
                if data_type == 'audio':
                    inputs = audio_inputs.to(device)
                elif data_type == 'landmark':
                    inputs = landmark_inputs.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()
                
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    _, preds = torch.max(outputs, 1)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels)

            epoch_loss = running_loss / len(dataloaders[phase].sampler)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].sampler)
            print(f"{phase.capitalize()} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}")

            # deep copy the model if validation accuracy improves
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = model.state_dict()

    print(f"Best Validation Accuracy: {best_acc:.4f}")
    model.load_state_dict(best_model_wts)
    return model

In [8]:
def main(
        dataset,
        data_type,
        labels,
        model_constructor,
        model_kwargs,
        save_path,
        num_epochs=50,
        batch_size=16,
        lr=1e-3,
):

    np.random.seed(42)
    
    n_samples = len(dataset)
    indices = np.arange(n_samples)
    
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
    train_idx, valid_idx = next(sss.split(indices, labels))
    
    train_dataset = Subset(dataset, train_idx)
    valid_dataset = Subset(dataset, valid_idx)
    
    dataloaders = {
        'train': DataLoader(train_dataset, batch_size=batch_size, shuffle=True),
        'val': DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
    }
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    model = model_constructor(**model_kwargs)
    model = model.to(device)
    
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    model = train_model(model, data_type, dataloaders, criterion, optimizer, num_epochs, device)
    
    torch.save(model.state_dict(), save_path)
    print(f"Model saved to {save_path}")


In [21]:
audio_lstm_kwargs = {
    'audio_input_size': 23,
    'lstm_hidden_size': 64,
    'lstm_num_layers': 1,
    'dropout': 0.8,
    'num_classes': 2
}

dataset = CymruFluencyDataset(audio_dynamic_feat, landmarks_mouth_feat, labels)

main(
    dataset=dataset,
    data_type='audio',
    labels=labels,
    model_constructor=AudioLSTM,
    model_kwargs=audio_lstm_kwargs,
    save_path=SAVE_MODEL_DIR + 'audio_lstm.pth',
    num_epochs=50,
    batch_size=16,
    lr=1e-3
)

Epoch 1/50
Train Loss: 0.6996 Acc: 0.5351
Val Loss: 0.6939 Acc: 0.5253
Epoch 2/50
Train Loss: 0.6932 Acc: 0.5307
Val Loss: 0.6841 Acc: 0.4949
Epoch 3/50
Train Loss: 0.6523 Acc: 0.6491
Val Loss: 0.6764 Acc: 0.5253
Epoch 4/50
Train Loss: 0.6750 Acc: 0.5789
Val Loss: 0.6637 Acc: 0.6162
Epoch 5/50
Train Loss: 0.6751 Acc: 0.6009
Val Loss: 0.6569 Acc: 0.6364
Epoch 6/50
Train Loss: 0.6460 Acc: 0.6447
Val Loss: 0.6513 Acc: 0.6364
Epoch 7/50
Train Loss: 0.6228 Acc: 0.6711
Val Loss: 0.6440 Acc: 0.6162
Epoch 8/50
Train Loss: 0.6384 Acc: 0.6009
Val Loss: 0.6281 Acc: 0.6768
Epoch 9/50
Train Loss: 0.6347 Acc: 0.6623
Val Loss: 0.6180 Acc: 0.6869
Epoch 10/50
Train Loss: 0.6002 Acc: 0.6667
Val Loss: 0.6068 Acc: 0.7071
Epoch 11/50
Train Loss: 0.5961 Acc: 0.7018
Val Loss: 0.6025 Acc: 0.6869
Epoch 12/50
Train Loss: 0.5594 Acc: 0.7281
Val Loss: 0.5870 Acc: 0.6869
Epoch 13/50
Train Loss: 0.5586 Acc: 0.6974
Val Loss: 0.5727 Acc: 0.7071
Epoch 14/50
Train Loss: 0.5525 Acc: 0.7281
Val Loss: 0.5629 Acc: 0.7778
E

In [24]:
audio_lstmatt_kwargs = {
    'audio_input_size': 23,
    'lstm_hidden_size': 16,
    'lstm_num_layers': 1,
    'dropout': 0.8,
    'num_classes': 2,
    'num_heads': 1
}

dataset = CymruFluencyDataset(audio_dynamic_feat, landmarks_mouth_feat, labels)

main(
    dataset,
    'audio',
    labels,
    AudioLSTMAttn,
    audio_lstmatt_kwargs,
    SAVE_MODEL_DIR + 'audio_lstmatt.pth',
    num_epochs=50,
    batch_size=16,
    lr=1e-3
)

Epoch 1/50
Train Loss: 0.7008 Acc: 0.4605
Val Loss: 0.6942 Acc: 0.4747
Epoch 2/50
Train Loss: 0.6855 Acc: 0.5921
Val Loss: 0.6923 Acc: 0.5657
Epoch 3/50
Train Loss: 0.6930 Acc: 0.4956
Val Loss: 0.6907 Acc: 0.5758
Epoch 4/50
Train Loss: 0.6818 Acc: 0.5746
Val Loss: 0.6878 Acc: 0.5556
Epoch 5/50
Train Loss: 0.6793 Acc: 0.5921
Val Loss: 0.6841 Acc: 0.5354
Epoch 6/50
Train Loss: 0.6737 Acc: 0.5658
Val Loss: 0.6793 Acc: 0.5455
Epoch 7/50
Train Loss: 0.6740 Acc: 0.6140
Val Loss: 0.6736 Acc: 0.6061
Epoch 8/50
Train Loss: 0.6563 Acc: 0.6316
Val Loss: 0.6628 Acc: 0.5758
Epoch 9/50
Train Loss: 0.6380 Acc: 0.6096
Val Loss: 0.6468 Acc: 0.6061
Epoch 10/50
Train Loss: 0.6348 Acc: 0.6842
Val Loss: 0.6315 Acc: 0.6970
Epoch 11/50
Train Loss: 0.6062 Acc: 0.6974
Val Loss: 0.6117 Acc: 0.6263
Epoch 12/50
Train Loss: 0.5716 Acc: 0.7237
Val Loss: 0.5819 Acc: 0.7273
Epoch 13/50
Train Loss: 0.5248 Acc: 0.7719
Val Loss: 0.5642 Acc: 0.7273
Epoch 14/50
Train Loss: 0.4992 Acc: 0.7895
Val Loss: 0.5299 Acc: 0.7475
E

In [None]:
audio_gru_kwargs = {
    'input_size': 23,
    'hidden_size': 128,
    'num_layers': 2,
    'num_classes': 2,
    'bidirectional': False,
    'gru_dropout': 0.2,
    'classifier_dropout': 0.2,
    'num_heads': 2,
}

dataset = CymruFluencyDataset(audio_dynamic_feat, landmarks_mouth_feat, labels)

main(
    dataset,
    'audio',
    labels,
    AudioGRUNet,
    audio_gru_kwargs,
    SAVE_MODEL_DIR + 'audio_gru.pth',
    num_epochs=50,
    batch_size=16,
    lr=1e-3
)

Epoch 1/50
Train Loss: 0.7157 Acc: 0.5088
Val Loss: 0.6606 Acc: 0.6465
Epoch 2/50
Train Loss: 0.6458 Acc: 0.6404
Val Loss: 0.6075 Acc: 0.7172
Epoch 3/50
Train Loss: 0.5169 Acc: 0.7149
Val Loss: 0.5865 Acc: 0.7172
Epoch 4/50
Train Loss: 0.4048 Acc: 0.8114
Val Loss: 0.5858 Acc: 0.7374
Epoch 5/50
Train Loss: 0.3131 Acc: 0.8640
Val Loss: 0.4050 Acc: 0.8081
Epoch 6/50
Train Loss: 0.2214 Acc: 0.9342
Val Loss: 0.4212 Acc: 0.8485
Epoch 7/50
Train Loss: 0.1330 Acc: 0.9518
Val Loss: 0.5528 Acc: 0.8384
Epoch 8/50
Train Loss: 0.0924 Acc: 0.9649
Val Loss: 1.1591 Acc: 0.7980
Epoch 9/50
Train Loss: 0.3071 Acc: 0.9123
Val Loss: 1.1880 Acc: 0.6869
Epoch 10/50
Train Loss: 0.4405 Acc: 0.7982
Val Loss: 0.4507 Acc: 0.7980
Epoch 11/50
Train Loss: 0.1939 Acc: 0.9474
Val Loss: 0.4400 Acc: 0.8384
Epoch 12/50
Train Loss: 0.0615 Acc: 0.9781
Val Loss: 0.5105 Acc: 0.8485
Epoch 13/50
Train Loss: 0.0372 Acc: 0.9868
Val Loss: 0.6778 Acc: 0.8586
Epoch 14/50
Train Loss: 0.0420 Acc: 0.9737
Val Loss: 0.7179 Acc: 0.8485
E

In [26]:
landmark_stgcn_kwargs = {
    "num_nodes": 20, 
    "in_channels": 3, 
    "hidden_channels": [64, 128], 
    "stgcn_dropout": 0.2, 
    "num_heads": 4, 
    "classifier_dropout": 0.2,
    "num_classes": 2
}

dataset = CymruFluencyDataset(audio_dynamic_feat, landmarks_mouth_feat, labels)

main(dataset, 'landmark', labels, LandmarkSTGCNNet, landmark_stgcn_kwargs, SAVE_MODEL_DIR + 'landmark_stgcn.pth')

Epoch 1/50
Train Loss: 0.6642 Acc: 0.5833
Val Loss: 0.7144 Acc: 0.5253
Epoch 2/50
Train Loss: 0.5921 Acc: 0.7105
Val Loss: 0.6442 Acc: 0.6465
Epoch 3/50
Train Loss: 0.6141 Acc: 0.6798
Val Loss: 0.6350 Acc: 0.6465
Epoch 4/50
Train Loss: 0.5794 Acc: 0.7105
Val Loss: 0.5877 Acc: 0.6667
Epoch 5/50
Train Loss: 0.5331 Acc: 0.7105
Val Loss: 0.5042 Acc: 0.7374
Epoch 6/50
Train Loss: 0.5484 Acc: 0.7105
Val Loss: 0.5122 Acc: 0.6970
Epoch 7/50
Train Loss: 0.5339 Acc: 0.6798
Val Loss: 0.4878 Acc: 0.7172
Epoch 8/50
Train Loss: 0.5208 Acc: 0.7325
Val Loss: 0.4982 Acc: 0.7071
Epoch 9/50
Train Loss: 0.5526 Acc: 0.6974
Val Loss: 0.5366 Acc: 0.7071
Epoch 10/50
Train Loss: 0.6092 Acc: 0.6842
Val Loss: 0.5028 Acc: 0.7475
Epoch 11/50
Train Loss: 0.5267 Acc: 0.7237
Val Loss: 0.5293 Acc: 0.6869
Epoch 12/50
Train Loss: 0.5104 Acc: 0.7325
Val Loss: 0.5027 Acc: 0.7374
Epoch 13/50
Train Loss: 0.5451 Acc: 0.7018
Val Loss: 0.5033 Acc: 0.7374
Epoch 14/50
Train Loss: 0.5637 Acc: 0.7281
Val Loss: 0.5524 Acc: 0.7273
E

In [15]:
landmark_stgcn_kwargs = {
    "landmark_input_size": 60, 
    "lstm_hidden_size": 128, 
    "lstm_num_layers": 2,
    "dropout": 0.2,
    "num_classes": 2
}

dataset = CymruFluencyDataset(audio_dynamic_feat, landmarks_mouth_feat, labels)

main(
    dataset,
    'landmark',
    labels,
    LandmarkLSTM,
    landmark_stgcn_kwargs,
    SAVE_MODEL_DIR + 'landmark_lstm.pth',
    lr=5e-5
)

Epoch 1/50
Train Loss: 0.6929 Acc: 0.5395
Val Loss: 0.6873 Acc: 0.5758
Epoch 2/50
Train Loss: 0.6852 Acc: 0.5921
Val Loss: 0.6826 Acc: 0.6364
Epoch 3/50
Train Loss: 0.6803 Acc: 0.6184
Val Loss: 0.6783 Acc: 0.6364
Epoch 4/50
Train Loss: 0.6747 Acc: 0.6360
Val Loss: 0.6727 Acc: 0.6364
Epoch 5/50
Train Loss: 0.6697 Acc: 0.6535
Val Loss: 0.6671 Acc: 0.6263
Epoch 6/50
Train Loss: 0.6640 Acc: 0.6535
Val Loss: 0.6609 Acc: 0.6061
Epoch 7/50
Train Loss: 0.6538 Acc: 0.6711
Val Loss: 0.6518 Acc: 0.6162
Epoch 8/50
Train Loss: 0.6459 Acc: 0.6711
Val Loss: 0.6425 Acc: 0.6566
Epoch 9/50
Train Loss: 0.6342 Acc: 0.6754
Val Loss: 0.6336 Acc: 0.6465
Epoch 10/50
Train Loss: 0.6249 Acc: 0.6754
Val Loss: 0.6237 Acc: 0.6566
Epoch 11/50
Train Loss: 0.6117 Acc: 0.6754
Val Loss: 0.6119 Acc: 0.6364
Epoch 12/50
Train Loss: 0.5985 Acc: 0.6886
Val Loss: 0.5971 Acc: 0.6566
Epoch 13/50
Train Loss: 0.5785 Acc: 0.6886
Val Loss: 0.5814 Acc: 0.6465
Epoch 14/50
Train Loss: 0.5566 Acc: 0.6930
Val Loss: 0.5543 Acc: 0.6566
E