**Note:** Some cells may appear as not run (shown as `In [None]`) because the kernel was restarted several times to resolve errors.

In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports all
# modules before each cell executes so that edits to the project's own
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import random
import torch

from src.data import CymruFluencyDataset
from src.main import main
from src.features import get_mouth_landmarks

# Directory prefix for the preprocessed .npy arrays read by this notebook.
LOAD_DIR = "data/processed/"
# Directory prefix prepended to every checkpoint path handed to main().
# NOTE(review): "registery" looks like a misspelling of "registry"; the value
# is kept as-is because it must match the directory that exists on disk.
SAVE_MODEL_DIR = "registery/"


  torch.utils._pytree._register_pytree_node(


In [3]:
# Seed every random number generator used in this notebook with one value.
seed = 42

# Applied in the same order as before: Python, NumPy, PyTorch CPU,
# current CUDA device, then all CUDA devices (multi-GPU setups).
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(seed)

# Deliberately left disabled. Enabling these two flags asks cuDNN for
# deterministic kernels and turns off its autotuner.
# NOTE(review): with them disabled, GPU runs may still vary between executions.
#torch.backends.cudnn.deterministic = True
#torch.backends.cudnn.benchmark = False

In [4]:
# Read the four preprocessed arrays from LOAD_DIR, in the same order as the
# names on the left-hand side.
audio_static_feat, audio_dynamic_feat, landmarks_feat, labels = (
    np.load(LOAD_DIR + fname)
    for fname in (
        'audio_static_feat.npy',
        'audio_dynamic_feat.npy',
        'landmarks.npy',
        'labels.npy',
    )
)

# Mouth-only landmark features; these are what the training cells below
# pass to CymruFluencyDataset.
landmarks_mouth_feat = get_mouth_landmarks(landmarks_feat)

#### MLP

In [None]:
from src.models import MLP

# The MLP is the only model here trained on the *static* audio features,
# paired with the mouth landmarks.
dataset = CymruFluencyDataset(audio_static_feat, landmarks_mouth_feat, labels)
save_path = f"{SAVE_MODEL_DIR}mlp.pth"

# Model hyper-parameters (presumably forwarded by main() to the MLP constructor).
model_kwargs = dict(
    audio_dim=23,
    landmark_vector_dim=3 * 20,  # presumably 3 coords x 20 mouth landmarks -- TODO confirm
    hidden_dim=64,
    num_classes=2,
)

# 50 epochs at batch size 16; no lr is passed, so main()'s default applies.
main(
    dataset,
    labels,
    MLP,
    model_kwargs,
    save_path,
    num_epochs=50,
    batch_size=16,
)


Epoch 1/50
Train Loss: 0.7105 Acc: 0.4649
Val Loss: 0.6053 Acc: 0.6869
Epoch 2/50
Train Loss: 0.6365 Acc: 0.6184
Val Loss: 0.5808 Acc: 0.7172
Epoch 3/50
Train Loss: 0.6162 Acc: 0.6404
Val Loss: 0.5587 Acc: 0.7374
Epoch 4/50
Train Loss: 0.5866 Acc: 0.6886
Val Loss: 0.5322 Acc: 0.7677
Epoch 5/50
Train Loss: 0.5837 Acc: 0.6754
Val Loss: 0.5150 Acc: 0.7778
Epoch 6/50
Train Loss: 0.5317 Acc: 0.7544
Val Loss: 0.4742 Acc: 0.8485
Epoch 7/50
Train Loss: 0.4938 Acc: 0.7982
Val Loss: 0.4557 Acc: 0.8283
Epoch 8/50
Train Loss: 0.4419 Acc: 0.8465
Val Loss: 0.4283 Acc: 0.7980
Epoch 9/50
Train Loss: 0.4472 Acc: 0.8158
Val Loss: 0.3807 Acc: 0.8889
Epoch 10/50
Train Loss: 0.4464 Acc: 0.8246
Val Loss: 0.4282 Acc: 0.8586
Epoch 11/50
Train Loss: 0.4016 Acc: 0.8553
Val Loss: 0.3571 Acc: 0.9091
Epoch 12/50
Train Loss: 0.3817 Acc: 0.8904
Val Loss: 0.3423 Acc: 0.8283
Epoch 13/50
Train Loss: 0.3240 Acc: 0.9430
Val Loss: 0.3430 Acc: 0.8889
Epoch 14/50
Train Loss: 0.3044 Acc: 0.8991
Val Loss: 0.3137 Acc: 0.8687
E

### LSTM-LSTM variants

#### Self-attention Fusion

In [None]:
from src.models.lstm_lstm import MultiModalLSTM

# Dynamic audio features paired with the mouth landmarks.
dataset = CymruFluencyDataset(audio_dynamic_feat, landmarks_mouth_feat, labels)

# The checkpoint name encodes the learning rate (3e-5) and epoch count (500).
save_path = f"{SAVE_MODEL_DIR}multimodal_lstms_attention_3e-5_500.pth"

# Model hyper-parameters; fusion_type "self" selects the self-attention variant.
model_kwargs = dict(
    audio_input_size=23,
    landmark_input_size=3 * 20,  # presumably 3 coords x 20 mouth landmarks -- TODO confirm
    lstm_hidden_size=128,
    fusion_type="self",
)

main(
    dataset,
    labels,
    MultiModalLSTM,
    model_kwargs,
    save_path,
    num_epochs=500,
    batch_size=16,
    lr=3e-5,
)

Epoch 1/500
Train Loss: 0.7019 Acc: 0.4956
Val Loss: 0.6918 Acc: 0.4848
Epoch 2/500
Train Loss: 0.7011 Acc: 0.4693
Val Loss: 0.6911 Acc: 0.4848
Epoch 3/500
Train Loss: 0.6976 Acc: 0.4912
Val Loss: 0.6898 Acc: 0.5051
Epoch 4/500
Train Loss: 0.6918 Acc: 0.5482
Val Loss: 0.6885 Acc: 0.5354
Epoch 5/500
Train Loss: 0.6973 Acc: 0.4781
Val Loss: 0.6875 Acc: 0.6061
Epoch 6/500
Train Loss: 0.6941 Acc: 0.5219
Val Loss: 0.6870 Acc: 0.5960
Epoch 7/500
Train Loss: 0.6969 Acc: 0.5088
Val Loss: 0.6861 Acc: 0.6869
Epoch 8/500
Train Loss: 0.6927 Acc: 0.5614
Val Loss: 0.6855 Acc: 0.6970
Epoch 9/500
Train Loss: 0.6947 Acc: 0.5482
Val Loss: 0.6850 Acc: 0.7172
Epoch 10/500
Train Loss: 0.6902 Acc: 0.4474
Val Loss: 0.6842 Acc: 0.6869
Epoch 11/500
Train Loss: 0.6862 Acc: 0.5395
Val Loss: 0.6835 Acc: 0.6768
Epoch 12/500
Train Loss: 0.6846 Acc: 0.5789
Val Loss: 0.6824 Acc: 0.6768
Epoch 13/500
Train Loss: 0.6935 Acc: 0.5482
Val Loss: 0.6816 Acc: 0.6566
Epoch 14/500
Train Loss: 0.6771 Acc: 0.5702
Val Loss: 0.6806

#### Cross-attention Fusion

In [7]:
from src.models.lstm_lstm import MultiModalLSTM

# Train the LSTM-LSTM model with cross-attention fusion on the dynamic audio
# features and the mouth landmarks.
model_kwargs = {
    "audio_input_size": 23,
    "landmark_input_size": 3 * 20,  # presumably 3 coords x 20 mouth landmarks -- TODO confirm
    "lstm_hidden_size": 128,
    "fusion_type": "cross"
}

# The checkpoint name encodes the learning rate and the epoch count. It used
# to end in "_500" although this cell trains for 200 epochs, so the suffix now
# matches the actual run configuration.
# NOTE(review): any code that loads the old "..._cross_5e-5_500.pth" file must
# be pointed at this name (or the existing file renamed).
save_path = SAVE_MODEL_DIR + "multimodal_lstms_cross_5e-5_200.pth"

dataset = CymruFluencyDataset(audio_dynamic_feat, landmarks_mouth_feat, labels)

main(dataset, labels, MultiModalLSTM, model_kwargs, save_path, num_epochs=200, batch_size=16, lr=5e-5)

Epoch 1/200
Train Loss: 0.7125 Acc: 0.4825
Val Loss: 0.6936 Acc: 0.4141
Epoch 2/200
Train Loss: 0.7060 Acc: 0.4956
Val Loss: 0.6928 Acc: 0.4444
Epoch 3/200
Train Loss: 0.7007 Acc: 0.5439
Val Loss: 0.6919 Acc: 0.5253
Epoch 4/200
Train Loss: 0.7097 Acc: 0.5088
Val Loss: 0.6909 Acc: 0.5152
Epoch 5/200
Train Loss: 0.7010 Acc: 0.4956
Val Loss: 0.6897 Acc: 0.5354
Epoch 6/200
Train Loss: 0.6900 Acc: 0.5307
Val Loss: 0.6882 Acc: 0.5657
Epoch 7/200
Train Loss: 0.7038 Acc: 0.5000
Val Loss: 0.6866 Acc: 0.5960
Epoch 8/200
Train Loss: 0.6754 Acc: 0.5395
Val Loss: 0.6852 Acc: 0.6263
Epoch 9/200
Train Loss: 0.6861 Acc: 0.5570
Val Loss: 0.6841 Acc: 0.6162
Epoch 10/200
Train Loss: 0.6970 Acc: 0.5614
Val Loss: 0.6830 Acc: 0.6263
Epoch 11/200
Train Loss: 0.6915 Acc: 0.5570
Val Loss: 0.6816 Acc: 0.6364
Epoch 12/200
Train Loss: 0.6869 Acc: 0.5614
Val Loss: 0.6802 Acc: 0.6364
Epoch 13/200
Train Loss: 0.6796 Acc: 0.5526
Val Loss: 0.6785 Acc: 0.6061
Epoch 14/200
Train Loss: 0.7047 Acc: 0.5175
Val Loss: 0.6770

#### Concatenation Fusion

In [8]:
from src.models.lstm_lstm import MultiModalLSTM

# Dynamic audio features paired with the mouth landmarks.
dataset = CymruFluencyDataset(audio_dynamic_feat, landmarks_mouth_feat, labels)

# The checkpoint name encodes the learning rate (3e-5) and epoch count (200).
save_path = f"{SAVE_MODEL_DIR}multimodal_lstms_concat_3e-5_200.pth"

# Model hyper-parameters; fusion_type "concat" selects concatenation fusion.
model_kwargs = dict(
    audio_input_size=23,
    landmark_input_size=3 * 20,  # presumably 3 coords x 20 mouth landmarks -- TODO confirm
    lstm_hidden_size=128,
    fusion_type="concat",
)

main(
    dataset,
    labels,
    MultiModalLSTM,
    model_kwargs,
    save_path,
    num_epochs=200,
    batch_size=16,
    lr=3e-5,
)

Epoch 1/200
Train Loss: 0.7111 Acc: 0.5219
Val Loss: 0.6907 Acc: 0.5354
Epoch 2/200
Train Loss: 0.7068 Acc: 0.5219
Val Loss: 0.6899 Acc: 0.5455
Epoch 3/200
Train Loss: 0.7200 Acc: 0.5482
Val Loss: 0.6890 Acc: 0.5758
Epoch 4/200
Train Loss: 0.7085 Acc: 0.5219
Val Loss: 0.6882 Acc: 0.5859
Epoch 5/200
Train Loss: 0.6770 Acc: 0.6009
Val Loss: 0.6876 Acc: 0.5859
Epoch 6/200
Train Loss: 0.7002 Acc: 0.5132
Val Loss: 0.6870 Acc: 0.5859
Epoch 7/200
Train Loss: 0.7141 Acc: 0.4956
Val Loss: 0.6864 Acc: 0.5960
Epoch 8/200
Train Loss: 0.7200 Acc: 0.5219
Val Loss: 0.6855 Acc: 0.5960
Epoch 9/200
Train Loss: 0.6993 Acc: 0.5307
Val Loss: 0.6848 Acc: 0.6061
Epoch 10/200
Train Loss: 0.7230 Acc: 0.5175
Val Loss: 0.6840 Acc: 0.5960
Epoch 11/200
Train Loss: 0.7017 Acc: 0.5132
Val Loss: 0.6830 Acc: 0.5859
Epoch 12/200
Train Loss: 0.6846 Acc: 0.5307
Val Loss: 0.6820 Acc: 0.5960
Epoch 13/200
Train Loss: 0.6834 Acc: 0.5702
Val Loss: 0.6813 Acc: 0.5758
Epoch 14/200
Train Loss: 0.7101 Acc: 0.4868
Val Loss: 0.6803

### GRU-STGCN variants

#### Self-attention Fusion

In [None]:
from src.features import adjacency_matrix, mouth_connectivity_dictionary
from src.models import MultiModalFusionNet

# Graph structure handed to the model: adjacency matrix built from the
# mouth connectivity dictionary.
adj_mat = adjacency_matrix(mouth_connectivity_dictionary())

# Dynamic audio features paired with the mouth landmarks.
dataset = CymruFluencyDataset(audio_dynamic_feat, landmarks_mouth_feat, labels)
save_path = f"{SAVE_MODEL_DIR}multimodal_gru-stgcn_attention_self.pth"

# Model hyper-parameters for the self-attention fusion variant.
model_kwargs = dict(
    adjacency_matrix=adj_mat,
    audio_input_size=23,
    audio_hidden_size=64,
    audio_num_layers=1,
    bidirectional_audio=False,
    stgcn_in_channels=3,
    stgcn_hidden_channels=[16, 32],
    fusion_type='self',
    fusion_heads=2,
    num_classes=2,
)

main(
    dataset,
    labels,
    MultiModalFusionNet,
    model_kwargs,
    save_path,
    num_epochs=50,
    batch_size=16,
    lr=1e-4,
)


Epoch 1/50
Train Loss: 0.6816 Acc: 0.5351
Val Loss: 0.6636 Acc: 0.5859
Epoch 2/50
Train Loss: 0.6211 Acc: 0.6667
Val Loss: 0.5951 Acc: 0.7273
Epoch 3/50
Train Loss: 0.5068 Acc: 0.7719
Val Loss: 0.4902 Acc: 0.8081
Epoch 4/50
Train Loss: 0.4127 Acc: 0.8114
Val Loss: 0.4955 Acc: 0.7677
Epoch 5/50
Train Loss: 0.3938 Acc: 0.8202
Val Loss: 0.7671 Acc: 0.6263
Epoch 6/50
Train Loss: 0.2977 Acc: 0.8728
Val Loss: 0.4088 Acc: 0.8384
Epoch 7/50
Train Loss: 0.2222 Acc: 0.8991
Val Loss: 0.4127 Acc: 0.8283
Epoch 8/50
Train Loss: 0.1401 Acc: 0.9649
Val Loss: 0.4526 Acc: 0.8081
Epoch 9/50
Train Loss: 0.1009 Acc: 0.9649
Val Loss: 0.3661 Acc: 0.8889
Epoch 10/50
Train Loss: 0.0516 Acc: 0.9825
Val Loss: 0.4408 Acc: 0.8889
Epoch 11/50
Train Loss: 0.0438 Acc: 0.9825
Val Loss: 0.3878 Acc: 0.9091
Epoch 12/50
Train Loss: 0.0694 Acc: 0.9737
Val Loss: 0.7813 Acc: 0.7879
Epoch 13/50
Train Loss: 0.1602 Acc: 0.9342
Val Loss: 0.6066 Acc: 0.7879
Epoch 14/50
Train Loss: 0.1223 Acc: 0.9518
Val Loss: 0.5620 Acc: 0.8182
E

#### Concatenation Fusion

In [None]:
from src.features import adjacency_matrix, mouth_connectivity_dictionary
from src.models import MultiModalFusionNet

# Graph structure handed to the model: adjacency matrix built from the
# mouth connectivity dictionary.
adj_mat = adjacency_matrix(mouth_connectivity_dictionary())

# Dynamic audio features paired with the mouth landmarks.
dataset = CymruFluencyDataset(audio_dynamic_feat, landmarks_mouth_feat, labels)
save_path = f"{SAVE_MODEL_DIR}multimodal_gru-stgcn_attention_concat.pth"

# Model hyper-parameters for the concatenation fusion variant.
model_kwargs = dict(
    adjacency_matrix=adj_mat,
    audio_input_size=23,
    audio_hidden_size=64,
    audio_num_layers=1,
    bidirectional_audio=False,
    stgcn_in_channels=3,
    stgcn_hidden_channels=[16, 32],
    fusion_type='concat',  # 'self', 'concat', 'cross'
    fusion_heads=2,
    num_classes=2,
)

main(
    dataset,
    labels,
    MultiModalFusionNet,
    model_kwargs,
    save_path,
    num_epochs=50,
    batch_size=16,
    lr=1e-4,
)


Epoch 1/50
Train Loss: 0.6939 Acc: 0.5746
Val Loss: 0.6579 Acc: 0.6162
Epoch 2/50
Train Loss: 0.6495 Acc: 0.5965
Val Loss: 0.6174 Acc: 0.6566
Epoch 3/50
Train Loss: 0.6203 Acc: 0.6798
Val Loss: 0.6009 Acc: 0.7071
Epoch 4/50
Train Loss: 0.5632 Acc: 0.7588
Val Loss: 0.5594 Acc: 0.7071
Epoch 5/50
Train Loss: 0.5048 Acc: 0.8158
Val Loss: 0.5184 Acc: 0.7778
Epoch 6/50
Train Loss: 0.4518 Acc: 0.8333
Val Loss: 0.4913 Acc: 0.7677
Epoch 7/50
Train Loss: 0.3812 Acc: 0.8684
Val Loss: 0.4746 Acc: 0.7273
Epoch 8/50
Train Loss: 0.3553 Acc: 0.8553
Val Loss: 0.4261 Acc: 0.7677
Epoch 9/50
Train Loss: 0.2974 Acc: 0.8991
Val Loss: 0.4028 Acc: 0.8384
Epoch 10/50
Train Loss: 0.2309 Acc: 0.9342
Val Loss: 0.5277 Acc: 0.7172
Epoch 11/50
Train Loss: 0.2291 Acc: 0.9167
Val Loss: 0.4299 Acc: 0.7677
Epoch 12/50
Train Loss: 0.1680 Acc: 0.9561
Val Loss: 0.4115 Acc: 0.7677
Epoch 13/50
Train Loss: 0.1368 Acc: 0.9693
Val Loss: 0.3709 Acc: 0.7879
Epoch 14/50
Train Loss: 0.1094 Acc: 0.9825
Val Loss: 0.4411 Acc: 0.7475
E

#### Cross-attention Fusion

In [None]:
from src.features import adjacency_matrix, mouth_connectivity_dictionary
from src.models import MultiModalFusionNet

# Graph structure handed to the model: adjacency matrix built from the
# mouth connectivity dictionary.
adj_mat = adjacency_matrix(mouth_connectivity_dictionary())

# Dynamic audio features paired with the mouth landmarks.
dataset = CymruFluencyDataset(audio_dynamic_feat, landmarks_mouth_feat, labels)
save_path = f"{SAVE_MODEL_DIR}multimodal_gru-stgcn_attention_cross.pth"

# Model hyper-parameters for the cross-attention fusion variant.
model_kwargs = dict(
    adjacency_matrix=adj_mat,
    audio_input_size=23,
    audio_hidden_size=64,
    audio_num_layers=1,
    bidirectional_audio=False,
    stgcn_in_channels=3,
    stgcn_hidden_channels=[16, 32],
    fusion_type='cross',  # 'self', 'concat', 'cross'
    fusion_heads=2,
    num_classes=2,
)

main(
    dataset,
    labels,
    MultiModalFusionNet,
    model_kwargs,
    save_path,
    num_epochs=50,
    batch_size=16,
    lr=1e-4,
)


Epoch 1/50
Train Loss: 0.6831 Acc: 0.5526
Val Loss: 0.6449 Acc: 0.6263
Epoch 2/50
Train Loss: 0.6005 Acc: 0.6842
Val Loss: 0.5744 Acc: 0.6869
Epoch 3/50
Train Loss: 0.5263 Acc: 0.7544
Val Loss: 0.7519 Acc: 0.5556
Epoch 4/50
Train Loss: 0.4190 Acc: 0.7982
Val Loss: 0.4635 Acc: 0.7677
Epoch 5/50
Train Loss: 0.3293 Acc: 0.8333
Val Loss: 0.3674 Acc: 0.7879
Epoch 6/50
Train Loss: 0.3005 Acc: 0.8289
Val Loss: 0.3251 Acc: 0.8485
Epoch 7/50
Train Loss: 0.2211 Acc: 0.8860
Val Loss: 0.4172 Acc: 0.7879
Epoch 8/50
Train Loss: 0.2019 Acc: 0.9079
Val Loss: 0.4853 Acc: 0.7576
Epoch 9/50
Train Loss: 0.1312 Acc: 0.9474
Val Loss: 0.2776 Acc: 0.8788
Epoch 10/50
Train Loss: 0.0582 Acc: 0.9956
Val Loss: 0.3233 Acc: 0.8586
Epoch 11/50
Train Loss: 0.0345 Acc: 0.9956
Val Loss: 0.3306 Acc: 0.8990
Epoch 12/50
Train Loss: 0.0243 Acc: 1.0000
Val Loss: 0.3455 Acc: 0.8788
Epoch 13/50
Train Loss: 0.0109 Acc: 1.0000
Val Loss: 0.3797 Acc: 0.8788
Epoch 14/50
Train Loss: 0.0050 Acc: 1.0000
Val Loss: 0.3924 Acc: 0.8687
E

#### Siamese GRU-STGCN 