In [3]:
# !pip install gdown
# !gdown --folder https://drive.google.com/drive/folders/18fbeQOzN4BMn09LPnFgWflhTP-r9JJrc?usp=sharing
# !pip install -r requirements.txt

In [1]:
import pytorch_lightning as pl
import torch

from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

from engine.attention import AttentionAudioClassifier
from engine.data import get_main_classes_loader

torch.set_float32_matmul_precision("high")
pl.seed_everything(123)

Seed set to 123


123

In [2]:
train_loader = get_main_classes_loader("train", oversample_silence=True)
val_loader = get_main_classes_loader("val")
test_loader = get_main_classes_loader("test")

In [3]:
EMBEDDING_SIZE = 80

In [None]:
callbacks = [
    EarlyStopping(
        monitor="val_balanced_accuracy",
        mode="max",
        patience=10,
        min_delta=1e-5,
    ),
    ModelCheckpoint(
        save_top_k=1,
        monitor="val_accuracy",
        filename="model-{epoch}-{val_accuracy:.2f}",
        mode="max",
        every_n_epochs=1
    )
]

model = AttentionAudioClassifier(3, 100, EMBEDDING_SIZE, 512, 4, 4).cuda()
trainer = pl.Trainer(
    max_epochs=100,
    callbacks=callbacks,
    gradient_clip_val=1,
    gradient_clip_algorithm="norm"
)
trainer.fit(model, train_loader, val_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type             | Params
------------------------------------------------
0 | encoder    | AudioEncoder     | 11.4 M
1 | classifier | Sequential       | 6.6 M 
2 | loss       | CrossEntropyLoss | 0     
------------------------------------------------
18.0 M    Trainable params
0         Non-trainable params
18.0 M    Total params
71.907    Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=55` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=55` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [15]:
# best_model = AttentionAudioClassifier.load_from_checkpoint("lightning_logs/version_1/checkpoints/model-epoch=35-val_accuracy=0.92.ckpt")

In [27]:
labels = []
predictions = []
for X, y in test_loader:
    y_pred = best_model(X).argmax(dim=1)
    labels.append(y)
    predictions.append(y_pred)
labels = torch.concat(labels, dim=0).cpu().numpy()
predictions = torch.concat(predictions, dim=0).cpu().numpy()

In [29]:
from sklearn.metrics import confusion_matrix

In [30]:
confusion_matrix(labels, predictions)

array([[2227,  337,    3],
       [ 254, 4012,    2],
       [  40,   15,  269]])

In [36]:
from collections import Counter
Counter(labels)

Counter({1: 4268, 0: 2567, 2: 324})

In [33]:
from collections import Counter
Counter(predictions)

Counter({1: 4364, 0: 2521, 2: 274})

In [34]:
import json
with open("data/converted/test_labels.json", "r") as f:
    l = json.load(f)

In [37]:
Counter(l)

Counter({9: 4268,
         7: 324,
         10: 272,
         2: 267,
         4: 262,
         6: 259,
         11: 256,
         0: 253,
         3: 252,
         1: 251,
         8: 249,
         5: 246})

In [39]:
import pickle as pkl
with open("data/converted/encoder.pkl", "rb") as f:
    encoder = pkl.load(f)
dict(zip(list(range(12)), encoder.inverse_transform(list(range(12)))))

{0: 'down',
 1: 'go',
 2: 'left',
 3: 'no',
 4: 'off',
 5: 'on',
 6: 'right',
 7: 'silence',
 8: 'stop',
 9: 'unknown',
 10: 'up',
 11: 'yes'}