In [1]:
import torch
import numpy as np
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn
from torch_optimizer import Lookahead

from config import dataset, data_loader, model as model_config, optimizer as optimizer_config, scheduler as scheduler_config, training
from src.data.data_loader import load_speech_commands_dataset, load_bg_noise_dataset
from utils import set_memory_GB,print_model_size, log_to_file
from src.utils.augmentations import add_time_shift_and_align, add_silence
from train_utils import trainig_loop





  def forward(ctx, xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight,
  def backward(ctx, dout):
  def forward(
  def backward(ctx, dout, *args):
  def forward(ctx, x, weight, bias, process_group=None, sequence_parallel=True):
  def backward(ctx, grad_output):
  def forward(ctx, zxbcdt, conv1d_weight, conv1d_bias, dt_bias, A, D, chunk_size, initial_states=None, seq_idx=None, dt_limit=(0.0, float("inf")), return_final_states=False, activation="silu",
  def backward(ctx, dout, *args):


In [2]:
import numpy as np
import random
import torch
from torch.utils.data import Dataset

class TFDatasetAdapter(Dataset):
    def __init__(self, tf_dataset, bg_noise_dataset=None, fixed_length=16000, augmentation=False, noise_level=0.3):
        self.tf_dataset = tf_dataset
        self.data = list(tf_dataset)
        self.bg_noise_data = list(bg_noise_dataset) if bg_noise_dataset is not None else None
        self.fixed_length = fixed_length
        self.augmentation = augmentation
        self.noise_level = noise_level

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        audio, label = self.data[idx]
        audio = audio.numpy()

        # Normalize the audio tensor
        audio = audio / np.max(np.abs(audio))

        # Convert to float32
        audio = audio.astype(np.float32)

        # Ensure the audio tensor has the correct shape (1D array)
        if audio.ndim > 1:
            audio = np.squeeze(audio)

        # Add background noise if available
        if self.bg_noise_data:
            bg_noise_audio = random.choice(self.bg_noise_data)

            # Trim or pad bg_noise to match the audio length
            if len(bg_noise_audio) < len(audio):
                bg_noise_audio = np.pad(bg_noise_audio, (0, len(audio) - len(bg_noise_audio)), mode='constant')
            else:
                # Take a random slice of bg_noise_audio with the same length as the original audio
                start_idx = random.randint(0, len(bg_noise_audio) - len(audio))
                bg_noise_audio = bg_noise_audio[start_idx:start_idx + len(audio)]

            # Add bg_noise as noise to the original audio
            audio = audio + self.noise_level * bg_noise_audio

        # Pad or trim the audio to the fixed length
        if len(audio) < self.fixed_length:
            audio = np.pad(audio, (0, self.fixed_length - len(audio)), mode='constant')
        else:
            audio = audio[:self.fixed_length]

        # Apply augmentations if any
        if self.augmentation:
            for aug in self.augmentation:
                audio = aug(audio)

        return torch.tensor(audio, dtype=torch.float32), torch.tensor(label.numpy(), dtype=torch.long)


In [3]:
import torch
import torch.nn as nn
import sys
sys.path.append('mamba/mamba_ssm/modules')
from mamba_simple import Mamba

class KeywordSpottingModel_with_cls(nn.Module):
    def __init__(self, input_length, d_model, d_state, d_conv, expand, label_names, num_mamba_layers=1, dropout_rate=0.2):
        super(KeywordSpottingModel_with_cls, self).__init__()
        
        # Initial CNN feature extractor for raw audio input
        self.cnn_extractor = nn.Sequential(
            nn.Conv1d(1, 16, kernel_size=3, stride=2, padding=1),  # Example: Adjust channels as needed
            nn.ReLU(),
            nn.Conv1d(16, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv1d(32, d_model, kernel_size=3, stride=2, padding=1),
            nn.ReLU()
        )
        
        # CLS token: learnable parameter with shape [1, 1, d_model]
        self.cls_token = nn.Parameter(torch.zeros(1, 1, d_model))
        
        # Quantization stubs
        self.quant = torch.quantization.QuantStub()  # Quantize the input
        self.dequant = torch.quantization.DeQuantStub()  # Dequantize output if needed
        
        # Stack multiple Mamba layers with RMSNorm layer
        self.mamba_layers = nn.ModuleList()
        self.layer_norms = nn.ModuleList()

        for _ in range(num_mamba_layers):
            self.mamba_layers.append(Mamba(d_model=d_model, d_state=d_state, d_conv=d_conv, expand=expand))
            self.layer_norms.append(nn.modules.normalization.RMSNorm(d_model))

        # Output layer
        self.fc = nn.Linear(d_model, len(label_names))  
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        # x is expected to have shape [batch_size, input_length] for raw audio
        print(f"Input shape: {x.shape}")
        
        # Add a channel dimension for Conv1D
        x = x.unsqueeze(1)  # Shape: [batch_size, 1, input_length]
        print(f"After unsqueeze (for Conv1D): {x.shape}")

        # Pass through the CNN feature extractor
        x = self.cnn_extractor(x)  # Shape: [batch_size, d_model, num_frames]
        print(f"After CNN feature extractor: {x.shape}")

        # Transpose to [batch_size, num_frames, d_model] for CLS token addition
        x = x.permute(0, 2, 1)
        print(f"After permute for CLS token addition: {x.shape}")
        
        # Create a CLS token and expand it across the batch dimension
        batch_size = x.size(0)
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)  # Shape: [batch_size, 1, d_model]
        print(f"CLS token shape: {cls_tokens.shape}")
        
        # Append the CLS token to the input sequence
        x = torch.cat((x, cls_tokens), dim=1)  # Shape: [batch_size, num_frames + 1, d_model]
        print(f"After concatenating CLS token: {x.shape}")
        
        # Transpose to [batch_size, num_frames + 1, d_model] for Mamba (instead of [batch_size, d_model, num_frames + 1])
        x = x.permute(0, 1, 2)
        print(f"After permute for Mamba layer: {x.shape}")
        
        # Pass through Mamba layers and layer normalization
        for i, (mamba_layer, layer_norm) in enumerate(zip(self.mamba_layers, self.layer_norms)):
            x = mamba_layer(x)
            print(f"After Mamba layer {i}: {x.shape}")
            x = layer_norm(x)  # Apply RMSNorm after Mamba layer
            print(f"After RMSNorm {i}: {x.shape}")

        x = self.dropout(x)  # Apply dropout after Mamba layers
        print(f"After dropout: {x.shape}")
        
        # Extract the CLS token output (last token)
        cls_output = x[:, -1, :]  # Shape: [batch_size, d_model]
        print(f"CLS token output shape: {cls_output.shape}")
        
        # Pass through the output layer
        x = self.fc(cls_output)
        print(f"Output shape: {x.shape}")
        
        return x


In [4]:
torch.cuda.is_available()

True

In [5]:
set_memory_GB(2)

Memory fraction set to 0.044916159152997036
Memory fraction in GB: 2.0


In [6]:
train_ds, val_ds, test_ds, silence_ds , info = load_speech_commands_dataset()
bg_noise_ds = None

2024-11-06 08:51:26.937652: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-06 08:51:26.959074: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-06 08:51:26.989493: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 08:51:27.031535: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 08:51:27.041761: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attemptin

In [7]:
# maintain seed for repructablity
np.seed = 42
# tf.random.set_seed(42)
torch.manual_seed(0)

<torch._C.Generator at 0x7f4cc819a3b0>

In [8]:
label_names = ['down', 'go', 'left', 'no', 'off', 'on', 'right', 'stop', 'up', 'yes']
print(label_names)

['down', 'go', 'left', 'no', 'off', 'on', 'right', 'stop', 'up', 'yes']


In [9]:
augmentations = [
    lambda x: add_time_shift_and_align(x),
]

In [10]:
# Convert the TFDS dataset to a PyTorch Dataset for raw audio input
fixed_length = 16000  # Length of the raw audio input

# Initialize the dataset adapters with raw audio (no MFCC transform)
pytorch_train_dataset = TFDatasetAdapter(
    train_ds.take(1000),
    bg_noise_ds,
    fixed_length,
    augmentation=augmentations,
    noise_level=0.2,
)

pytorch_val_dataset = TFDatasetAdapter(
    val_ds.take(1000),
    None,
    fixed_length,
    augmentation=False,
    )


2024-11-06 08:51:33.977170: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-11-06 08:51:34.713432: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [11]:
# #play sound from dataset
# import IPython.display as ipd

# for i in range(10):
#     x, y = pytorch_train_dataset[i]
#     print(label_names[y])
#     ipd.display(ipd.Audio(x.numpy(), rate=16000))
#     # print(x.shape)

In [12]:
# Create a DataLoader to feed the data into the model
batch_size = 32
train_loader = DataLoader(pytorch_train_dataset, batch_size=batch_size, shuffle=True,num_workers=4,prefetch_factor=2)
val_loader = DataLoader(pytorch_val_dataset, batch_size=batch_size, shuffle=False,num_workers=4,prefetch_factor=2)

# Training loop

# With L2 regulariztion AND Droput layer

In [13]:
configs = {'d_state': 51, 'd_conv': 10, 'expand': 2, 'batch_size': 26, 'dropout_rate': 0.134439213335519, 'num_mamba_layers': 2, 'n_mfcc': 23, 'n_fft': 475, 'hop_length': 119, 'n_mels': 61, 'noise_level': 0.2582577623788829, 'lr': 0.0011942156978344588, 'weight_decay': 2.5617519345807027e-05}

# Configuration parameters from configs dictionary
fixed_length = 16000  # Raw audio input length
d_model = configs['d_state']  # Output of CNN feature extractor should match d_model for compatibility
d_state = configs['d_state']
d_conv = configs['d_conv']
expand = configs['expand']
dropout_rate = configs['dropout_rate']
num_mamba_layers = configs['num_mamba_layers']
noise_level = configs['noise_level']
learning_rate = configs['lr']
weight_decay = configs['weight_decay']

# Initialize the model with raw audio input length
model = KeywordSpottingModel_with_cls(
    input_length=fixed_length,  # Use raw audio length instead of MFCC dimensions
    d_model=d_model,
    d_state=d_state,
    d_conv=d_conv,
    expand=expand,
    label_names=label_names,
    num_mamba_layers=num_mamba_layers,
    dropout_rate=dropout_rate
).to("cuda")

# Loss function
criterion = nn.CrossEntropyLoss().to("cuda")  # No need for class weights unless specified

# Optimizer setup
base_optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
optimizer = Lookahead(base_optimizer, k=5, alpha=0.5)  # Wrap around Adam optimizer



In [14]:
# Adding learning rate scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3)

In [16]:



# Training loop
num_epochs = 100

train_accuracies = []
val_accuracies = []
train_losses = []
val_losses = []

train_accuracies, val_accuracies, train_losses, val_losses = trainig_loop(model, num_epochs, train_loader, val_loader, criterion, optimizer, scheduler)





  0%|                                                                               | 0/32 [00:00<?, ?it/s]

Input shape: torch.Size([32, 16000])
After unsqueeze (for Conv1D): torch.Size([32, 1, 16000])
After CNN feature extractor: torch.Size([32, 51, 2000])
After permute for CLS token addition: torch.Size([32, 2000, 51])
CLS token shape: torch.Size([32, 1, 51])
After concatenating CLS token: torch.Size([32, 2001, 51])
After permute for Mamba layer: torch.Size([32, 2001, 51])
After Mamba layer 0: torch.Size([32, 2001, 51])
After RMSNorm 0: torch.Size([32, 2001, 51])
After Mamba layer 1: torch.Size([32, 2001, 51])
After RMSNorm 1: torch.Size([32, 2001, 51])
After dropout: torch.Size([32, 2001, 51])
CLS token output shape: torch.Size([32, 51])
Output shape: torch.Size([32, 10])


../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [1,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [3,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [4,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [10,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [13,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [14,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_los

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
#load model
model.load_state_dict(torch.load("best_model.pth"))
# load test data
pytorch_test_dataset = TFDatasetAdapter(test_ds,None, fixed_length, n_mfcc, n_fft, hop_length, n_mels, augmentation=None)
test_loader = DataLoader(pytorch_test_dataset, batch_size=batch_size, shuffle=False,num_workers=4,prefetch_factor=2)

# Evaluate the model on the test set
accuracy = 0
total = 0
model.eval()

with torch.no_grad():
    for audio, labels in test_loader:
        audio, labels = audio.to("cuda"), labels.to("cuda")
        outputs = model(audio)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        accuracy += (predicted == labels).sum().item()
test_accuracy = 100 * accuracy / total
print(f'Test Accuracy: {test_accuracy}%')





  model.load_state_dict(torch.load("best_model.pth"))


RuntimeError: Error(s) in loading state_dict for KeywordSpottingModel_with_cls:
	Missing key(s) in state_dict: "total_ops", "total_params", "quant.total_ops", "quant.total_params", "dequant.total_ops", "dequant.total_params", "mamba_layers.total_ops", "mamba_layers.total_params", "mamba_layers.0.total_ops", "mamba_layers.0.total_params", "mamba_layers.0.act.total_ops", "mamba_layers.0.act.total_params", "layer_norms.total_ops", "layer_norms.total_params", "layer_norms.0.total_ops", "layer_norms.0.total_params". 
	Unexpected key(s) in state_dict: "mamba_layers.1.A_log", "mamba_layers.1.D", "mamba_layers.1.in_proj.weight", "mamba_layers.1.conv1d.weight", "mamba_layers.1.conv1d.bias", "mamba_layers.1.x_proj.weight", "mamba_layers.1.dt_proj.weight", "mamba_layers.1.dt_proj.bias", "mamba_layers.1.out_proj.weight", "layer_norms.1.weight". 
	size mismatch for cls_token: copying a param with shape torch.Size([1, 1, 136]) from checkpoint, the shape in current model is torch.Size([1, 1, 52]).
	size mismatch for proj.weight: copying a param with shape torch.Size([136, 69]) from checkpoint, the shape in current model is torch.Size([52, 23]).
	size mismatch for proj.bias: copying a param with shape torch.Size([136]) from checkpoint, the shape in current model is torch.Size([52]).
	size mismatch for mamba_layers.0.A_log: copying a param with shape torch.Size([272, 51]) from checkpoint, the shape in current model is torch.Size([104, 51]).
	size mismatch for mamba_layers.0.D: copying a param with shape torch.Size([272]) from checkpoint, the shape in current model is torch.Size([104]).
	size mismatch for mamba_layers.0.in_proj.weight: copying a param with shape torch.Size([544, 136]) from checkpoint, the shape in current model is torch.Size([208, 52]).
	size mismatch for mamba_layers.0.conv1d.weight: copying a param with shape torch.Size([272, 1, 10]) from checkpoint, the shape in current model is torch.Size([104, 1, 10]).
	size mismatch for mamba_layers.0.conv1d.bias: copying a param with shape torch.Size([272]) from checkpoint, the shape in current model is torch.Size([104]).
	size mismatch for mamba_layers.0.x_proj.weight: copying a param with shape torch.Size([111, 272]) from checkpoint, the shape in current model is torch.Size([106, 104]).
	size mismatch for mamba_layers.0.dt_proj.weight: copying a param with shape torch.Size([272, 9]) from checkpoint, the shape in current model is torch.Size([104, 4]).
	size mismatch for mamba_layers.0.dt_proj.bias: copying a param with shape torch.Size([272]) from checkpoint, the shape in current model is torch.Size([104]).
	size mismatch for mamba_layers.0.out_proj.weight: copying a param with shape torch.Size([136, 272]) from checkpoint, the shape in current model is torch.Size([52, 104]).
	size mismatch for layer_norms.0.weight: copying a param with shape torch.Size([136]) from checkpoint, the shape in current model is torch.Size([52]).
	size mismatch for fc.weight: copying a param with shape torch.Size([10, 136]) from checkpoint, the shape in current model is torch.Size([10, 52]).

In [None]:
from utils import plot_learning_curves

plot_learning_curves(train_accuracies, val_accuracies, train_losses, val_losses)

In [None]:
import pandas as pd
from utils import compute_inference_GPU_mem
#save model size(macs, params) and accuracy
batch_size = configs['batch_size']
macs, params = print_model_size(model,input_size=torch.randn(batch_size, input_dim, d_model-1).to("cuda"))
macs = macs/1e9
accuracy = test_accuracy
data = {'Model': ['KeywordSpottingModel_RSM_Norm_0-1-2_order_cls_bgnoise'], 'GMACs': [macs], 'Params': [params], 'Accuracy': [accuracy]}
model_config = {'input_dim': input_dim, 'd_model': d_model, 'd_state': d_state, 'd_conv': d_conv, 'expand': expand}
data.update(model_config)
inf_GPU_mem = compute_inference_GPU_mem(model, input=torch.randn(1, input_dim, d_model-1).to("cuda"))
#inference macs and params
inf_macs, inf_params = print_model_size(model,input_size=torch.randn(1, input_dim, d_model-1).to("cuda"))
inference_data = {'Inference CUDA Mem in MB': [inf_GPU_mem], 'Inference GMACs': [inf_macs/1e9], 'Inference Params': [inf_params]}
data.update(inference_data)
df = pd.DataFrame(data, index=[0])
df.to_csv('results.csv', mode='a', header=False)

[INFO] Register count_linear() for <class 'torch.nn.modules.linear.Linear'>.
[INFO] Register count_convNd() for <class 'torch.nn.modules.conv.Conv1d'>.
[INFO] Register zero_ops() for <class 'torch.nn.modules.dropout.Dropout'>.

MACs: 25464920.0 Which are 0.02546492 Giga-MACs, Params: 19354.0



NameError: name 'test_accuracy' is not defined

In [None]:
with torch.profiler.profile(with_flops=True) as prof:
    model(torch.randn(32, input_dim, d_model-1).to("cuda"))

# Print FLOPs
print(prof.key_averages().table(sort_by="flops"))