In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
import pandas as pd
from torchsummary import summary
from torch.utils.data import Dataset, DataLoader
import numpy as np

In [2]:
#load features and labels
features = np.load('features.npy')
labels = np.load('labels.npy')


In [3]:
print("Number of samples:", len(features))
print("Shape of one feature:", features[0].shape)

Number of samples: 5334
Shape of one feature: (40, 115)


In [4]:
from sklearn.model_selection import train_test_split


X_train, X_val, y_train, y_val = train_test_split(
    features, labels, test_size=0.2, stratify=labels, random_state=42
)

print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)


(4267, 40, 115)
(1067, 40, 115)
(4267,)
(1067,)


In [5]:
import numpy as np

# Stack all training features to compute mean and std
all_train_features = np.stack(X_train, axis=0)  # Shape: (num_samples, n_mfcc, max_length)

# Compute mean and std across the dataset (axis=0)
mean = np.mean(all_train_features, axis=0)
std = np.std(all_train_features, axis=0)

print(mean.shape)
print(std.shape)


(40, 115)
(40, 115)


In [6]:
def normalize(features, mean, std):
    return (features - mean) / std

# Normalize training data
X_train_norm = [normalize(f, mean, std) for f in X_train]

# Normalize validation data
X_val_norm = [normalize(f, mean, std) for f in X_val]


np.save('mean.npy', mean)
np.save('std.npy', std)



In [7]:
class AudioDataset(Dataset):
    def __init__(self, features, labels):
        self.X = features
        self.y = labels

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        # Convert features to tensor and add channel dimension
        mfcc = self.X[idx]
        mfcc = torch.tensor(mfcc, dtype=torch.float32)
        mfcc = mfcc.unsqueeze(0)  # Add channel dimension,  Shape: (1, n_mfcc, max_length)
        label = torch.tensor(self.y[idx], dtype=torch.long)
        return mfcc, label

In [8]:
# Create datasets
train_dataset = AudioDataset(X_train_norm, y_train)
val_dataset = AudioDataset(X_val_norm, y_val)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [9]:
class AudioCNN(nn.Module):
    def __init__(self, num_labels, input_height, input_width):
        super().__init__()

        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1,
                out_channels=16,
                kernel_size=(3,3),
                padding=1,
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.BatchNorm2d(16)
        )

        self.conv2 = nn.Sequential(
            nn.Conv2d(
                in_channels=16,
                out_channels=32,
                kernel_size=(3,3),
                padding=1,
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.BatchNorm2d(32)
        )

        # Calculate the size after convolutional layers
        def conv2d_output_size(size, kernel_size=3, stride=1, padding=1):
            return (size + 2*padding - (kernel_size - 1) - 1) // stride + 1

        def maxpool_output_size(size, kernel_size=2, stride=2):
            return (size - (kernel_size - 1) - 1) // stride + 1

        # Compute height and width after conv and pooling layers
        h = input_height
        w = input_width

        h = conv2d_output_size(h)
        h = maxpool_output_size(h)
        h = conv2d_output_size(h)
        h = maxpool_output_size(h)

        w = conv2d_output_size(w)
        w = maxpool_output_size(w)
        w = conv2d_output_size(w)
        w = maxpool_output_size(w)

        # flatten
        self.flatten = nn.Flatten()

        # linear
        self.linear = nn.Linear(32*h*w, num_labels)
 
    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.flatten(x)
        logits = self.linear(x)
        return logits
 

In [10]:
num_labels = len(np.unique(labels))
n_mfcc= X_train_norm[0].shape[0]
max_length = X_train_norm[0].shape[1]

model = AudioCNN(num_labels, input_height=n_mfcc, input_width=max_length)

In [11]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)



In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cpu


In [13]:
num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, (mfcc, label) in enumerate(train_loader):
        mfcc = mfcc.to(device)
        label = label.to(device)

        optimizer.zero_grad()
        output = model(mfcc)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * mfcc.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')

    # Validation
    model.eval()
    val_running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for mfcc, label in val_loader:
            mfcc = mfcc.to(device)
            label = label.to(device)
            output = model(mfcc)
            loss = criterion(output, label)
            val_running_loss += loss.item() * mfcc.size(0)

            _, predicted = torch.max(output.data, 1)
            total += label.size(0)
            correct += (predicted == label).sum().item()

    val_loss = val_running_loss / len(val_dataset)
    val_acc = 100 * correct / total
    print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}%')


Epoch 1/100, Loss: 0.4512
Validation Loss: 0.1115, Validation Accuracy: 97.5633%
Epoch 2/100, Loss: 0.0486
Validation Loss: 0.0326, Validation Accuracy: 98.7816%
Epoch 3/100, Loss: 0.0146
Validation Loss: 0.0112, Validation Accuracy: 99.7188%
Epoch 4/100, Loss: 0.0070
Validation Loss: 0.0135, Validation Accuracy: 99.3440%
Epoch 5/100, Loss: 0.0088
Validation Loss: 0.0102, Validation Accuracy: 99.5314%
Epoch 6/100, Loss: 0.0059
Validation Loss: 0.0074, Validation Accuracy: 99.4377%
Epoch 7/100, Loss: 0.0046
Validation Loss: 0.0052, Validation Accuracy: 99.9063%
Epoch 8/100, Loss: 0.0052
Validation Loss: 0.0093, Validation Accuracy: 99.3440%
Epoch 9/100, Loss: 0.0050
Validation Loss: 0.0041, Validation Accuracy: 99.7188%
Epoch 10/100, Loss: 0.0044
Validation Loss: 0.0106, Validation Accuracy: 99.5314%
Epoch 11/100, Loss: 0.0060
Validation Loss: 0.0702, Validation Accuracy: 99.2502%
Epoch 12/100, Loss: 0.1374
Validation Loss: 0.5639, Validation Accuracy: 93.2521%
Epoch 13/100, Loss: 0.225

In [14]:
torch.save(model.state_dict(), 'audio_cnn_model.pth')


In [15]:
model = AudioCNN(num_labels=num_labels, input_height=n_mfcc, input_width=max_length)
model.load_state_dict(torch.load('audio_cnn_model.pth'))
model.to(device)
model.eval()


  model.load_state_dict(torch.load('audio_cnn_model.pth'))


AudioCNN(
  (conv1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (conv2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear): Linear(in_features=8960, out_features=8, bias=True)
)

## Inference



In [16]:
# Load inference features
val_features = np.load('val_features.npy', allow_pickle=True)

print(f"Number of inference samples: {len(val_features)}")
print(f"Shape of one feature: {val_features[0].shape}")

Number of inference samples: 2946
Shape of one feature: (40, 115)


In [17]:
# Load normalization parameters
mean = np.load('mean.npy')
std = np.load('std.npy')

def normalize(features, mean, std):
    return (features - mean) / std

# Normalize inference data
val_features_norm = [normalize(f, mean, std) for f in val_features]



In [18]:

class InferenceDataset(Dataset):
    def __init__(self, features):
        self.X = features

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        mfcc = self.X[idx]
        mfcc = torch.tensor(mfcc, dtype=torch.float32)
        mfcc = mfcc.unsqueeze(0)  # Shape: (1, n_mfcc, max_length)
        return mfcc


In [19]:
# Create inference dataset
inference_dataset = InferenceDataset(val_features_norm)

# Create dataloader
inference_loader = DataLoader(inference_dataset, batch_size=32, shuffle=False)

# Inference
model.eval()
all_predictions = []
with torch.no_grad():
    for mfcc in inference_loader:
        mfcc = mfcc.to(device)
        output = model(mfcc)
        _, predicted = torch.max(output.data, 1)
        all_predictions.extend(predicted.tolist())

print(all_predictions)

[6, 0, 7, 6, 3, 6, 3, 1, 2, 6, 1, 7, 0, 4, 0, 1, 7, 7, 6, 2, 1, 4, 7, 7, 7, 6, 5, 2, 1, 4, 0, 7, 2, 4, 2, 2, 6, 1, 2, 3, 5, 2, 7, 6, 2, 7, 3, 7, 2, 7, 7, 6, 2, 7, 2, 4, 4, 7, 6, 7, 7, 1, 7, 3, 3, 7, 4, 1, 7, 4, 6, 2, 1, 7, 7, 6, 7, 7, 1, 3, 7, 7, 2, 7, 6, 3, 0, 2, 7, 3, 6, 2, 6, 2, 2, 7, 7, 7, 2, 7, 6, 0, 7, 7, 2, 7, 7, 6, 7, 6, 7, 6, 3, 7, 4, 3, 6, 4, 5, 7, 7, 6, 1, 4, 7, 5, 6, 7, 5, 6, 4, 7, 7, 0, 7, 7, 7, 4, 0, 7, 5, 0, 7, 7, 5, 2, 0, 2, 2, 7, 0, 2, 4, 1, 4, 0, 7, 7, 7, 7, 2, 1, 7, 3, 1, 2, 6, 7, 7, 0, 6, 7, 3, 7, 7, 3, 6, 6, 1, 0, 7, 7, 1, 0, 4, 4, 4, 2, 2, 6, 6, 7, 6, 7, 7, 4, 6, 1, 0, 5, 6, 4, 7, 7, 4, 7, 2, 2, 0, 4, 4, 1, 4, 7, 6, 0, 1, 7, 6, 2, 2, 4, 7, 5, 7, 7, 6, 6, 3, 7, 2, 5, 2, 2, 1, 7, 4, 7, 1, 7, 1, 2, 3, 7, 6, 7, 4, 7, 3, 2, 0, 6, 0, 7, 6, 0, 7, 2, 7, 4, 2, 2, 7, 7, 6, 7, 6, 6, 4, 2, 2, 4, 6, 6, 0, 5, 7, 6, 6, 2, 6, 7, 7, 2, 7, 3, 3, 6, 3, 7, 7, 6, 7, 2, 7, 4, 6, 2, 2, 5, 5, 0, 2, 1, 6, 2, 2, 7, 0, 3, 5, 7, 7, 7, 6, 7, 7, 7, 7, 2, 3, 6, 1, 2, 6, 2, 3, 0, 4, 7, 7, 6, 1, 

In [20]:
metadata_df = pd.read_csv('../Train.csv')
unique_classes = metadata_df['class'].unique()
class_mapping = {class_name: idx for idx, class_name in enumerate(unique_classes)}

print(class_mapping)

{'left': 0, 'down': 1, 'go': 2, 'right': 3, 'up': 4, 'yes': 5, 'no': 6, 'stop': 7}


In [21]:
class_mapping_inverted = {v: k for k, v in class_mapping.items()}

In [22]:
predicted_class_names = [class_mapping_inverted[pred_id] for pred_id in all_predictions]


In [23]:
inference_metadata = pd.read_csv('../Test_1.csv')
ids = inference_metadata['id'].tolist()

In [24]:
submission_df = pd.DataFrame({
    'id': ids,
    'class': predicted_class_names
})


In [25]:
print(submission_df.head())


              id  class
0  id_u5iqtgjzhx     no
1  id_l7ebzcfk5e   left
2  id_jbzci8uepl   stop
3  id_jzil0fw5vs     no
4  id_o7mrvf5wj7  right


In [26]:
submission_df.to_csv('submission.csv', index=False)
