In [None]:
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from torchaudio import transforms, datasets
import matplotlib.pyplot as plt


In [None]:
# Load the three Speech Commands subsets, downloading the archive into /content if needed.
train_data = datasets.SPEECHCOMMANDS('/content', subset='training', download=True)
test_data = datasets.SPEECHCOMMANDS('/content', subset='testing', download=True)
valid_data = datasets.SPEECHCOMMANDS('/content', subset='validation', download=True)

In [None]:
# Collect the distinct command names found in the training split (element 2 of each sample).
# NOTE: this iterates over every training sample just to read its label, so expect it to be slow.
# The names are sorted so that the class order -- and therefore every label -> index
# assignment derived from it -- is identical on every run; a bare set of strings has no
# stable iteration order across interpreter sessions.
label = sorted({sample[2] for sample in train_data})
label

In [None]:
# Map every class name to its position in `label`; these positions serve as integer targets.
label_to_index = {name: idx for idx, name in enumerate(label)}
label_to_index

In [None]:
from torchaudio import transforms

# Waveform -> mel spectrogram front end: 16 kHz sample rate, 64 mel bands,
# all other MelSpectrogram settings left at the library defaults.
transform = transforms.MelSpectrogram(n_mels=64, sample_rate=16000)

In [None]:
import torch.nn.functional as F
max_len = 100


def collate_fn(batch):
    """Turn a list of dataset samples into one fixed-size batch.

    Every sample is unpacked as ``(waveform, sample_rate, label, ...)``. The
    waveform goes through the module-level ``transform``, ``squeeze(0)`` is
    applied to the result, and its second axis is cropped or right-padded with
    zeros to exactly ``max_len`` columns.

    Args:
        batch: Iterable of dataset samples as described above.

    Returns:
        tuple: ``(spectrograms, targets)`` -- the stacked spectrograms with a
        singleton dimension inserted at position 1, and a tensor holding the
        ``label_to_index`` value of each sample's label.
    """
    features, class_ids = [], []
    for sample in batch:
        waveform, _rate, label, *_rest = sample
        mel = transform(waveform).squeeze(0)

        # Crop to at most max_len columns, then zero-pad whatever is still missing.
        mel = mel[:, :max_len]
        missing = max_len - mel.shape[1]
        if missing > 0:
            mel = F.pad(mel, (0, missing))

        features.append(mel)
        class_ids.append(label_to_index[label])

    spectrograms = torch.stack(features).unsqueeze(1)
    targets = torch.tensor(class_ids)
    return spectrograms, targets

In [None]:
# Number of distinct classes; the same value is later passed to the model as num_classes.
len(label)

In [None]:
# Batches of 64 built by collate_fn; only the training loader shuffles its samples.
train = DataLoader(dataset=train_data, batch_size=64, shuffle=True, collate_fn=collate_fn)
test = DataLoader(dataset=test_data, batch_size=64, shuffle=False, collate_fn=collate_fn)

In [None]:
class CheckAudio(nn.Module):
    """Small CNN classifier over single-channel 2-D inputs (e.g. mel spectrograms).

    Three ``Conv2d(3x3, padding=1) -> ReLU -> MaxPool2d(2)`` stages with 16, 32
    and 64 output channels are followed by a two-layer fully connected head.

    Args:
        num_classes: Size of the output layer (one logit per class).
        n_mels: Height of the input, i.e. size of dimension 2. Defaults to 64.
        n_frames: Width of the input, i.e. size of dimension 3. Defaults to 100.

    Raises:
        ValueError: If ``n_mels`` or ``n_frames`` is smaller than 8, because the
            three pooling stages would leave no features to classify.

    The defaults reproduce the previously hard-coded ``64 * 8 * 12`` flatten
    size, so existing call sites and saved state dicts keep working.
    """

    def __init__(self, num_classes, n_mels=64, n_frames=100):
        super().__init__()
        self.first = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )

        # The padded 3x3 convolutions keep the spatial size; each MaxPool2d(2)
        # halves it with flooring, so three of them amount to floor division by 8.
        pooled_height = n_mels // 8
        pooled_width = n_frames // 8
        if pooled_height < 1 or pooled_width < 1:
            raise ValueError(
                f"n_mels and n_frames must both be >= 8, got {n_mels} and {n_frames}"
            )

        self.second = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * pooled_height * pooled_width, 512),
            nn.ReLU(),
            nn.Linear(512, num_classes),
        )

    def forward(self, x):
        """Map a ``(batch, 1, n_mels, n_frames)`` tensor to ``(batch, num_classes)`` logits."""
        x = self.first(x)
        x = self.second(x)
        return x

In [None]:
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

In [None]:
# Report whether PyTorch can use CUDA in this runtime.
print(torch.cuda.is_available())

In [None]:
# One output logit per entry in `label`; the network is moved to the selected device.
model = CheckAudio(len(label)).to(device=device)

In [None]:
# Adam over all model parameters with a 1e-3 learning rate, and cross-entropy
# as the classification objective.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

In [None]:
# Train for two passes over the training loader. After each epoch the mean
# per-batch loss is printed: a raw sum grows with the number of batches and
# cannot be compared across batch sizes or dataset sizes.
for epoch in range(2):
    model.train()  # switch to training mode at the start of every epoch
    total_loss = 0.0
    num_batches = 0
    for x_batch, y_batch in train:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        y_pred = model(x_batch)
        loss = loss_fn(y_pred, y_batch)

        # Clear stale gradients, backpropagate, then update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    mean_loss = total_loss / max(num_batches, 1)  # guard against an empty loader
    print(f"Эпоха {epoch + 1}, Потери: {mean_loss:.4f}")

In [None]:
# Top-1 accuracy over the test loader, computed in eval mode with gradients disabled.
model.eval()
correct, total = 0, 0

with torch.no_grad():
    for x_batch, y_batch in test:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        y_pred = model(x_batch)
        # The predicted class is the index of the largest logit in each row.
        predicted = y_pred.argmax(dim=1)
        correct += predicted.eq(y_batch).sum().item()
        total += y_batch.size(0)

accuracy = 100 * correct / total
print(f"Точность модели на тестовых данных: {accuracy:.2f}%")

In [None]:
# Write only the model's state dict (not the module object) to ModelAudio.pth.
torch.save(model.state_dict(), "ModelAudio.pth")



In [None]:
# Save the class-name list next to the weights; its order defines the class indices
# used for training, so presumably it is needed to decode predictions later.
torch.save(label, "label.pth")