# Build the Speech Model

## Load Spectorgram Images into DataLoader for Training

In [16]:
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, models, transforms
import pandas as pd
import os

## Load Spectorgram Images Created

The tensors will be created from the images and labels from the name of the folder containing the images

In [2]:
data_path = './data/spectrograms' #looking in subfolder train

yes_no_dataset = datasets.ImageFolder(
    root=data_path,
    transform=transforms.Compose([transforms.Resize((201,81)),
                                  transforms.ToTensor()
                                  ])
)
print(yes_no_dataset)
print(yes_no_dataset[5][0].size())

Dataset ImageFolder
    Number of datapoints: 7985
    Root location: ./data/spectrograms
    StandardTransform
Transform: Compose(
               Resize(size=(201, 81), interpolation=PIL.Image.BILINEAR)
               ToTensor()
           )
torch.Size([3, 201, 81])


In [129]:
yes_no_dataset[0][1] # show label

0

In [130]:
yes_no_dataset[4000][1]

1

In [3]:
#split data to test and train
#use 80% to train
train_size = int(0.8 * len(yes_no_dataset))
test_size = len(yes_no_dataset) - train_size
yes_no_train_dataset, yes_no_test_dataset = torch.utils.data.random_split(yes_no_dataset, [train_size, test_size])

print(len(yes_no_train_dataset))
print(len(yes_no_test_dataset))

6388
1597


In [4]:
train_dataloader = torch.utils.data.DataLoader(
    yes_no_train_dataset,
    batch_size=5,
    num_workers=2,
    shuffle=True
)

test_dataloader = torch.utils.data.DataLoader(
    yes_no_test_dataset,
    batch_size=5,
    num_workers=2,
    shuffle=True
)

In [118]:
train_dataloader.dataset[0][0][0][0]

tensor([0.7451, 0.7569, 0.7725, 0.8235, 0.8549, 0.8275, 0.7804, 0.7882, 0.8471,
        0.8431, 0.6353, 0.7020, 0.8196, 0.8118, 0.7725, 0.6941, 0.7137, 0.7020,
        0.7137, 0.7529, 0.8000, 0.8314, 0.8275, 0.7882, 0.8275, 0.8627, 0.8510,
        0.7961, 0.7725, 0.8039, 0.8392, 0.8431, 0.8275, 0.8000, 0.7961, 0.7569,
        0.7765, 0.8353, 0.8471, 0.8314, 0.7608, 0.7882, 0.8000, 0.7922, 0.7686,
        0.6824, 0.6235, 0.7686, 0.7569, 0.7412, 0.8039, 0.8235, 0.8196, 0.7961,
        0.7686, 0.8000, 0.8000, 0.7059, 0.7333, 0.7765, 0.8000, 0.8000, 0.7843,
        0.7569, 0.7373, 0.7373, 0.7294, 0.7333, 0.7569, 0.7647, 0.7255, 0.6902,
        0.7255, 0.7255, 0.6902, 0.6392, 0.6824, 0.7608, 0.8078, 0.7961, 0.7137])

In [5]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))


Using cuda device


In [6]:
class CNNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=5)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(51136, 50)
        self.fc2 = nn.Linear(50, 2)


    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        #x = x.view(x.size(0), -1)
        x = self.flatten(x)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = F.relu(self.fc2(x))
        return F.log_softmax(x,dim=1)
    
model = CNNet().to(device)

print(model)

CNNet(
  (conv1): Conv2d(3, 32, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1))
  (conv2_drop): Dropout2d(p=0.5, inplace=False)
  (flatten): Flatten()
  (fc1): Linear(in_features=51136, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=2, bias=True)
)


In [7]:
# cost function used to determine best parameters
cost = torch.nn.CrossEntropyLoss()

# used to create optimal parameters
learning_rate = 0.0001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Create the training function

def train(dataloader, model, loss, optimizer):
    model.train()
    size = len(dataloader.dataset)
    for batch, (X, Y) in enumerate(dataloader):
        X, Y = X.to(device), Y.to(device)
        optimizer.zero_grad()
        pred = model(X)
        loss = cost(pred, Y)
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f'loss: {loss:>7f}  [{current:>5d}/{size:>5d}]')


# Create the validation/test function

def test(dataloader, model):
    size = len(dataloader.dataset)
    model.eval()
    test_loss, correct = 0, 0

    with torch.no_grad():
        for batch, (X, Y) in enumerate(dataloader):
            X, Y = X.to(device), Y.to(device)
            pred = model(X)

            test_loss += cost(pred, Y).item()
            correct += (pred.argmax(1)==Y).type(torch.float).sum().item()

    test_loss /= size
    correct /= size

    print(f'\nTest Error:\nacc: {(100*correct):>0.1f}%, avg loss: {test_loss:>8f}\n')

In [8]:
epochs = 5

for t in range(epochs):
    print(f'Epoch {t+1}\n-------------------------------')
    train(train_dataloader, model, cost, optimizer)
    test(test_dataloader, model)
print('Done!')

Epoch 1
-------------------------------
loss: 0.703531  [    0/ 6388]
loss: 0.687022  [  500/ 6388]
loss: 0.634257  [ 1000/ 6388]
loss: 0.601841  [ 1500/ 6388]
loss: 0.665305  [ 2000/ 6388]
loss: 0.646202  [ 2500/ 6388]
loss: 0.730650  [ 3000/ 6388]
loss: 0.578943  [ 3500/ 6388]
loss: 0.250668  [ 4000/ 6388]
loss: 0.528344  [ 4500/ 6388]
loss: 0.251262  [ 5000/ 6388]
loss: 0.130904  [ 5500/ 6388]
loss: 0.375727  [ 6000/ 6388]

Test Error:
acc: 86.9%, avg loss: 0.064851

Epoch 2
-------------------------------
loss: 0.228177  [    0/ 6388]
loss: 0.120126  [  500/ 6388]
loss: 0.538169  [ 1000/ 6388]
loss: 0.172016  [ 1500/ 6388]
loss: 0.355710  [ 2000/ 6388]
loss: 0.146667  [ 2500/ 6388]
loss: 0.472763  [ 3000/ 6388]
loss: 0.289497  [ 3500/ 6388]
loss: 0.220728  [ 4000/ 6388]
loss: 0.225991  [ 4500/ 6388]
loss: 0.086273  [ 5000/ 6388]
loss: 0.377300  [ 5500/ 6388]
loss: 0.103193  [ 6000/ 6388]

Test Error:
acc: 90.1%, avg loss: 0.049509

Epoch 3
-------------------------------
loss: 0.29

In [9]:
size = len(test_dataloader.dataset)
model.eval()
test_loss, correct = 0, 0

with torch.no_grad():
    for batch, (X, Y) in enumerate(test_dataloader):
        X, Y = X.to(device), Y.to(device)
        pred = model(X)
        print(f"predicted: {pred.argmax(1)}")
        print(f"actual: {Y}")
        test_loss += cost(pred, Y).item()
        correct += (pred.argmax(1)==Y).type(torch.float).sum().item()

predicted: tensor([1, 1, 1, 0, 1], device='cuda:0')
actual: tensor([1, 1, 1, 0, 1], device='cuda:0')
predicted: tensor([1, 0, 0, 0, 0], device='cuda:0')
actual: tensor([1, 0, 0, 0, 0], device='cuda:0')
predicted: tensor([0, 0, 1, 0, 1], device='cuda:0')
actual: tensor([0, 0, 1, 0, 1], device='cuda:0')
predicted: tensor([0, 1, 0, 0, 0], device='cuda:0')
actual: tensor([0, 1, 0, 0, 0], device='cuda:0')
predicted: tensor([1, 1, 0, 1, 0], device='cuda:0')
actual: tensor([1, 1, 0, 1, 0], device='cuda:0')
predicted: tensor([0, 1, 0, 1, 0], device='cuda:0')
actual: tensor([0, 1, 0, 1, 1], device='cuda:0')
predicted: tensor([1, 1, 0, 0, 0], device='cuda:0')
actual: tensor([0, 1, 0, 0, 0], device='cuda:0')
predicted: tensor([0, 1, 1, 0, 1], device='cuda:0')
actual: tensor([1, 1, 1, 0, 1], device='cuda:0')
predicted: tensor([0, 1, 0, 0, 0], device='cuda:0')
actual: tensor([0, 0, 0, 0, 0], device='cuda:0')
predicted: tensor([0, 0, 1, 1, 1], device='cuda:0')
actual: tensor([0, 0, 1, 1, 1], device=

actual: tensor([1, 1, 0, 1, 0], device='cuda:0')
predicted: tensor([0, 1, 0, 0, 1], device='cuda:0')
actual: tensor([0, 1, 0, 0, 1], device='cuda:0')
predicted: tensor([1, 0, 0, 0, 0], device='cuda:0')
actual: tensor([1, 0, 0, 0, 0], device='cuda:0')
predicted: tensor([0, 1, 1, 1, 1], device='cuda:0')
actual: tensor([0, 1, 1, 1, 1], device='cuda:0')
predicted: tensor([0, 1, 0, 0, 0], device='cuda:0')
actual: tensor([0, 1, 0, 0, 0], device='cuda:0')
predicted: tensor([0, 1, 1, 1, 0], device='cuda:0')
actual: tensor([0, 1, 1, 1, 0], device='cuda:0')
predicted: tensor([1, 1, 0, 0, 0], device='cuda:0')
actual: tensor([1, 1, 0, 0, 0], device='cuda:0')
predicted: tensor([0, 0, 0, 0, 1], device='cuda:0')
actual: tensor([0, 0, 0, 0, 1], device='cuda:0')
predicted: tensor([1, 1, 0, 0, 1], device='cuda:0')
actual: tensor([1, 1, 0, 1, 1], device='cuda:0')
predicted: tensor([0, 1, 0, 0, 1], device='cuda:0')
actual: tensor([0, 1, 0, 0, 1], device='cuda:0')
predicted: tensor([1, 0, 0, 0, 1], device=

actual: tensor([0, 1, 1, 1, 1], device='cuda:0')
predicted: tensor([1, 1, 1, 0, 0], device='cuda:0')
actual: tensor([1, 1, 1, 0, 0], device='cuda:0')
predicted: tensor([1, 0, 1, 0, 1], device='cuda:0')
actual: tensor([1, 0, 1, 0, 1], device='cuda:0')
predicted: tensor([0, 0, 1, 0, 0], device='cuda:0')
actual: tensor([0, 0, 1, 1, 1], device='cuda:0')
predicted: tensor([1, 1, 0, 1, 0], device='cuda:0')
actual: tensor([1, 0, 0, 1, 0], device='cuda:0')
predicted: tensor([0, 1, 0, 0, 1], device='cuda:0')
actual: tensor([0, 1, 0, 0, 1], device='cuda:0')
predicted: tensor([1, 1, 1, 1, 1], device='cuda:0')
actual: tensor([1, 1, 1, 1, 1], device='cuda:0')
predicted: tensor([1, 1, 0, 0, 0], device='cuda:0')
actual: tensor([1, 1, 0, 0, 0], device='cuda:0')
predicted: tensor([0, 1, 1, 1, 1], device='cuda:0')
actual: tensor([0, 1, 1, 1, 1], device='cuda:0')
predicted: tensor([0, 0, 1, 1, 0], device='cuda:0')
actual: tensor([0, 0, 1, 1, 0], device='cuda:0')
predicted: tensor([1, 1, 0, 1, 1], device=

predicted: tensor([0, 1, 1, 0, 1], device='cuda:0')
actual: tensor([0, 1, 1, 0, 1], device='cuda:0')
predicted: tensor([1, 1, 1, 1, 0], device='cuda:0')
actual: tensor([1, 1, 1, 1, 0], device='cuda:0')
predicted: tensor([0, 0, 1, 1, 1], device='cuda:0')
actual: tensor([0, 0, 1, 1, 1], device='cuda:0')
predicted: tensor([1, 0, 1, 1, 1], device='cuda:0')
actual: tensor([1, 0, 1, 1, 1], device='cuda:0')
predicted: tensor([1, 1, 1, 1, 1], device='cuda:0')
actual: tensor([1, 1, 1, 1, 1], device='cuda:0')
predicted: tensor([0, 0, 0, 0, 0], device='cuda:0')
actual: tensor([0, 0, 0, 0, 1], device='cuda:0')
predicted: tensor([1, 1, 1, 0, 0], device='cuda:0')
actual: tensor([1, 0, 1, 0, 0], device='cuda:0')
predicted: tensor([0, 0, 0, 0, 0], device='cuda:0')
actual: tensor([0, 0, 0, 0, 0], device='cuda:0')
predicted: tensor([0, 1, 0, 0, 0], device='cuda:0')
actual: tensor([0, 1, 0, 0, 0], device='cuda:0')
predicted: tensor([0, 1, 0, 1, 1], device='cuda:0')
actual: tensor([0, 1, 0, 1, 1], device=

## Test your own voice!

In [260]:
def create_image(waveform, filename, my_audio_dir):
    #make directory
    os.makedirs(f'{my_audio_dir}//spectrograms//test//', mode=0o777, exist_ok=True)

    spectrogram_tensor = torchaudio.transforms.Spectrogram()(waveform)
    
    img_path = f'{my_audio_dir}//spectrograms//test//{filename}.png'

    fig = plt.figure()
    plt.imsave(img_path, spectrogram_tensor.log2()[0,:,:].numpy(), cmap='gray')
    return img_path


In [261]:
def load_image(data_path):
    return datasets.ImageFolder(
                    root=data_path,
                    transform=transforms.Compose([transforms.Resize((201,81)),
                                                  transforms.ToTensor()
                                                  ])
                    )

In [279]:
def predict(tensor):
    # Use the model to predict the label of the waveform
    tensor = tensor.to(device)
    #tensor = transform(tensor)
    tensor = model(tensor.unsqueeze(0))
    tensor = get_likely_index(tensor)
    tensor = index_to_label(tensor.squeeze())
    return tensor

In [280]:
def get_likely_index(tensor):
    # find most likely label index for each element in the batch
    return tensor.argmax(dim=-1)

In [281]:
def index_to_label(tensor):
    if tensor == 1:
        return 'yes'
    return 'no'

In [307]:
#filename = 'yes_1.wav'
#filename = 'no_1.wav'

filename = 'yes_2.wav'
#filename = 'no_2.wav'

## load audio file
myaudio_dir = './/data//myaudio'
waveform, sample_rate = torchaudio.load(f'{myaudio_dir}//{filename}')

#create image and save to spectorgrams folder
image_path = create_image(waveform, filename, myaudio_dir)

print(image_path)

.//data//myaudio//spectrograms//test//yes_2.wav.png


<Figure size 432x288 with 0 Axes>

In [310]:
image_path = f'{myaudio_dir}//spectrograms//'
print(image_path)
spectrogram_image = load_image(image_path)

.//data//myaudio//spectrograms//


In [311]:
test_image = spectrogram_image[0][0]
print(test_image)

tensor([[[0.7569, 0.8118, 0.8706,  ..., 0.8275, 0.8353, 0.7882],
         [0.7490, 0.8196, 0.8549,  ..., 0.8157, 0.8196, 0.7843],
         [0.7137, 0.7569, 0.7490,  ..., 0.7333, 0.7333, 0.7490],
         ...,
         [0.4078, 0.3725, 0.4000,  ..., 0.3804, 0.4118, 0.4196],
         [0.4039, 0.3961, 0.3608,  ..., 0.3882, 0.4078, 0.4039],
         [0.3137, 0.3686, 0.3412,  ..., 0.3961, 0.4118, 0.4196]],

        [[0.7569, 0.8118, 0.8706,  ..., 0.8275, 0.8353, 0.7882],
         [0.7490, 0.8196, 0.8549,  ..., 0.8157, 0.8196, 0.7843],
         [0.7137, 0.7569, 0.7490,  ..., 0.7333, 0.7333, 0.7490],
         ...,
         [0.4078, 0.3725, 0.4000,  ..., 0.3804, 0.4118, 0.4196],
         [0.4039, 0.3961, 0.3608,  ..., 0.3882, 0.4078, 0.4039],
         [0.3137, 0.3686, 0.3412,  ..., 0.3961, 0.4118, 0.4196]],

        [[0.7569, 0.8118, 0.8706,  ..., 0.8275, 0.8353, 0.7882],
         [0.7490, 0.8196, 0.8549,  ..., 0.8157, 0.8196, 0.7843],
         [0.7137, 0.7569, 0.7490,  ..., 0.7333, 0.7333, 0.

In [312]:
print(f"Predicted: {predict(test_image)} Actual: {filename}")

Predicted: yes Actual: yes_2.wav
