In [None]:
import os, warnings
import numpy as np
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")
import torch
import torch.nn as nn
from torch.utils.data import DataLoader,random_split,Dataset
import torchaudio
from torchaudio import transforms
from torch import Tensor
from torchaudio.datasets.utils import (
    download_url,
    extract_archive,
    walk_files
)

from train_utils import *
from model import *
from dataloader import *

In [None]:
# Directory that holds the extracted Speech Commands v0.02 files.
train_audio_path = './data1/SpeechCommands/speech_commands_v0.02/'

# Fetch/extract the dataset BEFORE touching train_audio_path: on a fresh
# checkout that directory only exists once the download has completed, so
# listing it first would fail on a clean Restart & Run All.
a = torchaudio.datasets.SPEECHCOMMANDS('./data1/' , url = 'speech_commands_v0.02',
                                       folder_in_archive= 'SpeechCommands', download = True)

# Raw directory entries of the extracted archive. This name is rebuilt from
# the actually loaded labels further down in the notebook.
labels_dict=os.listdir(train_audio_path)

In [None]:
# Load a single example clip straight from disk and inspect it.
filename = "./data1/SpeechCommands/speech_commands_v0.02/backward/0165e0e8_nohash_0.wav"
waveform, sample_rate = torchaudio.load(filename)

# Report the tensor size and the sample rate returned by the loader.
for template, value in (
    ("Shape of waveform: {}", waveform.size()),
    ("Sample rate of waveform: {}", sample_rate),
):
    print(template.format(value))

# Transpose before plotting so the first axis becomes the x-axis.
waveform_samples = waveform.t().numpy()
plt.figure()
plt.plot(waveform_samples)

In [None]:
# Plot the waveform (element 0) of the first dataset item.
first_item = a[0]
plt.plot(first_item[0].t())
plt.show()

In [None]:
# Keep only the clips whose waveform tensor has shape (1, 16000), together
# with their labels, so every retained sample has the same size.
count=0  # not read in the visible cells; kept so this cell defines the same names
wave = []
labels = []
# Iterate over the real dataset length instead of a hard-coded 105829, so the
# cell neither overruns a smaller dataset nor silently skips extra items.
for i in range(len(a)):
    # Fetch the item once; indexing `a` repeatedly presumably re-reads the
    # audio file each time (torchaudio dataset behaviour — confirm).
    item = a[i]
    if item[0].shape == (1,16000):
        wave.append(item[0])    # element 0: waveform tensor
        labels.append(item[2])  # element 2: label

In [None]:
# Run the default MFCC transform on the first kept clip and show the result.
mfcc_transform = torchaudio.transforms.MFCC()
specgram = mfcc_transform(wave[0])

print("Shape of spectrogram: {}".format(specgram.size()))

# Display the first entry along the leading dimension as an image.
first_channel = specgram[0].numpy()
plt.figure(figsize=(10, 5))
plt.imshow(first_channel)
plt.colorbar()
plt.show()

In [None]:
# Compute both feature types for the same clip and show them side by side.
first_clip = wave[0]
specgram = torchaudio.transforms.MelSpectrogram()(first_clip)
mfcc = torchaudio.transforms.MFCC()(first_clip)

fig, ax = plt.subplots(nrows=1, ncols=2)
mel_axis, mfcc_axis = ax

# Left panel: Mel spectrogram; right panel: MFCC.
mel_axis.imshow(specgram[0].numpy())
mfcc_axis.imshow(mfcc[0].numpy())

In [None]:
# Feature/model switch:
#   1 -> MFCC features fed to NN2D
#   2 -> Mel spectrogram features fed to NN2DMEL
#   anything else -> no transform, NN model
data_transform = 0

if data_transform not in (1, 2):
    # No feature extraction is applied in this configuration.
    train_audio_transforms = None
    net = NN(num_class=35)
elif data_transform == 1:
    print("MFCC Features classification")
    mfcc_stage = torchaudio.transforms.MFCC(log_mels=False)
    train_audio_transforms = nn.Sequential(mfcc_stage)
    net = NN2D(num_class=35)
else:
    print("Mel Spectogram Features classification")
    mel_stage = torchaudio.transforms.MelSpectrogram()
    train_audio_transforms = nn.Sequential(mel_stage)
    net = NN2DMEL(num_class=35)

In [None]:
# Unique labels present in the filtered data, in a deterministic order.
# A plain list(set(...)) of strings can come out in a different order on every
# kernel start (string hashing is randomized per process), which would change
# each label's position between sessions. Sorting pins the order.
# NOTE(review): presumably SpeechDataLoader maps a label to its position in
# this list — confirm against dataloader.py.
labels_dict=sorted(set(labels))

In [None]:
# Wrap the filtered clips and labels (plus the optional feature transform)
# in the project dataset class.
dataset = SpeechDataLoader(wave, labels, labels_dict, train_audio_transforms)

# 80 % / 20 % random split into training and test subsets.
total_items = len(dataset)
split_lengths = [round(total_items * .8), round(total_items * .2)]
traindata, testdata = random_split(dataset, split_lengths)

# Both loaders use the same settings: batches of 100, reshuffled.
loader_options = dict(batch_size=100, shuffle=True)
trainloader = torch.utils.data.DataLoader(traindata, **loader_options)
testloader = torch.utils.data.DataLoader(testdata, **loader_options)

In [None]:
# Number of training epochs. It is defined in this cell because the
# OneCycleLR schedule below needs it; previously it was first assigned in the
# training-loop cell, after this cell had already read it. Keep the two
# values identical, otherwise the schedule length and the loop disagree.
num_epochs = 20

# The notebook was written for GPU index 9. Use it when the machine actually
# has that many GPUs, otherwise fall back to the first GPU, or to the CPU
# when CUDA is unavailable.
preferred_gpu = 9
if not torch.cuda.is_available():
    device = torch.device('cpu')
elif torch.cuda.device_count() > preferred_gpu:
    device = torch.device('cuda:{}'.format(preferred_gpu))
else:
    device = torch.device('cuda:0')

net = net.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(),lr=0.001)
# One-cycle schedule with linear annealing, sized for
# len(trainloader) * num_epochs steps in total.
# NOTE(review): this assumes train() steps the scheduler once per batch —
# confirm in train_utils.
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
                                              steps_per_epoch=len(trainloader),
                                              epochs=num_epochs,
                                              anneal_strategy='linear')


In [None]:
# Total number of passes over the training data.
num_epochs = 20

# Alternate one training pass and one evaluation pass per epoch.
epoch_indices = range(num_epochs)
for epoch in epoch_indices:
    train(net, trainloader, optimizer, scheduler, criterion, epoch, device)
    # Holds the value returned by the most recent test() call.
    # NOTE(review): presumably an accuracy figure — confirm in train_utils.
    best_acc = test(net, testloader, optimizer, criterion, epoch, device)