# Creating dataset

В данном разделе написан код, который создает датасет из спектограмм

In [3]:
import os
import numpy as np
import matplotlib.pyplot as plt
import librosa.display
from tqdm import tqdm
import gc
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas

    y, sr = librosa.load(wav_file)
    window = np.hanning(window_size)
    stft  = librosa.core.spectrum.stft(y, n_fft=window_size, hop_length=512, window=window)
    out = 2 * np.abs(stft) / np.sum(window)
    canvas = FigureCanvas(fig)
    ax = fig.add_subplot(111)
    ax.axis('off')
    p = librosa.display.specshow(librosa.amplitude_to_db(out, ref=np.max), ax=ax)
    temp = wav_file.split('/')[-1]
    temp = './' + temp.split('.')[0]+'.png'
    
    fig.savefig(temp
                ,pad_inches = 0)

In [5]:
def create_img_data(audio_folder:str,img_folder:str,window_size=1024,hop_length=512):
    '''
    данная функция генерирует по датасету аудиофайлов датасет спектрограмм(изображений)
    
    ==Input==
    audio_folder - путь к директории с датасетом аудиофайлов
    img_folder - путь к директории куда складывать изображения
    window_size - по сути размер скользящего окна
    '''
    if not os.path.exists(img_folder):
        os.makedirs(img_folder)
    # цикл по подпапкам и файлам
    for subdir, dirs, files in os.walk(audio_folder):
        print(subdir)
        for file in tqdm(files):
            filepath = subdir + os.sep + file
            if filepath.endswith(".wav"):
                fig = plt.Figure(frameon=False)
                y, sr = librosa.load(filepath)
                window = np.hanning(window_size) # построение скользящего окна для сглаживания
                stft  = librosa.core.spectrum.stft(y,   # разложение в Фурье
                                                   n_fft=window_size, 
                                                   hop_length=hop_length, 
                                                   window=window)
                out = 2 * np.abs(stft) / np.sum(window)
                canvas = FigureCanvas(fig)
                ax = fig.add_subplot(111)
                ax.axis('off')
                p = librosa.display.specshow(librosa.amplitude_to_db(out,  # построение спектрограммы
                                                                     ref=np.max),
                                             ax=ax)
                path=os.path.join(img_folder,subdir.split('/')[2])
                if not os.path.exists(path):
                    os.makedirs(path)
                f_name=os.path.join(path,file.split('.')[0]+'.png')   
                fig.savefig(f_name
                            ,pad_inches = 0)
                # закрытие и удаление объектов "график" чтобы не переполнялась память
                fig.clear()
                plt.close(fig)
            gc.collect()
        gc.collect()

In [7]:
%%time
create_img_data('archive/segregated-urban8K-sounds','./img')

archive/segregated-urban8K-sounds


0it [00:00, ?it/s]


archive/segregated-urban8K-sounds/street_music


100%|████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [07:28<00:00,  2.23it/s]


archive/segregated-urban8K-sounds/air_conditioner


100%|████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [07:25<00:00,  2.25it/s]


archive/segregated-urban8K-sounds/jackhammer


100%|████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [07:40<00:00,  2.17it/s]


archive/segregated-urban8K-sounds/siren


100%|██████████████████████████████████████████████████████████████████████████████████████| 929/929 [06:46<00:00,  2.28it/s]


archive/segregated-urban8K-sounds/engine_idling


100%|████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [07:40<00:00,  2.17it/s]


archive/segregated-urban8K-sounds/gun_shot


100%|██████████████████████████████████████████████████████████████████████████████████████| 374/374 [01:52<00:00,  3.31it/s]


archive/segregated-urban8K-sounds/drilling


100%|████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [07:22<00:00,  2.26it/s]


archive/segregated-urban8K-sounds/car_horn


100%|██████████████████████████████████████████████████████████████████████████████████████| 429/429 [02:27<00:00,  2.90it/s]


archive/segregated-urban8K-sounds/dog_bark


100%|████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [06:39<00:00,  2.50it/s]


archive/segregated-urban8K-sounds/children_playing


100%|████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [07:26<00:00,  2.24it/s]

CPU times: user 1h 2min 36s, sys: 12 s, total: 1h 2min 48s
Wall time: 1h 2min 52s





# Spliting Dataset to Train and Test

In [17]:
import os
import shutil
import numpy as np

In [21]:
def split_train_test(img_dir:str, dataset_dir:str,test_ratio = 0.20 ):
    classes = os.listdir(img_dir)
    for i in classes:
        if not os.path.exists(dataset_dir):
            os.makedirs(dataset_dir)
        if not os.path.exists(dataset_dir +'/train/' + i):
            os.makedirs(dataset_dir +'/train/' + i)
        if not os.path.exists(dataset_dir +'/val/' + i):
            os.makedirs(dataset_dir +'/val/' + i)
        source = img_dir + '/' + i
        allFileNames = os.listdir(source)
        np.random.shuffle(allFileNames)
        
        train_FileNames, test_FileNames = np.split(np.array(allFileNames),
                                                              [int(len(allFileNames)* (1 - test_ratio))])
        train_FileNames = [source+'/'+ name for name in train_FileNames.tolist()]
        test_FileNames = [source+'/' + name for name in test_FileNames.tolist()]
        for name in train_FileNames:
            shutil.copy(name, dataset_dir +'/train/' + i)
        for name in test_FileNames:
            shutil.copy(name, dataset_dir +'/val/' + i)

In [23]:
%%time
split_train_test('./img','img_dataset')

CPU times: user 235 ms, sys: 1.06 s, total: 1.3 s
Wall time: 1.44 s


# Creating classificator

In [1]:
import os
import cv2
import time
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader, Dataset
from torch.utils.data import RandomSampler

import torchvision.transforms as T
import torchvision.models as models
from torchvision.utils import make_grid
from torchvision.datasets import ImageFolder

from matplotlib import pyplot as plt

In [2]:
DIR_TRAIN = "./img_dataset/train"
DIR_VALID = "./img_dataset/val"

In [3]:
classes = os.listdir(DIR_TRAIN)
print("Total Classes: ",len(classes))

Total Classes:  10


In [4]:
classes

['street_music',
 'air_conditioner',
 'jackhammer',
 'siren',
 'engine_idling',
 'gun_shot',
 'drilling',
 'car_horn',
 'dog_bark',
 'children_playing']

In [5]:
train_count = 0
valid_count = 0

for _class in classes:
    train_count += len(os.listdir(DIR_TRAIN +'/'+ _class))
    valid_count += len(os.listdir(DIR_VALID +'/'+ _class))
print("Total train images: ",train_count)
print("Total valid images: ",valid_count)

Total train images:  6985
Total valid images:  1747


In [6]:
# Создаем списки со всеми изображениями для тренировки и для валидации
#
#
train_imgs = []
valid_imgs = []
for _class in classes:
    
    for img in os.listdir(DIR_TRAIN +'/'+ _class):
        train_imgs.append(DIR_TRAIN +'/'+ _class + "/" + img)
    
    for img in os.listdir(DIR_VALID +'/'+ _class):
        valid_imgs.append(DIR_VALID +'/'+ _class + "/" + img)
        

class_to_int = {classes[i] : i for i in range(len(classes))}

In [7]:
class_to_int

{'street_music': 0,
 'air_conditioner': 1,
 'jackhammer': 2,
 'siren': 3,
 'engine_idling': 4,
 'gun_shot': 5,
 'drilling': 6,
 'car_horn': 7,
 'dog_bark': 8,
 'children_playing': 9}

There are multiple ways to load images from the dataset, I have used 2 such methods:


Using Dataset Class

In [8]:
# Создаем класс датасета. Этот объект будет подавать изображения и метки классов в нейронку пачками
# Пачками потому что нельзя все карьтнки разом загрузить в память и там работать.


def get_transform():
    return T.Compose([T.ToTensor()])

class CustomDataset(Dataset):
    
    def __init__(self, imgs_list, class_to_int, transforms = None):
        
        super().__init__()
        self.imgs_list = imgs_list
        self.class_to_int = class_to_int
        self.transforms = transforms
        
        
    def __getitem__(self, index):
    
        image_path = self.imgs_list[index]
        
        #Reading image
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        image /= 255.0
        
        #Retriving class label
        label = image_path.split("/")[-2]
        label = self.class_to_int[label]
        
        #Applying transforms on image
        if self.transforms:
            image = self.transforms(image)
        
        return image, label
        
        
        
    def __len__(self):
        return len(self.imgs_list)

In [9]:
# создаем экземпляры нашего класса датасет
train_dataset = CustomDataset(train_imgs, class_to_int, get_transform())
valid_dataset = CustomDataset(valid_imgs, class_to_int, get_transform())

In [10]:
# создаем Сэмплер. по сути он просто будет перемешивать идущие на вход картинки
# чтобы при каждом цикле все шло в разном порядке. 
train_random_sampler = RandomSampler(train_dataset)
valid_random_sampler = RandomSampler(valid_dataset)

In [11]:
# Создаем экземпляры Загрузчика данных

train_data_loader = DataLoader(
    dataset = train_dataset,
    batch_size = 4,
    sampler = train_random_sampler,
    num_workers = 2,
)

valid_data_loader = DataLoader(
    dataset = valid_dataset,
    batch_size = 4,
    sampler = valid_random_sampler,
    num_workers = 2,
)

In [None]:
# Просто показать картинки , ничего особенного
for images, labels in train_data_loader:
    fig, ax = plt.subplots(figsize = (10, 10))
    ax.set_xticks([])
    ax.set_yticks([])
    ax.imshow(make_grid(images, 4).permute(1,2,0))
    break

mobilenet_v3_large best loss 0.19127139026714768 best acc 100.0 % 

In [14]:
def return_model(num_classes):
    '''
    Эта функция возвращает экземпляр модели, построенной под наше количество классов.
    Мы берем предобученную сеть, например mobilenet_v3_large и меняем ей "голову"
    '''
    model = models.mobilenet_v3_large(pretrained = True)
    
    #print(model)
    
    last_layer_input_features = model.classifier[-1].in_features
    
    model.classifier[-1] = nn.Linear(last_layer_input_features, num_classes, bias = True)
    
    #print(model)
    
    return model

In [15]:
model=return_model(len(classes)) # создаем нашу модель

In [16]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
torch.cuda.empty_cache()

model.to(device)

### Нюансы обучения
#optimizer = torch.optim.Adam(model.classifier.parameters(), lr = 0.0001)
optimizer = torch.optim.Adam(model.classifier.parameters(), lr = 0.0001) # метод оптимизации
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = 5, gamma = 0.75)
criterion = nn.CrossEntropyLoss() # функция потерь

train_loss = []
train_accuracy = []

val_loss = []
val_accuracy = []


In [17]:
def calc_accuracy(true,pred):
    pred = F.softmax(pred, dim = 1)
    true = torch.zeros(pred.shape[0], pred.shape[1]).scatter_(1, true.unsqueeze(1), 1.)
    acc = (true.argmax(-1) == pred.argmax(-1)).float().detach().numpy()
    acc = float((100 * acc.sum()) / len(acc))
    return round(acc, 4)

In [18]:
import copy
import math

In [None]:
%%time
### Training Code
best_model_wts = copy.deepcopy(model.state_dict())
PATH = './best_result_10classes.pt'
best_loss = math.inf
train = True
earley_stoping = 10
earley_stoping_counter = 0
epoch=0
while train:
    
    start = time.time()
    
    #Epoch Loss & Accuracy
    train_epoch_loss = []
    train_epoch_accuracy = []
    
    #Val Loss & Accuracy
    val_epoch_loss = []
    val_epoch_accuracy = []
    
    # Training
    for images, labels in train_data_loader:
        
        images = images.to(device)
        labels = labels.to(device)
        
        #Reset Grads
        optimizer.zero_grad()
        
        #Forward ->
        preds = model(images)
        #Calculate Accuracy
        acc = calc_accuracy(labels.cpu(), preds.cpu())
        
        #Calculate Loss & Backward, Update Weights (Step)
        loss = criterion(preds, labels)
        loss.backward()
        optimizer.step()
        
        #Append loss & acc
        loss_value = loss.item()
        train_epoch_loss.append(loss_value)
        train_epoch_accuracy.append(acc)
    
    #Validation
    for images, labels in valid_data_loader:
        
        images = images.to(device)
        labels = labels.to(device)
        
        #Forward ->
        preds = model(images)
        
        #Calculate Accuracy
        acc = calc_accuracy(labels.cpu(), preds.cpu())
        
        #Calculate Loss
        loss = criterion(preds, labels)
        
        #Append loss & acc
        loss_value = loss.item()
        val_epoch_loss.append(loss_value)
        val_epoch_accuracy.append(acc)
    
    
    train_epoch_loss = np.mean(train_epoch_loss)
    train_epoch_accuracy = np.mean(train_epoch_accuracy)
    
    val_epoch_loss = np.mean(val_epoch_loss)
    val_epoch_accuracy = np.mean(val_epoch_accuracy)
    
    end = time.time()
    
    train_loss.append(train_epoch_loss)
    train_accuracy.append(train_epoch_accuracy)
    
    val_loss.append(val_epoch_loss)
    val_accuracy.append(val_epoch_accuracy)
    
    #Print Epoch Statistics
    print("** Epoch {} ** - Epoch Time {}".format(epoch, int(end-start)))
    print("Train Loss = {}".format(round(train_epoch_loss, 4)))
    print("Train Accuracy = {} % \n".format(train_epoch_accuracy))
    print("Val Loss = {}".format(round(val_epoch_loss, 4)))
    print("Val Accuracy = {} % \n".format(val_epoch_accuracy))
    epoch+=1
    if val_epoch_loss< best_loss:
        print(f'loss descrease {val_epoch_loss}< {best_loss}')
        best_loss = val_epoch_loss
        best_model_wts = copy.deepcopy(model.state_dict())
        torch.save(model.state_dict(), PATH)
        earley_stoping_counter = 0
    else:
        earley_stoping_counter+=1
        print(f'{earley_stoping_counter} epochs without increasing')
    if earley_stoping_counter >= earley_stoping:
        print(f'enaught')
        train = False
        

** Epoch 0 ** - Epoch Time 1101
Train Loss = 1.486
Train Accuracy = 48.82655981682885 % 

Val Loss = 1.2551
Val Accuracy = 57.341723798627 % 

loss descrease 1.255055527733447< inf
** Epoch 1 ** - Epoch Time 2794
Train Loss = 1.1701
Train Accuracy = 59.87406983400115 % 

Val Loss = 1.1873
Val Accuracy = 58.46681922196797 % 

loss descrease 1.1873221753050043< 1.255055527733447


In [None]:
sns.lineplot(data=train_val_acc_df, x = "epochs", y="value", hue="variable",
             ax=axes[0]).set_title('Train-Val Accuracy/Epoch')
sns.lineplot(data=train_val_loss_df, x = "epochs", y="value", hue="variable",
             ax=axes[1]).set_title('Train-Val Loss/Epoch')

In [35]:
import cv2

In [40]:
!pip install pandas

Collecting pandas
  Using cached pandas-1.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)
Collecting pytz>=2020.1
  Using cached pytz-2022.1-py2.py3-none-any.whl (503 kB)
Installing collected packages: pytz, pandas
Successfully installed pandas-1.4.1 pytz-2022.1


In [2]:
import matplotlib.pyplot as plt
import librosa.display

import numpy as np
import pandas as pd
import librosa

#plt.rcParams["figure.figsize"] = [6.40, 4.80]
plt.rcParams["figure.autolayout"] = True
filename = './children_playing/6902-2-0-4.wav'
y, sr = librosa.load(filename)
#y = y[:100000] # shorten audio a bit for speed

window_size = 1024
window = np.hanning(window_size)
stft  = librosa.core.spectrum.stft(y, n_fft=window_size, hop_length=512, window=window)
out = 2 * np.abs(stft) / np.sum(window)

# For plotting headlessly
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas

fig = plt.Figure(frameon=False)
canvas = FigureCanvas(fig)
ax = fig.add_subplot(111)
ax.axis('off')
p = librosa.display.specshow(librosa.amplitude_to_db(out, ref=np.max), ax=ax)
fig.savefig('spec.png',pad_inches = 0)

In [79]:
!pip install pillow



In [80]:
from PIL import Image

In [99]:
test_transforms = T.Compose([T.ToTensor()])

model1 = return_model(5)
model1.load_state_dict(torch.load('./best_result1.pt'))
model1.eval()
window_size = 1024

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
torch.cuda.empty_cache()

model.to(device)

def infer(wav_file, transforms= None):
    
    y, sr = librosa.load(wav_file)
    window = np.hanning(window_size)
    stft  = librosa.core.spectrum.stft(y, n_fft=window_size, hop_length=512, window=window)
    out = 2 * np.abs(stft) / np.sum(window)
    canvas = FigureCanvas(fig)
    ax = fig.add_subplot(111)
    ax.axis('off')
    p = librosa.display.specshow(librosa.amplitude_to_db(out, ref=np.max), ax=ax)
    temp = wav_file.split('/')[-1]
    temp = './' + temp.split('.')[0]+'.png'
    
    fig.savefig(temp
                ,pad_inches = 0)
    
    
    
    image = cv2.imread(temp, cv2.IMREAD_COLOR)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
    image /= 255.0
    #Applying transforms on image
    
    image_tensor = test_transforms(image).float()
    image_tensor = image_tensor.unsqueeze_(0)
    #input = Variable(image_tensor)
    image_tensor.to(device)
    output = model1(image_tensor)
    index = torch.argmax(output, dim=1)
    #index = output.data.cpu().numpy().argmax()
    
    
    #image = torch.tensor(image)
    
    #image = image.to(device)
    #preds = model(images)
    #index = output.data.cpu().numpy().argmax()
    return index
    

In [103]:
class_to_int = {'air_conditioner': 0,
 'drilling': 1,
 'dog_bark': 2,
 'children_playing': 3,
 'car_horn': 4}
int_to_class={v: k for k, v in class_to_int.items()}

In [106]:
print(int_to_class[infer('./children_playing/6902-2-0-4.wav',get_transform).item()])

<class 'torch.Tensor'>
<class 'torch.Tensor'>
children_playing
