In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('fivethirtyeight') 
# %matplotlib inline
import librosa
import soundfile as sf
import librosa.display
import wave
import time
import os
import random
import torch
# import torchaudio
from torchvision.transforms.functional import pil_to_tensor
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torchvision.models as models
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import gc
from netcal.metrics import ECE
import copy
import pickle

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
SPECTROGRAM_DPI = 90 # image quality of spectrograms
DEFAULT_SAMPLE_RATE = 44100
DEFAULT_HOPE_LENGHT = 1024

In [4]:
def get_only_one_class(l):
    if type(l) != str:
        return l
    return l.split(',')[0]
base_dir = '/home/user_7428/databases/FSD'
spectrogram_dir = f"{base_dir}/spectograms"
df_dev = pd.read_csv(f"{base_dir}/metadata/collection_dev.csv")
df_eval = pd.read_csv(f"{base_dir}/metadata/collection_eval.csv")
df_dev['path'] = df_dev['fname'].apply(lambda file: f'dev_audio/{file}.wav')
df_eval['path'] = df_eval['fname'].apply(lambda file: f'eval_audio/{file}.wav')
df = pd.concat([df_dev, df_eval]).drop_duplicates().reset_index(drop=True)
df['class'] = df['labels']
df['category'] = df['labels'].apply(get_only_one_class)
var = df.groupby('category').agg('count').sort_values('fname', ascending=False)
K = 20
top_K_classes = var.head(K).reset_index()
df = df[df['category'].isin(top_K_classes['category'])]
classes = []
for l in df['category'].unique():
    if type(l) != str:
        continue
    for c in l.split(','):
        if c not in classes:
            classes.append(c)
classes_num = len(classes)
CONSTANT_NUMBER_OF_SAMPLES = top_K_classes['fname'].min() * len(top_K_classes)
top_K_classes

Unnamed: 0,category,fname,labels,mids,path,class
0,Male_speech_and_man_speaking,846,846,846,846,846
1,Snare_drum,776,776,776,776,776
2,Laughter,711,711,711,711,711
3,Electric_guitar,687,687,687,687,687
4,Cello,660,660,660,660,660
5,Trumpet,631,631,631,631,631
6,Fart,629,629,629,629,629
7,Clarinet,592,592,592,592,592
8,Piano,585,585,585,585,585
9,Squeak,583,583,583,583,583


In [5]:
count = 0
matplotlib.pyplot.close()
if not os.path.exists(spectrogram_dir):
    os.mkdir(spectrogram_dir)
for i, row in df.iterrows():
    wav_file = f"{base_dir}/{row['path']}"
    png_file = f"{spectrogram_dir}/{row['path'].split('/')[1].replace('.wav', '.png')}"
    if os.path.exists(png_file):
        continue
    waveform, sample_rate = librosa.load(wav_file, sr=None)
    # waveform = waveform.numpy()
    fig, axes = plt.subplots(1, 1)
    axes.specgram(waveform, Fs=sample_rate)
    # f = plt.figure()
    axes.axis('off')
    # plt.show(block=False)
    plt.savefig(f'{png_file}', dpi=SPECTROGRAM_DPI , bbox_inches='tight')
    plt.cla()
    plt.close(fig)
    os.remove(wav_file)
    del fig
    del axes
    del wav_file
    gc.collect()
    count += 1
    # print("Created spectogram")

In [6]:
def inspect_image_dimensions(image_dir, num_images=1):
    image_paths = [os.path.join(image_dir, fname) for fname in os.listdir(image_dir) if fname.endswith('.png')]
    for img_path in image_paths[:num_images]:
        with Image.open(img_path) as img:
            print(f'{img_path}: {img.size}')
inspect_image_dimensions(spectrogram_dir)

/home/user_7428/databases/FSD/spectograms/156500.png: (519, 367)


In [7]:
df['png_path'] = df['path'].apply(lambda path: path.replace('dev_audio/', 'spectograms/').replace('eval_audio/', 'spectograms/').replace('.wav', '.png'))
df.head()

Unnamed: 0,fname,labels,mids,path,class,category,png_path
0,64760,Electric_guitar,/m/02sgy,dev_audio/64760.wav,Electric_guitar,Electric_guitar,spectograms/64760.png
1,16399,Electric_guitar,/m/02sgy,dev_audio/16399.wav,Electric_guitar,Electric_guitar,spectograms/16399.png
2,16401,Electric_guitar,/m/02sgy,dev_audio/16401.wav,Electric_guitar,Electric_guitar,spectograms/16401.png
3,16402,Electric_guitar,/m/02sgy,dev_audio/16402.wav,Electric_guitar,Electric_guitar,spectograms/16402.png
4,16404,Electric_guitar,/m/02sgy,dev_audio/16404.wav,Electric_guitar,Electric_guitar,spectograms/16404.png


In [8]:
img_height, img_width = 512, 384  # Adjusted dimensions based on inspection
transform = transforms.Compose([
    transforms.Resize((img_height, img_width)),
    transforms.ToTensor(),  # Converts the image to tensor and scales pixel values to [0, 1]
])
batch_size = 32
learning_rate = 0.0001
num_epochs = 100

In [9]:
class SpectrogramDataset(Dataset):
    def __init__(self, image_dir, df, classes, transform, wanted_classes=None):
        self.transform = transform
        self.image_dir = image_dir
        self.image_paths = []
        self.labels = []
        self.failed = 0
        self.succeeded = 0
        self.samples_per_label = None
        if wanted_classes != None and type(wanted_classes) == dict:
            self.samples_per_label = {label: 0 for label in wanted_classes}
        for fname in os.listdir(image_dir):
            if fname.endswith('.png'):
                # try:
                    current_class = df[df['png_path'] == f'spectograms/{fname}']['category']
                    if len(current_class) == 0:
                        continue
                    current_class = current_class.values[0]
                    label = classes.index(current_class)
                    if wanted_classes != None and current_class not in wanted_classes:
                        continue
                    if wanted_classes != None and type(wanted_classes) == dict and self.samples_per_label[current_class] >= wanted_classes[current_class]:
                        continue
                    image_path = os.path.join(image_dir, fname)
                    self.image_paths.append(image_path)
                    self.labels.append(label)
                    self.samples_per_label[current_class] += 1
                    self.succeeded += 1
                # except Exception as e:
                #     self.failed += 1
        print(f"Created data loader. Failed: {self.failed}, Total: {self.failed + self.succeeded}")

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        image = Image.open(img_path).convert('RGB')
        label = self.labels[idx]
        if self.transform:
            image = self.transform(image)

        # return pil_to_tensor(image), label
        return image, label

In [10]:
def get_test_and_train_loader(batch_size, wanted_classes=None, train_fraction=0.8):
    dataset = SpectrogramDataset(spectrogram_dir, df, classes, transform, wanted_classes)
    train_size = int(train_fraction * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    return train_loader, test_loader

In [11]:
class CNN(nn.Module):
    def __init__(self, num_classes):
        super(CNN, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        )
        self.layer3 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        )
        self.fc1 = nn.Linear(128 * (img_height // 8) * (img_width // 8), 512)
        self.drop = nn.Dropout(0.5)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = out.view(out.size(0), -1)
        out = self.fc1(out)
        out = self.drop(out)
        out = self.fc2(out)
        return out

In [12]:
def train_model(model, train_loader, criterion, optimizer, num_epochs, device, save_path):
    if save_path != None:
        print(f"Checking if path {save_path} exists")
        if os.path.exists(save_path):
            print(f'Model loaded from {save_path}')
            model.load_state_dict(torch.load(save_path))
            model.to(device)
            model.eval()
            return
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for _, (images, labels) in enumerate(train_loader):
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / 100:.4f}')
    if save_path != None:
        torch.save(model.state_dict(), save_path)
        print(f'Model saved to {save_path}')

In [13]:
def evaluate_model(model, test_loader, device):
    model.eval()
    ground_truth = []
    confidences = []
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            probabilities = F.softmax(outputs, dim=1)
            confidences += list(probabilities.cpu().numpy())
            _, predicted = torch.max(outputs.data, 1)
            ground_truth += list(labels.to('cpu').numpy())
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f'Total: {total}, Correct: {correct}. Accuracy of the model on the test images: {100 * correct / total:.2f}%')
    ground_truth = np.array(ground_truth)
    confidences = np.array(confidences)
    return ground_truth, confidences

In [14]:
def get_ece_and_acc(confidences, ground_truth):
    correct = ground_truth == confidences.argmax(axis=1)
    sum_true = np.sum(correct)
    total = correct.size
    acc = sum_true / total
    n_bins = 10
    ece = ECE(n_bins)
    uncalibrated_score = ece.measure(confidences, ground_truth)
    return uncalibrated_score, acc

In [15]:
def create_train_and_evaluate_model(
        batch_size, 
        number_of_wanted_classes, 
        num_epochs, 
        save=False, 
        train_fraction=0.8, 
        wanted_classes=None, 
        initial_model=None
    ):
    result_dir = "./results/experiment_one_data_variance"
    samples_variance = np.array([wanted_classes[key] for key in wanted_classes]).var() ** 0.5
    samples_mean = np.array([wanted_classes[key] for key in wanted_classes]).mean()
    ext = f"_{number_of_wanted_classes}_classes_{batch_size}_batch_{num_epochs}_epochs_{samples_mean}_mean_{samples_variance}_variance"
    ece_file_name = f"{result_dir}/ece{ext}.txt"
    if os.path.exists(ece_file_name):
        return
    if not os.path.isdir(result_dir):
        os.mkdir(result_dir)
    # if wanted_classes == None:
    #     wanted_classes = [i for i in range(number_of_wanted_classes)]
    train_loader, test_loader = get_test_and_train_loader(batch_size, wanted_classes=wanted_classes, train_fraction=train_fraction)
    model = copy.deepcopy(initial_model) 
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    save_path = None
    if save:
        save_path=f"{result_dir}/cnn_model{ext}.pth"
    train_model(model, train_loader, criterion, optimizer, num_epochs, device, save_path)
    ground_truth, confidences = evaluate_model(model, test_loader, device)
    ece, acc = get_ece_and_acc(confidences, ground_truth)
    result_dict = {
        'ece': ece,
        'acc': acc
    }
    with open(ece_file_name, 'wb') as f:
        pickle.dump(result_dict, f)
    return ece


In [16]:
def generate_k_random_numbers_whos_sum_is_const(K, bound, const=0):
    result = np.random.randint(-bound, bound, K)
    result[-1] -= result.sum()
    return result

def change_variance(wanted_classes, max_step_size=5):
    steps = generate_k_random_numbers_whos_sum_is_const(K, max_step_size)
    for i, l in enumerate(wanted_classes):
        max_number_of_samples = top_K_classes[top_K_classes['category'] == l]['fname'].values[0]
        step_size = steps[i]
        direction = 1#(-1) ** i
        step = step_size * direction
        wanted_classes[l] += step
        if wanted_classes[l] >= max_number_of_samples:
            wanted_classes[l] = max_number_of_samples
        if wanted_classes[l] <= 0:
            wanted_classes[l] = min(np.random.randint(max_step_size), max_number_of_samples)

INITIAL_NUMBER_OF_SAMPLES = int(0.5*int(CONSTANT_NUMBER_OF_SAMPLES / classes_num))
wanted_classes = {l: INITIAL_NUMBER_OF_SAMPLES for l in classes}
initial_model = CNN(num_classes=classes_num)
for _ in range(30):
    print(np.array([wanted_classes[key] for key in wanted_classes]).var() ** 0.5)
    num_epochs = 100
    batch_size = 32
    create_train_and_evaluate_model(batch_size, classes_num, num_epochs, wanted_classes=wanted_classes, initial_model=initial_model)
    change_variance(wanted_classes)

0.0
Created data loader. Failed: 0, Total: 5000
Epoch [1/1], Loss: 3.6277
Total: 1000, Correct: 346. Accuracy of the model on the test images: 34.60%
3.7682887362833544
Created data loader. Failed: 0, Total: 5000


KeyboardInterrupt: 