In [None]:
import os

import pandas
import torch #keep
import torchaudio  # required by customSet (load / Resample) and the MelSpectrogram setup
import torchvision #keep
import torchvision.transforms as transforms   #best practice, get rid of line and expand explicity all transforms references?

In [None]:
class customSet(torch.utils.data.Dataset):
    """Audio dataset backed by an annotations CSV and a directory of audio files.

    Each item is a ``(signal, label)`` pair. The signal is loaded with
    ``torchaudio.load``, resampled to ``target_sample_rate``, mixed down to a
    single channel, cut or zero-padded to exactly ``num_samples`` samples and,
    when a ``transformation`` was supplied, passed through it.
    """

    # save annotations csv data and dataset directory
    def __init__(self, annotations_file, audio_dir, transformation=None,
                 target_sample_rate=None, num_samples=None):
        """Read the annotations CSV and remember the preprocessing settings.

        Args:
            annotations_file: path of the CSV read with ``pandas.read_csv``.
            audio_dir: root directory that contains the audio files.
            transformation: optional callable applied to the preprocessed
                signal; ``None`` leaves the signal untouched.
            target_sample_rate: sample rate every signal is resampled to.
            num_samples: exact number of samples of every returned signal.

        Raises:
            ValueError: if ``target_sample_rate`` or ``num_samples`` is missing.
        """
        # Both values are mandatory. They only carry a default because Python
        # forbids required parameters after the optional ``transformation``.
        if target_sample_rate is None or num_samples is None:
            raise ValueError(
                "customSet requires both target_sample_rate and num_samples"
            )
        self.annotations = pandas.read_csv(annotations_file)
        self.audio_dir = audio_dir
        self.transformation = transformation
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples

    # number of items in dataset (one item per line in annotations csv data)
    def __len__(self):
        """Return the number of rows in the annotations table."""
        return len(self.annotations)

    # retrieve signal and label of a given index of item
    def __getitem__(self, index):
        """Load, normalise and (optionally) transform the item at ``index``.

        Returns:
            tuple ``(signal, label)``.
        """
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_label(index)
        signal, sample_rate = torchaudio.load(audio_sample_path)

        # Bring every clip to one sample rate, one channel and one length so
        # that items can be stacked into a batch.
        # TODO: may be skipped if the dataset is known to be uniform already.
        signal = self._resample_if_necessary(signal, sample_rate)
        signal = self._mix_down_if_necessary(signal)
        signal = self._cut_if_necesary(signal)
        signal = self._right_pad_if_necessary(signal)

        # The transformation is optional; calling None would raise TypeError.
        if self.transformation is not None:
            signal = self.transformation(signal)

        return signal, label

    # returns path for given index of item
    def _get_audio_sample_path(self, index):
        """Build the file path of the item at ``index``.

        Column 0 of the annotations is used as the file name and column 1 as
        the sub-folder name.
        """
        title = self.annotations.iloc[index, 0]
        genre = self.annotations.iloc[index, 1]

        # NOTE(review): assumes files are stored as <audio_dir>/<genre>/<title>;
        # TODO confirm against the real dataset directory layout.
        return os.path.join(self.audio_dir, str(genre), str(title))

    # if the file's sample rate differs from the target sample rate, resample
    def _resample_if_necessary(self, signal, sample_rate):
        """Resample ``signal`` from ``sample_rate`` to ``self.target_sample_rate``.

        The signal is returned unchanged when the two rates already match.
        """
        if sample_rate != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sample_rate, self.target_sample_rate)
            signal = resampler(signal)
        return signal

    # if more than 1 channel, mixes signal down to mono
    def _mix_down_if_necessary(self, signal):
        """Average over dimension 0 when ``signal`` has more than one entry there."""
        # shape[0] gives channels
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    # if length of signal greater than num_samples, cut signal
    def _cut_if_necesary(self, signal):
        """Keep only the first ``self.num_samples`` entries along dimension 1."""
        if signal.shape[1] > self.num_samples:
            signal = signal[:, : self.num_samples]
        return signal

    # if length of signal less than num_samples, pad with zeros
    def _right_pad_if_necessary(self, signal):
        """Zero-pad the end of the last dimension up to ``self.num_samples``."""
        signal_length = signal.shape[1]
        if signal_length < self.num_samples:
            num_missing_samples = self.num_samples - signal_length
            # (left, right) padding of the last dimension: append zeros only
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal

    # returns label given index of item
    def _get_audio_label(self, index):
        """Return column 1 of the annotations row at ``index``."""
        return self.annotations.iloc[index, 1]
    

if __name__ == "__main__":
    # TODO: replace these placeholder locations with the real local dataset paths.
    ANNOTATIONS_FILE = "data/annotations.csv"
    AUDIO_DIR = "data/audio"
    SAMPLE_RATE = 22050
    NUM_SAMPLES = 22050  # with SAMPLE_RATE = 22050 this is one second of audio

    # create mel spectrogram object
    mel_spec = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_fft=1024,
        hop_length=512,
        n_mels=64,
    )

    # Positional order: annotations, audio dir, transformation, target sample
    # rate, number of samples per item.
    mydatainstance = customSet(ANNOTATIONS_FILE, AUDIO_DIR, mel_spec, SAMPLE_RATE, NUM_SAMPLES)

    # smoke test: fetch the first item
    signal, label = mydatainstance[0]
    

In [None]:

batch_size = 32

# Image preprocessing shared by both splits: convert to a tensor, then shift
# and scale every RGB channel with mean 0.5 / std 0.5. The imshow() helper in
# this notebook reverses exactly this step with `img / 2 + 0.5`.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# TODO: change sets to custom dataset; no image transform required? since using audio...?
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)


# Only the training data is shuffled; the test order stays fixed.
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=4)

testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=4)


# Human-readable class names, looked up by integer label when printing.
classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

In [None]:
import os
import pandas 
from torchvision.io import read_image

class CustomImageDataset(torch.utils.data.Dataset):
    """Image dataset described by an annotations CSV.

    Column 0 of the CSV is the image file name relative to ``img_dir`` and
    column 1 is the label.
    """

    def __init__(self, annotations_file, img_dir, transform=None, target_transform=None):
        """Read the annotations and store the optional transforms.

        Args:
            annotations_file: path of the CSV read with ``pandas.read_csv``.
            img_dir: directory that contains the image files.
            transform: optional callable applied to each loaded image.
            target_transform: optional callable applied to each label.
        """
        self.img_labels = pandas.read_csv(annotations_file)
        self.img_dir = img_dir
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Return the number of annotated images."""
        return len(self.img_labels)

    def __getitem__(self, idx):
        """Load the image at row ``idx`` and return ``(image, label)``."""
        img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0])
        image = read_image(img_path)
        label = self.img_labels.iloc[idx, 1]
        # Compare against None explicitly: a callable container that happens
        # to be empty is falsy and would otherwise be skipped silently.
        if self.transform is not None:
            image = self.transform(image)
        if self.target_transform is not None:
            label = self.target_transform(label)
        return image, label

In [None]:
from tabulate import tabulate

# Print the same two-line summary for each split; the test heading carries a
# leading newline so a blank line separates the two sections.
for heading, split in (('Training set', trainset), ('\nTest set', testset)):
    print(heading)
    print(f'Samples: {split.data.shape}')
    print(f'Labels: {len(split.targets)}')

# Class-name -> index mapping rendered as an org-mode style table.
print('\nClasses\n')
class_rows = list(trainset.class_to_idx.items())
print(tabulate(class_rows, headers=['Name', 'Index'], tablefmt='orgtbl'))

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# functions to show an image


def imshow(img):
    """Display an image tensor with matplotlib.

    The tensor is first mapped through ``img / 2 + 0.5`` (unnormalize), then
    its axes are reordered from (0, 1, 2) to (1, 2, 0) before plotting.
    """
    unnormalized = img / 2 + 0.5
    pixels = np.transpose(unnormalized.numpy(), (1, 2, 0))
    plt.imshow(pixels)
    plt.show()


# Take the first mini-batch the training loader yields.
dataiter = iter(trainloader)
images, labels = next(dataiter)

# Tile the whole batch into a single grid image and display it.
image_grid = torchvision.utils.make_grid(images)
imshow(image_grid)

# One right-aligned class name per image in the batch.
label_names = ['%5s' % classes[labels[j]] for j in range(batch_size)]
print(' '.join(label_names))

In [None]:
#keep
# Select the compute device once; later cells read `device` and `dev`.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    # Query the GPU name only when a GPU exists: the call raises on CPU-only hosts.
    print(torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")

# Plain device-type string ("cuda" or "cpu"), derived from the device actually
# selected instead of being forced to "cuda". Later cells pass it to summary()
# and the accuracy helpers.
# NOTE(review): torchsummary appears to accept only "cuda"/"cpu", hence
# device.type rather than "cuda:0" -- confirm.
dev = device.type
print(device)

In [None]:
import torch.nn as nn
import torch.nn.functional as F
from torchsummary import summary

#keep
class Net(nn.Module):
    """Small CNN: two 5x5 convolutions with 2x2 max-pooling, then three linear layers.

    The first linear layer takes ``32 * 5 * 5`` features, which is what the two
    conv/pool stages produce for a 3-channel 32x32 input. The last layer emits
    10 raw class scores.
    """

    def __init__(self):
        """Create the layers; attribute names are also the ``state_dict`` key prefixes."""
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=5)
        self.fc1 = nn.Linear(in_features=32 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Run a batch through the network and return the unnormalised class scores."""
        # convolution -> ReLU -> shared max-pool, applied twice
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))

        # flatten all dimensions except batch
        x = torch.flatten(x, 1)

        # two hidden linear layers with ReLU, then the linear output layer
        for hidden in (self.fc1, self.fc2):
            x = F.relu(hidden(x))
        return self.fc3(x)


# Instantiate the network and move it to the selected device in one step
# (Module.to returns the module itself).
net = Net().to(device)
print(net)

# Per-layer summary for a batch of 32 inputs of shape (3, 32, 32).
summary(net, (3, 32, 32), batch_size=32, device=dev)

In [None]:
import torch.optim as optim

# Loss: cross-entropy computed on the raw class scores returned by the network.
criterion = nn.CrossEntropyLoss()

# Optimiser: SGD with momentum over all parameters of `net`.
optimizer = optim.SGD(
    net.parameters(),
    lr=0.001,
    momentum=0.9,
)

In [None]:
%%time
NUM_EPOCHS = 20   # full passes over the training set
LOG_EVERY = 400   # mini-batches between two progress reports

for epoch in range(NUM_EPOCHS):

    running_loss = 0.0
    for i, (inputs, labels) in enumerate(trainloader):
        # move the mini-batch to the training device
        inputs = inputs.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()

        if i % LOG_EVERY == LOG_EVERY - 1:
            # Average over the number of batches actually accumulated.
            # Dividing by a different constant under-reports the loss.
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / LOG_EVERY))
            running_loss = 0.0

print('Finished Training')


In [None]:
dataiter = iter(testloader)

# Display two consecutive test batches together with the labels of their first
# eight images. After the loop `images` / `labels` hold the second batch.
for batch_number in range(2):
    images, labels = next(dataiter)

    # print images
    imshow(torchvision.utils.make_grid(images))
    ground_truth = ' '.join('%5s' % classes[labels[j]] for j in range(8))
    print('GroundTruth: ', ground_truth)

In [None]:
PATH = './cifar_net.pth'
torch.save(net.state_dict(), PATH)

print(device)
net = Net()
# map_location puts the stored tensors on the current device, so a checkpoint
# written on a GPU machine can still be loaded on a CPU-only one.
net.load_state_dict(torch.load(PATH, map_location=device))
net.to(device)
images = images.to(device)

# Pure inference: no autograd graph is needed for the forward pass.
with torch.no_grad():
    outputs = net(images)
# index of the highest score per image = predicted class
predicted = torch.max(outputs, 1)[1]

# print(outputs)
print('Predicted: ', ' '.join('%5s' % classes[predicted[j]]
                              for j in range(8)))

In [None]:
def test_accuracy(net, testloader, device):
    correct = 0

    # since we're not training, we don't need to calculate the gradients for our outputs
    with torch.no_grad():
        net.eval()
        for images, labels in testloader:
            images, labels = images.to(device), labels.to(device)
            #  = images + 0.2 * torch.randn(images.shape).to(device)
            
            # calculate outputs by running images through the network
            outputs = net(images)

            # the class with the highest energy is what we choose as prediction
            predicted = torch.max(outputs.data, 1)[1]

            correct += (predicted == labels).sum().item()
    
    return correct / len(testloader.dataset)

def test_accuracy_per_class(net, testloader, device):
    correct_pred = {classname: 0 for classname in trainset.classes}
    total_pred = {classname: 0 for classname in trainset.classes}

    with torch.no_grad():
        net.eval()
        for images, labels in testloader:
            images, labels = images.to(device), labels.to(device)

            outputs = net(images)
            predicted = torch.max(outputs.data, 1)[1]

            # collect the correct predictions for each class
            for label, prediction in zip(labels, predicted):
                if label == prediction:
                    correct_pred[trainset.classes[label]] += 1
                total_pred[trainset.classes[label]] += 1
    
    accuracy_per_class = {classname: 0 for classname in trainset.classes}
    for classname, correct_count in correct_pred.items():
        accuracy = (100 * float(correct_count)) / total_pred[classname]
        accuracy_per_class[classname] = accuracy

    return accuracy_per_class

# Evaluate on the device the network lives on rather than a hard-coded 'cuda',
# so the cell also runs on CPU-only machines.
test_acc = test_accuracy(net, testloader, device)
print(f'Best trial test set accuracy: {test_acc}')

# Same model, same loader: reuse the value instead of a second full test pass.
overall_accuracy = test_acc

print(
    'Overall accuracy of the network  '
    f'{(overall_accuracy * 100):.2f} %\n'
    'on the 10000 test images'
)

accuracy_per_class = test_accuracy_per_class(net, testloader, device)

print('Accuracy per class\n')
for classname, accuracy in accuracy_per_class.items():
    print(f'{classname:12s} {accuracy:.2f} %')

In [None]:
import matplotlib.pyplot as plt

# Import datasets, classifiers and performance metrics
from sklearn import datasets, metrics, svm
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

print('Training set')
print(f'Samples: {trainset.data.shape}')
print(f'Labels: {len(trainset.targets)}')
print(type(trainset.data))
print(type(trainset.targets))
print('\nTest set')
print(f'Samples: {testset.data.shape}')
print(f'Labels: {len(testset.targets)}')

train_n_samples = len(trainset.data)
print(train_n_samples)
test_n_samples = len(testset.data)
print(test_n_samples)

Xtrain = trainset.data
Xtest  = testset.data

from skimage.feature import hog


def _hog_features(images, progress_every):
    """Compute one HOG descriptor per image and stack them into an array.

    Args:
        images: sequence of images, each passed to ``hog`` with
            ``channel_axis=-1``.
        progress_every: the running index is printed whenever it is a
            multiple of this value.

    Returns:
        ``np.array`` built from the list of descriptors, in input order.
    """
    descriptors = []
    for i, image in enumerate(images):
        fd = hog(image, orientations=9, pixels_per_cell=(8, 8),
                 cells_per_block=(2, 2), visualize=False, channel_axis=-1)
        descriptors.append(fd)
        if i % progress_every == 0:
            print(i)
    return np.array(descriptors)


# Both splits use the same HOG settings; only the progress interval differs.
Xtrain_hog = _hog_features(Xtrain, 10000)
print('Done calculating HOGs for training')
print(Xtrain_hog.shape)

Xtest_hog = _hog_features(Xtest, 1000)
print('Done calculating HOGs for testing')

ytrain = trainset.targets
ytest  = testset.targets

from sklearn.decomposition import PCA
# A float n_components asks PCA for that fraction of explained variance.
# The projection is fitted on the training features only and then applied
# unchanged to the test features.
pca = PCA(0.8)
Xtrain_pca = pca.fit_transform(Xtrain_hog)
Xtest_pca  = pca.transform(Xtest_hog)
print(Xtrain_pca.shape)
print(Xtest_pca.shape)

In [None]:
%%time 

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Fit a support vector classifier on the PCA-reduced HOG features.
clf = svm.SVC(C=10, cache_size=10000)
clf.fit(Xtrain_pca, ytrain)

# Predict the test split and print the per-class report.
ytest_predict = clf.predict(Xtest_pca)
print(classification_report(ytest, ytest_predict))

color = 'white'  # not referenced again in this cell

# Confusion matrix with readable tick labels, one per class index.
class_display_names = ['airplanes', 'cars', 'birds', 'cats', 'deer', 'dogs', 'frogs', 'horses', 'ships', 'trucks']
cm = confusion_matrix(ytest, ytest_predict)
disp = ConfusionMatrixDisplay(cm, display_labels=class_display_names)
disp.plot()
plt.xticks(rotation=45, ha='right')
plt.show()