# Imports

In [26]:
import torchvision.models as models
import torchvision
from torchvision import  datasets, transforms
import torch.nn as nn
import torch

from PIL import Image

import numpy as np

import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
import itertools

from __future__ import print_function, division
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

plt.ion()   # interactive mode


# Unpack Data

In [None]:
# Unpack the uploaded data archive with the shell `unzip` tool. The line that
# is commented out unpacks a differently named archive (upload_colab.zip).
# NOTE(review): the later cells read from audio_images/fold<N>/ — presumably
# that directory tree is what this archive contains; confirm.
#!unzip upload_colab.zip
!unzip upload_colab_stacked.zip

In [99]:
# Run on the first CUDA GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cpu


# Dataset Class

In [28]:
class AudioImagesDataset(Dataset):
    """Dataset of audio images described by a CSV annotation file.

    Every item is a dict with three keys:

    * ``'image'``: whatever ``io.imread`` returns for the image file,
    * ``'audio_class'``: the class label as a plain Python ``int``,
    * ``'file_name'``: the file name exactly as stored in the CSV.

    If a ``transform`` was given, the dict is passed through it and the
    transform's return value is returned instead.
    """

    # Positional columns of the annotation CSV that __getitem__ reads.
    _FILE_NAME_COLUMN = 1
    _AUDIO_CLASS_COLUMN = 2

    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.audio_images_frame = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the annotation CSV."""
        return len(self.audio_images_frame)

    def __getitem__(self, idx):
        """Load the image and label stored in row ``idx`` of the CSV.

        Args:
            idx (int or 0-d tensor): Row position in the annotation file.

        Returns:
            The sample dict described in the class docstring, or the result
            of ``self.transform(sample)`` when a transform is configured.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        file_name = self.audio_images_frame.iloc[idx, self._FILE_NAME_COLUMN]
        # int() instead of .astype('int'): the NumPy 'int' scalar has a
        # platform-dependent width, while a Python int is always collated
        # into an int64 tensor. It also works when pandas hands back a
        # value that has no .astype method.
        audio_class = int(
            self.audio_images_frame.iloc[idx, self._AUDIO_CLASS_COLUMN])
        image = io.imread(os.path.join(self.root_dir, file_name))

        sample = {'image': image, 'audio_class': audio_class, 'file_name': file_name}

        if self.transform:
            sample = self.transform(sample)

        return sample

# Data Transformation Classes

In [29]:
#Transformation classes for the dataset

class Rescale(object):
    """Resize the image of a sample to a requested size.

    Args:
        output_size (tuple or int): Target size. A tuple is taken as the
            final ``(height, width)``. An int is applied to the shorter
            image edge and the longer edge is scaled by the same factor,
            so the aspect ratio is kept.
    """

    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        self.output_size = output_size

    def __call__(self, sample):
        picture, label = sample['image'], sample['audio_class']
        height, width = picture.shape[:2]

        target = self.output_size
        if not isinstance(target, int):
            scaled_h, scaled_w = target
        elif height > width:
            # Portrait: the width is the shorter edge.
            scaled_h, scaled_w = target * height / width, target
        else:
            # Landscape or square: the height is the shorter edge.
            scaled_h, scaled_w = target, target * width / height

        # Fractional sizes are truncated before resizing.
        resized = transform.resize(picture, (int(scaled_h), int(scaled_w)))

        return {
            'image': resized,
            'audio_class': label,
            'file_name': sample['file_name'],
        }


class LeftCrop(object):
    """Keep only the top-left region of the image in a sample.

    Args:
        output_size (tuple or int): Size of the kept region as
            ``(height, width)``. An int requests a square region.
    """

    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        if not isinstance(output_size, int):
            assert len(output_size) == 2
            self.output_size = output_size
        else:
            self.output_size = (output_size, output_size)

    def __call__(self, sample):
        picture, label = sample['image'], sample['audio_class']
        rows, cols = self.output_size

        # The slice is anchored at the origin of both leading axes.
        cropped = picture[:rows, :cols]

        return {
            'image': cropped,
            'audio_class': label,
            'file_name': sample['file_name'],
        }


class ToTensor(object):
    """Turn the 2-D ndarray image of a sample into a float tensor."""

    def __call__(self, sample):
        picture, label = sample['image'], sample['audio_class']

        # The image has no channel axis: its two axes are swapped, the data
        # is converted to float, and a leading axis of size 1 is added, so
        # an (H, W) array becomes a (1, W, H) tensor.
        swapped = picture.transpose(1, 0)
        tensor = torch.from_numpy(swapped).float().unsqueeze(0)

        return {
            'image': tensor,
            'audio_class': label,
            'file_name': sample['file_name'],
        }


class ToNormalize(object):
    """Normalize the single-channel image tensor of a sample.

    Args:
        mean (float): Value subtracted from every pixel. Defaults to 133.09.
        std (float): Value the shifted pixels are divided by. Defaults to
            55.32.

    The defaults are the values this transform previously had hard-coded,
    so ``ToNormalize()`` behaves as before; other datasets can pass their
    own statistics.
    """

    def __init__(self, mean=133.09, std=55.32):
        self.mean = mean
        self.std = std
        # Built once here instead of on every sample.
        self._normalize = transforms.Normalize([mean], [std])

    def __call__(self, sample):
        image, audio_class = sample['image'], sample['audio_class']

        image = self._normalize(image)

        return {'image': image,
                'audio_class': audio_class, 'file_name': sample['file_name']}


## Calculating mean and standard deviation for normalization

In [102]:
# Set to True to recompute the dataset mean / standard deviation. With the
# default (False) the cell only prints a reminder and the values shown in the
# output below are reused.
RECOMPUTE_NORMALIZATION_STATS = False

if not RECOMPUTE_NORMALIZATION_STATS:
  print("Already calculated, see below")
else:
  # The statistics must describe the un-normalized pixels, so this pipeline
  # stops before ToNormalize. It is built here rather than taken from
  # audio_transform, which is defined in a later cell and already normalizes.
  stats_transform = transforms.Compose([LeftCrop(127), ToTensor()])

  dataset = torch.utils.data.ConcatDataset([
      AudioImagesDataset(csv_file='audio_images/fold' + str(fold) + '/filename_class.csv',
                         root_dir='audio_images/fold' + str(fold) + '/',
                         transform=stats_transform)
      for fold in range(1, 11)
  ])
  # Order does not matter for sums, so no shuffling is needed.
  loader = DataLoader(dataset, batch_size=8,
                        shuffle=False, num_workers=4)

  # Pass 1: per-channel mean, accumulated as the sum of per-image means.
  # NOTE(review): averaging per-image means assumes every image has the same
  # number of pixels after the crop — confirm for the data set in use.
  mean = 0.0
  for data in loader:
      images = data['image'].to(device)
      images = images.view(images.size(0), images.size(1), -1)
      mean += images.mean(2).sum(0)
  mean = mean / len(loader.dataset)

  # Pass 2: per-channel variance around that mean. The pixel count is taken
  # from the tensors themselves instead of a hard-coded image size.
  var = 0.0
  pixel_count = 0
  for data in loader:
      images = data['image'].to(device)
      images = images.view(images.size(0), images.size(1), -1)
      var += ((images - mean.unsqueeze(1))**2).sum([0,2])
      pixel_count += images.size(0) * images.size(2)
  std = torch.sqrt(var / pixel_count)

  print(mean)
  print(std)
  # Release the data set and its loader.
  del dataset
  del loader

Already calculated, see below


tensor([133.0939], device='cuda:0')

tensor([55.3163], device='cuda:0')

In [101]:
# Pre-processing applied to every sample, in this order: crop the top-left
# 127 x 127 region, convert to a float tensor, normalize.
audio_transform = transforms.Compose(
    [LeftCrop(127), ToTensor(), ToNormalize()]
)

# The network class

In [96]:
class vgg_ten_label(nn.Module):
    """VGG16-based classifier for 1-channel images with 10 output scores.

    The network is assembled from ``models.vgg16(pretrained=True)``:

    * ``features``: the VGG16 feature layers, with the first layer replaced
      by a newly created convolution that takes 1 input channel,
    * ``avgpool``: adaptive average pooling to 7 x 7,
    * ``classifier``: the layers of the VGG16 classifier,
    * ``classifier_extra``: ReLU -> Linear(1000, 512) -> ReLU ->
      Linear(512, 10), appended after the VGG16 classifier.
    """

    def __init__(self):
        super().__init__()
        backbone = models.vgg16(pretrained=True)

        # Reuse every feature layer except the first, which is swapped for a
        # fresh 1-input-channel convolution.
        backbone_layers = list(backbone.features.children())
        single_channel_conv = nn.Conv2d(
            1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.features = nn.Sequential(single_channel_conv, *backbone_layers[1:])

        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        self.classifier = nn.Sequential(*backbone.classifier.children())

        # Extra head mapping the 1000 classifier outputs down to 10 scores.
        extra_layers = [
            nn.ReLU(inplace=True),
            nn.Linear(1000, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, 10),
        ]
        self.classifier_extra = nn.Sequential(*extra_layers)

    def forward(self, x):
        """Return the 10 class scores for a batch of 1-channel images."""
        pooled = self.avgpool(self.features(x))
        scores = self.classifier(torch.flatten(pooled, 1))
        return self.classifier_extra(scores)

In [97]:
# Build a throw-away instance of the network, move it to `device` and print
# its layer structure.
print(vgg_ten_label().to(device))

vgg_ten_label(
  (features): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, 

# Helpful functions for Visualization and Validation

In [104]:
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map.

    Args:
        cm (array-like): Confusion matrix; rows are labelled "True label"
            and columns "Predicted label" in the plot.
        classes (sequence of str): Tick labels, one per row / column.
        normalize (bool): If True, every row is divided by its sum before
            printing and plotting. Rows that sum to zero stay all-zero
            instead of turning into NaN.
        title (str): Title of the figure.
        cmap: Matplotlib colormap used for the heat map.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero; the remaining cells
        # keep the 0.0 they were initialised with.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cell annotations: white text on dark cells, black text on light ones.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center", color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Called after the labels exist so that they are included in the layout.
    plt.tight_layout()
    plt.show()

In [105]:
def calculate_print_accuracy(model, test_loader):
  """Evaluate `model` on `test_loader`, print the accuracy and return metrics.

  The model is switched to evaluation mode for the duration of the call (so
  layers such as dropout behave deterministically) and is put back into
  whatever mode it was in before returning.

  Args:
      model (nn.Module): Network mapping an image batch to per-class scores.
      test_loader (iterable): Yields dicts with an 'image' batch and an
          'audio_class' batch of integer labels.

  Returns:
      tuple: ``(conf_matrix, accuracy)``. ``conf_matrix`` always has one
      row / column per model output, even for classes that never occur in
      this loader, so matrices from different folds can be added together.
      ``accuracy`` is the fraction of correct predictions (0..1).

  Raises:
      ValueError: If `test_loader` yields no samples.
  """
  correct = 0
  total = 0
  num_classes = None
  true_batches = []
  predicted_batches = []

  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      for data in test_loader:
        images, labels = data['image'].to(device), data['audio_class'].to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        num_classes = outputs.size(1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        # Collected per batch and concatenated once at the end.
        true_batches.append(labels.cpu().numpy())
        predicted_batches.append(predicted.cpu().numpy())
  finally:
    # Leave the model in the mode the caller had it in.
    model.train(was_training)

  if total == 0:
    raise ValueError("test_loader yielded no samples; accuracy is undefined")

  print('Accuracy of the network on the test images: %d %%' % (
      100 * correct / total))
  accuracy = correct / total
  true_classes = np.concatenate(true_batches)
  predicted_classes = np.concatenate(predicted_batches)
  # Fixing the label set keeps the matrix shape independent of which classes
  # happen to appear in this particular loader.
  conf_matrix = confusion_matrix(true_classes, predicted_classes,
                                 labels=np.arange(num_classes))
  return conf_matrix, accuracy


# Training the model

In [106]:
def train_model(model, criterion, optimizer, train_loader, num_epochs=18):
  """Train `model` in place and print the summed loss of every epoch.

  Args:
      model (nn.Module): Network to optimise; it is put into training mode.
      criterion (callable): Loss taking ``(outputs, labels)``.
      optimizer (torch.optim.Optimizer): Optimizer over the model parameters.
      train_loader (iterable): Yields dicts with an 'image' batch and an
          'audio_class' batch.
      num_epochs (int): Number of passes over `train_loader`. Defaults to 18.

  Each epoch prints ``[epoch, number of batches] loss: <sum>``, where the
  loss is the sum (not the mean) of the per-batch losses of that epoch.
  """
  # Make sure layers such as dropout are active, whatever mode the model was
  # left in by earlier code.
  model.train()

  for epoch in range(num_epochs):  # loop over the dataset multiple times
    running_loss = 0.0
    num_batches = 0
    for data in train_loader:
      # get the inputs; data is a dict
      inputs, labels = data['image'].to(device), data['audio_class'].to(device)
      # zero the parameter gradients
      optimizer.zero_grad()
      # forward + backward + optimize
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()
      # accumulate the loss for the per-epoch print below
      running_loss += loss.item()
      num_batches += 1
    # num_batches is counted explicitly so that an empty loader prints 0
    # instead of failing on an undefined loop variable.
    print('[%d, %5d] loss: %.3f' % (epoch + 1, num_batches, running_loss ))

  print('Finished Training')

# Putting it all together

In [None]:
#########Using cross validation ###############
# 10-fold cross validation: each fold is held out once as the test set while a
# freshly created network is trained on the other nine folds.

def _load_fold(fold):
  """Return the AudioImagesDataset stored under audio_images/fold<fold>/."""
  fold_dir = 'audio_images/fold' + str(fold) + '/'
  return AudioImagesDataset(csv_file=fold_dir + 'filename_class.csv',
                            root_dir=fold_dir, transform=audio_transform)


# Tick labels for the confusion matrix, one per class index.
class_names = ["AC", "CH", "CP", "DB", "DR", "EI", "GS", "JH", "SI", "SM"]

accuracies = []
confusion_matrices = []
for cross_idx in range(1,11):
  print("Validation on fold #" + str(cross_idx))
  # every fold except the held-out one
  liste = [fold for fold in range(1, 11) if fold != cross_idx]

  #testset
  testset = _load_fold(cross_idx)
  testloader = DataLoader(testset, batch_size=8,
                        shuffle=True, num_workers=4)

  #trainset: one flat ConcatDataset over the nine training folds
  trainset = torch.utils.data.ConcatDataset([_load_fold(fold) for fold in liste])
  trainloader = DataLoader(trainset, batch_size=8,
                        shuffle=True, num_workers=4)

  #initialize model
  vgg_ten = vgg_ten_label().to(device)
  print("Before training the network:")
  conf, acc = calculate_print_accuracy(vgg_ten, testloader)
  #plot_confusion_matrix(conf, class_names)

  # Define the loss and the optimizer
  criterion = nn.CrossEntropyLoss()
  optimizer_vgg_ten= torch.optim.SGD(vgg_ten.parameters(), lr=0.001, momentum=0.9)
  train_model(vgg_ten, criterion, optimizer_vgg_ten, trainloader)
  print("After training the network:")
  conf, acc = calculate_print_accuracy(vgg_ten, testloader)
  plot_confusion_matrix(conf, class_names)
  # keep the per-fold results for the summary cells below
  accuracies.append(acc)
  confusion_matrices.append(conf)
  print("================================================================================")
  print("================================================================================")
  print("================================================================================")


Validation on fold #1
Before training the network:


In [None]:
# Mean of the per-fold accuracies collected in `accuracies` by the
# cross-validation cell.
np.average(accuracies)

In [None]:
# Element-wise sum of the per-fold confusion matrices. Summing over the whole
# list works for any number of folds instead of assuming exactly ten entries.
combined = np.sum(confusion_matrices, axis=0)

plot_confusion_matrix(combined, ["AC", "CH", "CP", "DB", "DR", "EI", "GS", "JH", "SI", "SM"])
