In [1]:
from libraries import * 
from preprocessing import SubsetSC, Preprocessing
from utils import downsample, pad_sequence
warnings.filterwarnings("ignore")

# selecting GPU if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"running on GPU = {str(device) == 'cuda'}")

# 1. create training validation splits, see later for test
print(f"1. extracting subsets")
train_set = SubsetSC("training")
validation_set = SubsetSC("validation")
print(f"   extraction done")

# 2. performing sampling for faster computation
def sample(dataset):
    X, y = [], []
    for waveform, sample_rate, label, speeker_id, utterance_number in tqdm(dataset):
        X_, y_ = waveform, label
        X.append(X_)
        y.append(y_)
    return X, y

downsampling = True
if downsampling:
    print(f"2. sampling datasets")
    train_X, train_y = sample(train_set)
    val_X, val_y = sample(validation_set)
    # test_set = sample(test_set)
print(f"   sampling done")

print(f"3. datasets ready")


# setting up computational parameters
if device == "cuda":
    num_workers, pin_memory = 4, True
else:
    num_workers, pin_memory = 0, False
    
    
# display the list of labels available in the dataset
labels = sorted(list(set(datapoint[2] for datapoint in train_set))) 

def collate_fn(batch):
    tensors, targets = [], []
    # Gather in lists, and encode labels as indices
    for waveform, _, label, *_ in batch:
        tensors += [waveform]
        targets += [label_to_index(label)]
    # Group the list of tensors into a batched tensor
    tensors = pad_sequence(tensors)
    targets = torch.stack(targets)
    return tensors, targets

def index_to_label(index):
    return labels[index]

# encoding each word using its index in the list of labels
def label_to_index(word):
    return torch.tensor(labels.index(word))

running on GPU = True
1. extracting subsets


  0%|          | 241/84843 [00:00<00:35, 2402.05it/s]

   extraction done
2. sampling datasets


100%|██████████| 84843/84843 [00:36<00:00, 2323.65it/s]
100%|██████████| 9981/9981 [00:04<00:00, 2335.73it/s]


   sampling done
3. datasets ready


In [2]:
class CnnAudioNet(nn.Module):
    def __init__(self,NumClasses):
        super(CnnAudioNet,self).__init__()
        self.NumClasses = NumClasses
        self.Fc_features = 128
        self.C1 = nn.Conv2d(1,32,3,padding=1)
        self.BN1 = nn.BatchNorm2d(32)
        self.fc1 = nn.Linear(15808,128)
        self.fc2 = nn.Linear(128, self.NumClasses )
        
    def forward(self,x):
        x = F.relu(self.C1(x))
        x = x.view(-1,np.prod(x.shape[1:]))  # flatten
        x = self.fc1(x)
        x = self.fc2(x)
        return x

model = CnnAudioNet(len(labels))
model.to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

In [27]:
from torchvision import transforms
from torch.utils import data

normalize = transforms.Normalize(mean=[0],
                                 std=[0.229, 0.224, 0.225])


class MyDataset(data.Dataset):
    def __init__(self, X, y, transform=None):
        self.data = X
        self.target = y
        self.transform = transform
        
    def __getitem__(self, index):  # ici preprocessing
        x = self.data[index]
        y = self.target[index]

        if self.transform:
            x = self.transform(x)
        return x, y
    
    def __len__(self):
        return len(self.data)

       
training_set = MyDataset(train_X, train_y, transforms.Compose([normalize]))
training_generator = data.DataLoader(training_set)

In [26]:
training_generator

<torch.utils.data.dataloader.DataLoader at 0x7f1ee3093b50>

In [22]:
norm = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
x = torch.randn(3, 224, 224)
out = norm(x)
x.shape

torch.Size([3, 224, 224])

In [28]:
for dataBatch, target, _ in training_generator:
    print(dataBatch, target)

ValueError: Expected tensor to be a tensor image of size (..., C, H, W). Got tensor.size() = torch.Size([1, 16000]).