In [4]:
import os
import random
import time

import numpy as np
import pandas as pd
import scipy as sc
from scipy.io import wavfile, loadmat
from scipy import signal
import cv2

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torchvision.transforms import Compose

from sklearn.metrics import roc_curve, roc_auc_score

import tensorboardX
from tqdm import tqdm

import matplotlib.pyplot as plt

In [1]:
# Scratch check of modular arithmetic: (0 + 1) % 2 is 1, so this comparison evaluates to False.
(0 + 1) % 2 == 0

False

In [7]:
# Scratch check: create a one-element uniform random tensor that is flagged as requiring gradients.
torch.rand(1, requires_grad=True)

tensor([0.7652], requires_grad=True)

## List all paths of files in a directory, including files from subdirectories

In [None]:
def getListOfFiles(dirName):
    """Recursively collect the paths of all files below ``dirName``.

    Args:
        dirName: Path of the directory to scan.

    Returns:
        A list of paths, each built by joining the directory it was found in
        with the entry name. Entries are visited in ``os.listdir`` order and
        the files of a subdirectory are placed at the position where that
        subdirectory was encountered. Directories themselves are not listed.

    Raises:
        OSError: Propagated from ``os.listdir`` when a directory cannot be read.
    """
    allFiles = []
    for entry in os.listdir(dirName):
        fullPath = os.path.join(dirName, entry)
        if os.path.isdir(fullPath):
            # extend() grows the list in place; the former
            # `allFiles = allFiles + ...` re-copied everything collected so far
            # for each subdirectory.
            allFiles.extend(getListOfFiles(fullPath))
        else:
            allFiles.append(fullPath)
    return allFiles

# Collect every file below the audio directory (absolute, machine-specific path).
listOfFiles = getListOfFiles('/home/nvme/data/vc1/audio/')

# Show the first three paths as a quick sanity check.
listOfFiles[:3]

In [None]:
# Try to read every collected file as WAV and print the paths that fail to parse.
for path in tqdm(listOfFiles):
    try:
        # The decoded data is discarded; only whether the read raises matters here.
        wavfile.read(path)
        
    except ValueError:
        # Only ValueError is reported; any other exception type still propagates.
        print(path)

## False alarms vs False misses plot

In [None]:
# Plot `Pmisses` against `Pfas` (presumably miss vs. false-alarm probabilities, per the section title).
# NOTE(review): neither `Pfas` nor `Pmisses` is defined in the visible notebook — they must come
# from another cell or session; confirm before relying on Restart & Run All.
plt.figure()
plt.plot(Pfas, Pmisses)
# Log-scaled axes are left disabled.
# plt.xscale('log')
# plt.yscale('log')
plt.show()

## Calculating a cosine similarity matrix

In [None]:
def cosine_similarity_matrix_loop(tensor1, tensor2):
    """Pairwise cosine similarities, computed one pair of rows at a time.

    Entry ``[i][j]`` of the returned tensor is the cosine similarity between
    ``tensor1[i]`` and ``tensor2[j]``. This is the slow reference
    implementation used to validate the vectorized variant.
    """
    rows = []
    for vec1 in tensor1:
        row = []
        for vec2 in tensor2:
            row.append(F.cosine_similarity(vec2, vec1, dim=0))
        rows.append(row)
    return torch.Tensor(rows)

def cosine_similarity_matrix_vectorized(tensor1, tensor2, eps=1e-8):
    """Pairwise cosine similarities between the rows of two 2-D tensors.

    Args:
        tensor1: Tensor of shape ``(B1, D)``.
        tensor2: Tensor of shape ``(B2, D)``. ``B2`` may differ from ``B1``
            (the previous implementation reshaped ``tensor2``'s norms with
            ``tensor1``'s row count and failed in that case).
        eps: Lower bound applied to every row norm so that all-zero rows
            yield a similarity of 0 instead of NaN.

    Returns:
        Tensor of shape ``(B1, B2)`` whose entry ``[i, j]`` is the cosine
        similarity between ``tensor1[i]`` and ``tensor2[j]``.
    """
    norm1 = tensor1.norm(dim=1, keepdim=True).clamp_min(eps)  # (B1, 1)
    norm2 = tensor2.norm(dim=1, keepdim=True).clamp_min(eps)  # (B2, 1)
    # (B1, D) @ (D, B2) -> (B1, B2); the norm product broadcasts to the same shape.
    return (tensor1 @ tensor2.t()) / (norm1 * norm2.t())

# Number of (anchor, positive) pairs in the toy batch.
B = 5

# Random 4-dimensional embeddings standing in for network outputs.
anchors = torch.randn(B, 4)
positives = torch.randn(B, 4)

print(anchors)
print()
print(positives)
print(anchors.size(), positives.size())

# Similarity of each matching (anchor, positive) pair; these values are the
# diagonal of the full matrices printed below.
print(F.cosine_similarity(anchors, positives))

# Full similarity matrix, sim[i][j] = cos(anchors[i], positives[j]).
# `sim` stays a notebook-level name because the hard-negative-mining cell reads it.
sim = [[F.cosine_similarity(a, b, dim=0) for a in positives] for b in anchors]
sim = torch.Tensor(sim)
print(sim)
loop_result = cosine_similarity_matrix_loop(anchors, positives)
print(loop_result)
vectorized_result = cosine_similarity_matrix_vectorized(anchors, positives)
print(vectorized_result)

# Compare with the largest absolute difference: a signed sum lets positive and
# negative errors cancel and can report ~0 for matrices that actually differ.
max_abs_diff = (loop_result - vectorized_result).abs().max()
print(max_abs_diff)
assert torch.allclose(loop_result, vectorized_result, atol=1e-5), max_abs_diff

## Hard Negative Mining

In [None]:
# Walk-through of choosing one negative sample per anchor from the similarity matrix.
# Relies on `sim`, `B`, `anchors` and `positives` left in the namespace by the cosine-similarity cell.
print(sim)
# Sort every row from most to least similar, keeping the original column indices.
sim_sorted, sim_sorted_idx = sim.sort(dim=1, descending=True)
print(sim_sorted)
print(sim_sorted_idx)
# Given a sim matrix Dij, if i=j a value corresponds to a similarity between 
# positive pairs -> we need to prevent them from getting to the negative samples
# First, we need to remove i=j elements.
# The right-hand side is a (B, B) matrix whose row i is filled with i, so the mask is
# False exactly where a sorted column index equals its own row number.
mask = (sim_sorted_idx != torch.arange(B).repeat(1, B).view(B, B).t())
# Every row loses exactly one entry (its own index), leaving B-1 candidates per row.
sim_sorted_idx_rm = sim_sorted_idx[mask].view(B, B-1)
print(sim_sorted_idx_rm)
# select the indices for appropriately hard samples
# tau is turned into a column position in the sorted candidates (0 = most similar).
tau = 0.1
idx_threshold = round(tau * (B-2))
# only half of the batch size -> B // 2
# Rows from B // 2 onward take the candidate at the fixed `idx_threshold` column.
hnm_idxs = sim_sorted_idx_rm[B // 2:, idx_threshold]
print(hnm_idxs)
# One random column in [0, B-2] per row: uniform [0, 1) scaled by B-1 and truncated to integer.
idx_threshold_rand = torch.from_numpy(np.random.uniform(size=(B, 1)) * (B-1)).long()
# print(idx_threshold_rand)
# rand_idxs = sim_sorted_idx_rm[:B // 2, idx_threshold_rand]
# gather picks each row's random column; only the first B // 2 rows are kept.
rand_idxs = torch.gather(sim_sorted_idx_rm, dim=1, index=idx_threshold_rand)[:B // 2]
print(rand_idxs)
print(hnm_idxs.shape, rand_idxs.shape)
# Random picks (first B // 2 rows) followed by the fixed-column picks (remaining rows):
# one negative index per anchor, in anchor order.
print(torch.cat([rand_idxs.view(-1), hnm_idxs.view(-1)]))
negatives = positives[torch.cat([rand_idxs.view(-1), hnm_idxs.view(-1)]), :]
print(negatives)
# Stack positives and negatives, duplicate the anchors to match, and label the
# first B rows 1 (positive pair) and the last B rows 0 (negative pair).
pos_n_neg = torch.cat([positives, negatives])
anchors_n_anchors = torch.cat([anchors, anchors])
labels = torch.cat([torch.ones(B), torch.zeros(B)])
print(pos_n_neg)
print(pos_n_neg.shape)
print(anchors_n_anchors)
print(anchors_n_anchors.shape)
print(labels)
print(labels.shape)

## Remove DC component and add a small dither

In [None]:
# Load the 'sin' variable from each .mat file and flatten it to 1-D.
# presumably the same signal before and after a reference DC-removal step — TODO confirm
before_dc = loadmat('before_dc.mat')['sin'].reshape(-1)
after_dc = loadmat('after_dc.mat')['sin'].reshape(-1)

audio_path = 'Y8hIVOBuels_0000002.wav'
# read
rate, samples = wavfile.read(audio_path)

# Compare the mean (DC level) of the three signals.
print(before_dc.mean())
print(after_dc.mean())
print(samples.mean())

# Filter with numerator [1, -1] and denominator [1, -0.99],
# i.e. y[n] = x[n] - x[n-1] + 0.99 * y[n-1].
signal.lfilter([1, -1], [1, -0.99], before_dc)

# Signed sum of the difference between the stored result and the filter output.
# NOTE(review): positive and negative deviations can cancel in a signed sum.
(after_dc - signal.lfilter([1, -1], [1, -0.99], before_dc)).sum()

## Top-1 and Top-5 accuracy

In [None]:
# Toy check of how to test top-1 / top-5 hits for a single sample.
C = 7 # 1251 class num
SIZE = (1, C)
# Random scores standing in for the network output.
probs = torch.rand(SIZE) # net(spec)
# Random ground-truth class index in [0, C).
label = torch.randint(C, size=(1,)).type(torch.LongTensor)
# topk returns (values, indices); [1] keeps the class indices.
pred_top5 = probs.topk(5)[1]
pred_top1 = probs.topk(1)[1]
print(probs)
print(label)
print(pred_top5)
# Membership tests: is the label among the five best / the single best prediction?
print(label in pred_top5.view(5))
print(label in pred_top1.view(1))
# The first top-5 entry is the highest-scoring class, so this is the top-1 comparison again.
print(label == pred_top5.view(5)[0])
if label == pred_top5.view(5)[0]:
    print('it works')

## Overfitting model on a small dataset

In [None]:
## Input (B, 1, 512, 298)
class VoiceNet(nn.Module):
    """CNN classifier for single-channel spectrogram batches.

    Five convolution blocks (each followed by batch-norm and ReLU), a
    convolution spanning the whole remaining height that plays the role of a
    fully connected layer, average pooling over the remaining width, and two
    linear layers that produce one score per class.

    Input is a ``(B, 1, H, W)`` tensor. With the layer settings below an input
    height of 512 is reduced to 9 before ``fc6``, which is what its ``(9, 1)``
    kernel consumes; the width may vary because it is averaged away.

    Args:
        num_classes: Number of output scores per sample.
    """

    def __init__(self, num_classes=2):
        super(VoiceNet, self).__init__()

        self.conv1 = nn.Conv2d(in_channels=1, out_channels=96, kernel_size=7, stride=2, padding=1)
        self.conv2 = nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, stride=2, padding=1)
        self.conv3 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.conv5 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1)

        self.bn1 = nn.BatchNorm2d(num_features=96)
        self.bn2 = nn.BatchNorm2d(num_features=256)
        self.bn3 = nn.BatchNorm2d(num_features=256)
        self.bn4 = nn.BatchNorm2d(num_features=256)
        self.bn5 = nn.BatchNorm2d(num_features=256)
        self.bn6 = nn.BatchNorm2d(num_features=4096)
        self.bn7 = nn.BatchNorm1d(num_features=1024)

        self.relu = nn.ReLU()

        self.mpool1 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.mpool2 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.mpool5 = nn.MaxPool2d(kernel_size=(5, 3), stride=(3, 2))

        # Conv2d with weights of size (H, 1) is identical to FC with H weights
        self.fc6 = nn.Conv2d(in_channels=256, out_channels=4096, kernel_size=(9, 1))
        self.fc7 = nn.Linear(in_features=4096, out_features=1024)
        self.fc8 = nn.Linear(in_features=1024, out_features=num_classes)

    def forward(self, x):
        """Return raw class scores (logits) of shape ``(B, num_classes)``.

        Args:
            x: Input batch of shape ``(B, 1, H, W)``.
        """
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.mpool1(x)
        x = self.relu(self.bn2(self.conv2(x)))
        x = self.mpool2(x)
        x = self.relu(self.bn3(self.conv3(x)))
        x = self.relu(self.bn4(self.conv4(x)))
        x = self.relu(self.bn5(self.conv5(x)))
        x = self.mpool5(x)
        x = self.relu(self.bn6(self.fc6(x)))

        # Average over whatever width is left. This is done functionally:
        # assigning a fresh nn.AvgPool2d to `self` here would register a new
        # submodule on every forward pass.
        x = F.avg_pool2d(x, kernel_size=(1, x.size(3)))

        x = x.view(x.size(0), -1)
        x = self.relu(self.bn7(self.fc7(x)))

        # No softmax here because CrossEntropyLoss applies it internally.
        # NOTE: this used to be guarded by `if self.train:`, but `train` is a
        # bound method and therefore always truthy, so logits were returned in
        # both training and eval mode. That behaviour is kept; whether to apply
        # softmax at inference (`self.training` is False) is still undecided.
        return self.fc8(x)
    
# Batch size baked into `model_input_size` below.
B = 3
# Shapes of the three synthetic class tensors: 100 samples of shape (1, 512, 298) each.
SIZE0 = (100, 1, 512, 298)
SIZE1 = (100, 1, 512, 298)
SIZE2 = (100, 1, 512, 298)
# Shape of one mini-batch; captures the value `B` has when this cell runs.
model_input_size = (B, 1, 512, 298)

In [None]:
## Input (B, 3, 32, 32)
class VoiceNet(nn.Module):
    """VGG16-style convolutional classifier.

    Implementation Ref: https://github.com/kuangliu/pytorch-cifar

    The five 2x2 max-pool stages halve the spatial size each time, so a 32x32
    input is reduced to 512 feature maps of size 1x1, matching the 512 input
    features of the linear classifier.

    Args:
        num_classes: Number of output scores per sample. Required; the
            ``None`` default only exists for signature compatibility.
        in_channels: Number of channels of the input images.

    Raises:
        TypeError: If ``num_classes`` is left as ``None``.
    """

    def __init__(self, num_classes=None, in_channels=3):
        super(VoiceNet, self).__init__()
        if num_classes is None:
            # Fail early with a readable message instead of an opaque error
            # from inside nn.Linear.
            raise TypeError('num_classes must be an integer, got None')
        VGG16 = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M',
                 512, 512, 512, 'M', 512, 512, 512, 'M']
        self.features = self._make_layers(VGG16, in_channels)
        self.classifier = nn.Linear(512, num_classes)

    def forward(self, x):
        """Return raw class scores of shape ``(B, num_classes)``."""
        out = self.features(x)
        # Flatten everything except the batch dimension.
        out = out.view(out.size(0), -1)
        return self.classifier(out)

    def _make_layers(self, cfg, in_channels=3):
        """Build the feature extractor described by ``cfg``.

        Args:
            cfg: Sequence in which an integer adds a 3x3 convolution with that
                many output channels (followed by batch-norm and ReLU) and
                ``'M'`` adds a 2x2 max-pool.
            in_channels: Number of channels fed to the first convolution.

        Returns:
            An ``nn.Sequential`` containing the layers in order.
        """
        layers = []
        for x in cfg:
            if x == 'M':
                layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
            else:
                layers.extend([nn.Conv2d(in_channels, x, kernel_size=3, padding=1),
                               nn.BatchNorm2d(x),
                               nn.ReLU(inplace=True)])
                in_channels = x
        # A 1x1 average pool with stride 1 leaves the tensor unchanged; it is
        # kept so the layer list stays identical to the referenced implementation.
        layers.append(nn.AvgPool2d(kernel_size=1, stride=1))
        return nn.Sequential(*layers)
    

# Shapes of the three synthetic class tensors: 100 samples of shape (3, 32, 32) each.
SIZE0 = (100, 3, 32, 32)
SIZE1 = (100, 3, 32, 32)
SIZE2 = (100, 3, 32, 32)
# Shape of one mini-batch. `B` is not assigned in this cell, so this captures whatever
# value `B` has in the kernel when the cell runs.
model_input_size = (B, 3, 32, 32)

In [None]:
# Experiment configuration. The paths are absolute and machine-specific.
DATASET_PATH = '/home/nvme/data/vc1/'
LOG_PATH = '/home/nvme/logs/VoxCeleb/_model_overfit_test'
EPOCH_NUM = 30
# Batch size.
B = 96
WEIGHT_DECAY = 5e-4
# Learning rate at the first and at the last epoch.
LR_INIT = 1e-2
LR_LAST = 1e-4
# lr scheduler parameter
# Per-epoch multiplicative decay chosen so that LR_INIT * gamma ** (EPOCH_NUM - 1) == LR_LAST.
gamma = 10 ** (np.log10(LR_LAST / LR_INIT) / (EPOCH_NUM - 1))
MOMENTUM = 0.9
DEVICE = 'cuda:1'
NUM_WORKERS = 4
EVAL_THRESHOLD = 0.5
# TensorBoard writer; event files are written under LOG_PATH.
TBoard = tensorboardX.SummaryWriter(log_dir=LOG_PATH)

In [None]:
# Overfit sanity check: train on three synthetic, trivially separable classes.
# Relies on VoiceNet, SIZE0/SIZE1/SIZE2 and model_input_size from the model cell
# and on the constants from the configuration cell.
net = VoiceNet(num_classes=3)
net.to(DEVICE)

criterion = nn.CrossEntropyLoss()
# NOTE(review): the learning rate is hard-coded to 1e-7 instead of LR_INIT, and
# the scheduler below is never stepped (an `lr_scheduler.step()` call at the top
# of the epoch loop was commented out), so `gamma` currently has no effect.
# Behaviour is kept as-is; confirm whether that is intended.
optimizer = optim.SGD(net.parameters(), 1e-7, MOMENTUM, weight_decay=WEIGHT_DECAY)
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=gamma)

# Mini-batch size for this experiment (overrides the value from the configuration cell).
B = 3

# Three classes with non-overlapping value ranges: [-20, -19), [20, 21) and [0, 1).
class0_data = torch.rand(SIZE0) - 20
class1_data = torch.rand(SIZE1) + 20
class2_data = torch.rand(SIZE2)

labels0 = torch.zeros(100).type(torch.LongTensor)
labels1 = torch.ones(100).type(torch.LongTensor)
labels2 = torch.ones(100).type(torch.LongTensor) * 2

dataset = torch.cat([class0_data, class1_data, class2_data])
datalabels = torch.cat([labels0, labels1, labels2])

# Shuffle samples and labels with the same permutation.
shuffling_idxs = torch.randperm(len(dataset))
dataset = dataset[shuffling_idxs]
datalabels = datalabels[shuffling_idxs]

# Derive the step count and per-sample shape from the current `B` rather than
# from a literal 3 or from the batch size frozen into `model_input_size` when
# its cell was run.
steps_per_epoch = len(dataset) // B
sample_shape = tuple(model_input_size[1:])

for epoch_num in range(EPOCH_NUM):
    # train
    net.train()

    for i in tqdm(range(steps_per_epoch)):
        batch = slice(i * B, (i + 1) * B)
        labels = datalabels[batch].view(B)
        specs = dataset[batch].view(B, *sample_shape)
        optimizer.zero_grad()
        labels, specs = labels.to(DEVICE), specs.to(DEVICE)
        # `probs` holds the raw network outputs that CrossEntropyLoss consumes.
        probs = net(specs)
        loss = criterion(probs, labels)
        loss.backward()
        optimizer.step()

        # TBoard: global step index across epochs.
        step_num = epoch_num * steps_per_epoch + i
        TBoard.add_scalar('TrainLoss', loss.item(), step_num)

## Datasets debug

In [None]:
# Smoke test of the dataset / DataLoader pipeline: build both splits and print a few batches.
# NOTE(review): `Normalize` and `ToTensor` are neither imported nor defined in the visible
# notebook, and the IdentificationDataset defined in this notebook accepts no `transform`
# argument — this cell presumably targets a different version of the class; confirm.
transforms = Compose([
    Normalize(),
    ToTensor()
])

trainset = IdentificationDataset(DATASET_PATH, train=True, transform=transforms)
trainsetloader = torch.utils.data.DataLoader(trainset, batch_size=3)

testset = IdentificationDataset(DATASET_PATH, train=False, transform=transforms)
testsetloader = torch.utils.data.DataLoader(testset, batch_size=1)

# Print the first four training batches (the loop breaks after i == 3).
for i, a in enumerate(trainsetloader, 0):
    labels, specs = a
    print(labels, specs)
    if i > 2:
        break

# Print the first twelve test batches (the loop breaks after i == 11).
for i, a in enumerate(testsetloader, 0):
    labels, specs = a
    print(labels, specs)
    if i > 10:
        break

## Spectrogram comparison

In [None]:
# Compute a magnitude spectrogram of one WAV file so it can be compared with a stored reference.
audio_path = 'Y8hIVOBuels_0000002.wav'
# read
rate, samples = wavfile.read(audio_path)

## parameters
window = 'hamming'
# window width and step size (milliseconds; converted to samples with rate * 1e-3 below)
Tw = 25
Ts = 10
# frame duration (samples)
Nw = int(rate * Tw * 1e-3)
# overlapped duration (samples)
Ns = int(rate * (Tw - Ts) * 1e-3)
# FFT size: Nw rounded up to the next power of 2
nfft = 2 ** (Nw - 1).bit_length()
pre_emphasis = 0.97

# preemphasis filtering: y[0] = x[0], y[n] = x[n] - pre_emphasis * x[n-1]
samples = np.append(samples[0], samples[1:] - pre_emphasis * samples[:-1])

# removes DC component of the signal and add a small dither
samples = signal.lfilter([1, -1], [1, -0.99], samples)
# Dither: uniform noise in [-1, 1) scaled to 1e-6 of the signal's standard deviation.
# No random seed is set, so the result differs slightly between runs.
dither = np.random.uniform(-1, 1, samples.shape)
spow = np.std(samples)
samples = samples + 1e-6 * spow * dither

# spectrogram: positional arguments are window, segment length (Nw), overlap (Ns) and FFT size;
# return_onesided=False keeps the two-sided spectrum.
frequencies, times, spectrogram = signal.spectrogram(samples, rate, window, Nw, Ns, nfft, 
                                mode='magnitude', return_onesided=False)

# NOTE(review): 1600 is an unexplained scale factor — presumably chosen to match the
# amplitude of the reference spectrogram; confirm.
spectrogram *= 1600

print(spectrogram.shape)

In [None]:
# Scratch: inspect uniform noise in [-1, 1) with the same shape as `samples`.
np.random.uniform(-1, 1, samples.shape)

In [None]:
# Heat map of the computed spectrogram over time and frequency.
fig, ax = plt.subplots(figsize=(10, 4))
mesh = ax.pcolormesh(times, frequencies, spectrogram, cmap=plt.cm.jet)
ax.set_ylabel('Frequency [Hz]')
ax.set_xlabel('Time [sec]')
fig.colorbar(mesh, ax=ax)
print(spectrogram.shape)
plt.show()

In [None]:
# Load the stored spectrogram file; the cells below read its 'SPEC' variable.
mat = loadmat('SPEC.mat')

In [None]:
# Heat map of the stored 'SPEC' matrix on the same time / frequency axes.
fig, ax = plt.subplots(figsize=(10, 4))
mesh = ax.pcolormesh(times, frequencies, mat['SPEC'], cmap=plt.cm.jet)
ax.set_ylabel('Frequency [Hz]')
ax.set_xlabel('Time [sec]')
fig.colorbar(mesh, ax=ax)
# Prints the shape of the computed spectrogram, not of the stored matrix.
print(spectrogram.shape)
plt.show()

In [None]:
# Scatter column 200 of the stored matrix against the same column of the computed spectrogram;
# points on a straight line through the origin would mean the two differ only by a scale factor.
plt.figure(figsize=(10, 4))
plt.plot(mat['SPEC'][:, 200], spectrogram[:, 200], 'bo')

In [None]:
# Element-wise ratio of computed to stored values for column 123.
spectrogram[:, 123] / mat['SPEC'][:, 123]

In [None]:
# Show the computed spectrogram as a raw matrix image (trailing ';' suppresses the repr).
plt.matshow(spectrogram);

In [None]:
# Show the stored 'SPEC' matrix as a raw matrix image for visual comparison.
plt.matshow(mat['SPEC']);

## Draft dataset (written when I thought the model also used video frames)

In [None]:
class IdentificationDataset(Dataset):
    """Draft dataset over the tracks listed in ``iden_split.txt``.

    Unfinished: ``__getitem__`` only builds (and prints) the paths it would
    load and returns the placeholder value ``1``.
    """
    
    def __init__(self, path, train):
        """Read the split file and keep the tracks of the requested phase(s).

        Args:
            path: Dataset root; must contain ``iden_split.txt``.
            train: If truthy keep phases 1 and 2, otherwise phase 3.
        """
        iden_split_path = os.path.join(path, 'iden_split.txt')
        # Space-separated file without a header row: phase number, then track path.
        split = pd.read_table(iden_split_path, sep=' ', header=None, names=['phase', 'path'])
        
        if train:
            phases = [1, 2]
        
        else:
            phases = [3]
            
        mask = split['phase'].isin(phases)
        # Re-index from 0 so __getitem__ can look tracks up by position.
        self.dataset = split['path'][mask].reset_index(drop=True)
        self.path = path
        self.train = train
        # Debug output.
        print(self.dataset.head(10))
    
    def __len__(self):
        """Return the number of selected tracks."""
        return len(self.dataset)
    
    def __getitem__(self, idx):
        """Build the frame and audio paths for track ``idx``.

        Currently returns the constant ``1``; nothing is loaded yet.
        """
        track_path = self.dataset[idx]
        print(track_path)
        
        ## FACE
        # Per-track text file under 'video': same relative path with '.wav' replaced by '.txt'.
        frames_path = os.path.join(self.path, 'video', track_path.replace('.wav', '.txt'))
        # Skip the first 6 lines and read only the 'FRAME ' column (the header text includes a trailing space).
        frames_table = pd.read_table(frames_path, skiprows=6, usecols=['FRAME '])
        # Positions of frame numbers that are multiples of 25
        # (presumably one frame per second at 25 fps — TODO confirm).
        mask = np.where(frames_table.values % 25 == 0)
        # Note: only 20 per each face-track (see the asterisk on the project page)
        # frames_1fps = frames_table[mask]
        frames_1fps = frames_table.values[mask][:20]
        print(frames_1fps)
        # Pick one of the candidate frames at random (no seed is set here).
        selected_frame = np.random.choice(frames_1fps)
        print(selected_frame)
        # Frame image name: the frame number zero-padded to 7 digits.
        selected_frame_filename = '{0:07d}.jpg'.format(selected_frame)
        # cut off filename and extension. Add selected filename
        # ([:-10] drops the last 10 characters of the track path)
        selected_frame_path = os.path.join(self.path, 'video', track_path[:-10], selected_frame_filename)
        print(selected_frame_path)
        
        
        # load this frame
        
        ## AUDIO
        audio_path = os.path.join(self.path, 'audio', track_path)
        
        # Placeholder return value; neither the frame nor the audio is read.
        return 1