In [1]:
import torch
import torchaudio
import torchvision
import pandas as pd
import numpy as np
import glob
import os
import re
import matplotlib.pyplot as plt
import torchaudio.transforms as T
import json
import torch.nn.functional as F
from torch import nn
import librosa
from torch.utils.tensorboard import SummaryWriter

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
import s3fs
fs = s3fs.S3FileSystem()

In [4]:
train_mean = 2.5853811548103334
train_std = 48.60284136954955
normalize_t = torchvision.transforms.Normalize(mean=train_mean, std=train_std)


class ReconstructedFakeSoundDataset(torch.utils.data.Dataset):
    def __init__(self, data_dir, transform=None, split="train", **transform_args):
        self.split = split
        self.data_dir = data_dir
        with fs.open(f"{data_dir}/meta_data/{split}_reconstructed.json") as m:
            d = json.load(m)
            self.metadata = d["audios"]
        self.transform = transform
        self.precision = 0.01
        
        self.transform_args = transform_args

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, idx):
        m = self.metadata[idx]
        filepath = m["filepath"]
        label = m["label"]
        filepath = f"s3://bukovec-ml-data/FakeAudio/{filepath[25:]}"
        audio = None
        sample_rate = None
        # with fs.open(filepath) as d:
        #     audio, sample_rate = torchaudio.load(d)
        #     if sample_rate != 32000:
        #         # print(f"WARNING - audio {m['audio_id']} resampled from {sample_rate} to 32khz")
        #         audio = torchaudio.functional.resample(audio, sample_rate, 32000)
        #         sample_rate = 32000
        with fs.open(filepath) as d:
            audio, sample_rate = librosa.load(d, sr=32000)
        onset, offset = (float(x) for x in m["onset_offset"].split("_"))
        segment = torch.zeros(int(10 / self.precision))
        segment[int(onset / self.precision): int(offset/self.precision)] = 1.0
        if self.transform:
            audio = self.transform(torch.tensor(audio).unsqueeze(0), sample_rate, **self.transform_args)
        # print(f"{audio_name} {audio_path} {label}")
        return (audio, torch.tensor(label, dtype=torch.float32), segment, {
            "onset": onset,
            "offset": offset,
            "audio_id": m["audio_id"]
        })
    
mel_spectrogram_32khz = T.MelSpectrogram(
        sample_rate=32000,
        n_fft=2048,
        hop_length=320,
        center=True,
        pad_mode="reflect",
        power=2.0,
        norm="slaney",
        n_mels=300,
        mel_scale="htk",
        f_min=0,
        f_max = 32000/2
    )

def transform(audio, sample_rate, n_fft=2048, hop_length=160, n_mels=300, win_length=None):
    if sample_rate == 32000:
        m = mel_spectrogram_32khz
    else:
        m = T.MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=2048,
        win_length=None,
        hop_length=320,
        center=True,
        pad_mode="reflect",
        power=2.0,
        norm="slaney",
        n_mels=300,
        mel_scale="htk",
        f_min=0,
        f_max = sample_rate/2
    )
    # s = spectrogram(audio)
    s = m(audio)
    s = normalize_t(s)
    # this truncates the last 0.005 seconds, but idrc
    s = F.pad(s, (0, 1000 - s.shape[2]))
    # if s.shape != (1, 300, 2000):
    #     print("ruh roh")
    return s



In [5]:
train_set = ReconstructedFakeSoundDataset('s3://bukovec-ml-data/FakeAudio', transform=transform, split='train')

In [6]:
train_set[0][0].shape

torch.Size([1, 300, 1000])

In [7]:
class FinetunedMobileNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.channel_up = nn.Conv2d(1, 3, kernel_size=1, stride=1, padding=0)
        self.backbone = torchvision.models.segmentation.deeplabv3_mobilenet_v3_large(weights='DEFAULT')
        self.backbone.classifier[4] = nn.Conv2d(256, 1, kernel_size=1, stride=1, padding=0)
        
        self.classifier = nn.Linear(300 * 1000, 1)
        self.segmentation = nn.Sequential(nn.Dropout(p=0.3), nn.Linear(300 * 1000, 1000), nn.ReLU(), nn.Dropout(p=0.3), nn.Linear(1000, 1000))

    def forward(self, x):
        B, C, H, W = x.shape
        x = self.channel_up(x)
        x = self.backbone(x)['out']
        x = x.view(B, 1 * H * W)
        c = self.classifier(x)
        c = torch.sigmoid(c)
        s = self.segmentation(x)
        s = torch.sigmoid(s)
        return c, s

In [8]:
model = FinetunedMobileNet()

In [9]:
model = model.to(device)

In [10]:
train_loader = torch.utils.data.DataLoader(train_set, batch_size=16, shuffle=True)

In [11]:
class_criterion = nn.BCELoss()
# segmentation_criterion = torchvision.ops.sigmoid_focal_loss
segmentation_criterion = nn.BCELoss()
alpha = 0.3 # class loss weight 
torch.autograd.set_detect_anomaly(True)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
model.train()
writer = SummaryWriter()
for epoch in range(10):  
    print(f"Epoch {epoch}")
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        inputs, labels, segments, _ = data
        inputs = inputs.to(device)
        labels = labels.to(device)
        segments = segments.to(device)
        optimizer.zero_grad()
        labels_, segments_ = model(inputs)
        class_loss = class_criterion(labels_.view(-1), labels)
        # segmentation_loss = segmentation_criterion(segments_, segments, alpha=0.4)
        segmentation_loss = segmentation_criterion(segments_, segments)
        
        loss = alpha * class_loss + (1-alpha) * segmentation_loss
        loss.backward()
        optimizer.step()
        writer.add_scalar("Batch loss/train", loss.item(), epoch * len(train_loader) + i)
        running_loss += loss.item()
            
    writer.add_scalar("Loss/train", loss.item(), epoch)
    writer.add_scalar("Running loss/epoch", running_loss, epoch)
    writer.flush()
        
print('Finished Training')
writer.flush()

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Finished Training


In [12]:
torch.save(model.state_dict(), 'mobilenet-baseline.pth')

In [14]:
easy_set = ReconstructedFakeSoundDataset('s3://bukovec-ml-data/FakeAudio', split='easy', transform=transform)
hard_set = ReconstructedFakeSoundDataset('s3://bukovec-ml-data/FakeAudio', split='hard', transform=transform)
dev_set = torch.utils.data.ConcatDataset([easy_set, hard_set])
dev_loader = torch.utils.data.DataLoader(dev_set, batch_size=16)
test_set = ReconstructedFakeSoundDataset('s3://bukovec-ml-data/FakeAudio', split='zeroshot', transform=transform)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=16, shuffle=True)

In [None]:
model.eval()

In [20]:
def segmentation_metrics(pred, target):
    tp = (pred * target).sum().item()
    fp = ((pred == 1) & (target == 0)).sum().item()
    fn = ((pred == 0) & (target == 1)).sum().item()

    return tp, fp, fn
    
def eval_model(model, dataset):
    easy_loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)
    model.eval()
    class_correct = 0
    n_samples = 0
    seg_tp = 0
    seg_fp = 0 
    seg_fn = 0
    with torch.no_grad():
        for j, vdata in enumerate(easy_loader, 0):
            vinputs, vclass, vseg, _ = vdata
            vinputs = vinputs.to(device)
            vclass = vclass.to(device)
            vseg = vseg.to(device)
            vclass_, vseg_ = model(vinputs)
            vseg_ = torch.sigmoid(vseg_)
            vclass_ = (vclass_.view(-1) > 0.5).float()
            vseg_ = (vseg_ > 0.5).float()
            class_correct += (vclass_ == vclass).sum()
            n_samples += vclass_.shape[0]
            tp, fp, fn = segmentation_metrics(vseg_, vseg)
            seg_tp += tp
            seg_fp += fp
            seg_fn += fn

    class_acc = class_correct/n_samples
    seg_prec = seg_tp / (seg_tp + seg_fp)
    seg_recall = seg_tp / (seg_tp + seg_fn)
    seg_f1 = 2 / ((1 / seg_prec) + (1/seg_recall))
    score = alpha * class_acc + (1-alpha) * seg_f1
    print(f"Class Accuracy = {class_correct} / {n_samples} = {class_acc.item()}, Segment Precision = {seg_prec}, Segment Recall = {seg_recall}. Segment F1 = {seg_f1}")
    print(f"Score = {score}")

    return score, class_acc, seg_prec, seg_recall, seg_f1

eval_model(model, dev_set)

Class Accuracy = 200 / 343 = 0.5830904245376587, Segment Precision = 0.1332918795375101, Segment Recall = 0.9899648019171722. Segment F1 = 0.23494944886974603
Score = 0.3393917381763458


(tensor(0.3394, device='cuda:0'),
 tensor(0.5831, device='cuda:0'),
 0.1332918795375101,
 0.9899648019171722,
 0.23494944886974603)

In [21]:
eval_model(model, test_set)

Class Accuracy = 125 / 257 = 0.48638132214546204, Segment Precision = 0.12295171064431099, Segment Recall = 0.9493285774234158. Segment F1 = 0.21770720558161916
Score = 0.29830944538116455


(tensor(0.2983, device='cuda:0'),
 tensor(0.4864, device='cuda:0'),
 0.12295171064431099,
 0.9493285774234158,
 0.21770720558161916)