# Processing for local training

In [1]:
import os
#!tar -xf data/for-rerec.tar.gz -C data/

In [2]:
#ls data/for-rerecorded/

In [3]:
# !pip3 install torch torchvision librosa matplotlib tqdm pandas

In [4]:
import torch
print(torch.cuda.is_available())

from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

import torchvision
import torchvision.transforms as transforms
from torchvision import models

import numpy as np
import matplotlib.pyplot as plt
import librosa
import os
from tqdm import tqdm
from sklearn.metrics import roc_curve
import pandas as pd
import shutil
import zipfile

True


# Data Preprocessing

In [5]:
# Sampling rate (Hz) that every clip is resampled to when it is loaded.
SAMPLE_RATE = 16000  # Sampling rate
# Number of mel bands requested for the mel spectrogram.
N_MELS = 128

In [6]:
# TODO: Make it so each output is "513-dimensional" as with the reference paper
#
# https://arxiv.org/pdf/2203.16263

def compute_spectrograms(path):
    """Load one audio clip and return its CQT, log-STFT and mel spectrograms.

    The clip is loaded at ``SAMPLE_RATE`` and forced to exactly two seconds:
    shorter clips are zero-padded at the end, longer ones are truncated.

    Args:
        path: Path of the audio file to load.

    Returns:
        Tuple ``(cqt_spec, log_spec, mel_spec)`` of dB-scaled spectrograms,
        each expressed relative to its own maximum.
    """
    signal, rate = librosa.load(path, sr=SAMPLE_RATE)

    # Normalise the clip length so every spectrogram has the same width.
    target_len = 2 * SAMPLE_RATE
    missing = target_len - len(signal)
    if missing > 0:
        signal = np.pad(signal, (0, missing))
    else:
        signal = signal[:target_len]

    cqt_spec = librosa.amplitude_to_db(
        np.abs(librosa.cqt(signal, sr=rate)), ref=np.max
    )
    log_spec = librosa.amplitude_to_db(
        np.abs(librosa.stft(signal)), ref=np.max
    )
    mel_spec = librosa.power_to_db(
        librosa.feature.melspectrogram(y=signal, sr=rate, n_mels=N_MELS),
        ref=np.max,
    )

    return cqt_spec, log_spec, mel_spec

In [7]:
# Source audio folders of the FoR re-recorded dataset, keyed "<split>_<label>".
# Insertion order is: all "fake" splits first, then all "real" splits.
data_dirs = {
    f"{split}_{label}": f"data/for-rerecorded/{split}/{label}/"
    for label in ("fake", "real")
    for split in ("training", "testing", "validation")
}

In [8]:
def process_directory(directory, output_dir):
    """Compute and save the spectrograms of every ``.wav`` file in a folder.

    For each ``<name>.wav`` in ``directory`` three arrays are written to
    ``output_dir``: ``<name>_cqt.npy``, ``<name>_log.npy`` and
    ``<name>_mel.npy``. ``output_dir`` is created if it does not exist.

    Args:
        directory: Folder containing the source audio files.
        output_dir: Folder the ``.npy`` spectrograms are written to.
    """
    os.makedirs(output_dir, exist_ok=True)

    # The progress bar counts every directory entry, not only the .wav files.
    for entry in tqdm(os.listdir(directory)):
        if not entry.endswith('.wav'):
            continue

        spectrograms = compute_spectrograms(os.path.join(directory, entry))
        stem = os.path.splitext(entry)[0]

        # compute_spectrograms returns the arrays in (cqt, log, mel) order.
        for tag, spec in zip(("cqt", "log", "mel"), spectrograms):
            np.save(f"{output_dir}/{stem}_{tag}.npy", spec)

# Set to True to (re)generate the spectrogram .npy files for every FoR split.
compute_specs = False
if compute_specs:
    for set_name in data_dirs:
        directory = data_dirs[set_name]
        output_dir = f'data/spectrograms/{set_name}_spectrograms'
        process_directory(directory, output_dir)
        print(f"Processed {set_name} set.")


In [9]:
# Sanity check: load the three saved spectrograms of one training clip and
# print the shape of each.
sample_prefix = "data/spectrograms/training_fake_spectrograms/recording1.wav_norm_mono"
display_cqt = f"{sample_prefix}_cqt.npy"
display_log = f"{sample_prefix}_log.npy"
display_mel = f"{sample_prefix}_mel.npy"

cqt_test, log_test, mel_test = (
    np.load(sample_path) for sample_path in (display_cqt, display_log, display_mel)
)
for spec in (cqt_test, log_test, mel_test):
    print(spec.shape)

# Feature counts per spectrogram type; LSTMSpectrogram uses these as its
# LSTM input size.
cqt_size = 84
log_size = 1025
mel_size = 128


(84, 86)
(1025, 86)
(128, 86)


# Define models

In [10]:
class ResNet50Spectrogram(nn.Module):
    """Randomly initialised ResNet-50 for single-channel input and two classes."""

    def __init__(self):
        super().__init__()

        backbone = models.resnet50(weights=None)

        # Replace the first convolution with a 1-input-channel layer that
        # keeps the original output channels, kernel, stride and padding.
        stem = backbone.conv1
        backbone.conv1 = nn.Conv2d(
            1,
            stem.out_channels,
            kernel_size=stem.kernel_size,
            stride=stem.stride,
            padding=stem.padding,
            bias=False,
        )

        # Replace the final fully connected layer with a two-output layer.
        backbone.fc = nn.Linear(backbone.fc.in_features, 2)

        self.model = backbone

    def forward(self, x):
        """Return the two-class output of the wrapped network for ``x``."""
        return self.model(x)

In [11]:
class EfficientNetSpectrogram(nn.Module):
    """Randomly initialised EfficientNet for single-channel input and two classes.

    Args:
        model_type: EfficientNet variant to build, e.g. ``"b0"``. Must be one
            of ``SUPPORTED_VARIANTS``.

    Raises:
        ValueError: If ``model_type`` is not a supported variant. Previously an
            unsupported value left the network unset and failed later with an
            unrelated ``AttributeError``.
    """

    # Variant suffixes of the torchvision ``efficientnet_<variant>`` builders.
    SUPPORTED_VARIANTS = ("b0", "b1", "b2", "b3", "b4", "b5", "b6", "b7")

    def __init__(self, model_type):
        super().__init__()

        if model_type not in self.SUPPORTED_VARIANTS:
            raise ValueError(
                f"Unsupported EfficientNet variant {model_type!r}; "
                f"expected one of {self.SUPPORTED_VARIANTS}"
            )

        build_network = getattr(models, f"efficientnet_{model_type}")
        self.enet = build_network(weights=None, num_classes=2)

        # We need to change the network to accept 1 channel instead of
        # 3 because of our data. Everything else about the first
        # convolution is copied from the original layer.
        original_conv = self.enet.features[0][0]
        self.enet.features[0][0] = nn.Conv2d(
            in_channels=1,
            out_channels=original_conv.out_channels,
            kernel_size=original_conv.kernel_size,
            stride=original_conv.stride,
            padding=original_conv.padding,
            bias=False,
        )

    def forward(self, x):
        """Return the two-class output of the wrapped network for ``x``."""
        return self.enet(x)

In [12]:
class LSTMSpectrogram(nn.Module):
    """Two-layer LSTM followed by a linear layer producing one value per sample.

    The LSTM input size is picked from the module-level ``feature_type``
    setting ("cqt", "log" or "mel") using ``cqt_size`` / ``log_size`` /
    ``mel_size``.

    Raises:
        ValueError: If ``feature_type`` is not one of the three known
            features. Previously this left ``self.lstm`` undefined and only
            failed later inside ``forward``.
    """

    def __init__(self):
        super().__init__()

        self.nlayer = 2
        self.nhiddens = 256

        input_sizes = {"cqt": cqt_size, "log": log_size, "mel": mel_size}
        if feature_type not in input_sizes:
            raise ValueError(
                f"Unknown feature_type {feature_type!r}; "
                f"expected one of {sorted(input_sizes)}"
            )

        self.lstm = nn.LSTM(
            input_size=input_sizes[feature_type],
            hidden_size=self.nhiddens,
            num_layers=self.nlayer,
            batch_first=True,
            dropout=0.3,
        )
        self.fc = nn.Linear(self.nhiddens, 1)

    def forward(self, x):
        """Return one value per sample for a batch of spectrograms.

        Args:
            x: Batch with a singleton dimension at index 1, as produced by
                ``SpecDataset`` / ``dynamic_collate``.

        Returns:
            Tensor with the trailing singleton dimension of the linear layer
            removed.
        """
        x = x.squeeze(1)
        # features are in wrong order for lstm
        x = x.transpose(1, 2)

        _, (h_n, _) = self.lstm(x)

        # The first axis of h_n indexes the LSTM layers; keep the last layer.
        # This holds for one layer as well as for several.
        last_hidden = h_n[-1]

        return self.fc(last_hidden).squeeze(-1)

# Training procedures

In [13]:
class SpecDataset(Dataset):
    """In-memory dataset of saved spectrograms labelled real (1) or fake (0).

    Files are read from ``<cwd>/data/spectrograms/<split>_real_spectrograms``
    and ``<cwd>/data/spectrograms/<split>_fake_spectrograms``; only files whose
    name ends with ``f"{data_type}.npy"`` are used.

    The module-level settings ``model_type``, ``resizing`` and ``dims_resize``
    are read at construction time: labels are floats when ``model_type`` is
    "LSTM" (ints otherwise), and each spectrogram is bilinearly resized to
    ``dims_resize`` when ``resizing`` is truthy.

    Args:
        data_type: one of 'cqt', 'log', 'mel'.
        loader_type: one of 'train', 'validation', 'test', 'ITWtest'.

    Raises:
        ValueError: If ``loader_type`` is not a known split. Previously an
            unknown value fell through and listed the current working
            directory instead.
    """

    # Folder-name prefix used on disk for each accepted loader type.
    _SPLIT_PREFIXES = {
        "train": "training",
        "validation": "validation",
        "test": "testing",
        "ITWtest": "ITWtest",
    }

    def __init__(self, data_type, loader_type):
        if loader_type not in self._SPLIT_PREFIXES:
            raise ValueError(
                f"Unknown loader_type {loader_type!r}; "
                f"expected one of {sorted(self._SPLIT_PREFIXES)}"
            )
        prefix = self._SPLIT_PREFIXES[loader_type]

        data_root = os.path.join(os.getcwd(), 'data/spectrograms')
        real_folder = os.path.join(data_root, f'{prefix}_real_spectrograms')
        fake_folder = os.path.join(data_root, f'{prefix}_fake_spectrograms')

        suffix = f"{data_type}.npy"

        real_files = self._list_files(real_folder, suffix)
        print(f"Real examples for {data_type} {loader_type}: {len(real_files)}")

        fake_files = self._list_files(fake_folder, suffix)
        print(f"Fake examples for {data_type} {loader_type}: {len(fake_files)}")

        # The LSTM path uses float labels, the other models integer labels.
        float_labels = model_type == "LSTM"
        label_val_true = float(1) if float_labels else 1
        label_val_false = float(0) if float_labels else 0

        # load the data into memory
        #
        # if we need to work with a larger dataset, you might need to
        # alter this to be lazy loading instead.
        self.data = []
        for real_file in real_files:
            self.data.append((self._load_spectrogram(real_file), label_val_true))
        for fake_file in fake_files:
            self.data.append((self._load_spectrogram(fake_file), label_val_false))

    @staticmethod
    def _list_files(folder, suffix):
        """Return the sorted paths of regular files in ``folder`` ending with ``suffix``."""
        paths = []
        # Sorted so the sample order does not depend on the filesystem.
        for filename in sorted(os.listdir(folder)):
            path = os.path.join(folder, filename)
            if filename.endswith(suffix) and os.path.isfile(path):
                paths.append(path)
        return paths

    @staticmethod
    def _load_spectrogram(path):
        """Load one ``.npy`` file as a tensor with a leading singleton dimension."""
        spec = torch.tensor(np.load(path)).unsqueeze(0)
        if resizing:
            # F.interpolate needs an extra leading batch dimension.
            spec = F.interpolate(
                spec.unsqueeze(0),
                size=dims_resize,
                mode='bilinear',
                align_corners=False,
            ).squeeze(0)
        return spec

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # return the data and the label
        return self.data[idx]

In [14]:
def dynamic_collate(batch):
    """Collate ``(spectrogram, label)`` pairs into one padded batch.

    Every spectrogram is zero-padded at the end of its last axis (index 2)
    to the longest length in the batch, then all are stacked along a new
    leading dimension. When all items already share the same length, as is
    the case after resizing, no padding is applied, so this function no
    longer needs to consult the global ``resizing`` flag.

    Args:
        batch: Sequence of ``(tensor, label)`` pairs; the tensors need at
            least three dimensions.

    Returns:
        Tuple ``(padded, labels)`` with the stacked tensors and a tensor of
        the labels.
    """
    data, labels = zip(*batch)

    max_length = max(d.shape[2] for d in data)

    padded = []
    for d in data:
        # total padding needed, >= 0
        padding = max_length - d.shape[2]
        # add zero's (silence) to match rest of batch
        padded.append(F.pad(d, (0, padding)) if padding > 0 else d)

    # stack properly now that everything is padded
    return torch.stack(padded, dim=0), torch.tensor(labels)

In [15]:
#mean = [0]
#std = [1]

# deal with this later
#
# we should also probably compute the mean and std manually instead of assuming they correctly
# normalized it, since this is the re-recorded dataset
train_transform = transforms.Compose([
    transforms.ToTensor()
    #transforms.Normalize(mean, std)
  ])
test_transform = transforms.Compose([transforms.ToTensor()])

####################################################
# <CHANGE ME> if you want to use different features!
####################################################
feature_type = "cqt"

####################################################
# <CHANGE ME> if you want to use resizing!
#
# We need to resize to, for example, (224, 224)
####################################################
resizing = False
dims_resize = (224, 224)

#model_type = "enet"
model_type = "res"
#model_type = "LSTM"

# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if model_type == "LSTM":
    model = LSTMSpectrogram()
elif model_type == "enet":
    model = EfficientNetSpectrogram("b0")
elif model_type == "res":
    model = ResNet50Spectrogram()
else:
    # Fail here with a clear message instead of on `None.to(device)` below.
    raise ValueError(
        f"Unknown model_type {model_type!r}; expected 'LSTM', 'enet' or 'res'"
    )

model = model.to(device)

#epochs = 100
epochs = 30
batch_size = 32
weight_decay = 5e-4
learning_rate = 0.0001

# The LSTM emits one value per sample, so it is trained with
# BCEWithLogitsLoss; the two-output CNNs use CrossEntropyLoss.
if model_type == "LSTM":
    criterion = nn.BCEWithLogitsLoss()
else:
    criterion = nn.CrossEntropyLoss()

optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay = weight_decay)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max = epochs)

# data loaders
FoR_train_dataset = SpecDataset(feature_type, "train")
FoR_val_dataset = SpecDataset(feature_type, "validation")
FoR_test_dataset = SpecDataset(feature_type, "test")

# Only the training data is shuffled; a fixed order for validation and test
# keeps the per-batch loss average identical from run to run.
FoR_train_loader = DataLoader(FoR_train_dataset, batch_size=batch_size, shuffle=True, collate_fn=dynamic_collate)
FoR_val_loader = DataLoader(FoR_val_dataset, batch_size=batch_size, shuffle=False, collate_fn=dynamic_collate)
FoR_test_loader = DataLoader(FoR_test_dataset, batch_size=batch_size, shuffle=False, collate_fn=dynamic_collate)

Real examples for cqt train: 5104
Fake examples for cqt train: 5104
Real examples for cqt validation: 1101
Fake examples for cqt validation: 1143
Real examples for cqt test: 408
Fake examples for cqt test: 408


In [16]:
# we need to compute the equal error rate as one of our metrics.
def compute_EER(model, loader):
    """Return the equal error rate of ``model`` over every batch in ``loader``.

    The score of a sample is the sigmoid of the model output when the global
    ``model_type`` is "LSTM", otherwise the softmax probability of class 1.
    The result is the mean of the false-positive and false-negative rates at
    the ROC point where the two are closest.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        loader: Iterable yielding ``(inputs, labels)`` batches.

    Returns:
        The equal error rate.
    """
    model.eval()
    all_scores, all_labels = [], []

    with torch.no_grad():
        for waveform, labels in loader:
            waveform = waveform.to(device)
            labels = labels.to(device)

            raw = model(waveform)
            if model_type == "LSTM":
                scores = torch.sigmoid(raw)
            else:
                # probability assigned to the positive class
                scores = torch.softmax(raw, dim=1)[:, 1]

            all_scores.extend(scores.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # sklearn gives the ROC curve; the false-negative rate is 1 - TPR
    fpr, tpr, thresholds = roc_curve(all_labels, all_scores)
    fnr = 1 - tpr

    # index of the operating point where FPR and FNR are closest
    closest = np.nanargmin(np.abs(fpr - fnr))
    return (fpr[closest] + fnr[closest]) / 2


In [17]:
def train(loader):
    """Run one training pass over ``loader`` and return the mean batch loss.

    Uses the module-level ``model``, ``optimizer``, ``criterion`` and
    ``device``.

    Args:
        loader: Iterable yielding ``(inputs, labels)`` batches.

    Returns:
        Sum of the per-batch losses divided by ``len(loader)``.
    """
    model.train()
    batch_losses = []

    for waveform, labels in loader:
        waveform, labels = waveform.to(device), labels.to(device)

        # standard step: reset gradients, forward, backward, update
        optimizer.zero_grad()
        loss = criterion(model(waveform), labels)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(loader)

In [18]:
def validate(loader):
    """Evaluate the module-level ``model`` on ``loader`` without gradients.

    Predictions are ``out > 0`` when the global ``model_type`` is "LSTM" and
    the arg-max over dimension 1 otherwise.

    Args:
        loader: Iterable yielding ``(inputs, labels)`` batches.

    Returns:
        Tuple ``(mean batch loss, accuracy)``.
    """
    model.eval()
    loss_sum = 0.0
    hits = 0
    seen = 0

    with torch.no_grad():
        for waveform, labels in loader:
            waveform, labels = waveform.to(device), labels.to(device)

            out = model(waveform)
            loss_sum += criterion(out, labels).item()

            # count correct predictions
            if model_type == "LSTM":
                preds = (out > 0).long()
            else:
                preds = out.argmax(dim=1)

            hits += (preds == labels).sum().item()
            seen += labels.size(0)

    mean_loss = loss_sum / len(loader)
    return mean_loss, hits / seen
            

In [19]:
# reference paper uses patience = 5
patience = 5
# Start from infinity so the first epoch always counts as an improvement,
# whatever the scale of the loss.
best_validation_loss = float("inf")
# Copy of the weights from the epoch with the lowest validation loss.
best_model_state = None
fail_count = 0

training_losses = []
val_losses = []
test_losses = []

for epoch in tqdm(range(epochs)):
    training_loss = train(FoR_train_loader)
    print(f"[Epoch {epoch}] Training Loss: {training_loss}")

    training_losses.append(training_loss)

    validation_loss, val_accuracy = validate(FoR_val_loader)
    print(f"[Epoch {epoch}] Validation Loss: {validation_loss} Accuracy: {val_accuracy}")

    val_losses.append(validation_loss)

    test_loss, test_accuracy = validate(FoR_test_loader)
    print(f"[DEBUG Epoch {epoch}] Test Loss: {test_loss} Accuracy: {test_accuracy}")

    test_losses.append(test_loss)

    if validation_loss < best_validation_loss:
        best_validation_loss = validation_loss
        fail_count = 0
        # Clone the tensors: state_dict() returns references that later
        # optimizer steps would otherwise overwrite.
        best_model_state = {
            key: value.detach().clone()
            for key, value in model.state_dict().items()
        }
    else:
        # increment number of epochs of no improvement
        fail_count = fail_count + 1

    if fail_count >= patience:
        print(f"Triggering early breaking on epoch {epoch}")
        break

    scheduler.step()

# Early stopping is only useful if the best weights are the ones kept; without
# this the model would stay at the last (non-improving) epoch.
if best_model_state is not None:
    model.load_state_dict(best_model_state)

  0%|                                                                                            | 0/30 [00:03<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Final metrics on the FoR test split.
test_eer = compute_EER(model, FoR_test_loader)
print(f"Test EER: {test_eer}")

test_loss, test_accuracy = validate(FoR_test_loader)
print(f"Testing loss: {test_loss} Accuracy: {test_accuracy}")

# expected to be quite low, though obvious overfitting at current settings
train_eer = compute_EER(model, FoR_train_loader)
print(f"Train EER: {train_eer}")

In [None]:
# One curve per recorded loss history: (values, legend label, marker).
loss_curves = (
    (training_losses, "Training Loss", 'o'),
    (val_losses, "Validation Loss", 's'),
    (test_losses, "Test Loss", 'x'),
)
for history, curve_label, curve_marker in loss_curves:
    plt.plot(range(len(history)), history, label=curve_label, marker=curve_marker)

plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Loss vs epochs')
plt.legend()
plt.show()


# Process ITW Test Set

In [None]:
# Unpack the In-the-Wild release archive into the data/ folder.
itw_archive_path = 'data/release_in_the_wild.zip'
with zipfile.ZipFile(itw_archive_path) as archive:
    archive.extractall('data')

In [None]:
# Sort the In-the-Wild audio files into real/ and fake/ sub-folders according
# to the `label` column of meta.csv.
src_csv = 'data/release_in_the_wild/meta.csv'
itw_root = 'data/release_in_the_wild'

# meta.csv labels -> folder names used by the spectrogram step. Moving files
# straight into the final folder (instead of renaming the label folders
# afterwards) keeps this cell safe to re-run: a second run finds no source
# files left and simply does nothing.
label_to_folder = {'bona-fide': 'real', 'spoof': 'fake'}

df = pd.read_csv(src_csv)

for name, label in zip(df['file'], df['label'].astype(str)):
    src_path = os.path.join(itw_root, name)
    # Labels without a mapping keep their own folder name.
    dst_dir = os.path.join(itw_root, label_to_folder.get(label, label))
    dst_path = os.path.join(dst_dir, name)

    os.makedirs(dst_dir, exist_ok=True)

    # Files already moved by an earlier run are skipped.
    if os.path.exists(src_path):
        shutil.move(src_path, dst_path)

In [None]:
def process_directory(directory, output_dir):
    """Compute and save the spectrograms of every ``.wav`` file in a folder.

    This cell redefines the ``process_directory`` from the preprocessing
    section, so it is kept behaviourally identical to that definition
    (including the tqdm progress bar) to avoid the helper silently changing
    once this cell has run.

    For each ``<name>.wav`` in ``directory`` the arrays ``<name>_cqt.npy``,
    ``<name>_log.npy`` and ``<name>_mel.npy`` are written to ``output_dir``,
    which is created if needed.

    Args:
        directory: Folder containing the source audio files.
        output_dir: Folder the ``.npy`` spectrograms are written to.
    """
    os.makedirs(output_dir, exist_ok=True)

    for filename in tqdm(os.listdir(directory)):
        if not filename.endswith('.wav'):
            continue

        audio_path = os.path.join(directory, filename)
        cqt, log, mel = compute_spectrograms(audio_path)

        base_name = os.path.splitext(filename)[0]
        # Save spectrograms as numpy arrays
        np.save(f"{output_dir}/{base_name}_cqt.npy", cqt)
        np.save(f"{output_dir}/{base_name}_log.npy", log)
        np.save(f"{output_dir}/{base_name}_mel.npy", mel)

# A dedicated name keeps the FoR `data_dirs` mapping from the preprocessing
# section intact, so that section can still be re-run afterwards.
itw_data_dirs = {
    'ITWtest_real': 'data/release_in_the_wild/real',
    'ITWtest_fake': 'data/release_in_the_wild/fake/'
}

for set_name, directory in itw_data_dirs.items():
    output_dir = f'data/spectrograms/{set_name}_spectrograms'
    process_directory(directory, output_dir)
    print(f"Processed {set_name} set.")

# The test data must be prepared the same way as the training data, so
# `feature_type`, `resizing` and `dims_resize` are deliberately NOT reassigned
# here: SpecDataset reads the values set in the training configuration cell.
# Only the batch size differs, and it gets its own name so the training
# `batch_size` is left untouched.
itw_batch_size = 128

ITW_test_dataset = SpecDataset(feature_type, "ITWtest")
# No shuffling for evaluation: a fixed order makes the reported loss repeatable.
ITW_test_loader = DataLoader(ITW_test_dataset, batch_size=itw_batch_size, shuffle=False, collate_fn=dynamic_collate)

In [None]:
# Metrics on the In-the-Wild test loader.
itw_eer = compute_EER(model, ITW_test_loader)
print(f"Test EER: {itw_eer}")

test_loss, test_accuracy = validate(ITW_test_loader)
print(f"Testing loss: {test_loss} Accuracy: {test_accuracy}")

# expected to be quite low, though obvious overfitting at current settings
for_train_eer = compute_EER(model, FoR_train_loader)
print(f"Train EER: {for_train_eer}")