# Processing for local training

In [1]:
import os
#!tar -xf data/for-rerec.tar.gz -C data/

In [2]:
#ls data/for-rerecorded/

In [3]:
#!pip3 install torch torchvision librosa matplotlib tqdm pandas tensorboard argparse

In [1]:
import torch
print(torch.cuda.is_available())

from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

import torchvision
import torchvision.transforms as transforms
from torchvision import models

import numpy as np
import matplotlib.pyplot as plt
import librosa
import os
from tqdm import tqdm
from sklearn.metrics import roc_curve
import pandas as pd
import shutil
import zipfile
import sys
import soundfile as sf

True


# Data Preprocessing

In [5]:
# Target sampling rate in Hz; compute_spectrograms passes it to librosa.load,
# so every clip is resampled to this rate on load.
SAMPLE_RATE = 16000  # Sampling rate
# Number of mel bands requested for the mel spectrogram.
N_MELS = 128

In [6]:
# TODO: Make it so each output is "513-dimensional" as with the reference paper
#
# https://arxiv.org/pdf/2203.16263

def compute_spectrograms(path):
    """Load one audio file and return its three dB-scaled spectrograms.

    The waveform is loaded at SAMPLE_RATE and forced to exactly two seconds:
    shorter clips are zero-padded at the end, longer clips are truncated.

    Parameters
    ----------
    path : path of the audio file to analyse.

    Returns
    -------
    (cqt_spec, log_spec, mel_spec) : constant-Q, STFT and mel spectrograms,
        each converted to dB relative to its own maximum.
    """
    signal, rate = librosa.load(path, sr=SAMPLE_RATE)

    # Bring every clip to the same number of samples (2 seconds).
    target_len = 2 * SAMPLE_RATE
    shortfall = target_len - len(signal)
    if shortfall > 0:
        signal = np.pad(signal, (0, shortfall))
    else:
        signal = signal[:target_len]

    # Constant-Q transform (magnitude -> dB).
    cqt_spec = librosa.amplitude_to_db(np.abs(librosa.cqt(signal, sr=rate)), ref=np.max)

    # Short-time Fourier transform (magnitude -> dB).
    log_spec = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max)

    # Mel spectrogram (power -> dB).
    mel_power = librosa.feature.melspectrogram(y=signal, sr=rate, n_mels=N_MELS)
    mel_spec = librosa.power_to_db(mel_power, ref=np.max)

    return cqt_spec, log_spec, mel_spec

In [7]:
# Maps "<split>_<label>" to the folder holding that subset's source audio.
# The keys are reused below to name the output spectrogram folders.
data_dirs = {
    'training_fake': 'data/for-rerecorded/training/fake/',
    'testing_fake': 'data/for-rerecorded/testing/fake/',
    'validation_fake': 'data/for-rerecorded/validation/fake/',
    'training_real': 'data/for-rerecorded/training/real/',
    'testing_real': 'data/for-rerecorded/testing/real/',
    'validation_real': 'data/for-rerecorded/validation/real/',
}

In [8]:
def process_directory(directory, output_dir):
    """Compute and save spectrograms for every .wav file directly in `directory`.

    For each `<name>.wav` three files are written into `output_dir`
    (created if missing): `<name>_cqt.npy`, `<name>_log.npy`, `<name>_mel.npy`.
    Entries that do not end in '.wav' are ignored.
    """
    os.makedirs(output_dir, exist_ok=True)

    for entry in tqdm(os.listdir(directory)):
        if not entry.endswith('.wav'):
            continue

        cqt, log, mel = compute_spectrograms(os.path.join(directory, entry))

        # Output files share the audio file's name without its extension.
        base_name, _ext = os.path.splitext(entry)
        np.save(f"{output_dir}/{base_name}_cqt.npy", cqt)
        np.save(f"{output_dir}/{base_name}_log.npy", log)
        np.save(f"{output_dir}/{base_name}_mel.npy", mel)

# Spectrogram extraction is guarded by this flag; set it to True to
# (re)generate the .npy files for every entry of data_dirs.
compute_specs = False
if compute_specs:
    for set_name, directory in data_dirs.items():
      # One output folder per split/label, e.g. data/spectrograms/training_fake_spectrograms
      output_dir = f'data/spectrograms/{set_name}_spectrograms'
      process_directory(directory, output_dir)
      print(f"Processed {set_name} set.")


In [9]:
# Let's look at some data: load the three saved spectrograms of one training
# clip and print their shapes.
display_cqt = "data/spectrograms/training_fake_spectrograms/recording1.wav_norm_mono_cqt.npy"
display_log = "data/spectrograms/training_fake_spectrograms/recording1.wav_norm_mono_log.npy"
display_mel = "data/spectrograms/training_fake_spectrograms/recording1.wav_norm_mono_mel.npy"

cqt_test = np.load(display_cqt)
log_test = np.load(display_log)
mel_test = np.load(display_mel)
print(cqt_test.shape)
print(log_test.shape)
print(mel_test.shape)

# For reference: size of the first (feature) axis of each spectrogram type,
# matching the shapes printed above. LSTMSpectrogram reads these globals to
# pick its LSTM input size.
cqt_size = 84
log_size = 1025
mel_size = 128


(84, 63)
(1025, 63)
(128, 63)


# Define models

In [10]:
class ResNet50Spectrogram(nn.Module):
    """ResNet-50 adapted to single-channel input with a 2-logit output head."""

    def __init__(self):
        super().__init__()

        # Backbone built without pretrained weights.
        backbone = models.resnet50(weights=None)

        # Replace the stem convolution with a 1-channel one that keeps the
        # original output channels, kernel size, stride and padding.
        stem = backbone.conv1
        backbone.conv1 = nn.Conv2d(
            in_channels=1,
            out_channels=stem.out_channels,
            kernel_size=stem.kernel_size,
            stride=stem.stride,
            padding=stem.padding,
            bias=False,
        )

        # Replace the final fully connected layer with a 2-way classifier.
        backbone.fc = nn.Linear(backbone.fc.in_features, 2)

        self.model = backbone

    def forward(self, x):
        """Return the backbone's output for input batch `x`."""
        return self.model(x)

In [11]:
class EfficientNetSpectrogram(nn.Module):
    """EfficientNet (B0 or B1) adapted to single-channel input, 2 output classes.

    Parameters
    ----------
    model_type : str
        Which variant to build: "b0" or "b1".

    Raises
    ------
    ValueError
        If `model_type` is not one of the supported variants. (Previously an
        unknown value left the backbone as None and failed later with an
        unrelated AttributeError.)
    """

    def __init__(self, model_type):
        super(EfficientNetSpectrogram, self).__init__()

        # Both variants are built without pretrained weights and with 2 classes.
        if model_type == "b0":
            self.enet = models.efficientnet_b0(weights=None, num_classes=2)
        elif model_type == "b1":
            self.enet = models.efficientnet_b1(weights=None, num_classes=2)
        else:
            raise ValueError(
                f"Unsupported EfficientNet variant {model_type!r}; expected 'b0' or 'b1'."
            )

        # We need to change the network to accept 1 channel instead of
        # 3 because of our data. The replacement keeps the original stem's
        # output channels, kernel size, stride and padding.
        original_conv = self.enet.features[0][0]
        self.enet.features[0][0] = nn.Conv2d(
            in_channels=1,
            out_channels=original_conv.out_channels,
            kernel_size=original_conv.kernel_size,
            stride=original_conv.stride,
            padding=original_conv.padding,
            bias=False,
        )

    def forward(self, x):
        """Return the network's output for input batch `x`."""
        return self.enet(x)

In [12]:
class LSTMSpectrogram(nn.Module):
    """Two-layer LSTM over spectrogram frames producing one logit per example.

    Parameters
    ----------
    input_size : int, optional
        Number of features per time step. When omitted (the original
        behaviour), it is taken from the notebook globals: `feature_type`
        selects one of `cqt_size`, `log_size` or `mel_size`.

    Raises
    ------
    ValueError
        If `input_size` is omitted and the global `feature_type` is not one of
        "cqt", "log" or "mel". (Previously this left `self.lstm` undefined and
        only failed later inside `forward`.)
    """

    def __init__(self, input_size=None):
        super(LSTMSpectrogram, self).__init__()

        self.nlayer = 2
        self.nhiddens = 256

        if input_size is None:
            # Resolve the feature dimension from the notebook-level settings.
            # Each size global is only looked up in its own branch.
            if feature_type == "cqt":
                input_size = cqt_size
            elif feature_type == "log":
                input_size = log_size
            elif feature_type == "mel":
                input_size = mel_size
            else:
                raise ValueError(
                    f"Unknown feature_type {feature_type!r}; expected 'cqt', 'log' or 'mel'."
                )

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=self.nhiddens,
                            num_layers=self.nlayer, batch_first=True, dropout=0.3)

        # Single output logit (paired with BCEWithLogitsLoss in the training cell).
        self.fc = nn.Linear(self.nhiddens, 1)

    def forward(self, x):
        """Return one logit per example.

        `x` has its dimension 1 squeezed and its remaining two trailing
        dimensions swapped before the LSTM, i.e. it is expected as
        (batch, 1, features, time).
        """
        x = x.squeeze(1)
        # features are in wrong order for lstm: (batch, features, time) -> (batch, time, features)
        x = x.transpose(1, 2)

        _, (h_o, _c_o) = self.lstm(x)

        # h_o is (num_layers, batch, hidden); keep the last layer's final state.
        last_hidden = h_o[-1]

        return self.fc(last_hidden).squeeze(-1)

In [13]:
#################################################################
#################################################################
##                                                             ##
## All of the code in this cell is from the RawNet2 repository ##
##            https://github.com/Jungjee/RawNet                ##
##                                                             ##
#################################################################
#################################################################

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math

from torch.utils import data
from collections import OrderedDict
from torch.nn.parameter import Parameter
from torch.autograd import Variable

class FRM(nn.Module):
    """Channel-wise rescaling of a (batch, channels, length) feature map.

    A per-channel gate is computed as sigmoid(Linear(mean over length)); the
    input is multiplied by the gate (`do_mul`) and/or has the gate added
    (`do_add`), in that order.
    """

    def __init__(self, nb_dim, do_add = True, do_mul = True):
        super().__init__()
        self.fc = nn.Linear(nb_dim, nb_dim)
        self.sig = nn.Sigmoid()
        self.do_add = do_add
        self.do_mul = do_mul

    def forward(self, x):
        batch, channels = x.size(0), x.size(1)

        # Average over the last axis -> one value per channel.
        pooled = F.adaptive_avg_pool1d(x, 1).view(batch, -1)

        # Gate in (0, 1), shaped to broadcast along the last axis.
        gate = self.sig(self.fc(pooled)).view(batch, channels, -1)

        if self.do_mul:
            x = x * gate
        if self.do_add:
            x = x + gate
        return x

class Residual_block_wFRM(nn.Module):
    """Residual 1-D convolution block followed by max-pooling and an FRM gate.

    Parameters
    ----------
    nb_filts : sequence of two ints
        `nb_filts[0]` is the input channel count, `nb_filts[1]` the output
        channel count.
    first : bool
        When True the leading BatchNorm + LeakyReLU is skipped (and `bn1` is
        not created).
    """
    def __init__(self, nb_filts, first = False):
        super(Residual_block_wFRM, self).__init__()
        self.first = first
        if not self.first:
            self.bn1 = nn.BatchNorm1d(num_features = nb_filts[0])
        # NOTE: `lrelu` is created but forward() only uses `lrelu_keras`.
        self.lrelu = nn.LeakyReLU()
        self.lrelu_keras = nn.LeakyReLU(negative_slope=0.3)
        
        self.conv1 = nn.Conv1d(in_channels = nb_filts[0],
            out_channels = nb_filts[1],
            kernel_size = 3,
            padding = 1,
            stride = 1)
        self.bn2 = nn.BatchNorm1d(num_features = nb_filts[1])
        self.conv2 = nn.Conv1d(in_channels = nb_filts[1],
            out_channels = nb_filts[1],
            padding = 1,
            kernel_size = 3,
            stride = 1)
        
        # When the channel count changes, the skip path goes through a
        # kernel-size-1 convolution so it can be added to the main path.
        if nb_filts[0] != nb_filts[1]:
            self.downsample = True
            self.conv_downsample = nn.Conv1d(in_channels = nb_filts[0],
                out_channels = nb_filts[1],
                padding = 0,
                kernel_size = 1,
                stride = 1)
            
        else:
            self.downsample = False
        self.mp = nn.MaxPool1d(3)
        self.frm = FRM(
            nb_dim = nb_filts[1],
            do_add = True,
            do_mul = True)
        
    def forward(self, x):
        """Apply (BN -> LeakyReLU ->) conv -> BN -> LeakyReLU -> conv, add the
        skip connection, then max-pool and FRM."""
        identity = x
        if not self.first:
            out = self.bn1(x)
            out = self.lrelu_keras(out)
        else:
            out = x
            
        out = self.conv1(out)
        out = self.bn2(out)
        out = self.lrelu_keras(out)
        out = self.conv2(out)
        
        if self.downsample:
            identity = self.conv_downsample(identity)
            
        # In-place residual addition.
        out += identity
        out = self.mp(out)
        out = self.frm(out)
        return out

class LayerNorm(nn.Module):
    """Normalise over the last dimension with a learnable scale and shift.

    Computes gamma * (x - mean) / (std + eps) + beta, where mean and std are
    taken along the last axis. `gamma` starts at ones and `beta` at zeros,
    both of length `features`.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(features))
        self.beta = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        centered = x - x.mean(-1, keepdim=True)
        # eps is added to the standard deviation to avoid division by zero.
        spread = x.std(-1, keepdim=True) + self.eps
        return self.gamma * centered / spread + self.beta

class SincConv_fast(nn.Module):
    """Sinc-based convolution
    Parameters
    ----------
    in_channels : `int`
        Number of input channels. Must be 1.
    out_channels : `int`
        Number of filters.
    kernel_size : `int`
        Filter length. An even value is increased by one so the filter is odd.
    sample_rate : `int`, optional
        Sample rate. Defaults to 16000.
    stride, padding, dilation : `int`, optional
        Passed through to `F.conv1d` in `forward`.
    bias, groups : optional
        Not supported; `bias=True` or `groups > 1` raises `ValueError`.
    min_low_hz, min_band_hz : `int`, optional
        Floors added to the learned low cut-off and band width in `forward`.
    Usage
    -----
    See `torch.nn.Conv1d`
    Reference
    ---------
    Mirco Ravanelli, Yoshua Bengio,
    "Speaker Recognition from raw waveform with SincNet".
    https://arxiv.org/abs/1808.00158
    """

    @staticmethod
    def to_mel(hz):
        # Hz -> mel scale.
        return 2595 * np.log10(1 + hz / 700)

    @staticmethod
    def to_hz(mel):
        # mel scale -> Hz (inverse of to_mel).
        return 700 * (10 ** (mel / 2595) - 1)

    def __init__(self, out_channels, kernel_size, sample_rate=16000, in_channels=1,
                 stride=1, padding=0, dilation=1, bias=False, groups=1, min_low_hz=50, min_band_hz=50):

        super(SincConv_fast,self).__init__()

        if in_channels != 1:
            #msg = (f'SincConv only support one input channel '
            #       f'(here, in_channels = {in_channels:d}).')
            msg = "SincConv only support one input channel (here, in_channels = {%i})" % (in_channels)
            raise ValueError(msg)

        self.out_channels = out_channels
        self.kernel_size = kernel_size
        
        # Forcing the filters to be odd (i.e, perfectly symmetrics)
        if kernel_size%2==0:
            self.kernel_size=self.kernel_size+1
            
        self.stride = stride
        self.padding = padding
        self.dilation = dilation

        if bias:
            raise ValueError('SincConv does not support bias.')
        if groups > 1:
            raise ValueError('SincConv does not support groups.')

        self.sample_rate = sample_rate
        self.min_low_hz = min_low_hz
        self.min_band_hz = min_band_hz

        # initialize filterbanks such that they are equally spaced in Mel scale
        low_hz = 30
        high_hz = self.sample_rate / 2 - (self.min_low_hz + self.min_band_hz)

        # out_channels + 1 band edges, evenly spaced in mel, converted back to Hz.
        mel = np.linspace(self.to_mel(low_hz),
                          self.to_mel(high_hz),
                          self.out_channels + 1)
        hz = self.to_hz(mel)
        

        # filter lower frequency (out_channels, 1)
        self.low_hz_ = nn.Parameter(torch.Tensor(hz[:-1]).view(-1, 1))

        # filter frequency band (out_channels, 1)
        self.band_hz_ = nn.Parameter(torch.Tensor(np.diff(hz)).view(-1, 1))

        # Hamming window
        #self.window_ = torch.hamming_window(self.kernel_size)
        n_lin=torch.linspace(0, (self.kernel_size/2)-1, steps=int((self.kernel_size/2))) # computing only half of the window
        self.window_=0.54-0.46*torch.cos(2*math.pi*n_lin/self.kernel_size);

        # (1, kernel_size/2)
        # window_ and n_ are plain tensor attributes (not parameters or
        # registered buffers); forward() moves them to the input's device.
        n = (self.kernel_size - 1) / 2.0
        self.n_ = 2*math.pi*torch.arange(-n, 0).view(1, -1) / self.sample_rate # Due to symmetry, I only need half of the time axes

    def forward(self, waveforms):
        """
        Parameters
        ----------
        waveforms : `torch.Tensor` (batch_size, 1, n_samples)
            Batch of waveforms.
        Returns
        -------
        features : `torch.Tensor` (batch_size, out_channels, n_samples_out)
            Batch of sinc filters activations.
        """

        self.n_ = self.n_.to(waveforms.device)

        self.window_ = self.window_.to(waveforms.device)

        # Low cut-off: learned value (absolute) plus the configured floor.
        low = self.min_low_hz  + torch.abs(self.low_hz_)
        
        # High cut-off: low + minimum band width + learned width, clamped to
        # [min_low_hz, sample_rate / 2].
        high = torch.clamp(low + self.min_band_hz + torch.abs(self.band_hz_),self.min_low_hz,self.sample_rate/2)
        band=(high-low)[:,0]
        
        f_times_t_low = torch.matmul(low, self.n_)
        f_times_t_high = torch.matmul(high, self.n_)

        band_pass_left=((torch.sin(f_times_t_high)-torch.sin(f_times_t_low))/(self.n_/2))*self.window_ # Equivalent of Eq.4 of the reference paper (SPEAKER RECOGNITION FROM RAW WAVEFORM WITH SINCNET). I just have expanded the sinc and simplified the terms. This way I avoid several useless computations. 
        band_pass_center = 2*band.view(-1,1)
        # The right half mirrors the left half.
        band_pass_right= torch.flip(band_pass_left,dims=[1])
        
        
        band_pass=torch.cat([band_pass_left,band_pass_center,band_pass_right],dim=1)

        
        # Normalise each filter by twice its band width.
        band_pass = band_pass / (2*band[:,None])
        

        # Filters are rebuilt on every call and kept on the module as `filters`.
        self.filters = (band_pass).view(
            self.out_channels, 1, self.kernel_size)

        return F.conv1d(waveforms, self.filters, stride=self.stride,
                        padding=self.padding, dilation=self.dilation,
                         bias=None, groups=1) 
    
class RawNet2(nn.Module):
    """RawNet2 model operating on raw waveforms.

    Parameters
    ----------
    d_args : dict
        Model configuration. Keys read here: 'nb_samp', 'in_channels',
        'filts', 'first_conv', 'gru_node', 'nb_gru_layer', 'nb_fc_node',
        'nb_classes'. 'filts' is indexed as
        [sinc_out_channels, [in, out], [in, out], ...].

    Notes
    -----
    The original implementation overwrote `d_args['filts'][2][0]` in place, so
    building a second model from the same `d_args` silently produced a
    `block2` with the wrong input width. The adjustment is now made on a
    private copy and the caller's configuration is left untouched.
    """
    def __init__(self, d_args):
        super(RawNet2, self).__init__()

        filts = d_args['filts']
        # block2 maps filts[2][0] -> filts[2][1] channels; blocks 3-5 keep
        # filts[2][1] channels. Work on a copy so d_args is not mutated.
        filts_block2 = list(filts[2])
        filts_later = list(filts[2])
        filts_later[0] = filts_later[1]

        self.ln = LayerNorm(d_args['nb_samp'])
        self.first_conv = SincConv_fast(in_channels = d_args['in_channels'],
            out_channels = filts[0],
            kernel_size = d_args['first_conv']
            )

        self.first_bn = nn.BatchNorm1d(num_features = filts[0])
        # NOTE: `lrelu` is created but forward() only uses `lrelu_keras`.
        self.lrelu = nn.LeakyReLU()
        self.lrelu_keras = nn.LeakyReLU(negative_slope = 0.3)

        self.block0 = nn.Sequential(Residual_block_wFRM(nb_filts = filts[1], first = True))
        self.block1 = nn.Sequential(Residual_block_wFRM(nb_filts = filts[1]))

        self.block2 = nn.Sequential(Residual_block_wFRM(nb_filts = filts_block2))
        self.block3 = nn.Sequential(Residual_block_wFRM(nb_filts = filts_later))
        self.block4 = nn.Sequential(Residual_block_wFRM(nb_filts = filts_later))
        self.block5 = nn.Sequential(Residual_block_wFRM(nb_filts = filts_later))
        self.avgpool = nn.AdaptiveAvgPool1d(1)

        self.bn_before_gru = nn.BatchNorm1d(num_features = filts_later[-1])
        self.gru = nn.GRU(input_size = filts_later[-1],
            hidden_size = d_args['gru_node'],
            num_layers = d_args['nb_gru_layer'],
            batch_first = True)

        self.fc1_gru = nn.Linear(in_features = d_args['gru_node'],
            out_features = d_args['nb_fc_node'])
        self.fc2_gru = nn.Linear(in_features = d_args['nb_fc_node'],
            out_features = d_args['nb_classes'],
            bias = True)

        self.sig = nn.Sigmoid()

    def forward(self, x, y = 0, is_test=False):
        """Run the network on a batch of waveforms.

        Parameters
        ----------
        x : tensor indexed as (batch, 1, n_samples); it is reshaped to
            (batch, 1, n_samples) after layer normalisation.
        y : unused; kept for interface compatibility.
        is_test : when True, return the output of `fc1_gru` (the embedding)
            instead of the class logits.

        Returns
        -------
        The embedding when `is_test` is True, otherwise the output of
        `fc2_gru` applied to the length-normalised embedding.
        """
        #follow sincNet recipe
        nb_samp = x.shape[0]
        len_seq = x.shape[2]
        x = self.ln(x)
        x=x.view(nb_samp,1,len_seq)
        x = F.max_pool1d(torch.abs(self.first_conv(x)), 3)
        x = self.first_bn(x)
        x = self.lrelu_keras(x)

        x = self.block0(x)
        x = self.block1(x)

        x = self.block2(x)
        x = self.block3(x)
        x = self.block4(x)
        x = self.block5(x)

        x = self.bn_before_gru(x)
        x = self.lrelu_keras(x)
        x = x.permute(0, 2, 1)  #(batch, filt, time) >> (batch, time, filt)
        self.gru.flatten_parameters()
        x, _ = self.gru(x)
        # Keep only the GRU output at the last time step.
        x = x[:,-1,:]
        code = self.fc1_gru(x)
        if is_test: return code

        # Rescale the embedding so its L2 norm is 10 before the classifier.
        code_norm = code.norm(p=2,dim=1, keepdim=True) / 10.
        code = torch.div(code, code_norm)
        out = self.fc2_gru(code)
        return out
    
#################################################################
#################################################################
##                                                             ##
## All of the code in this cell is from the RawNet2 repository ##
##            https://github.com/Jungjee/RawNet                ##
##                                                             ##
#################################################################
#################################################################

import argparse

def str2bool(v):
    """Convert a command-line style truthy/falsy string to a bool.

    Booleans are returned unchanged. Matching is case-insensitive:
    'yes', 'true', 't', 'y', '1' -> True; 'no', 'false', 'f', 'n', '0' -> False.

    Raises
    ------
    argparse.ArgumentTypeError
        If the value is none of the recognised spellings.
    """
    if isinstance(v, bool):
        return v

    token = v.lower()
    if token in ('yes', 'true', 't', 'y', '1'):
        return True
    if token in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')


def get_args():
    """Build the RawNet2 model configuration from command-line defaults.

    Every option named `-m_<key>` is collected into the dict `args.model`
    under `<key>` (and printed as it is collected).

    Arguments this parser does not define - such as the `-f <connection file>`
    pair a Jupyter kernel is launched with - are ignored. Unlike the previous
    implementation, `sys.argv` is left untouched, and other kernel-injected
    arguments no longer abort parsing.

    Returns
    -------
    argparse.Namespace
        Parsed options, with the extra attribute `model` (dict).
    """
    parser = argparse.ArgumentParser()

    #DNN args
    parser.add_argument('-m_first_conv', type = int, default = 251)
    parser.add_argument('-m_in_channels', type = int, default = 1)
    parser.add_argument('-m_filts', type = list, default = [128, [128,128], [128,256], [256,256]])
    parser.add_argument('-m_blocks', type = list, default = [2, 4])
    parser.add_argument('-m_nb_fc_att_node', type = list, default = [1])
    parser.add_argument('-m_nb_fc_node', type = int, default = 1024)
    parser.add_argument('-m_gru_node', type = int, default = 1024)
    parser.add_argument('-m_nb_gru_layer', type = int, default = 1)
    parser.add_argument('-m_nb_samp', type = int, default = 59049)

    # parse_known_args tolerates arguments meant for the notebook kernel
    # instead of exiting; the unknown ones are simply discarded.
    args, _ignored = parser.parse_known_args()

    args.model = {}
    for k, v in vars(args).items():
        if k[:2] == 'm_':
            print(k, v)
            args.model[k[2:]] = v
    return args

#################################################################
#################################################################
##                                                             ##
## All of the code in this cell is from the RawNet2 repository ##
##            https://github.com/Jungjee/RawNet                ##
##                                                             ##
#################################################################
#################################################################

In [14]:
class ResNet18Spectrogram(nn.Module):
    """ResNet-18 adapted to single-channel input with a 2-logit output head."""

    def __init__(self):
        super().__init__()

        # Backbone built without pretrained weights.
        backbone = models.resnet18(weights=None)

        # Replace the stem convolution with a 1-channel one that keeps the
        # original output channels, kernel size, stride and padding.
        stem = backbone.conv1
        backbone.conv1 = nn.Conv2d(
            in_channels=1,
            out_channels=stem.out_channels,
            kernel_size=stem.kernel_size,
            stride=stem.stride,
            padding=stem.padding,
            bias=False,
        )

        # Replace the final fully connected layer with a 2-way classifier.
        backbone.fc = nn.Linear(backbone.fc.in_features, 2)

        self.model = backbone

    def forward(self, x):
        """Return the backbone's output for input batch `x`."""
        return self.model(x)

# Training procedures

In [15]:
class SpecDataset(Dataset):
    """In-memory dataset of precomputed spectrograms with real/fake labels.

    Parameters
    ----------
    data_type : str
        One of 'cqt', 'log', 'mel'; selects files ending in `<data_type>.npy`.
    loader_type : str
        One of 'train', 'validation', 'test', 'ITWFull'; selects the folders
        under `<cwd>/data/spectrograms`.

    Raises
    ------
    ValueError
        If `loader_type` is not recognised. (Previously the folders were left
        as None, which made `os.listdir` scan the current directory.)

    Notes
    -----
    Reads the notebook globals `model_type` (labels become floats for "LSTM"),
    `resizing` and `dims_resize` (bilinear resize of every spectrogram).
    Real examples are labelled 1 and fake examples 0. File names are sorted so
    the item order does not depend on the file system.
    """

    # loader_type -> prefix of the "<prefix>_{real,fake}_spectrograms" folders.
    _FOLDER_PREFIX = {
        "train": "training",
        "validation": "validation",
        "test": "testing",
        "ITWFull": "ITWfull",
    }

    def __init__(self, data_type, loader_type):
        if loader_type not in self._FOLDER_PREFIX:
            raise ValueError(
                f"Unknown loader_type {loader_type!r}; "
                f"expected one of {sorted(self._FOLDER_PREFIX)}."
            )

        data_root = os.path.join(os.getcwd(), 'data/spectrograms')
        prefix = self._FOLDER_PREFIX[loader_type]
        real_folder = os.path.join(data_root, f'{prefix}_real_spectrograms')
        fake_folder = os.path.join(data_root, f'{prefix}_fake_spectrograms')

        # Collect the file names for the requested feature type.
        suffix = f"{data_type}.npy"
        real_files = self._list_files(real_folder, suffix)
        print(f"Real examples for {data_type} {loader_type}: {len(real_files)}")
        fake_files = self._list_files(fake_folder, suffix)
        print(f"Fake examples for {data_type} {loader_type}: {len(fake_files)}")

        # The LSTM is trained with BCEWithLogitsLoss, which needs float targets.
        label_val_false = 0
        label_val_true = 1
        if model_type == "LSTM":
            label_val_false = float(0)
            label_val_true = float(1)

        # load the data into memory
        #
        # if we need to work with a larger dataset, you might need to
        # alter this to be lazy loading instead, but it fits in my main memory
        # because of how much I currently have.
        self.data = []
        for real_file in real_files:
            self.data.append((self._load_spectrogram(real_file), label_val_true))
        for fake_file in fake_files:
            self.data.append((self._load_spectrogram(fake_file), label_val_false))

    @staticmethod
    def _list_files(folder, suffix):
        """Return sorted paths of regular files in `folder` ending with `suffix`."""
        paths = []
        for filename in sorted(os.listdir(folder)):
            path = os.path.join(folder, filename)
            if filename.endswith(suffix) and os.path.isfile(path):
                paths.append(path)
        return paths

    @staticmethod
    def _load_spectrogram(path):
        """Load one .npy spectrogram as a tensor with a leading channel axis."""
        spec = torch.tensor(np.load(path)).unsqueeze(0)
        if resizing == True:
            # F.interpolate needs a batch axis; add it, resize, remove it.
            spec = F.interpolate(spec.unsqueeze(0), size=dims_resize,
                                 mode='bilinear', align_corners = False).squeeze(0)
        return spec

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # return the data and the label
        return self.data[idx]

In [16]:
class RawnetWaveformDataset(Dataset):
    """In-memory dataset of fixed-length raw waveforms with real/fake labels.

    Parameters
    ----------
    loader_type : str
        One of 'train', 'validation', 'test', 'ITWFull'; selects the data
        folder (relative to the current working directory), which must
        contain `real/` and `fake/` sub-folders of .wav files.

    Raises
    ------
    ValueError
        If `loader_type` is not recognised, or an audio file is empty.

    Notes
    -----
    Each item is `(waveform, label)` where `waveform` is a float32 array of
    shape (1, TARGET_SAMPLES); real examples are labelled 1, fake examples 0.

    Fixes relative to the previous version: fake examples are now read from
    their own files (the loop used to re-read the last real file), and
    cropping long fake clips no longer references an undefined name.
    """

    # loader_type -> data folder relative to the working directory.
    _SPLIT_DIRS = {
        "train": 'data/for-rerecorded/training',
        "validation": 'data/for-rerecorded/validation',
        "test": 'data/for-rerecorded/testing',
        "ITWFull": 'data/release_in_the_wild',
    }

    # Number of samples per example, following the RawNet data loader.
    TARGET_SAMPLES = 59049

    def __init__(self, loader_type):
        if loader_type not in self._SPLIT_DIRS:
            raise ValueError(
                f"Unknown loader_type {loader_type!r}; "
                f"expected one of {sorted(self._SPLIT_DIRS)}."
            )

        data_root = os.path.join(os.getcwd(), self._SPLIT_DIRS[loader_type])
        real_folder = os.path.join(data_root, 'real')
        fake_folder = os.path.join(data_root, 'fake')

        self.real_files = self._list_wavs(real_folder)
        print(f"Real examples for raw waveform {loader_type}: {len(self.real_files)}")

        self.fake_files = self._list_wavs(fake_folder)
        print(f"Fake examples for raw waveform {loader_type}: {len(self.fake_files)}")

        # load the raw waveform data
        #
        # We follow the dataloader from the RawNet implementation
        self.data = []
        for real_file in self.real_files:
            self.data.append((self._load_waveform(real_file), 1))
        for fake_file in self.fake_files:
            self.data.append((self._load_waveform(fake_file), 0))

    @staticmethod
    def _list_wavs(folder):
        """Return sorted paths of regular files in `folder` ending in '.wav'."""
        paths = []
        for filename in sorted(os.listdir(folder)):
            path = os.path.join(folder, filename)
            if filename.endswith(".wav") and os.path.isfile(path):
                paths.append(path)
        return paths

    @staticmethod
    def _fit_length(X, target_samples):
        """Return `X` (shape (1, n)) brought to exactly `target_samples` columns.

        Short inputs are repeated end-to-end and truncated; long inputs are
        cropped at a random offset; inputs of the right length are returned
        unchanged.
        """
        n_samples = X.shape[1]
        if n_samples == 0:
            raise ValueError("Cannot fit an empty waveform to a fixed length.")

        if n_samples < target_samples:
            n_duplicates = int(target_samples / n_samples) + 1
            X = np.tile(X, (1, n_duplicates))[:, :target_samples]
        elif n_samples > target_samples:
            start = np.random.randint(0, n_samples - target_samples)
            X = X[:, start : start + target_samples]
        return X

    @classmethod
    def _load_waveform(cls, path):
        """Read one audio file and return it as a (1, TARGET_SAMPLES) float32 array."""
        X, _sr = sf.read(path)
        X = X.astype(np.float64).reshape(1, -1)
        X = cls._fit_length(X, cls.TARGET_SAMPLES)
        return X.astype(np.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # return the data and the label
        return self.data[idx]

In [17]:
def dynamic_collate(batch):
    """Collate `(tensor, label)` pairs into one stacked batch.

    When the global `resizing` is False, every tensor is right-padded with
    zeros along dimension 2 to the longest length in the batch. When it is
    True the tensors are stacked as they are.

    Returns
    -------
    (stacked, labels) : the stacked data tensor and a tensor of the labels.
    """
    items, targets = zip(*batch)
    items = list(items)

    # Longest extent along dimension 2 within this batch.
    longest = max(item.shape[2] for item in items)

    if resizing == False:
        batch_items = []
        for item in items:
            # total padding needed, >= 0
            deficit = longest - item.shape[2]
            # add zeros (silence) to match the rest of the batch, unless the
            # item is already at max length
            batch_items.append(F.pad(item, (0, deficit)) if deficit > 0 else item)
    else:
        # if resizing was true, we don't need to pad, everything is of the same shape
        batch_items = items

    # stack properly now that everything is padded
    stacked = torch.stack(batch_items, dim=0)
    label_tensor = torch.tensor(targets)

    return stacked, label_tensor

In [None]:
#mean = [0]
#std = [1]

# deal with this later
#
# we should also probably compute the mean and std manually instead of assuming they correctly
# normalized it, since this is the re-recorded dataset
train_transform = transforms.Compose([
    transforms.ToTensor()
    #transforms.Normalize(mean, std)
  ])
test_transform = transforms.Compose([transforms.ToTensor()])

####################################################
# <CHANGE ME> if you want to use different features!
####################################################
feature_type = "cqt"

####################################################
# <CHANGE ME> if you want to use resizing!
#
# We need to resize to, for example, (224, 224)
####################################################
resizing = True
dims_resize = (224, 224)

#model_type = "enet"
#model_type = "res"
#model_type = "LSTM"
#model_type = 'raw2' # very, very long time to train
#model_type = "res18"
model_type = 'enet1'

# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = None
RawNet2_args = None

if model_type == "LSTM":
    model = LSTMSpectrogram()
elif model_type == "enet":
    model =  EfficientNetSpectrogram("b0")
elif model_type == "res":
    model = ResNet50Spectrogram()
elif model_type == 'raw2':
    RawNet2_args = get_args()
    # just do this manually
    RawNet2_args.model['nb_classes'] = 2
    model = RawNet2(RawNet2_args.model)
elif model_type == 'res18':
    model = ResNet18Spectrogram()
elif model_type == 'enet1':
    model = EfficientNetSpectrogram("b1")
else:
    # Fail here with a clear message instead of at `model.to(device)` below.
    raise ValueError(f"Unknown model_type: {model_type!r}")

model = model.to(device)

#epochs = 100
epochs = 30
batch_size = 32
weight_decay = 5e-4
learning_rate = 0.0001

# The LSTM emits one logit per example (binary cross-entropy); every other
# model emits two class logits (cross-entropy). The LSTM and RawNet2 paths do
# not use resized inputs, so resizing is switched off before the datasets
# below are built.
criterion = nn.CrossEntropyLoss()
if model_type == "LSTM":
    resizing = False
    criterion = nn.BCEWithLogitsLoss()
elif model_type == "raw2":
    resizing = False


optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay = weight_decay)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max = epochs)

FoR_train_dataset = None
FoR_val_dataset = None
FoR_test_dataset = None

FoR_train_loader = None
FoR_val_loader = None
FoR_test_loader = None

# data loaders
if model_type == "raw2":
    # RawNet2 consumes fixed-length raw waveforms; the default collate is enough.
    FoR_train_dataset = RawnetWaveformDataset("train")
    FoR_val_dataset = RawnetWaveformDataset("validation")
    FoR_test_dataset = RawnetWaveformDataset("test")
    FoR_train_loader = DataLoader(FoR_train_dataset, batch_size=batch_size, shuffle=True)
    FoR_val_loader = DataLoader(FoR_val_dataset, batch_size=batch_size, shuffle=True)
    FoR_test_loader = DataLoader(FoR_test_dataset, batch_size=batch_size, shuffle=True)
else:
    # Spectrogram models: dynamic_collate pads within a batch when resizing is off.
    FoR_train_dataset = SpecDataset(feature_type, "train")
    FoR_val_dataset = SpecDataset(feature_type, "validation")
    FoR_test_dataset = SpecDataset(feature_type, "test")
    FoR_train_loader = DataLoader(FoR_train_dataset, batch_size=batch_size, shuffle=True, collate_fn=dynamic_collate)
    FoR_val_loader = DataLoader(FoR_val_dataset, batch_size=batch_size, shuffle=True, collate_fn=dynamic_collate)
    FoR_test_loader = DataLoader(FoR_test_dataset, batch_size=batch_size, shuffle=True, collate_fn=dynamic_collate)

In [None]:
# Equal error rate (EER) is one of the metrics we report.
def compute_EER(model, loader):
    """Return the equal error rate of `model` over every batch in `loader`.

    The model is put in eval mode and run without gradients. Scores are the
    sigmoid of the output when the module-level `model_type` is "LSTM",
    otherwise the softmax probability at class index 1. The EER is taken at
    the ROC point where FPR and FNR are closest, as the mean of the two.

    Relies on the module-level `device` and `model_type`.
    """
    model.eval()
    scores = []
    targets = []

    with torch.no_grad():
        for inputs, labels in loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            logits = model(inputs)
            if model_type == "LSTM":
                probs = torch.sigmoid(logits)
            else:
                # probability assigned to class index 1
                probs = torch.softmax(logits, dim=1)[:, 1]

            scores.extend(probs.cpu().numpy())
            targets.extend(labels.cpu().numpy())

    # sklearn sweeps the decision threshold for us
    fpr, tpr, _ = roc_curve(targets, scores)
    fnr = 1 - tpr

    # index of the operating point where the two error rates are closest
    closest = np.nanargmin(np.abs(fpr - fnr))
    return (fpr[closest] + fnr[closest]) / 2


In [None]:
def train(loader):
    """Run one optimisation pass over `loader` and return the mean per-batch loss.

    Uses the module-level `model`, `optimizer`, `criterion` and `device`.
    """
    model.train()
    batch_losses = []

    for inputs, labels in loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # standard step: clear grads, forward, backward, update
        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    # average over the number of batches, not the number of samples
    return sum(batch_losses) / len(loader)

In [None]:
def validate(loader):
    """Evaluate the module-level `model` on `loader` without gradient tracking.

    Returns:
        (mean per-batch loss, accuracy), where accuracy is the fraction of
        samples whose prediction equals its label.

    Predictions are `out > 0` for the "LSTM" model type and the argmax over
    dim 1 otherwise. Uses the module-level `criterion`, `device` and
    `model_type`.
    """
    model.eval()
    loss_sum = 0.0
    hits = 0
    seen = 0

    with torch.no_grad():
        for inputs, labels in loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            logits = model(inputs)
            loss_sum += criterion(logits, labels).item()

            if model_type == "LSTM":
                # a logit above zero corresponds to sigmoid(logit) > 0.5
                preds = (logits > 0).long()
            else:
                preds = logits.argmax(dim=1)

            hits += (preds == labels).sum().item()
            seen += labels.size(0)

    return loss_sum / len(loader), hits / seen
            

In [None]:
# Early stopping: give up after `patience` consecutive epochs without a new
# best validation loss (the reference paper uses patience = 5).
patience = 5
best_validation_loss = 10000.0
fail_count = 0

# per-epoch loss histories, plotted after training
training_losses, val_losses, test_losses = [], [], []

for epoch in tqdm(range(epochs)):
    training_loss = train(FoR_train_loader)
    training_losses.append(training_loss)
    print(f"[Epoch {epoch}] Training Loss: {training_loss}")

    validation_loss, val_accuracy = validate(FoR_val_loader)
    val_losses.append(validation_loss)
    print(f"[Epoch {epoch}] Validation Loss: {validation_loss} Accuracy: {val_accuracy}")

    # The test split is tracked for debugging only; it does not drive early stopping.
    test_loss, test_accuracy = validate(FoR_test_loader)
    test_losses.append(test_loss)
    print(f"[DEBUG Epoch {epoch}] Test Loss: {test_loss} Accuracy: {test_accuracy}")

    if validation_loss < best_validation_loss:
        best_validation_loss, fail_count = validation_loss, 0
    else:
        # one more epoch without improvement
        fail_count += 1

    if fail_count >= patience:
        print(f"Triggering early breaking on epoch {epoch}")
        break

    # the learning-rate schedule only advances when training continues
    scheduler.step()

In [None]:
# Final FoR metrics on the test split: EER first, then loss and accuracy.
FoR_test_EER = compute_EER(model, FoR_test_loader)
print(f"Test EER: {FoR_test_EER}")

test_loss, test_accuracy = validate(FoR_test_loader)
print(f"Testing loss: {test_loss} Accuracy: {test_accuracy}")

# Train-split EER for comparison: expected to be quite low, though there is
# obvious overfitting at the current settings.
FoR_train_EER = compute_EER(model, FoR_train_loader)
print(f"Train EER: {FoR_train_EER}")

In [None]:
# Overlay the three per-epoch loss histories from the FoR training run.
loss_curves = (
    (training_losses, "Training Loss", 'o'),
    (val_losses, "Validation Loss", 's'),
    (test_losses, "Test Loss", 'x'),
)
for history, curve_label, curve_marker in loss_curves:
    plt.plot(range(len(history)), history, label=curve_label, marker=curve_marker)

plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Loss vs epochs')
plt.legend()
plt.show()


# Process the entire ITW dataset

In [None]:
# Unpack the ITW archive into data/ unless the extracted folder is already present.
ITW_already_extracted = os.path.isdir("data/release_in_the_wild")
if not ITW_already_extracted:
    with zipfile.ZipFile('data/release_in_the_wild.zip') as itw_archive:
        itw_archive.extractall('data')

In [None]:
# Sort the flat ITW release into one folder per class, using the labels in meta.csv.
#
# Files are moved straight into their final `real` / `fake` folder, so this
# cell can be re-run safely: files that were already moved are skipped, and
# there is no trailing directory rename that fails once the target exists.
src_csv = 'data/release_in_the_wild/meta.csv'
ITW_root = 'data/release_in_the_wild'

# label value in meta.csv -> class folder name used by the rest of the notebook
ITW_label_dirs = {'bona-fide': 'real', 'spoof': 'fake'}

# Fold in label-named folders left behind by an earlier or interrupted run
# that sorted files into `bona-fide` / `spoof` before renaming them.
for legacy_label, class_name in ITW_label_dirs.items():
    legacy_dir = os.path.join(ITW_root, legacy_label)
    if not os.path.isdir(legacy_dir):
        continue
    class_dir = os.path.join(ITW_root, class_name)
    os.makedirs(class_dir, exist_ok=True)
    for entry in os.listdir(legacy_dir):
        shutil.move(os.path.join(legacy_dir, entry), os.path.join(class_dir, entry))
    os.rmdir(legacy_dir)

df = pd.read_csv(src_csv)

for _, row in df.iterrows():
    name = row['file']
    label = str(row['label'])

    src_path = os.path.join(ITW_root, name)
    # labels without a mapping keep their own name as the folder name
    dst_dir = os.path.join(ITW_root, ITW_label_dirs.get(label, label))
    dst_path = os.path.join(dst_dir, name)

    os.makedirs(dst_dir, exist_ok=True)

    # only files still sitting in the release root need to be moved
    if os.path.exists(src_path):
        shutil.move(src_path, dst_path)

In [None]:
def process_directory(directory, output_dir):
    """Compute and save spectrograms for every `.wav` file in `directory`.

    Each audio file is passed to `compute_spectrograms`, and the three arrays
    it returns are written to `output_dir` as `<stem>_cqt.npy`,
    `<stem>_log.npy` and `<stem>_mel.npy`. `output_dir` is created if needed.

    Note: a function with this name is also defined earlier in the notebook;
    this definition replaces it from this cell onwards.
    """
    os.makedirs(output_dir, exist_ok=True)

    for filename in tqdm(os.listdir(directory)):
        # anything that is not a .wav file is ignored
        if not filename.endswith('.wav'):
            continue

        audio_path = os.path.join(directory, filename)
        cqt, log, mel = compute_spectrograms(audio_path)

        # file name without its extension
        base_name, _ext = os.path.splitext(filename)

        # one .npy file per spectrogram type
        np.save(f"{output_dir}/{base_name}_cqt.npy", cqt)
        np.save(f"{output_dir}/{base_name}_log.npy", log)
        np.save(f"{output_dir}/{base_name}_mel.npy", mel)

# NOTE: this rebinds `data_dirs`, which earlier in the notebook held the FoR folders.
data_dirs = dict(
    ITWfull_real='data/release_in_the_wild/real',
    ITWfull_fake='data/release_in_the_wild/fake/',
)

# Spectrogram generation only runs when `compute_specs` is enabled.
if compute_specs:
    for set_name in data_dirs:
        directory = data_dirs[set_name]
        output_dir = f'data/spectrograms/{set_name}_spectrograms'
        process_directory(directory, output_dir)
        print(f"Processed {set_name} set.")


In [None]:
# Look at one ITW sample per feature type to check the saved array shapes.
ITW_display_cqt = "data/spectrograms/ITWfull_real_spectrograms/5_cqt.npy"
ITW_display_log = "data/spectrograms/ITWfull_real_spectrograms/5_log.npy"
ITW_display_mel = "data/spectrograms/ITWfull_real_spectrograms/5_mel.npy"

# Load from the ITW paths defined just above. The previous code read
# `display_cqt` / `display_log` / `display_mel`, which this cell never defines.
ITW_cqt_test = np.load(ITW_display_cqt)
ITW_log_test = np.load(ITW_display_log)
ITW_mel_test = np.load(ITW_display_mel)
print(ITW_cqt_test.shape)
print(ITW_log_test.shape)
print(ITW_mel_test.shape)

# for reference, should be the same as before
ITW_cqt_size = 84
ITW_log_size = 1025
ITW_mel_size = 128

# Evaluate the FoR-trained model on the ITW dataset

In [None]:
# Reuses settings chosen earlier in the notebook:
#   feature_type = "cqt"
#   resizing = True
#   dims_resize = (224, 224)

# Larger batches than during training; this loader is used for evaluation.
batch_size = 128

# RawNet2 uses the waveform dataset with the default collate; every other
# model uses spectrogram features batched through `dynamic_collate`.
ITW_uses_waveforms = model_type == 'raw2'

if ITW_uses_waveforms:
    ITW_full_dataset = RawnetWaveformDataset("ITWFull")
    ITW_loader_kwargs = {}
else:
    ITW_full_dataset = SpecDataset(feature_type, "ITWFull")
    ITW_loader_kwargs = {"collate_fn": dynamic_collate}

ITW_full_loader = DataLoader(
    ITW_full_dataset, batch_size=batch_size, shuffle=True, **ITW_loader_kwargs
)


In [None]:
# Score the current model on the whole ITW set: EER first, then loss and accuracy.
ITW_full_EER = compute_EER(model, ITW_full_loader)
print(f"ITW Test EER: {ITW_full_EER}")

test_loss, test_accuracy = validate(ITW_full_loader)
print(f"ITW Testing loss: {test_loss} Accuracy: {test_accuracy}")

# Transfer learning from FoR to ITW

In [None]:
# Most of the settings are kept the same as in the previous training run,
# because we continue with the same model.

from torch.utils.data import random_split

# 80% / 10% split sizes, truncated to whole samples; the test split takes
# whatever remains so the three sizes sum to the full dataset length.
ITW_n_samples = len(ITW_full_dataset)
train_size = int(0.8 * ITW_n_samples)
val_size = int(0.1 * ITW_n_samples)
test_size = ITW_n_samples - val_size - train_size

ITW_train, ITW_val, ITW_test = random_split(ITW_full_dataset, [train_size, val_size, test_size])

batch_size = 32
epochs = 30
weight_decay = 5e-4
learning_rate = 0.0001

# Fresh optimizer and schedule; the model weights are left untouched.
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay = weight_decay)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max = epochs)

# Match the collate choice of the other loaders in this notebook: the raw2
# loaders use DataLoader's default collate (collate_fn=None), while the
# spectrogram loaders use `dynamic_collate`.
ITW_collate_fn = None if model_type == 'raw2' else dynamic_collate

ITW_train_loader = DataLoader(ITW_train, batch_size=batch_size, shuffle=True, collate_fn=ITW_collate_fn)
ITW_val_loader = DataLoader(ITW_val, batch_size=batch_size, shuffle=True, collate_fn=ITW_collate_fn)
ITW_test_loader = DataLoader(ITW_test, batch_size=batch_size, shuffle=True, collate_fn=ITW_collate_fn)

# sanity check on the split sizes
print(len(ITW_train_loader.dataset))
print(len(ITW_val_loader.dataset))
print(len(ITW_test_loader.dataset))

In [None]:
# Transfer learning: the same early-stopping loop as before, now run on the ITW split.

patience = 5
best_validation_loss = 10000.0
fail_count = 0

# per-epoch loss histories (test loss is not tracked in this run)
TL_training_losses, TL_val_losses = [], []

print("Starting transfer learning from FoR dataset model to ITW")

for epoch in tqdm(range(epochs)):
    training_loss = train(ITW_train_loader)
    TL_training_losses.append(training_loss)
    print(f"[Epoch {epoch}] Training Loss: {training_loss}")

    validation_loss, val_accuracy = validate(ITW_val_loader)
    TL_val_losses.append(validation_loss)
    print(f"[Epoch {epoch}] Validation Loss: {validation_loss} Accuracy: {val_accuracy}")

    if validation_loss < best_validation_loss:
        best_validation_loss, fail_count = validation_loss, 0
    else:
        # one more epoch without improvement
        fail_count += 1

    if fail_count >= patience:
        print(f"Triggering early breaking on epoch {epoch}")
        break

    # the learning-rate schedule only advances when training continues
    scheduler.step()

In [None]:
# Held-out ITW test split after transfer learning: EER first, then loss and accuracy.
TL_test_EER = compute_EER(model, ITW_test_loader)
print(f"Transfer Learning ITW Test EER: {TL_test_EER}")

test_loss, test_accuracy = validate(ITW_test_loader)
print(f"Testing loss: {test_loss} Accuracy: {test_accuracy}")

# Pure ITW training

In [None]:
# We already have the ITW data loaders; rebuild the same model type with fresh
# weights and set up the same training procedure.

train_transform = transforms.Compose([
    transforms.ToTensor()
    #transforms.Normalize(mean, std)
  ])
test_transform = transforms.Compose([transforms.ToTensor()])

# Drop our reference to the previously trained model before clearing the CUDA cache.
model = None

torch.cuda.empty_cache()

# Same architecture as before, newly constructed. The branches mirror the
# model selection used for the FoR run, so 'raw2' and 'enet1' are handled too.
if model_type == "LSTM":
    model = LSTMSpectrogram()
elif model_type == "enet":
    model = EfficientNetSpectrogram("b0")
elif model_type == "res":
    model = ResNet50Spectrogram()
elif model_type == 'raw2':
    RawNet2_args = get_args()
    # just do this manually
    RawNet2_args.model['nb_classes'] = 2
    model = RawNet2(RawNet2_args.model)
elif model_type == 'res18':
    model = ResNet18Spectrogram()
elif model_type == 'enet1':
    model = EfficientNetSpectrogram("b1")
else:
    # Fail here with a clear message instead of on `None.to(device)` below.
    raise ValueError(f"Unsupported model_type for pure ITW training: {model_type!r}")

model = model.to(device)

#epochs = 100
epochs = 30
weight_decay = 5e-4
learning_rate = 0.0001

# The LSTM is trained with BCE-with-logits; every other model uses cross-entropy.
criterion = nn.CrossEntropyLoss()
if model_type == "LSTM":
    criterion = nn.BCEWithLogitsLoss()

# New optimizer and schedule bound to the new model's parameters.
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay = weight_decay)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max = epochs)

In [None]:
# Train on the ITW dataset with a newly initialised model.
#
# Same early-stopping loop as the earlier runs. The test split is not
# evaluated per epoch here: its result was never logged or stored, so the
# extra pass only cost time. Test metrics are computed once, after training.

patience = 5
best_validation_loss = 10000.0
fail_count = 0

# per-epoch loss histories for this run
ITW_training_losses = []
ITW_val_losses = []

print("Starting pure ITW training using the same model type (with weights cleared)")

for epoch in tqdm(range(epochs)):
    training_loss = train(ITW_train_loader)
    print(f"[Epoch {epoch}] Training Loss: {training_loss}")

    ITW_training_losses.append(training_loss)

    validation_loss, val_accuracy = validate(ITW_val_loader)
    print(f"[Epoch {epoch}] Validation Loss: {validation_loss} Accuracy: {val_accuracy}")

    ITW_val_losses.append(validation_loss)

    if validation_loss < best_validation_loss:
        best_validation_loss = validation_loss
        fail_count = 0
    else:
        # increment number of epochs of no improvement
        fail_count = fail_count + 1

    if fail_count >= patience:
        print(f"Triggering early breaking on epoch {epoch}")
        break

    # the learning-rate schedule only advances when training continues
    scheduler.step()

In [None]:
# Held-out ITW test split for the model trained on ITW only: EER, then loss and accuracy.
pure_ITW_test_EER = compute_EER(model, ITW_test_loader)
print(f"Pure ITW Test EER: {pure_ITW_test_EER}")

test_loss, test_accuracy = validate(ITW_test_loader)
print(f"Pure ITW Testing loss: {test_loss} Accuracy: {test_accuracy}")

# Benchmark the inference time of the model (should be the same for either training method)

In [None]:
# Time forward passes of the current model on one batch using CUDA events.
# This cell needs a CUDA device (it uses torch.cuda.Event / synchronize).

# get sample data
input_data, bench_labels = next(iter(ITW_full_loader))
input_data = input_data.to(device)

# Report the size of the batch that is actually timed. The global `batch_size`
# was rebound for the training loaders after ITW_full_loader was built, so it
# no longer describes this batch. Samples are counted from the labels, the
# same way validate() counts them.
bench_batch_size = bench_labels.size(0)

# Inference mode for layers such as dropout / batch-norm, whatever ran before.
model.eval()

# Warm-up passes so one-time start-up costs are not included in the timings.
with torch.no_grad():
    for _ in range(5):
        _ = model(input_data)
torch.cuda.synchronize()

n_bench_runs = 1000
run_times = []

with torch.no_grad():
    for _ in tqdm(range(n_bench_runs)):
        # make sure no earlier GPU work is still pending before timing starts
        torch.cuda.synchronize()

        start = torch.cuda.Event(enable_timing = True)
        end = torch.cuda.Event(enable_timing = True)

        start.record()
        _ = model(input_data)
        end.record()

        # wait for the forward pass to finish so both events have been recorded
        torch.cuda.synchronize()

        # elapsed_time reports milliseconds between the two events
        run_times.append(start.elapsed_time(end))

average_rtime = sum(run_times) / len(run_times)
print(f'Average run time for batch of size {bench_batch_size} on model {model_type} with features {feature_type}')
print(f'{average_rtime} ms')
print(f'Averages to {average_rtime / bench_batch_size} ms per input')