In [15]:
import os
import pathlib

import librosa
import torch
import torchaudio
import numpy as np

import sys
sys.path.append('..')

In [2]:
home = pathlib.Path.home()
WAVEFAKE=os.path.abspath(os.path.join(home, "Desktop", "deep-truth", "wavefake"))


REAL_SAMPLES=os.path.join(home, "ml-sandbox", "VoID", "data", "test")

In [5]:
# # TEST - will replace with torchaudio dataset
# # collect real audio samples

# samples = []
# for name in ["aman", "imran", "labib"]:
#     directory = os.path.join(REAL_SAMPLES, name)
#     for file_name in os.listdir(directory):
#         audio_path = os.path.join(directory, file_name)
#         audio, sr = librosa.load(audio_path, sr=16000)
#         samples.append(audio)
     
# samples

[array([0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 3.0423456e-05,
        3.1125106e-05, 0.0000000e+00], dtype=float32),
 array([0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 7.6964752e-05,
        1.0224331e-04, 0.0000000e+00], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 4.0697127e-05,
        3.3644861e-05, 3.1134885e-05], dtype=float32),
 array([ 0.        ,  0.        ,  0.        , ..., -0.00025719,
        -0.00047243,  0.        ], dtype=float32),
 array([ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
        -8.4769024e-05, -1.0100049e-04,  0.0000000e+00], dtype=float32),
 array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -3.20843938e-05, -1.10914925e-05,  0.00000000e+00], dtype=float32),
 array([ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
        -3.0254517e-07,  7.2804220e-08,  0.0000000e+00], dtype=float32),
 array([0.0000000e+00, 0.0000000e+00, 

In [7]:
from models.RawNet3 import RawNet3, Bottle2neck


In [8]:
model = RawNet3(
        Bottle2neck,
        model_scale=8,
        context=True,
        summed=True,
        encoder_type="ECA",
        nOut=256,
        out_bn=False,
        sinc_stride=10,
        log_sinc=True,
        norm_sinc="mean",
        grad_mult=1,
    )

self.encoder_type ECA


In [12]:
model.load_state_dict(
        torch.load(
            "./../model.pt",
            map_location=lambda storage, loc: storage,
        )["model"]
    )

<All keys matched successfully>

In [34]:
def extract_emb(sample, n_samples=16000*3, n_segments=10):
    
    if (len(sample) < n_samples):  # RawNet3 was trained using utterances of 3 seconds
        shortage = n_samples - len(sample) + 1
        sample = np.pad(sample, (0, shortage), "wrap")
    
    audios = []
    startframe = np.linspace(0, len(audio) - n_samples, num=n_segments)
    for asf in startframe:
        audios.append(audio[int(asf) : int(asf) + n_samples])

    audios = torch.from_numpy(np.stack(audios, axis=0).astype(np.float32))
    with torch.no_grad():
        output = model(audios)

    return output.mean(0)

In [35]:
wavefake_sample_file = os.path.join(WAVEFAKE, "ljspeech_hifiGAN", "LJ001-0001_generated.wav")

wavefake_audio, sr = librosa.load(wavefake_sample_file, sr=16000)
fake_emb = extract_emb(wavefake_audio)
fake_emb

tensor([-0.0059,  0.0090, -0.0023,  0.0081,  0.0023,  0.0036,  0.0049, -0.0045,
         0.0051,  0.0072,  0.0104,  0.0033,  0.0137, -0.0234, -0.0188, -0.0025,
        -0.0001,  0.0047,  0.0019, -0.0075,  0.0012, -0.0042, -0.0053,  0.0049,
         0.0055,  0.0121,  0.0043, -0.0116, -0.0047, -0.0013,  0.0094,  0.0050,
        -0.0027, -0.0159,  0.0099,  0.0012, -0.0027, -0.0038, -0.0107,  0.0043,
         0.0021,  0.0142, -0.0028,  0.0063,  0.0062,  0.0098,  0.0036,  0.0124,
        -0.0005, -0.0039,  0.0123, -0.0004, -0.0064, -0.0090, -0.0076,  0.0012,
        -0.0114,  0.0104, -0.0081, -0.0004,  0.0025,  0.0135,  0.0069,  0.0007,
        -0.0011,  0.0016,  0.0184,  0.0016, -0.0028, -0.0030,  0.0008, -0.0058,
         0.0059, -0.0177,  0.0050,  0.0172, -0.0198,  0.0137, -0.0146, -0.0078,
        -0.0040,  0.0049, -0.0060, -0.0079, -0.0098, -0.0013,  0.0005,  0.0041,
        -0.0016, -0.0067,  0.0085, -0.0050,  0.0024,  0.0058,  0.0057,  0.0184,
         0.0060,  0.0049, -0.0006,  0.00

In [None]:
samples = []
cos = torch.nn.CosineSimilarity(dim=0)

for name in ["aman", "imran", "labib"]:
    directory = os.path.join(REAL_SAMPLES, name)
    cosines = []
    for file_name in os.listdir(directory):
        audio_path = os.path.join(directory, file_name)
        audio, sr = librosa.load(audio_path, sr=16000)
        
        emb = extract_emb(audio)
        cosines.append(cos(fake_emb, emb))
    samples.append(cosines)

In [None]:
samples