In [15]:
import os
import pathlib

import librosa
import torch
import torchaudio
import numpy as np

import sys
sys.path.append('..')

In [2]:
home = pathlib.Path.home()
WAVEFAKE=os.path.abspath(os.path.join(home, "Desktop", "deep-truth", "wavefake"))


REAL_SAMPLES=os.path.join(home, "ml-sandbox", "VoID", "data", "test")

In [5]:
# # TEST - will replace with torchaudio dataset
# # collect real audio samples

# samples = []
# for name in ["aman", "imran", "labib"]:
#     directory = os.path.join(REAL_SAMPLES, name)
#     for file_name in os.listdir(directory):
#         audio_path = os.path.join(directory, file_name)
#         audio, sr = librosa.load(audio_path, sr=16000)
#         samples.append(audio)
     
# samples

[array([0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 3.0423456e-05,
        3.1125106e-05, 0.0000000e+00], dtype=float32),
 array([0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 7.6964752e-05,
        1.0224331e-04, 0.0000000e+00], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 4.0697127e-05,
        3.3644861e-05, 3.1134885e-05], dtype=float32),
 array([ 0.        ,  0.        ,  0.        , ..., -0.00025719,
        -0.00047243,  0.        ], dtype=float32),
 array([ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
        -8.4769024e-05, -1.0100049e-04,  0.0000000e+00], dtype=float32),
 array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -3.20843938e-05, -1.10914925e-05,  0.00000000e+00], dtype=float32),
 array([ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
        -3.0254517e-07,  7.2804220e-08,  0.0000000e+00], dtype=float32),
 array([0.0000000e+00, 0.0000000e+00, 

In [7]:
from models.RawNet3 import RawNet3, Bottle2neck


In [8]:
model = RawNet3(
        Bottle2neck,
        model_scale=8,
        context=True,
        summed=True,
        encoder_type="ECA",
        nOut=256,
        out_bn=False,
        sinc_stride=10,
        log_sinc=True,
        norm_sinc="mean",
        grad_mult=1,
    )

self.encoder_type ECA


In [12]:
model.load_state_dict(
        torch.load(
            "./../model.pt",
            map_location=lambda storage, loc: storage,
        )["model"]
    )

<All keys matched successfully>

In [38]:
def extract_emb(sample, n_samples=16000*3, n_segments=10):
    
    if (len(sample) < n_samples):  # RawNet3 was trained using utterances of 3 seconds
        shortage = n_samples - len(sample) + 1
        sample = np.pad(sample, (0, shortage), "wrap")
    
    audios = []
    startframe = np.linspace(0, len(audio) - n_samples, num=n_segments)
    for asf in startframe:
        audios.append(audio[int(asf) : int(asf) + n_samples])

    audios = torch.from_numpy(np.stack(audios, axis=0).astype(np.float32))
    with torch.no_grad():
        output = model(audios)

    return output

In [39]:
wavefake_sample_file = os.path.join(WAVEFAKE, "ljspeech_hifiGAN", "LJ001-0001_generated.wav")

wavefake_audio, sr = librosa.load(wavefake_sample_file, sr=16000)
fake_emb = extract_emb(wavefake_audio)
fake_emb

tensor([[-7.2845e-01,  2.2579e+00, -1.2156e+00,  ...,  4.2831e+00,
         -4.1521e+00, -4.7248e-02],
        [-7.4800e-01,  1.1806e+00, -2.1779e+00,  ...,  2.8731e+00,
         -2.9903e-01, -4.0673e-01],
        [-7.9645e-01, -1.0566e+00, -2.1581e+00,  ...,  8.0725e-01,
          3.5630e-01,  1.1956e+00],
        ...,
        [-1.8625e-03,  2.2452e-01,  2.0662e+00,  ..., -1.3752e+00,
          3.9430e-01,  4.5133e-01],
        [ 6.3450e-01,  4.2701e-01,  1.9718e+00,  ..., -2.2553e+00,
          7.9145e-01, -1.7063e+00],
        [ 4.4545e-01, -1.5625e+00,  3.0931e+00,  ..., -2.9432e+00,
          1.5092e+00, -2.3301e-01]])

In [40]:
samples = []
cos = torch.nn.CosineSimilarity(dim=0)

for name in ["aman", "imran", "labib"]:
    directory = os.path.join(REAL_SAMPLES, name)
    cosines = []
    for file_name in os.listdir(directory):
        audio_path = os.path.join(directory, file_name)
        audio, sr = librosa.load(audio_path, sr=16000)
        
        emb = extract_emb(audio)
        cosines.append(cos(fake_emb, emb))
    samples.append(cosines)

In [41]:
samples

[[tensor([-3.5104e-01, -1.7040e-01,  1.3801e-01,  5.8419e-01, -5.0171e-01,
           2.1875e-02,  2.6102e-01, -4.2739e-01,  2.9262e-01, -2.4115e-01,
          -9.4932e-02,  4.8480e-01,  1.6765e-01,  2.9199e-02,  4.2101e-01,
          -1.3490e-01, -1.4011e-01, -1.9987e-01, -1.8042e-01, -1.3434e-01,
          -2.3580e-01, -7.3422e-01,  2.3322e-01, -2.8766e-01, -5.3511e-01,
          -5.0447e-01,  1.3577e-02, -3.4032e-01, -3.8511e-01, -1.8369e-01,
          -1.0590e-01, -1.3838e-01, -2.3891e-01,  1.7330e-01,  3.6154e-01,
           3.0827e-01, -2.6135e-01, -7.8445e-02,  2.2490e-01,  3.6892e-01,
           2.8057e-01,  3.7353e-01,  3.4899e-01,  8.5300e-02,  1.3796e-01,
           8.4831e-01,  3.1702e-02,  3.8199e-02,  6.2207e-02,  4.2292e-01,
          -3.5365e-01, -4.1782e-01,  1.8043e-01,  3.6767e-01,  1.5265e-01,
          -3.0629e-01, -2.1072e-01, -7.5936e-03, -6.5997e-02,  4.9129e-01,
          -4.9975e-01,  4.5391e-01,  1.7270e-02, -2.2489e-01, -4.3863e-01,
          -3.7687e-01, -2