# SW중심대학 디지털 경진대회_SW와 생성AI의 만남 : AI부문
 - 이 AI 경진대회에서는 5초 분량의 오디오 샘플에서 진짜 사람 목소리와 AI가 생성한 가짜 목소리를 정확하게 구분할 수 있는 모델을 개발하는 것이 목표입니다.
 - 이 작업은 보안, 사기 감지 및 오디오 처리 기술 향상 등 다양한 분야에서 매우 중요합니다.

In [1]:
try:
    import torch
except ImportError:
    try:
        %conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
    except:
        %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

try:
    import librosa
except:
    try:
        %conda install -c conda-forge librosa
    except:
        %pip install librosa

try:
    import wespeaker
except ImportError:
    %pip install git+https://github.com/wenet-e2e/wespeaker.git

try:
    import huggingface_hub
except ImportError:
    %pip install huggingface_hub

## Imports
모델 학습 및 추론에 사용할 라이브러리들을 불러옵니다.

In [2]:
import os
import copy
import random

import librosa

import torch
import torchaudio

import numpy as np
import pandas as pd

from torch import nn
import torch.nn.functional as F
import torchaudio.transforms as T
import torchaudio.pipelines as pipelines
from torch.utils.data import Dataset, DataLoader

from huggingface_hub import hf_hub_download
import wespeaker

from tqdm.notebook import tqdm

### Check GPU Availability

In [3]:
# Shell escape: list the GPUs on this machine together with their current usage.
!nvidia-smi

Thu Jul 18 18:26:29 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.29.05    Driver Version: 495.29.05    CUDA Version: 11.5     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  On   | 00000000:04:00.0 Off |                    0 |
| N/A   40C    P0    28W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P100-PCIE...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   38C    P0    25W / 250W |      2MiB / 16280MiB |      0%      Default |
|       

In [4]:
# Index of the CUDA device to use (edit to pick another GPU, e.g. 0~7).
device_num = 0

if not torch.cuda.is_available():
    # No GPU available: fall back to the CPU and mark that with index -1.
    device_num = -1
    device = torch.device("cpu")
else:
    torch.cuda.set_device(device_num)
    device = torch.device("cuda")
print(f"INFO: Using device - {device}:{device_num}")

INFO: Using device - cuda:0


## Config
- 딥러닝 모델을 학습하기 전에 설정해야 하는 다양한 매개변수를 정의하는 설정 클래스입니다.
- 클래스를 사용하여 학습에 필요한 설정 값을 미리 지정합니다.

##### 오디오 신호
- 우리가 듣는 소리는 공기의 압력 변화로, 이것을 디지털 신호로 변환한 것이 오디오 신호입니다.
- 이 신호는 시간에 따라 변하는 진폭 값을 가지고 있습니다.

In [5]:
class Config:
    """Static settings shared by every cell of the notebook."""

    SEED = 20240719  # fixed random seed for reproducibility
    NB_NAME = "final"  # notebook name; also used as the cache sub-directory
    DATA_ROOT = os.path.join(".", "data")
    SAMPLE_RATE = 16000  # every clip is resampled to this rate when loaded
    AUDIO_LENGTH = 5.0  # audio length of the test domain
    BATCH_SIZE = 128
    LR = 1e-5

    @staticmethod
    def fix_seed(seed=SEED):
        """Seed every random number generator used here and make cuDNN deterministic.

        Args:
            seed: Integer applied to ``random``, ``PYTHONHASHSEED``, NumPy and
                PyTorch (CPU and all CUDA devices).
        """
        random.seed(seed)
        os.environ['PYTHONHASHSEED'] = str(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # covers every GPU, not only the current one
        torch.backends.cudnn.deterministic = True
        # benchmark=True lets cuDNN auto-tune and pick different kernels from
        # run to run, which defeats the determinism requested above.
        torch.backends.cudnn.benchmark = False

Config.fix_seed()  # fix the seed once at start-up

## Dataset

In [6]:
from torchvision.datasets import utils
from sklearn.model_selection import train_test_split as split

# Rebind the `tqdm` name inside torchvision's dataset utils to the notebook tqdm
# (presumably so the download progress bar renders as a notebook widget - verify).
utils.tqdm = tqdm


class VoiceDataset(Dataset):
    """Dataset of audio clips whose transformed tensors are cached on disk.

    On construction the archive is downloaded if it is missing, the clip list
    is read from ``<root>/<name>.csv`` (or built from the files found in
    ``<root>/<name>/`` when no CSV exists), every clip is pushed through the
    transform pipeline once, and the result is saved under ``cache_dir`` so
    that later runs load the tensors directly.
    """

    download_url = "https://drive.usercontent.google.com/download?id=1hi1dibkHyFbaxAteLlZJw6r3g9ddd4Lf&export=download&authuser=0&confirm=t&uuid=c40c278b-d74b-4b75-bc79-09e8a3ccffa4&at=APZUnTUvIVFVM9gjGNUCmDb4YZCy%3A1719807236671"

    # Dataset names; each one maps to "<name>.csv" or "<name>/" under the data root.
    Train = "train"
    Test = "test"
    Unlabeled = "unlabeled_data"

    cache_dir = os.path.join(".", "cache")

    def download(self, root='./data', filename="download.zip", md5=None):
        """Download and extract the archive unless ``root/filename`` already exists.

        Returns:
            ``root``, unchanged, so the caller can store it as the data root.
        """
        filepath = os.path.join(root, filename)
        if not os.path.exists(filepath):
            utils.download_and_extract_archive(self.download_url, root, root, filename, md5)
            print("Extraction completed.")
        else:
            print(f"File already exists in {filepath}")
        return root

    def load_from_source(self, train=True, train_size=1.0):
        """Return the dataframe of clips for the requested split.

        Args:
            train: Select the first (training) part of the split when true,
                otherwise the second (held-out) part.
            train_size: Fraction kept for training. With 1.0 or 0.0 no split
                is made and the full dataframe is returned for both parts.
        """
        random_state = 1  # fixed so the train/valid instances see the same split

        if os.path.isfile(self.get_dataset_path):
            df = pd.read_csv(self.get_dataset_path)
        else:
            # No CSV index: build one from the audio files in "<root>/<name>/".
            files = os.listdir(self.get_dataset_path.replace(".csv", ""))
            df = pd.DataFrame(dict(
                # Strip only the last extension. (The list must be passed to
                # join as ONE argument; unpacking it with * joined the
                # characters of the name instead, e.g. "abc" -> "a.b.c".)
                id=[".".join(file.split(".")[:-1]) for file in files],
                path=[f"./{self.name}/{file}" for file in files]
            ))

        if train_size == 1.0 or train_size == 0.0:
            df1, df2 = df, df
        elif 'label' in df.columns:
            df1, df2, _, _ = split(df, df['label'], test_size=float(1-train_size), random_state=random_state)
        else:
            df1, df2 = split(df, test_size=float(1-train_size), random_state=random_state)

        return df1 if train else df2

    @property
    def get_dataset_path(self):
        """Path of the CSV index of this dataset: ``<download_root>/<name>.csv``."""
        return os.path.join(self.download_root, self.name + ".csv")

    @property
    def submission_form_path(self):
        """Path of ``sample_submission.csv`` inside the download root."""
        return os.path.join(self.download_root, "sample_submission.csv")

    @classmethod
    def get_cache_file_path(cls, name):
        """Return ``<cache_dir>/<Config.NB_NAME>/<name>.dataset``."""
        return os.path.join(cls.cache_dir, Config.NB_NAME, name+".dataset")

    def __init__(self, root=".", name=None, train=True, train_size=1.0, transform=None):
        """Load the dataset ``name`` from cache, or build and cache it.

        Args:
            root: Directory holding (or receiving) the downloaded archive.
            name: One of ``Train``, ``Test`` or ``Unlabeled``.
            train: Which side of the split to use (see ``load_from_source``).
            train_size: Training fraction of the split.
            transform: A callable, or a list/tuple of callables, applied in
                order as ``items = transform(*items)`` starting from
                ``(waveform, label)``.
        """
        def convert_path(path):
            # "./train/x.ogg" -> "<DATA_ROOT>/train/x.ogg" using OS separators
            return os.path.join(Config.DATA_ROOT, *path.replace("./", "").split("/"))

        self.download_root = self.download(root)
        super().__init__()
        self.name = name

        if isinstance(transform, (list, tuple)):
            self.transforms = transform
        else:
            self.transforms = [transform] if transform else []

        self.raw_data = self.load_from_source(train, train_size)
        if 'label' in self.raw_data.columns:
            # Two-slot target ordered as (fake, real).
            self.label = [(0, 1) if lb == 'real' else (1, 0) for lb in self.raw_data['label'].tolist()]
        else:
            self.label = None
        self.path = list(map(convert_path, self.raw_data['path'].tolist()))
        self.data = self.path
        if self.name == self.Train and not train:
            # The held-out part of the training data gets its own cache file.
            self.name = "valid"

        # exist_ok avoids the check-then-create race of isdir() + mkdir().
        os.makedirs(os.path.join(self.cache_dir, Config.NB_NAME), exist_ok=True)
        cache_path = self.get_cache_file_path(self.name)
        if os.path.isfile(cache_path):
            # NOTE: torch.load unpickles the file - only load caches written by this notebook.
            self.data, self.label = torch.load(cache_path)
        else:
            labels = self.label if self.label else [None for _ in range(len(self.data))]
            new_data, new_label = [], []
            for source, label in zip(tqdm(self.data, desc=f"Loading {self.name} dataset"), labels):
                data, label = self.transform(self.load_audio(source), label)
                new_data.append(data)
                new_label.append(label)
            # Identity test: `None in new_label` would compare tensors with == None.
            has_missing_label = any(lb is None for lb in new_label)
            self.data, self.label = new_data, (None if has_missing_label else new_label)
            torch.save([self.data, self.label], cache_path)

        print(f"INFO: Dataset is loaded - Name: {self.name}, Size: {len(self.path)}")

    def transform(self, *items):
        """Run ``items`` through every configured transform and return the result."""
        for transform in self.transforms:
            items = transform(*items)
        return items

    @staticmethod
    def load_audio(path):
        """Load the audio file at ``path`` and resample it to ``Config.SAMPLE_RATE`` if needed."""
        waveform, sample_rate = torchaudio.load(path)
        if sample_rate != Config.SAMPLE_RATE:
            waveform = T.Resample(sample_rate, Config.SAMPLE_RATE)(waveform)
        return waveform

    @staticmethod
    def generate_silence(duration=Config.AUDIO_LENGTH, sample_rate=Config.SAMPLE_RATE):
        """Return an all-zero waveform of shape ``(1, int(duration * sample_rate))``."""
        num_samples = int(duration * sample_rate)
        silence = torch.zeros((1, num_samples))  # assume mono channel
        return silence

    @staticmethod
    def pad(waveform, pad_size):
        """Zero-pad ``waveform`` along dim 1 up to a target length.

        Args:
            waveform: Tensor whose dim 1 is padded.
            pad_size: Either the target length (all padding goes to the
                right) or a ``(left_pad, target_length)`` pair.
        """
        left_pad = 0
        if isinstance(pad_size, (list, tuple)):
            left_pad, pad_size = pad_size
        right_pad = pad_size - waveform.size(1) - left_pad
        return F.pad(waveform, (int(left_pad), int(right_pad)))

    @staticmethod
    def trim(waveform, target_size):
        """Return a random crop of ``target_size`` samples along dim 1.

        Waveforms that are already short enough are returned unchanged.
        """
        current_length = waveform.size(1)
        if current_length <= target_size:
            return waveform
        start_point = random.randint(0, current_length - target_size)
        # The same window is taken from every channel.
        return waveform[:, start_point:start_point+target_size]

    def augmented(self, *args, **kwargs):
        """Return an augmented copy of this dataset (see the module-level ``augmented``)."""
        return augmented(self, *args, **kwargs)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(data, label)`` for labelled datasets, otherwise just ``data``."""
        if self.label:
            return self.data[index], self.label[index]
        return self.data[index]

In [7]:
import torchaudio.compliance.kaldi as kaldi

def compute_fbank(sample_rate=Config.SAMPLE_RATE, num_mel_bins=80, frame_length=25, frame_shift=10, cmn=True):
    """Build a transform that converts a waveform into Kaldi filter-bank features.

    Args:
        sample_rate: Passed to ``kaldi.fbank`` as ``sample_frequency``.
        num_mel_bins, frame_length, frame_shift: Forwarded to ``kaldi.fbank``.
        cmn: When true, subtract the mean over dim 0 from the features.

    Returns:
        ``convert(waveform, *args)``, which returns ``(features, *args)`` so
        that extra items such as a label pass through untouched.
    """
    fbank_options = dict(
        num_mel_bins=num_mel_bins,
        frame_length=frame_length,
        frame_shift=frame_shift,
        sample_frequency=sample_rate,
    )

    def convert(waveform, *args):
        features = kaldi.fbank(waveform.float(), **fbank_options)
        if cmn:
            features = features - torch.mean(features, 0)
        return (features, *args)

    return convert

In [8]:
class ResNet152(nn.Module):
    """Wrapper around a pretrained WeSpeaker ResNet152 model, used as an embedding extractor.

    The class body runs when the class is defined: it fetches the model files
    from the Hugging Face hub and renames them inside the download directory.
    """

    model_id = "Wespeaker/wespeaker-voxceleb-resnet152-LM"
    # Base file name inside the hub repo: "voxceleb_resnet152_LM".
    model_name = model_id.replace("Wespeaker/wespeaker-", "").replace("-", "_")

    # Download the .onnx file and strip its file name to get the local directory.
    root_dir = hf_hub_download(model_id, filename=model_name+".onnx").replace(model_name+".onnx", "")
    # Rename the checkpoint and config to the names used with load_model_local below
    # (presumably the file names wespeaker expects in a local model dir - verify).
    if not os.path.isfile(root_dir+"avg_model.pt"):
        os.rename(hf_hub_download(model_id, filename=model_name+".pt"), root_dir+"avg_model.pt")
    if not os.path.isfile(root_dir+"config.yaml"):
        os.rename(hf_hub_download(model_id, filename=model_name+".yaml"), root_dir+"config.yaml")

    def __init__(self):
        super().__init__()
        self.sample_rate = 16000
        self.embedding_dim = 256
        # Keep only the underlying network of the loaded WeSpeaker object.
        self.pretrained = wespeaker.load_model_local(self.root_dir).model

    def forward(self, features):
        """Return the last output of the pretrained network for ``features``."""
        *_, embedding = self.pretrained(features)
        return embedding

    def to_embedding(self, *args):
        """Transform step: replace ``args[0]`` with its embedding and pass the rest through.

        ``args[0]`` is moved to the global ``device`` and given a leading
        batch dimension; the embedding is returned on the CPU without that
        dimension. No gradients are tracked.
        """
        with torch.no_grad():
            embedding = self(args[0].to(device).unsqueeze(0))
            return embedding.squeeze(0).cpu(), *args[1:]

In [9]:
# Transform steps, applied in this order to every clip when a dataset is built.
to_filter_bank = compute_fbank()
to_embedding = ResNet152().to(device).to_embedding


def to_tensor(*args):
    """Return ``args`` with its last element (the label) converted to a tensor."""
    return (*args[:-1], torch.tensor(args[-1]))


# Labelled data: an 80/20 split of the training CSV into train and valid parts.
train_set = VoiceDataset(
    root=Config.DATA_ROOT,
    name=VoiceDataset.Train,
    train=True,
    train_size=0.8,
    transform=[to_filter_bank, to_embedding, to_tensor],
)
valid_set = VoiceDataset(
    root=Config.DATA_ROOT,
    name=VoiceDataset.Train,
    train=False,
    train_size=0.8,
    transform=[to_filter_bank, to_embedding, to_tensor],
)
# No labels here, so the label-to-tensor step is left out.
unlabeled_set = VoiceDataset(
    root=Config.DATA_ROOT,
    name=VoiceDataset.Unlabeled,
    transform=[to_filter_bank, to_embedding],
)
test_set = VoiceDataset(
    root=Config.DATA_ROOT,
    name=VoiceDataset.Test,
    transform=[to_filter_bank, to_embedding],
)



File already exists in ./data/download.zip
INFO: Dataset is loaded - Name: train, Size: 44350
File already exists in ./data/download.zip
INFO: Dataset is loaded - Name: valid, Size: 11088
File already exists in ./data/download.zip
INFO: Dataset is loaded - Name: unlabeled_data, Size: 1264
File already exists in ./data/download.zip
INFO: Dataset is loaded - Name: test, Size: 50000


In [10]:
# Sanity check: print the first training sample (data, its shape, its label),
# then display the dataframe behind the training split.
print("INFO: Query Dataset for checking:", train_set[0][0], train_set[0][0].shape, train_set[0][1])
train_set.raw_data

INFO: Query Dataset for checking: tensor([-1.0575e-01,  1.2401e-02, -3.2578e-02, -7.2848e-02,  1.6257e-01,
         3.0282e-02, -2.7707e-02, -5.2055e-02,  2.2901e-02, -1.4740e-01,
         8.9606e-02,  2.3146e-01, -8.3596e-02, -3.3466e-02,  5.4436e-02,
        -6.8190e-02,  2.7634e-02,  1.0542e-01,  1.2255e-01,  2.4392e-02,
         1.8459e-02,  2.3491e-02,  6.8233e-02,  1.0311e-01,  3.1676e-02,
         8.1166e-02,  1.3044e-01, -9.2186e-02, -1.4744e-01, -1.7171e-01,
        -9.9712e-03,  2.5658e-02, -9.2734e-02, -2.0453e-02, -5.3845e-03,
         1.4175e-01,  4.0326e-02,  1.3282e-01, -1.1752e-01,  4.9214e-02,
         8.6155e-02,  8.1838e-02,  6.2421e-02,  9.9082e-02,  1.6571e-02,
        -1.0959e-01,  2.5400e-02,  1.7021e-02,  1.5112e-02,  1.4608e-01,
        -4.1715e-02, -1.3312e-01,  3.5552e-02,  1.7474e-02, -1.8467e-01,
         6.7727e-02,  5.9093e-03, -5.9369e-02,  1.0620e-01,  2.3001e-01,
        -1.3601e-01, -5.0523e-02,  5.6959e-02,  9.2574e-02, -6.0151e-02,
         1.4229e-

Unnamed: 0,id,path,label
19535,NQJUDUMG,./train/NQJUDUMG.ogg,fake
37414,SGACBBDI,./train/SGACBBDI.ogg,fake
40645,SIBSFMAP,./train/SIBSFMAP.ogg,fake
16487,LLBQPFAD,./train/LLBQPFAD.ogg,real
954,ZWYRTAOF,./train/ZWYRTAOF.ogg,real
...,...,...,...
50057,BDFFJCBX,./train/BDFFJCBX.ogg,fake
32511,NEFSVUCS,./train/NEFSVUCS.ogg,real
5192,MJFGSHIR,./train/MJFGSHIR.ogg,fake
12172,USIDOXOR,./train/USIDOXOR.ogg,real


### Data Augmentation

In [11]:
def augmented(dataset, amount=1/4, duration=Config.AUDIO_LENGTH):  # augmentation for (1, 1), (0, 0) samples
    """Return a copy of ``dataset`` extended with synthetic (1, 1) and (0, 0) samples.

    * (1, 1): one fake clip and one real clip are each cropped and padded to
      the target length at a random offset, then averaged.
    * (0, 0): pure silence of the target length.

    The result is cached per dataset name and ``amount``. A cache written
    before the real-clip sampling fix below still holds fake+fake mixes and
    must be deleted to be regenerated.

    Args:
        dataset: A labelled ``VoiceDataset``; it is deep-copied, not modified.
        amount: Fraction of the fake / real clips used for (1, 1) mixes. The
            number of silent samples is ``len(fakes) / 100 * amount``.
        duration: Length of the generated audio, in the unit of
            ``Config.AUDIO_LENGTH``.

    Raises:
        NotImplementedError: If ``dataset`` has no labels.
    """
    if not dataset.label:
        raise NotImplementedError("ERROR: Augmentation is not available for unlabeled dataset.")

    dataset = copy.deepcopy(dataset)
    tensor_path = dataset.get_cache_file_path(dataset.name+f"_augmented{amount}")

    if os.path.isfile(tensor_path):
        dataset.data, dataset.label = torch.load(tensor_path)
        print(f"INFO: Augmented dataset({amount}) of {dataset.name} is loaded - Size: {len(dataset.path)}")
        return dataset

    waveform_length = int(duration * Config.SAMPLE_RATE)  # fit to test domain audio length

    fakes = [path for path, label in zip(dataset.path, dataset.label) if label.tolist() == [1, 0]]
    reals = [path for path, label in zip(dataset.path, dataset.label) if label.tolist() == [0, 1]]

    def random_placement(waveform):
        # (left_pad, target_length) pair understood by VoiceDataset.pad
        return random.randint(0, waveform_length - waveform.size(1)), waveform_length

    def append(sample):
        dataset.data.append(sample[0])
        dataset.label.append(sample[1])

    # (1, 1) - [fake 1, real 1]
    fake_picks = random.sample(fakes, int(len(fakes)*amount))
    # The real half must be drawn from `reals`; it used to be drawn from `fakes`,
    # which labelled fake+fake mixes as containing a real voice.
    real_picks = random.sample(reals, int(len(reals)*amount))
    for fake, real in zip(fake_picks, tqdm(real_picks, desc="Fake 1, Real 1")):
        fake_wave = dataset.trim(dataset.load_audio(fake), waveform_length)
        real_wave = dataset.trim(dataset.load_audio(real), waveform_length)
        fake_wave = dataset.pad(fake_wave, random_placement(fake_wave))
        real_wave = dataset.pad(real_wave, random_placement(real_wave))
        append(dataset.transform((fake_wave + real_wave) / 2, (1, 1)))

    # (0, 0) - [fake 0, real 0]
    for _ in tqdm(range(int(len(fakes)/100*amount)), desc="Fake 0, Real 0"):
        append(dataset.transform(dataset.generate_silence(duration=duration), (0, 0)))

    torch.save([dataset.data, dataset.label], tensor_path)
    print(f"INFO: Augmented dataset({amount}) of {dataset.name} is saved to", tensor_path)
    return dataset

In [12]:
# (1, 1), (0, 0) label augmentation for 1/4 of train dataset
# The validation split is augmented with the same ratio.
train_aug = train_set.augmented(1/4)
valid_aug = valid_set.augmented(1/4)

INFO: Augmented dataset(0.25) of train is loaded - Size: 44350
INFO: Augmented dataset(0.25) of valid is loaded - Size: 11088


## DataLoader
    - DataLoader는 구축된 데이터셋에서 배치크기(batch_size)에 맞게 데이터를 추출하고, 필요에 따라 섞거나(shuffle=True) 순서대로 반환(shuffle=False)하는 역할을 합니다.
    - 훈련 데이터(train_loader)는 일반적으로 섞어서 모델이 데이터에 덜 편향되게 학습하도록 하며,
      검증 데이터(val_loader)는 모델 성능 평가를 위해 순서대로 사용하고,
      테스트 데이터(test_loader)는 최종적인 추론을 위해 사용합니다.

    이렇게 DataLoader를 사용함으로써, 효율적인 데이터 처리와 모델 학습 및 평가가 가능해집니다.

In [67]:
def infinite_loader(data_loader, batch_size=Config.BATCH_SIZE):
    """Yield full-sized batches from ``data_loader`` forever, restarting it when exhausted.

    Batches whose sample count differs from ``batch_size`` (normally the
    final, shorter batch) are skipped so consumers always get the same shape.

    Args:
        data_loader: Re-iterable source of batches. A batch may be a single
            tensor or a list/tuple such as ``(inputs, labels)``.
        batch_size: Required number of samples in every yielded batch.

    Yields:
        Each batch that holds exactly ``batch_size`` samples, unchanged.

    Raises:
        ValueError: If a complete pass over ``data_loader`` contains no batch
            of the requested size (the generator would otherwise spin forever).
    """
    while True:
        produced = False
        for data in data_loader:
            # For (inputs, labels) batches len(data) is the number of fields,
            # not the number of samples, so measure the first field instead.
            sample_count = len(data[0]) if isinstance(data, (list, tuple)) else len(data)
            if sample_count != batch_size:
                continue
            produced = True
            yield data
        if not produced:
            raise ValueError(f"data_loader produced no batch with {batch_size} samples")

In [68]:
BATCH_SIZE = Config.BATCH_SIZE


def _make_loader(dataset, shuffle):
    """Wrap ``dataset`` in a DataLoader that uses the notebook-wide batch size."""
    return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=shuffle)


# Only the training loaders shuffle; every other loader keeps the dataset order.
train_loader = _make_loader(train_set, shuffle=True)
tr_aug_loader = _make_loader(train_aug, shuffle=True)
valid_loader = _make_loader(valid_set, shuffle=False)
val_aug_loader = _make_loader(valid_aug, shuffle=False)
unlabeled_loader = _make_loader(unlabeled_set, shuffle=False)
test_loader = _make_loader(test_set, shuffle=False)

## Define Noise Model

##### DomainDifferenceLearner: 학습 도메인과 테스트 도메인의 오디오 특징을 추출하고, 그 차이를 학습합니다.
##### NoiseGenerator: 도메인 차이를 입력으로 받아 그에 해당하는 노이즈를 생성합니다.
##### DomainDiscriminator: 생성된 노이즈가 추가된 학습 도메인 오디오와 실제 테스트 도메인 오디오를 구분하려 합니다.

학습 과정:
- 도메인 차이를 학습합니다.
- 학습된 차이를 바탕으로 노이즈를 생성합니다.
- 생성된 노이즈를 학습 도메인 오디오에 추가합니다.
- 판별자를 통해 생성된 노이지 오디오가 테스트 도메인과 유사해지도록 학습합니다.
- 도메인 차이 보존 손실을 통해 생성된 노이즈가 실제 도메인 차이를 반영하도록 합니다.

In [126]:
class DomainDifferenceLearner(nn.Module):
    """1-D convolutional encoder mapping a ``(batch, channel, length)`` input to a latent vector."""

    def __init__(self, input_channel, latent_size, **kwargs):
        """Build the encoder; unused keyword arguments are accepted and ignored."""
        super().__init__()
        conv_stack = [
            nn.Conv1d(input_channel, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Conv1d(64, 128, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Conv1d(128, 256, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
        ]
        self.encoder = nn.Sequential(*conv_stack)
        self.fc = nn.Linear(256, latent_size)

    def forward(self, x):
        """Encode ``x``, average over the length axis, and project to ``latent_size``."""
        encoded = self.encoder(x)
        pooled = encoded.mean(dim=2)
        return self.fc(pooled)

In [None]:
class NoiseGenerator(nn.Module):
    """MLP that maps a latent vector to a noise vector bounded to [-1, 1] by tanh."""

    def __init__(self, latent_size, output_size, **kwargs):
        """Build the MLP; unused keyword arguments are accepted and ignored."""
        super().__init__()
        widths = [latent_size, 256, 512, 1024]
        layers = []
        # Hidden stack: Linear + LeakyReLU for each consecutive width pair.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.LeakyReLU(0.2))
        layers.append(nn.Linear(1024, output_size))
        layers.append(nn.Tanh())
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the generated noise for latent input ``x``."""
        noise = self.model(x)
        return noise

In [128]:
from torch.nn.utils import spectral_norm

class DomainDiscriminator(nn.Module):
    """Spectrally-normalised 1-D CNN that scores a ``(batch, channel, length)`` input.

    The final layer is linear, so the output is an unbounded score of shape
    ``(batch, 1)`` - not a probability.
    """

    def __init__(self, input_channel, input_length=256, **kwargs):
        """Build the discriminator.

        Args:
            input_channel: Number of channels of the input signal.
            input_length: Length of the input signal. It fixes the size of
                the flattened feature vector; the default of 256 reproduces
                the previously hard-coded ``256 ** 2 // 8`` features.
            **kwargs: Accepted and ignored, so a shared parameter dict can be
                passed to several models.
        """
        super().__init__()
        # Each conv (kernel 3, stride 2, padding 1) maps length L to (L - 1) // 2 + 1.
        length = input_length
        for _ in range(3):
            length = (length - 1) // 2 + 1
        self.model = nn.Sequential(
            spectral_norm(nn.Conv1d(input_channel, 64, kernel_size=3, stride=2, padding=1)),
            nn.LeakyReLU(0.2),
            spectral_norm(nn.Conv1d(64, 128, kernel_size=3, stride=2, padding=1)),
            nn.LeakyReLU(0.2),
            spectral_norm(nn.Conv1d(128, 256, kernel_size=3, stride=2, padding=1)),
            nn.LeakyReLU(0.2),
            nn.Flatten(),
            spectral_norm(nn.Linear(256 * length, 1))
        )

    def forward(self, x):
        """Return the raw domain score for ``x``."""
        return self.model(x)

In [137]:
# Constructor arguments shared by the three noise-model networks; each network
# takes the keys it needs and ignores the others through **kwargs.
model_params = {
    "input_channel": 1,
    "latent_size": 128,
    "output_size": 256,
}
model_params

{'input_channel': 1, 'latent_size': 128, 'output_size': 256}

In [138]:
# Build the domain-difference encoder and move it to the selected device.
domain_learner = DomainDifferenceLearner(**model_params)
domain_learner.to(device)

DomainDifferenceLearner(
  (encoder): Sequential(
    (0): Conv1d(1, 64, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): ReLU()
    (2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Conv1d(64, 128, kernel_size=(3,), stride=(2,), padding=(1,))
    (4): ReLU()
    (5): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Conv1d(128, 256, kernel_size=(3,), stride=(2,), padding=(1,))
    (7): ReLU()
  )
  (fc): Linear(in_features=256, out_features=128, bias=True)
)

In [139]:
# Build the noise generator and move it to the selected device.
noise_generator = NoiseGenerator(**model_params)
noise_generator.to(device)

NoiseGenerator(
  (model): Sequential(
    (0): Linear(in_features=128, out_features=1024, bias=True)
    (1): ReLU()
    (2): Linear(in_features=1024, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=256, bias=True)
    (5): Tanh()
  )
)

In [140]:
# Build the domain discriminator and move it to the selected device.
domain_discriminator = DomainDiscriminator(**model_params)
domain_discriminator.to(device)

DomainDiscriminator(
  (model): Sequential(
    (0): Conv1d(1, 64, kernel_size=(3,), stride=(2,), padding=(1,))
    (1): LeakyReLU(negative_slope=0.2)
    (2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Conv1d(64, 128, kernel_size=(3,), stride=(2,), padding=(1,))
    (4): LeakyReLU(negative_slope=0.2)
    (5): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Conv1d(128, 256, kernel_size=(3,), stride=(2,), padding=(1,))
    (7): LeakyReLU(negative_slope=0.2)
    (8): Flatten(start_dim=1, end_dim=-1)
    (9): Linear(in_features=8192, out_features=1, bias=True)
    (10): Sigmoid()
  )
)

In [141]:
# DomainDiscriminator ends in a linear layer and therefore emits raw scores.
# Plain BCELoss rejects inputs outside [0, 1]; the logits variant applies the
# sigmoid internally and is the matching binary cross-entropy.
criterion_gan = nn.BCEWithLogitsLoss()
criterion_diff = nn.MSELoss()

# One optimizer per network; the discriminator uses the largest learning rate.
optimizer_learn = torch.optim.Adam(domain_learner.parameters(), lr=0.0001)
optimizer_gen = torch.optim.Adam(noise_generator.parameters(), lr=0.0002)
optimizer_disc = torch.optim.Adam(domain_discriminator.parameters(), lr=0.0004)

def hinge_loss_d(real_output, fake_output):
    """Discriminator hinge loss: mean(relu(1 - real)) + mean(relu(1 + fake))."""
    real_term = F.relu(1 - real_output).mean()
    fake_term = F.relu(1 + fake_output).mean()
    return real_term + fake_term

def hinge_loss_g(fake_output):
    """Generator hinge loss: the negated mean discriminator score of the generated samples."""
    mean_score = torch.mean(fake_output)
    return -mean_score

### Training

In [142]:
def compute_gradient_penalty(discriminator, real_samples, fake_samples):
    """Return the gradient penalty ``mean((||grad D(x_hat)||_2 - 1) ** 2)``.

    ``x_hat`` is a random per-sample interpolation between ``real_samples``
    and ``fake_samples``.

    Args:
        discriminator: Callable mapping a batch of samples to scores.
        real_samples: Batch of real samples; dim 0 is the batch axis.
        fake_samples: Batch of generated samples with the same shape.

    Returns:
        Scalar tensor that stays attached to the graph (``create_graph=True``)
        so it can be back-propagated through.
    """
    batch_size = real_samples.size(0)
    # One mixing coefficient per sample, broadcast over all remaining axes and
    # created on the samples' own device instead of a global one.
    alpha_shape = (batch_size,) + (1,) * (real_samples.dim() - 1)
    alpha = torch.rand(alpha_shape, device=real_samples.device)
    interpolates = (alpha * real_samples + (1 - alpha) * fake_samples).requires_grad_(True)
    d_interpolates = discriminator(interpolates)
    gradients = torch.autograd.grad(outputs=d_interpolates, inputs=interpolates,
                                    grad_outputs=torch.ones_like(d_interpolates),
                                    create_graph=True, retain_graph=True)[0]
    # Flatten first: the norm must cover the whole gradient of each sample.
    # Taking it over dim=1 alone only reduces the channel axis.
    gradients = gradients.reshape(batch_size, -1)
    gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean()
    return gradient_penalty

In [143]:
num_epochs = 200
log_interval = 5  # end the status line with a newline every `log_interval` epochs

train_domain_loader = tr_aug_loader
test_domain_loader = infinite_loader(unlabeled_loader)
# Progress totals must describe the loader that is actually iterated.
steps_per_epoch = len(train_domain_loader)

epochs = tqdm(range(1, num_epochs+1), desc="Running Epochs")
train_progress = tqdm(total=steps_per_epoch, desc="Training")
for epoch in epochs:
    train_progress.reset(total=steps_per_epoch)
    losses = [[], []]  # [discriminator losses, generator losses] of this epoch

    for (step, (train_audio, _)), test_audio in zip(enumerate(train_domain_loader), test_domain_loader):
        # Add a channel axis: (batch, length) -> (batch, 1, length).
        train_audio, test_audio = train_audio.unsqueeze(1).to(device), test_audio.unsqueeze(1).to(device)
        if train_audio.size(0) != test_audio.size(0):
            # The last, shorter training batch cannot be paired element-wise.
            continue

        # Learning domain difference between train and test
        optimizer_learn.zero_grad()
        train_features = domain_learner(train_audio)
        test_features = domain_learner(test_audio)
        domain_diff = test_features - train_features

        # Generate noise
        generated_noise = noise_generator(domain_diff)
        noisy_train_audio = train_audio + generated_noise.unsqueeze(1)

        # Train discriminator
        # The discriminator emits raw, unbounded scores (no sigmoid), so the
        # hinge objective is applied to them directly. The input is detached
        # so this step does not update the generator or the learner.
        optimizer_disc.zero_grad()
        real_output = domain_discriminator(test_audio)
        fake_output = domain_discriminator(noisy_train_audio.detach())
        loss_d = hinge_loss_d(real_output, fake_output)
        losses[0].append(loss_d.item())
        loss_d.backward()
        optimizer_disc.step()

        # Train domain difference learner and noise generator
        optimizer_gen.zero_grad()
        optimizer_learn.zero_grad()

        fake_output = domain_discriminator(noisy_train_audio)

        # Domain difference retention loss
        noisy_train_features = domain_learner(noisy_train_audio)
        loss_diff = criterion_diff(noisy_train_features, test_features)

        # put weights on the domain diff retention loss
        loss_g = hinge_loss_g(fake_output) + 10 * loss_diff
        losses[1].append(loss_g.item())
        # This also leaves gradients on the discriminator; they are cleared by
        # optimizer_disc.zero_grad() at the start of the next step.
        loss_g.backward()
        optimizer_gen.step()
        optimizer_learn.step()

        train_progress.update(1)
        print(f"\rEpoch [{epoch}/{num_epochs}], Step [{step}/{steps_per_epoch}], Loss_D: {np.mean(losses[0]):.6f}, Loss_G: {np.mean(losses[1]):.6f}", end="   ")

    print(f"\rEpoch [{epoch}/{num_epochs}], Step [{step}/{steps_per_epoch}], Loss_D: {np.mean(losses[0]):.6f}, Loss_G: {np.mean(losses[1]):.6f}", end="\n" if epoch % log_interval == 0 else "   ")

Running Epochs:   0%|          | 0/100 [00:00<?, ?it/s]

Training:   0%|          | 0/347 [00:00<?, ?it/s]

Epoch [5/100], Step [346/347], Loss_D: 18.525682, Loss_G: 42.871819   
Epoch [10/100], Step [346/347], Loss_D: 0.000016, Loss_G: 57.158884   
Epoch [15/100], Step [346/347], Loss_D: 0.000004, Loss_G: 56.693651   
Epoch [20/100], Step [346/347], Loss_D: 0.000002, Loss_G: 56.326333   
Epoch [25/100], Step [346/347], Loss_D: 0.000001, Loss_G: 55.752930   
Epoch [28/100], Step [184/347], Loss_D: 0.000000, Loss_G: 55.099751   

1. 판별자 손실 (Loss_D):
- 이상적으로는 약 0.5 주변으로 수렴해야 합니다.
- 0.5에 가까워진다는 것은 판별자가 실제 테스트 도메인 오디오와 노이즈가 추가된 학습 도메인 오디오를 구분하기 어려워한다는 의미입니다.
- 만약 이 값이 0에 가까워지면 판별자가 너무 강해진 것이고, 1에 가까워지면 너무 약해진 것입니다.

2. 생성기 손실 (Loss_G):
- 초기에는 높은 값을 가지다가 점차 감소하는 경향을 보여야 합니다.
- 하지만 0으로 수렴하지는 않아야 합니다. 너무 낮아지면 모드 붕괴(mode collapse)가 일어날 수 있습니다.

3. 도메인 차이 보존 손실 (Loss_diff):
- 이 값도 점차 감소해야 하지만, 완전히 0이 되어서는 안 됩니다.
- 0에 가까워진다는 것은 생성된 노이즈가 도메인 간 차이를 잘 포착하고 있다는 의미입니다.

4. GAN 손실 (Loss_gan):
- 이 값은 생성기가 판별자를 얼마나 잘 속이는지를 나타냅니다.
- 학습이 진행됨에 따라 이 값이 점차 감소해야 하지만, 너무 빠르게 0에 가까워지면 안 됩니다.

##### 이상적인 학습 과정은 다음과 같습니다:
- 초기에는 모든 손실 값들이 높습니다.
- 학습이 진행됨에 따라 Loss_D는 0.5에 수렴합니다.
- Loss_G, Loss_diff, Loss_gan은 점차 감소하지만, 어느 정도 수준에서 안정화됩니다.
- 손실 값들이 큰 변동 없이 안정적으로 유지됩니다.

구체적인 수치는 데이터셋과 모델 구조에 따라 다를 수 있지만, 대략적인 가이드라인은 다음과 같습니다:
- Loss_D: 0.4 ~ 0.6 사이
- Loss_G: 초기 값의 10~20% 수준으로 감소 후 안정화
- Loss_diff: 초기 값의 5~10% 수준으로 감소 후 안정화
- Loss_gan: 2~3 정도로 안정화

이러한 경향을 보이면서 동시에 생성된 노이즈가 추가된 학습 도메인 오디오가 실제로 테스트 도메인과 유사해지는지를 주관적으로 평가해보는 것도 중요합니다. 오디오 샘플을 직접 들어보거나 스펙트로그램을 시각적으로 비교해보는 것이 도움이 될 수 있습니다.

또한, 학습된 모델을 사용하여 실제 태스크(예: 가짜 음성 탐지)의 성능이 향상되는지를 확인하는 것도 모델의 성공을 판단하는 중요한 지표가 될 수 있습니다.

In [None]:
# Freeze layer
# An optimizer has no .parameters() method; the weights to freeze belong to
# the noise generator module itself.
for param in noise_generator.parameters():
    param.requires_grad = False

In [None]:
def generate_noise(size):
    """Sample Gaussian input of shape ``size`` and run it through the noise generator.

    When ``noise_generator`` is not defined (the cell that creates it was not
    run), zeros of shape ``size`` are used instead. The result is moved to
    the global ``device``. No gradients are tracked.
    """
    latent = torch.randn(size)
    with torch.no_grad():
        try:
            generated = noise_generator(latent.to(device))
        except NameError:
            generated = torch.zeros(size)
    return generated.to(device)

## Define Detection Model

In [None]:
from sklearn.metrics import roc_auc_score

def multi_label_auc(y_true, y_scores):
    """Return the mean of the per-column ROC AUC scores.

    Args:
        y_true: 2-D array of ground-truth labels, one column per label.
        y_scores: 2-D array of predicted scores with the same layout.
    """
    per_label_auc = [
        roc_auc_score(y_true[:, column], y_scores[:, column])
        for column in range(y_true.shape[1])
    ]
    return np.mean(per_label_auc)

In [None]:
def state_manager(num_epochs, log_interval=5):
    """Create an epoch progress bar and a status-line updater that remembers past values.

    Args:
        num_epochs: Total number of epochs, used for the bar and the status line.
        log_interval: The status line is ended with a newline every
            ``log_interval`` epochs and on the last epoch.

    Returns:
        ``(epochs, update_state)``: a tqdm iterator over ``1..num_epochs`` and
        the updater function.
    """
    # Last known value of each field, in the order of `new_state` below.
    cache = [0.0 for _ in range(10)]

    def update_state(
            progress, epoch=None, step=None, steps=None, train_loss=None,
            domain_acc=None, domain_loss=None, valid_acc=None, valid_loss=None, cross_acc=None, cross_loss=None
    ):
        """Advance ``progress`` by one and reprint the status line.

        Fields left as ``None`` keep their previous value. Returns a
        ``result()`` callable that finishes the line and returns the current
        cross-validation accuracy.
        """
        new_state = [epoch, step, steps, train_loss, domain_acc, domain_loss, valid_acc, valid_loss, cross_acc, cross_loss]
        for i, state in enumerate(new_state):
            # Compare with None instead of testing truthiness, so legitimate
            # zeros (step 0, an accuracy of 0.0) are recorded, not dropped.
            if state is not None:
                cache[i] = state
        epoch, step, steps, train_loss, domain_acc, domain_loss, valid_acc, valid_loss, cross_acc, cross_loss = cache
        progress.update(1)
        print(f"\rEpoch [{epoch}/{num_epochs}], Step [{step}/{steps}], Loss: {train_loss:.6f}, Domain: {domain_acc:.6%} | {domain_loss:.6f}, "
            + f"Valid: {valid_acc:.6%} | {valid_loss:.6f}, Cross Valid: {cross_acc:.6%} | {cross_loss:.6f}", end="   ")

        def result():
            print(end="\n" if epoch % log_interval == 0 or epoch == num_epochs else "")
            return cross_acc
        return result

    return tqdm(range(1, num_epochs+1), desc="Running Epochs"), update_state

### 1. Feature Extraction Layer (ResNet-like structure)

In [None]:
class AttentionLayer(nn.Module):
    """Rescale each feature by a learned weight, normalised with softmax across features."""

    def __init__(self, feature_dim):
        super().__init__()
        self.feature_dim = feature_dim
        # One learnable score per feature, shared by every sample.
        self.attention_weights = nn.Parameter(torch.randn(feature_dim))

    def forward(self, x):
        """Return ``x`` multiplied element-wise by the softmax of the learned scores."""
        gate = torch.softmax(self.attention_weights, dim=0)
        return x * gate

In [None]:
class FeatureExtractor(nn.Module):
    """MLP encoder with linear skip connections, followed by feature attention.

    Four stages reduce the width ``embedding_size -> hidden_size ->
    hidden_size/2 -> hidden_size/4 -> hidden_size/8``. Each stage adds a
    linear projection of its input to the output of its main block. A final
    linear layer maps to ``latent_size``.
    """

    def __init__(self, embedding_size, hidden_size, latent_size):
        super().__init__()

        def dense_block(in_features, out_features):
            # Main branch of one stage: Linear -> LeakyReLU -> Dropout -> BatchNorm.
            return nn.Sequential(
                nn.Linear(in_features, out_features),
                nn.LeakyReLU(0.01),
                nn.Dropout(0.5),
                nn.BatchNorm1d(out_features)
            )

        half = hidden_size//2
        quarter = hidden_size//4
        eighth = hidden_size//8

        # Encoder with skip connections
        self.encoder_block1 = dense_block(embedding_size, hidden_size)
        self.skip1 = nn.Linear(embedding_size, hidden_size)

        self.encoder_block2 = dense_block(hidden_size, half)
        self.skip2 = nn.Linear(hidden_size, half)

        self.encoder_block3 = dense_block(half, quarter)
        self.skip3 = nn.Linear(half, quarter)

        self.encoder_block4 = dense_block(quarter, eighth)
        self.skip4 = nn.Linear(quarter, eighth)

        self.final_encoder = nn.Sequential(
            nn.Linear(eighth, latent_size),
            nn.LeakyReLU(0.01)
        )

        self.attention = AttentionLayer(latent_size)

    def forward(self, x):
        """Return the attention-weighted latent features of ``x``."""
        stages = (
            (self.encoder_block1, self.skip1),
            (self.encoder_block2, self.skip2),
            (self.encoder_block3, self.skip3),
            (self.encoder_block4, self.skip4),
        )
        hidden = x
        for block, shortcut in stages:
            hidden = block(hidden) + shortcut(hidden)
        return self.attention(self.final_encoder(hidden))

### 2. Domain Adaptation Layer

In [None]:
class DomainAdaptationLayer(nn.Module):
    """Two-layer MLP that maps features to a vector of the same size."""

    def __init__(self, feature_dim):
        super().__init__()
        layers = (
            nn.Linear(feature_dim, feature_dim),
            nn.ReLU(),
            nn.Linear(feature_dim, feature_dim),
        )
        self.feature_map = nn.Sequential(*layers)

    def forward(self, x):
        """Return the mapped features for ``x``."""
        mapped = self.feature_map(x)
        return mapped

### 3. Adversarial Domain Classification Layer

In [None]:
class AdversarialDomainClassifier(nn.Module):
    """Small MLP that outputs one raw domain score per sample."""

    def __init__(self, feature_dim, hidden_dim=100):
        super().__init__()
        layers = [
            nn.Linear(feature_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),  # output => domain 0(train), 1(test)
        ]
        self.classifier = nn.Sequential(*layers)

    def forward(self, x):
        """Return the domain score of shape ``(batch, 1)`` for ``x``."""
        domain_score = self.classifier(x)
        return domain_score

### 4. Uncertainty-Aware Classification Layer

In [None]:
class UncertaintyAwareClassifier(nn.Module):
    """Two-class linear head that also flags high-entropy predictions.

    Args:
        feature_dim: Size of the last dimension of the input features.
    """

    def __init__(self, feature_dim):
        super().__init__()
        self.classifier = nn.Linear(feature_dim, 2)
        # NOTE(review): stored as a Parameter, but it is only used inside a
        # ``>`` comparison in ``forward``, which passes no gradient back to it.
        self.uncertainty_threshold = nn.Parameter(torch.tensor([0.5]))

    def forward(self, features):
        """Classify ``features`` and mark samples whose entropy is too high.

        Args:
            features: Batch of feature vectors, one row per sample.

        Returns:
            A tuple ``(probabilities, uncertain)``: the softmax over the two
            classes, and a float column (one value per sample) that is 1.0
            where the prediction entropy exceeds ``uncertainty_threshold``
            and 0.0 elsewhere.
        """
        logits = self.classifier(features)
        probabilities = F.softmax(logits, dim=1)

        # Take the log from log_softmax rather than log(softmax): a saturated
        # softmax yields an exact 0, and 0 * log(0) would make the entropy NaN.
        log_probabilities = F.log_softmax(logits, dim=1)
        uncertainty = -(probabilities * log_probabilities).sum(dim=1)
        uncertain = (uncertainty > self.uncertainty_threshold).float()

        return probabilities, uncertain.unsqueeze(1)

### 5. Total Fake & Real Voice Detection Model

In [None]:
class VoiceDetector(nn.Module):
    """Detector combining an encoder, a domain branch and an uncertainty-aware head.

    Args:
        embedding_dim: Passed to ``FeatureExtractor`` as ``embedding_size``.
        hidden_size: Passed to ``FeatureExtractor`` as ``hidden_size``.
        latent_size: Encoder output width; also the input width of every head.
    """

    def __init__(self, embedding_dim, hidden_size, latent_size):
        super().__init__()
        encoder_kwargs = dict(
            embedding_size=embedding_dim,
            hidden_size=hidden_size,
            latent_size=latent_size,
        )
        self.encoder = FeatureExtractor(**encoder_kwargs)
        self.domain_adapter = DomainAdaptationLayer(latent_size)
        self.domain_classifier = AdversarialDomainClassifier(latent_size)
        self.classifier = UncertaintyAwareClassifier(feature_dim=latent_size)

    def forward(self, embedding):
        """Run the detector on a batch of embeddings.

        Returns:
            A 3-tuple ``(scores, domain_output, uncertain)``: the sigmoid of
            the label head's output, the domain classifier's raw output on the
            adapted features, and the label head's uncertainty flag.
        """
        latent = self.encoder(embedding)
        domain_features = self.domain_adapter(latent)

        # The label head reads the encoder output directly; only the domain
        # classifier sees the adapted features.
        class_output, uncertain = self.classifier(latent)

        # NOTE(review): the label head already returns softmax probabilities,
        # so this sigmoid is applied on top of values in [0, 1] - confirm that
        # stacking both activations is intended.
        scores = F.sigmoid(class_output)
        domain_output = self.domain_classifier(domain_features)
        return scores, domain_output, uncertain

In [None]:
class SimpleVoiceDetector(nn.Module):
    """Encoder followed by a single two-output linear head.

    Args:
        embedding_dim: Passed to ``FeatureExtractor`` as ``embedding_size``.
        hidden_size: Passed to ``FeatureExtractor`` as ``hidden_size``.
        latent_size: Encoder output width and input width of the linear head.
    """

    def __init__(self, embedding_dim, hidden_size, latent_size):
        super().__init__()
        encoder_kwargs = dict(
            embedding_size=embedding_dim,
            hidden_size=hidden_size,
            latent_size=latent_size,
        )
        self.encoder = FeatureExtractor(**encoder_kwargs)
        self.classifier = nn.Linear(latent_size, 2)

    def forward(self, embedding):
        """Return ``(scores, None, None)`` for a batch of embeddings.

        The two ``None`` placeholders keep the return value a 3-tuple, the
        same length ``VoiceDetector.forward`` returns.
        """
        logits = self.classifier(self.encoder(embedding))
        scores = F.sigmoid(logits)
        return scores, None, None

In [None]:
# Constructor arguments shared by VoiceDetector and SimpleVoiceDetector.
model_params = {
    "embedding_dim": 256,   # forwarded to FeatureExtractor as embedding_size
    "hidden_size": 1024,    # forwarded to FeatureExtractor as hidden_size
    "latent_size": 128,     # encoder output width / classifier input width
}
model_params

In [None]:
# Build the detector with the domain branch and uncertainty head, then move it
# to the target device (the trailing expression also displays the module).
complex_model = VoiceDetector(**model_params)
complex_model.to(device)

In [None]:
# Build the encoder + linear-head detector with the same hyper-parameters and
# move it to the target device (the trailing expression also displays the module).
simple_model = SimpleVoiceDetector(**model_params)
simple_model.to(device)

In [None]:
# Label loss: BCELoss takes probabilities, and both detectors return
# sigmoid-activated scores as their first output.
criterion = nn.BCELoss()
# Domain loss: applied to the domain classifier's raw (un-activated) output.
domain_criterion = nn.BCEWithLogitsLoss()
# NOTE(review): not referenced by the training cells that follow in this
# notebook section - confirm whether it is still needed.
uncertainty_criterion = nn.BCEWithLogitsLoss()

In [None]:
# Select the model that the following cells train, save and run inference
# with, and bind an Adam optimizer to that model's parameters.
model = simple_model
optimizer = torch.optim.Adam(params=model.parameters(), lr=Config.LR)

### Training & Validation

#### Simple Model

In [None]:
# Train `model` on the augmented training loader and, after every epoch,
# evaluate it on the augmented validation loader and on `valid_loader`
# (reported as "cross" metrics).
num_epochs = 15
epochs, update_state = state_manager(num_epochs, log_interval=5)

performance = 0
tr_ldr, val_ldr, crv_ldr = tr_aug_loader, val_aug_loader, valid_loader
tr_len, val_len, crv_len = len(tr_ldr), len(val_ldr), len(crv_ldr)

with tqdm(total=tr_len, desc="Training") as tr_pgrs, \
        tqdm(total=val_len, desc="Validation") as val_pgrs, \
        tqdm(total=crv_len, desc="Cross Validation") as crv_pgrs:
    for epoch in epochs:
        # Rewind all three progress bars at the start of each epoch.
        for bar, bar_total in zip((tr_pgrs, val_pgrs, crv_pgrs), (tr_len, val_len, crv_len)):
            bar.reset(total=bar_total)

        # ---- Train ----
        model.train()
        for step, batch in enumerate(tr_ldr):
            optimizer.zero_grad()

            features, labels = (tensor.float().to(device) for tensor in batch)
            # Perturb the inputs with generated noise before the forward pass.
            noisy_features = features + generate_noise(features.size())
            outputs, *_ = model(noisy_features)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            update_state(tr_pgrs, epoch, step, tr_len, loss.item())

        # ---- Validation, then cross-domain validation ----
        model.eval()
        eval_stages = (
            (val_ldr, val_pgrs, False),
            (crv_ldr, crv_pgrs, True),
        )
        with torch.no_grad():
            for loader, progress, cross in eval_stages:
                batch_losses, seen_labels, seen_outputs = [], [], []

                for batch in loader:
                    features, labels = (tensor.float().to(device) for tensor in batch)
                    predicted, *_ = model(features)

                    batch_losses.append(criterion(predicted, labels).item())
                    seen_labels.append(labels.cpu().numpy())
                    seen_outputs.append(predicted.cpu().numpy())

                    # Running metrics over every batch seen so far in this stage.
                    running_auc = multi_label_auc(
                        np.concatenate(seen_labels, axis=0),
                        np.concatenate(seen_outputs, axis=0),
                    )
                    running_loss = np.mean(batch_losses)

                    if not cross:
                        update_state(progress, valid_acc=running_auc, valid_loss=running_loss)
                    else:
                        # Keep the value returned for the cross stage; it is
                        # called once the epoch is finished.
                        result = update_state(progress, cross_acc=running_auc, cross_loss=running_loss)

        performance = result()

#### Complex Model

In [None]:
# Train the complex detector: label loss on the augmented training loader plus
# a domain-classification loss that contrasts training batches (domain 0) with
# batches from the unlabeled loader (domain 1). After every epoch the model is
# evaluated on the augmented validation loader and on `valid_loader` ("cross").
num_epochs = 15
epochs, update_state = state_manager(num_epochs, log_interval=5)

# This loop needs a real domain output, which only VoiceDetector provides
# (SimpleVoiceDetector returns None there), so bind the complex model and give
# it its own optimizer instead of relying on whatever `model` currently is.
model = complex_model
optimizer = torch.optim.Adam(params=model.parameters(), lr=Config.LR)

performance = 0
# `tr_aug_loader` is a loader object (the simple-model cell takes len() of it
# and iterates it directly), so it must not be called here.
loaders = (tr_aug_loader, unlabeled_loader, val_aug_loader, valid_loader)
lens = list(map(len, loaders))
# Index 1 is the unlabeled loader, which has no progress bar of its own.
tr_len, val_len, crv_len = lens[0], lens[2], lens[3]

with tqdm(total=tr_len, desc="Training") as tr_pgrs, \
        tqdm(total=val_len, desc="Validation") as val_pgrs, \
        tqdm(total=crv_len, desc="Cross Validation") as crv_pgrs:
    for epoch in epochs:
        # Rewind each bar to the length of the loader it actually tracks.
        for bar, bar_total in zip((tr_pgrs, val_pgrs, crv_pgrs), (tr_len, val_len, crv_len)):
            bar.reset(total=bar_total)

        # Train & Domain Adapt
        model.train()
        for step, (train_inputs, adpt_inputs) in enumerate(zip(loaders[0], infinite_loader(loaders[1]))):
            optimizer.zero_grad()

            features, labels = (data.float().to(device) for data in train_inputs)
            # The model returns (scores, domain_output, uncertain).
            outputs, train_domain, _ = model(features)
            _, test_domain, _ = model(adpt_inputs.float().to(device))

            domain_outputs = torch.cat([train_domain, test_domain])
            # Domain targets: 0 for training batches, 1 for unlabeled batches.
            domain_labels = torch.cat([torch.zeros_like(train_domain), torch.ones_like(test_domain)])

            loss = criterion(outputs, labels)
            # NOTE(review): the domain loss is simply added to the label loss
            # and no gradient reversal is visible in the model, so this trains
            # the domain classifier to be right rather than fooling it -
            # confirm this is the intended objective.
            domain_loss = domain_criterion(domain_outputs, domain_labels)

            # The domain head emits one logit per sample, so threshold its
            # sigmoid; a softmax over that single column would always be 1.
            domain_pred = torch.sigmoid(domain_outputs) >= 0.5
            domain_acc = (domain_pred == domain_labels.bool()).float().mean().item()

            (loss + domain_loss).backward()
            optimizer.step()

            update_state(tr_pgrs, epoch, step, tr_len, loss.item(), domain_acc, domain_loss.item())

        # Cross Domain Validation
        model.eval()
        with torch.no_grad():
            for loader, progress, cross in zip(loaders[-2:], (val_pgrs, crv_pgrs), (False, True)):
                val_loss, val_labels, val_outputs = [], [], []

                for inputs in loader:
                    features, labels = (data.float().to(device) for data in inputs)
                    # Only the label scores are evaluated here.
                    predicted, *_ = model(features)

                    val_loss.append(criterion(predicted, labels).item())
                    val_labels.append(labels.cpu().numpy())
                    val_outputs.append(predicted.cpu().numpy())
                    # Running metric over every batch seen so far in this stage.
                    val_acc = multi_label_auc(np.concatenate(val_labels, axis=0), np.concatenate(val_outputs, axis=0))

                    if cross:
                        # Keep the value returned for the cross stage; it is
                        # called once the epoch is finished.
                        result = update_state(progress, cross_acc=val_acc, cross_loss=np.mean(val_loss))
                    else:
                        update_state(progress, valid_acc=val_acc, valid_loss=np.mean(val_loss))

        performance = result()

### Model Save

In [None]:
# Make sure the output folder exists, then store the weights of the current
# `model` under a file name that embeds the notebook name and the last score.
models_dir = os.path.join(".", "models")
os.makedirs(models_dir, exist_ok=True)

# Model Save
save_path = os.path.join(models_dir, f"{Config.NB_NAME}_acc_{performance*100:.6f}.pt")
torch.save(model.state_dict(), save_path)
print(f"Model saved to {save_path}")

## Inference
테스트 데이터셋에 대한 추론은 다음 순서로 진행됩니다.

1. 모델 및 디바이스 설정
    - 모델을 주어진 device(GPU 또는 CPU)로 이동시키고, 평가모드로 전환합니다.
2. 예측 수행
    - 예측 결과를 저장할 빈 리스트(predicted_labels)를 초기화하고 test_loader에서 배치별로 데이터를 불러와 예측을 수행합니다.
    - 각 배치에 대해 입력 특징(features) 데이터를 device로 이동시킵니다.
    - 모델 예측 확률(probs)을 계산합니다.
    - 예측 확률을 predicted_labels 리스트에 추가합니다.

In [None]:
# Collect the model's first output (the label scores) for every test batch.
predicted_labels = []

model.to(device)
model.eval()
with torch.no_grad():
    for features in tqdm(test_loader):
        # Cast to float32 exactly as the training and validation loops do, so
        # inference feeds the model the same dtype it was trained on.
        probs, *_ = model(features.float().to(device))
        # No detach() needed: nothing tracks gradients under no_grad().
        predicted_labels.extend(probs.cpu().tolist())

### Submission
추론 결과를 제출 양식에 덮어 씌워 CSV 파일로 생성하는 과정은 다음과 같습니다.

1. 제출 양식 로드
    - pd.read_csv(test_set.submission_form_path)를 사용하여 제출을 위한 샘플 형식 파일을 로드합니다.
    - 이 파일은 일반적으로 각 테스트 샘플에 대한 ID와 예측해야 하는 필드가 포함된 템플릿 형태를 가지고 있습니다.
2. 예측 결과 할당
    - submit.iloc[:, 1:] = predicted_labels 구문으로, 추론 단계에서 모은 예측 결과(predicted_labels)를 샘플 제출 파일의 2번째 열부터 할당합니다.
3. 제출 파일 저장
    - 수정된 제출 파일을 submissions 폴더 아래에 `{Config.NB_NAME}_acc_{성능}_submit.csv` 형식의 이름을 가진 CSV 파일로 저장합니다.
    - index=False는 파일 저장시 추가적인 index가 발생하지 않도록 설정하여, 제작한 제출 파일과 동일한 형태의 파일을 저장합니다.

In [None]:
# Load the submission template and overwrite every column after the first
# with the collected predictions, then preview the first rows.
# NOTE(review): assumes predicted_labels has one row per template row and one
# value per prediction column - confirm against the template.
submit = pd.read_csv(test_set.submission_form_path)
submit.iloc[:, 1:] = predicted_labels
submit.head()

In [None]:
# Make sure the output folder exists, then write the filled-in template to a
# CSV whose name embeds the notebook name and the last score.
submission_dir = "submissions"
os.makedirs(submission_dir, exist_ok=True)

submit_file_name = f"{Config.NB_NAME}_acc_{performance*100:.6f}_submit.csv"
submit_file_path = os.path.join(".", submission_dir, submit_file_name)
submit.to_csv(submit_file_path, index=False)
print("File saved to", submit_file_path)