# SW중심대학 디지털 경진대회_SW와 생성AI의 만남 : AI부문
 - 이 AI 경진대회에서는 5초 분량의 오디오 샘플에서 진짜 사람 목소리와 AI가 생성한 가짜 목소리를 정확하게 구분할 수 있는 모델을 개발하는 것이 목표입니다.
 - 이 작업은 보안, 사기 감지 및 오디오 처리 기술 향상 등 다양한 분야에서 매우 중요합니다.

In [None]:
try:
    import wespeaker
except ImportError:
    %pip install git+https://github.com/wenet-e2e/wespeaker.git

## Imports
모델 학습 및 추론에 사용할 라이브러리들을 불러옵니다.

In [None]:
import os
import copy
import random

import torch
import torchaudio

import numpy as np
import pandas as pd

from torch import nn
import torch.nn.functional as F
import torchaudio.transforms as T
import torchaudio.pipelines as pipelines
from torch.utils.data import Dataset, DataLoader

from huggingface_hub import hf_hub_download
import wespeaker

from tqdm.notebook import tqdm

### Check GPU Availability

In [None]:
!nvidia-smi

In [None]:
# Index of the CUDA device to use (0~7); overwritten with -1 on the CPU fallback
DEVICE_NUM = 0

if not torch.cuda.is_available():
    # No GPU visible: run everything on the CPU
    DEVICE_NUM = -1  # cpu
    device = torch.device("cpu")
else:
    # Make the chosen GPU the default for plain "cuda" tensors
    torch.cuda.set_device(DEVICE_NUM)
    device = torch.device("cuda")
print(f"INFO: Using device - {device}:{DEVICE_NUM}")

## Config
- 딥러닝 모델을 학습하기 전에 설정해야하는 다양한 매개변수를 정의하는 설정 클래스입니다.
- 클래스를 사용하여 학습에 필요한 설정 값을 미리 지정합니다.

##### 오디오 신호
- 우리가 듣는 소리는 공기의 압력 변화로, 이것을 디지털 신호로 변환한 것이 오디오 신호입니다.
- 이 신호는 시간에 따라 변하는 진폭 값을 가지고 있습니다.

In [None]:
class Config:
    """ Configuration Class - shared constants read by the rest of the notebook """
    SEED = 20240719  # fixed random seed for reproducibility
    NB_NAME = "transfer_learning"  # notebook name; used in cache, model and submission file paths
    ROOT_FOLDER = os.path.join(".", "data")  # dataset root directory

    BATCH_SIZE = 128  # batch size passed to the DataLoaders
    LR = 1e-5  # base learning rate passed to the Adam optimizers

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and every CUDA
    device) and switches cuDNN to deterministic kernel selection.

    :param seed: integer seed value
    """
    random.seed(seed)
    # Only inherited by child processes: the running interpreter fixed its hash seed at start-up
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # seeds all GPUs, not just the current one
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes and may pick different kernels between runs,
    # which defeats the deterministic flag above, so it must stay off.
    torch.backends.cudnn.benchmark = False

seed_everything(Config.SEED)  # Seed 고정

## Dataset

In [None]:
from torchvision.datasets import utils
from sklearn.model_selection import train_test_split as split

utils.tqdm = tqdm


class VoiceDataset(Dataset):
    """Dataset built from the competition CSV files.

    Every sample is ``(data0, data1, label)``, or ``(data0, data1)`` when the CSV
    carries no label information. ``data0`` and ``data1`` both start out as the
    value of the ``path`` column and are replaced in place by the callables
    passed to :meth:`transforms`. Labels are ``(fake, real)`` pairs.
    """
    download_url = "https://drive.usercontent.google.com/download?id=1hi1dibkHyFbaxAteLlZJw6r3g9ddd4Lf&export=download&authuser=0&confirm=t&uuid=c40c278b-d74b-4b75-bc79-09e8a3ccffa4&at=APZUnTUvIVFVM9gjGNUCmDb4YZCy%3A1719807236671"

    @classmethod
    def download(cls, root='./data', filename="download.zip", md5=None):
        """Download and extract the archive into *root* unless the archive file already exists.

        :param root: directory that receives the archive and its extracted content
        :param filename: file name of the archive inside *root*
        :param md5: optional checksum forwarded to the download helper
        """
        cls.download_root = root
        filepath = os.path.join(root, filename)
        if not os.path.exists(filepath):
            utils.download_and_extract_archive(cls.download_url, root, root, filename, md5)
            print("Extraction completed.")
        else:
            print(f"File already exists in {filepath}")

    @property
    def get_dataset_path(self):
        """Path of the CSV backing this dataset (``custom_csv`` overrides train/test)."""
        filename = "train.csv" if self.is_train else "test.csv"
        if self.custom_csv:
            filename = self.custom_csv + ".csv"
        return os.path.join(self.download_root, filename)

    @property
    def submission_form_path(self):
        """Path of ``sample_submission.csv`` inside the data directory."""
        return os.path.join(self.download_root, "sample_submission.csv")

    def __init__(self, root="./data", train=True, split_ratio=1, transform=None, custom_csv=None):
        """
        Voice Dataset for Contrastive Learning

        :param root: The path to the data directory
        :param train: is train or test
        :param split_ratio: split ratio for train(can be 0.5 or above) and valid(can be lower than 0.5) set
        :param transform: a callable or a list/tuple of callables applied in order, each called as
            ``t(data0, data1, label)`` and returning the three replaced values
        :param custom_csv: CSV base name (without ``.csv``) to load instead of train.csv / test.csv
        """
        super().__init__()
        self.download(root)
        self.download_root = root
        self.is_train = train
        self.custom_csv = custom_csv
        self.name = ("train" if train else "test") if not custom_csv else custom_csv

        # Always split with the larger fraction so that the train and valid
        # instances cut the CSV at the same place and simply pick opposite halves.
        raw_data = self._load_data(self.get_dataset_path, split_ratio if split_ratio >= 0.5 else 1-split_ratio)
        if not self.is_train or split_ratio >= 0.5:
            self.raw_data, _ = raw_data
        else:
            _, self.raw_data = raw_data
            if "train" not in self.name:
                print(f"Warning: The name of dataset should start with 'train' for training set. (current - {self.name})")
            self.name = self.name.replace("train", "valid")

        # Two independent copies of the paths: each one is later replaced by a different embedding
        self.data0 = self.raw_data['path'].tolist()
        self.data1 = self.raw_data['path'].tolist()

        if 'label' in self.raw_data.columns:
            # (fake, real) pair per row
            self.label = [(0, 1) if lb == 'real' else (1, 0) for lb in self.raw_data['label'].tolist()]
        elif 'real' in self.raw_data.columns and 'fake' in self.raw_data.columns:
            f_label = self.raw_data['fake'].tolist()
            r_label = self.raw_data['real'].tolist()
            self.label = list(zip(f_label, r_label))
        else:
            self.label = None

        self.transforms(transform)

    @staticmethod
    def _load_data(dataset_path, split_ratio=1):
        """Read the CSV and split it into two frames.

        :param dataset_path: path of the CSV file
        :param split_ratio: fraction of rows placed in the first frame; 1 or 0 skips
            the split and returns ``(df, None)`` or ``(None, df)``
        :return: tuple ``(first_part, second_part)``
        """
        random_state = 1  # fixed random_state

        df = pd.read_csv(dataset_path)

        if split_ratio == 1 or split_ratio == 0:
            return (df, None) if split_ratio == 1 else (None, df)

        if 'label' in df.columns:
            # The label column is split alongside the frame; the split label parts are not needed
            df1, df2, _, _ = split(df, df['label'], test_size=1-split_ratio, random_state=random_state)
        else:
            df1, df2 = split(df, test_size=1-split_ratio, random_state=random_state)
        return df1, df2

    def transforms(self, transform=None):
        """Apply one transform or a sequence of transforms to ``data0``, ``data1`` and ``label`` in place."""
        if transform is not None:
            if not isinstance(transform, (list, tuple)):
                transform = [transform]
            for t in transform:
                self.data0, self.data1, self.label = t(self.data0, self.data1, self.label)

    def __len__(self):
        return len(self.data0)

    def __getitem__(self, index):
        if self.label is not None:
            return self.data0[index], self.data1[index], self.label[index]
        return self.data0[index], self.data1[index]

In [None]:
split_ratio = 0.8  # fraction of train.csv used for training; the remainder becomes the validation set

# Both datasets read train.csv: a ratio >= 0.5 keeps the first part of the split,
# a ratio < 0.5 keeps the second part (and the dataset is renamed "valid").
train_dataset = VoiceDataset(root=Config.ROOT_FOLDER, train=True, split_ratio=split_ratio)
valid_dataset = VoiceDataset(root=Config.ROOT_FOLDER, train=True, split_ratio=1-split_ratio)
unlabeled_dataset = VoiceDataset(root=Config.ROOT_FOLDER, train=False, custom_csv="unlabeled_data")  # reads unlabeled_data.csv
test_dataset = VoiceDataset(root=Config.ROOT_FOLDER, train=False)  # reads test.csv

print(f"Loaded Dataset - train({len(train_dataset)}), valid({len(valid_dataset)}), unlabeled({len(unlabeled_dataset)}) test({len(test_dataset)})")
print("Query Dataset for checking:", train_dataset[0])
train_dataset.raw_data

#### Data Transformation
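Audio files are converted into embeddings with pretrained models:
[TorchAudio Models](https://pytorch.org/audio/stable/models.html) |
[TorchAudio Pretrained Models](https://pytorch.org/audio/stable/pipelines.html#module-torchaudio.pipelines) |
[WeSpeaker](https://github.com/wenet-e2e/wespeaker)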

In [None]:
class AudioPipelines:
    """ Audio Pipelines - Pretrained Embeddings

    Builds two path -> embedding callables, ``self.wav2vec`` and ``self.resnet``.
    Each callable carries a ``name`` attribute (sub-folder name for cached
    embeddings) and a ``cache`` attribute (cache root directory).
    """

    wav2vec_bundle = pipelines.WAV2VEC2_ASR_BASE_960H
    resnet_bundle = "Wespeaker/wespeaker-voxceleb-resnet152-LM"
    device_setting = (device, DEVICE_NUM)

    def __init__(self, audio_cache_dir="audio_cache", nb_name=Config.NB_NAME):
        """
        :param audio_cache_dir: root directory for cached embeddings
        :param nb_name: notebook name; embeddings are cached under ``audio_cache_dir/nb_name/<model name>``
        """
        self.audio_cache_dir = audio_cache_dir
        os.makedirs(os.path.join(audio_cache_dir, nb_name), exist_ok=True)
        self.wav2vec = self.get_wav2vec(audio_cache_dir=audio_cache_dir)
        self.resnet = self.get_resnet(audio_cache_dir=audio_cache_dir)
        for extractor in (self.wav2vec, self.resnet):
            os.makedirs(os.path.join(audio_cache_dir, nb_name, extractor.name), exist_ok=True)

    @classmethod
    def get_wav2vec(cls, audio_cache_dir="."):
        """Load the Wav2Vec2 bundle and return a ``path -> model output`` callable."""
        sr = cls.wav2vec_bundle.sample_rate  # Wav2Vec2 Model uses sample rate 16kHz
        wav2vec_model = cls.wav2vec_bundle.get_model()
        wav2vec_model.to(cls.device_setting[0])
        print(f"INFO: Wav2Vec Model Loaded on {cls.device_setting[0]}:{cls.device_setting[1]}. - Bundle: {cls.wav2vec_bundle}")
        wav2vec_model.eval()

        def wav2vec(path):
            waveform, sample_rate = torchaudio.load(path, normalize=True)
            if sample_rate != sr:
                resampler = T.Resample(sample_rate, sr)
                waveform = resampler(waveform)
            with torch.no_grad():
                # Call the loaded model; calling `wav2vec` here would recurse into this wrapper forever
                embedding, _ = wav2vec_model(waveform.to(cls.device_setting[0]))
            return embedding

        wav2vec.name = str(cls.wav2vec_bundle._path).split(".")[0]
        wav2vec.cache = audio_cache_dir
        return wav2vec

    @classmethod
    def get_resnet(cls, audio_cache_dir="."):
        """Download the WeSpeaker ResNet files and return a ``path -> embedding`` callable."""
        model_id = cls.resnet_bundle
        model_name = model_id.replace("Wespeaker/wespeaker-", "").replace("-", "_")

        # Downloading the .onnx file is only used to locate the local model directory
        root_dir = hf_hub_download(model_id, filename=model_name+".onnx").replace(model_name+".onnx", "")
        # Rename checkpoint and config to avg_model.pt / config.yaml
        # (presumably the file names wespeaker.load_model_local looks for - confirm)
        if not os.path.isfile(root_dir+"avg_model.pt"):
            os.rename(hf_hub_download(model_id, filename=model_name+".pt"), root_dir+"avg_model.pt")
        if not os.path.isfile(root_dir+"config.yaml"):
            os.rename(hf_hub_download(model_id, filename=model_name+".yaml"), root_dir+"config.yaml")

        resnet_model = wespeaker.load_model_local(root_dir)
        resnet_model.set_gpu(-1 if cls.device_setting[0] == torch.device('cpu') else cls.device_setting[1])
        print(f"INFO: ResNet Model Loaded on {resnet_model.device}")

        def resnet(path):
            return resnet_model.extract_embedding(path)

        resnet.name = model_name
        resnet.cache = audio_cache_dir
        return resnet

In [None]:
def to_embedding(dataset_name, pretrained, d_idx):
    """Build a dataset transform that swaps one column of paths for embeddings.

    The embeddings are computed once with ``pretrained`` and stored in
    ``<pretrained.cache>/<Config.NB_NAME>/<pretrained.name>/<dataset_name>.embedding``;
    when that file already exists it is loaded instead.

    :param dataset_name: name used for the cache file and the log messages
    :param pretrained: path -> embedding callable with ``name`` and ``cache`` attributes
    :param d_idx: index of the data column to replace
    :return: transform taking ``(*data_columns, labels)`` and returning the same layout
    """
    embedding_path = os.path.join(pretrained.cache, Config.NB_NAME, pretrained.name, f"{dataset_name}.embedding")

    def convert_path(path):
        # CSV paths look like "./train/x.ogg": rebase them onto the data root
        parts = path.replace("./", "").split("/")
        return os.path.join(Config.ROOT_FOLDER, *parts)

    def convert(*args):
        *columns, labels = args
        if os.path.isfile(embedding_path):
            embeddings = torch.load(embedding_path)
            print(f"INFO: Pretrained {pretrained.name} embedding for {dataset_name} dataset is loaded.)")
        else:
            progress = tqdm(columns[d_idx], desc=f"Convert {dataset_name} dataset with {pretrained.name}")
            embeddings = []
            for path in progress:
                embeddings.append(pretrained(convert_path(path)))
            torch.save(embeddings, embedding_path)
            print("INFO: Voice Embedding saved.")
        columns[d_idx] = embeddings
        return (*columns, labels)

    return convert

In [None]:
WV_DIM_SIZE = 1024

def flatten_tensor(adaptive_pool=nn.AdaptiveAvgPool1d(WV_DIM_SIZE), d_idx=1):
    """Build a dataset transform that flattens and pools one data column.

    :param adaptive_pool: pooling module applied to each flattened tensor
    :param d_idx: index of the data column to process
    :return: transform taking ``(*data_columns, labels)`` and returning the same layout
    """
    def flatten(*args):
        *columns, labels = args
        pooled = []
        for tensor in columns[d_idx]:
            # (any shape) -> (1, N) -> pooled (1, out) -> (out,)
            row = torch.flatten(tensor).unsqueeze(0)
            pooled.append(adaptive_pool(row).squeeze(0))
        columns[d_idx] = pooled
        return (*columns, labels)
    return flatten

In [None]:
# Transform that leaves the data columns untouched and converts every label (last argument) to a tensor
to_tensor = lambda *args: (*args[:-1], list(map(torch.tensor, args[-1])))  # label to tensor

apl = AudioPipelines()  # Create Audio Pipeline for converting audio to embeddings

# Labeled datasets: column 0 -> ResNet embedding, column 1 -> pooled Wav2Vec output, labels -> tensors
for dataset in [train_dataset, valid_dataset]:
    dataset.transforms(transform=[
        to_embedding(dataset.name, apl.resnet, d_idx=0),
        to_embedding(dataset.name, apl.wav2vec, d_idx=1),
        flatten_tensor(d_idx=1),
        to_tensor
    ])

# Same pipeline without the label conversion (to_tensor is not applied to these datasets)
for dataset in [unlabeled_dataset, test_dataset]:
    dataset.transforms(transform=[
        to_embedding(dataset.name, apl.resnet, d_idx=0),
        to_embedding(dataset.name, apl.wav2vec, d_idx=1),
        flatten_tensor(d_idx=1)
    ])

del apl  # release memory

In [None]:
# Preview the first five training samples. A sample is (data0, data1, label),
# so the embeddings are gathered into `data` and the trailing label is split off.
for (*data, label), i in zip(train_dataset, range(5)):
    print(f"Train Dataset {i}: {label}", data)

In [None]:
# Preview the first five validation samples. A sample is (data0, data1, label),
# so the embeddings are gathered into `data` and the trailing label is split off.
for (*data, label), i in zip(valid_dataset, range(5)):
    print(f"Valid Dataset {i}: {label}", data)

In [None]:
# Preview the first five unlabeled samples
for i, dataset in zip(range(5), unlabeled_dataset):
    print(f"UnLabeled Dataset {i}:", dataset)

In [None]:
# Preview the first five test samples
for i, dataset in zip(range(5), test_dataset):
    print(f"Test Dataset {i}:", dataset)

## DataLoader
    - DataLoader는 구축된 데이터셋에서 배치크기(batch_size)에 맞게 데이터를 추출하고, 필요에 따라 섞거나(shuffle=True) 순서대로 반환(shuffle=False)하는 역할을 합니다.
    - 훈련 데이터(train_loader)는 일반적으로 섞어서 모델이 데이터에 덜 편향되게 학습하도록하며,
      검증 데이터(val_loader)는 모델 성능 평가를 위해 순서대로 사용하고,
      테스트 데이터(test_loader)는 최종적인 추론을 위해 사용합니다.

    이렇게 DataLoader를 사용함으로써, 효율적인 데이터 처리와 모델 학습 및 평가가 가능해집니다.

In [None]:
BATCH_SIZE = Config.BATCH_SIZE

# Only the training loader shuffles; the others keep dataset order
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

## Define Model

Discriminator A: fake(0) ~ real(1)

Detector B: no voice(0) ~ voice(1)

---

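*power = lambda a: 1 - abs(0.5-a) / 0.5

(power(A) is 1 when A = 0.5 and falls to 0 as A approaches 0 or 1, so detector B gets more weight the less certain discriminator A is)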

---

| Case | Model Output | Label | Calculation |
|---|---|---|---|
| fake 1 real 1 | A: 0.5, B: 1.0 | 1, 1 | (1-A) * (1-power(A)) + B * power(A), A * (1-power(A)) + B * power(A) |
| fake 2 real 0 | A: 0.0, B: 1.0 | 1, 0 | (1-A) * (1-power(A)) + B * power(A), A * (1-power(A)) + B * power(A) |
| fake 0 real 2 | A: 1.0, B: 1.0 | 0, 1 | (1-A) * (1-power(A)) + B * power(A), A * (1-power(A)) + B * power(A) |
| fake 1 real 0 | A: 0.0, B: 1.0 | 1, 0 | (1-A) * (1-power(A)) + B * power(A), A * (1-power(A)) + B * power(A) |
| fake 0 real 1 | A: 1.0, B: 1.0 | 0, 1 | (1-A) * (1-power(A)) + B * power(A), A * (1-power(A)) + B * power(A) |
| fake 0 real 0 | A: 0.5, B: 0.0 | 0, 0 | (1-A) * (1-power(A)) + B * power(A), A * (1-power(A)) + B * power(A) |


In [None]:
class VAD(nn.Module):
    """Combine the real/fake score with the voice-presence score into (fake, real) outputs.

    ``power(a)`` is 1 when ``a == 0.5`` and 0 when ``a`` is 0 or 1; it is the
    weight given to the voice score, the remainder goes to the real/fake score.
    """

    @staticmethod
    def power(a):
        # Static so that `self.power(x)` does not pass `self` as the argument
        return 1 - abs(0.5-a) / 0.5

    def forward(self, is_real, is_voice):
        """
        :param is_real: real/fake score(s) in [0, 1] (0 = fake, 1 = real)
        :param is_voice: voice-presence score(s) in [0, 1], same shape as ``is_real``
        :return: tensor whose last dimension holds ``(fake, real)``; inputs shaped
            ``(..., 1)`` give ``(..., 2)``, any other shape gets a new last dimension of size 2
        """
        power = self.power(is_real)
        fake_score = (1-is_real) * (1-power) + is_voice * power
        real_score = is_real * (1-power) + is_voice * power
        # Join with cat/stack (not torch.tensor) so batches work and gradients keep flowing
        if fake_score.dim() > 0 and fake_score.shape[-1] == 1:
            return torch.cat([fake_score, real_score], dim=-1)
        return torch.stack([fake_score, real_score], dim=-1)

In [None]:
from sklearn.metrics import roc_auc_score

def multi_label_auc(y_true, y_scores):
    """Return the mean of the per-column ROC AUC scores.

    :param y_true: 2-D array of ground-truth labels, one column per label
    :param y_scores: 2-D array of predicted scores with the same column layout
    """
    per_label = [roc_auc_score(y_true[:, col], y_scores[:, col]) for col in range(y_true.shape[1])]
    return np.mean(per_label)

### 0. Test

In [None]:
class FeatureExtractor(nn.Module):
    """MLP encoder followed by a single-unit sigmoid head.

    :param embedding_size: width of the input embedding
    :param hidden_size: width of the first hidden layer (the second uses half of it)
    :param latent_size: width of the encoder output fed to the head
    """

    def __init__(self, embedding_size, hidden_size, latent_size):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(embedding_size, hidden_size),
            nn.Tanh(),
            nn.Dropout(0.5),
            nn.Linear(hidden_size, hidden_size//2),
            nn.Tanh(),
            nn.Dropout(0.5),
            nn.Linear(hidden_size//2, latent_size)
        )
        self.fc = nn.Linear(latent_size, 1)

    def forward(self, x):
        """Return one score in [0, 1] per input row, shape ``(batch, 1)``."""
        encoded = self.encoder(x)
        out = self.fc(encoded)  # the head is registered as `fc`; `classifier` does not exist
        return torch.sigmoid(out)

In [None]:
class FakeVoiceDetectionModel(nn.Module):
    """Two FeatureExtractor branches whose scores are merged by VAD.

    :param embedding_size: pair of input widths, ``[discriminator width, detector width]``
    :param hidden_size: hidden width shared by both branches
    :param latent_size: latent width shared by both branches
    """

    def __init__(self, embedding_size, hidden_size, latent_size):
        super().__init__()
        disc_width, det_width = embedding_size[0], embedding_size[1]
        self.discriminator = FeatureExtractor(disc_width, hidden_size, latent_size)
        self.detector = FeatureExtractor(det_width, hidden_size, latent_size)
        self.vad = VAD()

    def forward(self, *x):
        """Return ``(discriminator scores, detector scores, VAD-combined output)``."""
        disc_input, det_input = x[0], x[1]
        reals = self.discriminator(disc_input)
        voices = self.detector(det_input)
        combined = self.vad(reals, voices)
        return reals, voices, combined

In [None]:
# Model hyper-parameters
model_params = dict(
    # One input width per embedding in a sample: a sample is (data0, data1, label),
    # so everything except the trailing label is measured.
    embedding_size=[len(d) for d in train_dataset[0][:-1]],
    hidden_size=1024,
    latent_size=128
)
model_params

In [None]:
# Build the model and move it to the selected device
model = FakeVoiceDetectionModel(**model_params)
model.to(device)

In [None]:
# Binary cross-entropy loss
criterion = nn.BCELoss().to(device)

# Adam optimizer over all parameters of `model`
optimizer = torch.optim.Adam(params=model.parameters(), lr=Config.LR)

In [None]:
# Number of training epochs
num_epochs = 100
log_interval = 5  # the epoch summary line is kept (newline printed) every `log_interval` epochs

In [None]:
# Train/validate the two-branch test model; `last_val_score` ends up holding the last epoch's validation AUC
last_val_score = 0
train_len, valid_len = map(len, (train_loader, valid_loader))  # number of batches per epoch

epochs = tqdm(range(1, num_epochs+1), desc="Running Epochs")
with tqdm(total=train_len, desc="Training") as train_progress, tqdm(total=valid_len, desc="Validation") as valid_progress:
    for epoch in epochs:
        # Reuse the same two progress bars for every epoch
        train_progress.reset(total=train_len)
        valid_progress.reset(total=valid_len)

        # Train
        model.train()
        for i, inputs in enumerate(train_loader):
            optimizer.zero_grad()

            # A batch is (*embeddings, labels); everything is cast to float and moved to the device
            *features, labels = (data.float().to(device) for data in inputs)
            reals, voices, outputs = model(*features)
            # Column index of the larger label entry as a (batch, 1) float target: 1 when the "real" column wins
            real_labels = torch.transpose(torch.argmax(labels, dim=1).unsqueeze(0), 1, 0).float()  # TODO: handle the 0.5 case
            # 1 when any label entry is set, as a (batch, 1) float target
            voice_labels = torch.transpose((labels.sum(dim=1) > 0).unsqueeze(0), 1, 0).float()

            # Average of the three BCE terms: discriminator, detector and combined output
            loss = (criterion(reals, real_labels) + criterion(voices, voice_labels) + criterion(outputs, labels)) / 3
            loss.backward()
            optimizer.step()

            train_progress.update(1)
            print(f"\rEpoch [{epoch}/{num_epochs}], Step [{i+1}/{train_len}], Loss: {loss.item():.6f}", end="")

        val_loss, val_labels, val_outputs = 0, [], []

        # Validation
        model.eval()
        with torch.no_grad():
            for inputs in valid_loader:
                *features, labels = (data.float().to(device) for data in inputs)
                _, _, predicted = model(*features)

                # Accumulates the mean of the per-batch losses
                val_loss += criterion(predicted, labels).item() / valid_len
                val_labels.append(labels.cpu().numpy())
                val_outputs.append(predicted.cpu().numpy())

                valid_progress.update(1)

        # NOTE: this is a mean ROC AUC even though the log line below calls it "Valid Acc"
        last_val_score = multi_label_auc(np.concatenate(val_labels, axis=0), np.concatenate(val_outputs, axis=0))

        print(f"\rEpoch [{epoch}/{num_epochs}], Step [{train_len}/{train_len}], Loss: {loss.item():.6f}, "
            + f"Valid Acc: {last_val_score:.6%}, Valid Loss: {val_loss:.6f}", end="\n" if epoch % log_interval == 0 or epoch == num_epochs else "")

### 1. Encoder-Decoder Feature Extractor

In [None]:
class FeatureExtractor(nn.Module):
    """ Autoencoder-style feature extractor: embedding -> latent -> restored embedding """

    def __init__(self, embedding_size, latent_size):
        """
        :param embedding_size: width of the input (and of the restored output)
        :param latent_size: width of the bottleneck
        """
        super().__init__()
        half, quarter = embedding_size//2, embedding_size//4
        self.encoder = nn.Sequential(*self._stack(embedding_size, half, quarter, latent_size))
        self.decoder = nn.Sequential(*self._stack(latent_size, quarter, half, embedding_size))
        self.tanh = nn.Tanh()

    @staticmethod
    def _stack(*widths):
        """Linear layers for consecutive widths, with Tanh + Dropout(0.5) between them."""
        layers = []
        for n_in, n_out in zip(widths, widths[1:]):
            layers += [nn.Linear(n_in, n_out), nn.Tanh(), nn.Dropout(0.5)]
        return layers[:-2]  # the final Linear is not followed by Tanh/Dropout

    def forward(self, x):
        """Return ``(latent, restored)``; the latent is squashed with tanh before decoding."""
        latent = self.tanh(self.encoder(x))
        return latent, self.decoder(latent)

In [None]:
class RecurrentFeatureExtractor(FeatureExtractor):
    """ Recurrent-Feature Extractor Model

    Summarises a variable-length sequence with an RNN and feeds the result
    through the inherited encoder/decoder.
    """

    def __init__(self, embedding_size=1024, latent_size=32, embedding_dim=29):
        """
        :param embedding_size: RNN hidden size, which is also the encoder input width
        :param latent_size: width of the bottleneck
        :param embedding_dim: feature width of each sequence step
        """
        super().__init__(embedding_size, latent_size)
        self.resizer = nn.RNN(embedding_dim, embedding_size, batch_first=True)

    def forward(self, x):
        """
        :param x: pair ``(sequences, sequence_lengths)``; sequences are padded and batch-first
        :return: ``(latent, restored)``
        """
        # Local import: these helpers are not imported at the top of the notebook
        from torch.nn.utils.rnn import pack_padded_sequence

        sequences, sequence_lengths = x
        sequences_packed = pack_padded_sequence(sequences, sequence_lengths, batch_first=True, enforce_sorted=False)

        # Use the final hidden state: with packed input it is the output at each
        # sequence's own last step, whereas indexing the padded output at -1
        # returns zero padding for every sequence shorter than the longest one.
        _, hidden = self.resizer(sequences_packed)
        resized = torch.tanh(hidden[-1])

        # NOTE(review): unlike the parent class, the latent is not passed through tanh here - confirm intent
        latent = self.encoder(resized)
        restored = self.decoder(latent)
        return latent, restored

In [None]:
class Classifier(nn.Module):
    """Three-layer MLP with ReLU activations and a sigmoid output.

    :param embedding_size: input width
    :param hidden_size: width of the first hidden layer (the second uses half of it)
    :param output_size: number of sigmoid outputs
    """

    def __init__(self, embedding_size, hidden_size, output_size=2):
        super().__init__()
        half_hidden = hidden_size//2
        self.fc1 = nn.Linear(embedding_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, half_hidden)
        self.fc3 = nn.Linear(half_hidden, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return scores in [0, 1] with shape ``(batch, output_size)``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = self.relu(layer(hidden))
        return torch.sigmoid(self.fc3(hidden))


### 2. Real-Fake Voice Discriminator

In [None]:
class RFVoiceDiscriminator(nn.Module):
    """ Real-Fake Voice Discriminator Model

    Scores both the input embedding and its autoencoder reconstruction with
    the same classifier.
    """

    def __init__(self, embedding_size, latent_size):
        """
        :param embedding_size: width of the input embedding
        :param latent_size: autoencoder bottleneck width; the classifier hidden width is twice this
        """
        super().__init__()
        self.latent_size = latent_size

        self.feature = FeatureExtractor(embedding_size, latent_size)
        self.fc = Classifier(embedding_size, latent_size*2, 1)

    def forward(self, x):
        """Return ``(score of x, score of the reconstruction of x)``."""
        _, restored = self.feature(x)
        score_original = self.fc(x)
        score_restored = self.fc(restored)
        return score_original, score_restored

In [None]:
# NOTE(review): EMBEDDING_SIZE is not defined in the visible part of this notebook - confirm where it comes from
rf_disc = RFVoiceDiscriminator(embedding_size=EMBEDDING_SIZE, latent_size=32)
rf_disc.to(device)

In [None]:
class RFVoiceDiscriminatorConfig:
    # Training settings, losses, optimizer and data for the real/fake discriminator (rf_disc).
    # Everything here is evaluated once, when the class body runs.
    num_epochs = 50
    log_interval = 5
    
    # BinaryCrossEntropy
    criterion_b = nn.BCELoss().to(device)
    
    # Mean Squared Error
    criterion_m = nn.MSELoss().to(device)
    
    # Adam optimizer (one tenth of the base learning rate)
    optimizer = torch.optim.Adam(params=rf_disc.parameters(), lr=Config.LR/10)
    
    # Dataset: copies of the global datasets extended with the "noise type 1" samples
    # NOTE(review): VoiceDataset as defined in this notebook exposes data0/data1, not `.data`, and
    # train_noise_type1 / valid_noise_type1 are not defined in the visible part - confirm before running
    train_dataset = copy.deepcopy(train_dataset)
    train_dataset.data.extend(train_noise_type1.data)
    train_dataset.label.extend(train_noise_type1.label)
    
    valid_dataset = copy.deepcopy(valid_dataset)
    valid_dataset.data.extend(valid_noise_type1.data)
    valid_dataset.label.extend(valid_noise_type1.label)
    
    # DataLoader
    train_loader = DataLoader(train_dataset, batch_size=Config.BATCH_SIZE, shuffle=True)#, collate_fn=collate_fn)
    valid_loader = DataLoader(valid_dataset, batch_size=Config.BATCH_SIZE, shuffle=False)#, collate_fn=collate_fn)

In [None]:
# The config class is used directly (not instantiated); its attributes are class-level
rf_disc_config = RFVoiceDiscriminatorConfig

num_epochs = rf_disc_config.num_epochs
log_interval = rf_disc_config.log_interval
train_len, valid_len = map(len, (rf_disc_config.train_loader, rf_disc_config.valid_loader))  # batches per epoch
valid_dataset_len = len(rf_disc_config.valid_dataset)  # number of validation samples (accuracy denominator)

In [None]:
# Train/validate the real-fake discriminator (rf_disc)
epochs = tqdm(range(1, num_epochs+1), desc="Running Epochs")
with tqdm(total=train_len, desc="Training") as train_progress, tqdm(total=valid_len, desc="Validation") as valid_progress:
    optimizer = rf_disc_config.optimizer
    criterion_b = rf_disc_config.criterion_b
    criterion_m = rf_disc_config.criterion_m

    for epoch in epochs:
        # Reuse the same two progress bars for every epoch
        train_progress.reset(total=train_len)
        valid_progress.reset(total=valid_len)

        # Train
        rf_disc.train()
        for i, (features, labels) in enumerate(rf_disc_config.train_loader):
            optimizer.zero_grad()

            features = features.to(device)
            preds1, preds2 = rf_disc(features)
            # Column index of the larger label entry as a (batch, 1) float target
            labels = torch.transpose(torch.argmax(labels, dim=1).unsqueeze(0), 1, 0).float().to(device)

            # Both predictions should match the label and agree with each other
            loss = (criterion_b(preds1, labels) + criterion_b(preds2, labels) + criterion_b(preds1, preds2)) / 3
            loss.backward()
            optimizer.step()

            train_progress.update(1)
            print(f"\rEpoch [{epoch}/{num_epochs}], Step [{i+1}/{train_len}], Loss: {loss.item():.6f}", end="")

        val_acc1, val_acc2, val_loss = 0, 0, 0

        # Validation
        rf_disc.eval()
        with torch.no_grad():
            for features, labels in rf_disc_config.valid_loader:
                # Inputs must be on the model's device, exactly as in the training loop
                features = features.to(device)
                preds1, preds2 = rf_disc(features)
                labels = torch.transpose(torch.argmax(labels, dim=1).unsqueeze(0), 1, 0).float().to(device)

                val_loss += (criterion_b(preds1, labels).item() + criterion_b(preds2, labels).item()) / (2 * valid_len)
                # Correct predictions at threshold 0.5, normalised by the dataset size
                val_acc1 += ((preds1 >= 0.5).float() == labels).sum() / valid_dataset_len
                val_acc2 += ((preds2 >= 0.5).float() == labels).sum() / valid_dataset_len

                valid_progress.update(1)

        print(f"\rEpoch [{epoch}/{num_epochs}], Step [{train_len}/{train_len}], Loss: {loss.item():.6f}, Valid Acc1: {val_acc1:.6%}, "
            + f"Valid Acc2: {val_acc2:.6%}, Valid Loss: {val_loss:.6f}", end="\n" if epoch % log_interval == 0 or epoch == num_epochs else "")

### 3. Voice Presence Detector

In [None]:
class VoiceDetector(nn.Module):
    """ Voice presence detection module

    Runs the autoencoder and scores its latent code and its reconstruction
    with two separate classifiers.
    """

    def __init__(self, embedding_size, latent_size):
        """
        :param embedding_size: width of the input embedding
        :param latent_size: autoencoder bottleneck width
        """
        super().__init__()
        self.latent_size = latent_size

        self.feature = FeatureExtractor(embedding_size=embedding_size, latent_size=latent_size)
        self.fc1 = Classifier(latent_size, latent_size//2, 1)  # scores the latent code
        self.fc2 = Classifier(embedding_size, latent_size*2, 1)  # scores the reconstruction

    def forward(self, x):
        """Return ``(restored, score from latent, score from restored)``."""
        latent, restored = self.feature(x)
        latent_score = self.fc1(latent)
        restored_score = self.fc2(restored)
        return restored, latent_score, restored_score

In [None]:
# NOTE(review): EMBEDDING_SIZE is not defined in the visible part of this notebook - confirm where it comes from
detector = VoiceDetector(embedding_size=EMBEDDING_SIZE, latent_size=32)
detector.to(device)

In [None]:
class VoiceDetectorConfig:
    # Training settings, losses, optimizer and data for the voice presence detector.
    # Everything here is evaluated once, when the class body runs.
    num_epochs = 100
    log_interval = 10

    # BinaryCrossEntropy
    criterion_b = nn.BCELoss().to(device)

    # Mean Squared Error
    criterion_m = nn.MSELoss().to(device)

    # Adam optimizer - must be built over the detector's own parameters;
    # optimizing rf_disc here would leave the detector untrained
    optimizer = torch.optim.Adam(params=detector.parameters(), lr=Config.LR)

    # Dataset
    # NOTE(review): train_augmented / valid_augmented are not defined in the visible part of this notebook
    train_dataset = train_augmented
    self_train_dataset = unlabeled_dataset
    valid_dataset = valid_augmented

    # DataLoader
    train_loader = DataLoader(train_dataset, batch_size=Config.BATCH_SIZE, shuffle=False)
    valid_loader = DataLoader(valid_dataset, batch_size=Config.BATCH_SIZE, shuffle=False)

In [None]:
# The config class is used directly (not instantiated); its attributes are class-level
vdet_config = VoiceDetectorConfig

num_epochs = vdet_config.num_epochs
log_interval = vdet_config.log_interval
train_len, valid_len = map(len, (vdet_config.train_loader, vdet_config.valid_loader))  # batches per epoch
valid_dataset_len = len(vdet_config.valid_dataset)  # number of validation samples (accuracy denominator)

In [None]:
# Train/validate the voice presence detector
epochs = tqdm(range(1, num_epochs+1), desc="Running Epochs")
with tqdm(total=train_len, desc="Training") as train_progress, tqdm(total=valid_len, desc="Validation") as valid_progress:
    optimizer = vdet_config.optimizer
    criterion_b = vdet_config.criterion_b
    criterion_m = vdet_config.criterion_m

    for epoch in epochs:
        # Reuse the same two progress bars for every epoch
        train_progress.reset(total=train_len)
        valid_progress.reset(total=valid_len)

        # Train
        detector.train()
        for i, (features, _) in enumerate(vdet_config.train_loader):
            optimizer.zero_grad()

            features = features.to(device)
            restoreds, preds1, preds2 = detector(features)
            # Targets are derived from the features themselves (dataset labels are ignored):
            # 1 when the largest softmax probability of a row reaches 0.5, else 0
            labels = torch.softmax(features, dim=1)
            labels = torch.transpose((torch.max(labels, dim=1)[0] >= 0.5).unsqueeze(0), 1, 0).float()

            # The agreement term between the two heads carries four times the weight of each label term
            loss = (criterion_b(preds1, labels) + criterion_b(preds2, labels) + criterion_b(preds1, preds2)*4) / 6

            #loss = (criterion_m(features, restoreds)/20 + criterion_b(preds1, labels) + criterion_b(preds2, labels)) / 3
            loss.backward()
            optimizer.step()

            train_progress.update(1)
            print(f"\rEpoch [{epoch}/{num_epochs}], Step [{i+1}/{train_len}], Loss: {loss.item():.6f}", end="")

        val_acc1, val_acc2, val_loss = 0, 0, 0

        # Validation
        detector.eval()
        with torch.no_grad():
            for (features, labels) in vdet_config.valid_loader:
                # Inputs must be on the model's device, exactly as in the training loop
                features = features.to(device)
                _, preds1, preds2 = detector(features)
                # 1 when any label entry is set, as a (batch, 1) float target
                labels = torch.transpose((labels.to(device).sum(dim=1) > 0).unsqueeze(0), 1, 0).float().to(device)

                val_loss += (criterion_b(preds1, labels).item() + criterion_b(preds2, labels).item()) / (2 * valid_len)
                # Correct predictions at threshold 0.5, normalised by the dataset size
                val_acc1 += ((preds1 >= 0.5).float() == labels).sum() / valid_dataset_len
                val_acc2 += ((preds2 >= 0.5).float() == labels).sum() / valid_dataset_len

                valid_progress.update(1)

        print(f"\rEpoch [{epoch}/{num_epochs}], Step [{train_len}/{train_len}], Loss: {loss.item():.6f}, Valid Acc1: {val_acc1:.6%}, "
            + f"Valid Acc2: {val_acc2:.6%}, Valid Loss: {val_loss:.6f}", end="\n" if epoch % log_interval == 0 or epoch == num_epochs else "")

### 4. Total FakeVoiceDetectionModel

In [None]:
from sklearn.metrics import roc_auc_score

def multi_label_auc(y_true, y_scores):
    """Return the mean of the per-column ROC AUC scores.

    :param y_true: 2-D array of ground-truth labels, one column per label
    :param y_scores: 2-D array of predicted scores with the same column layout
    """
    per_label = [roc_auc_score(y_true[:, col], y_scores[:, col]) for col in range(y_true.shape[1])]
    return np.mean(per_label)

In [None]:
class FakeVoiceDetectionModel(nn.Module):
    """ Fake Voice Detection Model

    Reuses the encoders of the global ``rf_disc`` and ``detector`` models
    (shared modules, not copies) and classifies their concatenated latents
    into two probabilities.
    """

    def __init__(self, hidden_size):
        """
        :param hidden_size: width of the classifier's hidden layer
        """
        super().__init__()
        self.latent_size = rf_disc.latent_size + detector.latent_size
        self.hidden_size = hidden_size

        self.encoder1 = rf_disc.feature.encoder
        self.encoder2 = detector.feature.encoder
        self.classifier = nn.Sequential(
            nn.Linear(self.latent_size, self.hidden_size),
            nn.ReLU(),
            nn.Linear(self.hidden_size, 2)
        )

    def forward(self, x):
        """Return probabilities in [0, 1] with shape ``(batch, 2)``."""
        latent1, latent2 = self.encoder1(x), self.encoder2(x)
        # Tensors have no `.cat` method; concatenate along the feature dimension
        latent = torch.cat([latent1, latent2], dim=1)
        # The model is trained with BCELoss and read as probabilities at inference,
        # both of which need values in [0, 1] rather than raw logits
        return torch.sigmoid(self.classifier(latent))

In [None]:
# Build the combined model (this rebinds the global `model`) and move it to the selected device
model = FakeVoiceDetectionModel(hidden_size=256)
model.to(device)

In [None]:
class FakeVoiceDetectionModelConfig:
    # Training settings, loss, optimizer and data for the combined model.
    # Everything here is evaluated once, when the class body runs.
    num_epochs = 5
    log_interval = 1

    # BinaryCrossEntropy
    criterion = nn.BCELoss().to(device)

    # Adam optimizer - built over the combined model (which shares the two encoders);
    # optimizing rf_disc here would never update the classifier or the detector encoder
    optimizer = torch.optim.Adam(params=model.parameters(), lr=Config.LR)

    # Dataset: copies of the global datasets extended with noise and augmented samples.
    # Labels are extended together with the data so both lists stay the same length.
    # NOTE(review): VoiceDataset as defined in this notebook exposes data0/data1, not `.data`, and
    # the *_noise_type1 / *_augmented datasets are not defined in the visible part - confirm before running
    train_dataset = copy.deepcopy(train_dataset)
    train_dataset.data.extend(train_noise_type1.data)
    train_dataset.label.extend(train_noise_type1.label)
    train_dataset.data.extend(train_augmented.data)
    train_dataset.label.extend(train_augmented.label)
    self_train_dataset = unlabeled_dataset
    valid_dataset = copy.deepcopy(valid_dataset)
    valid_dataset.data.extend(valid_noise_type1.data)
    valid_dataset.label.extend(valid_noise_type1.label)
    valid_dataset.data.extend(valid_augmented.data)
    valid_dataset.label.extend(valid_augmented.label)

    # DataLoader
    train_loader = DataLoader(train_dataset, batch_size=Config.BATCH_SIZE, shuffle=True)
    # Not shuffled: pseudo-labels are matched to unlabeled batches by position
    self_train_loader = DataLoader(self_train_dataset, batch_size=1, shuffle=False)
    valid_loader = DataLoader(valid_dataset, batch_size=Config.BATCH_SIZE, shuffle=False)

model_config = FakeVoiceDetectionModelConfig

In [None]:
# Train the combined model: per epoch, (1) train on last epoch's pseudo-labels,
# (2) train on the labeled data, (3) re-label the unlabeled data, (4) validate.
last_val_score = 0
# Pseudo-labels from the previous epoch, one tensor per unlabeled batch. They are matched
# to batches by position, so the unlabeled loader must not shuffle (do not shuffle data).
self_labels = []
num_epochs = model_config.num_epochs
log_interval = model_config.log_interval
optimizer = model_config.optimizer  # use this model's optimizer, not one left over from an earlier cell
train_len, self_len, valid_len = map(len, (model_config.train_loader, model_config.self_train_loader, model_config.valid_loader))

epochs = tqdm(range(1, num_epochs+1), desc="Total Model Running Epochs")
with (
    tqdm(total=self_len, desc="Self-Supervised Learning") as self_progress,
    tqdm(total=train_len, desc="Supervised Learning") as train_progress,
    tqdm(total=valid_len, desc="Validation") as valid_progress
    ):
    for epoch in epochs:
        train_progress.reset(total=train_len)
        self_progress.reset(total=self_len)
        valid_progress.reset(total=valid_len)

        # Self-Supervised Learning (no-op in the first epoch: there are no pseudo-labels yet)
        model.train()
        for i, inputs in enumerate(zip(model_config.self_train_loader, self_labels)):
            optimizer.zero_grad()

            features, labels = (data.float().to(device) for data in inputs)
            outputs = model(features)

            loss = model_config.criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            self_progress.update(1)
            print(f"\rEpoch [{epoch}/{num_epochs}], Step [{i+1}/{self_len}], Loss: {loss.item():.6f}", end="")

        # Supervised Learning
        model.train()
        for i, inputs in enumerate(model_config.train_loader):
            optimizer.zero_grad()

            features, labels = (data.float().to(device) for data in inputs)
            outputs = model(features)

            loss = model_config.criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_progress.update(1)
            print(f"\rEpoch [{epoch}/{num_epochs}], Step [{i+1}/{train_len}], Loss: {loss.item():.6f}", end="")

        # Self-Supervised Labeling: predict every unlabeled batch and compare the
        # predictions with the previous pseudo-labels where those exist
        self_loss, _self_labels, self_outputs = 0, [], []
        model.eval()
        self_progress.reset(total=self_len)
        with torch.no_grad():
            for step, features in enumerate(model_config.self_train_loader):
                predicted = model(features.float().to(device))

                if step < len(self_labels):
                    labels = self_labels[step].float().to(device)
                    self_loss += model_config.criterion(predicted, labels).item() / self_len
                    _self_labels.append(labels.cpu().numpy())
                self_outputs.append(predicted.cpu())

                self_progress.update(1)

        # Agreement with the previous pseudo-labels; undefined (nan) in the first
        # epoch or when a label column holds a single class
        self_score = float("nan")
        if _self_labels:
            try:
                self_score = multi_label_auc(
                    np.concatenate(_self_labels, axis=0),
                    np.concatenate([out.numpy() for out in self_outputs], axis=0),
                )
            except ValueError:
                pass
        # Threshold the fresh predictions into next epoch's pseudo-labels
        self_labels = [(out >= 0.5).float() for out in self_outputs]

        print(f"\rEpoch [{epoch}/{num_epochs}], Loss: {loss.item():.6f}, Self Acc: {self_score:.6%}, Self Loss: {self_loss:.6f}", end="")

        # Validation
        model.eval()
        val_loss, val_labels, val_outputs = 0, [], []
        with torch.no_grad():
            for inputs in model_config.valid_loader:
                features, labels = (data.float().to(device) for data in inputs)
                predicted = model(features)

                val_loss += model_config.criterion(predicted, labels).item() / valid_len
                val_labels.append(labels.cpu().numpy())
                val_outputs.append(predicted.cpu().numpy())

                valid_progress.update(1)

        last_val_score = multi_label_auc(np.concatenate(val_labels, axis=0), np.concatenate(val_outputs, axis=0))  # Calculate AUC score

        print(f"\rEpoch [{epoch}/{num_epochs}], Loss: {loss.item():.6f}, Self Acc: {self_score:.6%}, Self Loss: {self_loss:.6f}, "
            + f"Valid Acc: {last_val_score:.6%}, Valid Loss: {val_loss:.6f}", end="\n" if epoch % log_interval == 0 or epoch == num_epochs else "")

### Model Save

In [None]:
# Create the output folder when it is missing
os.makedirs(os.path.join(".", "models"), exist_ok=True)

# Save the weights; the file name carries the notebook name and the last validation score
save_path = os.path.join(".", "models", f"{Config.NB_NAME}_acc_{last_val_score*100:.6f}.pt")
torch.save(model.state_dict(), save_path)
print(f"Model saved to {save_path}")

## Inference
테스트 데이터셋에 대한 추론은 다음 순서로 진행됩니다.

1. 모델 및 디바이스 설정
    - 모델을 주어진 device(GPU 또는 CPU)로 이동시키고, 평가모드로 전환합니다.
2. 예측 수행
    - 예측 결과를 저장한 빈 리스트를 초기화하고 test_loader에서 배치별로 데이터를 불러와 예측을 수행합니다.
    - 각 배치에 대해 임베딩 데이터를 device로 이동시킵니다.
    - 모델 예측 확률(probs)을 계산합니다.
    - 예측 확률을 predictions리스트에 추가합니다.

In [None]:
# Rebuild the test loader; shuffle=False keeps the dataset order
test_loader = DataLoader(test_dataset, batch_size=Config.BATCH_SIZE, shuffle=False)

In [None]:
# Collect the model outputs for every test batch as plain Python lists
predicted_labels = []

model.to(device)
model.eval()
with torch.no_grad():
    # NOTE(review): test_dataset samples are (data0, data1) pairs, so each batch is presumably a
    # pair of tensors rather than a single tensor with `.to` - confirm against the model's forward()
    for features in tqdm(test_loader):
        probs = model(features.to(device))
        probs = probs.cpu().detach().numpy()
        predicted_labels += probs.tolist()

### Submission
추론 결과를 제출 양식에 덮어 씌워 CSV 파일로 생성하는 과정은 다음과 같습니다.

1. 제출 양식 로드
    - pd.read_csv('./sample_submission.csv')를 사용하여 제출을 위한 샘플 형식 파일을 로드합니다.
    - 이 파일은 일반적으로 각 테스트 샘플에 대한 ID와 예측해야 하는 필드가 포함된 템플릿 형태를 가지고 있습니다.
2. 예측 결과 할당
    - submit.iloc[:, 1:] = predicted_labels 로, 추론 단계에서 모은 예측 결과(predicted_labels)를 샘플 제출 파일의 2번째 열부터 할당합니다.
3. 제출 파일 저장
    - 수정된 제출 파일을 submissions 폴더에 `{NB_NAME}_acc_{검증 점수}_submit.csv` 형식의 이름으로 저장합니다.
    - index=False는 파일 저장시 추가적인 index가 발생하지 않도록 설정하여, 제작한 제출 파일과 동일한 형태의 파일을 저장합니다.

In [None]:
# Load the sample submission form and overwrite every column after the first with the predictions
submit = pd.read_csv(test_dataset.submission_form_path)
submit.iloc[:, 1:] = predicted_labels
submit.head()

In [None]:
# Make sure the output folder exists
submission_dir = "submissions"
os.makedirs(submission_dir, exist_ok=True)

# The file name carries the notebook name and the last validation score
submit_file_path = os.path.join(".", submission_dir, f"{Config.NB_NAME}_acc_{last_val_score*100:.6f}_submit.csv")
submit.to_csv(submit_file_path, index=False)
print("File saved to", submit_file_path)