# SW중심대학 디지털 경진대회_SW와 생성AI의 만남 : AI부문
 - 이 AI 경진대회에서는 5초 분량의 오디오 샘플에서 진짜 사람 목소리와 AI가 생성한 가짜 목소리를 정확하게 구분할 수 있는 모델을 개발하는 것이 목표입니다.
 - 이 작업은 보안, 사기 감지 및 오디오 처리 기술 향상 등 다양한 분야에서 매우 중요합니다.

## Imports
모델 학습 및 추론에 사용할 라이브러리들을 불러옵니다.

In [1]:
import os
import random

import torch
import torchaudio

import librosa
import numpy as np
import pandas as pd

from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from tqdm.notebook import tqdm

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("Using device -", device)

Using device - cuda


## Config
- 딥러닝 모델을 학습하기 전에 설정해야하는 다양한 매개변수를 정의하는 설정 클래스입니다.
- 클래스를 사용하여 학습에 필요한 설정 값을 미리 지정합니다.

##### 오디오 신호
- 우리가 듣는 소리는 공기의 압력 변화로, 이것을 디지털 신호로 변환한 것이 오디오 신호입니다.
- 이 신호는 시간에 따라 변하는 진폭 값을 가지고 있습니다.

In [4]:
class CONFIG:
    """ Configuration Class """
    SEED = 202102545  # 재현성을 위해 랜덤 시드 고정
    
    """ SR(Sample Rate)
    - 오디오 데이터의 샘플링 레이트를 설정합니다.
    - 높은 샘플링 레이트는 더 높은 주파수의 소리를 캡처할 수 있지만, 처리에 더 많은 계산 자원이 필요합니다.
    - 오디오 데이터의 초당 샘플 수를 정의합니다.
    """
    SR = 32000

    """ ROOT_FOLDER
    - 데이터셋의 루트 폴더 경로를 설정합니다.
    """
    ROOT_FOLDER = os.path.join(".", "data")
    
    """ BATCH_SIZE
    - 학습 시 한 번에 처리할 데이터 샘플의 수를 정의합니다
    - 큰 배치 크기는 메모리 사용량을 증가시키지만, 학습 속도를 높입니다.
    """
    BATCH_SIZE = 100
    
    """ N_EPOCHS
    - 전체 데이터셋을 학습할 횟수를 정의합니다.
    - 에폭 수가 너무 적으면 과소적합이 발생할 수 있고, 너무 많으면 과적합이 발생할 수 있습니다.
    """
    N_EPOCHS = 200
    
    """ LR (Learning Rate)
    - 모델의 가중치를 업데이트할 때 사용되는 학습 속도를 정의합니다.
    - 학습률이 너무 크면 학습이 불안정해질 수 있고, 너무 작으면 학습 속도가 느려집니다.
    """
    LR = 1e-5

In [5]:
def seed_everything(seed):
    """ Fixed RandomSeed
    아래의 코드는 머신러닝이나 딥러닝 모델을 훈련할 때, 결과의 재현성을 보장하기 위해 사용되는 함수입니다.
    이 함수는 다양한 랜덤 시드를 고정하여, 실행할 때마다 동일한 결과를 얻기 위해 사용됩니다.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CONFIG.SEED)  # Seed 고정

## Dataset

In [6]:
from torchvision.datasets.utils import download_and_extract_archive
from sklearn.model_selection import train_test_split as split


class ContrastingVoiceDataset(Dataset):
    download_url = "https://drive.usercontent.google.com/download?id=1hi1dibkHyFbaxAteLlZJw6r3g9ddd4Lf&export=download&authuser=0&confirm=t&uuid=c40c278b-d74b-4b75-bc79-09e8a3ccffa4&at=APZUnTUvIVFVM9gjGNUCmDb4YZCy%3A1719807236671"
    
    @classmethod
    def download(cls, root='./data', filename="download.zip", md5=None):
        cls.download_root = root
        filepath = os.path.join(root, filename)
        if not os.path.exists(filepath):
            download_and_extract_archive(cls.download_url, root, root, filename, md5)
            print("Extraction completed.")
        else:
            print(f"File already exists in {filepath}")
    
    @classmethod
    def get_dataset_path(cls, root, train=True):
        return os.path.join(root, "train.csv" if train else "test.csv")

    @property
    def submission_form_path(cls):
        return os.path.join(cls.download_root, "sample_submission.csv")
    
    def __init__(self, root="./data", train=True, split_ratio=1, transform=None):
        """
        Voice Dataset for Contrastive Learning
        
        :param root: The path to the data directory
        :param train: is train or test
        :param split_ratio: split ratio for train(can be 0.5 or above) and valid(can be lower than 0.5) set
        :param transform: data transformer
        :param target_transform: label transformer
        """
        super().__init__()
        self.download(root)
        self.download_root = root
        self.is_train = train

        raw_data = self._load_data(self.get_dataset_path(root, train), split_ratio if split_ratio >= 0.5 else 1-split_ratio)
        if split_ratio >= 0.5:
            self.raw_data, _ = raw_data
        else:
            _, self.raw_data = raw_data
            
        self.multi_label = 'path' not in self.raw_data.columns
        
        self.ids = self.raw_data['id']
        if self.multi_label:
            self.data = [self.raw_data['path0'], self.raw_data['path1']]
            self.label = [self.raw_data['label0'], self.raw_data['label1']]
        else:
            self.data = self.raw_data['path']
            if 'label' in self.raw_data.columns:
                self.label = self.raw_data['label']
            else:
                self.label = None

        self.transforms(transform)
    
    @staticmethod
    def _load_data(dataset_path, split_ratio):
        df = pd.read_csv(dataset_path)
        
        if split_ratio == 1 or split_ratio == 0:
            return (df, None) if split_ratio == 1 else (None, df)
            
        df1, df2, _, _ = split(df, df['label'], test_size=1-split_ratio, random_state=202102545)
        return df1, df2
    
    @staticmethod
    def get_id_from_path(path: str):
        return [pth for pth in path.replace("/0.ogg", ".ogg").split("/") if '.ogg' in pth][0].replace(".ogg", "")

    def transforms(self, transform=None):
        if transform is not None:
            if not isinstance(transform, list) and not isinstance(transform, tuple):
                transform = [transform]
            for t in transform:
                self.ids, self.data, self.label = t(self.ids, self.data, self.label)

    def __len__(self):
        if self.multi_label:
            return len(self.data[0])
        return len(self.data)

    def __getitem__(self, index):
        if self.label is not None:
            if self.multi_label:
                return [d[index] for d in self.data], [l[index] for l in self.label], self.ids[index]
            return self.data[index], self.label[index], self.ids[index]
        return self.data[index], self.ids[index]

In [7]:
def to_list(multi_label=False):
    def to_list_inner(ids, datas, labels):
        ids = [_id for _id in ids]
        if not multi_label:
            datas, labels = [datas], [labels]
        datas = [[d for d in data] for data in datas]
        try:
            labels = [[l for l in label] for label in labels]
        except TypeError:
            pass
        if not multi_label:
            datas, labels = datas[0], labels[0]
        return ids, datas, labels
    return to_list_inner

In [8]:
train_dataset = ContrastingVoiceDataset(root=CONFIG.ROOT_FOLDER, train=True, split_ratio=0.8, transform=to_list())
valid_dataset = ContrastingVoiceDataset(root=CONFIG.ROOT_FOLDER, train=True, split_ratio=0.2, transform=to_list())
test_dataset = ContrastingVoiceDataset(root=CONFIG.ROOT_FOLDER, train=False, split_ratio=1, transform=to_list())

print("Query Dataset for checking:", train_dataset[0])
train_dataset.raw_data

File already exists in .\data\download.zip
File already exists in .\data\download.zip
File already exists in .\data\download.zip
Query Dataset for checking: ('./train/BIQYKAWL.ogg', 'fake', 'BIQYKAWL')


Unnamed: 0,id,path,label
23745,BIQYKAWL,./train/BIQYKAWL.ogg,fake
8355,WTCXWLEU,./train/WTCXWLEU.ogg,fake
34884,MRZQEWBF,./train/MRZQEWBF.ogg,real
14462,OHLGZHAF,./train/OHLGZHAF.ogg,fake
43295,FRZNSAKS,./train/FRZNSAKS.ogg,fake
...,...,...,...
7636,FFZRTCWE,./train/FFZRTCWE.ogg,fake
44556,CRTOENWR,./train/CRTOENWR.ogg,fake
19320,IHSKSRCJ,./train/IHSKSRCJ.ogg,real
15989,HGESQVRG,./train/HGESQVRG.ogg,fake


In [9]:
# apply seperated dataset
ContrastingVoiceDataset.get_dataset_path = lambda self, root, train=True: os.path.join(root, "train.csv" if train else "test_separated.csv")
test_dataset = ContrastingVoiceDataset(root=CONFIG.ROOT_FOLDER, train=False, split_ratio=1, transform=to_list(multi_label=True))

File already exists in .\data\download.zip


#### Data Transformation

In [10]:
from huggingface_hub import hf_hub_download
import wespeaker


def get_resnet152():
    model_id = "Wespeaker/wespeaker-voxceleb-resnet152-LM"
    model_name = model_id.replace("Wespeaker/wespeaker-", "").replace("-", "_")

    root_dir = hf_hub_download(model_id, filename=model_name+".onnx").replace(model_name+".onnx", "")

    import os
    if not os.path.isfile(root_dir+"avg_model.pt"):
        os.rename(hf_hub_download(model_id, filename=model_name+".pt"), root_dir+"avg_model.pt")
    if not os.path.isfile(root_dir+"config.yaml"):
        os.rename(hf_hub_download(model_id, filename=model_name+".yaml"), root_dir+"config.yaml")

    resnet = wespeaker.load_model_local(root_dir)
    resnet.set_gpu(-1 if device == torch.device('cpu') else 0)

    def resnet152(pcm, sample_rate=None):
        if isinstance(pcm, str):
            return resnet.extract_embedding(pcm)
        else:
            pass  # TODO: 메모리에 로드된 상태의 오디오 처리 코드 필요
            #return extract_embedding(resnet, pcm, sample_rate)

    print(f"ResNet152 Model Loaded on {resnet.device}")
    return resnet152

In [11]:
train_embedding_file = "train_embedding.pt"
valid_embedding_file = "valid_embedding.pt"
test_embedding_file = "test_embedding.pt"


def get_pretrained_embedding():
    if not os.path.isfile(train_embedding_file) \
            or not os.path.isfile(valid_embedding_file) \
            or not os.path.isfile(test_embedding_file):
        return get_resnet152()
    else:
        train_embedding = torch.load(train_embedding_file)
        valid_embedding = torch.load(valid_embedding_file)
        test_embedding = torch.load(test_embedding_file)
        dataset_list = {
            len(train_embedding): train_embedding,
            len(valid_embedding): valid_embedding,
            len(test_embedding[0]): test_embedding
        }
        print("INFO: Pretrained Voice Embedding loaded.", dataset_list.keys())
        
        def load_embedding(dataset):
            try:
                return dataset_list[len(dataset)]
            except KeyError:
                return dataset_list[len(dataset[0])]
        
        load_embedding.__dict__['pretrained'] = True
        return load_embedding

In [12]:
def to_embedding(pretrained=get_pretrained_embedding(), sample_rate=CONFIG.SR, multi_label=False):
    get_pth = lambda path: os.path.join(CONFIG.ROOT_FOLDER, *path[1:].split("/"))
    
    if not pretrained:
        def extract_embedding(ids, datas, labels):  # TODO: 임베딩 코드 추가 필요
            return ids, [torchaudio.load(data) for data in datas], labels
        return extract_embedding
    
    def pretrained_embedding(ids, dataset, labels):
        if pretrained.__dict__.get('pretrained'):
            new_dataset = pretrained(dataset)
            print("INFO: Voice Embedding extracted.")
        else:
            if multi_label:
                new_dataset = []
                
                for idx, dset in enumerate(tqdm(dataset)):
                    lst = []
                    new_dataset.append(lst)
                    for data in tqdm(dset):
                        lst.append(pretrained(get_pth(data), sample_rate))

                dataset_size = len(new_dataset[0])
            else:
                new_dataset = []

                for data in tqdm(dataset):
                    new_dataset.append(pretrained(get_pth(data), sample_rate))

                dataset_size = len(new_dataset)

            torch.save(new_dataset, "nonamed.pt")
            if dataset_size == len(train_dataset.raw_data):
                os.rename("nonamed.pt", "train_embedding.pt")
            elif dataset_size == len(valid_dataset.raw_data):
                os.rename("nonamed.pt", "valid_embedding.pt")
            elif dataset_size == len(test_dataset.raw_data):
                os.rename("nonamed.pt", "test_embedding.pt")
            else:
                raise ValueError(f"Invalid Dataset Size - Could not find relevant dataset sized {dataset_size}.")
                
            print("INFO: Voice Embedding saved.")
                
        return ids, new_dataset, labels  # [pretrained(get_pth(path), sample_rate) for path in dataset]
    
    return pretrained_embedding

INFO: Pretrained Voice Embedding loaded. dict_keys([44350, 11088, 50000])


In [13]:
def to_tensor_label(ids, datas, labels):
    if labels:
        labels = [torch.tensor([0]) if lb == "fake" else torch.tensor([1]) for lb in labels]
    return ids, datas, labels

In [14]:
train_dataset.transforms(transform=[to_embedding(), to_tensor_label])
valid_dataset.transforms(transform=[to_embedding(), to_tensor_label])
test_dataset.transforms(transform=to_embedding(multi_label=test_dataset.multi_label))

INFO: Voice Embedding extracted.
INFO: Voice Embedding extracted.
INFO: Voice Embedding extracted.


In [15]:
def to_embedding(pretrained=get_pretrained_embedding(), sample_rate=CONFIG.SR, multi_label=False):
    get_pth = lambda path: os.path.join(CONFIG.ROOT_FOLDER, *path[1:].split("/"))

    if not pretrained:
        def extract_embedding(ids, datas, labels):  # TODO: 임베딩 코드 추가 필요
            return ids, [torchaudio.load(data) for data in datas], labels
        return extract_embedding

    def pretrained_embedding(ids, dataset, labels):
        if pretrained.__dict__.get('pretrained'):
            new_dataset = pretrained(dataset)
            print("INFO: Voice Embedding extracted.")
        else:
            if multi_label:
                new_dataset = []

                for idx, dset in enumerate(tqdm(dataset)):
                    lst = torch.load(test_embedding_file)[idx]
                    new_dataset.append(lst)

                dataset_size = len(new_dataset[0])
            else:
                new_dataset = []

                for data in tqdm(dataset):
                    new_dataset.append(pretrained(get_pth(data), sample_rate))

                dataset_size = len(new_dataset)

            torch.save(new_dataset, "nonamed.pt")
            if dataset_size == len(train_dataset.raw_data):
                os.rename("nonamed.pt", "train_embedding.pt")
            elif dataset_size == len(valid_dataset.raw_data):
                os.rename("nonamed.pt", "valid_embedding.pt")
            elif dataset_size == len(test_dataset.raw_data):
                os.rename("nonamed.pt", "test_embedding.pt")
            else:
                raise ValueError(f"Invalid Dataset Size - Could not find relevant dataset sized {dataset_size}.")

            print("INFO: Voice Embedding saved.")

        return ids, new_dataset, labels  # [pretrained(get_pth(path), sample_rate) for path in dataset]

    return pretrained_embedding

INFO: Pretrained Voice Embedding loaded. dict_keys([44350, 11088, 50000])


In [16]:
for dataset, index in zip(train_dataset, range(5)):
    print(f"Dataset {index}: {'FAKE' if dataset[1] == torch.tensor([0]) else 'REAL'}", dataset[0])

Dataset 0: FAKE tensor([-1.8954e-01, -1.5126e-01, -1.9169e-01, -4.9767e-02, -2.0188e-02,
         1.5027e-01, -1.2862e-01, -8.1413e-02,  9.6770e-02, -4.1492e-02,
        -1.3344e-01,  2.2745e-01, -9.4607e-02,  2.3628e-02, -9.2205e-02,
        -5.7748e-02,  6.2188e-02,  1.4654e-01,  8.1550e-03, -2.6165e-01,
         5.2159e-02, -3.8855e-02, -1.5270e-01, -1.3968e-01, -1.9361e-01,
         3.3684e-01,  1.1788e-01, -1.7271e-02, -2.3159e-01,  1.0430e-01,
         1.0551e-01, -6.3770e-02,  2.0027e-01,  1.1449e-01, -1.3287e-01,
        -1.8658e-01,  1.6191e-01, -1.0753e-01, -7.6320e-02,  7.8052e-02,
         1.2526e-01, -1.3934e-02, -1.0400e-01, -1.3194e-01,  1.5096e-02,
        -2.8469e-02, -8.2221e-02, -9.4001e-02, -2.0316e-01,  1.8863e-01,
        -7.0268e-02, -1.8608e-01,  4.7674e-02, -9.2132e-02, -2.4339e-01,
        -2.8908e-03, -4.8591e-02, -2.5877e-01,  7.3877e-02, -4.3380e-02,
         3.3759e-02, -6.5845e-02, -2.2777e-02, -5.2064e-02,  2.6466e-01,
         8.9427e-02, -7.9455e-02,  

In [17]:
for dataset, index in zip(valid_dataset, range(5)):
    print(f"Dataset {index}: {'FAKE' if dataset[1] == torch.tensor([0]) else 'REAL'}", dataset[0])

Dataset 0: REAL tensor([ 0.0724, -0.1702,  0.0588, -0.0646, -0.0232,  0.0529,  0.0260, -0.0317,
         0.0384,  0.0115,  0.0973, -0.1264, -0.1521,  0.0985, -0.1118, -0.1622,
        -0.0021, -0.1185, -0.0803,  0.1039,  0.0132, -0.0267,  0.0188, -0.0915,
         0.0882, -0.0660, -0.0154, -0.0652, -0.0497,  0.2414, -0.0733, -0.0123,
        -0.1224, -0.0921,  0.1879, -0.0109, -0.0182,  0.0160,  0.1230,  0.0801,
         0.0036,  0.0912, -0.0067, -0.0827,  0.0433,  0.0309,  0.0245,  0.0412,
        -0.0136, -0.0418, -0.0161,  0.1403, -0.0064, -0.0755, -0.0824,  0.0289,
        -0.2233, -0.0769,  0.0059, -0.0435, -0.0988,  0.0930, -0.0204,  0.0090,
        -0.0963,  0.1296,  0.1252,  0.0526,  0.0266,  0.0555, -0.0423, -0.0080,
         0.0532, -0.0445, -0.1370,  0.0151, -0.0973,  0.0818, -0.0098, -0.0076,
        -0.1265,  0.2511, -0.0711,  0.0671,  0.0637,  0.0328,  0.0781, -0.0059,
         0.0129,  0.1937, -0.0829, -0.0067, -0.1427,  0.0659,  0.0203,  0.0583,
        -0.0811, -0.0144

In [18]:
for dataset, index in zip(test_dataset, range(1)):
    if test_dataset.multi_label:
        print(f"Dataset {index}-0:", dataset[1][0], dataset[0][0])
        print(f"Dataset {index}-1:", dataset[1][1], dataset[0][1])
    else:
        print(f"Dataset {index}:", dataset[0])

Dataset 0-0: speech tensor([-9.3006e-02,  2.0858e-02,  2.1069e-02, -1.1293e-02, -8.1300e-03,
        -1.6473e-01, -5.3140e-02, -9.4842e-02,  1.8247e-01, -4.9645e-03,
         9.4941e-02, -8.9201e-02, -1.7526e-02,  5.7322e-02, -2.0652e-02,
        -6.9793e-02, -1.5087e-01,  6.3277e-02,  2.8609e-02,  1.1608e-01,
        -1.0370e-01, -5.3388e-02,  1.3286e-01,  9.9311e-02,  6.6260e-02,
         4.8091e-02,  8.3858e-03,  9.4828e-02, -4.1591e-02, -1.2084e-02,
         1.3298e-01,  2.7751e-02,  3.9478e-03, -1.7945e-02, -7.7472e-02,
         8.5669e-02, -4.3482e-02, -4.9096e-02, -2.1974e-01, -3.1006e-02,
         1.7944e-03, -1.9233e-01, -3.6963e-02,  1.3835e-01,  3.0929e-02,
        -2.0693e-01, -6.7273e-02, -3.4093e-02,  1.7037e-02,  1.6843e-01,
         2.4608e-02,  1.3921e-02,  1.0036e-03,  8.1279e-02, -6.6403e-02,
         2.9936e-02, -1.0958e-01, -3.3157e-02,  3.7874e-02,  1.0300e-02,
         1.1396e-01,  1.1564e-02,  3.5110e-02, -4.3579e-03, -1.8605e-02,
         1.6384e-02, -4.0687e-0

In [19]:
# 라벨 분리

import copy

real_dataset = copy.deepcopy(train_dataset)
fake_dataset = copy.deepcopy(train_dataset)

def data_filter(target):
    def filter_data(ids, dataset, labels):
        filtered = [(_id, data, label) for _id, data, label in zip(ids, dataset, labels) if label == target]
        transposed = list(zip(*filtered))
        return transposed
    return filter_data

real_dataset.transforms(transform=data_filter(torch.tensor([1])))
fake_dataset.transforms(transform=data_filter(torch.tensor([0])))

In [20]:
for dataset, index in zip(real_dataset, range(5)):
    print(f"Dataset {index}: {'FAKE' if dataset[1] == torch.tensor([0]) else 'REAL'}", dataset[0])

Dataset 0: REAL tensor([ 0.0529,  0.1010,  0.1172, -0.0631,  0.0632, -0.1159, -0.0335, -0.0172,
         0.1637,  0.0391,  0.0796, -0.0766,  0.0501, -0.0188,  0.0757, -0.1223,
        -0.2077,  0.0897,  0.1898,  0.1035, -0.0897, -0.2672,  0.0721,  0.2452,
         0.2348,  0.2213, -0.0099,  0.1075,  0.0690, -0.2120,  0.0235,  0.0167,
        -0.1776, -0.0702, -0.0985,  0.2446, -0.0017, -0.0182, -0.2660,  0.0888,
         0.0833,  0.1668, -0.1144,  0.2026,  0.0562, -0.1339, -0.0277,  0.0940,
        -0.0397,  0.0998, -0.0096, -0.0504, -0.1266,  0.0529, -0.0065, -0.0421,
        -0.1613, -0.1036,  0.0413, -0.1035,  0.0032,  0.1689,  0.0485,  0.0773,
         0.0421,  0.0869,  0.1642, -0.2338, -0.0700, -0.0171,  0.0859,  0.1754,
        -0.0498, -0.0659, -0.2626, -0.3053, -0.1249,  0.3110, -0.2882,  0.1583,
        -0.1290,  0.1530,  0.1997,  0.0978, -0.0706, -0.3306, -0.1232,  0.1099,
        -0.0240,  0.2895, -0.1534,  0.0213,  0.0170, -0.0752,  0.2857, -0.1547,
        -0.3040,  0.0814

## DataLoader
    - DataLoader는 구축된 데이터셋에서 배치크기(batch_size)에 맞게 데이터를 추출하고, 필요에 따라 섞거나(shuffle=True) 순서대로 반환(shuffle=False)하는 역할을 합니다.
    - 훈련 데이터(train_loader)는 일반적으로 섞어서 모델이 데이터에 덜 편향되게 학습하도록하며,
      검증 데이터(val_loader)는 모델 성능 평가를 위해 순서대로 사용하고,
      테스트 데이터(test_loader)는 최종적인 추론을 위해 사용합니다.

    이렇게 DataLoader를 사용함으로써, 효율적인 데이터 처리와 모델 학습 및 평가가 가능해집니다.

In [39]:
BATCH_SIZE = CONFIG.BATCH_SIZE

real_loader = DataLoader(real_dataset, batch_size=BATCH_SIZE, shuffle=True)
augmt_loader = DataLoader(real_dataset, batch_size=BATCH_SIZE, shuffle=True)
fake_loader = DataLoader(fake_dataset, batch_size=BATCH_SIZE, shuffle=True)

valid_loader = DataLoader(valid_dataset, batch_size=1, shuffle=False)

test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

## Define Model

In [22]:
class VoiceEncoder(nn.Module):
    """ Voice Encoder Model """
    
    def __init__(self, embedding_dim, hidden_dim, output_size):
        super().__init__()
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        h1 = self.relu(self.fc1(x))
        out = self.fc2(h1)
        return F.sigmoid(out)

In [23]:
class ConstrastiveDistanceFunction(nn.Module):
    """ Contrastive Distance Function """
    
    def __init__(self, latent_size):
        super().__init__()
        self.fc1 = nn.Linear(latent_size, latent_size * 2)
        self.fc2 = nn.Linear(latent_size * 2, latent_size)
        self.out = nn.Linear(latent_size, 1)
        self.relu = nn.ReLU()

    def forward(self, anchor, comparative):
        combined = self.relu(self.fc1(anchor + comparative))
        output = self.relu(self.fc2(combined))
        return F.sigmoid(self.out(output))

In [24]:
class BinaryDiscriminator(nn.Module):
    """ Binary Discriminator Model using Contrastive Learning """
    
    def __init__(self, embedding_dim, hidden_size, latent_size):
        super().__init__()
        self.encoder = VoiceEncoder(embedding_dim, hidden_size, latent_size)
        self.real_distance = ConstrastiveDistanceFunction(latent_size)
        self.fake_distance = ConstrastiveDistanceFunction(latent_size)

    def forward(self, anchor, comparative):
        anchor = self.encoder(anchor)
        return [self.real_distance(anchor, self.encoder(comp)) for comp in comparative], [self.fake_distance(anchor, self.encoder(comp)) for comp in comparative]

In [25]:
# 모델 파라미터 지정
model_params = dict(
    embedding_dim=len(train_dataset[0][0]),
    hidden_size=512,
    latent_size=128
)
model_params

{'embedding_dim': 256, 'hidden_size': 512, 'latent_size': 128}

In [26]:
# 모델 생성
voice_discrimination_model = BinaryDiscriminator(**model_params)
discriminator = voice_discrimination_model
discriminator.to(device)

BinaryDiscriminator(
  (encoder): VoiceEncoder(
    (fc1): Linear(in_features=256, out_features=512, bias=True)
    (fc2): Linear(in_features=512, out_features=128, bias=True)
    (relu): ReLU()
  )
  (real_distance): ConstrastiveDistanceFunction(
    (fc1): Linear(in_features=128, out_features=256, bias=True)
    (fc2): Linear(in_features=256, out_features=128, bias=True)
    (out): Linear(in_features=128, out_features=1, bias=True)
    (relu): ReLU()
  )
  (fake_distance): ConstrastiveDistanceFunction(
    (fc1): Linear(in_features=128, out_features=256, bias=True)
    (fc2): Linear(in_features=256, out_features=128, bias=True)
    (out): Linear(in_features=128, out_features=1, bias=True)
    (relu): ReLU()
  )
)

In [27]:
# BinaryCrossEntropy
criterion = nn.BCELoss().to(device)

In [28]:
# Adam optimizer
optimizer = torch.optim.Adam(params=discriminator.parameters(), lr=CONFIG.LR)

## Train & Validation

In [59]:
epochs = CONFIG.N_EPOCHS
batch_size = CONFIG.BATCH_SIZE
train_amount = len(real_loader)
valid_amount = len(valid_loader)
anchor_sample = next(iter(real_loader))[0][0].to(device)

#best_val_score = 0
last_val_acc = 0

loader_refresh = (epochs // 4, epochs // 4 * 2, epochs // 4 * 3)

for epoch in tqdm(range(epochs)):
    # Train
    discriminator.train()
    train_loss = [0, 0]
    
    if epoch in loader_refresh:
        real_loader = DataLoader(real_dataset, batch_size=BATCH_SIZE, shuffle=True)
        augmt_loader = DataLoader(real_dataset, batch_size=BATCH_SIZE, shuffle=True)
        fake_loader = DataLoader(fake_dataset, batch_size=BATCH_SIZE, shuffle=True)
    
    for i, (real, augmt, fake) in enumerate(zip(real_loader, augmt_loader, fake_loader)):
        optimizer.zero_grad()
        
        anchor, *_ = real
        positive, *_ = augmt
        negative, *_ = fake
        
        if anchor.shape[0] != positive.shape[0] or anchor.shape[0] != negative.shape[0]:
            continue

        real, fake = discriminator(anchor.to(device), [positive.to(device), negative.to(device)])
        
        positive_loss = criterion(real[0], torch.ones(real[0].shape).to(device))
        negative_loss = criterion(real[1], torch.zeros(real[1].shape).to(device))
        real_loss = positive_loss + negative_loss
        
        positive_loss = criterion(real[0], torch.ones(real[0].shape).to(device))
        negative_loss = criterion(real[1], torch.zeros(real[1].shape).to(device))
        fake_loss = positive_loss + negative_loss
        
        loss = real_loss + fake_loss

        loss.backward()
        optimizer.step()

        train_loss[0] += real_loss.item()
        train_loss[1] += fake_loss.item()
        
        print(f"\rEpoch [{epoch+1}/{epochs}], Step: [{i+1}/{train_amount}], Train Loss: {train_loss[0]/(i+1):.5f} | {train_loss[1]/(i+1):.5f}", end="")

    # Validation
    discriminator.eval()
    valid_loss = 0
    valid_acc = 0

    if (epoch+1) % 10 == 0 or epoch+1 == epochs:
        with torch.no_grad():
            for feature, label, _ in valid_loader:
                feature, label = feature.to(device), label.to(device)
                
                real, fake = discriminator(anchor_sample, [feature])
                real, fake, label = real[0][0], fake[0][0], label[0]
                
                real_loss = criterion(real, label.float())
                fake_loss = criterion(fake, 1-label.float())
                    
                loss = real_loss + fake_loss
                
                valid_loss += loss.item()
                valid_acc += torch.eq(real*(1-fake) >= 0.5, label).sum().item() / batch_size
    
        print(f"\rEpoch [{epoch+1}/{epochs}], Step: [{train_amount}/{train_amount}], Train Loss: {train_loss[0]/train_amount:.5f} | {train_loss[1]/train_amount:.5f} => Valid Loss: {valid_loss/valid_amount:.5f}, Valid ACC: {valid_acc/valid_amount:.5%}", end="    \n")
        last_val_acc = f"{valid_acc/valid_amount:.5%}"[:-1]

  0%|          | 0/200 [00:00<?, ?it/s]

Epoch [10/200], Step: [221/221], Train Loss: 0.00813 | 0.00813 => Valid Loss: 0.70945, Valid ACC: 0.50108%    
Epoch [20/200], Step: [221/221], Train Loss: 0.00751 | 0.00751 => Valid Loss: 0.70663, Valid ACC: 0.50108%    
Epoch [30/200], Step: [221/221], Train Loss: 0.00523 | 0.00523 => Valid Loss: 0.70543, Valid ACC: 0.50108%    
Epoch [40/200], Step: [221/221], Train Loss: 0.00491 | 0.00491 => Valid Loss: 0.70727, Valid ACC: 0.50108%    
Epoch [50/200], Step: [221/221], Train Loss: 0.00338 | 0.00338 => Valid Loss: 0.70673, Valid ACC: 0.50108%    
Epoch [60/200], Step: [221/221], Train Loss: 0.00315 | 0.00315 => Valid Loss: 0.70665, Valid ACC: 0.50108%    
Epoch [70/200], Step: [221/221], Train Loss: 0.00207 | 0.00207 => Valid Loss: 0.70632, Valid ACC: 0.50108%    
Epoch [80/200], Step: [221/221], Train Loss: 0.00200 | 0.00200 => Valid Loss: 0.70522, Valid ACC: 0.50108%    
Epoch [90/200], Step: [221/221], Train Loss: 0.00155 | 0.00155 => Valid Loss: 0.70665, Valid ACC: 0.50108%    
E

In [None]:
# Model Save
save_path = os.path.join(".", "models", f"contrastive_model_acc_{last_val_acc}.pt")
torch.save(discriminator.state_dict(), save_path)
print(f"Model saved to {save_path}")

### Inference
테스트 데이터셋에 대한 추론은 다음 순서로 진행됩니다.

1. 모델 및 디바이스 설정
    - 모델을 주어진 device(GPU 또는 CPU)로 이동시키고, 평가모드로 전환합니다.
2. 예측 수행
    - 예측 결과를 저장한 빈 리스트를 초기화하고 test_loader에서 배치별로 데이터를 불러와 예측을 수행합니다.
    - 각 배치에 대해 스펙트로그램 데이터를 device로 이동시킵니다.
    - 모델 예측 확률(probs)을 계산합니다.
    - 예측 확률을 predictions리스트에 추가합니다.

In [None]:
# Set Model ID
model_id = "contrastive_model_acc_99.66667"

In [None]:
# Load Model
discriminator = BinaryDiscriminator(**model_params)
discriminator.load_state_dict(torch.load(os.path.join(".", "models", f"{model_id}.pt")))
discriminator.to(device)

In [None]:
from speechbrain.inference.VAD import VAD

def vad_filter(
        dataset, use_preset=True,
        filter_model=VAD.from_hparams(source="speechbrain/vad-crdnn-libriparty", savedir='./.cache/vad-crdnn-libriparty', run_opts={"device":"cuda"})
):
    def query_preset(_id: str, index: int, *args, **kwargs):
        return dataset.label[index][int(_id.replace("TEST_", ""))]
    
    def vad(_id: str, index: int, activation_th=0.4):
        file_path = os.path.join(".", "test_separated", _id, f"{index}", ".16hz.ogg")
        boundaries = filter_model.get_speech_segments(file_path.replace("ogg", "16hz.ogg"), activation_th=activation_th)
        label = "noise"
        last_end = 0
        for i in range(boundaries.shape[0]):
            begin_value = boundaries[i, 0]
            end_value = boundaries[i, 1]
            if last_end == begin_value:
                label = "speech"
            last_end = end_value
        return label
    
    return query_preset if use_preset else vad

In [None]:
predicted_labels = []
anchor_sample = next(iter(real_dataset))[0].to(device)
vad = vad_filter(test_dataset, use_preset=True)

discriminator.eval()
with torch.no_grad():
    for inputs in tqdm(test_loader):
        if test_dataset.multi_label:
            features, _, ids = inputs
            ids = ids[0]
        else:  # TODO: Non-Multi-Label Test
            raise NotImplemented("Test for Non-Multi-Label is not yet implemented.")
        
        filtered = []
        
        for idx, feature in enumerate(features):
            feature = feature[0].to(device)
            if vad(ids, idx) == "speech":
                filtered.append(feature)

        predicted = discriminator(anchor_sample, filtered)
        possibilities = [torch.tensor(1), torch.tensor(1)]  # positive, fake

        if len(predicted) == 0:
            positive, negative = 0, 0
        elif len(predicted) == 1:
            positive, negative = (1, 0) if predicted[0] >= 0.5 else (0, 1)
        else:
            is_positive0, is_positive1 = predicted[0] >= 0.5, predicted[1] >= 0.5
            
            if is_positive0 and is_positive1:
                positive, negative = 1, 0
            elif is_positive0 and not is_positive1:
                positive, negative = 1, 1-predicted[1][0]
            elif not is_positive0 and is_positive1:
                positive, negative = 1-predicted[0][0], 1
            else:
                positive, negative = 0, 1
        
        predicted_labels += [(torch.tensor(positive), torch.tensor(negative))]

In [None]:
predicted_labels

### Submission
추론 결과를 제출 양식에 덮어 씌워 CSV 파일로 생성하는 과정은 다음과 같습니다.

1. 제출 양식 로드
    - pd.read_csv('./sample_submission.csv')를 사용하여 제출을 위한 샘플 형식 파일을 로드합니다.
    - 이 파일은 일반적으로 각 테스트 샘플에 대한 ID와 예측해야 하는 필드가 포함된 템플릿 형태를 가지고 있습니다.
2. 예측 결과 할당
    - submit.iloc[:,1:] = preds 추론함수(inference)에서 반환된 예측결과(preds)를 샘플 제출 파일에 2번째 열부터 할당합니다.
3. 제출 파일 저장
    - 수정된 제출 파일을 baseline_submit 이란 이름의 CSV 파일로 저장합니다.
    - index=False는 파일 저장시 추가적인 index가 발생하지 않도록 설정하여, 제작한 제출 파일과 동일한 형태의 파일을 저장합니다.

In [None]:
submit = pd.read_csv(test_dataset.submission_form_path)
submit.iloc[:, 1:] = torch.tensor(predicted_labels).cpu().numpy()
submit.head()

In [None]:
submit.to_csv(f"{model_id}_submit.csv", index=False)