# Imports

In [1]:
import librosa

import numpy as np
import pandas as pd
import random

import torch
import torchmetrics
import os
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, TrainingArguments, Trainer

import tensorflow as tf



In [2]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Config

In [19]:
class Config:
    """Central collection of the tunable settings used across the notebook."""

    # Audio
    SR = 16000             # sampling rate requested from librosa.load
    N_MFCC = 40            # not referenced by the visible cells
    FIXED_LENGTH = 200000  # clips are truncated / zero-padded to this many samples

    # Dataset
    # The dataset location can be overridden with the FAKEVOICE_ROOT environment
    # variable so the notebook is not tied to one machine; the original
    # hard-coded path remains the default.
    ROOT_DIR = os.environ.get('FAKEVOICE_ROOT', 'C:/HongBeomsun/Dataset_SSD/FakeVoice')

    # Training
    N_CLASSES = 2
    BATCH_SIZE = 16
    N_EPOCHS = 50
    LEARNING_RATE = 1e-3

    # Others
    SEED = 42

CONFIG = Config()

In [4]:
def seed_everything(seed):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, which
    # defeats the determinism requested above, so it has to be disabled.
    torch.backends.cudnn.benchmark = False

In [5]:
# Fix all random number generators before any data is split or any model is built.
seed_everything(seed=CONFIG.SEED)

### Data

In [6]:
# Training metadata: one row per clip (id, relative audio path, label).
df = pd.read_csv(
    os.path.join(CONFIG.ROOT_DIR, 'train.csv')
)

In [7]:
# Number of rows, followed by a preview of the first five.
print(df.shape[0])
df.head(5)

55438


Unnamed: 0,id,path,label
0,RUNQPNJF,./train/RUNQPNJF.ogg,real
1,JFAWUOGJ,./train/JFAWUOGJ.ogg,fake
2,RDKEKEVX,./train/RDKEKEVX.ogg,real
3,QYHJDOFK,./train/QYHJDOFK.ogg,real
4,RSPQNHAO,./train/RSPQNHAO.ogg,real


In [8]:
# Class balance of the full training table.
df.loc[:, 'label'].value_counts()

label
fake    27818
real    27620
Name: count, dtype: int64

In [9]:
# Stratified 80/20 split of the metadata rows. The labels travel inside the
# frame, so only the frame itself needs to be split.
train, val = train_test_split(
    df,
    test_size=0.2,
    random_state=CONFIG.SEED,
    stratify=df['label'],
)

In [10]:
# Show the class balance of both splits. Only the last expression of a cell is
# rendered automatically, so the training counts are printed explicitly instead
# of being silently discarded.
print(train['label'].value_counts())
val['label'].value_counts()

label
fake    5564
real    5524
Name: count, dtype: int64

In [11]:
def normalize_volume(y, target_dB=-20):
    """Scale a waveform so that its RMS level equals ``target_dB``.

    Args:
        y: Array of audio samples.
        target_dB: Desired RMS level in decibels (20 * log10 of the RMS).

    Returns:
        The rescaled waveform. Empty or completely silent input is returned
        unchanged, because its level is undefined and scaling it would
        produce NaN samples.
    """
    if np.size(y) == 0:
        return y

    rms = np.sqrt(np.mean(y**2))
    # log10(0) is -inf, which would turn a silent clip into NaNs (0 * inf).
    if rms == 0 or not np.isfinite(rms):
        return y

    loudness = 20 * np.log10(rms)
    loudness_change_dB = target_dB - loudness
    return y * (10 ** (loudness_change_dB / 20))

In [12]:
def get_features(df, processor, train_mode=True):
    """Load every clip listed in ``df`` and convert it to model inputs.

    Each file is loaded at ``CONFIG.SR``, volume-normalised, truncated or
    zero-padded to ``CONFIG.FIXED_LENGTH`` samples and passed through
    ``processor``.

    Args:
        df: DataFrame with a ``path`` column (relative to ``CONFIG.ROOT_DIR``)
            and, when ``train_mode`` is true, a ``label`` column.
        processor: Callable invoked as
            ``processor(y, sampling_rate=..., return_tensors='pt', padding=True)``
            whose result exposes ``['input_values']``.
        train_mode: When true, one-hot labels are built and returned as well.

    Returns:
        ``(features, labels)`` in train mode, otherwise ``features``. Both are
        lists with one NumPy array per clip; labels are one-hot vectors with
        index 0 for ``'fake'`` and index 1 for any other label.

    Rows that fail are reported on stdout. In train mode they are skipped, and
    features and labels stay aligned because both are appended only after both
    were built. In inference mode a zero waveform is stored instead, so the
    returned list keeps one entry per row of ``df`` and predictions stay
    aligned with the submission rows.
    """
    features = []
    labels = []
    for index, row in tqdm(df.iterrows(), total=len(df)):
        try:
            y, sr = librosa.load(os.path.join(CONFIG.ROOT_DIR, row['path']), sr=CONFIG.SR)
            y = normalize_volume(y)

            # Force every clip to exactly FIXED_LENGTH samples.
            if len(y) > CONFIG.FIXED_LENGTH:
                y = y[:CONFIG.FIXED_LENGTH]
            elif len(y) < CONFIG.FIXED_LENGTH:
                y = np.pad(y, (0, CONFIG.FIXED_LENGTH - len(y)), mode='constant')

            encoded = processor(y, sampling_rate=sr, return_tensors='pt', padding=True)
            feature = encoded['input_values'].squeeze().numpy()

            if train_mode:
                label_vector = np.zeros(CONFIG.N_CLASSES, dtype=float)
                label_vector[0 if row['label'] == 'fake' else 1] = 1

        except Exception as e:
            print(f'Error while {index} : {e}')
            if train_mode:
                continue
            # Inference: keep a placeholder so row i still maps to prediction i.
            feature = np.zeros(CONFIG.FIXED_LENGTH, dtype=np.float32)

        features.append(feature)
        if train_mode:
            labels.append(label_vector)

    if train_mode:
        return features, labels
    return features

In [13]:
# Pre-trained wav2vec2 processor, used to turn raw waveforms into model inputs.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")

# Extract inputs and one-hot labels for both splits.
train_features, train_labels = get_features(df=train, processor=processor, train_mode=True)
val_features, val_labels = get_features(df=val, processor=processor, train_mode=True)

100%|██████████| 44350/44350 [07:21<00:00, 100.44it/s]
100%|██████████| 11088/11088 [02:01<00:00, 91.28it/s]


### Dataset

In [24]:
class CustomDataset(Dataset):
    """Map-style dataset over pre-computed features and optional labels.

    Args:
        features: Indexable collection with one feature array per sample.
        labels: Indexable collection with one label vector per sample, or
            ``None`` for unlabeled (inference) data.
    """

    def __init__(self, features, labels=None):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        """Return the sample as a dict of float tensors.

        The dict always holds ``'input_values'``; ``'labels'`` is included
        only when the dataset was built with labels, so an unlabeled dataset
        can be indexed without failing on ``None``.
        """
        item = {
            'input_values': torch.tensor(self.features[index], dtype=torch.float),
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[index], dtype=torch.float)
        return item

In [25]:
# Wrap the extracted arrays so they can be served by DataLoader / Trainer.
train_dataset = CustomDataset(features=train_features, labels=train_labels)
val_dataset = CustomDataset(features=val_features, labels=val_labels)

In [26]:
# Sanity check: number of samples in one prepared training clip.
train_dataset[5]['input_values'].shape[0]

200000

In [27]:
# Training batches are reshuffled every epoch; validation keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=False)

### Define Model

In [28]:
def collate_with_class_indices(batch):
    """Stack dataset samples and turn one-hot labels into class indices.

    ``CustomDataset`` yields one-hot float label vectors, while the sequence
    classification head computes a cross-entropy loss over integer class ids.
    Taking the argmax converts one representation into the other
    (0 = fake, 1 = real, matching the encoding used in ``get_features``).
    """
    return {
        'input_values': torch.stack([sample['input_values'] for sample in batch]),
        'labels': torch.stack([sample['labels'] for sample in batch]).argmax(dim=-1),
    }


model = Wav2Vec2ForSequenceClassification.from_pretrained("facebook/wav2vec2-base", num_labels=CONFIG.N_CLASSES)
model.to(device)

# NOTE(review): CONFIG.LEARNING_RATE (1e-3) looks high for fine-tuning a
# pre-trained transformer — confirm it is intended.
training_args = TrainingArguments(
    output_dir='./wav2vec_output/result',
    num_train_epochs=CONFIG.N_EPOCHS,
    per_device_train_batch_size=CONFIG.BATCH_SIZE,
    per_device_eval_batch_size=CONFIG.BATCH_SIZE,
    logging_dir='./wav2vec_output/logs',
    logging_steps=10,
    # Evaluate and checkpoint once per epoch: running the whole validation set
    # and writing a checkpoint every 10 optimisation steps would dominate the
    # run time and fill the disk.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=2,
    seed=CONFIG.SEED,
    learning_rate=CONFIG.LEARNING_RATE,
    # gradient_checkpointing=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_with_class_indices,
)

trainer.train()

RuntimeError: [enforce fail at ..\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 2359296 bytes.

### Train & Validation

In [None]:
from sklearn.metrics import roc_auc_score
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler

In [None]:
def train(model, scheduler, optimizer, train_loader, val_loader, device):
    """Train ``model`` for ``CONFIG.N_EPOCHS`` epochs and return the best weights.

    The loss is ``nn.BCELoss`` applied directly to the model output, so the
    model has to return probabilities in [0, 1] with the same shape as the
    labels.

    Args:
        model: Module called as ``model(features)``.
        scheduler: LR scheduler stepped once per epoch with the validation loss.
        optimizer: Optimizer that updates ``model``'s parameters.
        train_loader: Iterable of batches, either ``(features, labels)`` pairs
            or dicts with ``'input_values'`` and ``'labels'`` (the format
            produced by ``CustomDataset``).
        val_loader: Validation batches, forwarded to ``validation``.
        device: Device the model and the batches are moved to.

    Returns:
        ``model`` with the weights of the epoch that reached the highest
        validation AUC. If no epoch improved on the initial score of 0, the
        weights from the final epoch are kept.
    """
    import copy  # only needed for the best-weights snapshot below

    model.to(device)
    criterion = nn.BCELoss().to(device)

    best_val_score = 0
    best_state = None

    for epoch in range(1, CONFIG.N_EPOCHS+1):
        model.train()
        train_loss = []
        for batch in tqdm(iter(train_loader)):
            if isinstance(batch, dict):
                features, labels = batch['input_values'], batch['labels']
            else:
                features, labels = batch
            features = features.float().to(device)
            labels = labels.float().to(device)

            optimizer.zero_grad()

            output = model(features)
            loss = criterion(output, labels)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        _val_loss, _val_score = validation(model, criterion, val_loader, device)
        _train_loss = np.mean(train_loss)
        print(f'Epoch [{epoch}], Train Loss : [{_train_loss:.5f}] Val Loss : [{_val_loss:.5f}] Val AUC : [{_val_score:.5f}] LEARNING RATE : [{optimizer.param_groups[0]["lr"]:.5f}]')

        scheduler.step(_val_loss)

        if best_val_score < _val_score:
            best_val_score = _val_score
            # Keeping a reference to `model` would not preserve anything: the
            # same object keeps training. Copy the weights instead.
            best_state = copy.deepcopy(model.state_dict())

    if best_state is not None:
        model.load_state_dict(best_state)
    return model

In [None]:
def multiLabel_AUC(y_true, y_scores):
    """Return the mean of the per-column ROC AUC scores.

    Args:
        y_true: 2-D array of ground-truth indicators, one column per label.
        y_scores: 2-D array of predicted scores with the same layout.

    Returns:
        The average of ``roc_auc_score`` computed column by column.
    """
    n_labels = y_true.shape[1]
    per_label = [
        roc_auc_score(y_true[:, col], y_scores[:, col])
        for col in range(n_labels)
    ]
    return np.mean(per_label)

In [None]:
def validation(model, criterion, val_loader, device):
    """Evaluate ``model`` on the validation set.

    Args:
        model: Module called as ``model(features)``; its output is passed
            directly to ``criterion`` and to the AUC computation.
        criterion: Loss called as ``criterion(probs, labels)``.
        val_loader: Iterable of batches, either ``(features, labels)`` pairs
            or dicts with ``'input_values'`` and ``'labels'`` (the format
            produced by ``CustomDataset``).
        device: Device the batches are moved to.

    Returns:
        Tuple ``(mean validation loss, mean per-label AUC)``.
    """
    model.eval()
    val_loss, all_labels, all_probs = [], [], []

    with torch.no_grad():
        for batch in tqdm(iter(val_loader)):
            if isinstance(batch, dict):
                features, labels = batch['input_values'], batch['labels']
            else:
                features, labels = batch
            features = features.float().to(device)
            labels = labels.float().to(device)

            probs = model(features)

            loss = criterion(probs, labels)

            val_loss.append(loss.item())

            all_labels.append(labels.cpu().numpy())
            all_probs.append(probs.cpu().numpy())

    _val_loss = np.mean(val_loss)

    all_labels = np.concatenate(all_labels, axis=0)
    all_probs = np.concatenate(all_probs, axis=0)

    # Calculate AUC score
    auc_score = multiLabel_AUC(all_labels, all_probs)

    return _val_loss, auc_score

### Run

In [None]:
# Build the model, optimizer and LR scheduler, then run the custom training loop.
# NOTE(review): `CNNModel` is not defined anywhere in the visible part of this
# notebook — confirm where it comes from, otherwise this cell raises NameError.
# NOTE(review): `train` applies nn.BCELoss directly to the model output, so the
# model presumably has to return probabilities — verify against CNNModel.
model = CNNModel()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CONFIG.LEARNING_RATE)
# `train` steps this scheduler with the validation loss once per epoch; the LR
# is multiplied by 0.1 after 5 epochs without improvement.
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

# `infer_model` is what the inference cell further down uses.
infer_model = train(model, scheduler, optimizer, train_loader, val_loader, device)

### Inference

In [None]:
test = pd.read_csv(os.path.join(CONFIG.ROOT_DIR, 'test.csv'))
# get_features takes (df, processor, train_mode). The processor has to be
# passed explicitly and train_mode switched off by keyword: passing `False`
# positionally would be used as the processor while train_mode stays True.
test_mfcc = get_features(test, processor, train_mode=False)
test_dataset = CustomDataset(test_mfcc, None)
test_loader = DataLoader(
    test_dataset,
    batch_size=CONFIG.BATCH_SIZE,
    shuffle=False
)

In [None]:
# np.save(os.path.join(CONFIG.ROOT_DIR, 'npy/test_mfcc.npy'), test_mfcc)
# test_mfcc = np.load(os.path.join(CONFIG.ROOT_DIR, 'npy/test_mfcc.npy'))

In [None]:
def inference(model, test_loader, device):
    """Run ``model`` over every batch of ``test_loader`` and collect its outputs.

    Args:
        model: Module called as ``model(features)`` that returns a tensor.
        test_loader: Iterable of batches, either plain feature tensors or
            dicts with an ``'input_values'`` entry (the format produced by
            ``CustomDataset``).
        device: Device the model and the batches are moved to.

    Returns:
        A list with one entry per sample (the model output for that sample,
        converted to nested Python lists), in loader order.
    """
    model.to(device)
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in tqdm(iter(test_loader)):
            features = batch['input_values'] if isinstance(batch, dict) else batch
            features = features.float().to(device)

            probs = model(features)

            probs = probs.cpu().detach().numpy()
            predictions += probs.tolist()
    return predictions

In [None]:
# Score the test set with the model returned by the training loop.
preds = inference(model=infer_model, test_loader=test_loader, device=device)

## Submission

In [None]:
# Load the submission template and overwrite every column after the first
# with the predictions.
submit = pd.read_csv(
    os.path.join(CONFIG.ROOT_DIR, './sample_submission.csv')
)
submit.iloc[:, 1:] = preds
submit.head(5)

In [None]:
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('./output', exist_ok=True)
submit.to_csv('./output/submit_RawCNN.csv', index=False)