# Imports

In [1]:
import librosa

import numpy as np
import pandas as pd
import random

import torch
import torchmetrics
import os
import torch.nn.functional as F

import xgboost
from xgboost import XGBClassifier

from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

### MLFlow

In [3]:
import mlflow
import mlflow.pytorch

mlflow.set_experiment('FakeVoice')

def mlflow_run_decorator(run_name=None):
    def decorator(func):
        def wrapper(*args, **kwargs):
            mlflow.start_run(run_name=run_name)
            try:
                result = func(*args, **kwargs)
                mlflow.set_tag("Status", "SUCCEESS")
            except Exception as e:
                mlflow.log_param("Exception", e)
                mlflow.set_tag("Status", "FAIL")
                raise e
            finally:
                mlflow.end_run()
            return result
        return wrapper
    return decorator

### Config

In [4]:
class Config:
    SR = 32000
    N_MFCC = 40
    
    # Dataset
    ROOT_DIR = 'C:/HongBeomsun/Dataset_SSD/FakeVoice'
    
    # Training
    N_CLASSES = 2
    BATCH_SIZE = 64
    N_EPOCHS = 70
    LEARNING_RATE = 0.1
    
    # Others
    SEED = 42
    
CONFIG = Config()

In [5]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

In [6]:
seed_everything(CONFIG.SEED)

### Data

In [7]:
df = pd.read_csv(os.path.join(CONFIG.ROOT_DIR,'train.csv'))

In [8]:
print(len(df))
df.head()

55438


Unnamed: 0,id,path,label
0,RUNQPNJF,./train/RUNQPNJF.ogg,real
1,JFAWUOGJ,./train/JFAWUOGJ.ogg,fake
2,RDKEKEVX,./train/RDKEKEVX.ogg,real
3,QYHJDOFK,./train/QYHJDOFK.ogg,real
4,RSPQNHAO,./train/RSPQNHAO.ogg,real


In [9]:
df['label'].value_counts()

label
fake    27818
real    27620
Name: count, dtype: int64

### Train test split

In [10]:
train, val, _, _ = train_test_split(df, df['label'], test_size=0.2, random_state=CONFIG.SEED, stratify=df['label'])

In [11]:
train['label'].value_counts()
val['label'].value_counts()

label
fake    5564
real    5524
Name: count, dtype: int64

### Feature Extraction

In [12]:
def noise(data):
    noise_amp = 0.01*np.random.uniform()*np.amax(data)
    data = data + noise_amp * np.random.normal(size=data.shape[0])
    return data

def stretch(data, rate=0.8):
    stretch_data = librosa.effects.time_stretch(data, rate=rate)
    return stretch_data

def pitch(data, sampling_rate, pitch_factor=0.7):
    pitch_data = librosa.effects.pitch_shift(y=data, sr=sampling_rate, n_steps=pitch_factor)
    return pitch_data

In [13]:
def normalize_volume(y, target_dB=-20):
    rms = np.sqrt(np.mean(y**2))
    loudness = 20 * np.log10(rms)
    loudness_change_dB = target_dB - loudness
    y_normalized = y * (10 ** (loudness_change_dB / 20))
    return y_normalized

def load_audio(file_path, sr):
    y, sr = librosa.load(file_path, sr=sr)
    y = normalize_volume(y)
    return y, sr

def extract_features(y, sr):
    mfcc = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CONFIG.N_MFCC).T, axis=0)
    return mfcc

def augment_data(y, sr):
    augmented_data = []
    augmented_data.append(noise(y))
    augmented_data.append(stretch(y))
    augmented_data.append(pitch(y, sr))
    return augmented_data

def create_void_data(data, sr):
    void_data = np.zeros_like(data)
    void_data = noise(void_data)
    return void_data

def create_duo_data(data1, data2, sr):
    if len(data1) > len(data2):
        data2 = np.pad(data2, (0, len(data1)-len(data2)), 'constant')
    else:
        data1 = np.pad(data1, (0, len(data2)-len(data1)), 'constant')
        
    duo_data = data1 + data2
    max_val = np.max(np.abs(duo_data))
    if max_val > 1:
        duo_data = duo_data / max_val
    
    return duo_data

def mix_two_random_data(df, sr):
    idx1, idx2 = random.sample(range(len(df)), 2)
    y1, _ = load_audio(os.path.join(CONFIG.ROOT_DIR, df.iloc[idx1]['path']), sr)
    y2, _ = load_audio(os.path.join(CONFIG.ROOT_DIR, df.iloc[idx2]['path']), sr)
    y_duo = create_duo_data(y1, y2, sr)
    label_y1 = df.iloc[idx1]['label']
    label_y2 = df.iloc[idx2]['label']
    
    label_vector = np.zeros(CONFIG.N_CLASSES, dtype=float)
    label_vector[0 if label_y1 == 'fake' else 1] = 1
    label_vector[0 if label_y2 == 'fake' else 1] = 1
    
    return y_duo, label_vector

In [14]:
def get_features(df, train_mode=True, augment=False):
    features = []
    labels = []
    total = len(df)
    
    for i, (index, row) in enumerate(tqdm(df.iterrows(), total=total), 1):
        y, sr = load_audio(os.path.join(CONFIG.ROOT_DIR, row['path']), CONFIG.SR)
        
        if train_mode:
            label = row['label']
            label_vector = np.zeros(CONFIG.N_CLASSES, dtype=float)
            label_vector[0 if label == 'fake' else 1] = 1
            labels.append(label_vector)
        
        features.append(extract_features(y, sr))
    
    if augment:
        augmented_features = []
        augmented_labels = []
        
        num_mixed_samples = int(total * 0.1)
        for _ in range(num_mixed_samples):
            try:
                y_duo, y_duo_label = mix_two_random_data(df, CONFIG.SR)
                augmented_features.append(extract_features(y_duo, CONFIG.SR))
                augmented_labels.append(y_duo_label)
            except Exception as e:
                print(f'Error during data augmentation: {e}')
                continue
        
        num_augmented_samples = int(total * 0.2)
        original_features = list(features)
        original_labels = list(labels)
        for idx in range(num_augmented_samples):
            try:
                augmented_data = augment_data(original_features[idx], CONFIG.SR)
                for aug_y in augmented_data:
                    augmented_features.append(extract_features(aug_y, CONFIG.SR))
                    augmented_labels.append(original_labels[idx])
            except Exception as e:
                print(f'Error during augmentation: {e}')
                continue
        
        features.extend(augmented_features)
        labels.extend(augmented_labels)
    
    if train_mode:
        return np.array(features), np.array(labels)
    return np.array(features)

In [15]:
# train_features, train_labels = get_features(train, train_mode=True, augment=True)
# val_features, val_labels = get_features(val, train_mode=True, augment=False)

In [16]:
def save_np():
    os.makedirs(os.path.join(CONFIG.ROOT_DIR, 'npy'), exist_ok=True)
    np.save(os.path.join(CONFIG.ROOT_DIR, 'npy/train_features_VariousFeatures_1000.npy'), train_features)
    np.save(os.path.join(CONFIG.ROOT_DIR, 'npy/train_labels_VariousFeatures_1000.npy'), train_labels)
    np.save(os.path.join(CONFIG.ROOT_DIR, 'npy/val_features_VariousFeatures_1000.npy'), val_features)
    np.save(os.path.join(CONFIG.ROOT_DIR, 'npy/val_labels_VariousFeatures_1000.npy'), val_labels)

In [17]:
def load_np():
    train_features = np.load(os.path.join(CONFIG.ROOT_DIR, 'npy/train_features_VariousFeatures_1000.npy'))
    train_labels = np.load(os.path.join(CONFIG.ROOT_DIR, 'npy/train_labels_VariousFeatures_1000.npy'))
    val_features = np.load(os.path.join(CONFIG.ROOT_DIR, 'npy/val_features_VariousFeatures_1000.npy'))
    val_labels = np.load(os.path.join(CONFIG.ROOT_DIR, 'npy/val_labels_VariousFeatures_1000.npy'))
    
    return train_features, train_labels, val_features, val_labels

In [18]:
# save_np()
train_features, train_labels, val_features, val_labels = load_np()

In [19]:
train_features = np.array(train_features)
train_labels = np.array(train_labels)
val_features = np.array(val_features)
val_labels = np.array(val_labels)

In [20]:
print(train_features.shape, len(train_labels))
print(val_features.shape, len(val_labels))

(197400, 181) 197400
(11088, 181) 11088


### Scaling

In [21]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [22]:
train_features = scaler.fit_transform(train_features)
val_features = scaler.transform(val_features)

### Dataset

In [23]:
train_data = xgboost.DMatrix(train_features, label=train_labels)
val_data = xgboost.DMatrix(val_features, label=val_labels)

### Define Model

In [39]:
@mlflow_run_decorator(run_name="XGBoost_Augment")
def train_xgboost(train_features, train_labels, val_features, val_labels):
    models = []
    for i in range(CONFIG.N_CLASSES):
        print(f"Training model for label {i}")
        model = XGBClassifier(
            random_state=CONFIG.SEED,
            tree_method='gpu_hist',
            gpu_id=0,
            n_jobs=-1,
            objective='binary:logistic',
            n_estimators=CONFIG.N_EPOCHS,
            learning_rate=CONFIG.LEARNING_RATE,
            eval_metric='logloss'
        )
        
        eval_set = [(train_features, train_labels[:, i]), (val_features, val_labels[:, i])]
        model.fit(
            train_features,
            train_labels[:, i],
            eval_set=eval_set,
            verbose=True
        )
        models.append(model)
    return models

In [40]:
models = train_xgboost(train_features, train_labels, val_features, val_labels)

Training model for label 0



    E.g. tree_method = "hist", device = "cuda"



[0]	validation_0-logloss:0.65482	validation_1-logloss:0.64786
[1]	validation_0-logloss:0.62309	validation_1-logloss:0.60518
[2]	validation_0-logloss:0.59459	validation_1-logloss:0.56799
[3]	validation_0-logloss:0.56877	validation_1-logloss:0.53255
[4]	validation_0-logloss:0.54605	validation_1-logloss:0.49834
[5]	validation_0-logloss:0.52546	validation_1-logloss:0.47109
[6]	validation_0-logloss:0.50805	validation_1-logloss:0.45234
[7]	validation_0-logloss:0.49190	validation_1-logloss:0.43161
[8]	validation_0-logloss:0.47657	validation_1-logloss:0.40872
[9]	validation_0-logloss:0.46272	validation_1-logloss:0.38886
[10]	validation_0-logloss:0.45122	validation_1-logloss:0.37443
[11]	validation_0-logloss:0.43750	validation_1-logloss:0.36145
[12]	validation_0-logloss:0.42711	validation_1-logloss:0.34780
[13]	validation_0-logloss:0.41557	validation_1-logloss:0.33684
[14]	validation_0-logloss:0.40508	validation_1-logloss:0.32470
[15]	validation_0-logloss:0.39639	validation_1-logloss:0.31554
[1


    E.g. tree_method = "hist", device = "cuda"



[4]	validation_0-logloss:0.53096	validation_1-logloss:0.48822
[5]	validation_0-logloss:0.50578	validation_1-logloss:0.46137
[6]	validation_0-logloss:0.48631	validation_1-logloss:0.43769
[7]	validation_0-logloss:0.46480	validation_1-logloss:0.41859
[8]	validation_0-logloss:0.44917	validation_1-logloss:0.39826
[9]	validation_0-logloss:0.43292	validation_1-logloss:0.38116
[10]	validation_0-logloss:0.41951	validation_1-logloss:0.36534
[11]	validation_0-logloss:0.40479	validation_1-logloss:0.35319
[12]	validation_0-logloss:0.39296	validation_1-logloss:0.33742
[13]	validation_0-logloss:0.38138	validation_1-logloss:0.32553
[14]	validation_0-logloss:0.37049	validation_1-logloss:0.30884
[15]	validation_0-logloss:0.36097	validation_1-logloss:0.30238
[16]	validation_0-logloss:0.35185	validation_1-logloss:0.29755
[17]	validation_0-logloss:0.34324	validation_1-logloss:0.28151
[18]	validation_0-logloss:0.33383	validation_1-logloss:0.27191
[19]	validation_0-logloss:0.32575	validation_1-logloss:0.2612

### Inference

In [42]:
test = pd.read_csv(os.path.join(CONFIG.ROOT_DIR, 'test.csv'))
test.head()

Unnamed: 0,id,path
0,TEST_00000,./test/TEST_00000.ogg
1,TEST_00001,./test/TEST_00001.ogg
2,TEST_00002,./test/TEST_00002.ogg
3,TEST_00003,./test/TEST_00003.ogg
4,TEST_00004,./test/TEST_00004.ogg


In [43]:
# test_mfcc = get_features(test, train_mode=False, augment=False)

In [44]:
# np.save(os.path.join(CONFIG.ROOT_DIR, 'npy/test_VariousFeatures_1000.npy'), test_mfcc)
test_mfcc = np.load(os.path.join(CONFIG.ROOT_DIR, 'npy/test_VariousFeatures_1000.npy'))

In [46]:
def inference(models, test_features):
    predictions = []
    for model in models:
        preds = model.predict_proba(test_features)[:, 1]
        predictions.append(preds)
    return np.vstack(predictions).T

In [47]:
preds = inference(models, test_mfcc)


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.



    E.g. tree_method = "hist", device = "cuda"



In [51]:
preds[0]

array([0.80321026, 0.5836144 ], dtype=float32)

### Submission

In [53]:
submit = pd.read_csv(os.path.join(CONFIG.ROOT_DIR,'./sample_submission.csv'))
submit.iloc[:, 1:] = preds
submit.head()

  submit.iloc[:, 1:] = preds
  submit.iloc[:, 1:] = preds


Unnamed: 0,id,fake,real
0,TEST_00000,0.80321,0.583614
1,TEST_00001,0.782818,0.548817
2,TEST_00002,0.860601,0.575823
3,TEST_00003,0.669753,0.149042
4,TEST_00004,0.830531,0.316183


In [54]:
submit.to_csv(f'./output/submit_XGBoost_Augment.csv', index=False)

### AfterTest

In [None]:
print(model(torch.tensor(train_features).float().to(device)).cpu().detach().numpy()[:10])
print(train_labels[:10])

In [None]:
np.where((train_labels[:, 0] == 1) & (train_labels[:, 1] == 1))[0]