# Train Dataset Augmentation

## Import

In [None]:
import os

import random

import librosa
import soundfile

import torch
import torchaudio

from IPython.display import Audio

import pandas as pd
from tqdm.notebook import tqdm

## Load Data

In [None]:
# Load the training metadata table; later cells read its `id` and `label` columns.
df = pd.read_csv("./data/train.csv")

In [None]:
# Preview the `id` column (the cell's last expression is displayed by the notebook).
df['id']

In [None]:
# Preview the `label` column; later cells filter it on the values 'fake' and 'real'.
df['label']

## Augmentation Function

In [None]:
def generate(id1, id2, from_pth="./data/train", to_pth="./data/train_augmented", filename=None, sample_rate=32000, channel_size=1):
    """Mix two clips, add random white-noise bursts, and save the result as .ogg.

    Args:
        id1: Id of the first clip, read from ``<from_pth>/<id1>.ogg``. An id
            containing ``"NONE_"`` means "no clip" and is replaced by silence.
        id2: Id of the second clip, handled exactly like ``id1``.
        from_pth: Directory the source ``.ogg`` files are read from.
        to_pth: Directory the augmented file is written to.
        filename: Output file name without extension; defaults to
            ``"<id1>-<id2>"``.
        sample_rate: Sample rate every input must have; also used when saving.
        channel_size: Channel count of the generated silence and noise.

    Returns:
        The path of the written file, ``<to_pth>/<filename>.ogg``.

    Raises:
        ValueError: If an input's sample rate differs from ``sample_rate``.
    """
    def load_clip(clip_id):
        # Placeholder ids yield 44000 samples of silence at the expected rate.
        if "NONE_" in clip_id:
            return torch.zeros((channel_size, 44000)), sample_rate
        return torchaudio.load(f"{from_pth}/{clip_id}.ogg")

    # Load Audio Files
    waveform1, sample_rate1 = load_clip(id1)
    waveform2, sample_rate2 = load_clip(id2)

    if sample_rate1 != sample_rate or sample_rate2 != sample_rate:
        raise ValueError("Sampling rate does not match.")

    waveform_size = max(waveform1.size(1), waveform2.size(1))

    # Right-pad the shorter waveform with zeros so both have the same length.
    waveform1 = torch.nn.functional.pad(waveform1, (0, waveform_size - waveform1.size(1)))
    waveform2 = torch.nn.functional.pad(waveform2, (0, waveform_size - waveform2.size(1)))

    # Create Random White Noise from random-length segments. Each segment is
    # audible with probability 1/3 (random volume in [0, 0.1]) and silent otherwise.
    # The upper bound is clamped to 1 so randint() stays valid for very short clips.
    max_segment = max(1, waveform_size // 5)
    segments = [torch.zeros((channel_size, 0))]
    noise_length = 0
    while noise_length < waveform_size:
        partition = torch.randn((channel_size, random.randint(1, max_segment)))
        if random.choice([True, False, False]):
            partition *= random.uniform(0, 0.1)  # select noise volume
        else:  # remove noise
            partition *= 0
        segments.append(partition)
        noise_length += partition.size(1)
    # Concatenate once, then trim the overshoot of the last segment.
    noise = torch.cat(segments, dim=1)[:, :waveform_size]

    # Combine Waveforms
    waveform_noise = waveform1 + waveform2 + noise

    # Save Augmented Audio
    if filename is None:
        filename = id1 + "-" + id2
    filepath = to_pth + "/" + filename + ".ogg"
    torchaudio.save(filepath, waveform_noise, sample_rate)
    return filepath

## Augmentation 1: Add Noise to Waveforms

In [None]:
# Produce three independently noised copies of the training set, each with its
# own folder (train_noise_type1..3) and a matching CSV next to it.
for count in range(3):
    target_dir = f"./data/train_noise_type{count+1}"
    os.makedirs(target_dir, exist_ok=True)

    # Re-read the metadata so every variant starts from the unmodified rows.
    df = pd.read_csv("./data/train.csv")

    # Assign the whole column at once. Element-wise `df['path'][i] = ...` is
    # chained assignment: pandas warns about it, and under copy-on-write it
    # does not update `df` at all.
    # Pairing each clip with the "NONE_" placeholder adds only noise to it.
    df['path'] = [
        generate(clip_id, "NONE_", from_pth="./data/train", to_pth=target_dir, filename=clip_id).replace("./data", ".")
        for clip_id in tqdm(df['id'])
    ]

    df.to_csv(target_dir + ".csv", index=False)

## Augmentation 2: Combine Waveforms

In [None]:
# Ensure the output folder for the combined clips exists. makedirs with
# exist_ok=True replaces the check-then-create pair and also creates a missing
# parent directory.
os.makedirs("./data/train_augmented", exist_ok=True)

In [None]:
# Reload the original metadata so the cells below start from the unmodified table.
df = pd.read_csv("./data/train.csv")

In [None]:
# Rows whose label is 'fake'; the trailing expression displays them in the notebook.
is_fake = df['label'] == 'fake'
fakes = df.loc[is_fake]
fakes

In [None]:
# Rows whose label is 'real'; the trailing expression displays them in the notebook.
is_real = df['label'] == 'real'
reals = df.loc[is_real]
reals

In [None]:
# Peek at the first row tuple of `fakes` to see the field layout used by the loops below.
next(iter(fakes.itertuples()))

In [None]:
from sklearn.utils import shuffle
# Display a shuffled `fakes`; the result is not stored anywhere.
shuffle(fakes)

In [None]:
# Peek at the first row tuple of a shuffled `fakes`.
next(iter(shuffle(fakes).itertuples()))

In [None]:
# Column-wise accumulator for the augmented dataset: one list per output column.
# `fake` and `real` are filled with independent 0/1 flags by the cells below.
labels = {"id": [], "path": [], "fake": [], "real": []}

#### Real + Real [0, 1]

In [None]:
# Mix pairs drawn from two independently shuffled passes over the real clips.
# Every resulting sample is labelled fake=0, real=1.
reals1 = shuffle(reals).itertuples()
reals2 = shuffle(reals).itertuples()

progress = tqdm(range(len(reals)))
for _, first_row, second_row in zip(progress, reals1, reals2):
    # Position 1 of a row tuple is the first column after the index.
    first_id, second_id = first_row[1], second_row[1]
    saved_path = generate(first_id, second_id, from_pth="./data/train", to_pth="./data/train_augmented")
    new_pth = saved_path.replace("./data", ".")
    record = (
        ('id', "-".join((first_id, second_id))),
        ('path', new_pth),
        ('fake', 0),
        ('real', 1),
    )
    for column, value in record:
        labels[column].append(value)

In [None]:
# Preview the label table accumulated so far.
pd.DataFrame(labels)

#### Fake + Fake [1, 0]

In [None]:
# Mix pairs drawn from two independently shuffled passes over the fake clips.
# Every resulting sample is labelled fake=1, real=0.
fakes1 = shuffle(fakes).itertuples()
fakes2 = shuffle(fakes).itertuples()

# The progress range is sized from `fakes`, the frame actually being iterated.
# Sizing it from `reals` capped the number of pairs at len(reals) and showed a
# wrong progress total.
for _, f1, f2 in zip(tqdm(range(len(fakes))), fakes1, fakes2):
    # Position 1 of a row tuple is the first column after the index.
    new_pth = generate(f1[1], f2[1], from_pth="./data/train", to_pth="./data/train_augmented").replace("./data", ".")
    labels['id'].append(f1[1] + "-" + f2[1])
    labels['path'].append(new_pth)
    labels['fake'].append(1)
    labels['real'].append(0)

In [None]:
# Preview the label table accumulated so far.
pd.DataFrame(labels)

#### Real + Fake [1, 1]

In [None]:
# Two passes; each pairs rows of a freshly shuffled `reals` with rows of a
# freshly shuffled `fakes`. zip() stops at the shortest of its inputs.
# Every resulting sample is labelled fake=1, real=1.
for _ in range(2):
    reals1 = shuffle(reals).itertuples()
    fakes2 = shuffle(fakes).itertuples()

    progress = tqdm(range(len(reals)))
    for _, real_row, fake_row in zip(progress, reals1, fakes2):
        # Position 1 of a row tuple is the first column after the index.
        real_id, fake_id = real_row[1], fake_row[1]
        saved_path = generate(real_id, fake_id, from_pth="./data/train", to_pth="./data/train_augmented")
        new_pth = saved_path.replace("./data", ".")
        record = (
            ('id', "-".join((real_id, fake_id))),
            ('path', new_pth),
            ('fake', 1),
            ('real', 1),
        )
        for column, value in record:
            labels[column].append(value)

In [None]:
# Preview the label table accumulated so far.
pd.DataFrame(labels)

#### None + None [0, 0]

In [None]:
# Create 1000 noise-only samples: both inputs are the "NONE_" placeholder, so
# nothing is read from disk. Each sample is labelled fake=0, real=0.
for sample_index in tqdm(range(1000)):
    new_id = "NONE-NONE" + str(sample_index)
    saved_path = generate("NONE_", "NONE_", from_pth="./data/train", to_pth="./data/train_augmented", filename=new_id)
    record = {'id': new_id, 'path': saved_path.replace("./data", "."), 'fake': 0, 'real': 0}
    for column, value in record.items():
        labels[column].append(value)

In [None]:
# Write the accumulated label table next to the augmented audio folder.
# Plain string literal: the original f-string had no placeholders.
pd.DataFrame(labels).to_csv("./data/train_augmented.csv", index=False)