In [2]:
import os
import random

import librosa
import numpy as np
import pandas as pd
import scipy.io.wavfile
import torch
from datasets import load_dataset
from diffusers import DiffusionPipeline
from diffusers import Mel
from IPython.display import Audio
from librosa.beat import beat_track

#### Load Model

In [3]:
# Fixed seed so that both the pair sampling (`random`) and the torch generator
# handed to the diffusion pipeline start from a known state on a fresh kernel.
SEED = 42
random.seed(SEED)

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
generator = torch.Generator(device=device).manual_seed(SEED)

In [1]:
# Load the pretrained audio diffusion pipeline onto `device`
# (project: https://github.com/teticio/audio-diffusion), then keep its `mel`
# attribute and the sample rate it reports for later cells.
audio_diffusion = DiffusionPipeline.from_pretrained(
    'teticio/audio-diffusion-ddim-256'
).to(device)
mel = audio_diffusion.mel
sample_rate = mel.get_sample_rate()

#### Randomly Generate Data Pairs

In [5]:
# Training metadata, reduced to the four columns the pairing step reads.
df = pd.read_csv(
    '/scratch/ys5hd/Riffusion/music/UrbanSound8K/metadata/UrbanSound8K_train.csv'
)[['slice_file_name', 'fsID', 'fold', 'classID']]

In [6]:
# Build, per class, a table of file pairs: a randomly drawn "_1" file and a
# randomly chosen "_2" file of the same class but with a different `fsID`.
PAIRS_PER_CLASS = 100  # number of "_1" files drawn per class

sample_df = []
for cls in df['classID'].unique():
    smp1 = df.loc[df['classID'] == cls].reset_index(drop=True)

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the draw at the number of rows available for this class.
    n_anchors = min(PAIRS_PER_CLASS, len(smp1))
    smp2 = (
        smp1.iloc[random.sample(range(len(smp1)), n_anchors)]
        .reset_index(drop=True)
        .add_suffix('_1')
    )

    # For every drawn row pick one partner file whose fsID differs. A row for
    # which no such partner exists gets None and is dropped below instead of
    # crashing the whole loop.
    partners = []
    for fs_id in smp2['fsID_1']:
        candidates = smp1.loc[smp1['fsID'] != fs_id, 'slice_file_name'].tolist()
        partners.append(random.choice(candidates) if candidates else None)
    smp2['slice_file_name_2'] = partners
    smp2 = smp2.dropna(subset=['slice_file_name_2'])

    # Attach the partner's remaining metadata (fsID_2, fold_2, classID_2).
    smp2 = pd.merge(smp1.add_suffix('_2'), smp2, on='slice_file_name_2')

    sample_df.append(smp2)

3
2
1
0
9
8
5
4
7
6


In [8]:
# `sample_df` starts out as a list of per-class frames and is rebound to the
# concatenated DataFrame here. Guard the concat so that re-running this cell
# does not pass a DataFrame to pd.concat (which raises TypeError).
if isinstance(sample_df, list):
    # ignore_index: each per-class frame carries its own 0..n-1 index, which
    # would otherwise leave duplicate labels in the combined frame.
    sample_df = pd.concat(sample_df, ignore_index=True)

sample_df.to_csv('sample_aug.csv', index=False)

#### Interpolate

In [23]:
def interpolate_audio(fname1, fname2, TARGET_PATH, alpha=0.5):
    """Blend two audio files through the diffusion pipeline and save the result.

    The first slice of each file is turned into a mel image, both images are
    encoded with ``audio_diffusion.encode``, the two encodings are mixed with
    ``audio_diffusion.slerp`` and the mix is run through the pipeline. The
    first generated audio track is written to ``TARGET_PATH`` as a WAV file.

    Parameters
    ----------
    fname1, fname2 : str
        Paths of the two source audio files.
    TARGET_PATH : str
        Path of the WAV file to write.
    alpha : float, default 0.5
        Mixing weight passed to ``audio_diffusion.slerp``. The default keeps
        the previously hard-coded value; the original slider annotation gave
        the intended range as 0 to 1.

    Returns
    -------
    None

    Notes
    -----
    Uses the notebook-level ``audio_diffusion``, ``generator`` and
    ``sample_rate`` objects, so those cells must have been run first.
    """
    mel = Mel(x_res=256,
              y_res=256,
              hop_length=256,
              sample_rate=22050,
              n_fft=2048,
              n_iter=32)

    # The image comes from audio_slice_to_image; the value returned by
    # load_audio was never used, so it is no longer bound to a name.
    mel.load_audio(fname1)
    image = mel.audio_slice_to_image(0)

    mel.load_audio(fname2)
    image2 = mel.audio_slice_to_image(0)

    noise = audio_diffusion.encode([image])
    noise2 = audio_diffusion.encode([image2])

    output = audio_diffusion(
        noise=audio_diffusion.slerp(noise, noise2, alpha),
        generator=generator)
    audio = output.audios[0, 0]

    # NOTE(review): the same Mel instance loaded both files, so `mel.audio`
    # presumably holds the waveform of fname2 (the last file loaded); the
    # output is trimmed to that length. Confirm this is the intended length.
    scipy.io.wavfile.write(TARGET_PATH, sample_rate, audio[:len(mel.audio)])

#     display(Audio(mel.image_to_audio(image), rate=sample_rate))
#     display(Audio(mel.image_to_audio(image2), rate=sample_rate))
#     display(Audio(audio[:len(mel.audio)], rate=sample_rate))

In [2]:
# Generate one interpolated audio file for every pair listed in sample_df.

TARGET_PATH = '/scratch/ys5hd/Riffusion/music/UrbanSound8K/audio/fold11/'
PATH = '/scratch/ys5hd/Riffusion/music/UrbanSound8K/audio/fold'

# fold11 is a new output folder. Writing a WAV into a directory that does not
# exist fails, so create it up front rather than on the first (expensive) pair.
os.makedirs(TARGET_PATH, exist_ok=True)

for index, row in sample_df.iterrows():
    fname1 = PATH + str(row['fold_2']) + '/' + row['slice_file_name_2']
    fname2 = PATH + str(row['fold_1']) + '/' + row['slice_file_name_1']

    # Output name: the "_2" file name minus its last four characters (the
    # ".wav" extension in the metadata shown in this notebook), an underscore,
    # then the full "_1" file name. The metadata cell further down rebuilds
    # the same name, so the two must stay in sync.
    out_name = row['slice_file_name_2'][:-4] + '_' + row['slice_file_name_1']
    interpolate_audio(fname1, fname2, TARGET_PATH + out_name)

#### Generate New Training Data File for Urban Sound 8k

- The generated audio clips are added to the training dataset as an additional fold (fold 11).

In [32]:
# Reload the full training metadata; `df` was cut down to four columns earlier.
df = pd.read_csv(
    filepath_or_buffer='/scratch/ys5hd/Riffusion/music/UrbanSound8K/metadata/UrbanSound8K_train.csv'
)

In [35]:
# Preview the first five rows of the reloaded metadata.
df.head(n=5)

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [40]:
# File name of each generated clip: the "_2" name without its last four
# characters, an underscore, then the "_1" name (same scheme as the file
# names written by the generation loop).
sample_df['slice_file_name'] = (
    sample_df['slice_file_name_2'].str.slice(stop=-4)
    + '_'
    + sample_df['slice_file_name_1']
)

In [43]:
# Metadata rows for the generated clips: file name, class of the "_1" file,
# and the new fold number 11.
# Built with rename/assign on a fresh frame so that no column is assigned into
# a slice of `sample_df` (which triggers pandas' SettingWithCopyWarning).
# `fold_1` is no longer selected because it was overwritten immediately.
sdf = (
    sample_df[['slice_file_name', 'classID_1']]
    .rename(columns={'classID_1': 'classID'})
    .assign(fold=11)
)

In [51]:
# Append the generated-clip rows to the original training metadata and save
# the combined table. Columns that exist only in `df` are left empty for the
# appended rows.
pd.concat(objs=[df, sdf]).to_csv(
    path_or_buf='/scratch/ys5hd/Riffusion/music/UrbanSound8K/metadata/UrbanSound8K_train_aug.csv',
    index=False,
)