In [1]:
import os, time
import librosa
import matplotlib.pyplot as plt
import librosa.display
import numpy as np
import json
import datetime

In [2]:
# Input corpus and output locations.
# NOTE(review): these are absolute, machine-specific paths; adjust them (or
# derive them from a configurable root) before running on another machine.
data_path = '/home/bruno/datasets/musan/speech/librivox/'
save_path = '/home/bruno/datasets/mixed-musan/wav16/'
json_annotation_path = '/home/bruno/datasets/mixed-musan/'

# Create the output directories if they are missing. exist_ok=True makes the
# call idempotent and avoids the check-then-create race of a separate
# os.path.isdir() test; it still raises if the path exists but is not a
# directory.
for output_dir in (save_path, json_annotation_path):
    os.makedirs(output_dir, exist_ok=True)

In [3]:
# Collect the full path of every .wav file found under data_path.
# os.walk returns the entries of each directory in arbitrary order, so the
# directories AND the file names inside each directory are sorted explicitly.
# Without this, the even/odd split below (and therefore which recordings end
# up paired under a given output index) could differ between runs or machines.
all_speakers = [
    os.path.join(path, file)
    for path, _dirs, files in sorted(os.walk(data_path))
    for file in sorted(files)
    if file.endswith('.wav')
]

# Alternate the files between the two lists:
# even positions -> speaker_0, odd positions -> speaker_1.
speaker_0 = all_speakers[0::2]
speaker_1 = all_speakers[1::2]

In [4]:
sr = 16000        # sample rate (Hz) requested from librosa when loading audio
vad_top_db = 25   # threshold (dB below peak) passed to librosa.effects.split
max_utt = min(len(speaker_0), len(speaker_1))   # number of (speaker_0, speaker_1) pairs

# Placeholder for the concatenated signal; only the disabled wav export uses it.
utter = np.empty(shape=[0, ])


def _speech_segments(signal, sr, speaker, offset=0.0, top_db=vad_top_db):
    """Return the annotation records for the non-silent parts of `signal`.

    Parameters
    ----------
    signal : audio samples as returned by librosa's loader.
    sr : sample rate of `signal`, used to convert sample indices to seconds.
    speaker : label stored under the 'speaker' key of every record.
    offset : seconds added to every timestamp (position of `signal` in the
        concatenated recording).
    top_db : threshold forwarded to librosa.effects.split.

    Returns
    -------
    list of dict with keys 'start', 'end' (str(datetime.timedelta) of the
    interval bounds in seconds) and 'speaker'.
    """
    # voice activity detection: each row is a (start_sample, end_sample) pair
    intervals = librosa.effects.split(signal, top_db=top_db)
    return [
        {
            'start': str(datetime.timedelta(seconds=offset + first / sr)),
            'end': str(datetime.timedelta(seconds=offset + last / sr)),
            'speaker': speaker,
        }
        for first, last in intervals
    ]


for i in range(max_utt):
    utter0, rate = librosa.core.load(speaker_0[i], sr=sr)
    utter1, rate = librosa.core.load(speaker_1[i], sr=sr)

    # Duration of the first recording in seconds: the second speaker's
    # timestamps are shifted by it, as if utter1 were appended to utter0.
    final_utter0 = utter0.shape[0] / sr

    json_data = (
        _speech_segments(utter0, sr, speaker=0)
        + _speech_segments(utter1, sr, speaker=1, offset=final_utter0)
    )

    # Audio export is currently disabled, so only the annotations are written.
    # NOTE(review): librosa.output was removed in librosa 0.8; if this is
    # re-enabled on a newer install another writer is needed - confirm version.
    #utter = np.concatenate((utter0, utter1), axis=0)
    #output_wav = os.path.join(save_path, 'two_speakers_' + str(i) + '.wav')
    #librosa.output.write_wav(output_wav, utter, sr)

    output_json = os.path.join(json_annotation_path, 'two_speakers_' + str(i) + '.json')
    with open(output_json, 'w') as f:
        json.dump(json_data, f, indent=4)

In [None]:
# Visual sanity check: plot the first recording of each speaker list and their
# concatenation as three stacked panels of a single figure.
sr = 16000
utter1, rate = librosa.core.load(speaker_0[0], sr=sr)
utter2, rate = librosa.core.load(speaker_1[0], sr=sr)
utter3 = np.concatenate((utter1, utter2), axis=0)

# librosa.display.waveplot was removed in librosa 0.10 (replaced by waveshow);
# keep using waveplot where it exists and fall back to waveshow otherwise.
plot_wave = getattr(librosa.display, 'waveplot', None) or librosa.display.waveshow

# One figure for all three panels. Calling plt.figure() before every
# plt.subplot(3, 1, k) would open a new figure each time and leave each one
# with a single small axes.
plt.figure()
panels = [(utter1, 'Wave 1'), (utter2, 'Wave 2'), (utter3, 'Wave 1 + Wave 2')]
for row, (signal, title) in enumerate(panels, start=1):
    plt.subplot(3, 1, row)
    plot_wave(signal, sr=sr)   # sr by keyword: positional sr is rejected by newer librosa
    plt.title(title)
plt.tight_layout()             # keep titles and axis labels from overlapping