Make sure your input recordings are clear and free of noise, music, and silence. Each recording should contain the voice of only one person (this is a requirement of some TTS models).

In [None]:
from pydub import AudioSegment
from pydub.silence import split_on_silence
import speech_recognition as sr
import pydub
from pydub import AudioSegment
import wave
import io
import os
import soundfile as sf
import scipy.signal as signal

`durs` is a list of lists that holds, for each file, the start and end of the target voice in milliseconds. For example, if I want to trim the first file from 00:00:12 to 01:05:03, then the first element should be [12x1000, 1x60x60x1000 + 5x60x1000 + 3x1000], in other words [12000, 3903000].

Here is a real example for three files.

In [None]:
# One [start_ms, end_ms] pair per input voice file, in the same order as the
# (numerically sorted) files; minutes are written as multiples of 60000 ms.
durs = [
    [53893, 1028903],
    [15429, 18*60000 + 35068],
    [13961, 17*60000 + 28160],
]

We give the durations of the voice files to the code and get resampled, single-channel audio chunks with 200 ms of silence at the start and end of each chunk. The code cuts the voice at silent parts so that you won't get chunks that contain incomplete words. The cut point is searched for between 7 seconds (`min_len`) and 12 seconds (`max_len = min_len + 5 s`) after the start of each chunk, so chunks are roughly 7 to 12 seconds long before the silence padding is added.

In [None]:
# Create the output folders. exist_ok=True lets this cell be re-run without
# failing because the folders already exist.
os.makedirs('data/wavs', exist_ok=True)

directo = "<directory to voice files>"

target_sample_rate = 24000

# Order the input files by the digits in their names so that they line up
# with the entries of `durs`.
voice_names = os.listdir(directo)
voice_names.sort(key=lambda f: int(''.join(filter(str.isdigit, f))))

# Start from 200 ms of silence and append the wanted range of every file.
sound = AudioSegment.silent(duration=200)

for voicename, x in zip(voice_names, durs):
    voice = AudioSegment.from_file(directo + "/" + voicename)
    sound += voice[x[0]:x[1]]
    # Be careful about the RAM capacity. If using all voices at once takes high values of RAM, then try list slicing (divide it into smaller parts).
    # For example once from voice 1 till 8 and then from voice 8 till last one.

# set_channels() returns a new AudioSegment instead of modifying in place,
# so the result has to be assigned back for the conversion to take effect.
sound = sound.set_channels(1)

min_len = 7 * 1000              # earliest cut point after a chunk start (ms)
max_len = min_len + 5 * 1000    # latest cut point after a chunk start (ms)

start = 0
end = 0

# Padding added before and after every exported chunk.
silence_2 = AudioSegment.silent(duration=200)

i = 0

while end < len(sound):

    # Look for speech only inside the [min_len, max_len] window after `start`.
    audio_chunks = split_on_silence(sound[start + min_len:start + max_len], min_silence_len=200, silence_thresh=-47, keep_silence=0)
    output_file = f"data/wavs/{i}.wav"
    print("Exporting file", output_file)

    if audio_chunks:
        # NOTE(review): this adds the LENGTH of the first non-silent chunk;
        # any silence at the very start of the window is not counted, so the
        # cut may fall slightly before the end of that chunk - confirm.
        end = start + min_len + len(audio_chunks[0])
    else:
        # The whole window is silent (or empty): there is no first chunk to
        # measure, so cut right at the start of the window, which is silence.
        end = start + min_len

    final_file = (silence_2 + sound[start:end] + silence_2).set_channels(1)
    final_file.export(output_file, format="wav")

    # Re-read the exported file and resample it to the target sample rate.
    data, samplerate = sf.read(output_file)
    resampled_data = signal.resample(data, int(len(data) * target_sample_rate / samplerate))
    sf.write(output_file, resampled_data, target_sample_rate, subtype='PCM_24')
    print("Resampling file", output_file)

    start = end
    i += 1




Now we remove the files (chunks) that are shorter than 7 seconds. These are the leftover chunks from the ends of the voices, which usually contain nothing useful because they are so short.

In [None]:
directory = "data/wavs"

# List the chunks ordered by the number embedded in each file name.
files_names = sorted(os.listdir(directory), key=lambda f: int(''.join(filter(str.isdigit, f))))

for filename in files_names:
    wav_path = directory + "/" + filename

    # Duration in seconds = number of frames / frames per second.
    with wave.open(wav_path) as mywav2:
        duration = mywav2.getnframes() / mywav2.getframerate()

    # Chunks of 7 seconds or more are kept untouched.
    if duration >= 7:
        continue

    os.remove(wav_path)
    print(f"{wav_path} with {duration} length removed!")

Now let's multiprocess the function that uses audio chunks to write the relevant transcription for each one in a CSV file.

In [None]:
import multiprocessing

directory = "data/wavs"

# Chunk files ordered by the number embedded in each file name.
files_names = os.listdir(directory)
files_names.sort(key=lambda f: int(''.join(filter(str.isdigit, f))))


def fun(part):
    """Transcribe the chunk files in files_names[part[0]:part[1]].

    Args:
        part: A [start, stop] pair of indices into the global `files_names`.

    Returns:
        A list of "<text>|<filename>\\n" lines, one per chunk that was
        transcribed. Chunks whose recognition fails are reported and skipped.

    The lines are returned instead of being written here: a file handle
    shared between worker processes is buffered separately in each process
    (and the file is re-opened and truncated when workers are spawned), which
    loses or interleaves data. The parent process does the writing.
    """
    recognizer = sr.Recognizer()
    lines = []

    for filename in files_names[part[0]:part[1]]:

        with sr.AudioFile(directory + "/" + filename) as source:
            audio = recognizer.record(source)

        try:
            text = recognizer.recognize_google(audio, language='fa-IR')
        except (sr.UnknownValueError, sr.RequestError) as err:
            # One failed chunk must not abort the whole slice.
            print(f"No transcription for {filename}: {err!r}")
            continue

        lines.append(f"{text}|{filename}\n")

    return lines


if __name__ == '__main__':

    # Split the chunks into slices of 500 files ([0, 500], [500, 1000], ...),
    # computed from the actual number of files instead of being hard-coded.
    # (multiprocess is very useful here to boost the code because every single chunk takes some time to get processed on
    # the internet and generate the transcription).
    chunk_size = 500
    max_processes = 6
    inputs = [[lo, min(lo + chunk_size, len(files_names))]
              for lo in range(0, len(files_names), chunk_size)]

    results = []
    if inputs:
        # The context manager releases the worker processes when done.
        with multiprocessing.Pool(processes=min(len(inputs), max_processes)) as pool:
            results = pool.map(fun, inputs)

    # Only the parent process writes, so no line is lost or interleaved.
    with io.open(r"data/metadata.csv", mode="w", encoding="utf-8") as meta:
        for lines in results:
            meta.writelines(lines)


Because of multiprocessing, it's possible to lose some data, so we compare transcriptions with audio chunks and generate transcriptions for the audio chunks that don't have one.

In [None]:
import io
import os

# Collect the file names that already have a transcription. Each metadata
# line is "<text>|<filename>\n"; splitting on the LAST "|" keeps this correct
# even when the text itself contains "|", and rstrip("\n") does not eat a
# real character when the final line has no trailing newline. Lines without
# a "|" (e.g. blank lines) are ignored.
with io.open(r"data/metadata.csv", mode="r", encoding="utf-8") as meta:
    list_txt = [
        txt.rstrip("\n").rsplit("|", 1)[1]
        for txt in meta
        if "|" in txt
    ]

# All audio chunks on disk, ordered by the number in their file names.
files_names = os.listdir("data/wavs/")
files_names.sort(key=lambda f: int(''.join(filter(str.isdigit, f))))

list_wavs = list(files_names)

def uncommon_elements(list1, list2):
    """Return the elements of list1 that do not occur in list2.

    The order (and any duplicates) of list1 is preserved.

    Args:
        list1: Elements to filter.
        list2: Elements to exclude.

    Returns:
        A new list with every element of list1 that is not in list2.
    """
    try:
        # A set gives O(1) membership tests instead of scanning list2 for
        # every element of list1.
        excluded = set(list2)
        return [element for element in list1 if element not in excluded]
    except TypeError:
        # Unhashable elements cannot go through a set; fall back to the
        # plain list scan.
        return [element for element in list1 if element not in list2]

# Audio chunks that have no line in the metadata file yet.
unc_elements = uncommon_elements(list_wavs, list_txt)
print(unc_elements)
# Release the handle used for reading before the file is reopened for
# appending (closing an already closed file is harmless).
meta.close()

recognizer = sr.Recognizer()

# The context manager closes the file even if a chunk raises unexpectedly.
with io.open(r"data/metadata.csv", mode="a", encoding="utf-8") as meta:

    for filename in unc_elements:

        with sr.AudioFile("data/wavs/" + filename) as source:
            audio = recognizer.record(source)
        print(filename)

        try:
            text = recognizer.recognize_google(audio, language='fa-IR')
        except (sr.UnknownValueError, sr.RequestError) as err:
            # Skip chunks that cannot be transcribed instead of aborting
            # the remaining ones.
            print(f"Could not transcribe {filename}: {err!r}")
            continue

        meta.write(f"{text}|{filename}\n")


This step is optional: using the code below, we can remove duplicate lines and sort the transcriptions by the name of their related audio chunk.

In [None]:
import io

# Copy the metadata file line by line, keeping only the first occurrence of
# each line.
# NOTE(review): this reads "metadataAllAll.csv" and writes "metadata.csv" in
# the working directory, while the earlier steps write "data/metadata.csv" -
# confirm the intended paths.
lines_seen = set()

# The input is opened first so that a missing input file does not leave an
# empty output behind, and both files are closed when the block ends.
with io.open(r"metadataAllAll.csv", mode="r", encoding="utf-8") as infile, \
        io.open(r"metadata.csv", mode="w", encoding="utf-8") as outfile:

    for line in infile:

        if line not in lines_seen:

            outfile.write(line)
            lines_seen.add(line)


def my_sort(line):
    """Return the numeric sort key of a "<text>|<number>.wav" metadata line.

    The line is split on the LAST "|" so that a transcription containing "|"
    does not shift the file-name field.

    Args:
        line: One metadata line, with or without a trailing newline.

    Returns:
        The number in the file name (its last four characters, the ".wav"
        extension, are dropped) as a float.

    Raises:
        IndexError: If the line contains no "|".
        ValueError: If the file-name stem is not a number.
    """
    line_fields = line.strip().rsplit('|', 1)
    indx = float(line_fields[1][:-4])
    return indx


# Read every metadata line into memory.
with io.open(r"metadata.csv", mode="r", encoding="utf-8") as fp:
    contents = fp.readlines()

# Sort BEFORE reopening the file for writing: mode "w" truncates the file,
# so a malformed line that makes the sort fail would otherwise leave an
# empty metadata file behind.
contents.sort(key=my_sort)

# Write the lines back in sorted order.
with io.open(r"metadata.csv", mode="w", encoding="utf-8") as fp:
    fp.writelines(contents)
