In [65]:
from TTS.api import TTS
from pydub import AudioSegment
import os
from tqdm import tqdm

In [None]:
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=True)

In [71]:
def split_text(text, max_len=200):
    paragraphs = text.split("\n")
    chunks = []
    for paragraph in paragraphs:
        sentences = paragraph.split(".")
        chunk = ""
        for sentence in sentences:
            sentence = sentence.strip()
            if len(sentence) == 0:
                continue
            sentence += "."
            if len(chunk) + len(sentence) <= max_len:
                chunk += " " + sentence
            else:
                if len(sentence) > max_len:
                    for i in range(0, len(sentence), max_len):
                        sub_sentence = sentence[i:i+max_len]
                        if sub_sentence[-1] != ".":
                            sub_sentence += "."
                        chunks.append(sub_sentence)
                else:
                    chunks.append(chunk.strip())
                    chunk = sentence
        if chunk:
            chunks.append(chunk.strip())
    return chunks

def gen_speech(text, name='output', speaker="Ana Florence", language="en", out_fmt="mp3"):
    audio = AudioSegment.empty()
    outfilename = f"{name}_{language}"
    for ind, chunk in enumerate(tqdm(split_text(text))):
        file_path=f"./processed_audio/{outfilename}_chunk_{ind}.wav"
        tts.tts_to_file(text=chunk,
            file_path=file_path,
            speaker=speaker,
            language=language)
        
        audio_chunk = AudioSegment.from_file(file_path)
        audio += audio_chunk
    
    audio.export(f"./processed_audio/{outfilename}.{out_fmt}", format=out_fmt)
    
    # remove the chunk audio files
    # for ind in range(len(split_text(text))):
    #     file_path=f"{name}_{language}_chunk_{ind}.wav"
    #     os.remove(file_path)
    return audio

In [75]:
with open("./inputs/meluha_text_en.txt", "r") as f:
    text_en = f.read()

gen_speech(text_en, name="meluha", language="en", speaker="Ana Florence")

  0%|          | 0/9 [00:00<?, ?it/s]

 > Text splitted to sentences.
['Shiva gazed at the orange sky.', 'The clouds hovering above Mansarovar had just parted to reveal the setting sun.', 'The brilliant giver of life was calling it a day once again.']


 11%|█         | 1/9 [00:20<02:44, 20.59s/it]

 > Processing time: 20.568287134170532
 > Real-time factor: 1.5986504261902186
 > Text splitted to sentences.
['But the sunset!', 'He tried never to miss the sunset!', 'On any other day, Shiva would have taken in the vista — the sun and the immense lake against the magnificent backdrop of the Himalayas stretching as .']


 22%|██▏       | 2/9 [00:46<02:44, 23.45s/it]

 > Processing time: 25.4265718460083
 > Real-time factor: 1.6414950262463197
 > Text splitted to sentences.
['far back as the eye could see.']


 33%|███▎      | 3/9 [00:49<01:26, 14.38s/it]

 > Processing time: 3.591294050216675
 > Real-time factor: 1.5857904879701552
 > Text splitted to sentences.
['Shiva had seen a few sunrises in his twenty-one years.', 'But not today.']


 44%|████▍     | 4/9 [00:57<00:59, 11.82s/it]

 > Processing time: 7.8878700733184814
 > Real-time factor: 1.5060225747841554
 > Text splitted to sentences.
['He squatted and perched his lithe, muscular body on the narrow ledge extending over the lake.', 'The numerous battle scars on his skin gleamed in the shimmering reflected light of the waters.']


 56%|█████▌    | 5/9 [01:18<01:00, 15.09s/it]

 > Processing time: 20.858378171920776
 > Real-time factor: 1.6212889124748067
 > Text splitted to sentences.
['Shiva remembered well his carefree childhood days.', 'He had perfected the art of throwing pebbles that bounced off the surface of the lake.']


 67%|██████▋   | 6/9 [01:32<00:43, 14.61s/it]

 > Processing time: 13.670319080352783
 > Real-time factor: 1.5572333015879634
 > Text splitted to sentences.
['He still held the record in his tribe for the highest number of bounces: seventeen.']


 78%|███████▊  | 7/9 [01:41<00:25, 12.98s/it]

 > Processing time: 9.605350971221924
 > Real-time factor: 1.6252147706832676
 > Text splitted to sentences.
['On a normal day, Shiva would have smiled at the memory from a cheerful past that had been overwhelmed by the angst of the present.']


 89%|████████▉ | 8/9 [01:54<00:13, 13.00s/it]

 > Processing time: 13.036885023117065
 > Real-time factor: 1.6884181160119545
 > Text splitted to sentences.
['But today, he turned back towards his village without any hint of joy.']


In [None]:
with open("./inputs/meluha_text_hi.txt", "r") as f:
    text_hi = f.read()

gen_speech(text_hi, name="meluha", language="hi", speaker="Ana Florence")

In [16]:
text_en = "Shiva gazed at the orange sky. The clouds hovering above Mansarovar had just parted to reveal the setting sun. The briliant giver of life was calling it a day once again. Shiva had seen a few sunrises in his twenty-one years."
len(text_en)

225

In [None]:
# generate speech by cloning a voice using default settings
tts.tts_to_file(text=text_en,
                file_path="output_en.wav",
                speaker="Ana Florence",
                language="en")

In [18]:
text_hi = "शिव ने नारंगी आकाश की ओर देखा। मानसरोवर के ऊपर मंडरा रहे बादल अस्त होते सूर्य को दिखाने के लिए अभी-अभी छंट चुके थे। जीवन का प्रतिभाशाली दाता एक बार फिर अपने दिन का आह्वान कर रहा था। शिव ने अपनी इक्कीस वर्ष की आयु में कुछ सूर्योदय देखे थे। "
len(text_hi)

239

In [5]:
# generate speech by cloning a voice using default settings
tts.tts_to_file(text=text_hi,
                file_path="output_hi.wav",
                speaker="Ana Florence",
                language="hi")

 > Text splitted to sentences.
['शिव ने नारंगी आकाश की ओर देखा। मानसरोवर के ऊपर मंडरा रहे बादल अस्त होते सूर्य को दिखाने के लिए अभी-अभी छंट चुके थे। जीवन का प्रतिभाशाली दाता एक बार फिर अपने दिन का आह्वान कर रहा था। शिव ने अपनी इक्कीस वर्ष की आयु में कुछ सूर्योदय देखे थे।']
 > Processing time: 33.49595499038696
 > Real-time factor: 1.7040722422801517


'output_hi.wav'