In [65]:
from TTS.api import TTS
from pydub import AudioSegment
import os
from tqdm import tqdm

In [None]:
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=True)

In [61]:
def split_text(text, max_len=200):
    sentences = text.split(".")
    chunks = []
    chunk = ""
    for sentence in sentences:
        sentence = sentence.strip()
        if len(sentence) == 0:
            continue
        sentence += "."
        if len(chunk) + len(sentence) <= max_len:
            chunk += " " + sentence
        else:
            if len(sentence) > max_len:
                for i in range(0, len(sentence), max_len):
                    sub_sentence = sentence[i:i+max_len]
                    if sub_sentence[-1] != ".":
                        sub_sentence += "."
                    chunks.append(sub_sentence)
            else:
                chunks.append(chunk.strip())
                chunk = sentence
    if chunk:
        chunks.append(chunk.strip())
    return chunks

def gen_speech(text, name='output', speaker="Ana Florence", language="en", out_fmt="mp3"):
    audio = AudioSegment.empty()
    for ind, chunk in enumerate(tqdm(split_text(text))):
        file_path=f"{name}_{language}_chunk_{ind}.wav"
        tts.tts_to_file(text=chunk,
            file_path=file_path,
            speaker=speaker,
            language=language)
        
        audio_chunk = AudioSegment.from_file(file_path)
        audio += audio_chunk
    
    audio.export(f"{name}_{language}.{out_fmt}", format=out_fmt)
    
    # remove the chunk audio files
    # for ind in range(len(split_text(text))):
    #     file_path=f"{name}_{language}_chunk_{ind}.wav"
    #     os.remove(file_path)
    return audio

In [62]:
text_en = "Shiva gazed at the orange sky. The clouds hovering above Mansarovar had just parted to reveal the setting sun. The briliant giver of life was calling it a day once again. Shiva had seen a few sunrises in his twenty-one years. But the sunset! He tried never to miss the sunset! On any other day, Shiva would have taken in the vista — the sun and the immense lake against the magnificent backdrop of the Himalayas stretching as far back as the eye could see. But not today. He squatted and perched his lithe, muscular body on the narrow ledge extending over the lake. The numerous batde-scars on his skin gleamed in the shimmering reflected light of the waters. Shiva remembered wel his carefree childhood days. He had perfected the art of throwing pebbles that bounced off the surface of the lake. He stil held the record in his tribe for the highest number of bounces: seventeen. On a normal day, Shiva would have smiled at the memory from a cheerful past that had been overwhelmed by the angst of the present. But today, he turned back towards his vilage without any hint of joy."

In [63]:
gen_speech(text_en, name='Meluha', language='en')

  0%|          | 0/7 [00:00<?, ?it/s]

 > Text splitted to sentences.
['Shiva gazed at the orange sky.', 'The clouds hovering above Mansarovar had just parted to reveal the setting sun.', 'The briliant giver of life was calling it a day once again.']


 14%|█▍        | 1/7 [00:20<02:02, 20.37s/it]

 > Processing time: 20.34393310546875
 > Real-time factor: 1.557322824583354
 > Text splitted to sentences.
['But the sunset!', 'He tried never to miss the sunset!', 'On any other day, Shiva would have taken in the vista — the sun and the immense lake against the magnificent backdrop of the Himalayas stretching as .']


 29%|██▊       | 2/7 [00:48<02:05, 25.14s/it]

 > Processing time: 28.443516969680786
 > Real-time factor: 1.5602723330749246
 > Text splitted to sentences.
['far back as the eye could see.']


 43%|████▎     | 3/7 [00:52<01:00, 15.13s/it]

 > Processing time: 3.2237179279327393
 > Real-time factor: 1.4234816627466536
 > Text splitted to sentences.
['Shiva had seen a few sunrises in his twenty-one years.', 'But not today.', 'He squatted and perched his lithe, muscular body on the narrow ledge extending over the lake.']


 57%|█████▋    | 4/7 [01:09<00:47, 15.95s/it]

 > Processing time: 17.179306030273438
 > Real-time factor: 1.5080725602248921
 > Text splitted to sentences.
['The numerous batde-scars on his skin gleamed in the shimmering reflected light of the waters.', 'Shiva remembered wel his carefree childhood days.']


 71%|███████▏  | 5/7 [01:25<00:32, 16.17s/it]

 > Processing time: 16.558461904525757
 > Real-time factor: 1.6168081558859686
 > Text splitted to sentences.
['He had perfected the art of throwing pebbles that bounced off the surface of the lake.', 'He stil held the record in his tribe for the highest number of bounces: seventeen.']


 86%|████████▌ | 6/7 [01:47<00:17, 17.98s/it]

 > Processing time: 21.455121994018555
 > Real-time factor: 1.6166123563699737
 > Text splitted to sentences.
['On a normal day, Shiva would have smiled at the memory from a cheerful past that had been overwhelmed by the angst of the present.', 'But today, he turned back towards his vilage without any hint of joy.']


100%|██████████| 7/7 [02:08<00:00, 18.36s/it]

 > Processing time: 21.19934320449829
 > Real-time factor: 1.608597338051933





In [41]:
combined_output_en = AudioSegment.empty()
for ind, chunk in enumerate(split_text(text)):
    audio_chunk = AudioSegment.from_file(f"output_en_chunk_{ind}.wav")
    combined_output_en += audio_chunk

    

In [42]:
combined_output_en

In [51]:
combined_output_en.export("output_en_full.mp3", format="mp3")

<_io.BufferedRandom name='output_en_full.mp3'>

In [16]:
text_en = "Shiva gazed at the orange sky. The clouds hovering above Mansarovar had just parted to reveal the setting sun. The briliant giver of life was calling it a day once again. Shiva had seen a few sunrises in his twenty-one years."
len(text_en)

225

In [None]:
# generate speech by cloning a voice using default settings
tts.tts_to_file(text=text_en,
                file_path="output_en.wav",
                speaker="Ana Florence",
                language="en")

In [18]:
text_hi = "शिव ने नारंगी आकाश की ओर देखा। मानसरोवर के ऊपर मंडरा रहे बादल अस्त होते सूर्य को दिखाने के लिए अभी-अभी छंट चुके थे। जीवन का प्रतिभाशाली दाता एक बार फिर अपने दिन का आह्वान कर रहा था। शिव ने अपनी इक्कीस वर्ष की आयु में कुछ सूर्योदय देखे थे। "
len(text_hi)

239

In [5]:
# generate speech by cloning a voice using default settings
tts.tts_to_file(text=text_hi,
                file_path="output_hi.wav",
                speaker="Ana Florence",
                language="hi")

 > Text splitted to sentences.
['शिव ने नारंगी आकाश की ओर देखा। मानसरोवर के ऊपर मंडरा रहे बादल अस्त होते सूर्य को दिखाने के लिए अभी-अभी छंट चुके थे। जीवन का प्रतिभाशाली दाता एक बार फिर अपने दिन का आह्वान कर रहा था। शिव ने अपनी इक्कीस वर्ष की आयु में कुछ सूर्योदय देखे थे।']
 > Processing time: 33.49595499038696
 > Real-time factor: 1.7040722422801517


'output_hi.wav'