Install NeMo.

In [None]:
# Install the NeMo library. If you are running locally (rather than on Google Colab), comment out the lines below
# and instead follow the instructions at https://github.com/NVIDIA/NeMo#Installation
# GITHUB_ACCOUNT and BRANCH select the fork and branch used by both the pip install and the git clone in this cell.
GITHUB_ACCOUNT = "NVIDIA"
BRANCH = "main"
!python -m pip install git+https://github.com/{GITHUB_ACCOUNT}/NeMo.git@{BRANCH}#egg=nemo_toolkit[all]

# Download a local copy of the NeMo scripts. If you are running locally and want to use your own local NeMo code,
# comment out the lines below and set NEMO_DIR to your local path.
# NEMO_DIR is used later to locate the G2P inference script under its examples/ directory.
NEMO_DIR = 'nemo'
!git clone -b {BRANCH} https://github.com/{GITHUB_ACCOUNT}/NeMo.git $NEMO_DIR

Import the required libraries.

In [None]:
import torch
import IPython.display as ipd
import re
import soundfile as sf
from matplotlib.pyplot import imshow
from matplotlib import pyplot as plt

from nemo.collections.tts.models import FastPitchModel
from nemo.collections.tts.models import HifiGanModel

Define file names

In [None]:
# Plain-text sentences to synthesize, one per line (created by the next cell).
INPUT_TEXT = "input_text.txt"
# Unique words from INPUT_TEXT, one per line with characters separated by spaces; fed to the G2P model.
INPUT_FOR_G2P = "input_for_g2p.txt"
# Tab-separated predictions written by the G2P inference script.
OUTPUT_OF_G2P = "output_of_g2p.txt"
# INPUT_TEXT with words replaced by their transcriptions; this is what the TTS cell reads.
INPUT_TEXT_PHONEMES = "input_text_phonemes.txt"

Create file with some input text.
Note that text normalization (conversion of digits to words etc.) is **not** included in this pipeline.

In [None]:
# Write ten lines of sample Russian text to INPUT_TEXT.
# The first command uses `>` to create/overwrite the file; the remaining ones use `>>` to append to it.
!echo "(Я представляю себе вашу ироническую улыбку. Тем не менее – буквально два слова.) Как известно, мир несовершенен." > {INPUT_TEXT}
!echo "Устоями общества являются корыстолюбие, страх и продажность." >> {INPUT_TEXT}
!echo "Конфликт мечты с действительностью не утихает тысячелетиями." >> {INPUT_TEXT}
!echo "Вместо желаемой гармонии на земле царят хаос и беспорядок." >> {INPUT_TEXT}
!echo "Более того, нечто подобное мы обнаружили в собственной душе." >> {INPUT_TEXT}
!echo "Мы жаждем совершенства, а вокруг торжествует пошлость. Как в этой ситуации поступает деятель, революционер?" >> {INPUT_TEXT}
!echo "Революционер делает попытки установить мировую гармонию." >> {INPUT_TEXT}
!echo "Он начинает преобразовывать жизнь, достигая иногда курьезных мичуринских результатов." >> {INPUT_TEXT}
!echo "Допустим, выводит морковь, совершенно неотличимую от картофеля. В общем, создает новую человеческую породу." >> {INPUT_TEXT}
!echo "Известно, чем это кончается… Что в этой ситуации предпринимает моралист? Он тоже пытается достичь гармонии." >> {INPUT_TEXT}


Some helper preprocessing functions

In [None]:
def clean_russian_g2p_trascription(text: str) -> str:
    """Normalize a raw G2P transcription string.

    The substitutions below are applied strictly in the listed order; the order
    matters because the ":" produced for "ʑ" is rewritten by the later ":" rule.
    Afterwards all whitespace is removed and every "_" becomes a single space.

    Args:
        text: transcription string to normalize.

    Returns:
        The normalized transcription.
    """
    ordered_substitutions = (
        ("<DELETE>", " "),
        ("+", ""),
        ("~", ""),
        ("ʑ", "ɕ:"),
        ("ɣ", "x"),
        (":", "ː"),
        ("'", "`"),
    )
    cleaned = text
    for old, new in ordered_substitutions:
        cleaned = cleaned.replace(old, new)
    # Whitespace-separated pieces are glued back together without any separator.
    cleaned = "".join(cleaned.split())
    return cleaned.replace("_", " ")


def clean_russian_text_for_tts(text: str) -> str:
    """Normalize Russian text before it is tokenized or passed on to TTS.

    Removes "+" (stress marks), case-folds the text, replaces "ё" with "е" and
    maps a fixed set of typographic hyphens, dashes, ellipses and quotation
    marks to their plain ASCII counterparts.

    Args:
        text: text to normalize.

    Returns:
        The normalized text.
    """
    # Every entry maps one character to one character, and no replacement is itself
    # a key, so a single translate() pass equals a chain of replace() calls.
    char_map = str.maketrans(
        {
            "ё": "е",
            "\u2011": "-",  # non-breaking hyphen
            "\u2014": "-",  # em dash
            "\u2026": ".",  # horizontal ellipsis
            "\u00ab": "\"",  # left-pointing double angle quotation mark
            "\u00bb": "\"",  # right-pointing double angle quotation mark
            "\u2019": "'",  # right single quotation mark
            "\u201c": "\"",  # left double quotation mark
            "\u201d": "\"",  # right double quotation mark
            "\u201e": "\"",  # double low-9 quotation mark
            "\u201f": "\"",  # double high-reversed-9 quotation mark
        }
    )
    without_stress = text.replace("+", "")
    return without_stress.casefold().translate(char_map)


Take all unique words from the input text and prepare them as input for the G2P model.
Note that the G2P model processes each word separately and does not take context into account.

In [None]:
# Collect every distinct word of the input text.
# Raw-string pattern: "\w" inside a plain string literal is an invalid escape sequence.
g2p_word_pattern = re.compile(r"\w+")

all_words = set()
with open(INPUT_TEXT, "r", encoding="utf-8") as inp:
    for line in inp:
        # Clean the whole line *before* tokenizing, exactly as the phonemization cell does.
        # Otherwise a stress mark ("+") inside a word would split it into two G2P inputs
        # that never match the word looked up later.
        text = clean_russian_text_for_tts(line.strip())
        all_words.update(g2p_word_pattern.findall(text))

# Write one word per line with its characters separated by spaces.
# Sorting keeps the file (and therefore the order of the predictions) identical between runs;
# plain set iteration order for strings changes from one interpreter run to the next.
with open(INPUT_FOR_G2P, "w", encoding="utf-8") as out:
    for w in sorted(all_words):
        out.write(" ".join(w) + "\n")


In [None]:
# Preview the first lines of the G2P input: one word per line, characters separated by spaces.
!head {INPUT_FOR_G2P}

Clone the [G2P model](https://huggingface.co/bene-ges/ru_g2p_ipa_bert_large) from HuggingFace.
If cloning doesn't work, run `git lfs install` first and then try again.


In [None]:
# Creates the ru_g2p_ipa_bert_large/ directory. Later cells read ru_g2p.nemo, heteronyms.txt
# and g2p_correct_vocab.txt from it.
!git clone https://huggingface.co/bene-ges/ru_g2p_ipa_bert_large

Run G2P inference on the words that we prepared

In [None]:
# Run the NeMo inference script with the cloned G2P checkpoint: it reads the words from INPUT_FOR_G2P
# and writes its predictions to OUTPUT_OF_G2P.
!python {NEMO_DIR}/examples/nlp/text_normalization_as_tagging/normalization_as_tagging_infer.py \
  pretrained_model=ru_g2p_ipa_bert_large/ru_g2p.nemo \
  inference.from_file={INPUT_FOR_G2P} \
  inference.out_file={OUTPUT_OF_G2P} \
  model.max_sequence_len=512 \
  inference.batch_size=128 \
  lang=ru


In [None]:
# Preview the raw G2P predictions (tab-separated; the parsing cell below expects five fields per line).
!head {OUTPUT_OF_G2P}

Preprocess input text for TTS using G2P results and vocabularies of known transcriptions.

In [None]:
# Heteronyms are words whose transcription is ambiguous; they are kept as plain text.
# Each line of the file, with surrounding whitespace stripped, becomes one set entry.
with open("ru_g2p_ipa_bert_large/heteronyms.txt", "r", encoding="utf-8") as heteronym_file:
    heteronyms = {entry.strip() for entry in heteronym_file}

# Maps a word (no spaces) to its transcription.
g2p_vocab = {}
# First read the transcriptions from our G2P prediction.
with open(OUTPUT_OF_G2P, "r", encoding="utf-8") as f:
    for line in f:
        fields = line.strip().split("\t")
        # Each prediction line must have exactly five tab-separated fields; anything else is
        # reported and skipped. An explicit check is used instead of a bare `except:`, which
        # would also swallow KeyboardInterrupt and unrelated errors.
        if len(fields) != 5:
            print("cannot read line: " + line)
            continue
        _, inp, transcription, _, _ = fields
        # The word was written with spaces between its characters; restore the original spelling.
        inp = inp.replace(" ", "")
        g2p_vocab[inp] = clean_russian_g2p_trascription(transcription)

# Then override the predictions with known transcriptions from the vocabulary.
with open("ru_g2p_ipa_bert_large/g2p_correct_vocab.txt", "r", encoding="utf-8") as f:
    for line in f:
        # Example input: ледок \t lʲɪd`ok
        inp, transcription = line.strip().split("\t")
        g2p_vocab[inp] = transcription

# A "word" is a run of word characters and hyphens; everything between words is copied verbatim.
# NOTE(review): the G2P input was tokenized with \w+ (no hyphens), so a hyphenated word is only
# found in g2p_vocab if the override vocabulary lists it - confirm this is intended.
tts_word_pattern = re.compile(r"[\w\-]+")


def phonemize_word(match):
    """Return the TTS representation of one matched word.

    Args:
        match: regex match object whose whole match is the word.

    Returns:
        The word itself if it is a heteronym or has no known transcription,
        otherwise its cleaned transcription from ``g2p_vocab``.
    """
    w = match.group(0)
    if w in heteronyms:
        return w
    if w in g2p_vocab:
        return clean_russian_g2p_trascription(g2p_vocab[w])
    # Shouldn't normally get here, as all words are expected to pass through G2P.
    return w


# Both files are managed by `with`, so the output is flushed and closed even if a line fails.
with open(INPUT_TEXT, "r", encoding="utf-8") as inp, \
        open(INPUT_TEXT_PHONEMES, "w", encoding="utf-8") as out:
    for line in inp:
        text = clean_russian_text_for_tts(line.strip())
        # re.sub rewrites only the matched words and leaves every other character in place,
        # including punctuation after the last word, which manual index bookkeeping could lose.
        # It also handles lines that contain no words at all.
        phonemized_text = tts_word_pattern.sub(phonemize_word, text)
        out.write(phonemized_text + "\n")

Look at the final TTS input

In [None]:
# Preview the phonemized text that the TTS cell below reads line by line.
!head {INPUT_TEXT_PHONEMES} 

Run TTS. The resulting wav files will be saved to the working directory and also displayed in the output cell.

In [None]:
# Sample rate (Hz) used both when writing the wav files and when playing the audio inline.
SAMPLE_RATE = 22050

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load FastPitch
spectrogram_generator = FastPitchModel.from_pretrained("bene-ges/tts_ru_ipa_fastpitch_ruslan").eval().to(device)
# Load vocoder
vocoder = HifiGanModel.from_pretrained(model_name="bene-ges/tts_ru_hifigan_ruslan").eval().to(device)

with open(INPUT_TEXT_PHONEMES, "r", encoding="utf-8") as inp:
    # The line index doubles as the output file name: 0.wav, 1.wav, ...
    for i, line in enumerate(inp):
        text = line.strip()
        # Inference only: disable autograd so that no computation graph is built and kept in memory.
        with torch.no_grad():
            parsed = spectrogram_generator.parse(text)
            spectrogram = spectrogram_generator.generate_spectrogram(tokens=parsed)
            audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)

        # Move the result to the CPU once and reuse it for both saving and playback.
        audio_batch = audio.detach().cpu().numpy()

        # Note that the vocoder returns a batch of audio. In this example, we just take the first and only sample.
        filename = str(i) + ".wav"
        sf.write(filename, audio_batch[0], SAMPLE_RATE)

        # display
        print(f'"{text}"\n')
        ipd.display(ipd.Audio(audio_batch, rate=SAMPLE_RATE))
