# Modern Kinyarwanda TTS Inference

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/osinkolu/RW-DEEPSPEECH-API/blob/main/tts/Digital_Umuganda_TTS_Inference_Tutorial.ipynb)

This notebook demonstrates how to run the Kinyarwanda TTS model using modern Python (3.10+) and Coqui-TTS (v0.22+).

Install the system and Python dependencies, then clone the repository (if it is not already present) and download the model weights with Git LFS.

In [5]:
# 1. Install System Dependencies
!sudo apt-get install -y espeak-ng

# 2. Install Python Libraries
# We use the latest version of Coqui TTS
!pip install coqui-tts scipy numpy torch --quiet

# 3. Clone the Repository (if not already present)
import os
repo_name = "RW-DEEPSPEECH-API"

if not os.path.exists(repo_name):
    print(f"Cloning {repo_name}...")
    !git clone https://github.com/agent87/RW-DEEPSPEECH-API.git
else:
    print(f"{repo_name} already exists. Skipping clone.")

# 4. Enter the directory and pull LFS files
# CRITICAL: We use %cd so the directory change sticks for the git command
%cd {repo_name}
print("Downloading model weights via Git LFS...")
!git lfs pull
# Go back to root to keep paths simple for the python script
%cd ..

print("\nSetup complete.")

In [3]:
import os
import json
import torch
import numpy as np
from scipy.io.wavfile import write
from IPython.display import Audio

from TTS.tts.models.vits import Vits
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer

# --- 1. SETUP PATHS ---
# All paths are relative to the notebook's working directory, so the cloned
# repository must sit next to the notebook.
repo_root = "RW-DEEPSPEECH-API"
tts_folder = os.path.join(repo_root, "tts")

# Verify paths exist before running
if not os.path.exists(tts_folder):
    raise FileNotFoundError(f"Could not find TTS folder at {tts_folder}. Did the clone fail?")

model_path = os.path.join(tts_folder, "model.pth")
config_path = os.path.join(tts_folder, "config.json")
encoder_path = os.path.join(tts_folder, "SE_checkpoint.pth.tar")
encoder_config_path = os.path.join(tts_folder, "config_se.json")
reference_audio = os.path.join(tts_folder, "conditioning_audio.wav")
output_path = "output.wav"

# Every file below is read later in this notebook. Checking them all here
# turns an obscure loader error into a message that names what is missing.
missing_files = [
    path
    for path in (model_path, config_path, encoder_path, encoder_config_path, reference_audio)
    if not os.path.isfile(path)
]
if missing_files:
    raise FileNotFoundError(
        "Missing required TTS file(s): " + ", ".join(missing_files)
        + ". Did `git lfs pull` complete?"
    )

# --- 2. LOAD & PATCH CONFIG ---
print(">> Loading and patching configuration...")
conf = VitsConfig()
conf.load_json(config_path)

# PATCH: force the output sample rate on the top-level config and on its
# audio section.
for _section in (conf, conf.audio):
    _section.output_sample_rate = 22050

conf.phoneme_language = "en"

# PATCH: force-enable d-vector speaker conditioning. Each switch is set on the
# top-level config and mirrored on model_args.
_speaker_switches = {
    "use_speaker_embedding": True,
    "use_d_vector_file": True,
    "d_vector_dim": 512,
}
for _field, _value in _speaker_switches.items():
    setattr(conf, _field, _value)
    setattr(conf.model_args, _field, _value)

# Nullify broken paths from original author's machine
_stale_path_fields = ("speakers_file", "d_vector_file")
for _field in _stale_path_fields:
    setattr(conf, _field, None)
if conf.model_args:
    for _field in _stale_path_fields:
        setattr(conf.model_args, _field, None)

# --- 3. LOAD MODEL & TOKENIZER ---
print(">> Loading Model components...")

# init_from_config may hand back either the tokenizer itself or a tuple whose
# first item is the tokenizer; normalise to the tokenizer object.
tokenizer_output = TTSTokenizer.init_from_config(conf)
if isinstance(tokenizer_output, tuple):
    tokenizer = tokenizer_output[0]
else:
    tokenizer = tokenizer_output

# The model is built without a tokenizer or speaker manager; both are kept as
# separate objects and used directly by the inference code.
model = Vits(config=conf, ap=None, tokenizer=None, speaker_manager=None)

# Load weights
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(f"   Using device: {device}")

if not use_cuda:
    # CPU: read the checkpoint manually so its tensors are mapped to the CPU.
    cp = torch.load(model_path, map_location=device)
    model.load_state_dict(cp['model'])
    model.eval()
else:
    # GPU: let the model load its own checkpoint, then move it to the GPU.
    model.load_checkpoint(config=conf, checkpoint_path=model_path, eval=True)
    model.cuda()

# Speaker encoder used to turn a reference clip into a speaker embedding.
speaker_manager = SpeakerManager(
    encoder_model_path=encoder_path,
    encoder_config_path=encoder_config_path,
    use_cuda=use_cuda,
)

# --- 4. APPLY RUNTIME FIX (Monkey Patch) ---
# Override the internal function that keeps dropping the embedding
def fixed_set_cond_input(aux_input):
    """Build the conditioning tuple with only the d-vector slot filled in.

    Args:
        aux_input: Mapping that must contain a ``"d_vector"`` entry.

    Returns:
        A 4-tuple whose second element is ``aux_input["d_vector"]`` and whose
        remaining three elements are ``None``.

    Raises:
        KeyError: If ``aux_input`` has no ``"d_vector"`` key.
    """
    d_vector = aux_input["d_vector"]
    conditioning = (None, d_vector, None, None)
    return conditioning

# Shadow the method on this one model instance only. A plain function stored
# as an instance attribute is not bound, so it is called with `aux_input`
# alone (no `self`), which matches fixed_set_cond_input's signature.
model._set_cond_input = fixed_set_cond_input
print("   Applied runtime patch for speaker embeddings.")

# --- 5. INFERENCE FUNCTION ---
def text_to_speech(text, output_file="output.wav", sample_rate=22050, reference_wav=None):
    """Synthesize ``text`` with the loaded model and write it to a WAV file.

    Relies on the notebook-level ``tokenizer``, ``model``, ``speaker_manager``,
    ``device`` and ``reference_audio`` objects created earlier in this cell.

    Args:
        text: Sentence to synthesize. Must be a non-empty string.
        output_file: Path of the WAV file to write.
        sample_rate: Sample rate written to the WAV header. Defaults to 22050,
            the value this notebook patches into the model config.
        reference_wav: Audio clip whose speaker embedding conditions the
            voice. Defaults to the notebook-level ``reference_audio``.

    Returns:
        ``output_file``, so the result can be passed straight to ``Audio``.

    Raises:
        ValueError: If ``text`` is not a string or contains only whitespace.
    """
    # Reject empty input here; otherwise an empty token sequence reaches the
    # model and fails with a far less helpful error.
    if not isinstance(text, str) or not text.strip():
        raise ValueError("text must be a non-empty string")

    if reference_wav is None:
        reference_wav = reference_audio

    print(f"\nGenerating audio for: '{text}'")

    # Tokenize; unsqueeze(0) adds a leading axis (presumably the batch axis).
    token_ids = tokenizer.text_to_ids(text)
    x = torch.LongTensor(token_ids).unsqueeze(0).to(device)

    # Pure inference: no autograd graph is needed for either model.
    with torch.no_grad():
        # Get Embedding
        d_vectors = speaker_manager.compute_embedding_from_clip([reference_wav])
        # Add a leading and a trailing singleton axis around the embedding.
        d_vector_tensor = (
            torch.tensor(d_vectors, dtype=torch.float32)
            .unsqueeze(0)
            .unsqueeze(-1)
            .to(device)
        )

        # Generate. The key is "d_vector" because that is what the patched
        # conditioning hook installed on the model reads.
        outputs = model.inference(x, aux_input={"d_vector": d_vector_tensor})

    # Save
    waveform = outputs["model_outputs"].squeeze().cpu().detach().numpy()
    write(output_file, sample_rate, waveform)
    print(f"Saved to {output_file}")

    return output_file

# Reached only if every step above ran without raising; no audio has been
# generated yet, text_to_speech has only been defined.
print(">> Setup complete. Ready to generate.")

>> Loading and patching configuration...
>> Loading Model components...


Failed to deserialize field: max_text_len (<class 'int'>) = Infinity
Value `Infinity` does not match field type `<class 'int'>`
Replaced it with field's default value: inf
  self.deserialize(dump_dict)


   Using device: cpu
   Applied runtime patch for speaker embeddings.
>> Setup complete. Ready to generate.


In [4]:
# Generate Audio
# Requires the previous cell to have run: it defines text_to_speech and
# output_path.
text = "Muraho, nishimiye gukoresha iri koranabuhanga."
output_file = text_to_speech(text, output_path)

# Play Audio
# Kept as the last expression of the cell so the notebook renders the player.
Audio(output_file)


Generating audio for: 'Muraho, nishimiye gukoresha iri koranabuhanga.'
Saved to output.wav
