## Newsletter Text-To-Speech - Kokoro

### Import Modules

In [1]:
import soundfile as sf
import tempfile
import os
import numpy as np
from pathlib import Path
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
from kokoro import KPipeline
import torch
import json

### Helper Functions

In [2]:
def combine_audio_files(file_list, output_path, sample_rate=24000):
    """
    Combines a list of WAV files into a single output file.

    Files are concatenated in the order given. A file whose sample rate
    differs from ``sample_rate`` is skipped (with a warning) rather than
    resampled, so the output always has one consistent rate.

    Args:
        file_list (list[str]): Paths of the audio files to join, in order.
        output_path (str): Path of the combined file to write.
        sample_rate (int): Required rate of every input and the rate the
            output is written with. Defaults to 24000.

    Raises:
        ValueError: If no usable audio remains (empty ``file_list`` or every
            file skipped), or if the arrays cannot be concatenated.
    """
    # Local import keeps this function self-contained; `warnings` is stdlib.
    import warnings

    combined_data = []
    for file_path in file_list:
        data, file_rate = sf.read(file_path)
        if file_rate != sample_rate:
            # Still best-effort (skip, don't crash), but no longer silent:
            # a dropped part means missing speech in the final audio.
            warnings.warn(
                f"Skipping {file_path}: sample rate {file_rate} != {sample_rate}",
                stacklevel=2,
            )
            continue
        combined_data.append(data)

    if not combined_data:
        # np.concatenate([]) would raise a ValueError that does not say
        # which input was at fault; fail early with a clear message instead.
        raise ValueError(
            f"No audio to combine: none of the {len(file_list)} input file(s) "
            f"had a sample rate of {sample_rate}"
        )

    final_audio = np.concatenate(combined_data)
    sf.write(output_path, final_audio, sample_rate)

def _collect_article_text(article):
    """Return the article's speakable text as one string ("" if there is none).

    Blocks are emitted in reading order: title, subtitle, then for each
    section its title followed by its paragraphs. Blocks are joined with a
    blank line between them.
    """
    # `or ""` also covers keys that are present but null in the JSON.
    article_title = article.get("title") or ""
    blocks = [article_title, article.get("subtitle") or ""]

    for section in article.get("sections") or []:
        section_title = section.get("title") or ""
        # Skip a section heading that merely repeats the article title.
        if section_title and section_title.lower() != article_title.lower():
            blocks.append(section_title)

        for para in section.get("content") or []:
            if not isinstance(para, str):
                # Null / non-text entries cannot be spoken; ignore them.
                continue
            if para.startswith("<start quote>"):
                # Turn the quote markers into words the voice reads aloud.
                para = (
                    para.replace("<start quote>", "Start quote.")
                    .replace("<end quote>", "End quote.")
                    .strip()
                )
            blocks.append(para)

    return "\n\n".join(block for block in blocks if block)


def generate_speech(json_path, output_filename="final_audio.wav", voice="af_heart",
                    speed=1.0, pipeline=None):
    """
    Processes a JSON article, letting Kokoro handle text chunking internally.

    The article's title, subtitle, section titles and paragraphs are merged
    into one text, synthesized chunk by chunk, and the chunks are joined into
    a single WAV file via ``combine_audio_files``.

    Args:
        json_path (str): Path to the input JSON file.
        output_filename (str): Desired name for the final .wav file.
        voice (str): The Kokoro voice to use.
        speed (float): Speed value passed to the Kokoro pipeline call.
        pipeline: Optional already-constructed ``KPipeline`` to reuse, so a
            batch of articles does not rebuild it per call. When None, a new
            one is created.

    Returns:
        str | None: ``output_filename`` on success, or None when the article
        has no text or the pipeline produced no audio.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        article = json.load(f)

    full_text_to_speak = _collect_article_text(article)
    if not full_text_to_speak:
        return None

    # Build the pipeline only once we know there is something to speak, so
    # an empty article does not pay the cost of constructing it.
    if pipeline is None:
        pipeline = KPipeline(lang_code="a", repo_id="hexgrad/Kokoro-82M")

    with tempfile.TemporaryDirectory() as temp_dir:
        temp_dir_path = Path(temp_dir)
        audio_part_paths = []

        generator = pipeline(full_text_to_speak, voice=voice, speed=speed)

        for i, (_graphemes, _phonemes, audio_tensor) in enumerate(generator):
            # Explicit None/len check: truthiness of a tensor is ambiguous.
            if audio_tensor is None or len(audio_tensor) == 0:
                continue

            part_path = temp_dir_path / f"part_{i:04d}.wav"
            # detach().cpu() makes the conversion safe even if the tensor is
            # attached to a graph or lives on an accelerator device.
            sf.write(str(part_path), audio_tensor.detach().cpu().numpy(), 24000)
            audio_part_paths.append(str(part_path))

        if not audio_part_paths:
            return None

        # Must run inside the `with`: the part files vanish when it exits.
        combine_audio_files(audio_part_paths, output_filename)
        return output_filename

### Test Run

In [4]:
text_file = "processed-json/Money Stuff - A Drug-Trial Stock Sale.json"
# Derive the output name from the input file so the two cannot drift apart
# (a hand-typed name is easy to truncate); the .wav is written to the
# current working directory.
output_audio_file = Path(text_file).with_suffix(".wav").name

# result_path is the output filename on success, or None if no audio was produced.
result_path = generate_speech(text_file, output_filename=output_audio_file)

### Convert All Processed `.json` Articles to Audio

In [None]:
directory_path = Path('processed-json/')

if directory_path.is_dir():
    # Only *.json files are articles; sorted() gives a stable processing order.
    for entry in sorted(directory_path.glob("*.json")):
        if not entry.is_file():
            continue
        # generate_speech expects the *path* of the JSON file (it opens and
        # parses the file itself) and names its output via output_filename.
        final_audio_path = generate_speech(
            str(entry), output_filename=f"{entry.stem}.wav"
        )