In [None]:
import os, re, json
from pathlib import Path
from PIL import Image, ImageDraw, ImageFont
import soundfile as sf
from moviepy import ImageClip, AudioFileClip, concatenate_videoclips
import textwrap

os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
from kokoro import KPipeline
import torch

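# One-time setup: uncomment and run once to download the NLTK tokenizer data used by sent_tokenize.
# import nltk
# nltk.download('punkt')
# nltk.download('punkt_tab')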
from nltk.tokenize import sent_tokenize

from huggingface_hub import login

# SECURITY: access tokens must never be committed to a notebook. The token is
# read from the HF_TOKEN environment variable instead; any token that was
# previously hard-coded here should be treated as leaked and revoked.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    print("HF_TOKEN is not set; skipping Hugging Face login.")

In [None]:
# --- Rendering configuration -------------------------------------------------
SCREEN_SIZE = (720, 1280)  # (width, height) in pixels of every generated frame
FONT_SIZE = 45             # base font size; the header is drawn at 1.2x this
MIN_FONT_SIZE = 24         # body text is not shrunk below this size

# Working directories. parents=True so that a missing "video-resource" folder
# is created instead of raising FileNotFoundError.
FRAME_DIR = Path("video-resource/frames")
FRAME_DIR.mkdir(parents=True, exist_ok=True)
AUDIO_DIR = Path("video-resource/audio")
AUDIO_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_VIDEO = Path("video-resource/")

# Font files looked up by style name in create_text_image.
FONT_DIR = Path("video-resource/fonts")
FONTS = {
    "bold": str(FONT_DIR / "BearSansUI-Bold.otf"),
    "italic": str(FONT_DIR / "BearSansUI-Italic.otf"),
    "regular": str(FONT_DIR / "BearSansUI-Regular.otf")
}

# Text-to-speech pipeline shared by every call to generate_audio_and_frame.
pipeline = KPipeline(lang_code="a", repo_id="hexgrad/Kokoro-82M")

# Global counter used to number the frame_XXXX.png / part_XXXX.wav pairs.
# Re-running this cell resets the numbering to 0.
frame_index = 0

In [None]:
def split_text(text):
    """Break a block of text into a list of trimmed, non-empty sentences.

    The text is first cut into paragraphs on runs of two or more newlines;
    each stripped paragraph is then handed to ``sent_tokenize``. Falsy input
    (``None`` or ``""``) yields an empty list.
    """
    if not text:
        return []

    trimmed = [
        sentence.strip()
        for paragraph in re.split(r"\n{2,}", text)
        for sentence in sent_tokenize(paragraph.strip())
    ]
    return [sentence for sentence in trimmed if sentence]

def wrap_text_by_pixels(draw, text, font, max_width):
    """Wrap ``text`` into lines whose rendered width does not exceed ``max_width``.

    Words are separated on whitespace and packed greedily onto each line,
    measuring candidates with ``draw.textlength``. A single word that is wider
    than ``max_width`` on its own is broken into character runs that fit, so
    it cannot overflow the margin (each run keeps at least one character).

    Args:
        draw: Object exposing ``textlength(text, font=...)``.
        text: The string to wrap.
        font: Font passed through to ``draw.textlength``.
        max_width: Maximum allowed line width, in the units ``textlength`` returns.

    Returns:
        A list of line strings; empty if ``text`` contains no words.
    """
    words = text.split()
    if not words:
        return []

    def fits(candidate):
        return draw.textlength(candidate, font=font) <= max_width

    def break_long_word(word):
        # Greedily grow a piece one character at a time; always keep at least
        # one character per piece so the loop terminates for any max_width.
        pieces = []
        piece = ""
        for char in word:
            if piece and not fits(piece + char):
                pieces.append(piece)
                piece = char
            else:
                piece += char
        if piece:
            pieces.append(piece)
        return pieces

    lines = []
    current_line = ""
    for word in words:
        candidate = f"{current_line} {word}" if current_line else word
        if fits(candidate):
            current_line = candidate
            continue

        # The word does not fit on the current line: flush what we have.
        if current_line:
            lines.append(current_line)
            current_line = ""

        if fits(word):
            current_line = word
        else:
            # Over-wide word: emit the full pieces and keep the remainder open
            # so following words can share its line.
            *full_pieces, current_line = break_long_word(word)
            lines.extend(full_pieces)

    if current_line:
        lines.append(current_line)
    return lines

def _load_font(style, size):
    """Return the ``FONTS[style]`` typeface at ``size``, or Pillow's default font if it cannot be read."""
    try:
        return ImageFont.truetype(FONTS[style], size)
    except IOError:
        return ImageFont.load_default()


def _draw_wrapped(draw, text, font, x, y, max_width, fill, line_spacing):
    """Wrap ``text`` to ``max_width``, draw it downwards from ``y`` and return the next free y."""
    for line in wrap_text_by_pixels(draw, text, font, max_width):
        draw.text((x, y), line, font=font, fill=fill)
        y += font.getbbox(line)[3] + line_spacing
    return y


def create_text_image(header=None, subtitle=None, body=None):
    """
    Creates an image with text. Dynamically adjusts body font size to fit.

    The frame is a white ``SCREEN_SIZE`` canvas with an 80px margin. The header
    (bold, black, 1.2x ``FONT_SIZE``) and subtitle (italic, gray) are stacked
    from the top; the body is vertically centred in the remaining space.

    The body font starts at ``FONT_SIZE`` and shrinks in 2pt steps while the
    wrapped text is taller than the available space, without going below
    ``MIN_FONT_SIZE``. If it still does not fit, a warning is printed and the
    body is drawn starting directly under the header/subtitle (never above it).

    Args:
        header: Optional header text.
        subtitle: Optional subtitle text.
        body: Optional body text.

    Returns:
        The rendered ``PIL.Image.Image``.
    """
    img = Image.new("RGB", SCREEN_SIZE, "white")
    draw = ImageDraw.Draw(img)
    padding = 80
    max_width = SCREEN_SIZE[0] - 2 * padding
    line_spacing = 10
    y_pos = padding

    # Draw Header
    if header:
        header_font = _load_font("bold", int(FONT_SIZE * 1.2))
        y_pos = _draw_wrapped(draw, header, header_font, padding, y_pos, max_width, "black", line_spacing)
        y_pos += line_spacing  # Extra space after header

    # Draw Subtitle
    if subtitle:
        subtitle_font = _load_font("italic", FONT_SIZE)
        y_pos = _draw_wrapped(draw, subtitle, subtitle_font, padding, y_pos, max_width, "gray", line_spacing)
        y_pos += line_spacing  # Extra space after subtitle

    # Draw Body with dynamic font sizing
    if body:
        body_top_y = y_pos
        available_height = SCREEN_SIZE[1] - body_top_y - padding
        current_font_size = FONT_SIZE

        # Always lay the text out at least once so the layout variables below
        # are defined even if FONT_SIZE is configured below MIN_FONT_SIZE.
        while True:
            body_font = _load_font("regular", current_font_size)
            body_lines = wrap_text_by_pixels(draw, body, body_font, max_width)
            total_text_height = sum(body_font.getbbox(line)[3] + line_spacing for line in body_lines)

            if total_text_height <= available_height:
                break
            if current_font_size - 2 < MIN_FONT_SIZE:
                print(f"Warning: Text may be clipped as it exceeds available space even at min font size {MIN_FONT_SIZE}pt.")
                break
            current_font_size -= 2

        # Centre vertically, but clamp the offset at 0: when the text overflows
        # the offset would be negative and the body would overlap the header.
        body_y = body_top_y + max(0, (available_height - total_text_height) / 2)
        for line in body_lines:
            draw.text((padding, body_y), line, font=body_font, fill="black")
            body_y += body_font.getbbox(line)[3] + line_spacing

    return img


def generate_audio_and_frame(text_to_speak, header=None, subtitle=None):
    """
    Generates an audio file and a corresponding text frame.
    Prevents header/subtitle text from being duplicated in the body.

    The text is run through the global ``pipeline``; every segment that comes
    back with audio contributes both its audio and its text. The frame shows
    all spoken segments joined together, so the image always matches the audio
    even when the pipeline splits the input into several segments.

    Files are written as ``part_XXXX.wav`` in ``AUDIO_DIR`` and
    ``frame_XXXX.png`` in ``FRAME_DIR``, numbered by the global ``frame_index``,
    which is incremented on success. If no audio is produced, a warning is
    printed and nothing is written.

    Args:
        text_to_speak: Text to synthesise and display.
        header: Optional header drawn on the frame.
        subtitle: Optional subtitle drawn on the frame.
    """
    global frame_index
    sample_rate = 24000  # rate written to the WAV header; assumed to match the pipeline output
    audio_path = AUDIO_DIR / f"part_{frame_index:04}.wav"

    audio_segments = []
    text_segments = []
    for gs, _, audio in pipeline(text_to_speak, voice="af_heart", speed=1.0):
        if audio is not None:
            audio_segments.append(audio)
            text_segments.append(gs)

    if not audio_segments:
        print(f"Warning: No audio generated for text: '{text_to_speak}'")
        return

    # Join every spoken segment (not just the last one) for display.
    text_for_image = " ".join(text_segments)

    # When the spoken text IS the header or subtitle, do not repeat it as body.
    body_text = text_for_image
    if text_for_image == header or text_for_image == subtitle:
        body_text = None

    image = create_text_image(header=header, subtitle=subtitle, body=body_text)

    image_path = FRAME_DIR / f"frame_{frame_index:04}.png"
    image.save(image_path)

    # squeeze() alone is equivalent to the former unsqueeze(0)...squeeze().
    combined_audio = torch.cat(audio_segments).squeeze()
    sf.write(str(audio_path), combined_audio.cpu().numpy(), sample_rate)

    frame_index += 1

def process_article(json_path):
    """
    Processes a JSON article, generating audio and frames for each part sequentially.

    The JSON object is read for the keys ``title``, ``subtitle`` and
    ``sections``; each section is read for ``title`` and ``content``. Missing
    or ``null`` values are treated as empty. Frames are produced in order:
    title sentences, subtitle sentences, then for every section its title
    (unless it repeats the article title, case-insensitively) followed by each
    sentence of its content.

    Content frames use the section title as header and fall back to the
    article title when the section has no title or merely repeats it.

    Args:
        json_path: Path to the article JSON file (read as UTF-8).
    """
    with open(json_path, "r", encoding="utf-8") as f:
        article = json.load(f)

    # `or` also covers keys that are present but null in the JSON.
    main_title = article.get("title") or ""
    subtitle = article.get("subtitle") or ""
    sections = article.get("sections") or []

    for chunk in split_text(main_title):
        generate_audio_and_frame(text_to_speak=chunk, header=chunk)

    for chunk in split_text(subtitle):
        generate_audio_and_frame(text_to_speak=chunk, header=main_title, subtitle=chunk)

    for section in sections:
        section_title = section.get("title") or ""
        has_own_title = bool(section_title) and section_title.lower() != main_title.lower()

        if has_own_title:
            for chunk in split_text(section_title):
                generate_audio_and_frame(text_to_speak=chunk, header=section_title)

        # Use section_title as header, fallback to main_title (also when the
        # section title is empty, which previously produced no header at all).
        header = section_title if has_own_title else main_title

        content = section.get("content") or []
        if isinstance(content, str):
            # A bare string would otherwise be iterated character by character.
            content = [content]

        for para in content:
            for chunk in split_text(para):
                generate_audio_and_frame(text_to_speak=chunk, header=header)


def render_video(title):
    """
    Renders the final video by combining all generated frames and audio clips.

    Every ``part_XXXX.wav`` in ``AUDIO_DIR`` (in sorted order) is paired with
    ``frame_XXXX.png`` in ``FRAME_DIR``. Pairs with a missing image or with
    audio shorter than 0.1s are skipped. Each remaining image is shown for the
    duration of its audio, the clips are concatenated, and the result is
    written to ``OUTPUT_VIDEO / title``.

    Audio file handles opened here are always closed, including when rendering
    fails part-way.

    Args:
        title: Output file name, including extension, relative to ``OUTPUT_VIDEO``.
    """
    audio_files = sorted(AUDIO_DIR.glob("part_*.wav"))

    if not audio_files:
        print("No audio files found. Cannot render video.")
        return

    clips = []
    opened_audio = []  # tracked separately so they can be closed in `finally`
    try:
        for audio_file in audio_files:
            idx = audio_file.stem.split("_")[1]
            image_file = FRAME_DIR / f"frame_{idx}.png"

            if not image_file.exists():
                print(f"Warning: Missing image file {image_file} for audio {audio_file}. Skipping clip.")
                continue

            try:
                duration = sf.info(str(audio_file)).duration
                if duration < 0.1:  # Skip silent/too short clips
                    print(f"Skipping very short clip: {audio_file}")
                    continue
            except Exception as e:
                print(f"Could not read duration for {audio_file}: {e}. Setting to 2 seconds.")
                duration = 2

            audio_clip = AudioFileClip(str(audio_file))
            opened_audio.append(audio_clip)
            img_clip = ImageClip(str(image_file), duration=duration)
            clips.append(img_clip.with_audio(audio_clip))

        if not clips:
            print("No valid clips were created. Aborting video rendering.")
            return

        final_video = concatenate_videoclips(clips, method="compose")
        final_video.write_videofile(str(OUTPUT_VIDEO / title), fps=24, audio_codec='aac', threads=4)
    finally:
        # Release the audio readers; they are otherwise left open for the
        # lifetime of the kernel.
        for audio_clip in opened_audio:
            audio_clip.close()

In [None]:
# Build the video for one article: generate frames/audio, then render.
json_file = "video-resource/A Drug-Trial Stock Sale.json"
if os.path.exists(json_file):
    process_article(json_file)
    render_video("A Drug-Trial Stock Sale.mp4")
else:
    # Previously a missing file made this cell a silent no-op.
    print(f"Article file not found: {json_file}")