In [None]:
!pip install -q TTS langid unidic-lite unidic deepspeed gtts moviepy Pillow
!pip install -q numpy



In [None]:
import json

def parse_quiz_file(file_path, questions_per_quiz=5):
    """Parse a plain-text quiz file into the nested quiz structure.

    Blank lines are ignored. The remaining lines are read as alternating
    question / answer pairs; if the file ends on a question, that question
    gets an empty-string answer.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).
        questions_per_quiz: Maximum number of question/answer pairs per quiz.
            The last quiz may hold fewer.

    Returns:
        A dict of the form
        ``{"quizzes": [{"questions": [{"question": ..., "answer": ...}, ...]}, ...]}``.

    Raises:
        ValueError: If ``questions_per_quiz`` is smaller than 1.
    """
    if questions_per_quiz < 1:
        raise ValueError("questions_per_quiz must be at least 1")

    # Explicit encoding so non-ASCII questions parse the same on every platform.
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = [line.strip() for line in file if line.strip()]

    # Step by 2 so each index points at a question followed by its answer.
    pairs = [
        {
            "question": lines[i],
            "answer": lines[i + 1] if i + 1 < len(lines) else "",
        }
        for i in range(0, len(lines), 2)
    ]

    # Slice the pairs into consecutive batches; the final batch may be short.
    batches = [
        pairs[start:start + questions_per_quiz]
        for start in range(0, len(pairs), questions_per_quiz)
    ]
    return {"quizzes": [{"questions": batch} for batch in batches]}

# Input text file; parse_quiz_file reads its non-empty lines as alternating
# question / answer pairs.
file_path = 'quiz.txt'

# Build the {"quizzes": [...]} structure from the text file.
quiz_structure = parse_quiz_file(file_path)

# Pretty-print the structure as JSON (4-space indent).
print(json.dumps(quiz_structure, indent=4))


In [None]:
# prompt: get all files in /content/output folder and zip em!

import os
import zipfile

def zipdir(path, ziph):
    """Write every file found below ``path`` into the open archive ``ziph``.

    ``ziph`` is a zipfile handle. Each file is added under the joined
    directory/file path produced by walking ``path``; no explicit archive
    name is supplied.
    """
    file_paths = (
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(path)
        for name in names
    )
    for file_path in file_paths:
        ziph.write(file_path)

if __name__ == '__main__':
    # The context manager finalizes and closes the archive even when zipdir
    # raises part-way through, so no open handle is left behind.
    with zipfile.ZipFile('/content/output.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
        zipdir('/content/output', zipf)


In [None]:
import os
import re
import time
import torch
import torchaudio
from gtts import gTTS
from moviepy.editor import AudioFileClip, ImageClip, concatenate_videoclips, CompositeAudioClip
from PIL import Image, ImageDraw, ImageFont, ImageFilter
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.manage import ModelManager
import json
import datetime

# Download for mecab: runs the unidic download command in a shell.
# The command's exit status is not checked here.
os.system('python -m unidic download')

# By using XTTS you agree to CPML license https://coqui.ai/cpml
# Setting the variable before the model is downloaded below; presumably this
# skips an interactive licence prompt — TODO confirm against the TTS docs.
os.environ["COQUI_TOS_AGREED"] = "1"

# Initialize and download the XTTS model
def initialize_xtts_model():
    """Download the Coqui XTTS v2 model, load its checkpoint and move it to CUDA.

    The checkpoint files are read from a fixed directory under
    ``/root/.local/share/tts``. The checkpoint is loaded in eval mode with
    DeepSpeed enabled.

    Returns:
        The loaded ``Xtts`` model instance.
    """
    print("Downloading Coqui XTTS V2 if not already downloaded")
    ModelManager().download_model("tts_models/multilingual/multi-dataset/xtts_v2")
    model_dir = "/root/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2"
    print("XTTS downloaded")

    def in_model_dir(file_name):
        # Every file the loader needs lives directly in the model directory.
        return os.path.join(model_dir, file_name)

    xtts_config = XttsConfig()
    xtts_config.load_json(in_model_dir("config.json"))

    checkpoint_options = {
        "checkpoint_path": in_model_dir("model.pth"),
        "vocab_path": in_model_dir("vocab.json"),
        "speaker_file_path": in_model_dir("speakers_xtts.pth"),
        "eval": True,
        "use_deepspeed": True,
    }
    xtts = Xtts.init_from_config(xtts_config)
    xtts.load_checkpoint(xtts_config, **checkpoint_options)
    xtts.cuda()
    return xtts

# Load the model once at cell level; the latent computation and the main
# script further down in this cell both use this instance.
model = initialize_xtts_model()

# Get conditioning latents
def get_conditioning_latents(model, speaker_wav):
    """Ask ``model`` for the conditioning latents of a reference recording.

    Args:
        model: Object exposing ``get_conditioning_latents``.
        speaker_wav: Path of the reference audio, passed as ``audio_path``.

    Returns:
        Whatever the model returns, or ``(None, None)`` if the model raised
        (the error is printed rather than propagated).
    """
    latent_options = dict(gpt_cond_len=30, gpt_cond_chunk_len=4, max_ref_length=60)
    try:
        latents = model.get_conditioning_latents(audio_path=speaker_wav, **latent_options)
    except Exception as e:
        print(f"Speaker encoding error: {str(e)}")
        print("It appears something is wrong with the reference. Did you unmute your microphone?")
        latents = (None, None)
    return latents

# Reference recording handed to the model to compute the conditioning latents.
# get_conditioning_latents returns (None, None) when the model call fails, so
# both names below may be None.
speaker_wav = "/content/hermione.mp3"
gpt_cond_latent, speaker_embedding = get_conditioning_latents(model, speaker_wav)

# Load quiz from JSON
def load_quiz(quiz_file):
    """Read the UTF-8 encoded JSON file ``quiz_file`` and return the parsed data."""
    with open(quiz_file, 'r', encoding='utf-8') as quiz_handle:
        raw_json = quiz_handle.read()
    return json.loads(raw_json)

# Create blurred wrapped text image
def create_blurred_wrapped_text_image(font, text, background_image_path, width=1080, height=1920, text_color="black", blur_radius=15, box_opacity=150):
    """Render word-wrapped ``text`` over a blurred copy of a background image.

    The background is resized to ``width`` x ``height`` and Gaussian-blurred.
    The text is wrapped to ``width - 40`` pixels, and each line is centered
    horizontally on a semi-transparent white box. The block of lines is
    centered vertically using the summed line heights.

    Args:
        font: Font object passed to the drawing calls.
        text: Text to render; it is split on whitespace for wrapping.
        background_image_path: Path of the background image file.
        width: Output width in pixels.
        height: Output height in pixels.
        text_color: Fill color for the text.
        blur_radius: Radius of the Gaussian blur applied to the background.
        box_opacity: Alpha value (0-255) of the white box behind each line.

    Returns:
        The path of the PNG that was written. It is always
        ``"temp_blurred_text_image_fixed.png"``, so each call overwrites the
        previous image.
    """
    # Open inside a context manager so the source file handle is released as
    # soon as the resized copy exists.
    with Image.open(background_image_path) as source_image:
        background = source_image.resize((width, height))
    blurred_background = background.filter(ImageFilter.GaussianBlur(radius=blur_radius))
    overlay = Image.new('RGBA', (width, height), (255, 255, 255, 0))
    draw_overlay = ImageDraw.Draw(overlay)

    def wrap_text(draw, text, font, max_width):
        """Greedily pack words into lines no wider than ``max_width`` pixels."""
        lines = []
        current_line = ""

        for word in text.split():
            test_line = f"{current_line} {word}".strip()
            if draw.textbbox((0, 0), test_line, font=font)[2] <= max_width:
                current_line = test_line
            else:
                # A single word wider than max_width would otherwise flush an
                # empty line first; only flush when there is something to flush.
                if current_line:
                    lines.append(current_line)
                current_line = word

        if current_line:
            lines.append(current_line)

        return lines

    wrapped_lines = wrap_text(draw_overlay, text, font, width - 40)

    # Measure each line once: (line, text width, text height).
    measured = []
    for line in wrapped_lines:
        left, top, right, bottom = draw_overlay.textbbox((0, 0), line, font=font)
        measured.append((line, right - left, bottom - top))

    total_text_height = sum(line_height for _, _, line_height in measured)
    start_y = (height - total_text_height) // 2

    # Resolve the position of every line before drawing anything.
    placed = []
    for line, line_width, line_height in measured:
        line_x = (width - line_width) // 2
        placed.append((line, line_x, start_y, line_width, line_height))
        start_y += line_height

    # Draw all boxes first and all text afterwards. Lines advance by the text
    # height only, so drawing line by line lets the next line's box overwrite
    # the lowest pixel rows of the text above it.
    box_padding = 20
    for _, line_x, line_y, line_width, line_height in placed:
        box_coords = (
            line_x - box_padding,
            line_y,
            line_x + line_width + box_padding,
            line_y + line_height + box_padding,
        )
        draw_overlay.rectangle(box_coords, fill=(255, 255, 255, box_opacity))
    for line, line_x, line_y, _, _ in placed:
        draw_overlay.text((line_x, line_y), line, fill=text_color, font=font)

    combined = Image.alpha_composite(blurred_background.convert('RGBA'), overlay)
    temp_image = "temp_blurred_text_image_fixed.png"
    combined.save(temp_image)

    return temp_image

# Generate text-to-speech (TTS) audio
def generate_tts(text, language='en', filename="tts_audio.mp3"):
    """Synthesize ``text`` with gTTS, save it to ``filename`` and return it as an audio clip."""
    gTTS(text=text, lang=language).save(filename)
    narration = AudioFileClip(filename)
    return narration

def generate_xttsv2(text, model, gpt_cond_latent, speaker_embedding, language='en', filename="tts_audio.mp3"):
    """Synthesize ``text`` with the XTTS model and return it as an audio clip.

    Args:
        text: Text to speak.
        model: Object exposing ``inference``; its result is indexed with ``"wav"``.
        gpt_cond_latent: Conditioning latent forwarded to ``model.inference``.
        speaker_embedding: Speaker embedding forwarded to ``model.inference``.
        language: Language code forwarded to ``model.inference``.
        filename: Destination of the audio when it names a ``.wav`` file. Any
            other value (including the default) makes the function write to a
            fresh temporary ``.wav`` file instead.

    Returns:
        An ``AudioFileClip`` opened on the WAV file that was written.
    """
    import tempfile  # local import: only this function needs it

    sample_rate = 24000  # rate used for the RTF figure and for the saved file

    # Where a word character or non-ASCII character is directly followed by
    # ".", "。" or "?", insert a space and double the punctuation mark.
    # Raw string: the previous non-raw literal relied on invalid escape
    # sequences such as "\w" and "\.".
    prompt = re.sub(r"([^\x00-\x7F]|\w)(\.|。|\?)", r"\1 \2\2", text)
    print("I: Generating new audio...")
    t0 = time.time()
    out = model.inference(
        prompt,
        language,
        gpt_cond_latent,
        speaker_embedding,
        repetition_penalty=5.0,
        temperature=0.75,
    )
    inference_time = time.time() - t0
    print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")

    # Reuse the measured inference time (no second clock read) and avoid a
    # division by zero when the model returned no samples.
    num_samples = out['wav'].shape[-1]
    if num_samples:
        real_time_factor = inference_time / num_samples * sample_rate
    else:
        real_time_factor = float("inf")
    print(f"Real-time factor (RTF): {real_time_factor}")

    # Every call must write to its own file: the returned clip is backed by
    # the file on disk, so reusing one fixed path would replace the audio
    # under clips returned by earlier calls.
    if filename and filename.lower().endswith(".wav"):
        wav_path = filename
    else:
        handle, wav_path = tempfile.mkstemp(prefix="xtts_", suffix=".wav")
        os.close(handle)

    torchaudio.save(wav_path, torch.tensor(out["wav"]).unsqueeze(0), sample_rate)
    return AudioFileClip(wav_path)

# Generate video for quiz
def create_quiz_video(quiz, quiz_data, output_file, model, gpt_cond_latent, speaker_embedding):
    """Render one quiz as a narrated video with background music.

    For every item in ``quiz["questions"]`` a 5-second question slide and a
    2-second answer slide are created, each narrated via ``generate_xttsv2``.
    The slides are concatenated, mixed with the background music at 20%
    volume, capped at 35 seconds and written to ``output_file`` at 24 fps.

    Args:
        quiz: Dict with a ``"questions"`` list of ``{"question", "answer"}`` dicts.
        quiz_data: Dict providing ``"background_music"``, ``"font"`` and
            ``"background_image"`` paths.
        output_file: Path of the video file to write.
        model: XTTS model forwarded to ``generate_xttsv2``.
        gpt_cond_latent: Conditioning latent forwarded to ``generate_xttsv2``.
        speaker_embedding: Speaker embedding forwarded to ``generate_xttsv2``.
    """
    max_video_seconds = 35  # upper bound on the rendered video length
    clips = []
    narrations = []
    music_clip = AudioFileClip(quiz_data['background_music']).volumex(0.2)

    try:
        font = ImageFont.truetype(quiz_data['font'], 80)

        for item in quiz["questions"]:
            # (text, slide duration in seconds): question first, then answer.
            for text, seconds in ((item['question'], 5), (item['answer'], 2)):
                image_path = create_blurred_wrapped_text_image(font, text, quiz_data['background_image'])
                slide = ImageClip(image_path).set_duration(seconds)
                narration = generate_xttsv2(text, model, gpt_cond_latent, speaker_embedding)
                narrations.append(narration)
                clips.append(slide.set_audio(narration))

        final_clip = concatenate_videoclips(clips)
        # Never cut past the end of the video: a quiz with fewer than five
        # questions is shorter than the cap, and requesting frames beyond the
        # last slide fails while the file is being written.
        end_time = min(max_video_seconds, final_clip.duration)
        final_clip = final_clip.set_audio(CompositeAudioClip([final_clip.audio, music_clip]))
        final_clip = final_clip.subclip(0, end_time)
        final_clip.write_videofile(output_file, fps=24)
    finally:
        # Release the audio readers whether or not rendering succeeded.
        for audio_clip in narrations + [music_clip]:
            audio_clip.close()

    if os.path.exists("tts_audio.mp3"):
        os.remove("tts_audio.mp3")

# Main script
if __name__ == "__main__":
    # Besides "quizzes", the JSON must provide "theme" (used in the file name
    # below) and the "background_music", "font" and "background_image" entries
    # that create_quiz_video reads.
    quiz_data = load_quiz('/content/quiz.json')
    os.makedirs("/content/output", exist_ok=True)

    # One video per quiz; the file name carries the theme and a timestamp with
    # one-second resolution.
    for quiz in quiz_data['quizzes']:
        current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        output_filename = f"/content/output/quiz_{quiz_data['theme']}_{current_time}.mp4"
        create_quiz_video(quiz, quiz_data, output_filename, model, gpt_cond_latent, speaker_embedding)

Downloading Coqui XTTS V2 if not already downloaded
 > Downloading model to /root/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2
 > Model's license - CPML
 > Check https://coqui.ai/cpml.txt for more info.
XTTS downloaded


  self.speakers = torch.load(speaker_file_path)

  return torch.load(f, map_location=map_location, **kwargs)



[2024-10-08 14:31:38,343] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-10-08 14:31:40,824] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.15.1, git-hash=unknown, git-branch=unknown
[2024-10-08 14:31:40,836] [INFO] [logging.py:96:log_dist] [Rank -1] quantize_bits = 8 mlp_extra_grouping = False, quantize_groups = 1


Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...
Creating extension directory /root/.cache/torch_extensions/py310_cu121/transformer_inference...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/transformer_inference/build.ninja...
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].

Building extension module transformer_inference...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


Time to load transformer_inference op: 82.40205121040344 seconds
[2024-10-08 14:33:04,332] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed-Inference config: {'layer_id': 0, 'hidden_size': 1024, 'intermediate_size': 4096, 'heads': 16, 'num_hidden_layers': -1, 'dtype': torch.float32, 'pre_layer_norm': True, 'norm_type': <NormType.LayerNorm: 1>, 'local_rank': -1, 'stochastic_mode': False, 'epsilon': 1e-05, 'mp_size': 1, 'scale_attention': True, 'triangular_masking': True, 'local_attention': False, 'window_size': 1, 'rotary_dim': -1, 'rotate_half': False, 'rotate_every_two': True, 'return_tuple': True, 'mlp_after_attn': True, 'mlp_act_func_type': <ActivationFuncType.GELU: 1>, 'specialized_mode': False, 'training_mp_size': 1, 'bigscience_bloom': False, 'max_out_tokens': 1024, 'min_out_tokens': 1, 'scale_attn_by_inverse_layer_idx': False, 'enable_qkv_quantization': False, 'use_mup': False, 'return_single_tuple': False, 'set_empty_params': False, 'transposed_mode': False, 'use_triton': Fa

Loading extension module transformer_inference...


I: Generating new audio...


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


I: Time to generate audio: 1775 milliseconds
Real-time factor (RTF): 0.74974588445715
I: Generating new audio...
I: Time to generate audio: 452 milliseconds
Real-time factor (RTF): 0.42471975088119507
I: Generating new audio...
I: Time to generate audio: 863 milliseconds
Real-time factor (RTF): 0.36467047961982524
I: Generating new audio...
I: Time to generate audio: 284 milliseconds
Real-time factor (RTF): 0.3419899596617772
I: Generating new audio...
I: Time to generate audio: 474 milliseconds
Real-time factor (RTF): 0.23282124265950388
I: Generating new audio...
I: Time to generate audio: 387 milliseconds
Real-time factor (RTF): 0.23278336112315837
I: Generating new audio...
I: Time to generate audio: 644 milliseconds
Real-time factor (RTF): 0.21726841763626759
I: Generating new audio...
I: Time to generate audio: 227 milliseconds
Real-time factor (RTF): 0.2599436698890314
I: Generating new audio...
I: Time to generate audio: 524 milliseconds
Real-time factor (RTF): 0.21723450153274



MoviePy - Done.
Moviepy - Writing video /content/output/quiz_harry_potter_20241008_143308.mp4





Moviepy - Done !
Moviepy - video ready /content/output/quiz_harry_potter_20241008_143308.mp4


In [None]:
import glob
import os

import moviepy.editor

# Display the most recently written quiz video. The output names contain a
# timestamp, so a hard-coded file name stops working as soon as the notebook
# is run again.
rendered_videos = sorted(glob.glob("/content/output/quiz_*.mp4"), key=os.path.getmtime)
if not rendered_videos:
    raise FileNotFoundError("No quiz videos found in /content/output; run the generation cell first.")
moviepy.editor.ipython_display(rendered_videos[-1])