### Test

In [None]:
import json
import librosa
import numpy as np
import pydub

from os import path

from transformers import VitsModel, AutoTokenizer, pipeline

# JSON database that is loaded in the generation cell; its top-level "images"
# entry maps an image id to that image's metadata (including captions).
OBJECTS_DB_FILE_PATH = "./metadata/json/objects.json"
# Root folder for generated MP3s; files are written to <OUTPUT_PATH>/<lang>/<id>.mp3.
OUTPUT_PATH = "../../mp3s/captions"

# Model identifiers handed to `transformers.pipeline`, one per caption language.
TTS_MODEL_EN = "facebook/mms-tts-eng"
TTS_MODEL_PT = "facebook/mms-tts-por"

### Init Models

In [None]:
import warnings

# Suppress every warning category so model loading does not flood the cell output.
warnings.filterwarnings("ignore")

# (language code, model identifier) pairs; order matches the original mapping.
_tts_models = (
  ("en", TTS_MODEL_EN),
  ("pt", TTS_MODEL_PT),
)

# Build one pipeline per language, each placed on the CUDA device.
# The resulting dict is keyed by language code and consumed by the generation cell.
ttss = {
  lang_code: pipeline(model=model_name, device="cuda")
  for lang_code, model_name in _tts_models
}

### Generate Audio Files

In [None]:
%%time

import os
import warnings

# Suppress every warning category so the long-running loop output stays readable.
warnings.filterwarnings("ignore")

CAPTION_MODEL = "gpt"    # key selecting which caption to read from data["captions"][lang]
TARGET_SR = 11025        # sample rate (Hz) the synthesized audio is resampled to
TARGET_BITRATE = "32k"   # bitrate string passed to the MP3 exporter

# Load the image database; only the "images" mapping (id -> metadata) is used.
with open(OBJECTS_DB_FILE_PATH, "r", encoding="utf8") as f:
  img_data = json.load(f)["images"]

# Synthesize one MP3 per (image, language) pair, skipping files that already
# exist so the cell can be re-run to resume an interrupted batch.
for img_id, data in img_data.items():
  for lang, tts in ttss.items():
    output_file_path = path.join(OUTPUT_PATH, lang, f"{img_id}.mp3")
    if path.isfile(output_file_path):
      continue

    # The per-language folder may not exist yet on a fresh checkout.
    os.makedirs(path.dirname(output_file_path), exist_ok=True)

    output = tts(data["captions"][lang][CAPTION_MODEL])

    samples = librosa.resample(output["audio"], orig_sr=output["sampling_rate"], target_sr=TARGET_SR)

    # Scale float samples to 16-bit PCM. Clip first: a sample at or above +1.0
    # (resampling can overshoot) would otherwise overflow the int16 cast and
    # produce an audible click.
    pcm = np.clip(samples * 2 ** 15, -2 ** 15, 2 ** 15 - 1).astype(np.int16)
    mp3 = pydub.AudioSegment(pcm.tobytes(), frame_rate=TARGET_SR, sample_width=2, channels=1)

    # Export to a temporary name and rename on success, so a failed or
    # interrupted export never leaves a partial file that the isfile() check
    # above would treat as finished on the next run.
    tmp_file_path = output_file_path + ".part"
    mp3.export(tmp_file_path, format="mp3", bitrate=TARGET_BITRATE).close()
    os.replace(tmp_file_path, output_file_path)