In [1]:
import json
import math
import io
import os
import random
import subprocess
import wave
import soundfile as sf

import requests
from tqdm import tqdm

from IPython.display import Audio

In [2]:
# Constants
# Engine name -> port of that engine's HTTP API (used to build the request URLs).
port_map = dict(voicebox="50021", sharevox="50025")

# Names that are not in this list are likely prohibited from being used for
# machine learning, so they must not be run. Speakers that may be used:
# NOTE(review): none of the visible cells actually checks this list — confirm
# whether it is meant to be enforced somewhere.
allow_for_ml = [
    # --- VOICEVOX speakers ---
    # Announcement by the maintainer: https://twitter.com/hiho_karuta/status/1553026175098028038
    "春日部つむぎ",  # terms: https://tsumugi-official.studio.site/rule
    "波音リツ",  # terms: https://www.canon-voice.com/terms/
    "冥鳴ひまり",  # terms: https://meimeihimari.wixsite.com/himari/terms-of-use
    "剣崎雌雄",  # terms: https://frontier.creatia.cc/fanclubs/413/posts/4507#fromHistory
    "櫻歌ミコ",  # terms: https://voicevox35miko.studio.site/rule
    # terms: https://316soramegu.wixsite.com/sayo-official/guideline
    # FAQ:   https://316soramegu.wixsite.com/sayo-official/question
    "小夜/SAYO",
    # --- SHAREVOX speakers (terms: https://www.sharevox.app/characters) ---
    "小春音アミ",
    "つくよみちゃん",
    "白痴ー",
    "Yくん/開発者",
]

In [3]:
# Fetch the list of speaker names from every engine in port_map.
speakers_info = {}  # speaker name -> info dict from /speakers, plus "engine" and "default_style"
id_map = {}  # (engine, style id) -> (speaker name, style name)
id_map_r = {}  # (speaker name, style name) -> (engine, style id)
host = "127.0.0.1" # localhost

for engine, port in port_map.items():
    try:
        res = requests.get("http://{}:{}/speakers".format(host, port), timeout=10)
    except requests.RequestException as exc:
        # An engine that is not running raises here instead of returning a
        # non-OK response; treat it the same way (skip it) rather than
        # aborting the whole cell.
        print("skipping engine {!r}: {}".format(engine, exc))
        continue
    if not res.ok:
        print("skipping engine {!r}: HTTP {}".format(engine, res.status_code))
        continue
    # res.json() decodes the body as JSON directly instead of relying on the
    # charset guessed for res.text (the payload contains Japanese names).
    for info in res.json():
        info["engine"] = engine
        style_names = [style["name"] for style in info["styles"]]
        if "ノーマル" in style_names:
            info["default_style"] = "ノーマル"
        else:
            # No "ノーマル" style: fall back to the style with the smallest id.
            info["default_style"] = min(info["styles"], key=lambda x: x["id"])["name"]
        speakers_info[info["name"]] = info

        for style in info["styles"]:
            id_map[(info["engine"], style["id"])] = (info["name"], style["name"])
            id_map_r[(info["name"], style["name"])] = (info["engine"], style["id"])

In [4]:
# RVC server settings: host and port used to build the conversion-server URLs.
host, rvc_port = "127.0.0.1", "5001"

In [None]:
# Model and index file names to register with the RVC server. They are left
# empty here and are sent exactly as written.
# NOTE(review): presumably these are file paths that the server resolves on
# its own side — confirm against the server implementation.
rvc_model_file = r""
faiss_index_file = r""

file_names = {"rvc_model_file": rvc_model_file, "faiss_index_file": faiss_index_file}
res = requests.post(f"http://{host}:{rvc_port}/upload_model", json=file_names)
# Fail loudly if the server rejected the request instead of silently carrying on.
res.raise_for_status()

In [5]:
from scipy.io.wavfile import write
def speech_vox(speaker_name, script):
    """Synthesize ``script`` with the default style of ``speaker_name``.

    The speaker's default style is looked up in ``speakers_info`` and resolved
    to an ``(engine, style id)`` pair through ``id_map_r``. The engine's
    ``/audio_query`` endpoint is called first, and its JSON result is then
    posted to ``/synthesis``.

    Args:
        speaker_name: A key of ``speakers_info``.
        script: The text to be read aloud.

    Returns:
        The ``(audio, sr)`` pair produced by ``soundfile.read`` on the
        synthesis response body.

    Raises:
        KeyError: If ``speaker_name`` is not present in ``speakers_info``.
        requests.HTTPError: If either engine request returns an error status.
    """
    if speaker_name not in speakers_info:
        raise KeyError(
            "unknown speaker {!r}; known speakers: {}".format(
                speaker_name, sorted(speakers_info)
            )
        )

    style = speakers_info[speaker_name]["default_style"]
    engine, speaker = id_map_r[(speaker_name, style)]
    base_url = "http://{}:{}".format(host, port_map[engine])
    params = (
        ("text", script),
        ("speaker", speaker)
    )

    query = requests.post(base_url + "/audio_query", params=params)
    # Stop here on an engine error; otherwise the error payload would be
    # forwarded to /synthesis and only fail later while decoding the audio.
    query.raise_for_status()

    # json= serializes the query and sets the JSON Content-Type header.
    vv_res = requests.post(
        base_url + "/synthesis",
        params=params,
        json=query.json()
    )
    vv_res.raise_for_status()

    audio, sr = sf.read(io.BytesIO(vv_res.content))
    return audio, sr

def convert_voras(speaker_id, audio, sr):
    """Send ``audio`` to the conversion server and return the converted audio.

    The audio is written as a WAV file into an in-memory buffer and uploaded
    as the ``input_wav`` part of a multipart request to ``/convert_sound``.
    The ``params`` part carries a JSON object with ``speaker_id`` as a string.

    Args:
        speaker_id: Target speaker id; sent to the server as ``str(speaker_id)``.
        audio: Sample data passed to ``scipy.io.wavfile.write``.
        sr: Sample rate of ``audio``.

    Returns:
        The ``(audio, sr)`` pair produced by ``soundfile.read`` on the
        response body.

    Raises:
        requests.HTTPError: If the server returns an error status.
    """
    params = {
        "speaker_id": str(speaker_id)
    }

    audio_buffer = io.BytesIO()
    write(audio_buffer, rate=sr, data=audio)
    # Rewind explicitly so the upload starts at the first byte no matter where
    # the writer left the stream position.
    audio_buffer.seek(0)
    json_buffer = io.BytesIO(json.dumps(params).encode('utf-8'))
    files = {"input_wav": audio_buffer,
             "params": json_buffer}

    res = requests.post(f"http://{host}:{rvc_port}/convert_sound", files=files)
    # Surface server-side failures instead of trying to decode an error body as audio.
    res.raise_for_status()

    a, s = sf.read(io.BytesIO(res.content))
    return a, s

In [5]:
# Opening narration of the demo (kept verbatim; the leading and trailing
# newlines are part of the text that is sent to the engine).
script = """
こんにちは、ナダレです。今日はのどの調子が悪いので、ボイスボックス、シェアボックスの音声を用いた動画です。
ボーラスの開発をいったん中止するので、どこまでできたかについてデモを行います。
はじめに、ボイスボックスの冥鳴ひまりさんの音声で読み上げた音声を変換します
"""
# Synthesize the narration with the speaker's default style.
audio, sr = speech_vox("冥鳴ひまり", script)
# Last expression of the cell: renders an audio player that starts playing automatically.
Audio(audio, rate=sr, autoplay=True)

In [6]:
# Short test sentence used as the source utterance for conversion.
script = "ボーラスを呼び出すAPIのテスト音声です"
# Ids are listed in descending order because the conversion cell takes them
# with pop(), i.e. from the end of the list, so id 0 comes first.
speaker_ids = [4, 3, 2, 1, 0]
# Display names; the conversion cell indexes this list by the popped speaker id.
speaker_names = ["あみたろ", "つくよみちゃん", "刻鳴時雨", "黄琴まひろ", "黄琴海月"]

# Synthesize the source utterance; `audio` and `sr` are reused by the conversion cell.
audio, sr = speech_vox("冥鳴ひまり", script)
Audio(audio, rate=sr, autoplay=True)

In [7]:
# Take the next speaker id off the end of speaker_ids. This mutates the list,
# so every re-run of this cell moves on to another id, and pop() raises
# IndexError once the list is empty.
speaker_id = speaker_ids.pop()
print(speaker_names[speaker_id])
# Send the most recently synthesized `audio` to the conversion server with
# that speaker id, then play the returned audio.
a, s = convert_voras(speaker_id, audio, sr)
Audio(a, rate=s, autoplay=True)

あみたろ


In [8]:
# Narration introducing the second part of the demo (kept verbatim; the
# leading and trailing newlines are part of the text sent to the engine).
script = """
次に、シェアボックスの白痴ーさんの音声で読み上げた音声を変換します
"""
# Synthesize the narration with the speaker's default style.
audio, sr = speech_vox("白痴ー", script)
# Last expression of the cell: renders an audio player that starts playing automatically.
Audio(audio, rate=sr, autoplay=True)

In [9]:
# Short test sentence used as the source utterance for conversion.
script = "ボーラスを呼び出すAPIのテスト音声です"
# Reset the id list (an earlier cell consumed entries with pop()). Ids are in
# descending order so that pop() yields id 0 first.
speaker_ids = [4, 3, 2, 1, 0]
# Display names; the conversion cell indexes this list by the popped speaker id.
speaker_names = ["あみたろ", "つくよみちゃん", "刻鳴時雨", "黄琴まひろ", "黄琴海月"]

# Synthesize the source utterance; `audio` and `sr` are reused by the conversion cell.
audio, sr = speech_vox("白痴ー", script)
Audio(audio, rate=sr, autoplay=True)

In [10]:
# Take the next speaker id off the end of speaker_ids. This mutates the list,
# so every re-run of this cell moves on to another id, and pop() raises
# IndexError once the list is empty.
speaker_id = speaker_ids.pop()
print(speaker_names[speaker_id])
# Send the most recently synthesized `audio` to the conversion server with
# that speaker id, then play the returned audio.
a, s = convert_voras(speaker_id, audio, sr)
Audio(a, rate=s, autoplay=True)

あみたろ
