In [None]:
import gzip
import json

def read_jsonl_gz(file_path):
    """Lazily yield one parsed JSON value per non-empty line of a gzipped JSONL file.

    Args:
        file_path: Path of the ``.jsonl.gz`` file to read.

    Yields:
        The object decoded by ``json.loads`` for each non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened or is not valid gzip data.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8: without it the text layer falls back to the
    # locale encoding, which corrupts or rejects non-ASCII text on some systems.
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing inside json.loads.
            if not line:
                continue
            yield json.loads(line)

file_path = './data/manifests/genshin_supervisions_dev.jsonl.gz'

# Read and parse the JSON records line by line, materialising them as a list.
json_data = list(read_jsonl_gz(file_path))

# Preview only the first few records; echoing the entire manifest would flood
# the cell output and bloat the saved notebook.
json_data[:5]


In [1]:
import os
from bin.myinfer import InferModel

# Restrict CUDA to the device with index 2. This must stay ahead of the model
# construction below so the setting is in place before the model is built.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
# Build the inference model from the saved checkpoint file.
# NOTE(review): presumably this loads the weights onto the visible GPU — confirm in bin.myinfer.
model = InferModel(checkpoint="exp/valle1/best-valid-loss.pt")

In [38]:
import os
import json
import shutil
from typing import List
from collections import OrderedDict
from os import PathLike
from pathlib import Path

import sys

# Drop any cached copy of bin.utils so the star-import below executes the
# module again and picks up edits made since this cell last ran.
sys.modules.pop('bin.utils', None)

from bin.utils import *


class GenShinDataWrapper:
    """Helper around the downloaded Genshin voice metadata for building inference lists.

    Attributes (all set in ``__init__``):
        dir: Dataset directory as a ``Path``.
        voice: Mapping loaded from ``result_chs.json``; entries are read through
            their ``"text"``, ``"fileName"`` and ``"npcName"`` keys.
        voice_test: Mapping loaded from ``result_chs_test.json``; entries are
            read through their ``"text"`` key (and ``"fileName"`` when used as
            a prompt).
        grouped_dict: ``OrderedDict`` from NPC name to the list of voice ids
            that carry that name, in file order.
        text_processor: ``TextProcessor`` instance used to pre-process target texts.
    """

    def __init__(
        self,
        dir: PathLike = Path("download/genshin"),
    ) -> None:
        """Load both metadata files and index the voice ids by NPC name.

        Args:
            dir: Directory that contains ``result_chs.json`` and
                ``result_chs_test.json``. Strings and path objects are accepted.

        Raises:
            OSError: If either metadata file cannot be opened.
            json.JSONDecodeError: If either file is not valid JSON.
        """
        # Normalise to Path so a plain string works with the "/" operator too.
        self.dir = Path(dir)
        self.voice = self._load_json(self.dir / "result_chs.json")
        self.voice_test = self._load_json(self.dir / "result_chs_test.json")

        # Group the voice ids by value['npcName'].
        grouped_dict = OrderedDict()
        for key, value in self.voice.items():
            grouped_dict.setdefault(value["npcName"], []).append(key)
        self.grouped_dict = grouped_dict
        self.text_processor = TextProcessor()

    @staticmethod
    def _load_json(path):
        """Parse a UTF-8 JSON file, closing the handle deterministically."""
        with open(path, encoding="utf-8") as fp:
            return json.load(fp)

    def _lookup_voice(self, voice_id: str):
        """Return the metadata entry for ``voice_id`` from the main set, else the test set."""
        if voice_id in self.voice:
            return self.voice[voice_id]
        # Ids produced by get_test_ids come from voice_test, so fall back to it.
        # An id present in neither mapping still raises KeyError.
        return self.voice_test[voice_id]

    @staticmethod
    def _ids_within_length(keys, entries, min_len: int, max_len: int) -> List[str]:
        """Return the keys whose entry text length lies in ``[min_len, max_len]``."""
        return [key for key in keys if min_len <= len(entries[key]["text"]) <= max_len]

    def create_file(
        self,
        file_name: PathLike,
        ids: List[str],
        texts: List[str],
        infer_dir: PathLike,
        copy: bool = False,
    ):
        """Write a tab-separated inference list and echo it to stdout.

        One row is written per entry of ``ids`` with four columns: the prompt
        text, the prompt audio path, the pre-processed target text and the
        output wav path. Target texts are taken from ``texts`` cyclically.
        When an id occurs more than once, the later output names get the
        suffixes ``infer1``, ``infer2``, ... so they do not collide.

        Args:
            file_name: Path of the list file to write.
            ids: Voice ids to use as prompts (looked up in ``voice``, then
                ``voice_test``).
            texts: Target texts to synthesise; reused in order when shorter
                than ``ids``.
            infer_dir: Directory for the output audio; created if missing.
            copy: When true, also copy the list file to ``../../../<file_name>``.

        Returns:
            ``file_name`` unchanged.

        Raises:
            ValueError: If ``ids`` is non-empty while ``texts`` is empty.
            KeyError: If an id is in neither metadata mapping.
        """
        if ids and not texts:
            # Without this guard the cyclic index below fails with an opaque
            # ZeroDivisionError.
            raise ValueError("texts must not be empty when ids are given")

        os.makedirs(infer_dir, exist_ok=True)

        # Derive the prompt audio location from the dataset directory instead of
        # repeating the default path; as_posix keeps the "/" separators.
        audio_root = self.dir.as_posix()

        occurrences = {}
        rows = []
        for n, voice_id in enumerate(ids):
            value = self._lookup_voice(voice_id)
            text_prompt = value["text"]
            audio_prompt = f'{audio_root}/{value["fileName"]}'
            text = self.text_processor.text_pre_process(texts[n % len(texts)])

            # First use of an id gets "infer"; the k-th repeat gets "infer<k>".
            seen = occurrences.get(voice_id, 0)
            occurrences[voice_id] = seen + 1
            suffix = f"infer{seen}" if seen else "infer"

            audio_out = f"{infer_dir}/{voice_id}_{suffix}.wav"
            rows.append("\t".join([text_prompt, audio_prompt, text, audio_out]) + "\n")

        content = "".join(rows)
        # Explicit UTF-8 so non-ASCII text is written identically on every platform.
        with open(file_name, "w", encoding="utf-8") as file:
            file.write(content)
        print(content)

        if copy:
            shutil.copyfile(file_name, f"../../../{file_name}")
        return file_name

    def get_npc_ids(
        self,
        name: str,
        count: int = 5,
        min_len: int = 8,
        max_len: int = 18,
    ) -> List[str]:
        """Sample voice ids of one NPC whose text length is within bounds.

        Args:
            name: NPC name, a key of ``grouped_dict``.
            count: Number of ids requested from ``random_samples``.
            min_len: Minimum text length (inclusive).
            max_len: Maximum text length (inclusive).

        Returns:
            Whatever ``random_samples`` returns for the filtered ids
            (presumably a list of ``count`` ids — confirm in bin.utils).

        Raises:
            KeyError: If ``name`` is not a known NPC name.
        """
        ids = self._ids_within_length(
            self.grouped_dict[name], self.voice, min_len, max_len
        )
        return random_samples(ids, count)

    def get_test_ids(
        self,
        count: int = 5,
        min_len: int = 8,
        max_len: int = 18,
    ) -> List[str]:
        """Sample voice ids from the test set whose text length is within bounds.

        Args:
            count: Number of ids requested from ``random_samples``.
            min_len: Minimum text length (inclusive).
            max_len: Maximum text length (inclusive).

        Returns:
            Whatever ``random_samples`` returns for the filtered ids
            (presumably a list of ``count`` ids — confirm in bin.utils).
        """
        ids = self._ids_within_length(
            self.voice_test, self.voice_test, min_len, max_len
        )
        return random_samples(ids, count)

In [45]:
# Build the data wrapper with its default dataset directory.
wrapper = GenShinDataWrapper()

# Pick the prompt utterances: ids of this NPC's lines, sampled with the
# wrapper's default count and text-length bounds.
ids = wrapper.get_npc_ids(name="派蒙")
# Alternatives kept for quick switching (one fixed id repeated, or test-set ids):
# ids = ["c15260d94fa350c1"] * 5
# ids = wrapper.get_test_ids(5)

# Target sentences to synthesise.
texts = get_tts_texts("mid")

# Write the inference list; the generated rows are echoed below the cell.
file_name = wrapper.create_file(
    file_name="genshin.txt",
    ids=ids,
    texts=texts,
    infer_dir="audios/genshin",
)

谢谢你们,我们会转达的!	download/genshin/Chinese/VO_LQ/VO_paimon/vo_HTLQ003_12_paimon_01.wav	希望是人类前进的动力,它激发我们努力奋斗,追求更好的未来.	audios/genshin/895c33cac0108881_infer.wav
我肯定会记半辈子!	download/genshin/Chinese/VO_EQ/VO_paimon/vo_EQHDJ201_2_paimon_17.wav	每个人都有潜力成为更好的自己,关键在于发现和释放这个潜力.	audios/genshin/c21c5e1123494d09_infer.wav
应该足够强了吧?	download/genshin/Chinese/VO_EQ/VO_paimon/vo_EQXS005_2_paimon_04.wav	生活的道路上有坎坷,只要坚持向前,终将会到达成功的彼岸.	audios/genshin/4f1125d6c84998b4_infer.wav
没错,我也觉得你会挨骂,不要冲动呀!	download/genshin/Chinese/VO_EQ/VO_paimon/vo_EQFH001_6_paimon_08c.wav	真正的友谊建立在信任,理解和共享的基础上,它能够经受时间的考验.	audios/genshin/05c3d184b722bf94_infer.wav
感觉付出了很大的代价.	download/genshin/Chinese/VO_LQ/VO_paimon/vo_HTLQ004_5_paimon_02.wav	人生犹如一场旅行,我们要勇敢踏上未知的道路,发现新的可能性.	audios/genshin/2fe264098baa2ce5_infer.wav



In [None]:
# Optional override: uncomment to use a hand-written list file instead of the
# `file_name` produced by the previous cell.
# file_name = 'test.txt'

# Run the model over the list file.
# NOTE(review): presumably synthesises one output wav per row — confirm in bin.myinfer.
model.infer_by_file(file_name)

# Must run after inference so the outputs exist.
# NOTE(review): presumably renders audio players for the rows of the list file — confirm in bin.utils.
show_audios(file_name)

In [53]:
# Scratch check: the last two characters of the string parse as an integer.
# The value is not used.
int("aaaaaa11"[-2:])


def aa(x):
    """Return the constant pair (1, 2); the argument is ignored."""
    return 1, 2


# Tuple-unpack the returned pair and show both values.
b, c = aa(1)
print(b, c)

1 2
