In [None]:
from utils.hparams import load_hparams_json
from utils.util import intersperse
import json
from models.synthesizer.models.vits import Vits
import torch
import numpy as np
import IPython.display as ipd
from models.synthesizer.utils.symbols import symbols
from models.synthesizer.utils.text import text_to_sequence


# Hyper-parameters saved alongside the VITS2 checkpoint.
hps = load_hparams_json("data/ckpt/synthesizer/vits2/config.json")
print(hps.train)

# Derive the two size arguments from the config before building the network.
# NOTE(review): these look like the spectrogram bin count and the training
# segment length in frames -- confirm against the Vits constructor.
spec_channels = hps["data"]["filter_length"] // 2 + 1
segment_frames = hps["train"]["segment_size"] // hps["data"]["hop_length"]

model = Vits(
    len(symbols),
    spec_channels,
    segment_frames,
    n_speakers=hps["data"]["n_speakers"],
    **hps["model"],
)
_ = model.eval()  # switch to evaluation mode; the returned module is discarded

# Restore the generator weights on CPU.
device = torch.device("cpu")
checkpoint = torch.load("data/ckpt/synthesizer/vits2/G_120000.pth", map_location=device)

# The weights are read from "model_state" when present, otherwise from "model".
state = checkpoint["model_state"] if "model_state" in checkpoint else checkpoint["model"]

# strict=False: mismatched keys between checkpoint and model do not raise.
model.load_state_dict(state, strict=False)

# Root directory from which emotion reference audio is randomly sampled.
# NOTE(review): machine-specific absolute Windows path; none of the visible
# cells reads this variable -- confirm whether it is still needed.
random_emotion_root = "D:\\audiodata\\aidatatang_200zh\\corpus\\train\\G0017"
import random, re
from pypinyin import lazy_pinyin, Style

import os

def tts(txt, emotion, sid=0):
    """Synthesize ``txt`` with the loaded VITS model and play the result inline.

    Args:
        txt: Chinese text. It is converted to space-separated, tone-numbered
            pinyin before being mapped to symbol ids.
        emotion: Path to a reference recording whose name ends with "wav"
            (case-insensitive). Its emotion embedding conditions the synthesis.
        sid: Integer speaker id forwarded to ``model.infer``.

    Raises:
        ValueError: If ``emotion`` does not end with "wav". Previously this
            case only printed a message and then crashed on an unbound ``emo``.

    Returns:
        None. The waveform is rendered through ``IPython.display.Audio``.
    """
    # Fail fast: without a reference wav there is no emotion embedding to feed
    # the model, so there is nothing sensible to synthesize.
    if not emotion.lower().endswith("wav"):
        raise ValueError("emotion参数不正确")

    # Kept local so the notebook's first cell does not require these heavy
    # dependencies unless synthesis is actually invoked.
    from models.synthesizer.preprocess_audio import extract_emo
    import librosa

    # Text front end: hanzi -> pinyin (TONE3, neutral tone written as 5) -> ids.
    pinyin = " ".join(lazy_pinyin(txt, style=Style.TONE3, neutral_tone_with_five=True))
    text_norm = text_to_sequence(pinyin, hps["data"]["text_cleaners"])
    if hps["data"]["add_blank"]:
        # Insert the id 0 between symbols, as the config requests.
        text_norm = intersperse(text_norm, 0)
    stn_tst = torch.LongTensor(text_norm)

    with torch.no_grad():  # inference only, no autograd bookkeeping
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        sid_tensor = torch.LongTensor([sid])

        # `sr` must be passed by keyword: librosa >= 0.10 rejects it positionally.
        wav, sr = librosa.load(emotion, sr=16000)
        emo = torch.FloatTensor(extract_emo(np.expand_dims(wav, 0), sr, embeddings=True))

        audio = model.infer(
            x_tst,
            x_tst_lengths,
            sid=sid_tensor,
            noise_scale=0.667,
            noise_scale_w=0.8,
            length_scale=1,
            emo=emo,
        )[0][0, 0].data.cpu().float().numpy()

    ipd.display(ipd.Audio(audio, rate=hps["data"]["sampling_rate"], normalize=False))




推理：

In [None]:
# Sentence to synthesize.
txt = "随机抽取的音频文件路径可以用于使用该情感合成其他句子"

# Reference recording that supplies the emotion embedding.
# NOTE(review): machine-specific absolute path -- replace with a local file.
emotion_wav = 'C:\\Users\\babys\\Desktop\\voicecollection\\secondround\\美玉.wav'

tts(txt, emotion=emotion_wav, sid=2)

预处理：

In [None]:
from models.synthesizer.preprocess import preprocess_dataset
from pathlib import Path
from utils.hparams import HParams
# Folder that contains the raw corpora.
datasets_root = Path("../audiodata/")

# Audio / mel-spectrogram settings handed to the preprocessing pipeline.
hparams = HParams(
    # --- STFT / mel filterbank ---
    n_fft=1024,  # filter_length
    num_mels=80,
    hop_size=256,  # 16 ms frame shift at 16 kHz (Tacotron's reference value is 12.5 ms)
    win_size=1024,  # 64 ms frame length at 16 kHz (Tacotron's reference value is 50 ms)
    fmin=55,
    min_level_db=-100,
    ref_level_db=20,
    max_abs_value=4.,  # Gradient explodes if too big, premature convergence if too small.
    sample_rate=16000,
    rescale=True,
    max_mel_frames=900,
    rescaling_max=0.9,
    preemphasis=0.97,  # Filter coefficient to use if preemphasize is True
    preemphasize=True,
    # --- Mel visualization and Griffin-Lim ---
    signal_normalization=True,
    utterance_min_duration=1.6,  # Duration in seconds below which utterances are discarded
    # --- Audio processing options ---
    fmax=7600,  # Should not exceed (sample_rate // 2)
    allow_clipping_in_normalization=True,  # Used when signal_normalization = True
    clip_mels_length=True,  # If true, discards samples exceeding max_mel_frames
    use_lws=False,  # "Fast spectrogram phase recovery using local weighted sums"
    symmetric_mels=True,  # Mel range is [-max_abs_value, max_abs_value] if True,
                          # and [0, max_abs_value] if False
    trim_silence=True,  # Use with sample_rate of 16000 for best results
)

# Preprocessed output goes to <datasets_root>/SV2TTS/synthesizer.
synthesizer_out_dir = datasets_root / "SV2TTS" / "synthesizer"

preprocess_dataset(
    datasets_root=datasets_root,
    out_dir=synthesizer_out_dir,
    n_processes=8,
    skip_existing=True,
    hparams=hparams,
    no_alignments=False,
    dataset="magicdata",
    emotion_extract=True,
)

训练：

In [None]:
from models.synthesizer.train_vits import run
from pathlib import Path
from utils.hparams import HParams
import torch, os
import torch.multiprocessing as mp

# Folder produced by the preprocessing step.
datasets_root = Path("../audiodata/SV2TTS/synthesizer")

# Start from the checkpoint directory, then pull the remaining settings from
# the config.json stored inside it.
# NOTE(review): the inference cell reads from ".../synthesizer/vits2" while
# this cell uses ".../synthesizer/vits" -- confirm which directory is intended.
hparams = HParams(
    model_dir="data/ckpt/synthesizer/vits",
)
hparams.loadJson(Path(hparams.model_dir).joinpath("config.json"))

# Point the data section at the preprocessed metadata.
# NOTE(review): validation uses the same train.txt as training, so validation
# metrics are not a held-out measurement -- confirm this is intentional.
hparams.data["training_files"] = str(datasets_root.joinpath("train.txt"))
hparams.data["validation_files"] = str(datasets_root.joinpath("train.txt"))
hparams.data["datasets_root"] = str(datasets_root)

n_gpus = torch.cuda.device_count()
if n_gpus < 1:
    # With nprocs=0, mp.spawn starts no workers and returns immediately, which
    # would make this cell look like it trained successfully while doing nothing.
    raise RuntimeError(
        "No CUDA device is visible: mp.spawn would launch zero training workers."
    )

# Rendezvous address for the spawned workers (one process per GPU).
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '8899'
mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hparams))

只挑选有对应emo文件的meta数据

In [None]:
from pathlib import Path
import os
# Keep only the train.txt entries whose emotion-embedding file exists, and
# write the surviving lines, unchanged and in order, to train2.txt.
root = Path('../audiodata/SV2TTS/synthesizer')
emo_dir = root.joinpath("emo")
dict_info = []
with open(root.joinpath("train.txt"), "r", encoding="utf-8") as dict_meta:
    for raw in dict_meta:
        # Lines read from a file keep their trailing newline, so a bare
        # `if not raw` never fires; strip before testing for emptiness.
        if not raw.strip():
            continue
        # The first "|"-separated field is the audio file name; the emo file
        # uses the same name with "audio" replaced by "emo".
        v = raw.split("|")[0].replace("audio", "emo")
        emo_fpath = emo_dir.joinpath(v)
        if emo_fpath.exists():
            dict_info.append(raw)

# Write the filtered metadata; the context manager closes the file even if a
# write fails part-way through.
meta2 = Path('../audiodata/SV2TTS/synthesizer/train2.txt')
with meta2.open("w", encoding="utf-8") as metadata_file:
    metadata_file.writelines(dict_info)

In [None]:
from pathlib import Path
import os
import shutil
emo_root = Path('../audiodata/SV2TTS/synthesizer').joinpath('emo')

# Disabled one-off maintenance snippet, kept for reference: it renamed emo
# files ending in "wav__00.npy" so that "__00" became "_00".
# raw_root = Path('../audiodata/aidatatang_200zh/corpus/train')
# emo_file_list = emo_root.glob("**/*.npy")
# for emo_file in emo_file_list:
#     if emo_file.name.endswith('wav__00.npy'):
#         folder = emo_file.parent
#         os.rename(emo_file, folder.joinpath(emo_file.name.replace("__00", "_00")))
#     # shutil.move(emo_file, emo_root.joinpath(emo_file.name))

# NOTE(review): the filtering below repeats the preceding cell verbatim;
# consider keeping a single copy.
# Keep only the train.txt entries whose emotion-embedding file exists, and
# write the surviving lines, unchanged and in order, to train2.txt.
root = Path('../audiodata/SV2TTS/synthesizer')
dict_info = []
with open(root.joinpath("train.txt"), "r", encoding="utf-8") as dict_meta:
    for raw in dict_meta:
        # Lines read from a file keep their trailing newline, so a bare
        # `if not raw` never fires; strip before testing for emptiness.
        if not raw.strip():
            continue
        # The first "|"-separated field is the audio file name; the emo file
        # uses the same name with "audio" replaced by "emo".
        v = raw.split("|")[0].replace("audio", "emo")
        emo_fpath = emo_root.joinpath(v)  # same location as root/"emo"/v
        if emo_fpath.exists():
            dict_info.append(raw)

# Write the filtered metadata; the context manager closes the file even if a
# write fails part-way through.
meta2 = Path('../audiodata/SV2TTS/synthesizer/train2.txt')
with meta2.open("w", encoding="utf-8") as metadata_file:
    metadata_file.writelines(dict_info)