<a href="https://colab.research.google.com/github/carrotmet/my-first-/blob/main/vits%E6%A8%A1%E5%9E%8B%E8%AE%AD%E7%BB%83%E7%AC%94%E8%AE%B0%E6%9C%AC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Forked from https://github.com/CjangCjengh/vits/blob/main/vits.ipynb

该笔记本可用于训练vits单人和多人模型，不包括语音合成。

**请注意该笔记本不包括合成语音，而且默认只保存一个checkpoint以节省空间。**

**默认每隔200次迭代保存一次，可在“每隔多少次迭代保存一次断点”部分进行修改。**

**在看到进度save之前不要轻易退出，以免丢失进度。**

vits的配置较繁琐，简单配置可使用[tacotron2笔记本](https://colab.research.google.com/drive/18fbCupSaQde-FtF2Z2Na-LP5BrukjNMs?usp=sharing)





In [None]:
#@title 准备
#@markdown 定义工具函数 `run_command` `run_command_by_line` `get_symbols` 和 `get_tensorboard_showing`
# forked from https://www.endpointdev.com/blog/2015/01/getting-realtime-output-using-python/
import os
import subprocess
def run_command(command_args):
    """Run a command to completion, then print its captured output.

    stdout is printed first, followed by stderr; nothing is shown until the
    child process has finished.

    Args:
        command_args: Argument list handed to ``subprocess.Popen``; the first
            item is the program to execute.

    Returns:
        The exit code of the child process. If the program cannot be started
        at all (``OSError``, e.g. it is not installed), the error is printed
        and 127 is returned instead.
    """
    def print_pipe(raw):
        # errors="replace" keeps undecodable bytes from aborting the printout.
        return print(raw.decode("utf-8", errors="replace"), end='')

    try:
        process = subprocess.Popen(command_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError as launch_error:
        # Previously a bare ``except: pass`` hid this failure and the function
        # then crashed on the undefined output variables.
        print("Failed to start {cmd}: {err}".format(cmd=command_args, err=launch_error))
        return 127

    # communicate() drains both pipes and waits for the child to exit.
    out, err = process.communicate()
    print_pipe(out)
    print_pipe(err)
    return process.returncode

def run_command_by_line(command_args):
    """Run a command and echo its output line by line while it is running.

    stderr is merged into stdout so that only one pipe has to be drained.
    Reading stdout alone while stderr is a separate, unread pipe blocks
    forever once the child has filled the stderr pipe buffer. The output is
    read until end-of-file, so lines written just before the child exits are
    not lost.

    Args:
        command_args: Argument list handed to ``subprocess.Popen``.

    Returns:
        None. The exit code of the child is not reported.
    """
    with subprocess.Popen(command_args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as process:
        # Iterating the binary pipe yields one b"\n"-terminated chunk at a
        # time and stops at EOF, i.e. when the child has closed its output.
        for raw_line in process.stdout:
            print(raw_line.decode("utf-8", errors="replace"), end='')
    return

# Defines the set of symbols used in text input to the model, keyed by the
# name of the text cleaner. Each entry holds the padding symbol, the
# punctuation marks and the letters for that cleaner.
symbols_map = dict(
    japanese_cleaners=dict(
        _pad="_",
        _punctuation=",.!?-",
        _letters="AEINOQUabdefghijkmnoprstuvwyzʃʧ↓↑ ",
    ),
    korean_cleaners=dict(
        _pad="_",
        _punctuation=",.!?…~",
        _letters="ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ ",
    ),
)


def get_symbols(specify_cleaners):
    """Return the flat symbol list for one cleaner.

    The list starts with the padding symbol, followed by every punctuation
    character and then every letter registered in ``symbols_map`` for the
    given cleaner.

    Args:
        specify_cleaners: Key of ``symbols_map`` to look up.

    Raises:
        ValueError: If ``symbols_map`` has no entry for ``specify_cleaners``.
    """
    if specify_cleaners not in symbols_map:
        raise ValueError("不存在对应cleaners的symbols!")
    entry = symbols_map[specify_cleaners]
    flat = [entry["_pad"]]
    # extend() walks each string character by character.
    flat.extend(entry["_punctuation"])
    flat.extend(entry["_letters"])
    return flat

def get_tensorboard_showing(logdir):
    """Start TensorBoard on ``logdir`` and try to display it in the notebook.

    Two background processes are spawned: one runs the ``tensorboard``
    command through ``run_command_by_line``; the other calls
    ``notebook.display`` and, on any exception, prints it and retries after
    3 seconds until a call succeeds.

    Nothing is started when the notebook-level flag ``param_enable_tb`` is
    falsy. If the flag has not been defined yet, the call itself is taken as
    the request to show TensorBoard.

    Args:
        logdir: Directory passed to ``tensorboard --logdir``.

    Returns:
        None.
    """
    # ``param_enable_tb`` is a notebook global set by the training cell;
    # looking it up through globals() avoids a NameError when this helper is
    # used before that cell has run.
    if not globals().get("param_enable_tb", True):
        return

    # Imported only when needed. The former ``import tensorflow`` was never
    # used by this function and has been dropped.
    from multiprocessing import Process
    from tensorboard import notebook
    import time

    def run_tb():
        run_command_by_line(["tensorboard","--reload_interval", "30",  "--logdir", logdir, "--bind_all"])

    def monitor_tb():
        while True:
            try:
                notebook.display(height=998)
                break
            except Exception as e:
                print(e)
                time.sleep(3)

    Process(target=run_tb).start()
    Process(target=monitor_tb).start()

In [None]:
#@title 下载依赖库
#@markdown 取消勾选则不会节省空间
colab_save_space = True #@param {type:"boolean"}
os.chdir('/content')
run_command_by_line(["git", "clone", "https://github.com/wind4000/vits.git", "-b", "save-space-2" if colab_save_space else "main"])
os.chdir('/content/vits')
!pip install -r requirements.txt
!sudo apt-get install espeak -y
!sudo apt-get install p7zip-full p7zip-rar
!pip install demjson

In [None]:
#@title 加载Google云端硬盘
# Mount the user's Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#@title 解压数据集
# Form label below: path of the dataset archive.
#@markdown 压缩包路径
import subprocess
dataset_path = "/content/drive/MyDrive/dataset/YOURDATASET.zip"  #@param {type:"string"}
# Extract with the working directory set to /content/vits ("7z x" is given
# no output directory here).
os.chdir('/content/vits')
run_command_by_line(["7z", "x", dataset_path])

In [None]:
#@title 生成配置文件
# forked from https://github.com/CjangCjengh/vits/blob/main/configs/japanese_ss_base2.json
# Colab form inputs for the training configuration. The "#@markdown" and
# "#@param" comments are form directives and must keep their exact shape;
# the plain comments give the English meaning of each label.
# Name of the config file to generate.
#@markdown 配置文件名称
json_filename = "test.json" #@param {type:"string"}
# Number of training epochs (written to train.epochs).
#@markdown 训练次数
hparams_epochs = 2000 #@param {type:"integer"}
# Save a checkpoint every this many iterations (written to train.eval_interval).
#@markdown 每隔多少次迭代保存一次断点
hparams_eval_interval = 200 #@param {type:"integer"}
# Number of files per iteration, i.e. the batch size (16 or fewer recommended).
#@markdown 单次迭代的文件数（建议在16以内）
hparams_batch_size = 12 #@param {type:"integer"}
# Filelist of the training set.
#@markdown 训练集文件列表
hparams_training_files = "/content/vits/filelists/list.txt" #@param {type:"string"}
# Filelist of the validation set.
#@markdown 验证集文件列表
hparams_validation_files = "/content/vits/filelists/list.txt"#@param {type:"string"}
# Text cleaner to use; must be a key of symbols_map.
#@markdown 选择cleaner
hparams_cleaner =  "japanese_cleaners" #@param {type:"string"}
# Speaker name(s); separate several speakers with ASCII commas.
#@markdown 人物名，多个人物用英文逗号隔开
hparams_speaker = "test" #@param {type:"string"}
# Model name.
#@markdown 模型名
hparams_model_name = "test" #@param {type:"string"}

# Build the in-memory training configuration from the form values.
# get_symbols raises ValueError when the chosen cleaner has no symbol table.
hparams_symbols = get_symbols(hparams_cleaner)
# Comma-separated speaker names, with surrounding whitespace removed.
speakers = [speaker.strip() for speaker in hparams_speaker.split(",")]
# Show the index assigned to every speaker name.
print("speakers: ")
for i, speaker in enumerate(speakers):
  print("\t{a}: {b}".format(a=i, b=speaker))
training_json = {
  "train": {
    "log_interval": 200,
    "eval_interval": hparams_eval_interval,
    "seed": 1234 ,
    "epochs": hparams_epochs,
    "learning_rate": 2e-4,
    "betas": [0.8, 0.99],
    "eps": 1e-9,
    "batch_size": hparams_batch_size,
    "fp16_run": True,
    "lr_decay": 0.999875,
    "segment_size": 8192,
    "init_lr_ratio": 1,
    "warmup_epochs": 0,
    "c_mel": 45,
    "c_kl": 1.0
  },
  "data": {
    # The config points at "<filelist>.cleaned" — presumably the files
    # written by preprocess.py; TODO confirm against that script.
    "training_files": hparams_training_files + ".cleaned",
    "validation_files": hparams_validation_files + ".cleaned",
    "text_cleaners":[hparams_cleaner],
    "max_wav_value": 32768.0,
    "sampling_rate": 22050,
    "filter_length": 1024,
    "hop_length": 256,
    "win_length": 1024,
    "n_mel_channels": 80,
    "mel_fmin": 0.0,
    "mel_fmax": None,
    "add_blank": True,
    # 0 when only one speaker is given, otherwise the number of speakers.
    "n_speakers": len(speakers) if len(speakers) > 1 else 0,
    "cleaned_text": True
  },
  "model": {
    "inter_channels": 192,
    "hidden_channels": 192,
    "filter_channels": 768,
    "n_heads": 2,
    "n_layers": 6,
    "kernel_size": 3,
    "p_dropout": 0.1,
    "resblock": "1",
    "resblock_kernel_sizes": [3,7,11],
    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
    "upsample_rates": [8,8,2,2],
    "upsample_initial_channel": 512,
    "upsample_kernel_sizes": [16,16,4,4],
    "n_layers_q": 3,
    "use_spectral_norm": False,
  },
  "speakers": speakers,
  "symbols": hparams_symbols
}

# gin_channels is only added to the model section for multi-speaker training.
if len(speakers) > 1:
  training_json["model"]["gin_channels"] = 256

import demjson

# Serialize the configuration and write it to /content/vits/configs/<json_filename>.
# The symbol tables contain non-ASCII characters, so the files are written
# as UTF-8 explicitly instead of relying on the locale's default encoding.
os.chdir('/content/vits/configs')
training_json_text = demjson.encode(training_json)
with open(json_filename, "w", encoding="utf-8") as file:
  file.write(training_json_text)

# Overwrite /content/vits/text/symbols.py with a single assignment
# "symbols = [...]" holding the symbol list of the chosen cleaner.
os.chdir('/content/vits/text')
with open("symbols.py", "w", encoding="utf-8") as file:
  print("symbols = ", hparams_symbols, sep="", file=file)
# Leave the working directory at the repository root for the following cells.
os.chdir('/content/vits')



**目前支持的cleaner(和tacotron2版效果不同)**

cleaners from https://github.com/CjangCjengh/vits

1. japanese_cleaners 日语
2. korean_cleaners 韩语

In [None]:
#@title 预处理
# Build the monotonic_align extension in place; setup.py is run from inside
# its own directory.
os.chdir('/content/vits/monotonic_align')
!python setup.py build_ext --inplace
os.chdir('/content/vits')
# Run preprocess.py on both filelists with the chosen cleaner. --text_index is
# "2" for multi-speaker data and "1" otherwise — presumably the column of the
# text within each filelist line; TODO confirm against preprocess.py.
# run_command prints the output only after the script has finished.
run_command(["python", "preprocess.py", "--text_index", "2" if len(speakers) > 1 else "1", "--text_cleaners", hparams_cleaner, "--filelists", hparams_training_files, hparams_validation_files])

In [None]:
#@title 训练

# Form label below: enable TensorBoard visualisation.
#@markdown 启用tensorboard可视化数据
param_enable_tb = True  # @param {type:"boolean"}
if param_enable_tb:
  # TensorBoard is pointed at /content/drive/MyDrive/<model name> —
  # presumably where the training script writes its logs; TODO confirm.
  logdir = os.path.join("/content/drive/MyDrive/", hparams_model_name)
  get_tensorboard_showing(logdir)
os.chdir('/content/vits')
# train_ms.py is used when more than one speaker is configured, train.py
# otherwise; output is echoed line by line while training runs.
run_command_by_line(["python", "train_ms.py" if len(speakers) > 1 else "train.py", "-c", "configs/{json}".format(json=json_filename), "-m", hparams_model_name])

## 工具

这部分辅助[MoeTTS](https://github.com/luoyily/MoeTTS)等软件用vits合成语音。

运行本部分前必须执行的步骤：“准备”、“下载依赖库”、“加载Google云端硬盘”和“生成配置文件”。

这部分代码不要求GPU，可使用非GPU运行时，即达到限额后仍可使用。

In [None]:
#@title 生成供MoeTTS使用的配置文件
# Form label below: directory in which config.json is saved.
#@markdown 保存路径
moetts_savepath = "/content/drive/MyDrive/" #@param {type:"string"}
# os.path.join inserts the path separator when the directory is entered
# without a trailing slash; plain concatenation produced a wrong file name
# such as ".../MyDriveconfig.json" in that case.
moetts_filepath = os.path.join(moetts_savepath, "config.json")
# NOTE: this adds the "cleaners" entry to the shared training_json in place.
training_json["data"]["cleaners"] = ["custom_cleaners"]
training_json_text = demjson.encode(training_json)
# The symbol list contains non-ASCII characters, so write UTF-8 explicitly.
with open(moetts_filepath, "w", encoding="utf-8") as file:
  file.write(training_json_text)
print("已保存到", moetts_filepath)

In [None]:
#@title 合成前转换文本
# Run a sentence through a text cleaner and print the converted text.
os.chdir('/content/vits')
# Imported after the chdir above — presumably this resolves to the "text"
# package of the cloned repository; TODO confirm.
import text
input_text = "\u3053\u308C\u304B\u3089\u3082\u3001\u304A\u308C\u305F\u3061\u304C\u305F\u3061\u3068\u307E\u3089\u306A\u3044\u304B\u304E\u308A\u3001\u9053\u306F\u7D9A\u304F\u3002" #@param {type:"string"}
input_cleaners = "japanese_cleaners" #@param {type:"string"}
# Any exception raised by the cleaner is caught and printed rather than
# raised, so the cell never aborts on bad input.
try:
  output_text = text._clean_text(input_text, [input_cleaners])
  print("转换结果：", output_text)
except Exception as e:
  print("文本有误？", e)