# Data Parsing

In [None]:
""" Json Example
{
	"S0069-06-11-00.wav": "신기루야!",
	"S0069-06-11-01.wav": "또 나타났네.",
	"S0069-06-11-02.wav": "신기루는 수렁이다.",
	"S0069-06-11-03.wav": "일, 일을 해야지.",
	"S0069-06-11-04.wav": "역시",
	"S0069-06-11-05.wav": "아무것도 없네.",
	"S0069-06-11-06.wav": "김팀장님 말이 맞았어.",
	"S0069-06-11-07.wav": "네, 저 주진우예요.",
	"S0069-06-11-08.wav": "출근이 빠르네요.",
	"S0069-06-11-09.wav": "어제는 비바람이 심했잖아요.",
	"S0069-06-11-10.wav": "모자가 다 날아갈 거 같더라구요.",
	"S0069-06-11-11.wav": "만약에 밖에 나가 돌아다녔더라면요.",
	"S0069-06-11-12.wav": "그랬더라면, 저건 환상이야.",
	"S0069-06-11-13.wav": "신기루!",
	"S0069-06-11-14.wav": "곧 사라질 거야.",
	"S0069-06-11-15.wav": "주진우, 너 정말 사람이 보고 싶구나.",
	"S0069-06-11-16.wav": "다른 사람이 보고 싶어.",
	"S0069-06-11-17.wav": "없어졌겠지?"
}
"""

In [None]:
# Name of the model; selects the dataset directory ./data/<MODEL_NAME>/ that the
# data-parsing step reads its JSON transcript file from.
MODEL_NAME = 'ModelName'

In [7]:
import os, json

def json2text(json_path):
    """Convert a ``{filename: transcript}`` JSON file to a ``filename|transcript`` text file.

    The output is written next to the input, with the input's extension
    replaced by ``.txt`` (``MP3_data.json`` -> ``MP3_data.txt``). Each JSON
    entry becomes one ``filename|transcript`` line, in the order the entries
    appear in the JSON object.

    Args:
        json_path: Path to a JSON file holding a single object that maps
            audio file names to their transcripts.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        json.JSONDecodeError: If the input is not valid JSON.
    """
    # Read as UTF-8 explicitly: the transcripts are non-ASCII, and the platform
    # default encoding is not UTF-8 everywhere (the output below is UTF-8 too).
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # splitext strips whatever extension is present instead of assuming that
    # the name ends with the five characters ".json".
    txt_path = os.path.splitext(json_path)[0] + '.txt'

    with open(txt_path, 'w', encoding='utf-8') as f:
        f.writelines(f'{filename}|{text}\n' for filename, text in data.items())

In [None]:
# Convert the model's JSON transcripts; json2text writes MP3_data.txt into the
# same ./data/<MODEL_NAME>/ directory as the JSON file.
json2text(f'./data/{MODEL_NAME}/MP3_data.json')

# Training

In [None]:
!python ./src/preprocess.py -m ModelName -f ../data/ModelName/MP3_data.txt

In [None]:
!python ./src/train.py -m ModelName

# Inference

In [None]:
import sys, json, logging
from scipy.io import wavfile
import torch
sys.path.append('./src')
from models import SynthesizerTrn
from text import symbols, text_to_sequence
from commons import intersperse
logger = logging.getLogger(__name__)

class AudioProcessor:
    """Load a trained synthesizer once and turn text into WAV files.

    The network is placed on the GPU when CUDA is available and on the CPU
    otherwise. Loading happens in the constructor, so creating an instance
    reads both the config file and the checkpoint.
    """

    def __init__(self, config_path='./assets/models/VITS/config.json',
                 checkpoint_path='./assets/models/VITS/result.pth'):
        """Pick the device, remember the model file locations and load the model.

        Args:
            config_path: JSON file providing the ``data``, ``train`` and
                ``model`` sections read by ``load_tts_model``. Defaults to the
                location that used to be hard-coded.
            checkpoint_path: Checkpoint whose ``'model'`` entry is the state
                dict of the synthesizer. Defaults to the location that used to
                be hard-coded.

        Raises:
            Exception: Whatever ``load_tts_model`` raises.
        """
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.config_path = config_path
        self.checkpoint_path = checkpoint_path
        self.load_tts_model()

    def load_tts_model(self):
        """Build the synthesizer from the config file and restore its weights.

        Sets ``self.tts_config`` (the parsed config) and ``self.net_g`` (the
        network, moved to ``self.device`` and switched to eval mode).

        Raises:
            Exception: Any failure is logged through ``logger`` and re-raised
                unchanged.
        """
        try:
            with open(self.config_path, 'r', encoding='utf-8') as f:
                self.tts_config = json.load(f)
            data_cfg = self.tts_config['data']
            self.net_g = SynthesizerTrn(
                len(symbols),
                # presumably the number of spectrogram frequency bins - confirm
                data_cfg['filter_length'] // 2 + 1,
                # presumably the training segment length in frames - confirm
                self.tts_config['train']['segment_size'] // data_cfg['hop_length'],
                **self.tts_config['model']
            ).to(self.device)
            # NOTE(review): torch.load unpickles the file, which can execute
            # arbitrary code - only load checkpoints from a trusted source.
            checkpoint = torch.load(self.checkpoint_path, map_location=self.device)
            self.net_g.load_state_dict(checkpoint['model'])
            self.net_g.eval()
        except Exception as e:
            logger.error(f"Failed to load TTS model: {str(e)}")
            raise

    def generate_speech(self, text, path, noise_scale=0.667, noise_scale_w=0.8, length_scale=1):
        """Synthesize ``text`` and write the audio to ``path`` as an int16 WAV file.

        Args:
            text: Text to speak. Newlines are replaced by spaces and the result
                is wrapped in ``[KO]`` tags before it is handed to
                ``text_to_sequence``.
            path: Destination handed to ``scipy.io.wavfile.write``.
            noise_scale: Forwarded to ``net_g.infer``; the default is the value
                that used to be hard-coded.
            noise_scale_w: Forwarded to ``net_g.infer``; the default is the
                value that used to be hard-coded.
            length_scale: Forwarded to ``net_g.infer``; the default is the
                value that used to be hard-coded.

        Returns:
            ``path``, unchanged.

        Raises:
            Exception: Any failure is logged through ``logger`` and re-raised
                unchanged.
        """
        try:
            text = text.replace('\n', ' ')
            text_norm = text_to_sequence(f'[KO]{text}[KO]')
            if self.tts_config['data']['add_blank']:
                # presumably inserts the blank id 0 between symbols - confirm
                # against commons.intersperse
                text_norm = intersperse(text_norm, 0)
            # Add a leading batch dimension of 1; the tensor is moved to the
            # device once here, so no second transfer is needed below.
            x_tst = torch.LongTensor(text_norm).unsqueeze(0).to(self.device)
            x_tst_lengths = torch.LongTensor([x_tst.size(1)]).to(self.device)
            with torch.no_grad():
                output = self.net_g.infer(
                    x_tst,
                    x_tst_lengths,
                    noise_scale=noise_scale,
                    noise_scale_w=noise_scale_w,
                    length_scale=length_scale,
                )
                # assumes output[0] is (batch, channel, samples) - TODO confirm
                audio = output[0][0, 0].detach().cpu().float().numpy()
            # Keep samples inside [-1, 1] before scaling: a value outside that
            # range does not fit in int16 after the multiplication below. The
            # model output is presumably already in range, in which case this
            # is a no-op.
            audio = audio.clip(-1.0, 1.0)
            wavfile.write(path, self.tts_config['data']['sampling_rate'], (audio * 32767).astype('int16'))
            return path
        except Exception as e:
            logger.error(f"Failed to generate speech: {str(e)}")
            raise

# Creating the processor loads the model right away (the constructor calls
# load_tts_model), so this line fails if the config or checkpoint is missing.
audio_processor = AudioProcessor()

In [None]:
# Sample sentence to synthesize (Korean for "Hello.").
text = '안녕하세요.'

# generate_speech writes the WAV file and returns the path it was given.
tts_path = audio_processor.generate_speech(text, './test.wav')