In [1]:
import json
import os
from pathlib import Path

from rich import print as rprint

import midii

import preprocess_svs as ps

# MSSV File Correction

In [2]:
# Root directory of the MSSV dataset, passed to every dataset-wide `ps.*` call below.
# Override it per machine with the MSSV_DATASET_DIR environment variable; the
# original local path is kept as the default so existing setups keep working.
mssv = os.environ.get("MSSV_DATASET_DIR", "D:/dataset/004.다화자 가창 데이터")

In [3]:
# Inspect the tempo ranking of one sample MIDI file before scanning the whole dataset.
sample_mssv_midi = "sample/mssv/midi/ba_05688_-4_a_s02_m_02.mid"
mid = midii.MidiFile(sample_mssv_midi, convert_1_to_0=True)

tempo_rank = mid.tempo_rank()
rprint(tempo_rank)

# Percentage derived from the ranking by ps.calculate_top_tempo_percentage
# (presumably the share covered by the most frequent tempo — confirm in ps).
top_tempo_percentage = ps.calculate_top_tempo_percentage(tempo_rank)
rprint(top_tempo_percentage)

## Analyze tempo deviation

In [4]:
# Report tempo statistics for the dataset rooted at `mssv`.
# NOTE(review): parallel=True is forwarded as-is; presumably it enables
# multi-process scanning — confirm in ps.tempo_statistics.
ps.tempo_statistics(
    mssv,
    parallel=True,
)

len: 4205
min: 28.21997105643994
max: 100.0
mean: 97.90013544608966
population stddev: 6.07
sample stddev(ddof=1): 6.07
99 >= 3199
95 >= 3774
90 >= 3971
80 >= 4112
50 >= 4205


- → MSSV 데이터셋은 tempo 의 편차가 크므로, ticks 단위에서 음표 길이를 정규화해야 함.

## Verify notes sorted by time

- mssv 의 json 은 MIDI 에서 직접 변환했기 때문에 time 정렬이 본질적으로 보장되므로 이 단계는 skip

## fill silence note between notes

- mssv 의 json 은 MIDI 에서 변환하는 과정에서 notes 사이에 공백을 채웠음

## verify correspondence wav vs mid 

In [5]:
# Look for "*.mid" / "*.wav" files under the dataset root that lack a counterpart
# (presumably matched by file stem — confirm in ps.find_exclusive_two_type_files).
unpaired_files = ps.find_exclusive_two_type_files("*.mid", "*.wav", mssv)
rprint(unpaired_files)

## Check abnormal files

In [6]:
# Show whatever ps.check_abnormal_mssv_file reports for the dataset root.
# NOTE(review): the criteria for "abnormal" live in ps and are not visible here.
abnormal_check_result = ps.check_abnormal_mssv_file(mssv)
rprint(abnormal_check_result)

## Rename abnormal files

In [7]:
# NOTE(review): presumably renames files on disk under `mssv` — confirm in
# ps.rename_abnormal_mssv_file before re-running this cell.
rename_result = ps.rename_abnormal_mssv_file(mssv)
rprint(rename_result)

## Remove abnormal files

In [8]:
# NOTE(review): presumably deletes files on disk under `mssv` — confirm in
# ps.remove_abnormal_mssv_file and back up the dataset before re-running this cell.
remove_result = ps.remove_abnormal_mssv_file(mssv)
rprint(remove_result)

## Verify midi pattern(on-lyrics-off)

In [9]:
# Run the on-lyrics-off pattern verification for the MIDI files under the dataset root.
# NOTE(review): the exact pattern rules are defined in ps — not visible here.
ps.verify_midi_files_pattern_on_lyrics_off(
    mssv,
    parallel=True,
)

## Verify lyrics has no time

In [10]:
# Run the "lyrics has no time" verification for the MIDI files under the dataset root.
# NOTE(review): presumably checks that lyrics events carry zero delta time — confirm in ps.
ps.verify_midi_files_lyrics_has_no_time(
    mssv,
    parallel=True,
)

# MSSV Preprocessing

In [2]:
# Paths for the single-sample walkthrough below. The sample stem and roots are
# defined once so that switching to another sample is a one-line change; the
# resulting path strings are identical to the previously hard-coded ones.
sample_stem = "ba_05688_-4_a_s02_m_02"
sample_root = "sample/mssv"

midi_filepath = f"{sample_root}/midi/{sample_stem}.mid"
wav_filepath = f"{sample_root}/wav/{sample_stem}.wav"
json_filepath = f"{sample_root}/json/{sample_stem}.json"  # written in Step 1
split_json_filepath = f"{sample_root}/split_json/{sample_stem}.json"  # written in Step 2

# Output locations for Step 4. The root keeps its trailing slash because the
# sub-paths below are built by direct concatenation.
preprocessed_mssv_path = "preprocessed_mssv/"
preprocessed_mssv_duration_path = f"{preprocessed_mssv_path}duration"
preprocessed_mssv_pitch_path = f"{preprocessed_mssv_path}pitch"
preprocessed_mssv_wav_path = f"{preprocessed_mssv_path}wav"

## Step 1 - midi to json 

- note duration quantization
- duration conversion [ticks -> seconds -> frames]

In [3]:
# Convert the sample MIDI file into a table of notes
# (presumably a pandas DataFrame, given the function name and `.to_json` call).
df_notes = ps.mssv_midi_to_dataframe(midi_filepath)

# Switch to a Path so the output directory can be created before writing.
json_filepath = Path(json_filepath)
json_filepath.parent.mkdir(parents=True, exist_ok=True)

# force_ascii=False keeps non-ASCII characters unescaped in the written JSON.
to_json_options = {
    "orient": "records",
    "indent": 4,
    "force_ascii": False,
}
df_notes.to_json(json_filepath, **to_json_options)

## Step 2 - split notes by silence

In [4]:
# Split the Step 1 note JSON at silences.
# NOTE(review): the unit and meaning of min_length=6 are defined in
# ps.split_json_by_silence_mssv and not visible here — confirm.
split_json = ps.split_json_by_silence_mssv(json_filepath, min_length=6)

# Switch to a Path so the output directory can be created before writing.
split_json_filepath = Path(split_json_filepath)
split_json_filepath.parent.mkdir(parents=True, exist_ok=True)

# ensure_ascii=False writes non-ASCII characters as-is instead of \uXXXX escapes.
with split_json_filepath.open("w", encoding="utf-8") as split_json_file:
    json.dump(split_json, split_json_file, ensure_ascii=False, indent=4)

## Step 3 or Step 4 

- Korean regularization (TODO: metadata.txt 의 가사 글자 개수가 split 된 duration/pitch/wav 의 개수와 일치해야 하는지 확인 필요. 일치하지 않아도 된다면 step 4 에서 해도 되고, json 이 아니라 kor seq/pitch seq/GT 만 받아서 해도 된다)

In [5]:
# Display the split-JSON path written in Step 2; Step 4 passes it to ps.preprocess_mssv_one.
split_json_filepath

WindowsPath('sample/mssv/split_json/ba_05688_-4_a_s02_m_02.json')

## Step 4 - save duration, pitch as npy file, split audio, save metadata

In [6]:
# Process the single sample and collect its metadata entry. The list form is kept
# so more entries can be appended when looping over several files.
# NOTE(review): the output subdirectories are passed in pitch, duration, wav order;
# what preprocess_mssv_one writes and returns is defined in ps — the join below
# assumes it returns a string.
metadata_list = [
    ps.preprocess_mssv_one(
        wav_filepath,
        split_json_filepath,
        preprocessed_mssv_pitch_path,
        preprocessed_mssv_duration_path,
        preprocessed_mssv_wav_path,
    )
]

# Build the path with Path instead of string formatting: the root already ends
# with "/", so the old f-string produced "preprocessed_mssv//metadata.txt".
metadata_filepath = Path(preprocessed_mssv_path) / "metadata.txt"
# Create the output directory first, as Steps 1 and 2 do, so the write does not
# depend on ps.preprocess_mssv_one having created it.
metadata_filepath.parent.mkdir(parents=True, exist_ok=True)
with open(metadata_filepath, "w", encoding="utf-8") as f:
    f.write("".join(metadata_list))