In [1]:
from dataset import OpencpopDataset, MusicLoaderGenerator
from helper import parser_line, merge_note, get_pitch_labels, get_transposed_phoneme_labels, print_all

## 数据集加载

In [2]:
def dataset_transform(sample, sample_rate=None):
    """Expand the annotation line in ``sample['text']`` into separate fields.

    The line is parsed with ``parser_line`` and the parsed fields are then
    passed through ``merge_note``. The five values returned by ``merge_note``
    are stored on ``sample`` under the keys ``'chinese'``, ``'phoneme'``,
    ``'note'``, ``'duration'`` and ``'slur'``.

    Args:
        sample: Mapping with at least a ``'text'`` entry. It is modified in
            place.
        sample_rate: Accepted but not used by this function.

    Returns:
        The same ``sample`` object, with the five keys above added or
        overwritten.
    """
    # parser_line yields seven fields; the first (an id) and the sixth
    # (phoneme durations) are not needed here, hence the underscore names.
    (
        _utt_id,
        lyrics,
        phonemes,
        notes,
        note_durations,
        _phoneme_durations,
        slurs,
    ) = parser_line(sample['text'])

    merged_text, phonemes, notes, note_durations, slurs = merge_note(
        lyrics, phonemes, notes, note_durations, slurs
    )

    merged_fields = {
        'chinese': merged_text,
        'phoneme': phonemes,
        'note': notes,
        'duration': note_durations,
        'slur': slurs,
    }
    for key, value in merged_fields.items():
        sample[key] = value
    return sample

# Location of the Opencpop segments on disk. This is an absolute,
# machine-specific path; adjust it when running elsewhere.
OPENCPOP_SEGMENTS_DIR = '/scratch/bh2283/data/opencpop/segments/'

# Every sample is passed through dataset_transform via the `transform` hook.
dataset = OpencpopDataset(OPENCPOP_SEGMENTS_DIR, transform=dataset_transform)

In [3]:
# Split the dataset into a training and a test subset. How the split is made
# is decided inside OpencpopDataset.split, which is not shown in this notebook.
train_set, test_set = dataset.split()
# Last expression of the cell: displays the size of each subset.
len(train_set), len(test_set)

(3744, 12)

In [8]:
# Label vocabularies, one per annotation stream.
note_labels = get_pitch_labels()
phoneme_labels = get_transposed_phoneme_labels()
slur_labels = [0, 1]
# Planned duration resolution: 0.01 within 0-1, 0.05 within 1-2, 0.2 within 2-7.
# NOTE(review): duration_labels is just the integers 0..6 and does not encode
# those resolutions; confirm how MusicLoaderGenerator interprets it.
duration_labels = list(range(7))

# Presumably consumed positionally by MusicLoaderGenerator; keep this order.
labels = (phoneme_labels, note_labels, duration_labels, slur_labels)

loaderGenerator = MusicLoaderGenerator(labels)
train_loader = loaderGenerator.dataloader(train_set, batch_size=2)
print('train_set:', len(train_set), 'test_set:', len(test_set))
# Number of batches to print as a sanity check of the loader output.
steps = 1
# zip() consults range(steps) first and stops as soon as it is exhausted, so
# the loop never requests a batch from train_loader that it will not print,
# and `sample_batched` still refers to the last printed batch afterwards.
for i_batch, sample_batched in zip(range(steps), train_loader):
    print(sample_batched.keys())
    print(sample_batched['chinese'])
    print(sample_batched['phoneme'])
    # Uncomment to dump every field of the batch:
    # print_all(sample_batched.values())

train_set: 3744 test_set: 12
dict_keys(['audio', 'audio_len', 'audio_duration', 'audio_duration_quant', 'chinese', 'phoneme', 'phoneme_pre', 'phoneme_post', 'note', 'note_pre', 'note_post', 'slur'])
['也', '许', 'SP', '他', '能', '~', '给', '你', '更', '多', 'SP', 'AP', 'SP', '今', '天', '你', '要', '嫁', 'SP', '给', '我', 'SP', 'AP', 'SP']
[[23, 33], [15, 27], [61, 0], [7, 28], [8, 52], [52, 0], [10, 38], [8, 25], [10, 52], [6, 32], [61, 0], [60, 0], [61, 0], [13, 47], [7, 43], [8, 25], [23, 39], [13, 29], [61, 0], [10, 38], [24, 31], [61, 0], [60, 0], [61, 0]]


# Model 设计

- 尝试使用逆卷积，上采样得到所需的音频
- 我们无需去计算时间停止符，只需要在输出的时间内计算loss并且最小化即可
- 设定一个最大时间长度，比如两秒，超过的就不要了（用阈值筛掉）
- 多层上采样得到最佳的输出
- 使用梅尔频谱和解码器，可以使输出音质比 STFT 好（猜测，需要验证）；一般来说，机器学习声码器的效果都会好一些

但是有问题：
- 使用逆卷积太过刚直，没有变化性，导致无法很好地收敛
- 一般逆卷积和GAN一起使用，用判别器取代刚直的loss
- 使用tacotron模式就会好很多，无需GAN，自收敛