In [None]:
import pandas as pd
import csv, os

In [None]:
# pycantonese supplies the Cantonese word segmenter used below.
import pycantonese

# Load the validated-clips metadata table.
valid_path = r'C:\Users\samfi\Downloads\yue\validated.tsv'
valid_df = pd.read_csv(
    valid_path,
    sep='\t',
    quoting=csv.QUOTE_NONE,  # treat quote characters inside fields as literal text
    low_memory=False,
    dtype={
        'client_id': 'str',
        'path': 'str',
        'sentence_id': 'str',
        'sentence': 'str',
        'up_votes': 'int16',
        'down_votes': 'int16',
        'age': 'str',
        'gender': 'str',
        # NOTE(review): this key was previously spelled 'accentes', which looks
        # like a typo; 'accents' is assumed to be the real column header --
        # confirm against the first line of the TSV.
        'accents': 'str',
        'variant': 'str',
        'locale': 'str',
        'segment': 'str',
    },
)

# Give every distinct client_id an integer speaker id, numbered from 1 in
# order of first appearance.
valid_df['speaker_id'] = pd.factorize(valid_df['client_id'])[0] + 1

# --- Word segmentation ---
# Pull the transcripts out of the table.
yue_sentences = valid_df['sentence'].tolist()

# Segment each sentence with pycantonese and re-join the words with spaces.
yue_tok = [' '.join(pycantonese.segment(sentence)) for sentence in yue_sentences]

# Store the segmented text back on the table. A plain list is assigned
# positionally, so the result does not depend on valid_df's index labels
# (wrapping it in pd.Series would align on index instead).
valid_df['sentence_tok'] = yue_tok

# Show the first ten segmented sentences as a sanity check.
print(valid_df['sentence_tok'].head(10))

In [None]:
# Load the per-clip durations from clip_durations.tsv.
clip_dur_file = r"C:\Users\samfi\Downloads\yue\clip_durations.tsv"
clip_dur = (
    pd.read_csv(clip_dur_file, sep='\t', dtype={'clip': 'str', 'duration[ms]': 'float64'})
    # Rename the columns to `path` / `dur` so the table joins with validated.tsv.
    .rename(columns={'clip': 'path', 'duration[ms]': 'dur'})
    # Convert milliseconds to seconds.
    .assign(dur=lambda df: df['dur'] / 1000)
)

# Attach each clip's duration to the validated table.
# - Dropping any existing `dur` column first keeps this cell idempotent:
#   re-running it would otherwise produce `dur_x` / `dur_y` columns.
# - validate='many_to_one' raises if clip_dur lists a path more than once,
#   which would otherwise silently duplicate rows of valid_df.
valid_df = (
    valid_df
    .drop(columns='dur', errors='ignore')
    .merge(clip_dur, on='path', how='left', validate='many_to_one')
)

# A left join leaves `dur` as NaN for clips that have no duration entry;
# report them here instead of letting them surface later.
n_missing_dur = int(valid_df['dur'].isna().sum())
if n_missing_dur:
    print(f"Warning: {n_missing_dur} clip(s) have no entry in {clip_dur_file}")

In [None]:
# Install praatio into the kernel's environment; the next cell imports `textgrid` from it.
%pip install praatio

In [None]:
# 从praatio包中加载textgrid对象
from praatio import textgrid
# Define a function that takes five arguments and writes one TextGrid file for a single clip
def create_textgrid(snd_file, dur, speaker_id, transcript, output_folder,
                    margin=0.05, verbose=True):
    """Write a one-tier, one-interval TextGrid for a single audio clip.

    The TextGrid has one interval tier named after the speaker. The tier
    spans 0..dur and holds one labelled interval, from ``margin`` to
    ``dur - margin``, whose text is the transcript.

    Parameters
    ----------
    snd_file : str
        File name of the audio clip. Only its base name (without directory
        or extension) is used to name the TextGrid.
    dur : float
        Clip duration in seconds. Must be finite and greater than zero.
    speaker_id : str or int
        Used (as a string) for the tier name.
    transcript : str
        Label text of the interval.
    output_folder : str
        Directory the ``.TextGrid`` file is written to; created if missing.
    margin : float, optional
        Unlabelled padding in seconds at both ends of the clip. If the clip
        is not longer than ``2 * margin``, a quarter of the duration is used
        instead so the interval start stays before its end.
    verbose : bool, optional
        When true, print the output path and transcript.

    Returns
    -------
    str
        Path of the TextGrid file that was written.

    Raises
    ------
    ValueError
        If ``dur`` is NaN, infinite or not positive, or if ``margin`` is
        NaN, infinite or negative.
    """
    # Normalise types: everything is a string except the two time values.
    snd_file = str(snd_file)
    dur = float(dur)
    speaker_id = str(speaker_id)
    transcript = str(transcript)
    margin = float(margin)

    # NaN fails every comparison, so one chained comparison rejects NaN,
    # infinity and out-of-range values (e.g. a duration missing after a join).
    if not (0 < dur < float('inf')):
        raise ValueError(f"invalid duration {dur!r} for clip {snd_file!r}")
    if not (0 <= margin < float('inf')):
        raise ValueError(f"invalid margin {margin!r}")

    # Very short clips: shrink the padding so that start < end still holds.
    if dur <= 2 * margin:
        margin = dur / 4

    # Start from an empty TextGrid object.
    tg = textgrid.Textgrid()

    # Build the interval tier: tier name, list of (start, end, label) entries,
    # then the tier's own start and end (matching the recording).
    speaker_tier = textgrid.IntervalTier(speaker_id,
                                         [(margin, dur - margin, transcript)],
                                         0,
                                         dur)
    tg.addTier(speaker_tier)

    # Name the TextGrid after the audio file. basename() strips any directory
    # part so the file always lands directly inside output_folder.
    snd_name, _ = os.path.splitext(os.path.basename(snd_file))
    tg_path = os.path.join(output_folder, snd_name + '.TextGrid')

    # Make sure the destination directory exists before saving.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    if verbose:
        print(tg_path, transcript)

    # Write the TextGrid to disk.
    tg.save(tg_path, format='short_textgrid', includeBlankSpaces=True)
    return tg_path

In [None]:
output_folder = r"C:\Users\samfi\Downloads\yue\validated"
# Make sure the destination folder exists before any TextGrid is written.
os.makedirs(output_folder, exist_ok=True)

# The left merge with the duration table leaves `dur` as NaN for clips that
# have no duration entry; a TextGrid cannot be built for those, so skip them
# and report how many were skipped.
has_dur = valid_df['dur'].notna()
if not has_dur.all():
    print(f"Skipping {int((~has_dur).sum())} clip(s) with no duration")

# Write one TextGrid per remaining clip.
textgrid_rows = valid_df.loc[has_dur, ['path', 'dur', 'speaker_id', 'sentence_tok']]
for snd_file, dur, speaker_id, transcript in textgrid_rows.itertuples(index=False, name=None):
    create_textgrid(snd_file, dur, speaker_id, transcript, output_folder)