In [None]:
import pandas as pd
import csv

In [None]:
# Load the Common Voice "validated" table (tab-separated text).
valid_path = r'C:\Users\samfi\Downloads\cv_yue\cv-corpus-17.0-2024-03-15\yue\validated.tsv'
valid_df = pd.read_csv(
    valid_path,
    sep='\t',
    quoting=csv.QUOTE_NONE,  # treat quote characters as ordinary text
    low_memory=False,
    dtype={
        'client_id': 'str',
        'path': 'str',
        'sentence': 'str',
        'up_votes': 'int16',
        'down_votes': 'int16',
        'age': 'str',
        'gender': 'str',
        # Was misspelled 'accentes', so it matched no column and the dtype
        # was never applied to the real 'accents' column.
        'accents': 'str',
        'variant': 'str',
        'locale': 'str',
        'segment': 'str',
    },
)
print(valid_df.columns)

In [None]:
# Number the speakers: every distinct client_id gets an integer code,
# shifted by one so that numbering starts at 1 instead of 0.
speaker_codes = pd.factorize(valid_df['client_id'])[0]
valid_df['speaker_id'] = speaker_codes + 1

# Report the range of speaker numbers that were assigned.
speaker_ids = valid_df['speaker_id'].unique()
print('说话人编号：', speaker_ids.min(), '-', speaker_ids.max())

In [None]:
import os

# Dataset root; replace with the folder path on your own machine.
root_dir = r"C:\Users\samfi\Downloads\cv_yue\cv-corpus-17.0-2024-03-15\yue"
source_folder = os.path.join(root_dir, "clips")
target_folder = os.path.join(root_dir, "validated")

# Build, for every validated clip, its current location and its destination.
file_names = valid_df['path'].tolist()
valid_df['source_path'] = [os.path.join(source_folder, file_name) for file_name in file_names]
valid_df['target_path'] = [os.path.join(target_folder, file_name) for file_name in file_names]

# exist_ok=True keeps this cell re-runnable: os.mkdir raised FileExistsError
# whenever the folder had already been created by an earlier run.
os.makedirs(target_folder, exist_ok=True)

In [None]:
import os
import shutil

# Move every validated clip from the clips folder into the validated folder.
# The move is guarded so the cell can be re-run: a clip whose source is gone
# but whose target already exists was moved by an earlier run and is skipped.
for src_path, tgt_path in zip(valid_df.source_path, valid_df.target_path):
    if os.path.exists(src_path):
        shutil.move(src_path, tgt_path)
    elif not os.path.exists(tgt_path):
        # Missing from both places: surface it instead of silently continuing.
        raise FileNotFoundError(f"Clip not found in source or target folder: {src_path}")

In [None]:
# Install pycantonese into the kernel's environment
%pip install pycantonese

In [None]:
import pycantonese

# Raw sentences from the validated table.
yue_sentences = valid_df['sentence'].tolist()

# Preview the first nine sentences before word segmentation.
print('分词前结果:')
for sentence in yue_sentences[:9]:
    print('\t' + sentence)

# Segment each sentence into words and re-join the words with single spaces.
print('\n分词后结果:')
yue_tok = []
for sentence in yue_sentences:
    words = pycantonese.segment(sentence)
    yue_tok.append(' '.join(words))

# Preview the same nine sentences after segmentation.
for segmented in yue_tok[:9]:
    print('\t' + segmented)

In [None]:
import re

# All punctuation marks expected in the text.
punct = r'[。，“”、；：？！—「」『』（）《》〈〉【】〔〕“”‘’……——～]'

# Replace every punctuation mark with a single half-width space.
yue_texts = [re.sub(punct, ' ', text) for text in yue_tok]

# Collapse any run of digits, whitespace and non-word characters into one space.
# Raw string: '\d', '\s' and '\W' are invalid escapes in a plain string literal
# and trigger a SyntaxWarning on recent Python versions.
yue_texts = [re.sub(r'[\d\s\W]+', ' ', text) for text in yue_texts]

for i in yue_texts[0:9]:
    print(i)

In [52]:
# Merge all sentences into one string, then split it into words.
# str.split() with no argument splits on any run of whitespace and never
# yields empty strings, so a leading/trailing space cannot produce an empty
# "word" (which would break the Jyutping lookup later). It also replaces the
# former non-raw '\s+' pattern, which contained an invalid escape sequence.
yue_words = ' '.join(yue_texts).split()

# Keep each distinct word once, in sorted order.
yue_words = sorted(set(yue_words))

# Drop every word that contains a Latin letter.
latins = re.compile('[A-Za-z]')
yue_words = [word for word in yue_words if not latins.search(word)]
for item in yue_words[0:9]:
    print(item)

〇
㓤
㓤親
㖭
㗇
㗇人
㗎
㗎仔
㗱


In [50]:
# Look up the Jyutping romanisation of every word.
# characters_to_jyutping returns a list of (characters, jyutping) pairs, one
# per segment it finds. Taking only [0][1] dropped every segment after the
# first and raised IndexError on an empty result, so all segments are joined.
# A word with no result, or with any segment lacking a reading, is recorded
# as None.
jyp = []
for word in yue_words:
    segments = pycantonese.characters_to_jyutping(word)
    readings = [reading for _chars, reading in segments]
    if readings and all(readings):
        jyutping = ''.join(readings)
    else:
        jyutping = None
    jyp.append(jyutping)
for item in jyp[0:9]:
    print(item)

ling4
gat1
gat1can1
tim1
gaa5
haa1jan4
gaa4
gaa4zai2
zaap6


In [None]:
# Install the transliteration / IPA tooling into the kernel's environment.
# epitran and lingpy are imported by later cells; jamo and g2pk are not
# referenced in the visible cells (NOTE(review): confirm they are still needed).
%pip install epitran jamo g2pk lingpy

In [53]:
import epitran

# Transliterator configured with the 'yue-Latn' language code.
epi = epitran.Epitran('yue-Latn')

# Transliterate each Jyutping string; entries that are None become ''.
yue_ipa = ['' if romanised is None else epi.transliterate(romanised) for romanised in jyp]

# Preview the first nine transliterations.
for ipa in yue_ipa[:9]:
    print(ipa)

lɪŋ
kɐtʰ
kɐtʰt͡sʰɐn
tʰiːm
kaː
haːjɐn
kaː
kaːt͡sɐi̯
t͡saːpʰ


In [77]:
from lingpy import ipa2tokens  # ipa2tokens splits an IPA string into separate sounds

# A stop (with or without aspiration mark) followed by a consonant or the end
# of the string.
coda_stop = re.compile(r'(p|pʰ|t|tʰ|k|kʰ)($|p|t|t͡s|s|f|k|m|n|ŋ|l|j|w|h|ʔ)')


def mark_unreleased(m):
    """Strip the aspiration mark from the matched stop and add the unreleased diacritic."""
    return f"{m.group(1).replace('ʰ', '')}̚{m.group(2)}"


yue_trans = []
for ipa in yue_ipa:
    # A word without a Jyutping reading has an empty IPA string; keep it empty.
    if ipa.strip() == '':
        yue_trans.append('')
        continue
    # Remove aspiration from syllable-final stops and mark them as unreleased.
    ipa = coda_stop.sub(mark_unreleased, ipa)
    # Delete j before i or y.
    ipa = re.sub(r'j(i|y)', r'\1', ipa)
    # Delete w before u.
    ipa = re.sub('wu', 'u', ipa)
    # Separate the individual sounds with spaces.
    yue_trans.append(' '.join(ipa2tokens(ipa)))

for transcription in yue_trans[0:9]:
    print(transcription)

l ɪ ŋ
k ɐ t̚
k ɐ t̚ t͡sʰ ɐ n
tʰ iː m
k aː
h aː j ɐ n
k aː
k aː t͡s ɐi̯
t͡s aː p̚


In [78]:
# One dictionary line per word: the word, a tab, then its spaced transcription.
yue_dict = [f'{word}\t{trans}' for word, trans in zip(yue_words, yue_trans)]

# Preview the first nine dictionary lines.
for line in yue_dict[:9]:
    print(line)
    

〇	l ɪ ŋ
㓤	k ɐ t̚
㓤親	k ɐ t̚ t͡sʰ ɐ n
㖭	tʰ iː m
㗇	k aː
㗇人	h aː j ɐ n
㗎	k aː
㗎仔	k aː t͡s ɐi̯
㗱	t͡s aː p̚


In [81]:
# Output path for the pronunciation dictionary; replace with a path on your machine.
dict_file = r"C:\Users\samfi\Downloads\cv_yue\cv-corpus-17.0-2024-03-15\yue\yue_dict.txt"

# Write UTF-8 explicitly. Without an encoding, open() uses the locale's
# preferred encoding, which on Windows may be unable to encode the Chinese
# characters and IPA symbols in the entries.
with open(dict_file, 'w', encoding='utf-8') as f:
    f.writelines(entry + '\n' for entry in yue_dict)
print("The dictionary is saved to:", dict_file)

The dictionary is saved to: C:\Users\samfi\Downloads\cv_yue\cv-corpus-17.0-2024-03-15\yue\yue_dict.txt
