In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd
from pathlib import Path
from collections import defaultdict
import math
import re
import concurrent
import soundfile as sf
from tqdm.notebook import tqdm
import numpy as np
from lhotse import RecordingSet, Recording, SupervisionSegment, SupervisionSet, CutSet
from phonemizer.backend import EspeakBackend
from phonemizer.separator import Separator

# TextProcessor

In [3]:
class TextProcessor:
    """Decompose precomposed Hangul syllables into a flat string of jamo.

    Every character of the input must be either a precomposed Hangul syllable
    (U+AC00..U+D7A3) or an ASCII space. Each syllable is replaced by its
    initial (cho), medial (jung) and, when present, final (jong) jamo taken
    from the lookup tables built in ``__init__``.
    """

    # Code point range of the precomposed Hangul syllable block.
    HANGUL_FIRST = 0xAC00  # 44032
    HANGUL_LAST = 0xD7A3   # 44032 + 19 * 588 - 1
    # 588 = 21 medials * 28 finals: number of syllables sharing one initial.
    SYLLABLES_PER_CHO = 588
    # 28 = number of syllables sharing one (initial, medial) pair.
    SYLLABLES_PER_JUNG = 28

    def __init__(self):
        self.dict_cho   = {0:u"ᄀ",  1:u"ᄁ",  2:u"ᄂ",  3:u"ᄃ",  4:u"ᄄ",  5:u"ᄅ",  6:u"ᄆ",  7:u"ᄇ",  8:u"ᄈ",  9:u"ᄉ",
            10:u"ᄊ", 11:u"ᄋ", 12:u"ᄌ", 13:u"ᄍ", 14:u"ᄎ", 15:u"ᄏ", 16:u"ᄐ", 17:u"ᄑ", 18:u"ᄒ"}
        self.dict_jung  = {0:u"ㅏ",  1:u"ㅐ",  2:u"ㅑ",  3:u"ㅒ",  4:u"ㅓ",  5:u"ㅔ",  6:u"ㅕ",  7:u"ㅖ",  8:u"ㅗ",  9:u"ㅘ",
            10:u"ㅙ", 11:u"ㅚ", 12:u"ㅛ", 13:u"ㅜ", 14:u"ㅝ", 15:u"ㅞ", 16:u"ㅟ", 17:u"ㅠ", 18:u"ㅡ", 19:u"ㅢ", 20:u"ㅣ"}
        # Index 0 is a plain space and acts as the "no final consonant" marker.
        self.dict_jong  = { 0:u" ",   1:u"ᆨ",  2:u"ᆩ",  3:u"ᆪ",  4:u"ᆫ",  5:u"ᆬ",  6:u"ᆭ",  7:u"ᆮ",  8:u"ᆯ",  9:u"ᆰ",  
            10:u"ᆱ", 11:u"ᆲ", 12:u"ᆳ", 13:u"ᆴ", 14:u"ᆵ", 15:u"ᆶ", 16:u"ᆷ", 17:u"ᆸ", 18:u"ᆹ", 19:u"ᆺ", 
            20:u"ᆻ", 21:u"ᆼ", 22:u"ᆽ", 23:u"ᆾ", 24:u"ᆿ", 25:u"ᇀ", 26:u"ᇁ", 27:u"ᇂ"}

    def __call__(self, text: str) -> str:
        """Return ``text`` with every Hangul syllable split into jamo.

        Runs of spaces collapse to a single space that is emitted in front of
        the next syllable; spaces that are not followed by a syllable are
        dropped.

        Args:
            text: String made only of precomposed Hangul syllables and spaces.

        Returns:
            The jamo-decomposed string.

        Raises:
            ValueError: If ``text`` contains any other character. The message
                keeps the ``Invalid text (<char>) (<text>)`` format. Characters
                above the Hangul block used to surface as a bare ``KeyError``
                from the table lookup; they are now reported the same way as
                every other invalid character.
        """
        pieces = []
        prefix = ""
        for char in text:
            code = ord(char)
            if self.HANGUL_FIRST <= code <= self.HANGUL_LAST:
                cho_idx, rest = divmod(code - self.HANGUL_FIRST, self.SYLLABLES_PER_CHO)
                jung_idx, jong_idx = divmod(rest, self.SYLLABLES_PER_JUNG)
                cho = self.dict_cho[cho_idx]
                jung = self.dict_jung[jung_idx]
                jong = self.dict_jong[jong_idx]
                if jong == u" ":
                    # Syllable without a final consonant.
                    pieces.append(f"{prefix}{cho}{jung}")
                else:
                    # Syllable with a final consonant.
                    pieces.append(f"{prefix}{cho}{jung}{jong}")
                prefix = ""
            elif char == " ":
                # Remember the word boundary; it is written before the next syllable.
                prefix = " "
            else:
                raise ValueError(f"Invalid text ({char}) ({text})")
        return "".join(pieces)
# Shared Hangul-to-jamo converter used by the manifest-building cells below.
tp = TextProcessor()

# Initialize the espeak backend with the 'ko' language code.
ipa_backend = EspeakBackend('ko')
# Both the phone and the word separator are passed as None.
# NOTE(review): confirm how phonemizer renders word boundaries with these settings.
ipa_separator = Separator(phone=None, word=None)

# Create Train cuts

In [3]:
# Read the training file list and transcript list; the two files are expected
# to be line-aligned (one utterance per line, same order).
# The encoding is given explicitly so Korean text does not depend on the
# locale default (the transcript cells further down already pass utf-8).
with open("/home/shahn/Datasets/KoreanASR/train/wav.scp", "r", encoding="utf-8") as f:
    filelist = f.readlines()
with open("/home/shahn/Datasets/KoreanASR/train/text", "r", encoding="utf-8") as f:
    textlist = f.readlines()
assert len(filelist) == len(textlist), (len(filelist), len(textlist))

In [None]:
# Per-dataset accumulators filled by the training loop below.
recordings, supervisions = defaultdict(list), defaultdict(list)
# padding = int(math.log10(len(filelist))) + 1
dataset_count, dataset_length = defaultdict(int), defaultdict(int)

# Dataset key (first path component in wav.scp) -> directory name under the
# AIHub root used when building audio paths.
aihub_mapping = {
    "command_kid": "명령어 음성(소아, 유아)",
    "command_nor": "명령어 음성(일반남녀)",
    "command_old": "명령어 음성(노인남녀)",
    "freetalk_kid": "자유대화 음성(소아유아)",
    "freetalk_nor": "자유대화 음성(일반남녀)",
    "freetalk_old": "자유대화 음성(노인남녀)",
}

def make_rec_sup(file_hslee, text, text_cjj, dataset, _id):
    """Build the lhotse recording and supervision for one utterance.

    Args:
        file_hslee: Raw ``wav.scp`` line; it must contain
            ``./db/<relative path>.<pcm|flac|wav|PCM>``.
        text: Transcript stored as the supervision text.
        text_cjj: Jamo-decomposed transcript, stored under ``custom["text"]``.
        dataset: Dataset key (first component of the relative path).
        _id: Identifier; converted to ``str`` and used as both recording id
            and supervision id.

    Returns:
        ``(recording, supervision, dataset)``, or ``(None, None, None)`` for
        ``"freetalk_old"``, which is skipped.

    Raises:
        AttributeError: If ``file_hslee`` (or, for ksponspeech, the relative
            path) does not match the expected pattern.
        KeyError: If ``dataset`` is not ksponspeech, zeroth, or a key of
            ``aihub_mapping``.
    """
    # NOTE(review): freetalk_old is skipped entirely; presumably its raw .PCM
    # audio cannot be opened by Recording.from_file — confirm.
    if dataset == "freetalk_old":
        return None, None, None

    _id = str(_id)
    # Derive the relative path (including the trailing dot, without the
    # extension) from the argument. The previous version silently read the
    # notebook-global `file`, which breaks as soon as the function is called
    # from anywhere but the loop that sets it (e.g. a process pool).
    file = re.search(r"(?<=\.\/db\/)[^\s]+\.(?=pcm|flac|wav|PCM)", file_hslee).group()

    if dataset == "ksponspeech":
        name = re.match(
            r"ksponspeech\/KsponSpeech_\d{2}(?:_train|_test)?\/(KsponSpeech_\d{4}\/KsponSpeech_\d{6}\.)",
            file,
        ).groups()[0]
        path = Path("/home/shahn/Datasets/hinas2/aihub/ksponspeech/Ksponall_wav") / f"{name}wav"
        if not path.exists():
            # Fall back to the directory holding wavs converted separately.
            path = Path("/home/shahn/Datasets/ksponspeech_wav_omitted") / f"{name}wav"
    elif dataset == "zeroth":
        path = Path("/home/shahn/Datasets") / f"{file}flac"
    else:
        hi14_path = aihub_mapping[dataset]
        # Strip the leading "<dataset>/" and append the wav extension.
        path = Path(f"/home/shahn/Datasets/hinas2/aihub/{hi14_path}") / f"{file[len(dataset)+1:]}wav"

    rec = Recording.from_file(path, recording_id=_id)
    # One supervision covering the whole recording.
    sup = SupervisionSegment(
        id=_id,
        recording_id=_id,
        text=text,
        start=0,
        duration=rec.duration,
        custom=dict(text=text_cjj),
    )
    return rec, sup, dataset

# Build recordings/supervisions for every training utterance, grouped by dataset.
# (A ProcessPoolExecutor variant submitting make_rec_sup per utterance was
# sketched here earlier; the loop currently runs sequentially.)
for file_hslee, text_hslee in tqdm(zip(filelist, textlist), total=len(filelist)):
    # Relative path after "./db/", including the trailing dot but not the extension.
    file = re.search(r"(?<=\.\/db\/)[^\s]+\.(?=pcm|flac|wav|PCM)", file_hslee).group()

    # Transcript lines look like "<utt-id> <text>"; skip lines without text.
    text_match = re.match(r"[^\s]+ (.+)", text_hslee)
    if text_match is None:
        tqdm.write(f"{text_hslee.rstrip()} {file}")
        continue
    text = text_match.groups()[0]

    try:
        text_cjj = tp(text)
    except Exception:
        tqdm.write(f"{text} {file}")
        # Skip the utterance. Without this, the jamo text of the previous
        # utterance would be attached to this one (or a NameError raised on
        # the very first line).
        continue

    dataset = file.split("/")[0]
    _id = dataset_count[dataset] + 1
    dataset_count[dataset] = _id

    # make_rec_sup returns (None, None, None) for datasets it skips.
    rec, sup, dataset = make_rec_sup(file_hslee, text, text_cjj, dataset, _id)
    if dataset is not None:
        recordings[dataset].append(rec)
        supervisions[dataset].append(sup)
        dataset_length[dataset] += rec.duration

for dataset in dataset_count.keys():
    print(f"{dataset} - Total {dataset_count[dataset]} files, {dataset_length[dataset]/60/60:.1f} hours")
for dataset in dataset_count.keys():
    print(f"\r{dataset}          ", end="", flush=True)
    rec = RecordingSet.from_recordings(recordings[dataset])
    sup = SupervisionSet.from_segments(supervisions[dataset])
    cutset = CutSet.from_manifests(recordings=rec, supervisions=sup)
    rec.to_file(f"data/manifests/{dataset}_recordings_train.jsonl.gz")
    sup.to_file(f"data/manifests/{dataset}_supervisions_train.jsonl.gz")
    cutset.to_file(f"data/fbank/{dataset}_cuts_train.jsonl.gz")

  0%|          | 0/8301964 [00:00<?, ?it/s]

근데 대부분 다 암 치료하는 환자들이 구십 몇 % 다 보니까 많이 아프신 분들 밖에 안 온단 말이야 ksponspeech/KsponSpeech_01/KsponSpeech_0088/KsponSpeech_087797.
근데 내가 너를 데리고 왔어 그럼 니가 파는 폰 파는 거에 대해서 %율을 달마다 받아 ksponspeech/KsponSpeech_02/KsponSpeech_0216/KsponSpeech_215401.
일을 그만뒀는데도 내가 폰을 팔았으니까 그 %율이 계속 들어와 달마다 ksponspeech/KsponSpeech_03/KsponSpeech_0285/KsponSpeech_284574.
그리고 너가 누굴 데리고 오잖아 그럼 그 %를 너도 받고 나도 받아 왜냐면 내가 데려온 애고 ksponspeech/KsponSpeech_04/KsponSpeech_0398/KsponSpeech_397184.
우대 몇 % 해주는데 여기는 ksponspeech/KsponSpeech_05_train/KsponSpeech_0502/KsponSpeech_501006.
그냥 뭔가 약간 그 불닭만의 감칠맛이 있잖아 근데 그게 안 느껴져 어 짜증 나는데 맛있다고 근데 또 약간 불닭 맛이 나긴 하는데 불닭에서 약간 몇 % 부족한 느낌이야 ksponspeech/KsponSpeech_05_train/KsponSpeech_0503/KsponSpeech_502173.
마이너스가 엄청나게 때려져면 거의 천 오백이 백 됐으니까 마이너스 몇 %냐 그게 ksponspeech/KsponSpeech_05_train/KsponSpeech_0543/KsponSpeech_542363.
트위터에서 그래서 막 몇% 돌파 몇% 돌파 뭐 첫째 날 이만큼 할 수 있는데 돌파 막 이렇게 꼐속 떴 떴었단 말이야 ksponspeech/KsponSpeech_05_train/KsponSpeech_0582/KsponSpeech_581483.
이번 달 특별 교육 있는지 알려 줘. command_kid/Training/KE0769/K

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



점심 반찬에 돈까스 있어? command_kid/Training/script1_af_0617/script1_af_0617-28543-03-02-ICH-M-02-C.
안내장 몇 개예요? command_kid/Training/script1_af_0617/script1_af_0617-28695-03-02-ICH-M-02-C.
할아버지가 사진 보고 싶어 하셔. command_kid/Training/script1_af_0617/script1_af_0617-28373-03-02-ICH-M-02-C.
웃긴 영상 켜 줘. command_kid/Training/script1_af_0617/script1_af_0617-28296-03-02-ICH-M-02-C.
이번 달 빨간 날 있어? command_kid/Training/script1_af_0617/script1_af_0617-28004-03-02-ICH-M-02-C.
목록에서 가장 많이 들은 동화책 제목 알려 줘. command_kid/Training/script1_af_0617/script1_af_0617-28333-03-02-ICH-M-02-C.
방금 게임 어때? command_kid/Training/script1_af_0617/script1_af_0617-28215-03-02-ICH-M-02-C.
요리 재료가 몇 개 들어갔어? command_kid/Training/script1_af_0617/script1_af_0617-28678-03-02-ICH-M-02-C.
준비물 살 데가 어딨지? command_kid/Training/script1_af_0617/script1_af_0617-28821-03-02-ICH-M-02-C.
스트레칭 가르쳐 줄 수 있어? command_kid/Training/script1_af_0617/script1_af_0617-28779-03-02-ICH-M-02-C.
어린이 추천 동화 들려줘. command_kid/Training/script1_af_0617/script1_af_0617-28936-0

# Create Test cuts

In [10]:
# Concatenate the ksponspeech and zeroth test lists (in that order for both
# the file list and the transcript list, so the two stay line-aligned).
# The encoding is given explicitly so Korean text does not depend on the locale.
filelist, textlist = [], []
for test_set in ("test_ksponspeech", "test_zeroth"):
    with open(f"/home/shahn/Datasets/KoreanASR/{test_set}/wav.scp", "r", encoding="utf-8") as f:
        filelist.extend(f.readlines())
    with open(f"/home/shahn/Datasets/KoreanASR/{test_set}/text", "r", encoding="utf-8") as f:
        textlist.extend(f.readlines())
assert len(filelist) == len(textlist), (len(filelist), len(textlist))

In [12]:
# Build test manifests for the wav copies of the zeroth test set.
recordings = defaultdict(list)
supervisions = defaultdict(list)
# padding = int(math.log10(len(filelist))) + 1
dataset_count = defaultdict(int)
dataset_length = defaultdict(int)

for file_hslee, text_hslee in tqdm(zip(filelist, textlist), total=len(filelist)):
    # Relative path after "./db/", including the trailing dot but not the extension.
    file = re.search(r"(?<=\.\/db\/)[^\s]+\.(?=pcm|flac|wav|PCM)", file_hslee).group()
    # Only zeroth utterances are kept; they are stored under the "zeroth_wav" key.
    # (The former ksponspeech / zeroth branches could never run after this
    # filter and have been removed.)
    if file.split("/")[0] != "zeroth":
        continue
    dataset = "zeroth_wav"
    _id = dataset_count[dataset] + 1
    dataset_count[dataset] = _id
    _id = f"{dataset}-{_id}"

    # Transcript lines look like "<utt-id> <text>"; skip lines without text.
    text_match = re.match(r"[^\s]+ (.+)", text_hslee)
    if text_match is None:
        tqdm.write(f"{text_hslee}{file}")
        continue
    # Replace runs of quotes / punctuation / spaces with a single space.
    text = re.sub("['\" .,?!…‘’“”<>\\`]+", " ", text_match.groups()[0]).strip()

    try:
        text_cjj = tp(text)
    except Exception:
        tqdm.write(f"{text} {file}")
        # Skip the utterance instead of reusing the previous one's jamo text.
        continue
    text_ipa = ipa_backend.phonemize([text], separator=ipa_separator, strip=False)[0]

    # <root>/<second path component>_wav/<file name>.wav
    parts = Path(file).parts
    path = Path("/home/shahn/Datasets/zeroth") / f"{parts[1]}_wav" / f"{parts[-1]}wav"
    rec = Recording.from_file(path, recording_id=_id)
    # One supervision covering the whole recording.
    sup = SupervisionSegment(
        id=_id,
        recording_id=_id,
        text=text,
        start=0,
        duration=rec.duration,
        custom=dict(
            cjj=text_cjj,
            ipa=text_ipa,
        ),
    )
    recordings[dataset].append(rec)
    supervisions[dataset].append(sup)
    dataset_length[dataset] += rec.duration

for dataset in dataset_count.keys():
    print(f"{dataset} - Total {dataset_count[dataset]} files, {dataset_length[dataset]/60/60:.1f} hours")
for dataset in dataset_count.keys():
    print(f"\r{dataset} saved          ", end="", flush=True)
    rec = RecordingSet.from_recordings(recordings[dataset])
    sup = SupervisionSet.from_segments(supervisions[dataset])
    cutset = CutSet.from_manifests(recordings=rec, supervisions=sup)
    rec.to_file(f"data/manifests/{dataset}_recordings_test.jsonl.gz")
    sup.to_file(f"data/manifests/{dataset}_supervisions_test.jsonl.gz")
    cutset.to_file(f"data/fbank/{dataset}_cuts_test.jsonl.gz")
print("\r                                         ", flush=True)

  0%|          | 0/1990 [00:00<?, ?it/s]

zeroth - Total 457 files, 1.2 hours
                                         


# ksponspeech train / dev / eval_clean / eval_other

In [4]:
# Collect recordings/supervisions for each official ksponspeech split from the
# "<path>.wav :: <text>" transcript files.
SPLITS = (
    "train",
    "dev",
    "eval_clean",
    "eval_other",
)

recordings = defaultdict(list)
supervisions = defaultdict(list)
dataset_count = defaultdict(int)
dataset_length = defaultdict(int)

for split in SPLITS:
    with open(f"/home/shahn/Datasets/hinas2/aihub/ksponspeech/{split}_wav.trn", encoding="utf-8") as f:
        lines = f.readlines()
    for line in tqdm(lines, desc=split):
        path_raw = re.match(r"[^\s]+\.wav(?= :: )", line).group()
        text_raw = re.search(r"(?<= :: ).+(?=[\n]?)", line).group()
        _id = dataset_count[split] + 1
        dataset_count[split] = _id
        _id = f"ksponspeech-{split}-{_id}"

        # Replace runs of quotes / punctuation / spaces with a single space.
        text = re.sub("['\" .,?!…‘’“”<>\\`]+", " ", text_raw).strip()

        try:
            text_cjj = tp(text)
        except Exception:
            # Log with this line's own path; the old message used `file`,
            # which is not defined in this cell.
            tqdm.write(f"{text} {path_raw}")
            continue
        text_ipa = ipa_backend.phonemize([text], separator=ipa_separator, strip=False)[0]

        # NOTE(review): audio is looked up under Datasets/aihub while the .trn
        # files are read from Datasets/hinas2/aihub — confirm both roots are intended.
        path = Path("/home/shahn/Datasets/aihub/ksponspeech") / path_raw
        if not path.exists():
            tqdm.write(f"{path_raw}")
            # Retry under the fallback root, dropping the first path component.
            # path_raw is a str, so it has to be wrapped in Path before .parts
            # can be used (the old code raised AttributeError here).
            path = Path("/home/shahn/Datasets/aihub/ksponspeech_wav_omitted").joinpath(*Path(path_raw).parts[1:])
            if not path.exists():
                tqdm.write(f"{path_raw}")
                continue
        rec = Recording.from_file(path, recording_id=_id)
        # One supervision covering the whole recording.
        sup = SupervisionSegment(
            id=_id,
            recording_id=_id,
            text=text,
            start=0,
            duration=rec.duration,
            custom=dict(
                cjj=text_cjj,
                ipa=text_ipa,
            ),
        )
        recordings[split].append(rec)
        supervisions[split].append(sup)
        dataset_length[split] += rec.duration

train:   0%|          | 0/610506 [00:00<?, ?it/s]

dev:   0%|          | 0/2504 [00:00<?, ?it/s]

eval_clean:   0%|          | 0/2974 [00:00<?, ?it/s]

eval_other:   0%|          | 0/2959 [00:00<?, ?it/s]

In [5]:
# Reload the full training lists (line-aligned file list and transcripts).
with open("/home/shahn/Datasets/KoreanASR/train/wav.scp", "r", encoding="utf-8") as f:
    filelist = f.readlines()
with open("/home/shahn/Datasets/KoreanASR/train/text", "r", encoding="utf-8") as f:
    textlist = f.readlines()


def _trn_stems(split):
    """Return the set of wav file stems listed in ``<split>_wav.trn``.

    Each line is expected to start with ``<path>.wav :: ``; a line that does
    not raises AttributeError.
    """
    with open(f"/home/shahn/Datasets/hinas2/aihub/ksponspeech/{split}_wav.trn", encoding="utf-8") as f:
        return {Path(re.match(r"[^\s]+\.wav(?= :: )", fw).group()).stem for fw in f}


# DT: stems already listed in the train split.
DT = _trn_stems("train")
# NDT: stems belonging to any of the held-out splits.
NDT = set()
for held_out_split in ("dev", "eval_clean", "eval_other"):
    NDT |= _trn_stems(held_out_split)

In [6]:
# Add ksponspeech utterances from the big training list that appear in none of
# the official split files (neither DT nor NDT) to the "train" split, then
# print a summary and write all split manifests.
for line, text_raw in tqdm(zip(filelist, textlist), total=len(filelist)):
    # Relative path after "./db/" without the extension. The alternation is
    # grouped so the dot applies to every extension; previously only ".pcm"
    # was stripped cleanly and other extensions left a trailing "." behind,
    # which made the stem lookups below miss.
    file = re.search(r"(?<=\.\/db\/)[^\s]+(?=\.(?:pcm|flac|wav|PCM))", line).group()
    dataset = file.split("/")[0]
    if dataset != "ksponspeech":
        continue
    stem = Path(file).stem
    # Already in train, or reserved for dev/eval: nothing to add.
    if stem in DT or stem in NDT:
        continue

    # Transcript lines look like "<utt-id> <text>"; skip lines without text.
    text_match = re.match(r"[^\s]+ (.+)", text_raw)
    if text_match is None:
        # Log the current line (the old message referenced `text_hslee`, a
        # name this loop never binds).
        tqdm.write(f"{text_raw.rstrip()} {file}")
        continue
    # Replace runs of quotes / punctuation / spaces with a single space.
    text = re.sub("['\" .,?!…‘’“”<>\\`]+", " ", text_match.groups()[0]).strip()

    try:
        text_cjj = tp(text)
    except Exception:
        tqdm.write(f"{text} {file}")
        continue
    text_ipa = ipa_backend.phonemize([text], separator=ipa_separator, strip=False)[0]

    name_match = re.match(
        r"ksponspeech\/KsponSpeech_\d{2}(?:_train|_test)?\/(KsponSpeech_\d{4}\/KsponSpeech_\d{6})",
        file,
    )
    if name_match is None:
        print(file)
        raise RuntimeError(f"Unexpected ksponspeech path: {file}")
    name = name_match.groups()[0]
    path = Path("/home/shahn/Datasets/aihub/ksponspeech/Ksponall_wav") / f"{name}.wav"
    assert path.exists(), path

    _id = dataset_count["train"] + 1
    dataset_count["train"] = _id
    _id = f"ksponspeech-train-{_id}"

    rec = Recording.from_file(path, recording_id=_id)
    # One supervision covering the whole recording.
    sup = SupervisionSegment(
        id=_id,
        recording_id=_id,
        text=text,
        start=0,
        duration=rec.duration,
        custom=dict(
            cjj=text_cjj,
            ipa=text_ipa,
        ),
    )
    recordings["train"].append(rec)
    supervisions["train"].append(sup)
    dataset_length["train"] += rec.duration

for split in dataset_count.keys():
    print(f"{split} - Total {dataset_count[split]} files, {dataset_length[split]/60/60:.1f} hours")
for split in dataset_count.keys():
    rec = RecordingSet.from_recordings(recordings[split])
    sup = SupervisionSet.from_segments(supervisions[split])
    cutset = CutSet.from_manifests(recordings=rec, supervisions=sup)
    rec.to_file(f"data/manifests/ksponspeech_recordings_{split}.jsonl.gz")
    sup.to_file(f"data/manifests/ksponspeech_supervisions_{split}.jsonl.gz")
    cutset.to_file(f"data/fbank/ksponspeech_cuts_{split}.jsonl.gz")
    print(f"\r{split} saved          ", end="", flush=True)
print("\r                                         ", flush=True)

  0%|          | 0/8301964 [00:00<?, ?it/s]

train - Total 612851 files, 948.2 hours
dev - Total 2504 files, 3.8 hours
eval_clean - Total 2974 files, 2.6 hours
eval_other - Total 2959 files, 3.7 hours
                                         


# Create missing ksponall_wav

In [18]:
# For every ksponspeech entry whose wav is missing from Ksponall_wav, read the
# corresponding .pcm as 16-bit integers and write it as a 16 kHz wav under the
# "omitted" directory.
# NOTE(review): sample rate and sample width are hard-coded here; confirm they
# match the source PCM files.
for file_hslee in tqdm(filelist, total=len(filelist)):
    # Relative path after "./db/", including the trailing dot but not the extension.
    file = re.search(r"(?<=\.\/db\/)[^\s]+\.(?=pcm|flac|wav|PCM)", file_hslee).group()
    dataset = file.split("/")[0]
    if dataset != "ksponspeech":
        continue
    search = re.match(
        r"ksponspeech\/KsponSpeech_\d{2}(?:_train|_test)?\/(KsponSpeech_\d{4}\/KsponSpeech_\d{6}\.)",
        file,
    )
    path = Path("/home/shahn/Datasets/hinas2/aihub/ksponspeech/Ksponall_wav") / f"{search.groups()[0]}wav"
    if path.exists():
        continue
    with open(f"/home/shahn/Datasets/hinas2/aihub/ksponspeech/Ksponall/{path.parent.name}/{path.stem}.pcm", "rb") as f:
        wav = np.frombuffer(f.read(), dtype=np.int16)
    d = Path("/home/shahn/Datasets/ksponspeech_wav_omitted") / path.parent.name
    # parents=True so the first run also creates the base output directory.
    d.mkdir(parents=True, exist_ok=True)
    sf.write(str(d / path.name), wav, 16_000)
    tqdm.write(f"{path.parent.name}/{path.name}")

  0%|          | 0/1533 [00:00<?, ?it/s]

KsponSpeech_0622/KsponSpeech_621513.wav
KsponSpeech_0622/KsponSpeech_621696.wav
KsponSpeech_0622/KsponSpeech_621951.wav
KsponSpeech_0622/KsponSpeech_621966.wav
KsponSpeech_0622/KsponSpeech_621976.wav
KsponSpeech_0622/KsponSpeech_621983.wav
KsponSpeech_0623/KsponSpeech_622045.wav
KsponSpeech_0623/KsponSpeech_622331.wav
KsponSpeech_0623/KsponSpeech_622475.wav


In [21]:
# Earlier attempts at opening the raw PCM through lhotse, kept for reference:
# Recording.from_file("sox -t raw -b 16 -e signed-integer -r 16000 /home/shahn/Datasets/hinas2/aihub/자유대화 음성(노인남녀)/Training/노인남녀_노인대화12_F_JSO00_62_수도권_녹음실/노인남녀_노인대화12_F_JSO00_62_수도권_녹음실_15404.PCM -t wav -b 16 -e signed-integer -r 16000 - remix 1 |")
# Recording.from_file("/home/shahn/Datasets/hinas2/aihub/자유대화 음성(노인남녀)/Training/노인남녀_노인대화12_F_JSO00_62_수도권_녹음실/노인남녀_노인대화12_F_JSO00_62_수도권_녹음실_15404.PCM")

# Interpret the file's bytes as 16-bit integers and play them back at 16 kHz.
wav = np.frombuffer(
    Path("/home/shahn/Datasets/hinas2/aihub/자유대화 음성(노인남녀)/Training/노인남녀_노인대화12_F_JSO00_62_수도권_녹음실/노인남녀_노인대화12_F_JSO00_62_수도권_녹음실_15404.PCM").read_bytes(),
    dtype=np.int16,
)
ipd.display(ipd.Audio(wav, rate=16_000))

# delete_it

In [7]:
import sentencepiece as spm

# Load the BPE model and show how two jamo-decomposed sample sentences are
# split into pieces.
sp = spm.SentencePieceProcessor()
sp.load("/home/shahn/Documents/icefall_github/egs/ksponspeech/ASR/data/lang_bpe_500/bpe.model")
texts = ["뭐하냐 바보야", "퇴근하고 싶다"]
texts_cjj = list(map(tp, texts))
sp.encode(texts_cjj, out_type=str)

[['▁ᄆㅝ', 'ᄒㅏ', 'ᄂㅑ', '▁ᄇㅏ', 'ᄇㅗ', 'ᄋㅑ'],
 ['▁', 'ᄐ', 'ㅚ', 'ᄀ', 'ㅡ', 'ᆫ', 'ᄒㅏᄀㅗ', '▁ᄉㅣᇁ', 'ᄃㅏ']]

# IPA

In [13]:
import re

# Runs of backslashes (with any surrounding spaces) that get collapsed to one space.
backslash_run = re.compile(r"( *\\+ *)+")

with open("/home/shahn/Documents/icefall_github/egs/ksponspeech/ASR/data/lang_bpe_500_ipa_kspon_freetalknor/transcript_words.txt", "r", encoding='utf-8') as f:
    lines = f.readlines()

# Step interactively through every transcript line containing a backslash,
# showing it before and after the clean-up; input() waits for Enter each time.
for line in tqdm(lines):
    if "\\" not in line:
        continue
    print(line)
    print(backslash_run.sub(" ", line))
    input()

  0%|          | 0/901805 [00:00<?, ?it/s]

과학자는 과학자지 그래서 연구를 \\ 왜 나는 사이코패스가 되지 않았을까\\ 를 가지고 연구를 한 거지

과학자는 과학자지 그래서 연구를 왜 나는 사이코패스가 되지 않았을까 를 가지고 연구를 한 거지



 


아 위기가 닥치니까 \\ 나는 뭐지\\ \\ 우린 어떻게 살았지\\ 돌아보게 됐다는 거지

아 위기가 닥치니까 나는 뭐지 우린 어떻게 살았지 돌아보게 됐다는 거지



 


요런 방석 한 두개 사서 구석에 놓아줘\\ 자기만의 고정된 영역이 있으면 좋거든

요런 방석 한 두개 사서 구석에 놓아줘 자기만의 고정된 영역이 있으면 좋거든



 


죄송해요\\ 저도 모르게 흥분해서 그럴 의도는 아니었어요

죄송해요 저도 모르게 흥분해서 그럴 의도는 아니었어요



 


그거 보고 어떤 사장님이 \\ 얘 너무 괜찮다\\ 그러고 채용할지

그거 보고 어떤 사장님이 얘 너무 괜찮다 그러고 채용할지



 


근데 브레이킹 배드\\ 랑 워킹 데드\\ 랑 은근 비슷하지 않아

근데 브레이킹 배드 랑 워킹 데드 랑 은근 비슷하지 않아



 


In [41]:
from phonemizer.backend import EspeakBackend
from phonemizer.separator import Separator
from tqdm.notebook import tqdm
import re

# Initialize the espeak backend with the 'ko' language code.
backend = EspeakBackend('ko')

# Both the phone and the word separator are passed as None.
separator = Separator(phone=None, word=None)

with open("/home/shahn/Documents/icefall_github/egs/ksponspeech/ASR/data/lang_bpe_500_ipa_filtered/transcript_words.txt", "r", encoding='utf-8') as f:
    lines = f.readlines()
# Write one IPA line per kept transcript line. Lines that are filtered out
# below are not written, so the output can be shorter than the input.
with open("/home/shahn/Documents/icefall_github/egs/ksponspeech/ASR/data/lang_bpe_500_ipa_filtered/transcript_words_ipa.txt", "w", encoding='utf-8') as f:
    letters = set()
    for line in tqdm(lines):
        # Collapse runs of backslashes (and surrounding spaces) into one space.
        line = re.sub(r"( *\\+ *)+", " ", line)

        # phonemize
        ipa = backend.phonemize([line], separator=separator, strip=True)[0]

        # Drop lines whose transcription contains "r" or "ɹ" (treated as
        # wrong pronunciations).
        if "r" in ipa or "ɹ" in ipa:
            continue

        # Remove the palatalization mark (ʲ) after "tʃh": there seems to be no
        # need to tell a palatalized affricate from a plain one.
        ipa = ipa.replace("tʃhʲ", "tʃh")

        # Map multi-character phones to single letters.
        ipa = ipa.replace("tɕ", "J").replace("dʑ", "J").replace("tʃh", "C").replace("kh", "K").replace("th", "T").replace("ph", "P")

        letters.update(ipa)
        f.write(f"{ipa}\n")
# Sorted so the printed symbol inventory is the same on every run (plain set
# iteration order for strings is not stable across interpreter sessions).
print("".join(sorted(letters)))

  0%|          | 0/901805 [00:00<?, ?it/s]

ijɐhuʌŋɯpJKoltdPTCwɛɡnɫqkbsmɾe


In [37]:
from phonemizer.backend import EspeakBackend
from phonemizer.separator import Separator
from tqdm.notebook import tqdm
import re

# Initialize the espeak backend with the 'ko' language code.
backend = EspeakBackend('ko')

# Phones are separated by a space and words by "/".
separator = Separator(phone=" ", word="/")

with open("/home/shahn/Documents/icefall_github/egs/ksponspeech/ASR/data/lang_bpe_500_ipa_filtered/transcript_words.txt", "r", encoding='utf-8') as f:
    lines = f.readlines()

# Scan the first 10000 lines and print every line/transcription pair that
# triggers one of the checks below (a pair is printed once per triggered check).
letters = set()
for line in tqdm(lines[:10000]):
    line = re.sub(r"( *\\+ *)+", " ", line)
    ipa = backend.phonemize([line], separator=separator, strip=True)[0]

    # Lines containing "r" or "ɹ" are ignored entirely.
    if "r" in ipa or "ɹ" in ipa:
        continue

    triggered = (
        "ŋɡ" in ipa,                                   # "ŋɡ" emitted as one token
        re.search("(?<!h)ʲ", ipa) is not None,          # ʲ not directly after "h"
        "tʃ" in ipa and "tʃ h" not in ipa,              # "tʃ" present, "tʃ h" absent
        re.search("tʃ (?!h)", ipa) is not None,         # "tʃ " not followed by "h"
    )
    for hit in triggered:
        if hit:
            print(line, end="")
            print(ipa, end="\n\n")

    # Collect every token of at most two characters.
    letters.update(
        token for token in ipa.replace("/", " ").split(" ") if len(token) <= 2
    )
print(letters)

  0%|          | 0/10000 [00:00<?, ?it/s]

나랑 같이 듣는 수업
n ɐ ɾ ɐ ŋɡ ɐ tɕ h i/d ɯ n n ɯ n/s u ʌ p

너랑 가고 싶다고 근데 오션월드 말구 쫌 따른 데로
n ʌ ɾ ɐ ŋɡ ɐ q o/s i p t ɐ q o/ɡ ɯ n d e/o s j ʌ n w ʌ ɫ d ɯ m ɐ ɫ q u/tɕ o m/t ɐ ɾ ɯ n/d e ɾ o

내 나랑 같이 다니는 형 있거든 그 형이 어 그 형만 봤어
n ɛ n ɐ ɾ ɐ ŋɡ ɐ tɕ h i/d ɐ n i n ɯ n/h j ʌ ŋ/i t k ʌ d ɯ n/ɡ ɯ/h j ʌ ŋ i/ʌ/ɡ ɯ/h j ʌ ŋ m ɐ n/p w ɐ s ʌ

너랑 같은 친구
n ʌ ɾ ɐ ŋɡ ɐ t h ɯ n/tʃ h i n q u

망상 가는 길목에 있어 아까 말했던 그 대진 그쪽에 있어
m ɐ ŋ s ɐ ŋɡ ɐ n ɯ n/ɡ i ɫ m o q e/i s ʌ/ɐ q ɐ m ɐ ɫ h ɛ t t ʌ n/ɡ ɯ/d ɛ dʑ i n/ɡ ɯ tɕ o q e/i s ʌ

{'ih', 'h', 'ew', 'ʌɐ', 'ot', 'es', 'th', 'ŋi', 'ep', 'qɡ', 'nw', 'kh', 'ɫd', 'iq', 'iɡ', 'ɐj', 'nɯ', 'ne', 'ʌɛ', 'l', 'ɐo', 'uo', 'd', 'ŋt', 'tɕ', 'nʌ', 'ud', 'et', 'ɫp', 'tʃ', 'ʌp', 'ɐh', 'it', 'ms', 'ɛɾ', 'mɡ', 'ji', 'i', 'j', 'ʌɯ', 'ɫs', 'oɡ', 'ɫj', 'u', 'ʌɾ', 'iw', 'pp', 'id', 'ʌw', 'ŋɡ', 'ɯ', 'tʌ', 'qu', 'qi', 'ɫh', 'ʌs', 'ɛq', 'ɐɐ', 'ʌj', 'ɫt', 'ej', 'pɡ', 'o', 'ɛh', 'ɐu', 'ip', 'ni', 'uʌ', 'nɡ', 'ʌd', 'nd', 't', 'uh', 'ɐʌ', 'w', 'eʌ', 'ŋɐ', 'nɐ', 'ɡ', 'md', 'n', 'qs', 'ij', 'qo', 'op', 'ŋh', 'ɐ

In [39]:
# Re-create the backend so that word-count mismatches are reported as warnings,
# and use explicit separators: space between phones, "." between words,
# "|" between syllables.
backend = EspeakBackend('ko', words_mismatch='warn')
separator = Separator(phone=" ", word=".", syllable='|')
# Only the value of the cell's last expression is displayed, so the result of
# this first call is not shown.
backend.phonemize(["툇마루 앉어"], separator=separator, strip=True)[0]
backend.phonemize(["좀 줘봐라"], separator=separator, strip=True)[0]

'tɕ o m.tɕ w ʌ b w ɐ ɾ ɐ'

In [56]:
# Sanity check of the negative lookahead: "(?!h)" only looks at the character
# immediately after "tʃ". Here that character is a space, so the pattern
# matches even though an "h" follows the space.
print(re.search("tʃ(?!h)", "aatʃ hʲ"))

<re.Match object; span=(2, 4), match='tʃ'>
