# Setup Work Environment

Run the following cell to set up all the dependencies and the code. No changes to this cell should be required.

Don't forget to make sure a GPU is attached to your working environment. You can check this in `Runtime -> Manage sessions` (the word GPU should appear next to the notebook's name), and you can double-check in `Runtime -> Change runtime type` (GPU should be selected in the dropdown menu).

In [None]:
# Remove any previous checkout so that re-running this cell starts from scratch.
!rm -rf xib
# Install the third-party packages listed here.
!pip install pytrie enlighten colorlog inflection ipapy
# Fetch the xib project.
!git clone https://github.com/akurniawan/xib.git
# Clone dev_misc inside xib/, pin it to a fixed commit, and install it in editable mode.
!cd xib && git clone https://github.com/j-luo93/dev_misc.git && cd dev_misc && git checkout b44fde842a6311e03f731cd4e110dcd9fc394db7 && pip install -e .
# Install xib itself in editable mode.
!cd xib && pip install -e .

Collecting pytrie
[?25l  Downloading https://files.pythonhosted.org/packages/d3/19/15ec77ab9c85f7c36eb590d6ab7dd529f8c8516c0e2219f1a77a99d7ee77/PyTrie-0.4.0.tar.gz (95kB)
[K     |████████████████████████████████| 102kB 12.1MB/s 
[?25hCollecting enlighten
[?25l  Downloading https://files.pythonhosted.org/packages/2e/15/7a22630323eb816bd560bb2b60b98c9c829a3fb90f55d9d224f3aa4d7bf3/enlighten-1.10.1-py2.py3-none-any.whl (53kB)
[K     |████████████████████████████████| 61kB 8.8MB/s 
[?25hCollecting colorlog
  Downloading https://files.pythonhosted.org/packages/32/e6/e9ddc6fa1104fda718338b341e4b3dc31cd8039ab29e52fc73b508515361/colorlog-5.0.1-py2.py3-none-any.whl
Collecting inflection
  Downloading https://files.pythonhosted.org/packages/59/91/aa6bde563e0085a02a435aa99b49ef75b0a4b062635e606dab23ce18d720/inflection-0.5.1-py2.py3-none-any.whl
Collecting ipapy
  Downloading https://files.pythonhosted.org/packages/41/0d/7e8652df6af20a61bb3315f5c9d99fb9ea8f3779ff80fca9d71001230f90/ipapy-0.0.9

# Setup Dataset

There are 2 ways to setup your dataset:
1. Mount Gdrive to Colab environment. If you decided to go with this and would like to access the data directly in our `LCT Project` shared folder, you can follow instruction in https://stackoverflow.com/questions/54351852/accessing-shared-with-me-with-colab to load `Preprocessed file` folder in Shared google drive. Essentially you just have to go to the location of the folder, right click and choose `Add a shortcut to Drive`. After that you just have to run the cell below
2. Upload your dataset to `sample_data` folder in google colab environment. Be aware that you **will** lose your data in this folder when you restart colab's environment

Please remember that every word **must** be in alphabetical or IPA form, i.e. no numbers and no other non-alphabetical characters. Otherwise, the script will throw an error. If you're unsure whether your data is valid, run the last cell before the **Run Training** section to check whether all your vocabulary entries are valid.


In [None]:
import sys

sys.path.append("./xib/dev_misc")

In [None]:
import random
import re
from abc import ABC, abstractmethod
from collections import defaultdict
from dataclasses import dataclass
from itertools import zip_longest
from typing import Callable, ClassVar, Iterator, List, Optional, Sequence, TextIO, Tuple, Union
import pandas as pd
from collections import Counter

import numpy as np
import pandas as pd
import torch
from dev_misc import add_argument, g
from dev_misc.devlib import BT, LT
from dev_misc.utils import cached_property, deprecated
from ipapy.ipachar import (
    DG_C_MANNER,
    DG_C_PLACE,
    DG_C_VOICING,
    DG_DIACRITICS,
    DG_S_BREAK,
    DG_S_LENGTH,
    DG_S_STRESS,
    DG_T_CONTOUR,
    DG_T_GLOBAL,
    DG_T_LEVEL,
    DG_TYPES,
    DG_V_BACKNESS,
    DG_V_HEIGHT,
    DG_V_ROUNDNESS,
)
from ipapy.ipastring import IPAString
from tqdm import tqdm

# Tag ids used by Segment.gold_tag_seq: B tags the first unit of a word,
# I tags every following unit, and O tags all units of a word that is noise
# or whose length falls outside the configured min/max word length.
B, I, O = 0, 1, 2

# Register tqdm's progress-bar helpers (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()


# For every diacritic name: a mapping from a character that already carries
# that diacritic to its bare base letter. Combined with `dia2code` below to
# build `char2ipa_char` (base letter followed by the combining mark).
# NOTE(review): "ã" is listed under both "rising_falling" and "nasalized";
# both names share the same code in `dia2code`, so the result is identical.
# NOTE(review): `sub` looks characters up one code point at a time, so any key
# here that is stored as base letter + combining mark (more than one code
# point) can never be hit — confirm how these keys are encoded.
dia2char = {
    "low": {"à": "a", "è": "e", "ò": "o", "ì": "i", "ù": "u", "ѐ": "e", "ǹ": "n", "ỳ": "y"},
    "high": {
        "á": "a",
        "é": "e",
        "ó": "o",
        "ú": "u",
        "ý": "y",
        "í": "i",
        "ḿ": "m",
        "ĺ": "l",
        "ǿ": "ø",
        "ɔ́": "ɔ",
        "ɛ́": "ɛ",
        "ǽ": "æ",
        "ə́": "ə",
        "ŕ": "r",
        "ń": "n",
    },
    "rising_falling": {"ã": "a"},
    "falling": {"â": "a", "î": "i", "ê": "e", "û": "u", "ô": "o", "ŷ": "y", "ĵ": "j"},
    "rising": {"ǎ": "a", "ǐ": "i", "ǔ": "u", "ǒ": "o", "ě": "e"},
    "extra_short": {"ă": "a", "ĕ": "e", "ĭ": "i", "ŏ": "o", "ŭ": "u"},
    "nasalized": {"ĩ": "i", "ũ": "u", "ã": "a", "õ": "o", "ẽ": "e", "ṽ": "v", "ỹ": "y"},
    "breathy_voiced": {"ṳ": "u"},
    "creaky_voiced": {"a̰": "a", "ḭ": "i", "ḛ": "e", "ṵ": "u"},
    "centralized": {"ë": "e", "ä": "a", "ï": "i", "ö": "o", "ü": "u", "ÿ": "y"},
    "mid": {"ǣ": "æ", "ū": "u", "ī": "i", "ē": "e", "ā": "a", "ō": "o"},
    "voiceless": {"ḁ": "a"},
    "extra_high": {"ő": "o"},
    "extra_low": {"ȁ": "a"},
    "syllabic": {"ạ": "a", "ụ": "u"},
}

# Diacritic name -> decimal Unicode code point; the value is turned into a
# character with chr() when `char2ipa_char` is built (e.g. 768 == U+0300).
# "rising_falling" and "nasalized" share the same code (771).
# "high_rising" and "low_rising" have no entry in `dia2char`, so the loop that
# builds `char2ipa_char` never reads them.
dia2code = {
    "low": 768,
    "high": 769,
    "rising_falling": 771,
    "falling": 770,
    "rising": 780,
    "extra_short": 774,
    "nasalized": 771,
    "breathy_voiced": 804,
    "creaky_voiced": 816,
    "centralized": 776,
    "mid": 772,
    "voiceless": 805,
    "extra_high": 779,
    "extra_low": 783,
    "syllabic": 809,
    "high_rising": 7620,
    "low_rising": 7621,
}

# Flatten `dia2char` into one lookup table: character carrying a diacritic ->
# base letter followed by the combining mark registered for that diacritic
# in `dia2code`.
char2ipa_char = {
    marked_char: base_letter + chr(dia2code[diacritic])
    for diacritic, marked_to_base in dia2char.items()
    for marked_char, base_letter in marked_to_base.items()
}


# Characters that `clean` drops from a string before it is parsed as IPA.
to_remove = {
    "ᶢ",
    "̍",
    "-",
    "ⁿ",
    "ᵑ",
    "ᵐ",
    "ᶬ",
    ",",
    "ᵊ",
    "ˢ",
    "~",
    "͍",
    "ˣ",
    "ᵝ",
    "⁓",
    "˭",
    "ᵈ",
    "⁽",
    "⁾",
    "˔",
    "ᵇ",
    "+",
    "⁻",
}


def clean(s):
    """Return ``s`` without any character listed in ``to_remove``.

    A string that consists solely of the dotted-circle placeholder carrying a
    tilde is mapped to the empty string.
    """
    if s == "◌̃":
        return ""
    kept_chars = [ch for ch in s if ch not in to_remove]
    return "".join(kept_chars)


def sub(s):
    """Replace each character of ``s`` found in ``char2ipa_char``.

    Characters without an entry are kept unchanged. The lookup is done one
    character at a time.
    """
    pieces = []
    for ch in s:
        pieces.append(char2ipa_char.get(ch, ch))
    return "".join(pieces)


# Character replacements applied by `standardize` before `sub` and `clean` run.
# Some values are built with `sub` so that they use the same
# base-letter-plus-combining-mark form as `char2ipa_char`.
# NOTE(review): `standardize` looks characters up one at a time, so any key
# made of more than one code point can never match — confirm how the
# multi-mark keys below are encoded.
to_standardize = {
    "ˁ": "ˤ",
    "'": "ˈ",
    "?": "ʔ",
    "ṭ": "ʈ",
    "ḍ": "ɖ",
    "ṇ": "ɳ",
    "ṣ": "ʂ",
    "ḷ": "ɭ",
    ":": "ː",
    "ˇ": "̌",
    "ỵ": "y˞",
    "ọ": "o˞",
    "ř": "r̝",  # Czech
    "͈": "ː",  # Irish
    "ŕ̩": sub("ŕ") + "̩",  # Sanskrit
    "δ": "d",  # Greek
    "ń̩": sub("ń") + "̩",  # origin unsure
    "ε": "e",
    "X": "x",
    "ṍ": sub("õ") + chr(769),
    "ÿ̀": sub("ÿ") + chr(768),
    "∅": "ʏ",  # Norwegian
}


def standardize(s):
    """Map each character of ``s`` through ``to_standardize``.

    Characters without an entry are passed through untouched.
    """
    replaced = []
    for ch in s:
        replaced.append(to_standardize.get(ch, ch))
    return "".join(replaced)


def get_string(s: str) -> IPAString:
    """Build an ``IPAString`` from raw text.

    The text goes through ``standardize``, then ``sub``, then ``clean``
    (in that order) before it is handed to ``IPAString``.
    """
    standardized = standardize(s)
    substituted = sub(standardized)
    cleaned = clean(substituted)
    return IPAString(unicode_string=cleaned)


def get_dg_value(s: IPAString, dg) -> List:
    """Collect ``dg_value(dg)`` for every character in ``s.ipa_chars``, in order."""
    values = []
    for ipa_char in s.ipa_chars:
        values.append(ipa_char.dg_value(dg))
    return values


# Feature name used in this file -> ipapy descriptor-group constant.
# Segment._apply_all calls get_dg_value once per entry and stores the
# resulting list on the segment under the feature name.
name2dg = {
    "ptype": DG_TYPES,
    "c_voicing": DG_C_VOICING,
    "c_place": DG_C_PLACE,
    "c_manner": DG_C_MANNER,
    "v_height": DG_V_HEIGHT,
    "v_backness": DG_V_BACKNESS,
    "v_roundness": DG_V_ROUNDNESS,
    "diacritics": DG_DIACRITICS,
    "s_stress": DG_S_STRESS,
    "s_length": DG_S_LENGTH,
    "s_break": DG_S_BREAK,
    "t_level": DG_T_LEVEL,
    "t_contour": DG_T_CONTOUR,
    "t_global": DG_T_GLOBAL,
}



class BaseSegment(ABC):
    """Abstract interface for a word broken into (merged) IPA units.

    Subclasses provide the units via ``merged_ipa`` / ``segment_list``;
    equality and hashing are both defined on ``segment_list``.
    """

    has_gold_tag_seq: ClassVar[bool]

    # NOTE: an abstract `feat_matrix` property (returning LT) used to be
    # declared here; it is currently disabled.

    @abstractmethod
    def __len__(self):
        """Return the length of the segment."""

    @abstractmethod
    def __str__(self):
        """Return the textual form of the segment."""

    def __repr__(self):
        return f'{type(self).__name__}("{self}")'

    @abstractmethod
    def __getitem__(self, idx: int) -> str:
        """Return the (merged) unit stored at position ``idx``."""

    @property
    @abstractmethod
    def segment_list(self) -> List[str]:
        """The units of the segment, each one as a plain string."""

    @property
    @abstractmethod
    def merged_ipa(self) -> List[IPAString]:
        """The units of the segment, each one as an IPAString."""

    @cached_property
    def cv_list(self) -> List[str]:
        """Return, per unit, the concatenation of its consonant/vowel characters.

        Raises:
            ValueError: if a unit contains neither a consonant nor a vowel.
        """

        def consonants_and_vowels(ipa_unit) -> str:
            kept = [str(ch) for ch in ipa_unit if ch.is_vowel or ch.is_consonant]
            if not kept:
                raise ValueError("There is no consonant/vowel in this unit.")
            return "".join(kept)

        return [consonants_and_vowels(ipa_unit) for ipa_unit in self.merged_ipa]

    def __eq__(self, other):
        # Anything that is not a BaseSegment compares unequal.
        return isinstance(other, BaseSegment) and self.segment_list == other.segment_list

    def __hash__(self):
        # Kept consistent with __eq__: both are based on segment_list.
        return hash(tuple(self.segment_list))


class BaseSegmentWithGoldTagSeq(BaseSegment):
    """A ``BaseSegment`` that additionally exposes a gold tag sequence."""

    has_gold_tag_seq: ClassVar[bool] = True

    @property
    @abstractmethod
    def gold_tag_seq(self) -> LT:
        """Return the gold tag sequence of this segment."""

# Features that merge_ipa reads from the first character of every unit.
normal_feats = ["ptype", "c_voicing", "c_place", "c_manner", "v_height", "v_backness", "v_roundness"]
# Features that merge_ipa collects from the characters following the first one
# (those that are neither consonant nor vowel) and attaches to the unit.
feats_to_merge = ["diacritics", "s_stress", "s_length", "s_break", "t_level", "t_contour", "t_global"]

def de_none(s):
    """Return the string ``"none"`` for ``None``; return any other value as is."""
    if s is None:
        return "none"
    return s

# def indexify_ipa(col: str, lst: List) -> List:
#     cat_cls = Category.get_enum(col)
#     return [getattr(cat_cls, x.replace("-", "_").upper()).value.g_idx for x in lst]


class Segment(BaseSegmentWithGoldTagSeq):
    """A single word parsed into IPA and merged into units.

    A leading ``#`` on the raw token marks the word as noise; the marker is
    stripped before the token is parsed with ``get_string``.

    Raises:
        ValueError: if the token yields an empty IPA string, if its first
            character is neither a consonant nor a vowel, or if ``merge_ipa``
            rejects it.
    """

    def __init__(self, raw_token: str):
        self._raw_token = raw_token
        self.is_noise = raw_token.startswith("#")
        self.token = raw_token[1:] if self.is_noise else raw_token
        self.ipa = get_string(self.token)
        # Stays False until _merge() has populated datum_cols; it controls
        # where string lookups (seg["feat"]) read from.
        self._merged = False
        if len(self.ipa) == 0:
            raise ValueError("Invalid IPA string.")
        self._apply_all()
        self._merge()
        # self._indexify()

    @property
    def merged_ipa(self):
        """The merged units, as produced by ``merge_ipa``."""
        return self._merged_ipa

    def __len__(self):
        """Return the number of merged units."""
        return len(self._merged_ipa)

    @property
    def gold_tag_seq(self) -> LT:
        """Return one tag per unit as a ``torch.LongTensor``.

        All ``O`` when the word is noise or its length is outside
        ``[g.min_word_length, g.max_word_length]``; otherwise ``B`` followed
        by ``I`` for every remaining unit.
        """
        if self.is_noise or len(self) < g.min_word_length or len(self) > g.max_word_length:
            return torch.LongTensor([O] * len(self))
        else:
            return torch.LongTensor([B] + [I] * (len(self) - 1))

    @property
    def segment_list(self) -> List[str]:
        """The merged units, each rendered as a plain string."""
        return ["".join(map(str, unit)) for unit in self._merged_ipa]

    def permute(self) -> str:
        """Return the units joined (without separator) in a random order."""
        return "".join(random.sample(self.segment_list, len(self)))

    def __str__(self):
        # "#" is emitted only for noise words (bool * str); units are joined by "-".
        return "#" * self.is_noise + "-".join(self.segment_list)

    def _apply_all(self):
        """Store one list per feature in ``name2dg`` as an attribute of the same name.

        Raises:
            ValueError: if the first character is neither a consonant nor a vowel.
        """
        for name, dg in name2dg.items():
            setattr(self, name, get_dg_value(self.ipa, dg))
        if self.ptype[0] not in ["consonant", "vowel"]:
            raise ValueError("Invalid IPA string.")

    def __getitem__(self, feat_or_idx: Union[int, str]):
        """Return a unit for an integer index, or a feature list for a string key.

        Raises:
            KeyError: if a string key does not name a known feature.
        """
        if isinstance(feat_or_idx, str):
            return self._legacy_getitem(feat_or_idx)
        else:
            return self.segment_list[feat_or_idx]

    def _legacy_getitem(self, feat: str):
        """Look up the feature ``feat``.

        Before merging, features are read from the attributes set by
        ``_apply_all``; after merging they come from ``datum_cols`` (and from
        ``datum_inds`` when that attribute exists).

        Raises:
            KeyError: if ``feat`` is unknown, both before and after merging.
        """
        if not self._merged:
            try:
                return getattr(self, feat)
            except AttributeError:
                raise KeyError(f"Key {feat} not found.") from None

        if feat in self.datum_cols:
            return self.datum_cols[feat]
        # datum_inds is only created by _indexify(), which is currently
        # disabled. Treat its absence as "key not found" instead of leaking an
        # AttributeError to callers that expect KeyError.
        datum_inds = getattr(self, "datum_inds", None)
        if datum_inds is not None and feat in datum_inds:
            return datum_inds[feat]
        raise KeyError(f"Key {feat} not found.")

    def _merge(self):
        """Run ``merge_ipa`` and cache the merged units and per-unit feature columns.

        Raises:
            ValueError: if ``merge_ipa`` returns an empty result.
        """
        datum = merge_ipa(self, self.ipa, self.token)
        if not datum:
            raise ValueError("Invalid IPA string.")
        # merge_ipa returns [segment, ipa, merged_ipa, <one column per feature>...].
        self._merged_ipa = datum[2]
        self.datum_cols = {feat: datum[3 + i] for i, feat in enumerate(normal_feats + feats_to_merge)}
        self._merged = True

    # Disabled: depends on indexify_ipa, which is itself commented out.
    # def _indexify(self):
    #     self.datum_inds = {f"{feat}_idx": indexify_ipa(feat, value) for feat, value in self.datum_cols.items()}


def merge_ipa(s: Union[pd.Series, Segment], ipa: IPAString, segment: str) -> List:
    """Group the characters of ``ipa`` into units and collect per-unit features.

    A unit starts at the current character and absorbs every following
    character whose ``ptype`` is neither ``"consonant"`` nor ``"vowel"``.
    The features in ``normal_feats`` are taken from the first character of the
    unit; the features in ``feats_to_merge`` are taken from the absorbed
    characters. ``None`` values are stored as ``"none"`` (see ``de_none``).

    Args:
        s: Object indexable by feature name, returning one value per character.
        ipa: The IPA string being merged; it is sliced to build the units.
        segment: The original token, returned unchanged as the first element.

    Returns:
        ``[segment, ipa, merged_ipa, *columns]`` with one column per feature in
        ``normal_feats + feats_to_merge``, or an empty list if two absorbed
        characters supply a value for the same feature of one unit.
    """
    all_feats = normal_feats + feats_to_merge
    datum_cols = {feat: [] for feat in all_feats}
    merged_ipa = []
    ptypes = s["ptype"]
    num_chars = len(ptypes)
    keep = True

    i = 0
    while i < num_chars:
        # Get ptype and normal features first.
        for feat in normal_feats:
            datum_cols[feat].append(de_none(s[feat][i]))

        # Try to merge characters if needed.
        j = i + 1
        merged_values = dict.fromkeys(feats_to_merge)
        while j < num_chars and ptypes[j] not in ("consonant", "vowel"):
            # Attach j-th char to i-th.
            for feat in feats_to_merge:
                value = s[feat][j]
                if value is None:
                    continue
                if merged_values[feat] is None:
                    merged_values[feat] = value
                else:
                    # Explicit check instead of assert + bare except, so the
                    # conflict is still detected when asserts are stripped
                    # (python -O). The first value is kept, the word rejected.
                    print("error in:", ipa)
                    keep = False
            j += 1

        merged_ipa.append(ipa[i:j])
        for feat in feats_to_merge:
            datum_cols[feat].append(de_none(merged_values[feat]))
        i = j

    if not keep:
        return []
    return [segment, ipa, merged_ipa] + [datum_cols[feat] for feat in all_feats]


  from collections import MutableSequence
  from pandas import Panel


In [None]:
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

In [None]:
# Set your dataset path here
KNOWN_LANG_PATH = "/content/sample_data/es.txtfix.txt"
UNKNOWN_LANG_PATH = "/content/sample_data/voynich_unambiguous.txt"

## Dataset check

Run the following check to ensure that both the known and the unknown language files follow the input format accepted by the script.

In [None]:
def count_invalid_words(path):
    """Try to build a Segment from every line of ``path``.

    Each line that fails is printed; the number of failing lines is returned.
    """
    invalid = 0
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            word = line.strip()
            try:
                Segment(word)
            except Exception:
                # Deliberately broad: whatever the reason, the word cannot be
                # used as input, so report it and keep checking the rest.
                print("error in:", word)
                invalid += 1
    return invalid


print("Check KNOWN Language file format")
print("Total error in known language:", count_invalid_words(KNOWN_LANG_PATH))


print("\nCheck UNKNOWN_LANG_PATH Language file format")
# Counted separately: the unknown-language total must not reuse the
# known-language counter.
print("Total word error in unknown language:", count_invalid_words(UNKNOWN_LANG_PATH))

Check KNOWN Language file format
0

Check UNKNOWN_LANG_PATH Language file format


# Run Training

As of now, the training runs indefinitely, and we are not sure whether changing this would affect the rest of the code. For that reason, we need to stop the training manually once the losses are no longer improving. To know when to stop, monitor the `ll` value that the script prints in a table with the following format (i.e. stop when `ll` is close to zero):

```
+----------------------------------------+  
|                  3_8                   |  
+-----------+----------+--------+--------+  
| name      | value    | weight | mean   |  
+-----------+----------+--------+--------+  
| grad_norm | 58.492   | 60     | 0.975  |  
| ll        | -336.246 | 60     | -5.604 |  
| reg       | 0.555    | 60     | 0.009  |  
+-----------+----------+--------+--------+
```

The other way to know when to stop the training is to also monitor the output of the model validation. Go to the next section to see how to analyze the result.

In [None]:
# Total number of phonetic feature groups
# Total number of phonetic feature groups (passed as --num_feature_groups).
NUM_FEATURE_GROUPS = 7

# Total number of phonetic features (passed as --num_features).
NUM_FEATURES = 140

# Initial threshold used to decide whether two words match (passed as
# --init_threshold). The larger the value, the more false positives we get;
# if the value is too low, the model will not output anything.
THRESHOLD = 6

# Cost of an insertion or deletion in the edit-distance algorithm (passed as
# --init_ins_del_cost); refer to the paper for more details.
INS_DEL_COST = 3.5

# Learning rate for the Adam optimizer (passed as --learning_rate).
LR = 0.002

# Number of training steps to run between two evaluation runs.
EVAL_INTERVAL = 500

In [None]:
!rm -rf log

In [None]:
# The dataset paths are quoted so that locations containing spaces (e.g. a
# mounted Drive folder) are passed as a single argument, and the evaluation
# interval comes from EVAL_INTERVAL instead of a hard-coded value.
# NOTE(review): `PYTHONPATH=/content/xib && ...` only sets a shell variable; it
# is not exported to the python process. It is left unchanged because xib and
# dev_misc are already pip-installed in editable mode — confirm before changing.
!PYTHONPATH=/content/xib && /usr/local/bin/python -m xib.main --task extract \
  --vocab_path "{KNOWN_LANG_PATH}" --data_path "{UNKNOWN_LANG_PATH}" \
  --dim 112 --min_word_length 1 --max_word_length 10 --input_format text \
  --dense_input --eval_interval {EVAL_INTERVAL} \
  --char_per_batch 16 --gpus 0 \
  --num_feature_groups {NUM_FEATURE_GROUPS} --num_features {NUM_FEATURES} \
  --init_threshold {THRESHOLD} --init_ins_del_cost {INS_DEL_COST} --learning_rate {LR}

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
                                                            +-----------+---------+--------+--------+
                                                            | grad_norm | 4.55    | 4      | 1.137  |
                                                            | ll        | -34.186 | 4      | -8.547 |
                                                            | reg       | 17.09   | 4      | 4.272  |
                                                            +-----------+---------+--------+--------+[0m
[Kcheck 100%|| 2/2 [00:00<00:00, 21.27/s][1;1H[1;1H[32mINFO - 06/26/21 10:20:55 - 1:00:58 at base_trainer.py:137 - +--------------------------------------+
                                                            |                553_8                 |
                                                            +-----------+---------+--------+-------+
                                                           

# Analyzing The Result

The result will be in the following file `log/<DATE>/default/<TIME>/predictions/extract.<EPOCH>_<STEPS>.tsv`

For some reason, google colab won't show anything under `log` folder, so I would suggest to analyze it via `cat` command or download the result to your local computer and analyze it from there.

The `tsv` file consists of 4 columns: `segment`, `ground_truth`, `prediction`, and `matched_segment`. From my understanding, `segment` is the original segment of the unknown language; `ground_truth` is similar to `segment` but includes the exact index locations; `prediction` is the predicted vocabulary entry in the known language; and `matched_segment` tells which part of the unknown-language segment matches the vocabulary entry in the known language. For more details, look at the code in `evaluator.py`, line 257.


In [None]:
# You can run the following command to get the list of experiment dates
!ls -R log/

log/:
2021-06-26

log/2021-06-26:
default

log/2021-06-26/default:
09-19-58

log/2021-06-26/default/09-19-58:
log		     saved.29_10.latest   saved.499_10.latest
predictions	     saved.299_10.best	  saved.509_10.latest
saved.109_10.best    saved.299_10.latest  saved.519_10.latest
saved.109_10.latest  saved.309_10.best	  saved.529_10.latest
saved.119_10.best    saved.309_10.latest  saved.539_10.latest
saved.119_10.latest  saved.319_10.best	  saved.549_10.latest
saved.129_10.best    saved.319_10.latest  saved.559_10.latest
saved.129_10.latest  saved.329_10.best	  saved.569_10.latest
saved.139_10.best    saved.329_10.latest  saved.579_10.latest
saved.139_10.latest  saved.339_10.best	  saved.589_10.latest
saved.149_10.latest  saved.339_10.latest  saved.59_10.best
saved.159_10.latest  saved.349_10.best	  saved.59_10.latest
saved.169_10.latest  saved.349_10.latest  saved.599_10.latest
saved.179_10.best    saved.359_10.latest  saved.609_10.latest
saved.179_10.latest  saved.369_10.latest  saved

In [None]:
# Or you can run the following command to download the result to your local environment
from google.colab import files

files.download("log/2021-06-26/default/09-19-58/predictions/extract.479_10.tsv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Finding the lowest error

Run the following cell to find which files have the lowest error

In [None]:
import re

log = '/content/log/2021-06-26/default/09-19-58/log'

def parse(fname):
    """Return up to 20 ``(name, ll)`` pairs from the log, ordered by ``|ll|`` ascending.

    ``name`` is a table title matched by the pattern below (digits ending in
    at least one ``9``, followed by ``_10``) and ``ll`` is the value printed
    on the ``ll`` row five lines further down. Both are returned as strings.
    """
    with open(fname) as f:
        data = f.read()
    pattern = re.compile(r'([0-9]*[9]+_10).+\n.+\n.+\n.+\n.+\n.+ll +\| (-\d+\.\d+|\d+\.\d+)')
    pairs = pattern.findall(data)
    pairs.sort(key=lambda pair: abs(float(pair[1])))
    return pairs[:20]

parse(log)

[('479_10', '-22.614'),
 ('639_10', '-24.322'),
 ('339_10', '-25.428'),
 ('299_10', '-27.639'),
 ('549_10', '-28.186'),
 ('409_10', '-28.631'),
 ('139_10', '-29.142'),
 ('569_10', '-29.223'),
 ('279_10', '-29.355'),
 ('489_10', '-30.189'),
 ('149_10', '-30.686'),
 ('49_10', '-30.694'),
 ('439_10', '-30.979'),
 ('519_10', '-31.374'),
 ('129_10', '-31.812'),
 ('179_10', '-31.956'),
 ('319_10', '-32.024'),
 ('309_10', '-32.119'),
 ('579_10', '-32.438'),
 ('389_10', '-32.764')]

## Matches analysis

Run the following cell to get the word with the longest match as well as total match from the known language and the unknown language

In [None]:
df = pd.read_csv('log/2021-06-26/default/09-19-58/predictions/extract.479_10.tsv', delimiter='\t')
def analyze_results(df):
    """Print and return match statistics for a predictions table.

    A row counts as matched when it survives ``df.dropna()``, i.e. when none
    of its columns is missing. The columns ``segment``, ``prediction`` and
    ``matched_segment`` are read.

    Args:
        df: DataFrame loaded from an ``extract.*.tsv`` predictions file.

    Returns:
        A tuple ``(pred_match, match_num, pred_num, match_pred)``:
        ``pred_match`` maps each prediction to a matched segment (the last row
        wins when a prediction occurs more than once); ``match_num`` and
        ``pred_num`` are Counters of matched segments and predictions;
        ``match_pred`` maps each matched segment to the list of ``segment``
        values of the rows it appears in.
    """
    total = len(df)
    matched = df.dropna()
    hits = len(matched)

    # An empty table has no meaningful ratio; report 0 instead of dividing by zero.
    percentage = hits / total * 100 if total else 0.0
    print('Matched ', percentage, ' %, i.e.', hits, '/', total)

    predictions = matched['prediction'].values
    matched_segments = matched['matched_segment'].values

    # prediction -> matched_segment (1:1, later rows overwrite earlier ones).
    pred_match = dict(zip(predictions, matched_segments))

    # Occurrence counts of the matched segments and of the predictions.
    match_num = Counter(matched_segments)
    pred_num = Counter(predictions)

    # matched_segment -> [segment, ...], built in a single pass over the rows
    # instead of re-filtering the frame once per distinct matched segment.
    match_pred = {}
    for match, seg in zip(matched_segments, matched['segment']):
        match_pred.setdefault(match, []).append(seg)

    print('Number of unique matches: ', len(match_num))
    if match_num:
        # max() returns the first of the longest entries, like a stable
        # descending sort followed by [0].
        longest_match = max(match_num, key=len).replace('-', '')
        print('Longest match: ', longest_match, ', lenght: ', len(longest_match))
    else:
        print('Longest match: none')
    return pred_match, match_num, pred_num, match_pred

analyze_results(df)

Matched  16.329804299005453  %, i.e. 1018 / 6234
Number of unique matches:  535
Longest match:  pɾeskɾit , lenght:  8


({'o-l-a-l-o-r:0:5': 'a-n-i-m-a-l',
  'y-d-y:0:2': 't-e-s-t',
  'o-k-a:0:2': 'a-b-i',
  'a-l-y:0:2': 'i-m-i-t',
  'o-q-o:0:2': 'a-t͡ʃ-a-k',
  'k-a-m:0:2': 'b-i-ʝ',
  'y-s-h-o-l:0:4': 't-ɾ-a-m',
  'k-e-d-a-r:0:4': 'b-e-s-i-n',
  'c-h-y:0:2': 'o-l-t',
  'a-r-y:0:2': 'o-l-t',
  'o-t-l:0:2': 'a-m',
  'a-i-n:0:2': 'ɾ-i-ɡ-t͡s',
  'o-l-k-c-h-o:0:5': 'a-n-d-o-ɾ',
  'l-c-h-d-y:0:4': 'm-o-s-t',
  'a-m-o-d:0:3': 'u-ɲ-a-s',
  's-a-i-l:0:3': 'p-u-ɡ-n',
  'o-l-t-e-y:0:4': 'a-m-b-i-t',
  'l-c-h-o-r:0:4': 'm-o-ɾ-a-l',
  'o-k-e-e-d:0:4': 'a-b-e-s',
  'o-e-e-s-y:0:4': 'a-e-k-t',
  's-o-l-d-a-m:0:5': 'k-a-n-s-i-ʝ',
  's-h-e-k-e-d-y:0:6': 'p-ɾ-e-b-i-s-t',
  'q-o-k-o-l:0:4': 'a-d-a-m',
  'c-h-k-o-r:0:4': 'o-ɾ-b-a-n',
  'o-k-e-o-l:0:4': 'a-d-a-m',
  'o-k-o-h-y:0:4': 'a-b-a-ɾ-k',
  'y-c-h-k-y:0:4': 'k-o-ɾ-t',
  'y-cˈ-t-h-y:0:4': 'k-o-ɾ-t',
  'k-o-d-a-r:0:4': 'b-a-s-i-l',
  'a-i-r-o-d:0:4': 'e-n-l-a-s',
  'o-s-h:0:2': 'a-ɾ',
  'o-l-l:0:2': 'a-n-n',
  'q-o-k:0:2': 't͡ʃ-a-d',
  'a-k-c-h-y:0:4': 'a-b-o-ɾ-t',
  '