Skip to content

Commit

Permalink
Support Wikipron Standardization
Browse files Browse the repository at this point in the history
  • Loading branch information
w11wo committed Apr 2, 2024
1 parent b32b093 commit b96543e
Show file tree
Hide file tree
Showing 4 changed files with 753 additions and 12 deletions.
2 changes: 1 addition & 1 deletion lexikos/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .lexicon import Lexicon
from .g2p import G2p

__version__ = "0.0.1rc6"
__version__ = "0.0.1rc7"
__all__ = ["Lexicon", "G2p"]
126 changes: 116 additions & 10 deletions lexikos/lexicon.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,35 +16,46 @@
from pathlib import Path
from typing import Any, Dict, List, Set, Union
import os
import re


class Lexicon(UserDict):
def __init__(
self,
normalize_phonemes: bool = False,
include_synthetic: bool = False,
self, normalize_phonemes: bool = False, include_synthetic: bool = False, standardize_wikipron: bool = False
):
dictionaries_dir = Path(os.path.join(os.path.dirname(__file__), "dict"))
files = list(dictionaries_dir.rglob("*/*.tsv"))
synthetic_files = list(dictionaries_dir.rglob("synthetic/*.tsv"))
wikipron_files = list(dictionaries_dir.rglob("wikipron/*.tsv"))
if not include_synthetic:
files = filter(lambda x: x not in synthetic_files, files)
dicts = [self._parse_tsv(file, normalize_phonemes) for file in files]

if not standardize_wikipron:
dicts = [self._parse_tsv(file, normalize_phonemes) for file in files]
else:
dicts = [self._parse_tsv(file, normalize_phonemes) for file in files if file not in wikipron_files]
wikipron = [self._parse_tsv(file, normalize_phonemes, standardize_wikipron) for file in wikipron_files]
dicts += wikipron

mapping: Dict[str, Set[str]] = self._merge_dicts(dicts)
super().__init__(mapping)

def _parse_tsv(
self, file: Union[Path, str], normalize_phonemes: bool
self, file: Union[Path, str], normalize_phonemes: bool, standardize_wikipron: bool = False
) -> Dict[str, Set[str]]:
lex = {}
with open(file, "r") as f:
for line in f.readlines():
word, phonemes = line.strip().split("\t")
phonemes = phonemes.replace(" . ", " ")
if normalize_phonemes:
phonemes = self._normalize_phonemes(phonemes)
word, _phonemes = line.strip().split("\t")
word = word.lower()
lex[word] = lex.get(word, set()) | set([phonemes])
for phonemes in _phonemes.split(" ~ "):
phonemes = phonemes.replace(".", " ")
phonemes = re.sub("\s+", " ", phonemes)
if standardize_wikipron:
phonemes = self._standardize_wikipron_phonemes(phonemes)
elif normalize_phonemes:
phonemes = self._normalize_phonemes(phonemes)
lex[word] = lex.get(word, set()) | set([phonemes])
return lex

def _merge_dicts(self, dicts: List[Dict[Any, Set]]):
Expand All @@ -71,6 +82,101 @@ def _normalize_phonemes(phonemes: str) -> str:
phonemes = phonemes.strip()
return phonemes

@staticmethod
def _standardize_wikipron_phonemes(phonemes: str) -> str:
"""
Standardize pronunciation phonemes from Wiktionary.
Inspired by [Michael McAuliffe](https://mmcauliffe.medium.com/creating-english-ipa-dictionary-using-montreal-forced-aligner-2-0-242415dfee32).
"""
diacritics = ["ː", "ˑ", "̆", "̯", "͡", "‿", "͜", "̩", "ˈ", "ˌ", "↓"]
digraphs = {
"a i": "aɪ",
"a j": "aɪ",
"a u": "aʊ",
"a ɪ": "aɪ",
"a ɪ̯": "aɪ",
"a ʊ": "aʊ",
"a ʊ̯": "aʊ",
"d ʒ": "dʒ",
"e i": "eɪ",
"e ɪ": "eɪ",
"e ɪ̯": "eɪ",
"e ɪ̪": "eɪ",
"o i": "ɔɪ",
"o u": "oʊ",
"o w": "oʊ",
"o ɪ": "ɔɪ",
"o ʊ": "oʊ",
"o ʊ̯": "oʊ",
"t ʃ": "tʃ",
"ɑ ɪ": "aɪ",
"ɔ i": "ɔɪ",
"ɔ ɪ": "ɔɪ",
"ɔ ɪ̯": "ɔɪ",
}
consonants = {
"pʰ": "p",
"b̥": "b",
"tʰ": "t",
"d̥": "d",
"tʃʰ": "tʃ",
"d̥ʒ̊": "dʒ",
"kʰ": "k",
"ɡ̊": "ɡ",
"ɸ": "f",
"β": "v",
"v̥": "v",
"t̪": "θ",
"ð̥": "ð",
"d̪": "ð",
"z̥": "z",
"ʒ̊": "ʒ",
"ɦ": "h",
"ç": "h",
"x": "h",
"χ": "h",
"ɱ": "m",
"ɫ": "l",
"l̥": "l",
"ɫ̥": "l",
"ɤ": "l",
"ɹʷ": "ɹ",
"r": "ɹ",
"ɻ": "ɹ",
"ɹ̥ʷ": "ɹ",
"ɹ̥": "ɹ",
"ɾ̥": "ɹ",
"ɻ̊": "ɹ",
"ʍ": "w",
"h w": "w",
"ɜ ɹ": "ɚ",
}
vowels = {
"ɐ": "ʌ",
"ɒ": "ɔ",
"ɜ": "ə",
"ɵ": "oʊ",
"ɘ": "ə",
}
leftover_vowels = {
"a": "æ",
"o": "ɔ",
"e": "ɛ",
}
for i, j in digraphs.items():
phonemes = phonemes.replace(i, j)
for d in diacritics:
phonemes = phonemes.replace(d, "")
for i, j in consonants.items():
phonemes = phonemes.replace(i, j)
for i, j in vowels.items():
phonemes = phonemes.replace(i, j)
for i, j in leftover_vowels.items():
phonemes = " ".join([j if p == i else p for p in phonemes.split()])
phonemes = phonemes.strip()
phonemes = re.sub("\s+", " ", phonemes)
return phonemes


if __name__ == "__main__":
lexicon = Lexicon()
Expand Down
Loading

0 comments on commit b96543e

Please sign in to comment.