In [1]:
from glob import glob

# Gather every extracted JSON file. Sorting keeps the order deterministic, so
# a positional index into `files` refers to the same file on every run.
files = glob("./extracted/*.json")
files.sort()

In [2]:
import json

# Work on the second extracted file (index 1 of the sorted listing).
file = files[1]

# Use a context manager so the handle is closed deterministically, and pin the
# encoding: JSON is UTF-8, and the locale default must not decide how the
# phonetic symbols in it are decoded.
with open(file, encoding="utf-8") as fp:
    data = json.load(fp)

In [3]:
from lexikos import Lexicon

# Pronunciation lexicon. Further down it is indexed by word to obtain that
# word's known pronunciations; a KeyError there marks an out-of-vocabulary word.
# NOTE(review): normalize_phonemes=True presumably maps the lexicon's phoneme
# symbols onto a canonical inventory — confirm against the lexikos docs.
lexicon = Lexicon(normalize_phonemes=True)

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
from itertools import combinations

# One group of mutually substitutable phonemes per line, comma-separated.
# A symbol may appear in more than one group; substitutability is defined
# per group and is therefore not transitive across groups.
lines = """ʌ,ɑ,ɔ,ə
ɔ,o,oʊ
u,ʊ
æ,ɛ,e,eɪ
ɪ,i
s,ʃ,z,ʒ
θ,ð
ɹ,ɝ,ɚ,əɹ,r
f,v
n,ŋ
""".split()

# Every unordered pair within a group, as (first, second) tuples in group order.
# Only one orientation is stored, so lookups must test both (a, b) and (b, a).
substitution_pairs = [
    pair
    for group in lines
    for pair in combinations(group.split(","), 2)
]

In [5]:
def similarity_score(a, b):
    """Score a pair of phoneme symbols for alignment.

    Returns 1 when the two symbols are equal or form a substitution pair in
    either orientation, and 0 otherwise. `substitution_pairs` is looked up at
    call time, so re-running the cell that defines it takes effect immediately.
    """
    if a == b:
        return 1
    if (a, b) in substitution_pairs or (b, a) in substitution_pairs:
        return 1
    return 0

In [6]:
from Bio import pairwise2

def align(transcript, phoneme):
    """Find `phoneme` inside `transcript` and return the transcript's version of it.

    Runs `pairwise2.align.localcs` on the two sequences, scoring element pairs
    with `similarity_score`, using a gap-open penalty of -1 and a
    gap-extension penalty of 0.

    Args:
        transcript: sequence of phoneme tokens to search in.
        phoneme: sequence of phoneme tokens to look for.

    Returns:
        The tokens of the aligned transcript between the best alignment's
        start and end, joined by single spaces, when that alignment's score
        equals ``len(phoneme)``. ``None`` when no alignment is produced or the
        best score differs from ``len(phoneme)``.
    """
    candidates = pairwise2.align.localcs(
        transcript,
        phoneme,
        similarity_score,
        -1,  # gap-open penalty
        0,  # gap-extension penalty
        gap_char=["-"],  # the inputs are lists, so the gap marker is a list too
    )

    if not candidates:
        return None

    top = max(candidates, key=lambda candidate: candidate.score)

    # similarity_score yields at most 1 per aligned pair and opening a gap
    # costs 1, so only a score of len(phoneme) is accepted as a match.
    if top.score != len(phoneme):
        return None

    return " ".join(top.seqA[top.start : top.end])



In [7]:
from itertools import groupby
from tqdm import tqdm

# Collect, per in-vocabulary word, the pronunciation variants seen in the
# transcripts that the lexicon does not already list.
L = {}

for word, datum in tqdm(data.items()):
    try:
        pronunciations = lexicon[word]
    except KeyError:  # skip OOVs
        continue

    for transcript in datum:
        # Tokenise once per transcript rather than once per candidate pronunciation.
        transcript_phonemes = transcript.split()

        # try and match with one of the pronunciations
        for pron in pronunciations:
            matched_phoneme = align(transcript_phonemes, pron.split())
            # Reuse the pronunciations fetched above instead of hitting the
            # lexicon again inside the innermost loop.
            if not matched_phoneme or matched_phoneme in pronunciations:
                continue

            # Map unreleased stops to their plain counterparts.
            matched_phoneme = (
                matched_phoneme.replace("t̚", "t")
                .replace("p̚", "p")
                .replace("k̚", "k")
                .replace("d̚", "d")
                .replace("b̚", "b")
                .replace("ɡ̚", "ɡ")
            )
            # Collapse runs of the same phoneme into a single occurrence.
            matched_phoneme = " ".join(p for p, _ in groupby(matched_phoneme.split()))

            # Normalisation can turn the match into a form the lexicon already
            # has; check again so only new pronunciations are recorded.
            if matched_phoneme in pronunciations:
                continue

            L.setdefault(word, set()).add(matched_phoneme)
            break  # one variant per transcript is enough

# Freeze each word's variants into a sorted list (deterministic and JSON-serialisable).
L = {k: sorted(v) for k, v in L.items()}

100%|██████████| 57943/57943 [00:20<00:00, 2778.98it/s]


In [8]:
# Rebuild the mapping with its words in ascending order, then display it.
L = {word: L[word] for word in sorted(L)}
L

{'a': ['eɪ', 'oʊ', 'ɛ', 'ɛ i', 'ʌ'],
 'abbas': ['ə b ə s'],
 'abdominal': ['æ b d ə m ə n ə l'],
 'abidjan': ['æ b ə dʒ ə n', 'æ b ɪ dʒ ə n'],
 'absorption': ['ə b s ɔ ɹ p ʃ ə n'],
 'abutment': ['ə b ɔ t m ə n t'],
 'acadian': ['ɑ k eɪ d i ə n'],
 'achilles': ['ə k ɪ l i s'],
 'acid': ['ɛ s i d', 'ɛ s ɪ d'],
 'acquire': ['ə k w aɪ ɹ'],
 'acquiring': ['ə k w aɪ ɹ ɪ ŋ'],
 'acre': ['æ k ɹ'],
 'action': ['ɛ k ʃ ə n'],
 'activity': ['ɛ k t ɪ v ə t i'],
 'ada': ['eɪ d ə'],
 'add': ['ɛ d'],
 'added': ['æ d ʌ d'],
 'adding': ['æ d ɪ n'],
 'ade': ['æ d'],
 'adenine': ['ɛ d ə n i n'],
 'adjuncts': ['æ dʒ ʌ n k t s'],
 'admired': ['æ d m aɪ ɝ d'],
 'adolf': ['æ d ɑ l f'],
 'adrian': ['æ d ɹ i ə n'],
 'adult': ['ə d ə l t'],
 'adulthood': ['ə d ə l t h ʊ d'],
 'adults': ['ə d ə l t s'],
 'adventure': ['ə d v ɛ n tʃ ɝ'],
 'adventures': ['ə d v ɛ n tʃ ɝ z'],
 'adversarial': ['æ d v ɝ s ɛ ɹ i ə l'],
 'advice': ['æ d v aɪ z'],
 'aeon': ['i ə n'],
 'aerobatic': ['ɛ ɹ ə b æ t ɪ k'],
 'aerobatics': ['ɛ ɹ

In [9]:
from pathlib import Path

# Write the collected pronunciations as JSON under outputs/, named after the input file.
json_path = Path("outputs") / Path(file).name
# A fresh checkout may not have the output directory yet.
json_path.parent.mkdir(parents=True, exist_ok=True)

# ensure_ascii=False writes the phonetic symbols verbatim, so pin the encoding
# instead of relying on the locale default.
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(L, f, ensure_ascii=False, indent=2)

In [10]:
# Write the same data as a two-column TSV: one "word<TAB>pronunciation" row per variant.
tsv_path = Path("outputs") / Path(file).with_suffix(".tsv").name
# A fresh checkout may not have the output directory yet.
tsv_path.parent.mkdir(parents=True, exist_ok=True)

# The rows contain phonetic symbols, so pin the encoding instead of relying on
# the locale default.
with open(tsv_path, "w", encoding="utf-8") as f:
    for word, prons in L.items():
        for pron in prons:
            f.write(f"{word}\t{pron}\n")