This notebook assumes you have installed Naomi, and Naomi created a Pocketsphinx directory at ~/.config/naomi/pocketsphinx and downloaded the standard en-US model.

It also assumes you have installed phonetisaurus using `pip install phonetisaurus`

You also need to have downloaded the VOSK 'en-US' language model https://alphacephei.com/vosk/models/vosk-model-en-us-0.22-compile.zip from https://alphacephei.com/vosk/lm and unzipped it into your ~/.config/naomi/vosk directory.

You will also need to have installed the sphinxbase-utils package for access to the sphinx_jsgf2fsg command.

In [1]:
import os
import phonetisaurus
import re

# Pocketsphinx

In [6]:
# Read in the dictionary as a struct
RE_WORDS = re.compile(
    r"^(?P<word>[a-zA-Z0-9'\.\-]+)(\(\d\))?\s+(?P<pronunciation>[a-zA-Z]+.*[a-zA-Z0-9])\s*$"
)
lexicon = {}
with open(os.path.expanduser('~/.config/naomi/pocketsphinx/standard/en-US/cmudict.dict'), 'r') as f:
    line = f.readline()
    while line:
        one = False
        for match in RE_WORDS.finditer(line):
            one = True
            if(match.group('word') == 'magic'):
                print(f"{match.group('word')} {match.group('pronunciation')}")
            try:
                lexicon[match.group('word')].append(match.group('pronunciation').split())
            except KeyError:
                lexicon[match.group('word')]=[match.group('pronunciation').split()]
        if(not one):
            print(f"Unmatched line: {line}")
        line = f.readline()
lexicon

magic M AE JH IH K
Unmatched line: mm M



{"'bout": [['B', 'AW', 'T']],
 "'cause": [['K', 'AH', 'Z']],
 "'course": [['K', 'AO', 'R', 'S']],
 "'cuse": [['K', 'Y', 'UW', 'Z']],
 "'em": [['AH', 'M']],
 "'frisco": [['F', 'R', 'IH', 'S', 'K', 'OW']],
 "'gain": [['G', 'EH', 'N']],
 "'kay": [['K', 'EY']],
 "'m": [['AH', 'M']],
 "'n": [['AH', 'N']],
 "'round": [['R', 'AW', 'N', 'D']],
 "'s": [['EH', 'S']],
 "'til": [['T', 'IH', 'L']],
 "'tis": [['T', 'IH', 'Z']],
 "'twas": [['T', 'W', 'AH', 'Z']],
 'a': [['AH'], ['EY']],
 "a's": [['EY', 'Z']],
 'a.': [['EY']],
 "a.'s": [['EY', 'Z']],
 'a.d.': [['EY', 'D', 'IY']],
 'a.m.': [['EY', 'EH', 'M']],
 'a.s': [['EY', 'Z']],
 'aaa': [['T', 'R', 'IH', 'P', 'AH', 'L', 'EY']],
 'aaberg': [['AA', 'B', 'ER', 'G']],
 'aachen': [['AA', 'K', 'AH', 'N']],
 'aachener': [['AA', 'K', 'AH', 'N', 'ER']],
 'aaker': [['AA', 'K', 'ER']],
 'aaliyah': [['AA', 'L', 'IY', 'AA']],
 'aalseth': [['AA', 'L', 'S', 'EH', 'TH']],
 'aamodt': [['AA', 'M', 'AH', 'T']],
 'aancor': [['AA', 'N', 'K', 'AO', 'R']],
 'aardema': [[

In [7]:
phonetisaurus.train(
    lexicon,
    model_path='test_dict.model'
)

[94mINFO:phonetisaurus-train:2022-10-31 10:36:47[0m:  Checking command configuration...
[94mDEBUG:phonetisaurus-train:2022-10-31 10:36:47[0m:  Directory /tmp/tmpgwkdqahh/train does not exist.  Trying to create.
[94mINFO:phonetisaurus-train:2022-10-31 10:36:47[0m:  Checking lexicon for reserved characters: '}', '|', '_'...
[94mDEBUG:phonetisaurus-train:2022-10-31 10:36:47[0m:  arpa_path:  train/model.o8.arpa
[94mDEBUG:phonetisaurus-train:2022-10-31 10:36:47[0m:  corpus_path:  train/model.corpus
[94mDEBUG:phonetisaurus-train:2022-10-31 10:36:47[0m:  dir_prefix:  train
[94mDEBUG:phonetisaurus-train:2022-10-31 10:36:47[0m:  grow:  False
[94mDEBUG:phonetisaurus-train:2022-10-31 10:36:47[0m:  lexicon_file:  /tmp/tmpqdpc1ump.txt
[94mDEBUG:phonetisaurus-train:2022-10-31 10:36:47[0m:  logger:  <Logger phonetisaurus-train (DEBUG)>
[94mDEBUG:phonetisaurus-train:2022-10-31 10:36:47[0m:  makeJointNgramCommand:  <bound method G2PModelTrainer._mitlm of <__main__.G2PModelTrainer ob

In [8]:
for prediction in phonetisaurus.predict(
    words=['magic','voice','magicvoice',"'bout"],
    model_path='test_dict.model',
    nbest=1
):
    print(prediction)

('magic', ['M', 'AE', 'JH', 'IH', 'K'])
('voice', ['V', 'OY', 'S'])
('magicvoice', ['M', 'AE', 'JH', 'IH', 'K', 'V', 'OY', 'S'])
("'bout", ['B', 'AW', 'T'])


# VOSK

In [2]:
# Generate a vocabulary dictionary based on an exported corpus
!text2wfreq < vosk_lm/corpus.txt | wfreq2vocab > vosk_lm/vocab

wfreq2vocab : Will generate a vocabulary containing the most
              frequent 20000 words. Reading wfreq stream from stdin...
text2wfreq : Reading text from standard input...
text2wfreq : Done.
wfreq2vocab : Done.


In [3]:
# read the VOSK dictionary
RE_WORDS = re.compile(
    r"^(?P<word>[\[!]?[a-zA-Z0-9'´\.\-āăáæćçčėèéęğģģìi̇łľňñńņøóöōőřŕšşśťţűứüūž&]+)[\]]?\s+(?P<pronunciation>[a-zA-Z0-9\`\{\@\s\:]+)\s*$"
)
lexicon = {}
with open(os.path.expanduser('~/.config/naomi/vosk/vosk-model-en-us-0.22-compile/db/en.dic')) as f:
    line = f.readline()
    while line:
        one = False
        for match in RE_WORDS.finditer(line):
            one = True
            if(match.group('word') == 'magic'):
                print(f"{match.group('word')} {match.group('pronunciation')}")
            try:
                lexicon[match.group('word')].append(match.group('pronunciation').split())
            except KeyError:
                lexicon[match.group('word')]=[match.group('pronunciation').split()]
        if(not one):
            print(f"Unmatched line: {line}")
        line = f.readline()
lexicon 

Unmatched line: ch��vez s i eI tS v i i z i

Unmatched line: copyright© k A p i r aI t

Unmatched line: high�tech eI tS aI dZ i eI tS t i i s i eI tS

Unmatched line: hospital�acquired eI tS oU E s p i aI t i eI E l eI s i k j u j u aI A r i d i

magic m { dZ I k

Unmatched line: pseudo�democracies p i E s i j u d i oU d i i E m oU s i A r eI s i aI i E s

Unmatched line: self�serving E s i E l E f E s i A r v i aI E n dZ i

Unmatched line: they�focus t i eI tS i w aI E f oU s i j u E s

Unmatched line: you¿re w aI oU j u A r i



{'!SIL': [['SIL']],
 "'bout": [['b', 'aU', 't']],
 "'cause": [['k', '@', 'z']],
 "'clock": [['k', 'l', 'A', 'k']],
 "'d": [['d', 'i']],
 "'m": [['m']],
 "'re": [['3`'], ['r']],
 "'s": [['E', 's'], ['I', 'z'], ['s'], ['z']],
 "'t": [['t']],
 "'til": [['t', 'I', 'l']],
 "'ve": [['v']],
 'a': [['@'], ['V'], ['eI']],
 'a&e': [['eI', '{', 'n', 'd', 'i']],
 'a&m': [['eI', '{', 'n', 'd', 'E', 'm']],
 "a''s": [['eI', 'z']],
 "a'body": [['eI', 'b', 'A', 'd', 'i']],
 "a'court": [['eI', 'k', 'O', 'r', 't']],
 "a'd": [['eI', 'd']],
 "a'gha": [['eI', 'g', '@']],
 "a'goin": [['eI', 'g', 'OI', 'n']],
 "a'isha": [['A', 'I', 'S', 'V']],
 "a'll": [['eI', 'l']],
 "a'm": [['eI', 'm']],
 "a'mighty": [['eI', 'm', 'aI', '4', 'i']],
 "a'mighty's": [['eI', 'm', 'aI', '4', 'i', 'z']],
 "a'most": [['eI', 'm', 'oU', 's', 't']],
 "a'n't": [['eI', '@', 'n', 't']],
 "a'penny": [['eI', 'p', 'E', 'n', 'i']],
 "a'ready": [['eI', 'r', 'i', 'd', 'i']],
 "a'right": [['eI', 'r', 'aI', 't']],
 "a'rony": [['eI', 'r', 'oU', '

In [15]:
# run through the vocab file we just generated above and add words to the standard dictionary as necessary
with open("vosk_lm/vocab", 'r') as f:
    line = f.readline().lower().strip()
    while line:
        # filter out comments and open vocabulary matches
        if line[:1] != '#' and line[:1] != '{':
            if line not in lexicon:
                print(f"Adding '{line}'")
                lexicon[line] = " ".join(
                    phonetisaurus.predict(
                        words=[line],
                        model_path=os.path.expanduser('~/.config/naomi/vosk/vosk-model-en-us-0.22-compile/db/en-g2p/en.fst')
                    )[1]
                )
        line = f.readline().lower().strip()

In [16]:
# generate a new dictionary with all the words
with open('vosk_lm/en.dict', 'w') as f:
    for word in lexicon:
        for pronunciation in lexicon[word]:
            f.write("{word} {pronunciation}\n".format(
                word=word,
                pronunciation=" ".join(pronunciation)
            ))


In [18]:
# Use the JSGF .gram files to generate an ARPA language model
!sphinx_jsgf2fsg -jsgf vosk_lm/default.gram -fsg vosk_lm/default.fsg

Current configuration:
[NAME]		[DEFLT]	[VALUE]
-compile	no	no
-fsg			vosk_lm/default.fsg
-fsm			
-help		no	no
-jsgf			vosk_lm/default.gram
-symtab			
-toprule		

INFO: jsgf.c(799): Importing <number.number> from vosk_lm/number.gram to default
INFO: jsgf.c(705): Defined rule: <number.g00000>
INFO: jsgf.c(705): Defined rule: <number.ones>
INFO: jsgf.c(705): Defined rule: <number.g00002>
INFO: jsgf.c(705): Defined rule: <number.one_to_ten>
INFO: jsgf.c(705): Defined rule: <number.g00004>
INFO: jsgf.c(705): Defined rule: <number.g00005>
INFO: jsgf.c(705): Defined rule: <number.oneteens>
INFO: jsgf.c(705): Defined rule: <number.g00007>
INFO: jsgf.c(705): Defined rule: <number.tens>
INFO: jsgf.c(705): Defined rule: <number.hundred>
INFO: jsgf.c(705): Defined rule: <number.g00010>
INFO: jsgf.c(705): Defined rule: <number.magnitude>
INFO: jsgf.c(705): Defined rule: <number.g00012>
INFO: jsgf.c(705): Defined rule: <number.trailing>
INFO: jsgf.c(705): Defined rule: <numb

In [11]:
for pronunciation in phonetisaurus.predict(
    words=['magicvoice'],
    model_path=os.path.expanduser('~/.config/naomi/vosk/vosk-model-en-us-0.22-compile/db/en-g2p/en.fst')
):
    print(" ".join(pronunciation[1]))


m { dZ I k v OI s
