In [None]:
!pip install deep-phonemizer

Collecting deep-phonemizer
  Downloading https://files.pythonhosted.org/packages/71/87/1aff530cdb3a2c5d7fa7166b3364b539a76cfa29a041371232ff6a4881b6/deep-phonemizer-0.0.8.tar.gz
Collecting PyYAML>=5.1
[?25l  Downloading https://files.pythonhosted.org/packages/7a/a5/393c087efdc78091afa2af9f1378762f9821c9c1d7a22c5753fb5ac5f97a/PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636kB)
[K     |████████████████████████████████| 645kB 5.9MB/s 
[?25hCollecting transformers>=2.2.2
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 19.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 28.3MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l 

In [None]:
# Dowload and prepare an exampla dataset
!wget https://raw.githubusercontent.com/CUNY-CL/wikipron/master/data/scrape/tsv/eng_latn_us_broad.tsv

with open('eng_latn_us_broad.tsv', 'r', encoding='utf-8') as f:
  lines = f.readlines()

# Prepare data as tuples (lang, word, phoneme)
lines = [l.replace(' ', '').replace('\n', '') for l in lines]
splits = [l.split('\t') for l in lines]
train_data = [('en_us', s[0], s[1]) for s in splits if len(s)==2]

for d in train_data[:10000:1000]:
  print(d)


In [None]:
# Read standard config and adjust some params for speedup
from dp.utils.io import read_config, save_config
import dp
import os

# Locate the forward-model config that ships inside the installed dp package.
config_file = f'{os.path.dirname(dp.__file__)}/configs/forward_config.yaml'
config = read_config(config_file)

# Override the training settings in one step, then write the adjusted config
# to config.yaml in the working directory.
config['training'].update({
  'epochs': 10,
  'warmup_steps': 100,
  'generate_steps': 500,
  'validate_steps': 500,
})
save_config(config, 'config.yaml')

# Show every top-level config entry on its own line.
for key, value in config.items():
  print(f'{key} {value}')



In [None]:
# Load the TensorBoard notebook extension and open it on /content/checkpoints,
# the same directory the later cell loads best_model.pt from.
%load_ext tensorboard
%tensorboard --logdir /content/checkpoints

In [None]:
from dp.preprocess import preprocess
from dp.train import train

# Both steps read the adjusted config written earlier as config.yaml.
config_path = 'config.yaml'

# Preprocess the (lang, word, phoneme) tuples, then start training.
preprocess(config_file=config_path, train_data=train_data)
train(config_file=config_path)

In [None]:
# Load phonemizer (including the training data dictionary) and use it
from dp.phonemizer import Phonemizer

# Restore the phonemizer from the saved checkpoint and run it on one sentence.
checkpoint_path = '/content/checkpoints/best_model.pt'
sample_text = 'Phonemizing an English text is imposimpable!'

phonemizer = Phonemizer.from_checkpoint(checkpoint_path)
result = phonemizer(sample_text, lang='en_us')

print(result)

In [None]:
# Phonemize a list of texts and pull out model predictions with confidence scores
texts = ['Phonemizing an US-English text is imposimpable!']
result = phonemizer.phonemise_list(texts, lang='en_us')

# result.predictions is iterated as (word, prediction) pairs; each prediction
# exposes the predicted phonemes and a confidence value.
for token, prediction in result.predictions.items():
  print(f'{token} {prediction.phonemes} {prediction.confidence}')