In [1]:
# Clone the DeepForcedAligner repository from GitHub.
# git creates a 'DeepForcedAligner' directory under the current working directory.
!git clone https://github.com/as-ideas/DeepForcedAligner

Cloning into 'DeepForcedAligner'...
remote: Enumerating objects: 77, done.[K
remote: Counting objects: 100% (77/77), done.[K
remote: Compressing objects: 100% (59/59), done.[K
remote: Total 367 (delta 40), reused 44 (delta 16), pack-reused 290[K
Receiving objects: 100% (367/367), 3.03 MiB | 19.06 MiB/s, done.
Resolving deltas: 100% (232/232), done.


In [2]:
# Move into the cloned repo and install the packages listed in its requirements.txt.
# %cd (rather than !cd) changes the notebook's own working directory, so the
# relative paths used by later cells (config.yaml, train.py, ...) resolve inside the repo.
%cd DeepForcedAligner/  
!pip install -r requirements.txt

/content/DeepForcedAligner
Collecting librosa>=0.7.2
[?25l  Downloading https://files.pythonhosted.org/packages/26/4d/c22d8ca74ca2c13cd4ac430fa353954886104321877b65fa871939e78591/librosa-0.8.0.tar.gz (183kB)
[K     |████████████████████████████████| 184kB 6.4MB/s 
Collecting PyYAML>=5.1
[?25l  Downloading https://files.pythonhosted.org/packages/64/c2/b80047c7ac2478f9501676c988a5411ed5572f35d1beff9cae07d321512c/PyYAML-5.3.1.tar.gz (269kB)
[K     |████████████████████████████████| 276kB 12.7MB/s 
Collecting soundfile>=0.9.0
  Downloading https://files.pythonhosted.org/packages/eb/f2/3cbbbf3b96fb9fa91582c438b574cff3f45b29c772f94c400e2c99ef5db9/SoundFile-0.10.3.post1-py2.py3-none-any.whl
Collecting pooch>=1.0
[?25l  Downloading https://files.pythonhosted.org/packages/ce/11/d7a1dc8173a4085759710e69aae6e070d0d432db84013c7c343e4e522b76/pooch-1.2.0-py3-none-any.whl (47kB)
[K     |████████████████████████████████| 51kB 5.6MB/s 
Collecting appdirs
  Downloading https://files.pythonhosted.o

In [None]:
# Download and extract data (this may take a while)
!wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
!tar -xf LJSpeech-1.1.tar.bz2

--2020-10-30 12:03:20--  http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
Resolving data.keithito.com (data.keithito.com)... 174.138.79.61
Connecting to data.keithito.com (data.keithito.com)|174.138.79.61|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2748572632 (2.6G) [application/octet-stream]
Saving to: ‘LJSpeech-1.1.tar.bz2’


In [None]:
# Update config with paths and settings for speedup
from dfa.utils import read_config, save_config

config = read_config('config.yaml')
config['paths']['dataset_dir'] = 'LJSpeech-1.1'
config['paths']['metadata_path'] = 'LJSpeech-1.1/metadata.csv'
config['training']['epochs'] = 4         # for speedup
config['durations']['method'] = 'beam'   # for speedup
save_config(config, 'config.yaml')

# Preprocess data (tokenize text and convert wavs to mels)
!python preprocess.py --num_workers 2

In [None]:
# Start TensorBoard inside the notebook, reading logs from the dfa_checkpoints directory.
# NOTE(review): presumably train.py writes its logs there — confirm against config.yaml.
%load_ext tensorboard
%tensorboard --logdir dfa_checkpoints

In [None]:
# Train the speech-to-text model by running the repo's train.py script.
# NOTE(review): presumably it reads the settings saved to config.yaml earlier
# (e.g. the reduced epoch count) — confirm.
!python train.py

In [None]:
# Load the latest model and extract per-character durations (run with --num_workers 2).
# NOTE(review): the following cell reads output/durations/LJ001-0002.npy,
# presumably written by this script — confirm the output location.
!python extract_durations.py --num_workers 2

In [None]:
# Show the extracted duration of every character of one utterance,
# both in mel steps and in milliseconds.
import numpy as np

# Transcript of utterance LJ001-0002 and its extracted durations.
text = 'in being comparatively modern.'
durations = np.load('output/durations/LJ001-0002.npy')

# Length of a single mel step in milliseconds: hop_length samples at sample_rate Hz.
mel_step_ms = 1000. * config['audio']['hop_length'] / config['audio']['sample_rate']

print('ind    char     dur    dur in ms')
for idx, (char, n_steps) in enumerate(zip(text, durations)):
    dur_ms = n_steps * mel_step_ms
    print(f'{idx:#2}      {char}       {n_steps:#2}       {dur_ms:#.4}')


In [None]:
# Play back the original recording so the durations can be checked by ear.
import librosa
import IPython.display as ipd

# Audio settings from the config; hop_len is reused when cutting the wav later.
audio_cfg = config['audio']
hop_len = audio_cfg['hop_length']
sample_rate = audio_cfg['sample_rate']

# Load the wav at the sample rate given in the config.
wav, _ = librosa.load('LJSpeech-1.1/wavs/LJ001-0002.wav', sr=sample_rate)
ipd.Audio(wav, rate=sample_rate)


In [None]:
# Cut one word ("comparatively", characters 9..21 of `text`) out of the wav.
word_start, word_end = 9, 22   # half-open character range [9, 22)
print(text[word_start:word_end])

# char_time[i] is the sample offset at which character i starts: a zero is
# prepended to the durations (in mel steps) before the cumulative sum, and the
# result is scaled by hop_len (samples per mel step).
char_time = np.cumsum(np.pad(durations, (1, 0))) * hop_len

# The word ends where the character AFTER it starts, i.e. at char_time[22].
# Using char_time[21] (the start of the last character) would drop that
# character's whole duration from the clip.
wav_start, wav_end = char_time[word_start], char_time[word_end]
wav_cut = wav[wav_start: wav_end]
ipd.Audio(wav_cut, rate=sample_rate)