# One-Shot Learning Example

The Jupyter Notebook should be launched in the folder **notebooks**.

In [1]:
import os
os.chdir('../src')
from osms.common.multispeaker import MultispeakerManager
import torch
import yaml
import warnings
warnings.filterwarnings("ignore")


Create a 5-second .wav file with someone speaking English and put it into the folder **audio_samples**.
Set the path to your .wav file in the attribute `SPEAKER_SPEECH_PATH` in `src/osms/tts_modules/common/configs/main_config.yaml`.
We suggest using the app [Audio Recorder](https://apps.apple.com/us/app/audio-recorder-wav-m4a/id1454488895) to record the voice. Set the sample rate to 16 kHz there.

Create a .txt file with some sentences written in English and put it into the **texts** folder. Set the path to your .txt file in the attribute `INPUT_TEXTS_PATH` in `src/osms/tts_modules/common/configs/main_config.yaml`.


The examples are already present in these folders.

In [2]:
# Load the main configuration. The path is resolved against the current working
# directory, which the first cell of this notebook switches to `../src`.
config_path = os.path.join(os.getcwd(), 'osms/tts_modules/common/configs/main_config.yaml')
with open(config_path, "r") as ymlfile:
    # yaml.load() without an explicit Loader is deprecated since PyYAML 5.1 and
    # raises TypeError on PyYAML >= 6; safe_load also refuses to construct
    # arbitrary Python objects from the file.
    main_config = yaml.safe_load(ymlfile)

# Folders used by the example, relative to the `src` working directory.
SPEAKER_SPEECH_PATH = "../audio_samples"
INPUT_TEXTS_PATH = "../texts"
OUTPUT_AUDIO_DIR = "../result_speech"

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
for directory in (SPEAKER_SPEECH_PATH, INPUT_TEXTS_PATH, OUTPUT_AUDIO_DIR):
    os.makedirs(directory, exist_ok=True)

In [2]:
# NOTE(review): this rebinds `main_config` to None, so the YAML configuration
# loaded in the cell above is not what gets passed to MultispeakerManager below.
# Presumably the manager falls back to its own defaults when given None --
# TODO confirm, then drop either this cell or the YAML-loading cell (both are
# labelled In [2], so the intended execution order is ambiguous).
main_config = None

In [3]:
# Build the manager from `main_config` and run inference.
# The log printed below shows checkpoints being loaded for DVecModel, Tacotron
# and WaveRNN; the array echoed after it is the value returned by inference().
multispeaker_manager = MultispeakerManager(main_configs=main_config)
multispeaker_manager.inference()

Trainable Parameters in dVecModel: 1.424M
Loading DVecModel checkpoint from checkpoints/encoder.pt
Trainable Parameters in Tacotron: 30.870M
Loading Tacotron checkpoint from checkpoints/synthesizer/synthesizer.pt
Trainable Parameters in WaveRNN: 4.481M
Loading WaveRNN checkpoint from checkpoints/vocoder/vocoder.pt


array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
       5.21322577e-07, 7.94726051e-08, 0.00000000e+00])

The results will be available in the folder `result_speech`. The name of the file will be **result.wav**.

The usability will be further improved.

In [1]:
import os
from torch.optim import Adam
import yaml
os.chdir('../src')


In [3]:

from osms.tts_modules.encoder.configs import get_default_encoder_config
from osms.tts_modules.encoder.data.dataset import PreprocessLibriSpeechDataset, SpeakerEncoderDataLoader, SpeakerEncoderDataset
from osms.tts_modules.encoder.data.wav_preprocessing import StandardAudioPreprocessor
from osms.tts_modules.encoder.data.wav2mel import StandardWav2MelTransform
from osms.tts_modules.encoder.models.dVecModel import DVecModel
from osms.tts_modules.encoder.utils.Trainer import SpeakerEncoderTrainer

In [4]:
from osms.tts_modules.encoder.speaker_encoder_manager import SpeakerEncoderManager


In [5]:
from osms.common.configs import get_default_main_configs
from osms.tts_modules.encoder.configs import get_default_encoder_config

In [6]:
# Fetch the default main and encoder configurations from the package helpers.
main_config = get_default_main_configs()
encoder_config = get_default_encoder_config()

In [7]:
# Audio preprocessor and wav-to-mel transform, both built from the encoder config.
# NOTE(review): `prerocessor` is a misspelling of "preprocessor"; later cells use
# the same spelling, so a rename has to be applied to all of them at once.
prerocessor = StandardAudioPreprocessor(encoder_config)
wav2mel = StandardWav2MelTransform(encoder_config)

In [8]:
# Dataset and its loader, both configured from `encoder_config`.
# The third positional argument 'train' presumably selects the training split --
# TODO confirm against SpeakerEncoderDataLoader.
dataset = SpeakerEncoderDataset(encoder_config)
dataloader = SpeakerEncoderDataLoader(encoder_config, dataset, 'train')

In [9]:
# Instantiate DVecModel from the encoder config; the line printed below reports
# its trainable-parameter count.
model = DVecModel(encoder_config )

Trainable Parameters in dVecModel: 1.424M


In [10]:
# Adam over all model parameters with a fixed learning rate of 1e-4.
optimizer = Adam(model.parameters(), lr=0.0001)

In [11]:


# Wire the config, model, preprocessing objects, data loader and optimizer into
# the manager.
# NOTE(review): the same `dataloader` object is passed in two consecutive
# positions -- presumably the train and test/validation loaders. Confirm against
# the SpeakerEncoderManager signature and supply a separate loader if evaluation
# is meant to run on held-out data.
speaker_encoder_manager = SpeakerEncoderManager(main_config, model, prerocessor, 
                                                wav2mel, dataloader, dataloader, optimizer)



In [None]:
# Start a training session. The captured output below reports a start from
# scratch followed by a model save at step 2.
# NOTE(review): this cell has no execution count, so the run presumably did not
# finish -- expect it to be long-running.
speaker_encoder_manager.train_session()

Starting the training from scratch.
Saving the model (step 2)


In [5]:
# Default encoder configuration; a later cell passes it to PreprocessLibriSpeechDataset.
config = get_default_encoder_config()

# Read the audio settings from the YAML file (path is relative to the current
# working directory).
with open("./osms/tts_modules/encoder/configs/AudioConfig.yaml", "r") as ymlfile:
    # yaml.load() without an explicit Loader is deprecated since PyYAML 5.1 and
    # raises TypeError on PyYAML >= 6; safe_load parses plain config files
    # without constructing arbitrary Python objects.
    audio_config = yaml.safe_load(ymlfile)

# Build the audio preprocessor and the wav-to-mel transform from the loaded settings.
# NOTE(review): `prerocessor` is misspelled, but later cells depend on this name.
prerocessor = StandardAudioPreprocessor(audio_config)
wav2mel = StandardWav2MelTransform(audio_config)

  after removing the cwd from sys.path.


In [6]:
# Display the configuration dict held by the preprocessor (rendered below).
prerocessor.audio_config

{'MEL_N_CHANNELS': 40,
 'MEL_WINDOW_STEP': 10,
 'MEL_WINDOW_LENGTH': 40,
 'SAMPLING_RATE': 16000,
 'PARTIAL_UTTERANCE_N_FRAMES': 160,
 'inference_n_frames': 80,
 'VAD_WINDOW_LENGTH': 30,
 'VAD_MOVING_AVERAGE_WIDTH': 8,
 'VAD_MAX_SILENCE_LENGTH': 6,
 'AUDIO_NORM_TARGET_dBFS': -30}

In [7]:
# Build the LibriSpeech preprocessing helper from the encoder config, the audio
# preprocessor and the wav-to-mel transform.
# NOTE(review): `DataPreprocessor` is an instance but is capitalised like a
# class; the next cell uses this name, so rename both together if at all.
DataPreprocessor = PreprocessLibriSpeechDataset(config,prerocessor,wav2mel)

In [8]:
# Run the dataset preprocessing. The captured output below lists the entries
# found under the LibriSpeech `test-clean` folder.
# NOTE(review): that listing includes a '.DS_Store' entry and absolute local
# paths -- confirm non-directory entries are skipped by preprocess_dataset(),
# and clear the output before sharing the notebook.
DataPreprocessor.preprocess_dataset()

[PosixPath('/Users/gleb/Documents/LibriSpeech/test-clean/61'), PosixPath('/Users/gleb/Documents/LibriSpeech/test-clean/5639'), PosixPath('/Users/gleb/Documents/LibriSpeech/test-clean/6829'), PosixPath('/Users/gleb/Documents/LibriSpeech/test-clean/908'), PosixPath('/Users/gleb/Documents/LibriSpeech/test-clean/672'), PosixPath('/Users/gleb/Documents/LibriSpeech/test-clean/.DS_Store'), PosixPath('/Users/gleb/Documents/LibriSpeech/test-clean/8455'), PosixPath('/Users/gleb/Documents/LibriSpeech/test-clean/8463'), PosixPath('/Users/gleb/Documents/LibriSpeech/test-clean/1320'), PosixPath('/Users/gleb/Documents/LibriSpeech/test-clean/2300'), PosixPath('/Users/gleb/Documents/LibriSpeech/test-clean/6930'), PosixPath('/Users/gleb/Documents/LibriSpeech/test-clean/260'), PosixPath('/Users/gleb/Documents/LibriSpeech/test-clean/1995'), PosixPath('/Users/gleb/Documents/LibriSpeech/test-clean/3575'), PosixPath('/Users/gleb/Documents/LibriSpeech/test-clean/4507'), PosixPath('/Users/gleb/Documents/LibriS

In [10]:
# Read the dVecModel settings from the YAML file (path is relative to the
# current working directory).
with open("./osms/tts_modules/encoder/configs/dVecModelConfig.yaml", "r") as ymlfile:
    # yaml.load() without an explicit Loader is deprecated since PyYAML 5.1 and
    # raises TypeError on PyYAML >= 6; safe_load parses plain config files
    # without constructing arbitrary Python objects.
    model_config = yaml.safe_load(ymlfile)

  


Trainable Parameters in dVecModel: 1.424M


In [13]:
# NOTE(review): leftover scratch assignment; `a` is not used anywhere in the
# visible notebook -- consider deleting this cell.
a = 5