In [None]:
# !pip install datasets
# !pip install accelerate -U
# !pip install soundfile
# !pip install --upgrade speechbrain
# !pip install transformers -U
# !pip install librosa

In [None]:
from datasets import load_dataset, Audio
import librosa

In [None]:
# Open the train split of the 'ta' configuration of Common Voice 13.0.
# With streaming=True the call returns an IterableDataset (see the repr
# printed below) instead of a fully materialized dataset.
dataset = load_dataset('mozilla-foundation/common_voice_13_0', 'ta',
                       split='train', streaming=True)
dataset

IterableDataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
    n_shards: 2
})

In [None]:
import os
import torch
import speechbrain
import torchaudio
from speechbrain.inference.speaker import EncoderClassifier
from transformers import SpeechT5Processor
import IPython as ipy

In [None]:
def resample_audio(audio_tensor, original_sr, target_sr=16000):
    """Resample a waveform to a new sampling rate using librosa.

    Args:
        audio_tensor: Waveform samples, forwarded unchanged to
            ``librosa.resample`` as its first argument.
        original_sr: Sampling rate of ``audio_tensor``.
        target_sr: Desired sampling rate. Defaults to 16000.

    Returns:
        Whatever ``librosa.resample`` returns for these arguments.
    """
    return librosa.resample(
        audio_tensor,
        orig_sr=original_sr,
        target_sr=target_sr,
    )

In [None]:
# Load the processor stored with the fine-tuned checkpoint, keep a handle on
# its tokenizer, and display the tokenizer's length.
checkpoint = "carlfeynman/speecht5-finetuned-tamil"
processor = SpeechT5Processor.from_pretrained(checkpoint)
tokenizer = processor.tokenizer
len(tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


149

In [None]:
# Pretrained speaker encoder used to turn a waveform into a speaker embedding.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The checkpoint files are stored under /tmp/<model name>.
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    savedir=os.path.join("/tmp", spk_model_name),
    run_opts={"device": device},
)
def create_speaker_embedding(waveform):
    """Compute a normalized speaker embedding for one waveform.

    The waveform is converted to a torch tensor and encoded with the
    module-level ``speaker_model``. The result is L2-normalized along
    dimension 2, stripped of singleton dimensions, moved to the CPU and
    returned as a NumPy array.

    Args:
        waveform: Audio samples accepted by ``torch.tensor``.

    Returns:
        NumPy array holding the normalized embedding.
    """
    wav = torch.tensor(waveform)
    # Inference only: no autograd graph is needed for the encoder pass.
    with torch.no_grad():
        raw_embedding = speaker_model.encode_batch(wav)
        unit_embedding = torch.nn.functional.normalize(raw_embedding, dim=2)
    return unit_embedding.squeeze().cpu().numpy()

In [None]:
def prepare_dataset(example):
    """Turn one raw dataset example into SpeechT5 training features.

    The audio is resampled to 16 kHz before anything else touches it. The
    dataset delivers clips at their native rate, while the SpeechT5 feature
    extractor rejects audio whose ``sampling_rate`` differs from its own and
    the x-vector speaker encoder is meant for 16 kHz input. The same
    resampled waveform feeds both the processor and the speaker encoder.

    Args:
        example: Mapping with a ``'sentence'`` string and an ``'audio'``
            mapping that provides ``'array'`` and ``'sampling_rate'``.

    Returns:
        The processor output with ``'labels'`` reduced to its first element
        and a ``'speaker_embeddings'`` entry added.
    """
    audio = example['audio']

    target_sr = 16000
    waveform = resample_audio(audio['array'], audio['sampling_rate'], target_sr)

    example = processor(
        text=example['sentence'],
        audio_target=waveform,
        sampling_rate=target_sr,
        return_attention_mask=False,
    )

    # Keep only the first element of 'labels'
    # (presumably the processor returns a batch of one — verify).
    example['labels'] = example['labels'][0]
    example['speaker_embeddings'] = create_speaker_embedding(waveform)

    return example

In [None]:
# Pull the first clip from the stream.
example = next(iter(dataset))
# The clip arrives at its native rate (48 kHz here), but the x-vector speaker
# encoder is meant for 16 kHz input, so resample before embedding.
# example['audio'] itself is left untouched by this step.
example_speaker_embed = create_speaker_embedding(
    resample_audio(example['audio']['array'], example['audio']['sampling_rate'], 16000)
)
example_speaker_embed.shape

Reading metadata...: 43350it [00:01, 35242.55it/s]


(512,)

In [None]:
# Sample count of the clip as it came from the dataset (before resampling).
example['audio']['array'].shape

(393984,)

In [None]:
# Resample the example clip to 16 kHz and check the new sample count.
audio = resample_audio(example['audio']['array'], example['audio']['sampling_rate'], 16000)
audio.shape

(131328,)

In [None]:
# Overwrite the example's audio in place with the resampled waveform (stored
# as a torch tensor) and record the matching sampling rate.
# Note: re-running the earlier cells on this mutated example would see a
# tensor here rather than the original array.
example['audio']['array'] = torch.tensor(audio)
example['audio']['sampling_rate'] = 16000

In [None]:
from transformers import pipeline,SpeechT5ForTextToSpeech
from transformers import SpeechT5HifiGan

# Pretrained HiFi-GAN checkpoint, later passed to generate_speech as `vocoder`.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

In [None]:
# Tokenize the example's transcript for synthesis and check the token count.
text = example['sentence']
inputs = processor(text=text, return_tensors="pt")
print(text)
inputs['input_ids'].shape

அவரைப் பொதுமக்கள் விடாமல் பின்னாலேயே துரத்திக் கொண்டே ஓடினார்கள்.


torch.Size([1, 67])

In [None]:
# Load the text-to-speech model weights from the fine-tuned checkpoint.
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)

In [None]:
# List the fields the processor produced for the text input.
inputs.keys()

dict_keys(['input_ids', 'attention_mask'])

In [None]:
# Indexing with None prepends a leading axis: (512,) -> (1, 512).
example_speaker_embed[None].shape

(1, 512)

In [None]:
# Shape of the token-id tensor that will be passed to generate_speech.
inputs["input_ids"].shape

torch.Size([1, 67])

In [None]:
# Synthesize speech for the tokenized text, conditioned on the example's
# speaker embedding (given a leading axis of size 1) and rendered through
# the vocoder.
speech = model.generate_speech(
    inputs["input_ids"],
    speaker_embeddings=torch.tensor(example_speaker_embed).unsqueeze(0),
    vocoder=vocoder,
)

In [None]:
# Display the raw token ids produced for the sentence.
inputs["input_ids"]

tensor([[ 93, 133,  83,  87, 134,  92, 139, 134, 126, 132, 112,  84, 144,  92,
         144, 103,  92, 139, 133,  98, 121,  96,  84, 135,  92, 139, 134,  98,
         142,  92, 142,  96, 135, 107, 105, 107, 139, 132, 112,  83, 132,  92,
         132,  98, 144,  92, 139, 144, 126,  89,  92, 121, 107, 139, 115, 121,
          98, 142,  96,  83,  92, 144, 103,  92,   4,  26,   2]])

In [None]:
# Show the transcript, then render an audio player for the synthesized speech.
# NOTE(review): rate=16000 assumes the generated waveform is sampled at
# 16 kHz — confirm against the vocoder's configuration.
print(text)
ipy.display.Audio(speech, rate=16000)

அவரைப் பொதுமக்கள் விடாமல் பின்னாலேயே துரத்திக் கொண்டே ஓடினார்கள்.
