In [1]:
import os

import torch
import torchaudio
import whisper

import numpy as np
import librosa as lr

from tqdm.notebook import tqdm

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 71 files to the new cache system


  0%|          | 0/71 [00:00<?, ?it/s]

In [2]:
device = 'mps' if torch.backends.mps.is_available() else 'cuda' if torch.backends.cuda.is_available() else 'cpu'
print(f'Torch device {device}')

Torch device mps


In [3]:
pvideo = 'lex_short.mp4'
# with open('lex_short.mp4', 'rb') as f:
#     lex_short = f.read()

# print(f'Read file of length {len(lex_short) / 1024 / 1024:.2f} MB')

import subprocess

paudio = 'lex_short.wav'
command = f'ffmpeg -i {pvideo} -ab 160k -ac 2 -ar 44100 -vn {paudio}'

subprocess.call(command, shell=True)

ffmpeg version 5.1.2 Copyright (c) 2000-2022 the FFmpeg developers
  built with clang version 14.0.4
  configuration: --prefix=/Users/runner/miniforge3/conda-bld/ffmpeg_1666357556406/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_pl --cc=arm64-apple-darwin20.0.0-clang --cxx=arm64-apple-darwin20.0.0-clang++ --nm=arm64-apple-darwin20.0.0-nm --ar=arm64-apple-darwin20.0.0-ar --disable-doc --disable-openssl --enable-demuxer=dash --enable-hardcoded-tables --enable-libfreetype --enable-libfontconfig --enable-libopenh264 --enable-cross-compile --arch=arm64 --target-os=darwin --cross-prefix=arm64-apple-darwin20.0.0- --host-cc=/Users/runner/miniforge3/conda-bld/ffmpeg_1666357556406/_build_env/bin/x86_64-apple-darwin13.4.0-clang --enable-neon --enable-gnutls --enable-libmp3lame --enable-libvpx --enable-pthreads --enable-gpl --enable-libx264 --enable

1

In [4]:
audio, sample_rate = lr.load(paudio, sr=16000)
audio.shape, sample_rate

((2042242,), 16000)

In [5]:
model = whisper.load_model("small.en")
print(
    f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
    f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
)

Model is English-only and has 240,582,144 parameters.


In [6]:
options = whisper.DecodingOptions(language='en', without_timestamps=False, fp16 = False)

In [7]:
# chunks = np.array_split(audio.flatten(), audio.shape[0] // 80000)
# chunks = [audio[i : i + 90000] for i in range(0, audio.shape[0], 89000)]
split = lr.effects.split(audio, top_db=60)
chunks = []
for b, e in split:
    chunks.append(audio[b:e])
len(chunks)

8

In [8]:
mels = []
for chunk in chunks:
    if sum(chunk) == 0.0:
        continue

    taudio = whisper.pad_or_trim(torch.tensor(chunk)).to('cpu')
    mel = whisper.log_mel_spectrogram(taudio)
    mels.append(mel.numpy())
    # tmel = torch.tensor(np.array([mel.numpy()]))

In [9]:
%%time
texts = []
batch_size = 16
# for mel in np.array_split(mels, batch_size):
tmel = torch.tensor(np.array(mels))
results = model.decode(tmel, options)
texts.extend([result.text for result in results])

CPU times: user 1min 54s, sys: 32.2 s, total: 2min 26s
Wall time: 43.1 s


In [10]:
texts

['What makes Magnus so good? What are the various aspects of his game that make him so good?',
 'I think for Magnus, he just.',
 "You know that in the end game, in the end games, if you get there, he's just he's not going to blunder. That's the first thing. So you know, if you reach an end game, he's not going to make a mistake. He obviously plays great openings.",
 "And there's just really no defined weakness that he has. There's no weakness that I can think of very specifically. There are many times where players actually out-prepare him in the opening phase, but as soon as they're on their own and they have to think...",
 "very often times they'll make mistakes. So there's just no weakness for Magnus, really no weakness. Unlike say Kasparov, like Kasparov on the other hand, there are very clear weaknesses in his game like Kramnik exploited them. First of all, very, I don't wanna say like.",
 "Ego is the right word, but like very stubborn, believing that his openings were infallible,

In [14]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

In [22]:
%%time

tokenizer.src_lang = "en_IN"
translated = []
for text in texts:
    encoded_en = tokenizer(text, return_tensors="pt")

    generated_tokens = model.generate(
        **encoded_en,
        forced_bos_token_id=tokenizer.lang_code_to_id["fr_XX"]
    )
    result = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    translated.append(result)

translated

CPU times: user 1min 39s, sys: 9.5 s, total: 1min 48s
Wall time: 1min 41s


['Qu’est-ce qui fait de Magnus si bon? Quels sont les divers aspects de son jeu qui le rendent si bon?',
 "Je pense à Magnus, c'est lui.",
 "Vous savez qu'à l'issue du jeu, à l'issue du jeu, si vous y arrivez, il ne va pas blâmer. C'est la première chose. Donc, vous savez, si vous arrivez à l'issue du jeu, il ne fera pas d'erreur. Il joue évidemment de grands ouvertures.",
 "Et il n'y a vraiment pas de faiblesse définie qu'il a. Il n'y a pas de faiblesse que je puisse envisager très spécifiquement. Il y a de nombreuses fois où les joueurs l'ont vraiment out-preparé dans la phase d'ouverture, mais dès qu'ils sont seuls et qu'ils doivent penser...",
 "c'est pourquoi il n'y a pas de faiblesse pour Magnus, vraiment pas de faiblesse. À l'instar de Kasparov dit Kasparov, comme Kasparov d'autre part, il y a des faiblesses très claires dans son jeu comme Kramnik les exploite comme Kramnik les exploite. Tout d'abord, très, je ne veux pas dire comme.",
 "Ego est la bonne parole, mais comme très 

In [23]:
!pip install speechbrain

Collecting speechbrain
  Downloading speechbrain-0.5.13-py3-none-any.whl (498 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.0/499.0 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
Collecting hyperpyyaml
  Downloading HyperPyYAML-1.0.1.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting ruamel.yaml>=0.17.8
  Downloading ruamel.yaml-0.17.21-py3-none-any.whl (109 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.5/109.5 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ruamel.yaml.clib>=0.2.6
  Downloading ruamel.yaml.clib-0.2.7-cp39-cp39-macosx_12_0_arm64.whl (130 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.8/130.8 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: hyperpyyaml
  Building wheel for hyperpyyaml (setup.py) ... [?25ldone
[?25h  Created wheel for hyperpyyaml: filename=

In [24]:
import torchaudio
from speechbrain.pretrained import Tacotron2
from speechbrain.pretrained import HIFIGAN


In [25]:
tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech", savedir="tmpdir_tts")
hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmpdir_vocoder")


Downloading:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/113M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/55.8M [00:00<?, ?B/s]

In [31]:
mel_output, mel_length, alignment = tacotron2.encode_text(translated[0])

In [32]:
waveforms = hifi_gan.decode_batch(mel_output)

In [33]:
torchaudio.save('example_TTS.wav',waveforms.squeeze(1), 22050)