In [None]:
!pip install wget
!pip install git+https://github.com/m-bain/whisperx.git
!pip install git+https://github.com/facebookresearch/demucs#egg=demucs
!pip install deepmultilingualpunctuation
!pip install nemo_toolkit[asr]==1.17.0
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9657 sha256=167249f6dbc1fe684f85817d3d533564e7406f725f0bf1c1c6bbc250d19baaea
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/m-bain/whisperx.git
  Cloning https://github.com/m-bain/whisperx.git to /tmp/pip-req-build-k3uxnoje
  Running command git clone --filter=blob:none --quiet https://github.com/m-bain/whisperx.git /tmp/pip-req-build-k3uxno

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting demucs
  Cloning https://github.com/facebookresearch/demucs to /tmp/pip-install-1j4kt2z0/demucs_7c6985a2180844c28606ed9f43342ab3
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/demucs /tmp/pip-install-1j4kt2z0/demucs_7c6985a2180844c28606ed9f43342ab3
  Resolved https://github.com/facebookresearch/demucs to commit e25cfeb76546c2bf436661adf18cea8fbecec9ea
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dora-search (from demucs)
  Downloading dora_search-0.1.11.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.0/87.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diffq>=0.2.1 (from demucs)
  Downloading diffq

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
#!pip install faster-whisper==0.5.1

In [None]:
import locale
def getpreferredencoding(do_setlocale=True):
    """Drop-in stand-in for ``locale.getpreferredencoding`` that always answers UTF-8.

    Args:
        do_setlocale: Accepted for signature compatibility only; it is ignored.

    Returns:
        The string ``"UTF-8"``.
    """
    forced_encoding = "UTF-8"
    return forced_encoding
def utf():
    """Replace ``locale.getpreferredencoding`` with the UTF-8 stub defined in this cell.

    The patch is process-wide and is only applied when this function is called.
    """
    setattr(locale, "getpreferredencoding", getpreferredencoding)

In [None]:
#@title helpers
import os
import wget
from omegaconf import OmegaConf
import json
import shutil

# Language codes for which the punctuation-restoration step is run later in the notebook.
punct_model_langs = "en fr de es it nl pt bg pl cs sk sl".split()
# Language codes kept for wav2vec2-based alignment.
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
# The duplicated "pt" entry was removed; membership checks are unaffected.
wav2vec2_langs = [
    "en",
    "fr",
    "de",
    "es",
    "it",
    "nl",
    "pt",
    "ja",
    "zh",
    "uk",
    "ar",
    "ru",
    "pl",
    "hu",
    "fi",
    "fa",
    "el",
    "tr",
]


def create_config():
    """Prepare the NeMo diarization inference config used by this notebook.

    Side effects, all relative to the current working directory:
      * downloads ``diar_infer_telephonic.yaml`` into ``./`` unless it is already there,
      * creates ``data/`` and writes a one-line manifest for ``mono_file.wav`` to
        ``data/input_manifest.json``,
      * creates the ``nemo_outputs`` directory.

    Returns:
        The loaded OmegaConf config with the manifest path, output directory,
        VAD, speaker-embedding and MSDD settings overridden.
    """
    # The comment in the original notes the domain can be "meeting" or "telephonic".
    domain_type = "telephonic"
    config_name = f"diar_infer_{domain_type}.yaml"
    config_url = f"https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/{config_name}"

    # Fetch the YAML template only when no local copy exists yet.
    download_dir = "./"
    config_path = os.path.join(download_dir, config_name)
    if not os.path.exists(config_path):
        config_path = wget.download(config_url, download_dir)
    config = OmegaConf.load(config_path)

    # Single-entry manifest describing the audio file to diarize.
    os.makedirs(os.path.join(os.getcwd(), "data"), exist_ok=True)
    manifest_entry = {
        "audio_filepath": "mono_file.wav",
        "offset": 0,
        "duration": None,
        "label": "infer",
        "text": "-",
        "rttm_filepath": None,
        "uem_filepath": None,
    }
    with open("data/input_manifest.json", "w") as manifest_file:
        manifest_file.write(json.dumps(manifest_entry) + "\n")

    # Directory for intermediate files and prediction outputs.
    output_dir = "nemo_outputs"
    os.makedirs(output_dir, exist_ok=True)

    vad_model_name = "vad_multilingual_marblenet"
    speaker_model_name = "titanet_large"

    # Workaround for multiprocessing hanging under IPython.
    config.num_workers = 1

    config.diarizer.manifest_filepath = "data/input_manifest.json"
    config.diarizer.out_dir = output_dir

    config.diarizer.speaker_embeddings.model_path = speaker_model_name
    # Run the VAD model below instead of relying on oracle (ground-truth) VAD.
    config.diarizer.oracle_vad = False
    config.diarizer.clustering.parameters.oracle_num_speakers = False

    # Pretrained NeMo VAD model and its thresholds.
    config.diarizer.vad.model_path = vad_model_name
    config.diarizer.vad.parameters.onset = 0.8
    config.diarizer.vad.parameters.offset = 0.6
    config.diarizer.vad.parameters.pad_offset = -0.05

    # Telephonic speaker diarization model.
    config.diarizer.msdd_model.model_path = "diar_msdd_telephonic"

    return config


def get_word_ts_anchor(s, e, option="start"):
    """Choose the single timestamp that stands for a word.

    Args:
        s: Word start time.
        e: Word end time.
        option: ``"end"`` selects ``e``, ``"mid"`` selects the midpoint of
            ``s`` and ``e``; any other value selects ``s``.

    Returns:
        The selected timestamp.
    """
    if option == "mid":
        return (s + e) / 2
    return e if option == "end" else s


def get_words_speaker_mapping(wrd_ts, spk_ts, word_anchor_option="start"):
    """Label every word with the speaker turn its anchor timestamp falls into.

    Args:
        wrd_ts: Iterable of dicts with ``"start"``, ``"end"`` and ``"word"``
            keys; start/end are multiplied by 1000 and truncated to int here.
        spk_ts: Indexable sequence of ``(start, end, speaker)`` turns. Turn
            ends are compared directly against the scaled word times.
        word_anchor_option: Passed to ``get_word_ts_anchor`` to pick which
            word timestamp is compared against the turn end.

    Returns:
        A list of dicts with ``"word"``, ``"start_time"``, ``"end_time"`` and
        ``"speaker"`` keys, one per input word.
    """
    final_turn = len(spk_ts) - 1
    turn_idx = 0
    _, turn_end, speaker = spk_ts[0]

    mapping = []
    for entry in wrd_ts:
        start_ms = int(entry["start"] * 1000)
        end_ms = int(entry["end"] * 1000)
        text = entry["word"]

        anchor = get_word_ts_anchor(start_ms, end_ms, word_anchor_option)
        # Advance through the turns until one ends at or after the anchor.
        while anchor > float(turn_end):
            turn_idx = min(turn_idx + 1, final_turn)
            _, turn_end, speaker = spk_ts[turn_idx]
            if turn_idx == final_turn:
                # Past the last turn: stretch it to this word's end.
                turn_end = get_word_ts_anchor(start_ms, end_ms, option="end")

        mapping.append(
            {"word": text, "start_time": start_ms, "end_time": end_ms, "speaker": speaker}
        )
    return mapping


# Characters that mark a word as the end of a sentence when they are its last character.
sentence_ending_punctuations = ".?!"


def get_first_word_idx_of_sentence(word_idx, word_list, speaker_list, max_words):
    """Walk left from ``word_idx`` to find the first word of its sentence.

    The walk stops at the start of the list, after ``max_words`` steps, when
    the speaker label changes, or when the preceding word ends a sentence.

    Args:
        word_idx: Index of the word to start from.
        word_list: Words of the transcript.
        speaker_list: Speaker label per word, parallel to ``word_list``.
        max_words: Maximum number of words to step back.

    Returns:
        The index of the sentence's first word, or ``-1`` when the walk
        stopped somewhere that is not a sentence start.
    """

    def ends_sentence(idx):
        # Empty words cannot end a sentence; indexing [-1] on "" would raise.
        word = word_list[idx] if idx >= 0 else ""
        return bool(word) and word[-1] in sentence_ending_punctuations

    left_idx = word_idx
    while (
        left_idx > 0
        and word_idx - left_idx < max_words
        and speaker_list[left_idx - 1] == speaker_list[left_idx]
        and not ends_sentence(left_idx - 1)
    ):
        left_idx -= 1

    if left_idx == 0 or ends_sentence(left_idx - 1):
        return left_idx
    return -1


def get_last_word_idx_of_sentence(word_idx, word_list, max_words):
    """Walk right from ``word_idx`` to find the last word of its sentence.

    The walk stops at the last word of the list, after ``max_words`` steps,
    or at a word that ends a sentence.

    Args:
        word_idx: Index of the word to start from.
        word_list: Words of the transcript.
        max_words: Maximum number of words to step forward.

    Returns:
        The index of the sentence's last word (the final word of the list
        counts as one), or ``-1`` when the walk stopped elsewhere.
    """
    last_idx = len(word_list) - 1

    def ends_sentence(idx):
        # Out-of-range indices and empty words never end a sentence.
        if not 0 <= idx <= last_idx:
            return False
        word = word_list[idx]
        return bool(word) and word[-1] in sentence_ending_punctuations

    right_idx = word_idx
    # Stop at the last valid index; walking one past it made the return
    # expression below index out of range.
    while (
        right_idx < last_idx
        and right_idx - word_idx < max_words
        and not ends_sentence(right_idx)
    ):
        right_idx += 1

    if right_idx == last_idx or ends_sentence(right_idx):
        return right_idx
    return -1


def get_realigned_ws_mapping_with_punctuation(
    word_speaker_mapping, max_words_in_sentence=50
):
    """Snap speaker changes that fall mid-sentence onto sentence boundaries.

    Whenever the speaker label changes on a word that does not end a sentence,
    the surrounding sentence is located with ``get_first_word_idx_of_sentence``
    and ``get_last_word_idx_of_sentence``, and every word in it is given the
    speaker that occurs most often within that sentence.

    Args:
        word_speaker_mapping: List of dicts with at least ``"word"`` and
            ``"speaker"`` keys. The input dicts are not modified.
        max_words_in_sentence: Upper bound on how many words a sentence may
            span when searching for its boundaries.

    Returns:
        A new list of shallow-copied dicts with ``"speaker"`` updated.
    """
    words_list = [entry["word"] for entry in word_speaker_mapping]
    speaker_list = [entry["speaker"] for entry in word_speaker_mapping]
    wsp_len = len(word_speaker_mapping)

    def ends_sentence(idx):
        # Empty words cannot end a sentence; indexing [-1] on "" would raise.
        word = words_list[idx] if idx >= 0 else ""
        return bool(word) and word[-1] in sentence_ending_punctuations

    k = 0
    while k < wsp_len:
        speaker_changes_next = (
            k < wsp_len - 1 and speaker_list[k] != speaker_list[k + 1]
        )
        if speaker_changes_next and not ends_sentence(k):
            left_idx = get_first_word_idx_of_sentence(
                k, words_list, speaker_list, max_words_in_sentence
            )
            # The budget for the right-hand walk is what the left-hand walk left over.
            right_idx = (
                get_last_word_idx_of_sentence(
                    k, words_list, max_words_in_sentence - k + left_idx - 1
                )
                if left_idx > -1
                else -1
            )
            if min(left_idx, right_idx) == -1:
                # No clean sentence boundary on one side: leave labels as they are.
                k += 1
                continue

            spk_labels = speaker_list[left_idx : right_idx + 1]
            mod_speaker = max(set(spk_labels), key=spk_labels.count)
            if spk_labels.count(mod_speaker) < len(spk_labels) // 2:
                # The most frequent speaker is not dominant enough to relabel.
                k += 1
                continue

            speaker_list[left_idx : right_idx + 1] = [mod_speaker] * (
                right_idx - left_idx + 1
            )
            # Resume after the sentence that was just relabelled.
            k = right_idx

        k += 1

    realigned_list = []
    for entry, speaker in zip(word_speaker_mapping, speaker_list):
        line_dict = entry.copy()
        line_dict["speaker"] = speaker
        realigned_list.append(line_dict)

    return realigned_list


def get_sentences_speaker_mapping(word_speaker_mapping, spk_ts):
    """Group consecutive words by the same speaker into one entry per turn.

    Args:
        word_speaker_mapping: Iterable of dicts with ``"word"``, ``"speaker"``,
            ``"start_time"`` and ``"end_time"`` keys.
        spk_ts: Sequence of ``(start, end, speaker)`` turns; only the first
            one is read, to seed the opening entry.

    Returns:
        A list of dicts with ``"speaker"`` (formatted as ``"Speaker <id>"``),
        ``"start_time"``, ``"end_time"`` and ``"text"`` (words joined with a
        trailing space after each one).
    """
    first_start, first_end, first_speaker = spk_ts[0]
    current = {
        "speaker": f"Speaker {first_speaker}",
        "start_time": first_start,
        "end_time": first_end,
        "text": "",
    }
    last_speaker = first_speaker

    sentences = []
    for entry in word_speaker_mapping:
        word, speaker = entry["word"], entry["speaker"]
        word_start, word_end = entry["start_time"], entry["end_time"]

        if speaker == last_speaker:
            # Same speaker keeps talking: just extend the running entry.
            current["end_time"] = word_end
        else:
            # Speaker changed: close the running entry and open a new one.
            sentences.append(current)
            current = {
                "speaker": f"Speaker {speaker}",
                "start_time": word_start,
                "end_time": word_end,
                "text": "",
            }

        current["text"] += word + " "
        last_speaker = speaker

    sentences.append(current)
    return sentences


def get_speaker_aware_transcript(sentences_speaker_mapping, f):
    """Write each entry to ``f`` as ``"<speaker>: <text>"`` preceded by a blank line.

    Args:
        sentences_speaker_mapping: Iterable of dicts with ``"speaker"`` and
            ``"text"`` keys.
        f: Writable text stream; one ``write`` call is made per entry.
    """
    for entry in sentences_speaker_mapping:
        f.write("\n\n{}: {}".format(entry["speaker"], entry["text"]))


def format_timestamp(
    milliseconds: float, always_include_hours: bool = False, decimal_marker: str = "."
):
    """Format a millisecond offset as ``[HH:]MM:SS<marker>mmm``.

    Args:
        milliseconds: Non-negative offset in milliseconds. Floats are rounded
            to the nearest whole millisecond.
        always_include_hours: Emit the ``HH:`` prefix even when it is zero.
        decimal_marker: Separator placed between seconds and milliseconds.

    Returns:
        The formatted timestamp string.

    Raises:
        AssertionError: If ``milliseconds`` is negative.
    """
    assert milliseconds >= 0, "non-negative timestamp expected"

    # The ``d`` format codes below reject floats, so normalise to an int first.
    remaining = int(round(milliseconds))

    hours, remaining = divmod(remaining, 3_600_000)
    minutes, remaining = divmod(remaining, 60_000)
    seconds, remaining = divmod(remaining, 1_000)

    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
    return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{remaining:03d}"


def write_srt(transcript, file):
    """Write ``transcript`` to ``file`` as numbered SRT cues.

    Args:
        transcript: Iterable of dicts with ``"start_time"``, ``"end_time"``,
            ``"speaker"`` and ``"text"`` keys; the times are handed to
            ``format_timestamp``.
        file: Writable text stream; it is flushed after every cue.
    """
    for index, segment in enumerate(transcript, start=1):
        begin = format_timestamp(
            segment["start_time"], always_include_hours=True, decimal_marker=","
        )
        finish = format_timestamp(
            segment["end_time"], always_include_hours=True, decimal_marker=","
        )
        speaker = segment["speaker"]
        # "-->" is the SRT time separator, so it must not appear inside the caption.
        caption = segment["text"].strip().replace("-->", "->")

        cue_lines = (str(index), f"{begin} --> {finish}", f"{speaker}: {caption}")
        # print() adds the blank line that separates consecutive cues.
        print("\n".join(cue_lines) + "\n", file=file, flush=True)


def cleanup(path: str):
    """Delete ``path``, whether it is a file, a symlink or a directory tree.

    Args:
        path: Relative or absolute path to remove.

    Raises:
        ValueError: If ``path`` is neither a file, a symlink nor a directory.
    """
    # Files and symlinks (including links to directories) are unlinked directly.
    if os.path.isfile(path) or os.path.islink(path):
        os.remove(path)
        return

    if not os.path.isdir(path):
        raise ValueError("Path {} is not a file or dir.".format(path))

    # A real directory: remove it together with everything inside.
    shutil.rmtree(path)


In [None]:
# Download the sample video and save it as video.mp4, the input of the ffmpeg cell below.
!wget https://dft3h5i221ap1.cloudfront.net/OpenAI/chatgpt-prompt-eng/video/prompt_eng_01_intro_v2.mp4 -O video.mp4

--2023-05-11 16:54:35--  https://dft3h5i221ap1.cloudfront.net/OpenAI/chatgpt-prompt-eng/video/prompt_eng_01_intro_v2.mp4
Resolving dft3h5i221ap1.cloudfront.net (dft3h5i221ap1.cloudfront.net)... 18.64.182.54, 18.64.182.190, 18.64.182.228, ...
Connecting to dft3h5i221ap1.cloudfront.net (dft3h5i221ap1.cloudfront.net)|18.64.182.54|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 97372023 (93M) [video/mp4]
Saving to: ‘video.mp4’


2023-05-11 16:54:36 (101 MB/s) - ‘video.mp4’ saved [97372023/97372023]



In [None]:
# Extract the audio track as 16-bit PCM WAV. -y overwrites an existing audio.wav so the
# cell can be re-run without ffmpeg stopping at its overwrite prompt.
!ffmpeg -y -i video.mp4 -vn -acodec pcm_s16le audio.wav

ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e

In [None]:
import whisperx
import gc 

# Transcribe audio.wav with whisperx, then align the transcript to get word-level timings.
# Names defined here (audio_file, result1, result, ...) are reused by later cells.
device = "cuda" 
audio_file = "audio.wav"
batch_size = 16 # reduce if low on GPU mem
compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)

# 1. Transcribe with original whisper (batched)
model = whisperx.load_model("large-v2", device, compute_type=compute_type)

audio = whisperx.load_audio(audio_file)
result1 = model.transcribe(audio, batch_size=batch_size)
print(result1["segments"]) # before alignment

# To free GPU memory before loading the alignment model, drop the reference first and
# only then collect / empty the cache (torch is not imported yet at this point):
# import torch; del model; gc.collect(); torch.cuda.empty_cache()

# 2. Align whisper output
# result1["language"] selects which alignment model is loaded.
model_a, metadata = whisperx.load_align_model(language_code=result1["language"], device=device)
result = whisperx.align(result1["segments"], model_a, metadata, audio, device, return_char_alignments=False)

print(result["segments"]) # after alignment


[NeMo W 2023-05-11 16:55:07 optimizers:54] Apex was not found. Using the lamb or fused_adam optimizer will error out.
[NeMo W 2023-05-11 16:55:08 experimental:27] Module <class 'nemo.collections.asr.modules.audio_modules.SpectrogramToMultichannelFeatures'> is experimental, not ready for production and is not fully supported. Use at your own risk.


Downloading (…)7a179508/config.json:   0%|          | 0.00/2.80k [00:00<?, ?B/s]

Downloading model.bin:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Downloading (…)79508/vocabulary.txt:   0%|          | 0.00/460k [00:00<?, ?B/s]

Downloading (…)79508/tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

No language specified, language will be first be detected for each audio file (increases inference time).


100%|█████████████████████████████████████| 16.9M/16.9M [00:01<00:00, 14.4MiB/s]
INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v1.9.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file ../root/.cache/torch/whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 2.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.0+cu118. Bad things might happen unless you revert torch to 1.x.
Detected language: en (1.00) in first 30s of audio...
[{'text': " Welcome to this course on ChatGPT Prompt Engineering for Developers. I'm thrilled to have with me Iza Fulford to teach this along with me. She's a member of the technical staff of OpenAI and had built the popular ChatGPT Retrieval plugin. And a large part of her work has been teaching people how to use OLM or Large Language Model technology in products. She's also contributed to the OpenAI Cookbook that teaches people prompting. So thrilled to have you with me.", 'start': 5.138, 'end': 32.645}, {'text': " And I'm thrilled to be here and share some prompting best practices with you all. So there's been a lot of material on the internet for prompting with articles like 30 prompts 

Downloading: "https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960_asr_ls960.pth" to /root/.cache/torch/hub/checkpoints/wav2vec2_fairseq_base_ls960_asr_ls960.pth
100%|██████████| 360M/360M [00:02<00:00, 185MB/s]


[{'start': 5.238, 'end': 9.499, 'text': ' Welcome to this course on ChatGPT Prompt Engineering for Developers.', 'words': [{'word': 'Welcome', 'start': 5.238, 'end': 5.538, 'score': 0.951}, {'word': 'to', 'start': 5.538, 'end': 5.638, 'score': 0.679}, {'word': 'this', 'start': 5.638, 'end': 5.778, 'score': 0.941}, {'word': 'course', 'start': 5.778, 'end': 6.118, 'score': 0.839}, {'word': 'on', 'start': 6.118, 'end': 6.318, 'score': 0.904}, {'word': 'ChatGPT', 'start': 6.318, 'end': 7.098, 'score': 0.804}, {'word': 'Prompt', 'start': 7.098, 'end': 7.419, 'score': 0.738}, {'word': 'Engineering', 'start': 7.419, 'end': 7.959, 'score': 0.94}, {'word': 'for', 'start': 7.959, 'end': 8.259, 'score': 0.915}, {'word': 'Developers.', 'start': 8.259, 'end': 8.919, 'score': 0.898}]}, {'start': 9.499, 'end': 14.62, 'text': "I'm thrilled to have with me Iza Fulford to teach this along with me.", 'words': [{'word': "I'm", 'start': 9.499, 'end': 9.639, 'score': 0.608}, {'word': 'thrilled', 'start': 9.

In [None]:
# Word-level entries of the aligned result; passed to get_words_speaker_mapping in the diarization cell.
word_timestamps = result["word_segments"]

In [None]:
result["word_segments"][-1]

{'word': 'video.', 'start': 386.86, 'end': 387.14, 'score': 0.694}

In [None]:
import argparse
import os
from faster_whisper import WhisperModel
import whisperx
import torch
import librosa
import soundfile
from nemo.collections.asr.models.msdd_models import NeuralDiarizer
from deepmultilingualpunctuation import PunctuationModel
import re

In [None]:
# Convert the audio to mono for NeMo compatibility.
# NOTE(review): relies on librosa.load returning a mono signal by default — confirm.
signal, sample_rate = librosa.load(audio_file, sr=None)
ROOT = os.getcwd()
temp_path = os.path.join(ROOT, "temp_outputs")
if not os.path.exists(temp_path):
    os.mkdir(temp_path)
# NOTE(review): the working directory stays at temp_outputs after this cell; the later
# `%cd ..` cell switches back. Re-running this cell before that would resolve audio_file
# (a relative path) against temp_outputs instead of the original directory.
os.chdir(temp_path)
soundfile.write("mono_file.wav", signal, sample_rate, "PCM_24")

# Initialize NeMo MSDD diarization model
# create_config() writes its manifest and output folders relative to the current
# directory, i.e. inside temp_outputs.
msdd_model = NeuralDiarizer(cfg=create_config()).to("cuda")
msdd_model.diarize()

# Release the diarization model and cached GPU memory before the next steps.
del msdd_model
torch.cuda.empty_cache()

# Read the predicted RTTM file into a list of [start, end, speaker] turns.

output_dir = "nemo_outputs"

speaker_ts = []
with open(f"{output_dir}/pred_rttms/mono_file.rttm", "r") as f:
    lines = f.readlines()
    for line in lines:
        # NOTE(review): the field positions 5, 8 and 11 depend on the exact single-space
        # layout of the RTTM lines — verify against the file NeMo actually writes.
        line_list = line.split(" ")
        # Field 5 is used as the start and field 8 as the duration; both are scaled by 1000.
        s = int(float(line_list[5]) * 1000)
        e = s + int(float(line_list[8]) * 1000)
        # The speaker id is the integer after the last "_" in field 11.
        speaker_ts.append([s, e, int(line_list[11].split("_")[-1])])

# Label each aligned word with a speaker, anchoring on the word's start time.
wsm = get_words_speaker_mapping(word_timestamps, speaker_ts, "start")



[NeMo I 2023-05-11 16:56:55 msdd_models:1092] Loading pretrained diar_msdd_telephonic model from NGC
[NeMo I 2023-05-11 16:56:55 cloud:68] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemo/diar_msdd_telephonic/versions/1.0.1/files/diar_msdd_telephonic.nemo to /root/.cache/torch/NeMo/NeMo_1.17.0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo
[NeMo I 2023-05-11 16:56:59 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2023-05-11 16:57:01 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: true
    
[NeMo W 2023-05-11 16:57:01 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: false
    
[NeMo W 2023-05-11 16:57:01 modelPT:174] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple

[NeMo I 2023-05-11 16:57:01 features:287] PADDING: 16
[NeMo I 2023-05-11 16:57:01 features:287] PADDING: 16
[NeMo I 2023-05-11 16:57:02 save_restore_connector:247] Model EncDecDiarLabelModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.17.0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo.
[NeMo I 2023-05-11 16:57:02 features:287] PADDING: 16
[NeMo I 2023-05-11 16:57:03 clustering_diarizer:127] Loading pretrained vad_multilingual_marblenet model from NGC
[NeMo I 2023-05-11 16:57:03 cloud:68] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemo/vad_multilingual_marblenet/versions/1.10.0/files/vad_multilingual_marblenet.nemo to /root/.cache/torch/NeMo/NeMo_1.17.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo
[NeMo I 2023-05-11 16:57:04 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2023-05-11 16:57:04 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/ami_train_0.63.json,/manifests/freesound_background_train.json,/manifests/freesound_laughter_train.json,/manifests/fisher_2004_background.json,/manifests/fisher_2004_speech_sampled.json,/manifests/google_train_manifest.json,/manifests/icsi_all_0.63.json,/manifests/musan_freesound_train.json,/manifests/musan_music_train.json,/manifests/musan_soundbible_train.json,/manifests/mandarin_train_sample.json,/manifests/german_train_sample.json,/manifests/spanish_train_sample.json,/manifests/french_train_sample.json,/manifests/russian_train_sample.json
    sample_rate: 16000
    labels:
    - background
    - speech
    batch_size: 256
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: sca

[NeMo I 2023-05-11 16:57:04 features:287] PADDING: 16
[NeMo I 2023-05-11 16:57:05 save_restore_connector:247] Model EncDecClassificationModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.17.0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2023-05-11 16:57:05 msdd_models:864] Multiscale Weights: [1, 1, 1, 1, 1]
[NeMo I 2023-05-11 16:57:05 msdd_models:865] Clustering Parameters: {
        "oracle_num_speakers": false,
        "max_num_speakers": 8,
        "enhanced_count_thres": 80,
        "max_rp_threshold": 0.25,
        "sparse_search_volume": 30,
        "maj_vote_spk_count": false
    }
[NeMo I 2023-05-11 16:57:05 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2023-05-11 16:57:05 clustering_diarizer:309] Split long audio file to avoid CUDA memory issue


splitting manifest: 100%|██████████| 1/1 [00:00<00:00,  1.75it/s]

[NeMo I 2023-05-11 16:57:05 classification_models:263] Perform streaming frame-level VAD
[NeMo I 2023-05-11 16:57:05 collections:298] Filtered duration for loading collection is 0.000000.
[NeMo I 2023-05-11 16:57:05 collections:301] Dataset loaded with 8 items, total duration of  0.11 hours.
[NeMo I 2023-05-11 16:57:05 collections:303] # 8 files loaded accounting to # 1 labels



vad: 100%|██████████| 8/8 [00:02<00:00,  3.42it/s]

[NeMo I 2023-05-11 16:57:08 clustering_diarizer:250] Generating predictions with overlapping input segments



                                                               

[NeMo I 2023-05-11 16:57:11 clustering_diarizer:262] Converting frame level prediction to speech/no-speech segment in start and end times format.


creating speech segments: 100%|██████████| 1/1 [00:00<00:00,  3.04it/s]

[NeMo I 2023-05-11 16:57:11 clustering_diarizer:287] Subsegmentation for embedding extraction: scale0, nemo_outputs/speaker_outputs/subsegments_scale0.json
[NeMo I 2023-05-11 16:57:11 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2023-05-11 16:57:11 collections:298] Filtered duration for loading collection is 0.000000.
[NeMo I 2023-05-11 16:57:11 collections:301] Dataset loaded with 365 items, total duration of  0.13 hours.
[NeMo I 2023-05-11 16:57:11 collections:303] # 365 files loaded accounting to # 1 labels



[1/5] extract embeddings: 100%|██████████| 6/6 [00:01<00:00,  5.69it/s]

[NeMo I 2023-05-11 16:57:13 clustering_diarizer:389] Saved embedding files to nemo_outputs/speaker_outputs/embeddings
[NeMo I 2023-05-11 16:57:13 clustering_diarizer:287] Subsegmentation for embedding extraction: scale1, nemo_outputs/speaker_outputs/subsegments_scale1.json





[NeMo I 2023-05-11 16:57:13 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2023-05-11 16:57:13 collections:298] Filtered duration for loading collection is 0.000000.
[NeMo I 2023-05-11 16:57:13 collections:301] Dataset loaded with 444 items, total duration of  0.13 hours.
[NeMo I 2023-05-11 16:57:13 collections:303] # 444 files loaded accounting to # 1 labels


[2/5] extract embeddings: 100%|██████████| 7/7 [00:01<00:00,  6.58it/s]

[NeMo I 2023-05-11 16:57:14 clustering_diarizer:389] Saved embedding files to nemo_outputs/speaker_outputs/embeddings





[NeMo I 2023-05-11 16:57:14 clustering_diarizer:287] Subsegmentation for embedding extraction: scale2, nemo_outputs/speaker_outputs/subsegments_scale2.json
[NeMo I 2023-05-11 16:57:14 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2023-05-11 16:57:14 collections:298] Filtered duration for loading collection is 0.000000.
[NeMo I 2023-05-11 16:57:14 collections:301] Dataset loaded with 554 items, total duration of  0.14 hours.
[NeMo I 2023-05-11 16:57:14 collections:303] # 554 files loaded accounting to # 1 labels


[3/5] extract embeddings: 100%|██████████| 9/9 [00:01<00:00,  7.45it/s]

[NeMo I 2023-05-11 16:57:15 clustering_diarizer:389] Saved embedding files to nemo_outputs/speaker_outputs/embeddings
[NeMo I 2023-05-11 16:57:15 clustering_diarizer:287] Subsegmentation for embedding extraction: scale3, nemo_outputs/speaker_outputs/subsegments_scale3.json
[NeMo I 2023-05-11 16:57:15 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2023-05-11 16:57:15 collections:298] Filtered duration for loading collection is 0.000000.
[NeMo I 2023-05-11 16:57:15 collections:301] Dataset loaded with 745 items, total duration of  0.15 hours.
[NeMo I 2023-05-11 16:57:15 collections:303] # 745 files loaded accounting to # 1 labels



[4/5] extract embeddings: 100%|██████████| 12/12 [00:01<00:00,  7.62it/s]

[NeMo I 2023-05-11 16:57:17 clustering_diarizer:389] Saved embedding files to nemo_outputs/speaker_outputs/embeddings
[NeMo I 2023-05-11 16:57:17 clustering_diarizer:287] Subsegmentation for embedding extraction: scale4, nemo_outputs/speaker_outputs/subsegments_scale4.json





[NeMo I 2023-05-11 16:57:17 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2023-05-11 16:57:17 collections:298] Filtered duration for loading collection is 0.000000.
[NeMo I 2023-05-11 16:57:17 collections:301] Dataset loaded with 1152 items, total duration of  0.15 hours.
[NeMo I 2023-05-11 16:57:17 collections:303] # 1152 files loaded accounting to # 1 labels


[5/5] extract embeddings: 100%|██████████| 18/18 [00:02<00:00,  6.66it/s]


[NeMo I 2023-05-11 16:57:20 clustering_diarizer:389] Saved embedding files to nemo_outputs/speaker_outputs/embeddings


clustering: 100%|██████████| 1/1 [00:06<00:00,  6.10s/it]

[NeMo I 2023-05-11 16:57:26 clustering_diarizer:464] Outputs are saved in /content/temp_outputs/nemo_outputs directory



[NeMo W 2023-05-11 16:57:26 der:106] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2023-05-11 16:57:26 msdd_models:960] Loading embedding pickle file of scale:0 at nemo_outputs/speaker_outputs/embeddings/subsegments_scale0_embeddings.pkl
[NeMo I 2023-05-11 16:57:26 msdd_models:960] Loading embedding pickle file of scale:1 at nemo_outputs/speaker_outputs/embeddings/subsegments_scale1_embeddings.pkl
[NeMo I 2023-05-11 16:57:26 msdd_models:960] Loading embedding pickle file of scale:2 at nemo_outputs/speaker_outputs/embeddings/subsegments_scale2_embeddings.pkl
[NeMo I 2023-05-11 16:57:26 msdd_models:960] Loading embedding pickle file of scale:3 at nemo_outputs/speaker_outputs/embeddings/subsegments_scale3_embeddings.pkl
[NeMo I 2023-05-11 16:57:26 msdd_models:960] Loading embedding pickle file of scale:4 at nemo_outputs/speaker_outputs/embeddings/subsegments_scale4_embeddings.pkl
[NeMo I 2023-05-11 16:57:26 msdd_models:938] Loading cluster label file from nemo_outputs/speaker_outputs/subsegments_scale4_cluster.label
[NeMo I 2023-05-11 16:57:26 collections:612] F

100%|██████████| 1/1 [00:00<00:00, 25.69it/s]

[NeMo I 2023-05-11 16:57:26 msdd_models:1403]      [Threshold: 0.7000] [use_clus_as_main=False] [diar_window=50]
[NeMo I 2023-05-11 16:57:26 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2023-05-11 16:57:26 speaker_utils:93] Number of files to diarize: 1



[NeMo W 2023-05-11 16:57:26 der:106] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2023-05-11 16:57:26 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2023-05-11 16:57:26 der:106] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2023-05-11 16:57:26 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2023-05-11 16:57:26 der:106] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2023-05-11 16:57:26 msdd_models:1431]   
    


In [None]:
wsm

In [None]:
result["segments"]

In [None]:
# Restore sentence-ending punctuation (for supported languages), realign speaker labels
# to sentence boundaries, then group the words into per-speaker sentences.
# NOTE(review): this cell mutates the dicts inside wsm and then rebinds wsm, so running it
# twice operates on already-processed data.
if result1["language"] in punct_model_langs:
    # restoring punctuation in the transcript to help realign the sentences
    punct_model = PunctuationModel(model="kredor/punctuate-all")

    words_list = list(map(lambda x: x["word"], wsm))

    # One prediction per word; index 1 of each prediction is read below as the
    # punctuation label (presumably (word, label, score) tuples — TODO confirm).
    labled_words = punct_model.predict(words_list)

    ending_puncts = ".?!"
    model_puncts = ".,;:!?"

    # Matches dotted acronyms such as "U.S.A." (two or more letter+period pairs), so a
    # predicted sentence end can still be appended to them.
    is_acronym = lambda x: re.fullmatch(r"\b(?:[a-zA-Z]\.){2,}", x)

    for word_dict, labeled_tuple in zip(wsm, labled_words):
        word = word_dict["word"]
        # Only add the predicted mark when it ends a sentence and the word is not
        # already punctuated (dotted acronyms are the exception).
        if (
                word
                and labeled_tuple[1] in ending_puncts
                and (word[-1] not in model_puncts or is_acronym(word))
        ):
            word += labeled_tuple[1]
            # NOTE(review): rstrip(".") removes every trailing period, so "U.S.A." plus a
            # predicted "." ends up as "U.S.A" — confirm this is intended.
            if word.endswith(".."):
                word = word.rstrip(".")
            word_dict["word"] = word

    wsm = get_realigned_ws_mapping_with_punctuation(wsm)
else:
    print(
        f'Punctuation restoration is not available for {result1["language"]} language.'
    )

# Built in both branches, from whichever wsm results above.
ssm = get_sentences_speaker_mapping(wsm, speaker_ts)

In [None]:
ssm

In [None]:
result["segments"]

In [None]:
# Regroup the speaker-labelled words (wsm) into the aligned whisperx segments.
# wsm is consumed in order, so it must contain exactly the words of
# result["segments"], in the same order.
pos = 0
out_f = []
for segment in result["segments"]:
    words = segment["words"]
    if not words:
        # No aligned words means no wsm entries and no speaker for this segment.
        # Skipping it avoids reusing the previous segment's speaker and mixing
        # the segment's second-based start/end with wsm's millisecond times.
        continue

    # Plain indexing (not slicing) so a wsm that is too short still fails loudly.
    segment_words = [wsm[pos + offset] for offset in range(len(words))]
    pos += len(words)

    out_f.append({
        "start_time": segment_words[0]["start_time"],
        "end_time": segment_words[-1]["end_time"],
        "text": " ".join(entry["word"] for entry in segment_words),
        # The segment takes the speaker of its last word.
        "speaker": segment_words[-1]["speaker"],
    })

In [None]:
out_f

In [None]:
# The audio clips of each speaker are joined together (done with pydub below).
from pydub import AudioSegment

In [None]:
# Move up one directory; in the recorded run this lands in /content.
# NOTE(review): a relative cd is not idempotent — re-running this cell moves up again.
%cd ..

/content


In [None]:
# Build one reference recording per speaker for voice cloning: every segment
# attributed to a speaker is cut from the source audio and the cuts are joined
# with 500 ms of silence between them, then written to "speaker<ID>.wav".
base_clone_voice = {}
silencio = AudioSegment.silent(duration=500)

# Decode the source file once; it does not change between iterations.
audio = AudioSegment.from_wav(audio_file)

for speaker_data in out_f:
  # Cut the audio from start_time to end_time (pydub slices by milliseconds).
  cut_audio = audio[speaker_data["start_time"]:speaker_data["end_time"]]
  id_speaker = speaker_data["speaker"]
  if id_speaker in base_clone_voice:
    base_clone_voice[id_speaker] += silencio + cut_audio
  else:
    base_clone_voice[id_speaker] = cut_audio

for id_speaker, speaker_audio in base_clone_voice.items():
  # export() returns the open output file; close it so handles don't pile up.
  speaker_audio.export("speaker"+str(id_speaker)+".wav", format="wav").close()

In [None]:
# Install the TTS and argostranslate packages used by the translation/dubbing cells.
# The recorded output shows TTS 0.13.3 and argostranslate 1.8.0 being downloaded.
# NOTE(review): consider pinning those versions so a fresh run resolves the same environment.
!pip install TTS
!pip install argostranslate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting TTS
  Downloading TTS-0.13.3-cp310-cp310-manylinux1_x86_64.whl (655 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m655.3/655.3 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cython==0.29.28 (from TTS)
  Downloading Cython-0.29.28-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m53.5 MB/s[0m eta [36m0:00:00[0m
Collecting inflect==5.6.0 (from TTS)
  Downloading inflect-5.6.0-py3-none-any.whl (33 kB)
Collecting anyascii (from TTS)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
Collecting pysbd (from TTS)
  Downloading pysbd-0.3.4-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting argostranslate
  Downloading argostranslate-1.8.0-py3-none-any.whl (27 kB)
Collecting ctranslate2==2.24.0 (from argostranslate)
  Downloading ctranslate2-2.24.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece==0.1.96 (from argostranslate)
  Downloading sentencepiece-0.1.96-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m58.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting stanza==1.1.1 (from argostranslate)
  Downloading stanza-1.1.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages

In [None]:
import gc

# Free the memory held by `model` (loaded earlier in the notebook) before the
# translation and TTS models are loaded.
# The reference must be dropped *before* collecting and emptying the CUDA
# cache; in the previous order the cleanup ran while the model was still
# alive, so its memory could not be released by it.
# globals().pop(...) is used instead of `del model` so the cell can be re-run
# without raising NameError once the model is already gone.
globals().pop("model", None)
gc.collect()
torch.cuda.empty_cache()

In [None]:
import argostranslate.package
import argostranslate.translate

# Translate from the language detected for the transcript into Spanish.
from_code = result1["language"]
to_code = "es"

# Download and install Argos Translate package
argostranslate.package.update_package_index()
available_packages = argostranslate.package.get_available_packages()

# Pick the first package matching the language pair. A default of None is
# used so that a missing pair produces a readable error instead of a bare
# StopIteration from next().
package_to_install = next(
    (
        package
        for package in available_packages
        if package.from_code == from_code and package.to_code == to_code
    ),
    None,
)
if package_to_install is None:
    raise LookupError(
        f"No Argos Translate package available for {from_code} -> {to_code}."
    )
argostranslate.package.install_from_path(package_to_install.download())

In [None]:
from TTS.api import TTS
# Load the model named tts_models/es/css10/vits (downloaded on first use, per the
# recorded output). The "es" in the name matches the `to_code = "es"` target.
# NOTE(review): the model name is hard-coded, so changing `to_code` means changing it here too.
tts = TTS("tts_models/es/css10/vits")

 > Downloading model to /root/.local/share/tts/tts_models--es--css10--vits


100%|██████████| 101M/101M [00:02<00:00, 38.8MiB/s] 


 > Model's license - bsd-3-clause
 > Check https://opensource.org/licenses for more info.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > initialization of speaker-embedding layers.
 > initialization of language-embedding layers.


In [None]:
from pydub import effects

In [None]:
def speed_change(sound, speed=1.0):
    """Return a copy of ``sound`` played ``speed`` times as fast.

    The raw samples are first re-labelled with a frame rate of
    ``int(sound.frame_rate * speed)`` and the result is then converted back to
    the original frame rate with ``set_frame_rate``, so the returned audio
    keeps the input's nominal frame rate.

    NOTE(review): the samples are reinterpreted rather than time-stretched, so
    pitch presumably shifts together with tempo — confirm this is acceptable.

    Args:
        sound: audio object exposing ``raw_data``, ``frame_rate``,
            ``_spawn(data, overrides=...)`` and ``set_frame_rate(rate)``
            (a pydub ``AudioSegment`` in this notebook).
        speed: playback-rate multiplier; values above 1 shorten the audio,
            values below 1 lengthen it.

    Returns:
        Whatever ``set_frame_rate`` returns for the re-labelled audio.

    Raises:
        ValueError: if ``speed`` yields a frame rate below 1 (zero, negative,
            or too small a multiplier), which cannot be resampled.
    """
    altered_rate = int(sound.frame_rate * speed)
    if altered_rate < 1:
        raise ValueError(
            f"speed={speed!r} gives an invalid frame rate ({altered_rate}); "
            "speed must be a positive number"
        )

    # Manually override the frame_rate. This tells the computer how many
    # samples to play per second.
    relabelled = sound._spawn(sound.raw_data, overrides={"frame_rate": altered_rate})

    # Convert back to the original frame rate so that regular playback
    # programs, which often only handle standard rates, work right.
    return relabelled.set_frame_rate(sound.frame_rate)

In [None]:
# Dub every segment: translate its text, synthesise it with the speaker's
# cloned voice, and change the clip's speed so that it lasts about as long as
# the original segment. The resulting clips are collected in `voices`.
voices = []
for speach in out_f:
  # Start and end time in milliseconds
  start_ms = speach['start_time']
  end_ms = speach['end_time']
  speaker = speach['speaker']

  # The source audio is not needed here: the previous per-iteration decode and
  # slice of `audio_file` produced a value that was never used, so it is gone.

  # Translate
  translatedText = argostranslate.translate.translate(speach["text"], from_code, to_code)
  print(translatedText)

  # Duration of the original segment, in seconds.
  tiempo_audio = (end_ms - start_ms) / 1000

  # Synthesise the translated text using the speaker's reference recording.
  tts.tts_with_vc_to_file(
      translatedText,
      speaker_wav="speaker"+str(speaker)+".wav",
      file_path="ouptut.wav"
  )
  voicedub = AudioSegment.from_wav("ouptut.wav")
  duracion_actual = voicedub.duration_seconds

  # Ratio of synthesised length to original length. Zero-length segments are
  # left unchanged (this used to raise ZeroDivisionError), as are ratios that
  # round to 0, which speed_change cannot apply.
  if tiempo_audio > 0:
    velocidad = round(duracion_actual/tiempo_audio, 2)
    if velocidad > 0 and velocidad != 1:
      print("cambiando velocidad: "+str(velocidad))
      voicedub = speed_change(voicedub, velocidad)

  voices.append({
      "start_time":start_ms,
      "end_time":end_ms,
      "segment":voicedub
  })
  # Release per-iteration garbage and cached GPU memory before the next clip.
  gc.collect(); torch.cuda.empty_cache()

Bienvenidos a este curso sobre ChatGPT Prompt Engineering para Desarrolladores.
 > Text splitted to sentences.
['Bienvenidos a este curso sobre ChatGPT Prompt Engineering para Desarrolladores.']
 > Processing time: 2.222442865371704
 > Real-time factor: 0.37237739499579087
cambiando velocidad: 1.62
Estoy emocionado de tener conmigo Iza Fulford para enseñar esto junto conmigo.
 > Text splitted to sentences.
['Estoy emocionado de tener conmigo Iza Fulford para enseñar esto junto conmigo.']
 > Processing time: 0.8036644458770752
 > Real-time factor: 0.13926192185016276
cambiando velocidad: 1.25
Es miembro del personal técnico de OpenAI y había construido el popular plugin de ChatGPT Retrieval.
 > Text splitted to sentences.
['Es miembro del personal técnico de OpenAI y había construido el popular plugin de ChatGPT Retrieval.']
 > Processing time: 1.0344252586364746
 > Real-time factor: 0.13477992904967304
cambiando velocidad: 1.28
Y una gran parte de su trabajo ha estado enseñando a la ge

In [None]:
# Lay the dubbed clips on a single timeline.
# Before each clip, silence is added until the track reaches the clip's
# "start_time" (len() of a pydub segment is its length in milliseconds).
# Padding against the real length of the track built so far — rather than
# against the original end timestamps — stops small differences between each
# clip's length and its original slot from accumulating as drift.
# If the previous clip overran the next start time, the next clip simply
# follows it without a gap. An empty `voices` list yields an empty track
# instead of an IndexError.
audio_final = AudioSegment.silent(duration=0)
for voice_clip in voices:
  gap_ms = voice_clip["start_time"] - len(audio_final)
  if gap_ms > 0:
    audio_final += AudioSegment.silent(duration=gap_ms)
  audio_final += voice_clip["segment"]

In [None]:
# Write the dubbed track to disk. export() returns the open output file, so it
# is closed here rather than being left open as the cell's displayed result.
audio_final.export("audio_final.wav", format="wav").close()

<_io.BufferedRandom name='audio_final.wav'>

In [None]:
# Length of the dubbed track in seconds (about 387 s in the recorded run).
audio_final.duration_seconds

386.97833333333335

In [None]:
from IPython.display import Audio
# Embed a player for the exported file; autoplay is off so it only plays on demand.
wn = Audio('audio_final.wav', autoplay=False)
display(wn)

In [None]:
# Mux the dubbed track onto the original video:
#   -map 0:v / -map 1:a  take video from video.mp4 and audio from audio_final.wav
#   -c:v copy            keep the video stream as is (no re-encode)
#   -c:a aac             encode the audio as AAC
#   -shortest            stop at the end of the shorter of the two streams
# NOTE(review): there is no -y flag, so re-running with an existing
# output_video.mp4 will make ffmpeg ask before overwriting.
!ffmpeg -i video.mp4 -i audio_final.wav -map 0:v -map 1:a -c:v copy -c:a aac -shortest output_video.mp4

ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --e