In [None]:
import numpy as np
from scipy.io import wavfile
import soundfile as sf

# Read the recording; wavfile.read returns the sample rate and the sample array.
sample_rate, audio_data = wavfile.read('english_long.wav')

# Only a 2-D array with exactly two columns is treated as stereo.
is_stereo = audio_data.ndim == 2 and audio_data.shape[1] == 2

if not is_stereo:
    print("The input file is not stereo. Please check the audio file.")
else:
    # Column 0 is written as channel 1 (left), column 1 as channel 2 (right).
    left_channel, right_channel = audio_data[:, 0], audio_data[:, 1]

    channel_files = (
        ('english_long_ch1.wav', left_channel),
        ('english_long_ch2.wav', right_channel),
    )
    for channel_path, channel_samples in channel_files:
        wavfile.write(channel_path, sample_rate, channel_samples)

    print("Successfully saved both channels as separate files.")


# Debugging

In [None]:
import argparse
import torch
from whisperx.utils import LANGUAGES, TO_LANGUAGE_CODE, optional_float, optional_int, str2bool

# Create an argument parser with default values
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
parser.add_argument("--model", default="openai/whisper-large-v3", help="name of the Whisper model to use")
parser.add_argument("--model_cache_only", type=str2bool, default=False, help="If True, will not attempt to download models, instead using cached models from --model_dir")
parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
parser.add_argument("--device_index", default=0, type=int, help="device index to use for FasterWhisper inference")
parser.add_argument("--batch_size", default=4, type=int, help="the preferred batch size for inference")
parser.add_argument("--compute_type", default="float16", type=str, choices=["float16", "float32", "int8"], help="compute type for computation")
parser.add_argument("--use_openai_whisper", type=str2bool, default=False, help="use OpenAI Whisper from Hugging Face instead of Faster Whisper")

parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
parser.add_argument("--output_format", "-f", type=str, default="all", choices=["all", "srt", "vtt", "txt", "tsv", "json", "aud"], help="format of the output file; if not specified, all available formats will be produced")
parser.add_argument("--verbose", type=str2bool, default=None, help="whether to print out the progress and debug messages")

parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES.keys()) + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]), help="language spoken in the audio, specify None to perform language detection")

# alignment params
parser.add_argument("--align_model", default=None, help="Name of phoneme-level ASR model to do alignment")
parser.add_argument("--interpolate_method", default="nearest", choices=["nearest", "linear", "ignore"], help="For word .srt, method to assign timestamps to non-aligned words, or merge them into neighbouring.")
parser.add_argument("--no_align", action='store_true', help="Do not perform phoneme alignment")
parser.add_argument("--return_char_alignments", action='store_true', help="Return character-level alignments in the output json file")

# vad params
parser.add_argument("--vad_method", type=str, default="silero_custom", choices=["pyannote", "silero", "silero_custom"], help="VAD method to be used")
parser.add_argument("--vad_onset", type=float, default=0.500, help="Onset threshold for VAD (see pyannote.audio), reduce this if speech is not being detected")
parser.add_argument("--vad_offset", type=float, default=0.363, help="Offset threshold for VAD (see pyannote.audio), reduce this if speech is not being detected.")
parser.add_argument("--chunk_size", type=int, default=30, help="Chunk size for merging VAD segments. Default is 30, reduce this if the chunk is too long.")
parser.add_argument("--vad_onnx", type=str2bool, default=True, help="If `True`, use the ONNX version of the Silero VAD model.")
parser.add_argument("--silero_merge_cutoff", type=float, default=0.1, help="The merge cutoff for the Silero VAD model.")

# diarization params
parser.add_argument("--diarize", action="store_true", help="Apply diarization to assign speaker labels to each segment/word")
parser.add_argument("--min_speakers", default=None, type=int, help="Minimum number of speakers to in audio file")
parser.add_argument("--max_speakers", default=None, type=int, help="Maximum number of speakers to in audio file")

parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
parser.add_argument("--beam_size", type=optional_int, default=5, help="number of beams in beam search, only applicable when temperature is zero")
parser.add_argument("--patience", type=float, default=1.0, help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search")
parser.add_argument("--length_penalty", type=float, default=1.0, help="optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default")

parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
parser.add_argument("--suppress_numerals", action="store_true", help="whether to suppress numeric symbols and currency symbols during sampling, since wav2vec2 cannot align them correctly")

parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.")
parser.add_argument("--condition_on_previous_text", type=str2bool, default=False, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default")

parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=0.2, help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below")
parser.add_argument("--compression_ratio_threshold", type=optional_float, default=2.4, help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0, help="if the average log probability is lower than this value, treat the decoding as failed")
parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6, help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")

# Subtitle layout options; per their help text none of them apply with --no_align.
parser.add_argument("--max_line_width", type=optional_int, default=None, help="(not possible with --no_align) the maximum number of characters in a line before breaking the line")
parser.add_argument("--max_line_count", type=optional_int, default=None, help="(not possible with --no_align) the maximum number of lines in a segment")
parser.add_argument("--highlight_words", type=str2bool, default=False, help="(not possible with --no_align) underline each word as it is spoken in srt and vtt")
# The help text previously repeated the --max_line_width description, which does not
# describe this option. NOTE(review): wording derived from the option name and its
# choices only; confirm against the alignment code.
parser.add_argument("--segment_resolution", type=str, default="sentence", choices=["sentence", "chunk"], help="(not possible with --no_align) resolution of the output segments, either 'sentence' or 'chunk'")

parser.add_argument("--threads", type=optional_int, default=0, help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS")

parser.add_argument("--hf_token", type=str, default=None, help="Hugging Face Access Token to access PyAnnote gated models")

parser.add_argument("--print_progress", type=str2bool, default=False, help="if True, progress will be printed in transcribe() and align() methods.")

# Parse with only the positional audio argument so every option takes its declared default.
args = parser.parse_args(['english_long_ch1.wav'])

# Notebook-specific overrides applied on top of the parsed defaults.
notebook_overrides = {
    "model": "openai/whisper-large-v3",
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "use_openai_whisper": True,
    "compute_type": "float16",
    "language": "en",
    "verbose": True,
}
for option_name, option_value in notebook_overrides.items():
    setattr(args, option_name, option_value)

# Model selection and loading.
model_name: str = args.model
model_dir: str = args.model_dir
model_cache_only: bool = args.model_cache_only
use_openai_whisper: bool = args.use_openai_whisper

# Inference settings.
batch_size: int = args.batch_size
device: str = args.device
device_index: int = args.device_index
compute_type: str = args.compute_type

# Output and logging.
output_dir: str = args.output_dir
output_format: str = args.output_format
verbose: bool = args.verbose

In [2]:
# Use the device already resolved on `args` (CUDA when available, otherwise CPU).
# The previous hard-coded "cuda" discarded that check and fails on machines
# without a usable GPU.
device = args.device

# Decoding options handed to the model loader; --suppress_tokens arrives as a
# comma-separated string and is converted to a list of ints here.
asr_options = {
    "beam_size": args.beam_size,
    "patience": args.patience,
    "length_penalty": args.length_penalty,
    "suppress_tokens": [int(x) for x in args.suppress_tokens.split(",")],
    "suppress_numerals": args.suppress_numerals,
}


# Voice-activity-detection settings.
hf_token: str = args.hf_token
vad_method: str = args.vad_method
vad_onset: float = args.vad_onset
vad_offset: float = args.vad_offset
chunk_size: int = args.chunk_size

# Diarization and progress settings.
diarize: bool = args.diarize
min_speakers: int = args.min_speakers
max_speakers: int = args.max_speakers
print_progress: bool = args.print_progress
task = args.task


In [None]:
from whisperx.transcribe import *

# Gather the loader settings in one mapping, then load the model with them.
# `language=None` leaves the language unset at load time.
vad_settings = {"chunk_size": chunk_size, "vad_onset": vad_onset, "vad_offset": vad_offset}
openai_loader_kwargs = dict(
    device=device,
    device_index=device_index,
    download_root=model_dir,
    compute_type=compute_type,
    language=None,
    asr_options=asr_options,
    vad_method=vad_method,
    vad_options=vad_settings,
    task=task,
    local_files_only=model_cache_only,
    return_token_probabilities=use_openai_whisper,
)
model = load_openai_model(model_name, **openai_loader_kwargs)

In [None]:
model.device

In [5]:
from whisperx.asr import *

In [6]:
# Alternative input: "examples/english_long_ch1.wav"
audio = "examples/hindi.wav"
# A path is replaced by the samples returned from load_audio; anything else is kept.
audio = load_audio(audio) if isinstance(audio, str) else audio

In [7]:
# Choose where the pre-processing and chunk-merging routines come from: the model's
# own VAD object when its type derives from Vad, otherwise the Pyannote class.
vad_routines = model.vad_model if issubclass(type(model.vad_model), Vad) else Pyannote
waveform = vad_routines.preprocess_audio(audio)
merge_chunks = vad_routines.merge_chunks

In [8]:
# Run the VAD model on the waveform, then merge its output into chunks using the
# onset/offset thresholds stored on the model.
raw_vad_output = model.vad_model({"waveform": waveform, "sample_rate": SAMPLE_RATE})
vad_thresholds = model._vad_params
vad_segments = merge_chunks(
    raw_vad_output,
    chunk_size,
    onset=vad_thresholds["vad_onset"],
    offset=vad_thresholds["vad_offset"],
)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def _plot_histogram(values, xlabel, title, color):
    """Show a 20-bin histogram of ``values`` with the styling shared by both plots.

    Args:
        values: numbers to bin.
        xlabel: label for the x axis.
        title: figure title.
        color: bar fill colour passed to ``plt.hist``.
    """
    plt.figure(figsize=(10, 6))
    plt.hist(values, bins=20, alpha=0.7, color=color, edgecolor='black')
    plt.xlabel(xlabel)
    plt.ylabel('Frequency')
    plt.title(title)
    plt.grid(alpha=0.3)
    plt.show()


# Gaps of this many seconds or more are left out of the gap histogram.
MAX_PLOTTED_GAP_SECONDS = 1

# Duration of every VAD segment, rounded to two decimals.
durations = [round(seg["end"] - seg["start"], 2) for seg in vad_segments]
_plot_histogram(
    durations,
    'Segment Duration (seconds)',
    'Histogram of VAD Segment Durations',
    'skyblue',
)

# Gap between each segment's start and the previous segment's end.
gaps = [
    round(following["start"] - preceding["end"], 2)
    for preceding, following in zip(vad_segments, vad_segments[1:])
]
gaps = [gap for gap in gaps if gap < MAX_PLOTTED_GAP_SECONDS]
_plot_histogram(
    gaps,
    'Gap Duration (seconds)',
    'Histogram of Gaps Between VAD Segments',
    'lightgreen',
)

In [None]:
from tqdm import tqdm
# One detect_language result per VAD segment, in segment order. Segment bounds are
# multiplied by SAMPLE_RATE and truncated to get the slice indices into `audio`.
languages = [
    model.detect_language(
        audio[int(segment['start'] * SAMPLE_RATE):int(segment['end'] * SAMPLE_RATE)]
    )
    for segment in tqdm(vad_segments)
]

In [11]:
def data(audio, segments):
    """Slice ``audio`` into one chunk per segment.

    Each segment's 'start' and 'end' are multiplied by SAMPLE_RATE and truncated
    to int to obtain the slice bounds.

    Returns:
        A list of ``{'inputs': chunk}`` dicts, in the order of ``segments``.
    """
    def _bounds(seg):
        return int(seg['start'] * SAMPLE_RATE), int(seg['end'] * SAMPLE_RATE)

    return [{'inputs': audio[slice(*_bounds(seg))]} for seg in segments]

final_segments = data(audio, vad_segments)

In [12]:
# Run the model's audio pre-processing on the samples of every segment.
features_all = list(
    map(lambda segment_entry: model.preprocess_audio(segment_entry["inputs"]), final_segments)
)

In [13]:
# Stack the per-segment features along a new first dimension, then cut that
# dimension into pieces of at most 32 entries.
complete_data = torch.stack(features_all)
batches = complete_data.split(32)

In [None]:
len(batches)

In [15]:
# Decoder prompt ids for every distinct language code found in `languages`
# (the code is the first element of each entry).
detected_codes = set([entry[0] for entry in languages])
decoder_ids = {
    code: model.processor.get_decoder_prompt_ids(
        task="transcribe",
        language=code,
        no_timestamps=model.options.without_timestamps,
    )
    for code in detected_codes
}

# Hindi is always (re)computed, whether or not it was detected.
decoder_ids["hi"] = model.processor.get_decoder_prompt_ids(
    task="transcribe",
    language="hi",
    no_timestamps=model.options.without_timestamps,
)

In [16]:
# Suppress tokens if needed
if model.suppress_numerals:
    suppress_tokens = list(set(model.options.suppress_tokens + model.numeral_symbol_tokens))
else:
    suppress_tokens = model.options.suppress_tokens

In [17]:
# Keyword arguments for generate(), copied from the model's options.
generation_options = model.options
gen_kwargs = dict(
    max_new_tokens=generation_options.max_new_tokens,
    num_beams=generation_options.num_beams,
    num_return_sequences=1,
    temperature=generation_options.temperature,
    repetition_penalty=generation_options.repetition_penalty,
    no_repeat_ngram_size=generation_options.no_repeat_ngram_size,
    length_penalty=generation_options.length_penalty,
    # Both flags follow return_token_probabilities.
    return_dict_in_generate=generation_options.return_token_probabilities,
    output_scores=generation_options.return_token_probabilities,
)

# Only pass a suppress list when there is something in it.
if suppress_tokens:
    gen_kwargs["suppress_tokens"] = suppress_tokens

In [None]:
out = []
from tqdm import tqdm

# `batches` are consecutive slices of the stacked per-segment features and
# `languages` holds one entry per segment in the same order, so each batch must be
# paired with its own slice of `languages`. Previously every batch received the
# prompt ids of ALL segments.
# NOTE(review): confirm that this generate() accepts one forced_decoder_ids entry
# per batch item; if it only accepts a single prompt, group batches by language.
segment_offset = 0
for batch in tqdm(batches):
    batch_languages = languages[segment_offset:segment_offset + len(batch)]
    segment_offset += len(batch)
    with torch.no_grad():
        outputs = model.model.generate(
            batch,
            forced_decoder_ids=[decoder_ids[entry[0]] for entry in batch_languages],
            **gen_kwargs
        )
    out.append(outputs)

In [None]:
# Generate for the first batch only (nothing runs when `batches` is empty).
# forced_decoder_ids=decoder_ids["hi"] is intentionally not passed here.
for batch in batches[:1]:
    with torch.no_grad():
        outputs = model.model.generate(batch, return_timestamps=True, **gen_kwargs)

In [None]:
outputs.keys()

In [None]:
outputs["segments"][0][0].keys()

In [None]:
%%time
model.processor.decode(outputs.sequences[0])

In [52]:
# Pull the token sequences and the per-step scores out of the generate() result.
sequences, scores = outputs.sequences, outputs.scores

In [62]:
# Decode the first sequence of the batch to text, dropping special tokens.
first_sequence_tensor = sequences[0]
sequence = first_sequence_tensor.cpu().tolist()
transcription = model.processor.decode(sequence, skip_special_tokens=True)

In [None]:
transcription

In [None]:
scores[0].shape

In [None]:
len(scores)

In [None]:
[len(i) for i in sequences]

In [None]:
token_probs = []
first_sequence = sequences[0].cpu().tolist()

# `scores` is indexed per step while the sequence may be longer than `scores`
# (the original comment mentions leading special tokens that should be ignored).
# Indexing `scores` with the raw sequence position then pairs tokens with the
# wrong step and runs past the end of `scores`. Skip the leading tokens that have
# no score entry so token `i` of the remainder lines up with `scores[i]`.
# NOTE(review): assumes the extra tokens are all at the start of the sequence, and
# that row 0 of each score tensor belongs to this sequence (with num_beams > 1 it
# may be a different beam) — confirm against the generate() documentation.
unscored_prefix = max(len(first_sequence) - len(scores), 0)

for i, token in enumerate(first_sequence[unscored_prefix:]):
    decoded_token = model.processor.decode([token])
    probs = torch.nn.functional.softmax(scores[i][0], dim=-1)
    token_probs.append({
        "token": decoded_token,
        "probability": round(probs[token].item(), 3)
    })

In [None]:
scores[i][0]

In [None]:
i

In [None]:
len(scores)

# ASR

In [1]:
import argparse
import torch
from whisperx.utils import LANGUAGES, TO_LANGUAGE_CODE, optional_float, optional_int, str2bool

# Create an argument parser with default values
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
parser.add_argument("--model", default="openai/whisper-large-v3", help="name of the Whisper model to use")
parser.add_argument("--model_cache_only", type=str2bool, default=False, help="If True, will not attempt to download models, instead using cached models from --model_dir")
parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
parser.add_argument("--device_index", default=0, type=int, help="device index to use for FasterWhisper inference")
parser.add_argument("--batch_size", default=4, type=int, help="the preferred batch size for inference")
parser.add_argument("--compute_type", default="float16", type=str, choices=["float16", "float32", "int8"], help="compute type for computation")
parser.add_argument("--use_openai_whisper", type=str2bool, default=False, help="use OpenAI Whisper from Hugging Face instead of Faster Whisper")

parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
parser.add_argument("--output_format", "-f", type=str, default="all", choices=["all", "srt", "vtt", "txt", "tsv", "json", "aud"], help="format of the output file; if not specified, all available formats will be produced")
parser.add_argument("--verbose", type=str2bool, default=None, help="whether to print out the progress and debug messages")

parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES.keys()) + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]), help="language spoken in the audio, specify None to perform language detection")

# alignment params
parser.add_argument("--align_model", default=None, help="Name of phoneme-level ASR model to do alignment")
parser.add_argument("--interpolate_method", default="nearest", choices=["nearest", "linear", "ignore"], help="For word .srt, method to assign timestamps to non-aligned words, or merge them into neighbouring.")
parser.add_argument("--no_align", action='store_true', help="Do not perform phoneme alignment")
parser.add_argument("--return_char_alignments", action='store_true', help="Return character-level alignments in the output json file")

# vad params
parser.add_argument("--vad_method", type=str, default="silero_custom", choices=["pyannote", "silero", "silero_custom"], help="VAD method to be used")
parser.add_argument("--vad_onset", type=float, default=0.500, help="Onset threshold for VAD (see pyannote.audio), reduce this if speech is not being detected")
parser.add_argument("--vad_offset", type=float, default=0.363, help="Offset threshold for VAD (see pyannote.audio), reduce this if speech is not being detected.")
parser.add_argument("--chunk_size", type=int, default=30, help="Chunk size for merging VAD segments. Default is 30, reduce this if the chunk is too long.")
parser.add_argument("--vad_onnx", type=str2bool, default=True, help="If `True`, use the ONNX version of the Silero VAD model.")
parser.add_argument("--silero_merge_cutoff", type=float, default=0.1, help="The merge cutoff for the Silero VAD model.")

# diarization params
parser.add_argument("--diarize", action="store_true", help="Apply diarization to assign speaker labels to each segment/word")
parser.add_argument("--min_speakers", default=None, type=int, help="Minimum number of speakers to in audio file")
parser.add_argument("--max_speakers", default=None, type=int, help="Maximum number of speakers to in audio file")

parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
parser.add_argument("--beam_size", type=optional_int, default=5, help="number of beams in beam search, only applicable when temperature is zero")
parser.add_argument("--patience", type=float, default=1.0, help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search")
parser.add_argument("--length_penalty", type=float, default=1.0, help="optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default")

parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
parser.add_argument("--suppress_numerals", action="store_true", help="whether to suppress numeric symbols and currency symbols during sampling, since wav2vec2 cannot align them correctly")

parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.")
parser.add_argument("--condition_on_previous_text", type=str2bool, default=False, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default")

parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=0.2, help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below")
parser.add_argument("--compression_ratio_threshold", type=optional_float, default=2.4, help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0, help="if the average log probability is lower than this value, treat the decoding as failed")
parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6, help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")

# Subtitle layout options; per their help text none of them apply with --no_align.
parser.add_argument("--max_line_width", type=optional_int, default=None, help="(not possible with --no_align) the maximum number of characters in a line before breaking the line")
parser.add_argument("--max_line_count", type=optional_int, default=None, help="(not possible with --no_align) the maximum number of lines in a segment")
parser.add_argument("--highlight_words", type=str2bool, default=False, help="(not possible with --no_align) underline each word as it is spoken in srt and vtt")
# The help text previously repeated the --max_line_width description, which does not
# describe this option. NOTE(review): wording derived from the option name and its
# choices only; confirm against the alignment code.
parser.add_argument("--segment_resolution", type=str, default="sentence", choices=["sentence", "chunk"], help="(not possible with --no_align) resolution of the output segments, either 'sentence' or 'chunk'")

parser.add_argument("--threads", type=optional_int, default=0, help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS")

parser.add_argument("--hf_token", type=str, default=None, help="Hugging Face Access Token to access PyAnnote gated models")

parser.add_argument("--print_progress", type=str2bool, default=False, help="if True, progress will be printed in transcribe() and align() methods.")

# Parse with only the positional audio argument so every option takes its declared default.
args = parser.parse_args(['english_long_ch1.wav'])

# Notebook-specific overrides applied on top of the parsed defaults.
notebook_overrides = {
    "model": "openai/whisper-large-v3",
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "use_openai_whisper": True,
    "compute_type": "float16",
    "language": "en",
    "verbose": True,
}
for option_name, option_value in notebook_overrides.items():
    setattr(args, option_name, option_value)

# Model selection and loading.
model_name: str = args.model
model_dir: str = args.model_dir
model_cache_only: bool = args.model_cache_only
use_openai_whisper: bool = args.use_openai_whisper

# Inference settings.
batch_size: int = args.batch_size
device: str = args.device
device_index: int = args.device_index
compute_type: str = args.compute_type

# Output and logging.
output_dir: str = args.output_dir
output_format: str = args.output_format
verbose: bool = args.verbose

# Use the device already resolved on `args` (CUDA when available, otherwise CPU).
# The previous hard-coded "cuda" discarded that check and fails on machines
# without a usable GPU.
device = args.device

# Decoding options handed to the model loader; --suppress_tokens arrives as a
# comma-separated string and is converted to a list of ints here.
asr_options = {
    "beam_size": args.beam_size,
    "patience": args.patience,
    "length_penalty": args.length_penalty,
    "suppress_tokens": [int(x) for x in args.suppress_tokens.split(",")],
    "suppress_numerals": args.suppress_numerals,
}


# Voice-activity-detection settings.
hf_token: str = args.hf_token
vad_method: str = args.vad_method
vad_onset: float = args.vad_onset
vad_offset: float = args.vad_offset
chunk_size: int = args.chunk_size

# Diarization and progress settings.
diarize: bool = args.diarize
min_speakers: int = args.min_speakers
max_speakers: int = args.max_speakers
print_progress: bool = args.print_progress
task = args.task

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from whisperx.transcribe import *
from whisperx.asr import *
# A positive --threads value is applied to torch and forwarded to the model loader;
# otherwise the loader gets 4 threads and torch is left untouched.
faster_whisper_threads = 4
threads = args.threads
if threads > 0:
    torch.set_num_threads(threads)
    faster_whisper_threads = threads


In [3]:
# Gather the loader settings in one mapping, then load "large-v3-turbo" with them.
# `language=None` leaves the language unset at load time.
vad_settings = {"chunk_size": chunk_size, "vad_onset": vad_onset, "vad_offset": vad_offset}
loader_kwargs = dict(
    device=device,
    device_index=device_index,
    download_root=model_dir,
    compute_type=compute_type,
    language=None,
    asr_options=asr_options,
    vad_method=vad_method,
    vad_options=vad_settings,
    task=task,
    local_files_only=model_cache_only,
    threads=faster_whisper_threads,
)
model = load_model("large-v3-turbo", **loader_kwargs)

No language specified, language will be first be detected for each audio file (increases inference time).
>>Performing voice activity detection using Silero...


Using cache found in /home/ubuntu/.cache/torch/hub/snakers4_silero-vad_master


In [4]:
# Transcribe the example recording end to end with a batch size of 64 and verbose
# output switched off; the result is kept in `out` for inspection below.
out = model.transcribe("examples/english_long_ch1.wav", batch_size=64, verbose=False)

Detecting language for segments: 100%|██████████| 56/56 [00:04<00:00, 12.84it/s]
Processing segments of language: en: 100%|██████████| 218/218 [00:14<00:00, 15.01it/s]


In [5]:
out

{'segments': [{'text': ' Hello, my name is Tony Chiquilla.',
   'start': 0.768,
   'end': 3.136,
   'language': 'en'},
  {'text': ' and', 'start': 3.808, 'end': 4.288, 'language': 'en'},
  {'text': ' I generally prefer.',
   'start': 4.416,
   'end': 5.568,
   'language': 'en'},
  {'text': ' eating at home.', 'start': 5.856, 'end': 6.496, 'language': 'en'},
  {'text': ' Hello Andy.', 'start': 10.144, 'end': 10.88, 'language': 'en'},
  {'text': ' How are you?', 'start': 11.904, 'end': 12.288, 'language': 'en'},
  {'text': ' Good.', 'start': 13.664, 'end': 13.952, 'language': 'en'},
  {'text': " You have any idea what's going on?",
   'start': 14.528,
   'end': 15.712,
   'language': 'en'},
  {'text': ' Yeah.', 'start': 19.136, 'end': 19.488, 'language': 'en'},
  {'text': " What's your preference?",
   'start': 20.256,
   'end': 21.216,
   'language': 'en'},
  {'text': ' Thank you.', 'start': 24.672, 'end': 24.896, 'language': 'en'},
  {'text': " I'm eating at home",
   'start': 31.744,


In [5]:
out

[{'text': ' Hello, my name is Tony Chiquilla.', 'start': 0.768, 'end': 3.136},
 {'text': ' and', 'start': 3.808, 'end': 4.288},
 {'text': ' I generally prefer.', 'start': 4.416, 'end': 5.568},
 {'text': ' eating at home.', 'start': 5.856, 'end': 6.496},
 {'text': ' Hello Andy.', 'start': 10.144, 'end': 10.88},
 {'text': ' How are you?', 'start': 11.904, 'end': 12.288},
 {'text': ' Good.', 'start': 13.664, 'end': 13.952},
 {'text': " You have any idea what's going on?",
  'start': 14.528,
  'end': 15.712},
 {'text': ' Yeah.', 'start': 19.136, 'end': 19.488},
 {'text': " What's your preference?", 'start': 20.256, 'end': 21.216},
 {'text': ' Thank you.', 'start': 24.672, 'end': 24.896},
 {'text': " I'm eating at home", 'start': 31.744, 'end': 32.512},
 {'text': ' Uh,', 'start': 34.976, 'end': 35.072},
 {'text': ' um,', 'start': 35.392, 'end': 35.584},
 {'text': ' No.', 'start': 37.376, 'end': 37.728},
 {'text': ' uh...', 'start': 37.888, 'end': 38.624},
 {'text': ' enjoy eating at home be

In [2]:
# Pick the key whose list is longest; on a tie max() keeps the first key it meets.
data = {"a": [1, 2, 3], "b": [4, 5, 6]}
max_key = max(data, key=lambda name: len(data[name]))
max_key

'a'

In [4]:
# Settings for the step-by-step transcription below. The earlier version of this
# cell also contained no-op self-assignments (chunk_size, print_progress, verbose)
# and set print_progress / verbose twice; each name is now assigned exactly once.
# `chunk_size` keeps the value defined by the argument cell above.
audio = "examples/english_long_ch1.wav"
# audio = "examples/hindi.wav"

batch_size = 4
num_workers = 0
# None lets the tokenizer-setup cell fall back to detection / the tokenizer's language.
language = None
task = "transcribe"
print_progress = True
combined_progress = True
verbose = True

In [5]:
def data(audio, segments):
    """Lazily yield one ``{'inputs': chunk}`` dict per segment.

    Each segment's 'start' and 'end' are multiplied by SAMPLE_RATE and truncated
    to int to obtain the slice bounds into ``audio``. Nothing is sliced until the
    generator is iterated.
    """
    for seg in segments:
        first, last = (int(seg[key] * SAMPLE_RATE) for key in ('start', 'end'))
        yield {'inputs': audio[first:last]}

In [6]:
# A path is replaced by the samples returned from load_audio; anything else is kept.
audio = load_audio(audio) if isinstance(audio, str) else audio

# Choose where the pre-processing and chunk-merging routines come from: the model's
# own VAD object when its type derives from Vad, otherwise the Pyannote class.
vad_routines = model.vad_model if issubclass(type(model.vad_model), Vad) else Pyannote
waveform = vad_routines.preprocess_audio(audio)
merge_chunks = vad_routines.merge_chunks

In [7]:
# Run the VAD model on the waveform, then merge its output into chunks using the
# onset/offset thresholds stored on the model.
raw_vad_output = model.vad_model({"waveform": waveform, "sample_rate": SAMPLE_RATE})
vad_thresholds = model._vad_params
vad_segments = merge_chunks(
    raw_vad_output,
    chunk_size,
    onset=vad_thresholds["vad_onset"],
    offset=vad_thresholds["vad_offset"],
)

In [10]:
from tqdm.auto import tqdm
def _merge_close_segments(segments, max_gap):
    """Group consecutive segments whose gap is at most ``max_gap``.

    A segment joins the current group when its 'start' minus the group's 'end'
    is less than or equal to ``max_gap``; the group's 'end' then becomes that
    segment's 'end'. Otherwise the group is closed and a new one is started.

    Args:
        segments: iterable of dicts with 'start' and 'end'.
        max_gap: largest gap that still merges two neighbours.

    Returns:
        A list of dicts with 'start', 'end' and 'original_segments' (the input
        segments that make up the group, in order). Empty for empty input.
    """
    merged = []
    current = None
    for seg in segments:
        if current is not None and seg['start'] - current['end'] <= max_gap:
            current['end'] = seg['end']
            current['original_segments'].append(seg)
            continue
        if current is not None:
            merged.append(current)
        current = {
            'start': seg['start'],
            'end': seg['end'],
            'original_segments': [seg],
        }
    if current is not None:
        merged.append(current)
    return merged


# Neighbouring VAD segments at most this many seconds apart are language-detected
# together. (The old comment said "1 second" while the threshold was 2.)
MERGE_GAP_SECONDS = 2
X = MERGE_GAP_SECONDS  # original name of the threshold, kept for compatibility

merged_segments = _merge_close_segments(vad_segments, MERGE_GAP_SECONDS)

# Detect the language once per merged group and file every original segment of the
# group under that language.
language_segments = {}
for merged_seg in tqdm(merged_segments):
    f1 = int(merged_seg['start'] * SAMPLE_RATE)
    f2 = int(merged_seg['end'] * SAMPLE_RATE)
    segment_language = model.detect_language(audio[f1:f2])
    language_segments.setdefault(segment_language, []).extend(merged_seg['original_segments'])

100%|██████████| 56/56 [00:04<00:00, 13.27it/s]


In [None]:
# Unfinished draft of the tokenizer check that the following cell completes. The
# body was empty, which made this cell a SyntaxError; it is now an explicit no-op.
if model.tokenizer is None:
    pass

In [25]:
# Resolve `language` and `task`, then (re)build the tokenizer when there is none
# yet or when the existing one was built for a different task or language.
if model.tokenizer is None:
    # No tokenizer: fall back to detection and to the "transcribe" task.
    language = language or model.detect_language(audio)
    task = task or "transcribe"
    needs_new_tokenizer = True
else:
    # Existing tokenizer: unset values default to what it already uses.
    language = language or model.tokenizer.language_code
    task = task or model.tokenizer.task
    needs_new_tokenizer = (
        task != model.tokenizer.task or language != model.tokenizer.language_code
    )

if needs_new_tokenizer:
    model.tokenizer = Tokenizer(
        model.model.hf_tokenizer,
        model.model.model.is_multilingual,
        task=task,
        language=language,
    )

In [None]:
# NOTE(review): `segm` is never assigned anywhere in this notebook, so running this
# cell raises NameError. It looks like an unfinished identifier — complete or remove.
segm

In [27]:
# With numeral suppression on, remember the current suppress list and replace it
# on the model options with the de-duplicated union of it and the numeral/symbol
# tokens found for the tokenizer.
if model.suppress_numerals:
    previous_suppress_tokens = model.options.suppress_tokens
    numeral_symbol_tokens = find_numeral_symbol_tokens(model.tokenizer)
    print("Suppressing numeral and symbol tokens")
    new_suppressed_tokens = list(set(numeral_symbol_tokens + model.options.suppress_tokens))
    model.options = replace(model.options, suppress_tokens=new_suppressed_tokens)

In [41]:
# Number of VAD segments the transcription loop is expected to produce.
total_segments = len(vad_segments)

# Collected transcription results, one entry per segment.
segments: List[SingleSegment] = []

# Fall back to the model's batch size when none is set, then force 64 for this run.
batch_size = batch_size or model._batch_size
batch_size = 64

In [44]:
from tqdm.auto import tqdm
# Feed the per-segment audio through the model and pair each result with the
# timestamps of the VAD segment at the same position.
# tqdm now wraps the model output (not the enumerate object) and is given the
# segment count, so the bar shows a total instead of a bare counter. The dead
# `if False:` print and the unused progress-percentage computation were removed.
transcription_stream = model(
    data(audio, vad_segments),
    batch_size=batch_size,
    num_workers=num_workers,
)
for idx, out in enumerate(tqdm(transcription_stream, total=total_segments)):
    text = out['text']
    # With no real batching the text comes back wrapped; unwrap the single entry.
    if batch_size in [0, 1, None]:
        text = text[0]
    segments.append(
        {
            "text": text,
            "start": round(vad_segments[idx]['start'], 3),
            "end": round(vad_segments[idx]['end'], 3)
        }
    )

218it [00:29,  7.43it/s]


In [1]:
segments

NameError: name 'segments' is not defined

In [46]:
model.__call__??

[31mSignature:[39m model.__call__(inputs, *args, num_workers=[38;5;28;01mNone[39;00m, batch_size=[38;5;28;01mNone[39;00m, **kwargs)
[31mDocstring:[39m Call self as a function.
[31mSource:[39m   
    [38;5;28;01mdef[39;00m __call__(self, inputs, *args, num_workers=[38;5;28;01mNone[39;00m, batch_size=[38;5;28;01mNone[39;00m, **kwargs):
        [38;5;28;01mif[39;00m args:

        [38;5;28;01mif[39;00m num_workers [38;5;28;01mis[39;00m [38;5;28;01mNone[39;00m:
            [38;5;28;01mif[39;00m self._num_workers [38;5;28;01mis[39;00m [38;5;28;01mNone[39;00m:
                num_workers = [32m0[39m
            [38;5;28;01melse[39;00m:
                num_workers = self._num_workers
        [38;5;28;01mif[39;00m batch_size [38;5;28;01mis[39;00m [38;5;28;01mNone[39;00m:
            [38;5;28;01mif[39;00m self._batch_size [38;5;28;01mis[39;00m [38;5;28;01mNone[39;00m:
                batch_size = [32m1[39m
            [38;5;28;01melse[39;00m:
  

In [7]:





# revert the tokenizer if multilingual inference is enabled
if self.preset_language is None:
    self.tokenizer = None

# revert suppressed tokens if suppress_numerals is enabled
if self.suppress_numerals:
    self.options = replace(self.options, suppress_tokens=previous_suppress_tokens)

return {"segments": segments, "language": language}

def detect_language(self, audio: np.ndarray) -> str:
    """Detect the language of ``audio`` from at most its first N_SAMPLES samples.

    The body of this pasted definition had lost its indentation, which made it an
    IndentationError; the statements themselves are unchanged.

    Args:
        audio: array of samples; only ``audio[:N_SAMPLES]`` is used, and shorter
            input is padded up to N_SAMPLES.

    Returns:
        The language token of the top result with its first two and last two
        characters stripped.
    """
    if audio.shape[0] < N_SAMPLES:
        print("Warning: audio is shorter than 30s, language detection may be inaccurate.")
    # Use the model's configured mel-bin count, defaulting to 80 when it is not set.
    model_n_mels = self.model.feat_kwargs.get("feature_size")
    segment = log_mel_spectrogram(audio[: N_SAMPLES],
                                  n_mels=model_n_mels if model_n_mels is not None else 80,
                                  padding=0 if audio.shape[0] >= N_SAMPLES else N_SAMPLES - audio.shape[0])
    encoder_output = self.model.encode(segment)
    results = self.model.model.detect_language(encoder_output)
    # First candidate of the first result is a (token, probability) pair.
    language_token, language_probability = results[0][0]
    language = language_token[2:-2]
    print(f"Detected language: {language} ({language_probability:.2f}) in first 30s of audio...")
    return language

4

In [None]:
verbose

In [None]:
import transformers

In [None]:
import transformers

In [None]:
import transformers

In [None]:
import transformers

In [None]:
import transformers