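hugging_face_token = <REDACTED: revoke this token and provide it through a Colab secret or the HF_TOKEN environment variable instead of storing it in the notebook>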

In [None]:
# ==============================================================================
# CELL 1: FINAL CLEAN INSTALL
# ==============================================================================
# This cell creates a clean environment by completely removing TensorFlow
# and then installing the necessary libraries.
#
# Lines starting with "!" are IPython shell escapes, so this cell only runs
# inside a notebook kernel (e.g. Colab), not as a plain Python script.
# The last statement kills the kernel process, so nothing placed after it in
# this cell will ever execute.
# ==============================================================================
import os

# --- CRITICAL STEP: Uninstall TensorFlow to prevent CUDA conflicts ---
print("[INFO] Uninstalling TensorFlow to prevent library conflicts...")
!pip uninstall -y -q tensorflow

# --- Install system utilities ---
# The "> /dev/null" redirect applies only to stdout of "apt-get install";
# messages written to stderr (such as apt warnings) are still displayed.
print("[INFO] Installing ffmpeg, git, and libsndfile1...")
!apt-get update -qq && apt-get install -y -qq ffmpeg git libsndfile1 > /dev/null

# --- Install Python packages ---
# whisperX is installed from the repository's default branch (unpinned), so the
# resolved version can change between runs.
print("[INFO] Installing Python packages...")
!pip install -q git+https://github.com/m-bain/whisperX.git
!pip install -q "pyannote.audio>=3.2"

# --- Install a compatible PyTorch version ---
# NOTE(review): this runs AFTER whisperX/pyannote have been installed, and no
# versions are pinned. Pulling torch/torchvision/torchaudio from the cu121
# index at this point may leave the three packages at versions that do not
# match each other or what whisperX expects; a "torchvision::nms does not
# exist" error is the usual symptom of such a mismatch. TODO confirm the
# required versions and pin them explicitly.
print("[INFO] Installing PyTorch for CUDA 12.1...")
!pip install --quiet torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

# --- Install other utilities ---
!pip install -q yt-dlp

# NOTE(review): this message is printed unconditionally; the exit status of
# the shell commands above is never checked.
print("\n[INFO] All dependencies installed successfully.")
print("[IMPORTANT] Restarting runtime to apply changes...")

# Restart the runtime to apply changes
# Sends signal 9 (SIGKILL) to the current kernel process. Presumably the
# notebook host (Colab) then starts a fresh kernel automatically; verify when
# running elsewhere.
os.kill(os.getpid(), 9)

[INFO] Uninstalling TensorFlow to prevent library conflicts...
[0m[INFO] Installing ffmpeg, git, and libsndfile1...
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
[INFO] Installing Python packages...
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[INFO] Installing PyTorch for CUDA 12.1...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m71.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m780.4/780.4 MB[0m [31m794.0 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m105.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m 

In [None]:
# Remove the currently installed whisperx package and reinstall it from the
# default branch of the GitHub repository (IPython shell escapes; notebook only).
# NOTE(review): --force-reinstall reinstalls the package's dependencies as well,
# which may replace a previously installed PyTorch build; confirm this is
# intended before relying on a specific torch/torchvision version.
!pip uninstall -y whisperx
!pip install --upgrade --force-reinstall git+https://github.com/m-bain/whisperX.git

Found existing installation: whisperx 3.7.2
Uninstalling whisperx-3.7.2:
  Successfully uninstalled whisperx-3.7.2
Collecting git+https://github.com/m-bain/whisperX.git
  Cloning https://github.com/m-bain/whisperX.git to /tmp/pip-req-build-964ioaba
  Running command git clone --filter=blob:none --quiet https://github.com/m-bain/whisperX.git /tmp/pip-req-build-964ioaba
  Resolved https://github.com/m-bain/whisperX.git to commit 0fa81b31f136b3574d6e9b2837de003acc9f9321
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting ctranslate2>=4.5.0 (from whisperx==3.7.2)
  Using cached ctranslate2-4.6.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting faster-whisper>=1.1.1 (from whisperx==3.7.2)
  Using cached faster_whisper-1.2.0-py3-none-any.whl.metadata (16 kB)
Collecting nltk>=3.9.1 (from whisperx==3.7.2)
  Using cached nltk-3.9

In [None]:
# ==============================================================================
# CELL 2: Transcribe (faster-whisper, no VAD) -> Align (WhisperX on CPU) -> Diarize (WhisperX on CPU)
# Avoids cuDNN crashes by keeping all PyTorch work on CPU.
# ==============================================================================

import os
import gc
from pathlib import Path

import torch
import yt_dlp
import whisperx
from faster_whisper import WhisperModel
from whisperx.diarize import DiarizationPipeline

# --- Configuration ---
YOUTUBE_URL   = "https://www.youtube.com/watch?v=LJiUDxj-2ZE"
AUDIO_DIR     = Path("/content/audios")
RESULTS_DIR   = Path("/content/resultados")

# Hugging Face access token used by the diarization pipeline.
# SECURITY: never store a literal token in the notebook. Export it as the
# HF_TOKEN environment variable (for example from a Colab secret) before
# running this cell. Any token that was previously hard-coded here has been
# exposed and must be revoked.
HF_TOKEN      = os.environ.get("HF_TOKEN", "").strip()

# Fail fast: diarization cannot run without a token, so there is no point in
# downloading and transcribing first only to abort at the last stage.
if not HF_TOKEN:
    raise ValueError("❌ Hugging Face token missing — please set HF_TOKEN.")

# Whisper model size for faster-whisper
WHISPER_MODEL = "tiny"  # tiny, base, small, medium, large-v3

# --- 1. Download Audio ---
AUDIO_DIR.mkdir(parents=True, exist_ok=True)
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

ytdl_opts = {
    "format": "bestaudio/best",
    "outtmpl": str(AUDIO_DIR / "%(title)s.%(ext)s"),
    "postprocessors": [{
        "key": "FFmpegExtractAudio",
        "preferredcodec": "mp3",
        "preferredquality": "192",
    }],
    "quiet": True,
}

print(f"[INFO] Downloading audio from YouTube URL: {YOUTUBE_URL}...")
with yt_dlp.YoutubeDL(ytdl_opts) as ydl:
    video_info = ydl.extract_info(YOUTUBE_URL, download=True)
    # prepare_filename() yields the name built from "outtmpl" with the
    # container extension of the download; the FFmpegExtractAudio
    # post-processor is configured for mp3, hence the suffix swap.
    audio_path = Path(ydl.prepare_filename(video_info)).with_suffix(".mp3")

if not audio_path.exists():
    raise FileNotFoundError(f"Failed to download or find the audio file at {audio_path}")
print(f"[OK] Audio downloaded successfully: {audio_path.name}")

# --- 2. Devices ---
# Use GPU for faster-whisper (CTranslate2, does not need cuDNN).
device_fw = "cuda" if torch.cuda.is_available() else "cpu"
compute_type_fw = "float16" if device_fw == "cuda" else "int8"

# Force WhisperX (alignment + diarization) to use the CPU. Every WhisperX call
# below must receive this same device: loading a model on one device and
# feeding it tensors from another fails at inference time.
device_whisperx = "cpu"

gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# --- 3. Transcribe (faster-whisper, VAD disabled) ---
print(f"[INFO] 1/4: Transcribing audio with faster-whisper on {device_fw} (no VAD)...")
fw_model = WhisperModel(WHISPER_MODEL, device=device_fw, compute_type=compute_type_fw)

segments_iter, fw_info = fw_model.transcribe(
    str(audio_path),
    vad_filter=False,         # ensure no VAD/filter is used
    beam_size=5,
    temperature=0.0,
    word_timestamps=False     # WhisperX will handle alignment/word timings
)

# transcribe() returns a lazy iterator; consuming it here is what actually
# runs the model, so it must happen before fw_model is deleted.
fw_segments = [
    {
        "start": float(seg.start),
        "end": float(seg.end),
        "text": (seg.text or "").strip(),
    }
    for seg in segments_iter
]

result = {
    "segments": fw_segments,
    "language": fw_info.language or "en"
}
print("[OK] Transcription complete.")

# Release the transcription model before the next stage allocates memory.
del fw_model
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# --- 4. Align with WhisperX (CPU) ---
print("[INFO] 2/4: Aligning transcription with WhisperX (CPU)...")
audio = whisperx.load_audio(str(audio_path))
language_code = result.get("language", None)
model_a, metadata = whisperx.load_align_model(language_code=language_code, device=device_whisperx)

result = whisperx.align(
    result["segments"],
    model_a,
    metadata,
    audio,
    device_whisperx,
    return_char_alignments=False
)
print("[OK] Alignment complete.")

del model_a
gc.collect()

# --- 5. Diarize with WhisperX (CPU) ---
print("[INFO] 3/4: Running diarization on CPU...")
diarize_model = DiarizationPipeline(use_auth_token=HF_TOKEN, device=device_whisperx)
diarize_segments = diarize_model(audio)
result = whisperx.assign_word_speakers(diarize_segments, result)
print("[OK] Diarization complete.")

del diarize_model
gc.collect()

# --- 6. Write outputs ---
print("[INFO] 4/4: Writing output files...")
output_txt = RESULTS_DIR / f"{audio_path.stem}.txt"
output_srt = RESULTS_DIR / f"{audio_path.stem}.srt"

def format_time(seconds: float) -> str:
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is first rounded to a whole number of milliseconds and only
    then split into fields. Splitting the float directly loses a millisecond
    to binary rounding (``1.001`` would render as ``00:00:01,000``).

    Args:
        seconds: Offset from the start of the audio, in seconds. Negative
            values are clamped to zero because SRT has no negative timestamps.

    Returns:
        The timestamp with zero-padded hours, minutes, seconds and a
        three-digit millisecond field separated by a comma.
    """
    total_ms = max(0, round(seconds * 1000))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, s = divmod(total_seconds, 60)
    h, m = divmod(total_minutes, 60)
    return f"{h:02}:{m:02}:{s:02},{ms:03}"

# Emit every segment twice: as one bracketed line in the plain-text transcript
# and as a numbered cue (index, time range, text, blank line) in the SRT file.
with open(output_txt, "w", encoding="utf-8") as txt_out, \
        open(output_srt, "w", encoding="utf-8") as srt_out:
    for cue_number, seg in enumerate(result.get("segments", []), start=1):
        # Missing or None timings fall back to 0.0.
        begin = float(seg.get("start", 0.0) or 0.0)
        finish = float(seg.get("end", 0.0) or 0.0)
        who = seg.get("speaker", "UNKNOWN")
        spoken = (seg.get("text") or "").strip()

        time_range = f"{format_time(begin)} --> {format_time(finish)}"
        labelled_text = f"[{who}] {spoken}"

        # Plain-text transcript line
        txt_out.write(f"[{time_range}] {labelled_text}\n")

        # SRT cue
        srt_out.write(f"{cue_number}\n{time_range}\n{labelled_text}\n\n")

print(f"[OK] Output files saved to {RESULTS_DIR}")

[INFO] Downloading audio from YouTube URL: https://www.youtube.com/watch?v=LJiUDxj-2ZE...
[OK] Audio downloaded successfully: Así es VIVIR y ESCAPAR de una CÁRCEL en PERÚ.mp3
[INFO] 1/4: Transcribing audio with faster-whisper on cuda (no VAD)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

vocabulary.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.bin:   0%|          | 0.00/75.5M [00:00<?, ?B/s]

[OK] Transcription complete.
[INFO] 2/4: Aligning transcription with WhisperX (CPU)...
Downloading: "https://download.pytorch.org/torchaudio/models/wav2vec2_voxpopuli_base_10k_asr_es.pt" to /root/.cache/torch/hub/checkpoints/wav2vec2_voxpopuli_base_10k_asr_es.pt


100%|██████████| 360M/360M [00:01<00:00, 350MB/s]


[OK] Alignment complete.


NameError: name 'DiarizatonPipeline' is not defined

In [None]:
# ==============================================================================
# GPU VERSION: Transcribe (faster-whisper, GPU) -> Align (WhisperX on GPU) -> Diarize (WhisperX on GPU)
# Requires an environment with a GPU and a working cuDNN (Colab with a T4/A100 is recommended).
# ==============================================================================

import os
import gc
from pathlib import Path

import torch
import yt_dlp
import whisperx
from faster_whisper import WhisperModel
from whisperx.diarize import DiarizationPipeline  # Asegúrate de este import

# --- Configuration ---
YOUTUBE_URL   = "https://www.youtube.com/watch?v=LJiUDxj-2ZE"
AUDIO_DIR     = Path("/content/audios")
RESULTS_DIR   = Path("/content/resultados")

# Hugging Face access token used by the diarization pipeline.
# SECURITY: never store a literal token in the notebook. Export it as the
# HF_TOKEN environment variable (for example from a Colab secret) before
# running this cell. Any token that was previously hard-coded here has been
# exposed and must be revoked.
HF_TOKEN      = os.environ.get("HF_TOKEN", "").strip()

# Fail fast: diarization cannot run without a token, so there is no point in
# downloading and transcribing first only to abort at the last stage.
if not HF_TOKEN:
    raise ValueError("❌ Hugging Face token missing — please set HF_TOKEN.")

# Whisper model size for faster-whisper
WHISPER_MODEL = "tiny"  # tiny, base, small, medium, large-v3

# --- 1. Download Audio ---
AUDIO_DIR.mkdir(parents=True, exist_ok=True)
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

ytdl_opts = {
    "format": "bestaudio/best",
    "outtmpl": str(AUDIO_DIR / "%(title)s.%(ext)s"),
    "postprocessors": [{
        "key": "FFmpegExtractAudio",
        "preferredcodec": "mp3",
        "preferredquality": "192",
    }],
    "quiet": True,
}

print(f"[INFO] Downloading audio from YouTube URL: {YOUTUBE_URL}...")
with yt_dlp.YoutubeDL(ytdl_opts) as ydl:
    video_info = ydl.extract_info(YOUTUBE_URL, download=True)
    # prepare_filename() yields the name built from "outtmpl" with the
    # container extension of the download; the FFmpegExtractAudio
    # post-processor is configured for mp3, hence the suffix swap.
    audio_path = Path(ydl.prepare_filename(video_info)).with_suffix(".mp3")

if not audio_path.exists():
    raise FileNotFoundError(f"Failed to download or find the audio file at {audio_path}")
print(f"[OK] Audio downloaded successfully: {audio_path.name}")

# --- 2. Devices ---
cuda_available = torch.cuda.is_available()
device_fw = "cuda" if cuda_available else "cpu"
compute_type_fw = "float16" if device_fw == "cuda" else "int8"

# WhisperX (alignment + diarization) runs on the GPU whenever one is present.
# It follows the same availability check as faster-whisper so the cell does
# not request "cuda" on a host where CUDA cannot be used.
device_whisperx = device_fw
if not cuda_available:
    print("[WARN] CUDA is not available; running every stage on CPU instead.")

gc.collect()
if cuda_available:
    torch.cuda.empty_cache()

# --- 3. Transcribe (faster-whisper, GPU) ---
print(f"[INFO] 1/4: Transcribing audio with faster-whisper on {device_fw}...")
fw_model = WhisperModel(WHISPER_MODEL, device=device_fw, compute_type=compute_type_fw)

segments_iter, fw_info = fw_model.transcribe(
    str(audio_path),
    vad_filter=False,
    beam_size=5,
    temperature=0.0,
    word_timestamps=False     # word timings come from the WhisperX alignment step
)

# transcribe() returns a lazy iterator; consuming it here is what actually
# runs the model, so it must happen before fw_model is deleted.
fw_segments = [
    {
        "start": float(seg.start),
        "end": float(seg.end),
        "text": (seg.text or "").strip(),
    }
    for seg in segments_iter
]

result = {
    "segments": fw_segments,
    "language": fw_info.language or "en"
}
print("[OK] Transcription complete.")

# Release the transcription model and hand cached GPU memory back before the
# alignment model is loaded onto the same device.
del fw_model
gc.collect()
if cuda_available:
    torch.cuda.empty_cache()

# --- 4. Align with WhisperX (GPU) ---
print("[INFO] 2/4: Aligning transcription with WhisperX on GPU...")
audio = whisperx.load_audio(str(audio_path))
language_code = result.get("language", None)
model_a, metadata = whisperx.load_align_model(language_code=language_code, device=device_whisperx)

result = whisperx.align(
    result["segments"],
    model_a,
    metadata,
    audio,
    device_whisperx,
    return_char_alignments=False
)
print("[OK] Alignment complete.")

del model_a
gc.collect()
if cuda_available:
    torch.cuda.empty_cache()

# --- 5. Diarize with WhisperX (GPU) ---
print("[INFO] 3/4: Running diarization on GPU...")
diarize_model = DiarizationPipeline(use_auth_token=HF_TOKEN, device=device_whisperx)
diarize_segments = diarize_model(audio)
result = whisperx.assign_word_speakers(diarize_segments, result)
print("[OK] Diarization complete.")

del diarize_model
gc.collect()
if cuda_available:
    torch.cuda.empty_cache()

# --- 6. Write outputs ---
print("[INFO] 4/4: Writing output files...")
output_txt = RESULTS_DIR / f"{audio_path.stem}.txt"
output_srt = RESULTS_DIR / f"{audio_path.stem}.srt"

def format_time(seconds: float) -> str:
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is first rounded to a whole number of milliseconds and only
    then split into fields. Splitting the float directly loses a millisecond
    to binary rounding (``1.001`` would render as ``00:00:01,000``).

    Args:
        seconds: Offset from the start of the audio, in seconds. Negative
            values are clamped to zero because SRT has no negative timestamps.

    Returns:
        The timestamp with zero-padded hours, minutes, seconds and a
        three-digit millisecond field separated by a comma.
    """
    total_ms = max(0, round(seconds * 1000))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, s = divmod(total_seconds, 60)
    h, m = divmod(total_minutes, 60)
    return f"{h:02}:{m:02}:{s:02},{ms:03}"

# Emit every segment twice: as one bracketed line in the plain-text transcript
# and as a numbered cue (index, time range, text, blank line) in the SRT file.
with open(output_txt, "w", encoding="utf-8") as txt_out, \
        open(output_srt, "w", encoding="utf-8") as srt_out:
    for cue_number, seg in enumerate(result.get("segments", []), start=1):
        # Missing or None timings fall back to 0.0.
        begin = float(seg.get("start", 0.0) or 0.0)
        finish = float(seg.get("end", 0.0) or 0.0)
        who = seg.get("speaker", "UNKNOWN")
        spoken = (seg.get("text") or "").strip()

        time_range = f"{format_time(begin)} --> {format_time(finish)}"
        labelled_text = f"[{who}] {spoken}"

        # Plain-text transcript line
        txt_out.write(f"[{time_range}] {labelled_text}\n")

        # SRT cue
        srt_out.write(f"{cue_number}\n{time_range}\n{labelled_text}\n\n")

print(f"[OK] Output files saved to {RESULTS_DIR}")


RuntimeError: operator torchvision::nms does not exist