Model Selector
> turbo, base, small, medium, large, large-v2, large-v3-turbo, large-v3

In [31]:
# Whisper checkpoint to load. The same value is passed to the model loader,
# embedded (slugified) in every job name, and recorded in each job's meta.json.
MODEL_NAME = "base"

Device

In [32]:
# Habilita fallback CPU para ops no soportadas
import os
os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
import re, unicodedata, shutil, datetime, traceback, time
import platform, torch, whisper
import subprocess, json, hashlib
from pathlib import Path

In [33]:
# Report whether PyTorch's MPS backend is available on this machine.
# `is_built` is looked up defensively with getattr; when the attribute is
# missing the fallback lambda makes the second line print None.
print("MPS available:", torch.backends.mps.is_available())
print("MPS built:", getattr(torch.backends.mps, "is_built", lambda: None)())

MPS available: True
MPS built: True


CPU

In [34]:
# DEVICE = "cpu"
# FP16 = False  # en CPU no uses fp16

GPU

In [35]:
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# FP16 = (DEVICE == "cuda")  # en GPU con CUDA sí conviene usar fp16

Apple Silicon

In [36]:
# if torch.backends.mps.is_available():
#     DEVICE = "mps"
#     FP16 = False
# else:
#     DEVICE = "cpu"
#     FP16 = False

Detectar Hardware: CUDA > MPS > CPU

In [37]:
def select_candidates():
    """Return the (device, fp16) pairs to try, in order of preference.

    CUDA (with fp16 enabled) comes first when available, then MPS (fp16
    disabled) when the backend exists and is available, and CPU (fp16
    disabled) is always appended as the last resort.
    """
    accelerators = []
    if torch.cuda.is_available():
        accelerators.append(("cuda", True))
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend and mps_backend.is_available():
        accelerators.append(("mps", False))
    return [*accelerators, ("cpu", False)]

def load_whisper_with_backoff(model_name: str):
    """Load a Whisper model on the first backend from select_candidates() that works.

    Returns a tuple (model, device, fp16) for the backend that loaded.

    A NotImplementedError/RuntimeError whose message mentions MPS, SparseMPS or
    the unsupported sparse-COO operator is logged as a warning and the next
    candidate is tried. Any other NotImplementedError/RuntimeError is re-raised
    immediately. If every candidate failed with an MPS-style error, the last
    such error is raised; if there was nothing to re-raise, a RuntimeError is
    raised instead.
    """
    mps_markers = (
        "MPS",
        "SparseMPS",
        "aten::_sparse_coo_tensor_with_dims_and_tensors",
    )
    pending_error = None
    for dev, fp16 in select_candidates():
        try:
            log_line(f"Cargando Whisper '{model_name}' en '{dev}' fp16={fp16}…")
            return whisper.load_model(model_name, device=dev), dev, fp16
        except (NotImplementedError, RuntimeError) as e:
            reason = str(e)
            if not any(marker in reason for marker in mps_markers):
                # Not a known MPS limitation: let the caller see it.
                raise
            log_line(f"[WARN] Backend {dev} falló ({e.__class__.__name__}: {e}). Probando siguiente opción…")
            pending_error = e
    # Every candidate was skipped: surface the last recorded failure so it is traceable.
    if not pending_error:
        raise RuntimeError("No fue posible cargar el modelo en ningún backend")
    raise pending_error

# Use this call instead of calling whisper.load_model(...) directly.
# NOTE(review): load_whisper_with_backoff() logs through log_line(), which is
# defined in the later "Utils" cell and writes to LOGFILE from the "Config"
# cell. On a fresh kernel those cells have to run before this one, otherwise
# this line raises NameError.
model, DEVICE, FP16 = load_whisper_with_backoff(MODEL_NAME)

# (Optional) log the hardware context that was detected.
try:
    extra = ""
    if DEVICE == "cuda":
        extra = f" | GPU: {torch.cuda.get_device_name(0)} cap={torch.cuda.get_device_capability(0)}"
    log_line(f"[HW] device={DEVICE} fp16={FP16} torch={torch.__version__} os={platform.system()} {platform.release()}{extra}")
except Exception as hw_err:
    # Diagnostics are best-effort and must never stop the pipeline, but the
    # reason is printed instead of being silently discarded.
    print(f"[WARN] No se pudo registrar el contexto de hardware: {hw_err}")

Cargando Whisper 'base' en 'mps' fp16=False…
[WARN] Backend mps falló (NotImplementedError: Could not run 'aten::_sparse_coo_tensor_with_dims_and_tensors' with arguments from the 'SparseMPS' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'aten::_sparse_coo_tensor_with_dims_and_tensors' is only available for these backends: [MPS, Meta, SparseCPU, SparseMeta, BackendSelect, Python, FuncTorchDynamicLayerBackMode, Functionalize, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradHIP, AutogradXLA, AutogradMPS, AutogradIPU, AutogradXPU, AutogradHPU, AutogradVE, AutogradLazy, AutogradMTIA, AutogradMAIA, AutogradPrivateUse1, AutogradPrivateUse2, AutogradPrivateUse3, AutogradMeta, AutogradNestedTensor, Tracer, 

Config

In [38]:

# All pipeline folders live under the directory the notebook is run from.
WORKDIR = Path.cwd()
PENDING = WORKDIR / "pending"        # input files waiting to be transcribed
PROCESSING = WORKDIR / "processing"  # per-job working folders while a job runs
DONE = WORKDIR / "done"              # job folders of successful transcriptions
FAILED = WORKDIR / "failed"          # job folders of failed transcriptions
LOGFILE = WORKDIR / "pipeline.log"   # log_line() appends timestamped lines here
# Audio normalization (WAV 16 kHz mono PCM16)
NORMALIZE_AUDIO = True  # set to False to disable it temporarily
# File extensions (lower-case, with leading dot) that list_audios() accepts.
AUDIO_EXTS = {".wav", ".m4a", ".mp3", ".flac", ".ogg", ".oga", ".ogx", ".opus", ".aac", ".wma", ".caf", ".aiff", ".aif", ".aifc", ".amr", ".alaw", ".ulaw", ".ac3", ".eac3", ".dts", ".mp4", ".m4v", ".mov", ".mkv", ".mka", ".webm", ".weba", ".avi", ".3gp", ".3g2", ".flv", ".ts", ".mp2", ".mp1"}
# Timestamp prefix of job names: "long" = YYYYMMDD-HHMMSS, anything else = MMDDHHMM.
# The full job name is <timestamp>-<model>-<slugified file name>.
NAME_STYLE = "long"
LANG = "es"        # passed to model.transcribe(language=...)
BEAM_SIZE = 8      # passed to model.transcribe(beam_size=...)
TEMPERATURE = 0.0  # passed to model.transcribe(temperature=...)

In [39]:
# INITIAL_PROMPT = (
#     "Transcripción fiel en español de un archivo de audio, cualquiera sea su contexto: conferencia, reunión, clase, entrevista, discurso, enseñanza, narración, conversación o grabación personal. "
#     "Usar ortografía y gramática correctas, con buena puntuación, manteniendo la coherencia y fidelidad al contenido original. "
#     "El estilo debe ser claro, estructurado y sin inventar información. "
#     "Palabras clave frecuentes: reunión, conferencia, clase, enseñanza, discurso, entrevista, conversación, narración, audio, transcripción, documento, claridad, precisión, fidelidad, coherencia, comprensión, lenguaje, contexto."
# )

Prompt IDMJI

In [40]:
# Prompt passed to model.transcribe(initial_prompt=...).
# The fragments are joined by implicit string concatenation, so every fragment
# except the last must end with ". " — otherwise sentences are glued together
# (previously "…en españolUsar…" and "…(IDMJI).El estilo…").
INITIAL_PROMPT = (
    "Transcripción fiel en español. "
    "Usar ortografía y gramática correctas, con buena puntuación, manteniendo la coherencia y fidelidad al mensaje original. "
    "Corresponde a una enseñanza doctrinal de la Iglesia de Dios Ministerial de Jesucristo Internacional (IDMJI). "
    "El estilo debe ser claro, estructurado y sin inventar información. "
    # Disabled fragment, kept verbatim so it can be re-enabled:
    # "Cuando se mencione un versículo bíblico, adicionalmente debe transcribirse con el siguiente formato entre corchetes, indicando libro, capítulo y versículos según la Biblia Reina-Valera 1960. Y se agrega justo despues de donde se menciona el versiculo. Ejemplo: [San Lucas 5:27-28]. "
    "Palabras clave frecuentes: Iglesia, Doctrina, Enseñanza, Profecía, Biblia, Dios, Espíritu Santo."
)

Utils

In [41]:
def nfc(s: str) -> str:
    """Return `s` normalized to Unicode NFC form."""
    composed = unicodedata.normalize("NFC", s)
    return composed
def slugify(filename: str) -> str:
    """Turn a file name into a hyphen-separated, case-folded slug.

    The final extension is dropped, the stem is NFC-normalized, stripped and
    case-folded, then runs of dots/whitespace and runs of any other character
    that is not a word character, hyphen or dot become a single hyphen.
    Leading/trailing hyphens are removed; an empty result becomes "audio".
    """
    stem = unicodedata.normalize("NFC", Path(filename).stem).strip().casefold()
    # Applied in order: separators -> "-", leftovers -> "-", collapse repeats.
    for pattern in (r"[.\s]+", r"[^\w\-.]+", r"-{2,}"):
        stem = re.sub(pattern, "-", stem)
    return stem.strip("-") or "audio"
def ts_long() -> str:
    """Return the current local time formatted as YYYYMMDD-HHMMSS."""
    return f"{datetime.datetime.now():%Y%m%d-%H%M%S}"
def ts_short() -> str:
    """Return the current local time formatted as MMDDHHMM."""
    return f"{datetime.datetime.now():%m%d%H%M}"
def make_job_name(audio_path: Path) -> str:
    """Build the job name '<timestamp>-<model slug>-<audio slug>'.

    The timestamp comes from ts_long() when NAME_STYLE is "long" and from
    ts_short() otherwise; the model slug is derived from MODEL_NAME.
    """
    if NAME_STYLE == "long":
        stamp = ts_long()
    else:
        stamp = ts_short()
    pieces = (stamp, slugify(MODEL_NAME), slugify(audio_path.name))
    return "-".join(pieces)
def list_audios(pending_dir: Path):
    """Return the processable files in `pending_dir`, oldest first.

    A regular file is accepted when its FINAL extension (case-insensitive) is
    in AUDIO_EXTS. Only the last suffix is checked: matching any suffix would
    also queue files such as "talk.mp3.part" or "talk.mp3.txt", which are not
    audio. Every other regular file is reported through log_line() as skipped.
    Directories are ignored silently.

    Returns a list of Path objects sorted by their st_ctime.
    """
    accepted = []
    for entry in pending_dir.iterdir():
        if not entry.is_file():
            continue
        if entry.suffix.lower() in AUDIO_EXTS:
            accepted.append(entry)
        else:
            log_line(f"[SKIP] {entry.name} (Extensión no permitida)")
    return sorted(accepted, key=lambda p: p.stat().st_ctime)
def log_line(msg: str):
    """Append `msg` to LOGFILE with an ISO timestamp prefix and echo it to stdout.

    The log file's parent directory is created if it does not exist yet.
    """
    LOGFILE.parent.mkdir(parents=True, exist_ok=True)
    with LOGFILE.open("a", encoding="utf-8") as log_fh:
        stamp = datetime.datetime.now().isoformat()
        log_fh.write(f"{stamp}  {msg}\n")
    print(msg)

RTF (Real-Time Factor)

In [42]:
def ffprobe_duration(path: str | Path) -> float:
    """Duración en segundos usando ffprobe (preciso y rápido)."""
    out = subprocess.check_output([
        "ffprobe", "-v", "error", "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1", str(path)
    ], text=True).strip()
    try:
        return float(out)
    except:
        return 0.0

def sha1_file(path: Path) -> str:
    """Return the hex SHA-1 digest of the file at `path`, read in 1 MiB chunks."""
    digest = hashlib.sha1()
    with open(path, "rb") as stream:
        while block := stream.read(1 << 20):
            digest.update(block)
    return digest.hexdigest()

Normalización de audio (WAV 16 kHz mono PCM16)

In [43]:
def to_wav_16k(in_path: Path, out_path: Path) -> Path:
    """
    Convert any input (audio or video) to a 16 kHz mono PCM WAV via ffmpeg.

    -vn makes ffmpeg ignore video streams; the aresample filter selects the
    soxr resampler. An existing `out_path` is overwritten (-y).

    Returns `out_path`. Raises subprocess.CalledProcessError when ffmpeg exits
    with a non-zero status.
    """
    audio_opts = [
        "-vn",
        "-ac", "1",
        "-af", "aresample=resampler=soxr:precision=33",
        "-ar", "16000",
        "-c:a", "pcm_s16le",
    ]
    # stdout/stderr are captured so ffmpeg's output does not flood the notebook.
    subprocess.run(
        ["ffmpeg", "-y", "-i", str(in_path), *audio_opts, str(out_path)],
        check=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    return out_path
def prepareAudioforWhisper(src_path: Path, job_dir: Path, enabled: bool) -> Path:
    """
    Choose the file that will be handed to Whisper.

    With enabled=True the source is converted to job_dir/'input.16k.wav' and
    that path is returned; with enabled=False the original path is returned
    untouched.
    """
    if enabled:
        return to_wav_16k(src_path, job_dir / "input.16k.wav")
    return src_path

Preparar carpetas

In [44]:
# Create every pipeline stage folder up front; existing folders are left as they are.
for d in (PENDING, PROCESSING, DONE, FAILED):
    d.mkdir(parents=True, exist_ok=True)


Model

In [45]:
# NOTE(review): `model` is already assigned by load_whisper_with_backoff(MODEL_NAME)
# in the hardware-detection cell; this cell loads the same checkpoint again on
# DEVICE. Presumably a leftover from before the backoff loader existed —
# confirm, and drop this cell if so.
log_line(f"Cargando Whisper '{MODEL_NAME}' en '{DEVICE}' fp16={FP16}…")
model = whisper.load_model(MODEL_NAME, device=DEVICE)

Cargando Whisper 'base' en 'cpu' fp16=False…


Bucle Procesador de Audios

In [46]:
# Batch driver: transcribe every file found in PENDING, one job folder per file.
#
# Per job: copy the original into processing/<job>, optionally normalize it to
# 16 kHz WAV, transcribe, write <job>.txt and meta.json, move the folder to
# DONE and delete the original from PENDING. On failure the folder is moved to
# FAILED with an error.log and the original stays in PENDING.
# A summary report is printed once all files have been attempted.
audios = list_audios(PENDING)
if not audios:
    print("No hay audios en ./pending.")
else:
    total_jobs = 0
    ok_jobs = 0
    failed_jobs = 0
    total_elapsed = datetime.timedelta()
    total_audio_dur = 0.0
    timed_jobs = 0  # successful jobs whose audio duration could be measured (> 0)
    normalized_count = 0
    rtfs = []

    for audio_in in audios:
        # --- create the job's working folder ---
        job_name = make_job_name(audio_in)
        job_dir = PROCESSING / job_name
        job_dir.mkdir(parents=True, exist_ok=True)

        # --- copy the original into processing/<job> ---
        audio_tmp = job_dir / audio_in.name
        shutil.copy2(str(audio_in), str(audio_tmp))

        # --- stopwatch: covers normalization, transcription and output writing ---
        start_wall = datetime.datetime.now()
        start_perf = time.perf_counter()

        # --- pick the source for Whisper (normalize to 16k), falling back to the copy ---
        try:
            src_for_whisper = prepareAudioforWhisper(audio_tmp, job_dir, enabled=NORMALIZE_AUDIO)
            if src_for_whisper != audio_tmp:
                log_line(f"[AUDIO] Normalizado a 16k WAV: {src_for_whisper.name}")
                normalized_count += 1
            else:
                log_line(f"[AUDIO] Sin normalizar (usando original): {audio_tmp.name}")
        except Exception as prep_err:
            src_for_whisper = audio_tmp
            log_line(f"[AUDIO] Preparación falló, uso original: {audio_tmp.name} :: {prep_err}")
        log_line(f"[START] {audio_in.name} -> {job_dir.name} [model={MODEL_NAME}, device={DEVICE}, fp16={FP16}]")

        try:
            # --- transcribe the chosen source ---
            result = model.transcribe(
                str(src_for_whisper),
                language=LANG,
                task="transcribe",
                temperature=TEMPERATURE,
                beam_size=BEAM_SIZE,
                patience=1.0,
                condition_on_previous_text=True,
                initial_prompt=INITIAL_PROMPT if INITIAL_PROMPT.strip() else None,
                fp16=FP16
            )

            # --- clean up the text ---
            text = nfc(result.get("text", "")).strip()
            text = re.sub(r"[ \t]+", " ", text)
            text = re.sub(r"\s+\n", "\n", text).strip() + "\n"

            # --- write the transcript ---
            out_txt = job_dir / f"{job_name}.txt"
            out_txt.write_text(text, encoding="utf-8")

            # --- metrics and metadata ---
            end_wall = datetime.datetime.now()
            elapsed = datetime.timedelta(seconds=(time.perf_counter() - start_perf))

            audio_used_path = Path(src_for_whisper)
            audio_duration = ffprobe_duration(audio_used_path)
            # RTF = processing time / audio duration; undefined without a valid duration.
            rtf = (elapsed.total_seconds() / audio_duration) if audio_duration > 0 else None

            # End time of the last segment, used to estimate how much of the audio was covered.
            last_end = None
            try:
                segs = result.get("segments")
                if isinstance(segs, list) and segs:
                    last_end = float(segs[-1].get("end", 0.0))
            except Exception:
                last_end = None
            coverage_ratio = (last_end / audio_duration) if (last_end and audio_duration) else None

            meta = {
                "job_name": job_name,
                "start_time": start_wall.isoformat(),
                "end_time": end_wall.isoformat(),
                "elapsed_sec": round(elapsed.total_seconds(), 3),
                "audio_duration_sec": round(audio_duration, 3),
                "rtf": round(rtf, 3) if rtf is not None else None,
                "coverage_last_segment_end_sec": round(last_end, 3) if last_end is not None else None,
                "coverage_ratio": round(coverage_ratio, 4) if coverage_ratio is not None else None,

                "model": MODEL_NAME,
                "device": DEVICE,
                "fp16": FP16,
                "language": LANG,
                "beam_size": BEAM_SIZE,
                "temperature": (list(TEMPERATURE) if isinstance(TEMPERATURE, tuple) else TEMPERATURE),
                "initial_prompt_len": len(INITIAL_PROMPT.strip()),

                "normalized_16k": bool(NORMALIZE_AUDIO and audio_used_path.name.endswith("input.16k.wav")),
                "input_original_name": audio_in.name,
                "input_original_sha1": sha1_file(audio_in),
                "input_used_name": audio_used_path.name,
                "input_used_sha1": sha1_file(audio_used_path),
                "output_txt": out_txt.name,

                "chars": len(text),
                "words": len(text.split()),
                "segments": (len(result.get("segments", [])) if isinstance(result.get("segments"), list) else None),

                "whisper_version": getattr(whisper, "__version__", "git"),
                "torch_version": torch.__version__,
                "os": f"{platform.system()} {platform.release()}",
            }

            # --- atomic write of meta.json (write to a temp name, then rename) ---
            tmp_meta = job_dir / "meta.json.tmp"
            tmp_meta.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")
            tmp_meta.replace(job_dir / "meta.json")

            # --- move the job to done and remove the original from pending ---
            final_dir = DONE / job_name
            shutil.move(str(job_dir), str(final_dir))
            audio_in.unlink(missing_ok=False)

            log_line(f"[DONE]  {audio_in.name} -> {final_dir} (Duración: {elapsed}, RTF={meta['rtf']})")

            total_jobs += 1
            ok_jobs += 1
            total_elapsed += elapsed
            # Duration/RTF are only accumulated when the duration is valid (> 0).
            if audio_duration and audio_duration > 0:
                total_audio_dur += audio_duration
                timed_jobs += 1
                if rtf is not None:
                    rtfs.append(rtf)

        except Exception as e:
            failed_jobs += 1
            total_jobs += 1
            tb = traceback.format_exc()
            log_line(f"[FAIL]  {audio_in.name} :: {e}")
            if job_dir.exists():
                (job_dir / "error.log").write_text(tb, encoding="utf-8")
                failed_dir = FAILED / job_name
                failed_dir.parent.mkdir(parents=True, exist_ok=True)
                shutil.move(str(job_dir), str(failed_dir))
            else:
                # The failure happened after the job folder left PROCESSING (e.g. the
                # pending original could not be deleted). Writing error.log there would
                # raise inside this handler and abort the whole batch, so the traceback
                # goes to the pipeline log instead.
                log_line(f"[FAIL]  {audio_in.name} :: el directorio del job ya no está en processing; traceback:\n{tb}")
            continue

    print("\n===== 📊 Informe del pipeline =====")
    print(f"Total procesados: {total_jobs}")
    print(f"  ✅ Exitosos: {ok_jobs}")
    print(f"  ❌ Fallidos: {failed_jobs}")

    if total_jobs > 0:
        print(f"Normalizados: {normalized_count}/{total_jobs} "
            f"({(100.0*normalized_count/total_jobs):.1f}%)")

    if ok_jobs > 0:
        # Audio-duration average uses only successful jobs with a valid duration,
        # so it is divided by that count rather than by all successful jobs.
        if timed_jobs > 0:
            avg_audio = total_audio_dur / timed_jobs
            print(f"Duración promedio de audio: {avg_audio:.1f} s")
        else:
            print("Duración promedio de audio: N/A")

        avg_elapsed = total_elapsed / ok_jobs
        print(f"Tiempo promedio de ejecución: {avg_elapsed}")

        if rtfs:
            avg_rtf = sum(rtfs) / len(rtfs)
            print(f"RTF promedio: {avg_rtf:.3f}")
        else:
            print("RTF promedio: N/A")
    else:
        print("No hubo jobs exitosos.")

    print("===================================")
    print("\nProceso terminado. Revisa ./done para los finalizados y ./pending para los no procesados.")

No hay audios en ./pending.
