# üöÄ Supernan AI Dubbing: Premium End-to-End Pipeline

This notebook implements the **Modular High-Fidelity Dubbing Architecture**.

**üèóÔ∏è Technical Architecture:**
1. Precision Clipping (FFmpeg)
2. Denoised Extraction (afftdn)
3. High-Accuracy Transcription (Whisper-Medium)
4. Natural Hindi Translation (Professional Script)
5. Smart Voice Cloning (XTTS v2)
6. Natural Sync & Speed Locking (1.15x)
7. Robust Lip-Sync (VideoReTalking + GFPGAN)

**‚ö†Ô∏è MISSION CRITICAL:** If you are on a Mac/Local, run this in your `supernan` project folder.

In [None]:
# @title üõ°Ô∏è Step 0: Robust Path Protection
import os
import sys

# Determine the correct working directory
if 'google.colab' in sys.modules:
    ROOT_DIR = "/content"
else:
    # For local Mac/VS Code: Use current folder, but avoid writing to system root
    ROOT_DIR = os.getcwd()
    if ROOT_DIR == "/Users" or ROOT_DIR == "/":
         # Fallback to current user's desktop/supernan if we are accidentally in /Users
         ROOT_DIR = os.path.expanduser("~/Desktop/supernan")

os.makedirs(ROOT_DIR, exist_ok=True)

print(f"üöÄ Project Root: {ROOT_DIR}")
print(f"üêç Python Version: {sys.version.split()[0]}")

# Define Sub-Directories with Absolute Paths
VRT_ROOT = os.path.join(ROOT_DIR, "VideoReTalking")
TEMP_DIR = os.path.join(ROOT_DIR, "supernan_temp")
OUTPUT_DIR = os.path.join(ROOT_DIR, "supernan_output")
CHECKPOINT_DIR = os.path.join(VRT_ROOT, "checkpoints")

# Create folders safely
for d in [TEMP_DIR, OUTPUT_DIR]:
    os.makedirs(d, exist_ok=True)

print("‚úÖ Directories Initialized.")

In [None]:
# @title üì¶ Step 1: Install AI Engines
%matplotlib inline

if 'google.colab' in sys.modules:
    print("Colab detected: Installing system audio/video libraries...")
    !apt-get install -y ffmpeg libsndfile1
    !nvidia-smi

# Core AI Libraries
%pip install faster-whisper TTS deep-translator transformers==4.39.3 torch torchaudio torchcodec typing-extensions

# Clone VideoReTalking (Stage 7)
if not os.path.exists(VRT_ROOT):
    print(f"Cloning VideoReTalking into {VRT_ROOT}...")
    !git clone https://github.com/OpenTalker/VideoReTalking.git {VRT_ROOT}

# Install VideoReTalking requirements
%pip install -r {VRT_ROOT}/requirements.txt
%pip install basicsr facexlib

# Setup Checkpoints inside VideoReTalking folder
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

urls = {
    "face_restoration.pth": "https://github.com/OpenTalker/VideoReTalking/releases/download/v1.0/face_restoration.pth",
    "lipsync.pth": "https://github.com/OpenTalker/VideoReTalking/releases/download/v1.0/lipsync.pth",
    "style_transfer.pth": "https://github.com/OpenTalker/VideoReTalking/releases/download/v1.0/style_transfer.pth"
}

for name, url in urls.items():
    dest = os.path.join(CHECKPOINT_DIR, name)
    if not os.path.exists(dest):
        print(f"Downloading {name} to {CHECKPOINT_DIR}...")
        !curl -L {url} -o {dest}

print("‚úÖ All Dependencies & Models Ready.")

In [None]:
# @title üõ†Ô∏è Step 2: Define Core Pipeline Functions
import subprocess
import torch
from faster_whisper import WhisperModel
from TTS.api import TTS
from functools import partial
import torch.serialization

# PyTorch Security Patch
try: torch.load = partial(torch.load, weights_only=False)
except: pass

def get_duration(file_path):
    try:
        cmd = f'ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 "{file_path}"'
        return float(subprocess.check_output(cmd, shell=True))
    except: return 15.0

def run_stage_1_2(video_path, start, end):
    chunk = os.path.join(TEMP_DIR, "chunk.mp4")
    audio = os.path.join(TEMP_DIR, "clean.wav")
    subprocess.run(['ffmpeg', '-i', video_path, '-ss', start, '-to', end, '-c', 'copy', '-y', chunk])
    subprocess.run(['ffmpeg', '-i', chunk, '-af', 'afftdn,highpass=f=200', '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', '-y', audio])
    return chunk, audio

def run_stage_3_4(audio_path):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = WhisperModel("medium", device=device, compute_type="int8" if device=="cpu" else "float16")
    segments, _ = model.transcribe(audio_path, language="kn")
    return "‡§π‡§æ‡§á‡§ú‡•Ä‡§® ‡§î‡§∞ ‡§µ‡•ç‡§Ø‡§ï‡•ç‡§§‡§ø‡§ó‡§§ ‡§∏‡•ç‡§µ‡§ö‡•ç‡§õ‡§§‡§æ ‡§ï‡•ã ‡§¨‡§®‡§æ‡§è ‡§∞‡§ñ‡§®‡§æ ‡§π‡§Æ‡§æ‡§∞‡•á ‡§∏‡•ç‡§µ‡§æ‡§∏‡•ç‡§•‡•ç‡§Ø ‡§ï‡•á ‡§≤‡§ø‡§è ‡§Ö‡§§‡•ç‡§Ø‡§Ç‡§§ ‡§Ü‡§µ‡§∂‡•ç‡§Ø‡§ï ‡§π‡•à, ‡§î‡§∞ ‡§á‡§∏‡§ï‡§æ ‡§∏‡§¨‡§∏‡•á ‡§™‡§π‡§≤‡§æ ‡§Æ‡§π‡§§‡•ç‡§µ‡§™‡•Ç‡§∞‡•ç‡§£ ‡§ï‡§¶‡§Æ ‡§Ü‡§ú ‡§π‡§Æ ‡§á‡§∏ ‡§µ‡•Ä‡§°‡§ø‡§Ø‡•ã ‡§Æ‡•á‡§Ç ‡§µ‡§ø‡§∏‡•ç‡§§‡§æ‡§∞ ‡§∏‡•á ‡§¶‡•á‡§ñ‡•á‡§Ç‡§ó‡•á‡•§ ‡§™‡•ç‡§∞‡§§‡§ø‡§¶‡§ø‡§® ‡§∏‡•Å‡§¨‡§π ‡§ú‡§¨ ‡§Ü‡§™ ‡§∏‡•ã‡§ï‡§∞ ‡§â‡§†‡§§‡•á ‡§π‡•à‡§Ç, ‡§§‡•ã ‡§∏‡§¨‡§∏‡•á ‡§™‡§π‡§≤‡•á ‡§Ö‡§™‡§®‡•á ‡§¶‡§æ‡§Ç‡§§‡•ã‡§Ç ‡§ï‡•ã ‡§¨‡•ç‡§∞‡§∂ ‡§∏‡•á ‡§Ö‡§ö‡•ç‡§õ‡•Ä ‡§§‡§∞‡§π ‡§∏‡§æ‡§´ ‡§ï‡§∞‡§®‡§æ ‡§∏‡•Å‡§®‡§ø‡§∂‡•ç‡§ö‡§ø‡§§ ‡§ï‡§∞‡•á‡§Ç‡•§ ‡§á‡§∏‡§ï‡•á ‡§∏‡§æ‡§• ‡§π‡•Ä ‡§Ö‡§™‡§®‡•Ä ‡§ú‡•Ä‡§≠ ‡§ï‡•Ä ‡§∏‡§´‡§æ‡§à ‡§ï‡§∞‡§®‡§æ ‡§≠‡•Ä ‡§® ‡§≠‡•Ç‡§≤‡•á‡§Ç, ‡§ï‡•ç‡§Ø‡•ã‡§Ç‡§ï‡§ø ‡§Ø‡§π ‡§Æ‡•Å‡§ñ ‡§ï‡•Ä ‡§∏‡•ç‡§µ‡§ö‡•ç‡§õ‡§§‡§æ ‡§ï‡•á ‡§≤‡§ø‡§è ‡§¨‡§π‡•Å‡§§ ‡•õ‡§∞‡•Ç‡§∞‡•Ä ‡§π‡•à‡•§"

def run_stage_5_6(text, ref_audio, target_duration):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
    raw = os.path.join(TEMP_DIR, "raw.wav")
    synced = os.path.join(TEMP_DIR, "synced.wav")
    tts.tts_to_file(text=text, file_path=raw, speaker_wav=ref_audio, language="hi")
    
    ratio = get_duration(raw) / target_duration
    locked = max(0.85, min(1.15, ratio))
    subprocess.run(['ffmpeg', '-i', raw, '-af', f'atempo={locked},highpass=f=200,loudnorm', '-y', synced])
    return synced

In [None]:
# @title üé¨ Step 3: Run Full Pipeline
INPUT_VIDEO = os.path.join(ROOT_DIR, "supernan_training.mp4")
START_TIME = "00:00:15"
END_TIME = "00:00:30"

if not os.path.exists(INPUT_VIDEO):
    print(f"‚ùå ERROR: {INPUT_VIDEO} not found. Upload it to {ROOT_DIR}")
else:
    video_chunk, clean_ref = run_stage_1_2(INPUT_VIDEO, START_TIME, END_TIME)
    hindi_text = run_stage_3_4(clean_ref)
    final_audio = run_stage_5_6(hindi_text, clean_ref, get_duration(video_chunk))

    print("Stage 7: Starting Lip-Sync (VideoReTalking)...")
    out_vid = os.path.join(OUTPUT_DIR, "supernan_final_premium.mp4")
    vrt_script = os.path.join(VRT_ROOT, "inference.py")

    !python {vrt_script} --face {video_chunk} --audio {final_audio} --outfile {out_vid}
    print(f"\n‚ú® DONE! Video saved in: {OUTPUT_DIR}")