<a href="https://colab.research.google.com/github/agungfirdaus717-ux/torentotgd/blob/main/SubTranslatorNLLB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================================
# SRT Translator (Hugging Face Transformers)
# Model: facebook/nllb-200-distilled-600M
# ============================================================
# ✅ Hasil lebih natural, mirip subtitle resmi
# ✅ Tanpa API key
# ⚠️ Butuh GPU Colab untuk performa lebih cepat
# ============================================================

!pip install transformers sentencepiece srt ftfy tqdm

import io, os
import srt
from tqdm import tqdm
from ftfy import fix_text
from google.colab import files
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# =========================
# KONFIGURASI
# =========================
# Hugging Face checkpoint used for translation.
MODEL_NAME = "facebook/nllb-200-distilled-600M"
# Source language code (example: English, Latin script).
SOURCE_LANG = "eng_Latn"
# Target language code (example: Indonesian, Latin script).
TARGET_LANG = "ind_Latn"
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# =========================
# LOAD MODEL & TOKENIZER
# =========================
print("🔄 Download model...")
# Hand the configured source language to the tokenizer so inputs are tagged
# with SOURCE_LANG rather than whatever default the checkpoint ships with.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, src_lang=SOURCE_LANG)
# Load the seq2seq weights and move them to the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE)

# =========================
# UPLOAD FILE
# =========================
print("📂 Upload file .srt...")
uploaded = files.upload()
# The upload mapping may be empty (e.g. the dialog was cancelled); fail with
# a clear message instead of an IndexError on the lookup below.
if not uploaded:
    raise RuntimeError(
        "No file was uploaded; re-run the cell and choose a .srt file."
    )
# Only the first uploaded file is processed.
in_name = next(iter(uploaded))
raw = uploaded[in_name]

# Decode leniently (undecodable bytes are dropped), let ftfy repair the text,
# then parse it into a list of subtitle entries.
text_data = fix_text(raw.decode("utf-8", errors="ignore"))
subs = list(srt.parse(text_data))
print(f"✅ Loaded {len(subs)} subtitle entries.")

# =========================
# TRANSLATION FUNCTION
# =========================
def translate_batch(texts, src_lang, tgt_lang, batch_size=8, max_length=512):
    """Translate a list of strings using the module-level model and tokenizer.

    Args:
        texts: List of strings to translate.
        src_lang: Language code of the input text (e.g. "eng_Latn").
        tgt_lang: Language code of the desired output (e.g. "ind_Latn").
        batch_size: Number of strings sent through the model per step.
        max_length: ``max_length`` passed to ``model.generate``.

    Returns:
        A list with one decoded translation per input string, in input order.

    Raises:
        ValueError: If ``tgt_lang`` is not a token known to the tokenizer.

    Note:
        Sets ``tokenizer.src_lang`` on the shared module-level tokenizer.
    """
    # Without this the caller's source language is ignored and the tokenizer
    # tags every input with whatever source language it was created with.
    tokenizer.src_lang = src_lang

    # Resolve the target-language token once; it is the same for every batch.
    tgt_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)
    if tgt_token_id == tokenizer.unk_token_id:
        # An unknown code maps to the unknown-token id, which would silently
        # produce output in an arbitrary language.
        raise ValueError(f"Unknown target language code: {tgt_lang!r}")

    results = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Menerjemahkan"):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(
            batch, return_tensors="pt", padding=True, truncation=True
        ).to(DEVICE)
        with torch.no_grad():
            # Force the first generated token to be the target-language tag.
            outputs = model.generate(
                **inputs,
                forced_bos_token_id=tgt_token_id,
                max_length=max_length,
            )
        results.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))
    return results

# =========================
# AMBIL TEKS SUBTITLE
# =========================
# Collapse each entry's text onto a single line before translation.
original_texts = [" ".join(cue.content.strip().split("\n")) for cue in subs]

# =========================
# TRANSLATE
# =========================
# Run every subtitle text through the model using the configured languages.
translated_texts = translate_batch(
    original_texts,
    src_lang=SOURCE_LANG,
    tgt_lang=TARGET_LANG,
)

# =========================
# SUSUN ULANG SUBTITLE
# =========================
# Pair each original entry with its translation, keeping index and timing.
# ``proprietary`` holds any extra data found on the original timestamp line;
# copy it across so it is not dropped from the translated file.
new_subs = [
    srt.Subtitle(
        index=orig.index,
        start=orig.start,
        end=orig.end,
        content=trans.strip(),
        proprietary=orig.proprietary,
    )
    for orig, trans in zip(subs, translated_texts)
]

# Serialise the rebuilt entries back into SRT text.
out_text = srt.compose(new_subs)

# =========================
# SIMPAN HASIL
# =========================
# Name the output after the uploaded file, swapping its extension for ".id.srt".
base, ext = os.path.splitext(in_name)
out_name = base + ".id.srt"
with open(out_name, mode="w", encoding="utf-8") as out_file:
    out_file.write(out_text)

# Report the result and hand the file to the browser for download.
print("✅ Selesai →", out_name)
files.download(out_name)
