# Translate Resume CSV using HuggingFace MarianMT
This notebook loads a CSV file of resumes and translates the long English text in each row to Indonesian with HuggingFace's MarianMT model. Because the model is called with a capped input length, each text is first split into sentences, which are translated one at a time and then rejoined.

In [None]:
import pandas as pd
from tqdm import tqdm
from transformers import MarianMTModel, MarianTokenizer
import torch
import re

## Load MarianMT Model and Tokenizer

In [None]:
# MarianMT checkpoint on the HuggingFace Hub (the "en-id" pair).
model_name = "Helsinki-NLP/opus-mt-en-id"

# Use the GPU when PyTorch can see one; fall back to the CPU otherwise.
device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")

tokenizer = MarianTokenizer.from_pretrained(model_name)
# Module.to() moves the weights and returns the same module, so the load and
# the device transfer can be chained.
model = MarianMTModel.from_pretrained(model_name).to(device)

# Last expression of the cell: displays the selected device.
device



device(type='cuda')

## Configuration and Load Data

In [None]:
import os

# Adjust paths if needed
input_path = "/content/resume 621-1241.csv"
output_path = "/content/Translated resume 621-1241.csv"

source_col = "Resume_str"          # column holding the text to translate
target_col = "Resume_str_translated"  # column that receives the translation
start_row = 0                      # first row position the translation loop visits
checkpoint_every = 50              # write the CSV to disk every N rows

# Resume from a previous checkpoint when one exists. Reloading the raw input
# after an interrupted run would discard every translation already saved, and
# the next checkpoint would overwrite them with blanks.
load_path = output_path if os.path.exists(output_path) else input_path
df = pd.read_csv(load_path)

if target_col not in df.columns:
    df[target_col] = ""

# Empty cells read back from a checkpoint CSV arrive as NaN; keeping the column
# as object dtype lets translated strings be assigned into it row by row.
df[target_col] = df[target_col].astype(object)

df[[source_col, target_col]].head()

Unnamed: 0,Resume_str,Resume_str_translated
0,BUSINESS DEVELOPMENT CONSULTANT ...,
1,BUSINESS DEVELOPMENT MANAGER ...,
2,BUSINESS DEVELOPMENT DIRECTOR S...,
3,DIRECTOR BUSINESS DEVELOPMENT ...,
4,SENIOR ASSOCIATE BUSINESS DEVELOPMENT...,


## Define Sentence Splitting and Translation Function

In [None]:
def split_into_sentences(text):
    """Split *text* into sentences.

    Leading and trailing whitespace is removed first. A boundary is any run
    of whitespace that directly follows ``.``, ``!`` or ``?``; the punctuation
    stays attached to the sentence it ends. Text without such a boundary is
    returned as a single-element list.
    """
    trimmed = text.strip()
    boundary = re.compile(r'(?<=[.!?])\s+')
    return boundary.split(trimmed)

def translate_text(text, max_length=512):
    """Translate *text* sentence by sentence with the loaded MarianMT model.

    The text is split with ``split_into_sentences``; each sentence is
    tokenized, run through ``model.generate`` on ``device`` and decoded, and
    the results are joined with single spaces.

    Args:
        text: Source text. Non-string values (for example NaN from an empty
            CSV cell) and blank strings produce ``""``.
        max_length: Token cap passed to both the tokenizer and ``generate``.
            Tokenization uses ``truncation=True``, so a single sentence
            longer than this is cut off and its tail is not translated.

    Returns:
        The translated text. A sentence whose translation raises is kept in
        its original form, prefixed with ``[TRANSLATION ERROR]: ``, so one
        failure does not discard the rest of the row.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    translated_sentences = []

    for sentence in split_into_sentences(text):
        try:
            inputs = tokenizer(
                sentence,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=max_length,
            ).to(device)
            # Inference only: skip gradient tracking.
            with torch.no_grad():
                generated = model.generate(**inputs, max_length=max_length)
            translated_sentences.append(
                tokenizer.decode(generated[0], skip_special_tokens=True)
            )
        except Exception as e:
            # Best-effort per sentence, but report the cause instead of
            # swallowing it so failures (e.g. GPU out-of-memory) are visible.
            print(f"⚠️ Translation failed ({type(e).__name__}: {e}); keeping original sentence.")
            translated_sentences.append(f"[TRANSLATION ERROR]: {sentence}")

    return " ".join(translated_sentences)

## Run Translation with Checkpoints

In [None]:
# Translate every row from start_row onward, skipping rows that already hold
# a non-blank translation, and write the DataFrame to disk periodically.
last_row = len(df) - 1

for row in tqdm(range(start_row, len(df))):
    try:
        existing = df.at[row, target_col]
        # Non-string values (e.g. NaN) and blank strings count as untranslated.
        already_done = isinstance(existing, str) and existing.strip() != ""
        if not already_done:
            df.at[row, target_col] = translate_text(df.at[row, source_col])
    except Exception as err:
        # Keep going: a single bad row must not abort the whole run.
        print(f"❌ Error di baris {row}: {err}")

    # Save every `checkpoint_every` rows and once more after the final row.
    rows_done = row + 1
    if rows_done % checkpoint_every == 0 or row == last_row:
        df.to_csv(output_path, index=False)
        print(f"💾 Checkpoint disimpan di baris {rows_done}")

print(f"\n✅ Translasi selesai. File disimpan di: {output_path}")

  8%|▊         | 50/621 [09:59<1:48:14, 11.37s/it]

💾 Checkpoint disimpan di baris 50


 16%|█▌        | 100/621 [21:04<1:54:12, 13.15s/it]

💾 Checkpoint disimpan di baris 100


 24%|██▍       | 150/621 [32:28<1:39:27, 12.67s/it]

💾 Checkpoint disimpan di baris 150


 32%|███▏      | 200/621 [42:30<1:18:43, 11.22s/it]

💾 Checkpoint disimpan di baris 200


 40%|████      | 250/621 [51:16<1:09:29, 11.24s/it]

💾 Checkpoint disimpan di baris 250


 48%|████▊     | 300/621 [1:00:28<1:00:38, 11.33s/it]

💾 Checkpoint disimpan di baris 300


 56%|█████▋    | 350/621 [1:11:08<1:32:52, 20.56s/it]

💾 Checkpoint disimpan di baris 350


 64%|██████▍   | 400/621 [1:22:20<42:57, 11.66s/it]

💾 Checkpoint disimpan di baris 400


 72%|███████▏  | 450/621 [1:29:10<28:55, 10.15s/it]

💾 Checkpoint disimpan di baris 450


 81%|████████  | 500/621 [1:39:34<26:51, 13.31s/it]

💾 Checkpoint disimpan di baris 500


 89%|████████▊ | 550/621 [1:52:02<21:07, 17.86s/it]

💾 Checkpoint disimpan di baris 550


 97%|█████████▋| 600/621 [2:02:38<04:19, 12.35s/it]

💾 Checkpoint disimpan di baris 600


100%|██████████| 621/621 [2:07:27<00:00, 12.32s/it]

💾 Checkpoint disimpan di baris 621

✅ Translasi selesai. File disimpan di: /content/Translated resume 621-1241.csv



