In [1]:
!pip install transformers pandas tqdm




In [3]:
import pandas as pd
from transformers import MarianTokenizer, MarianMTModel
from tqdm import tqdm

# Pretrained MarianMT checkpoint for Chinese -> English; the model weights and
# the tokenizer are both loaded from the same checkpoint name.
model_name = "Helsinki-NLP/opus-mt-zh-en"
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Read the input CSV (decoded as GB18030) and keep only its first 10 rows.
# Change the path below if the file lives elsewhere.
df = pd.read_csv(
    "/kaggle/input/chinese-text/C_text.csv",
    encoding='gb18030',
).iloc[:10]

# Translation function
def translate(text, max_new_tokens=100):
    """Translate a single string using the module-level ``tokenizer`` and ``model``.

    Args:
        text: Value to translate. Anything that is not a ``str``, and any
            string that is empty or whitespace-only, is not sent to the model.
        max_new_tokens: Upper bound on the number of generated tokens,
            forwarded to ``model.generate``. Defaults to 100.

    Returns:
        The decoded first generated sequence with special tokens removed, or
        ``""`` when ``text`` is not a non-blank string.
    """
    if not isinstance(text, str) or not text.strip():
        return ""
    # Call the tokenizer directly: `prepare_seq2seq_batch` is deprecated and
    # scheduled for removal in Transformers v5. `padding=True` and
    # `truncation=True` mirror the defaults the deprecated helper applied.
    batch = tokenizer([text], return_tensors="pt", padding=True, truncation=True)
    gen = model.generate(**batch, max_new_tokens=max_new_tokens)
    # Only one text was submitted, so the first generated sequence is the result.
    return tokenizer.decode(gen[0], skip_special_tokens=True)

# Register tqdm's pandas integration so `progress_apply` is available, then
# translate the "text" column row by row with a progress bar.
tqdm.pandas()
translated_column = df["text"].progress_apply(translate)
df["translated_english"] = translated_column

# Write the frame, including the new column, to CSV without the index.
output_csv = "translated_chinese_10rows.csv"
df.to_csv(output_csv, index=False)
print("✅ Chinese to English translation done. Saved to translated_chinese_10rows.csv")


`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

100%|██████████| 10/10 [00:04<00:00,  2.09it/s]

✅ Chinese to English translation done. Saved to translated_chinese_10rows.csv



