#Step 1: HuggingFace Setup and Model Loading

In [6]:
%pip install huggingface_hub
%pip install -U bitsandbytes
%pip install protobuf==3.20.3



In [7]:
from transformers import pipeline
pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en")

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, StoppingCriteria, StoppingCriteriaList
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-zh-en", dtype="auto", device_map="auto")

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


#Step 2: Load the Dataset

Example output genrated by the model before fine tuning.

In [8]:
prefix = "画蛇添足"
inputs = tokenizer(prefix, return_tensors="pt")
inputs = {k: v.to(model.device) for k, v in inputs.items()}

outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    num_return_sequences=1,
    do_sample=True,
    top_p=0.9,
    temperature=0.8,
    pad_token_id=tokenizer.eos_token_id
)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

Draw snakes and fill them up.


In [12]:
!pip install transformers datasets
import pandas as pd

# Change to your paths
train_df = pd.read_csv("/content/chineseproverbs/idiomkb_train.csv")
val_df = pd.read_csv("/content/chineseproverbs/idiomkb_val.csv")
test_df = pd.read_csv("/content/chineseproverbs/idiomkb_test.csv")

#print(train_df.head())

  chinese                                            english source  \
0    无所不晓   know everything, be knowledgeable in all aspects   JSON   
1    琼枝玉叶      beautiful and valuable decoration or ornament   JSON   
2    先斩后奏  taking impulsive or hasty actions without seek...   JSON   
3    南来北往  travelling back and forth between the north an...   JSON   
4    言听计从             obey someone's advice without question   JSON   

                                           refs_list  
0  ['know everything, be knowledgeable in all asp...  
1  ['beautiful and valuable decoration or ornament']  
2  ['taking impulsive or hasty actions without se...  
3  ['travelling back and forth between the north ...  
4         ["obey someone's advice without question"]  


In [13]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df[['chinese', 'refs_list']])
val_dataset = Dataset.from_pandas(val_df[['chinese', 'refs_list']])
test_dataset = Dataset.from_pandas(test_df[['chinese', 'refs_list']])

{'chinese': '无所不晓', 'refs_list': "['know everything, be knowledgeable in all aspects']"}
{'chinese': '琼枝玉叶', 'refs_list': "['beautiful and valuable decoration or ornament']"}
{'chinese': '先斩后奏', 'refs_list': "['taking impulsive or hasty actions without seeking approval or permission first']"}


#Step 3: Supervised Fine Tuning

In [31]:
%pip install transformers datasets sacrebleu
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

model_name = "Helsinki-NLP/opus-mt-zh-en"
#tokenizer = AutoTokenizer.from_pretrained(model_name)
#model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")



In [32]:
MAX_LEN = 256

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
if tokenizer.bos_token is None:
    tokenizer.bos_token = tokenizer.eos_token
if tokenizer.eos_token is None:
    tokenizer.eos_token = tokenizer.eos_token

def tokenize_translation_batch(batch):
    inputs = batch["chinese"]
    targets = batch["refs_list"]

    model_inputs = tokenizer(
        inputs,
        max_length=MAX_LEN,
        truncation=True,
        padding="max_length"
    )

    with tokenizer.as_target_tokenizer():
        model_targets = tokenizer(
            targets,
            max_length=MAX_LEN,
            truncation=True,
            padding="max_length"
        )

    model_inputs["labels"] = model_targets["input_ids"]
    return model_inputs

train_tok = train_dataset.map(
    tokenize_translation_batch,
    batched=True,
    batch_size=1024
    )
val_tok = val_dataset.map(
    tokenize_translation_batch,
    batched=True,
    batch_size=1024
    )
test_tok = test_dataset.map(
    tokenize_translation_batch,
    batched=True,
    batch_size=1024
    )

data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding=True,
    return_tensors="pt"
)

Map:   0%|          | 0/6904 [00:00<?, ? examples/s]



Map:   0%|          | 0/864 [00:00<?, ? examples/s]

Map:   0%|          | 0/864 [00:00<?, ? examples/s]

In [36]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./opus_mt_zh_en_finetuned",
    num_train_epochs=1,   #try larger
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-4,   #try
    predict_with_generate=True,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    eval_strategy="steps",
    eval_steps=25,
    save_steps=250,
    save_total_limit=2,
    logging_steps=50,
    optim="adamw_torch",
    fp16=True,
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    tokenizer=tokenizer,
    data_collator=data_collator
)

  trainer = Seq2SeqTrainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


In [37]:
trainer.train()
trainer.save_model()
#Save the finetuned model
model.push_to_hub("username/opus_mt_zh_en_finetuned")
tokenizer.push_to_hub("username/opus_mt_zh_en_finetuned")

Step,Training Loss,Validation Loss
25,No log,0.181716
50,0.126200,0.192055
75,0.126200,0.193129
100,0.092900,0.193414
125,0.092900,0.18667
150,0.196400,0.187896
175,0.196400,0.18312
200,0.187300,0.180905
225,0.187300,0.182021
250,0.179700,0.176454




#Step 4: Evaluation

In [38]:
import sacrebleu

#For faster testing
#test_tok = test_tok.select(range(200))
preds = trainer.predict(test_tok)
decoded_preds = tokenizer.batch_decode(preds.predictions, skip_special_tokens=True)
references = [[r] for r in test_df["refs_list"].tolist()]

bleu = sacrebleu.corpus_bleu(decoded_preds, references)
print("BLEU score:", bleu.score)

BLEU score: 16.51582159006904
