In [None]:
!pip install transformers datasets evaluate sacrebleu
!pip install accelerate -U

In [None]:
from huggingface_hub import login
from google.colab import userdata
from datasets import load_dataset

# Number of leading training pairs kept for each language.
DATA_CUTOFF = 100000

login(userdata.get('huggingface'))

# https://huggingface.co/datasets/Helsinki-NLP/opus-100
def _load_opus_pair(config):
  """Load one opus-100 config once and return (train subset, full test split).

  Args:
    config: opus-100 configuration name, e.g. "en-es".

  Returns:
    Tuple of the first DATA_CUTOFF rows of the "train" split and the whole
    "test" split.
  """
  splits = load_dataset("Helsinki-NLP/opus-100", config)
  # To subsample randomly instead of taking the first rows, use
  # splits["train"].shuffle(seed=42).select(range(DATA_CUTOFF)).
  return splits["train"].select(range(DATA_CUTOFF)), splits["test"]

train_es, test_es = _load_opus_pair("en-es")

train_ja, test_ja = _load_opus_pair("en-ja")

train_zh, test_zh = _load_opus_pair("en-zh")


In [None]:
# followed this tutorial: https://huggingface.co/docs/transformers/tasks/translation
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoTokenizer

# Checkpoint id shared by the tokenizer and the seq2seq model loaded below.
check = "google/mt5-small"
# Alternative checkpoint, currently disabled:
# check = "google-t5/t5-small"

# Load tokenizer and model for `check`; the fine-tuning cells later reassign
# `check`, `tokenizer` and `model`.
tokenizer = AutoTokenizer.from_pretrained(check)
model = AutoModelForSeq2SeqLM.from_pretrained(check)



tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# import fine tuning stuff
import numpy as np
import torch
import torch.optim as optim
import random
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



In [None]:
from transformers import DataCollatorForSeq2Seq

# prepare data
# Maps the dataset's language code to the language name used in the prompt prefix.
lang_dict = {"es": "Spanish", "zh": "Chinese", "ja": "Japanese"}

# Default source language for preprocess() when no explicit code is passed.
lang = 'es'
def preprocess(examples, source_lang=None):
  """Tokenize a batch of translation pairs as "<source> -> English" examples.

  Args:
    examples: batch with a "translation" column whose items are dicts keyed
      by language code (the source code and 'en').
    source_lang: key of `lang_dict` naming the source language. When None,
      the module-level `lang` is used, which keeps the old call style working.

  Returns:
    The tokenizer output for the prefixed source texts, with the English
    texts passed as `text_target`; both are truncated to 256 tokens.
  """
  code = lang if source_lang is None else source_lang
  prefix = f"translate {lang_dict[code]} to English: "

  inputs = [prefix + pair[code] for pair in examples["translation"]]
  targets = [pair['en'] for pair in examples["translation"]]

  return tokenizer(inputs, text_target=targets, max_length=256, truncation=True)

# The language is passed explicitly through fn_kwargs, so the result of each
# map() call no longer depends on the current value of the global `lang`.

# Spanish
es_train_token = train_es.map(preprocess, batched=True, fn_kwargs={"source_lang": "es"})
es_test_token = test_es.map(preprocess, batched=True, fn_kwargs={"source_lang": "es"})
# NOTE(review): `model` receives the checkpoint name (a str), not the model object — confirm this is intended.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=check)

# Japanese
ja_train_token = train_ja.map(preprocess, batched=True, fn_kwargs={"source_lang": "ja"})
ja_test_token = test_ja.map(preprocess, batched=True, fn_kwargs={"source_lang": "ja"})

# Chinese
zh_train_token = train_zh.map(preprocess, batched=True, fn_kwargs={"source_lang": "zh"})
zh_test_token = test_zh.map(preprocess, batched=True, fn_kwargs={"source_lang": "zh"})


Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
import evaluate

# SacreBLEU metric object; compute_metrics() reads it through this global.
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        ``(stripped_preds, wrapped_refs)`` where ``stripped_preds`` is a list
        of stripped strings and ``wrapped_refs`` is a list of one-element
        lists, each holding one stripped reference.
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    # Each reference is wrapped in its own list (one reference per prediction).
    wrapped_refs = []
    for text in labels:
        wrapped_refs.append([text.strip()])

    return stripped_preds, wrapped_refs


def compute_metrics(eval_preds):
    """Score generated translations with the global SacreBLEU `metric`.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            itself be a tuple, in which case its first element is used.

    Returns:
        dict with ``"bleu"`` (the metric's ``"score"``) and ``"gen_len"``
        (mean count of non-padding tokens per prediction), each rounded to
        4 decimal places.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding sentinel, not a vocabulary id, so it cannot be decoded.
    # Replace it in the predictions as well as in the labels.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generation length counts only tokens that are not padding.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

def finetune(epochs, name, lang='es', batch=4):
  """Fine-tune the global `model` on one language pair and push it to the Hub.

  Reads the module-level `model`, `tokenizer`, `collator`, `compute_metrics`
  and the tokenized datasets built earlier in the notebook.

  Args:
    epochs: number of training epochs (passed as `num_train_epochs`).
    name: output directory for the run (passed as `output_dir`).
    lang: 'es', 'ja' or 'zh'; selects the tokenized train/test datasets.
    batch: per-device batch size used for both training and evaluation.

  Raises:
    ValueError: if `lang` is not one of the supported codes.
  """
  # Fail loudly on an unknown code instead of silently training on the
  # wrong language's data.
  if lang == 'es':
    tokenized_train, tokenized_test = es_train_token, es_test_token
  elif lang == 'ja':
    tokenized_train, tokenized_test = ja_train_token, ja_test_token
  elif lang == 'zh':
    tokenized_train, tokenized_test = zh_train_token, zh_test_token
  else:
    raise ValueError(f"unsupported lang {lang!r}; expected 'es', 'ja' or 'zh'")

  training_args = Seq2SeqTrainingArguments(
    output_dir=name,
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch,
    per_device_eval_batch_size=batch,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=epochs,
    predict_with_generate=True,  # evaluate on generated text so BLEU can be computed
    push_to_hub=True,
  )

  trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
  )

  trainer.train()
  trainer.push_to_hub()

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [None]:
# finetuning section
# Epochs for the Spanish run; the call below previously hard-coded 1 while
# this constant went unused, so the constant now holds the value actually run.
EPOCHS = 1

# Reload tokenizer and model from the `lseely916/CMSC_473_mt5` checkpoint
# instead of the base model.
check="lseely916/CMSC_473_mt5"
tokenizer = AutoTokenizer.from_pretrained(check)
model = AutoModelForSeq2SeqLM.from_pretrained(check)

# Rebuild the collator that finetune() reads from the global `collator`.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=check)

print("tuning for spanish...")
finetune(EPOCHS, "CMSC_473_mt5", "es")

config.json:   0%|          | 0.00/802 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

tuning for spanish...


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,2.1116,1.927763,16.1315,13.4965


'(MaxRetryError("HTTPSConnectionPool(host='hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com', port=443): Max retries exceeded with url: /repos/d0/6a/d06aea3b67ce9bf30e0778c5ae078bee1265c14247e55531e8358cbe48edba2a/970177d7f292d1cdb47029b089629e35cae1434deb847c8f0a4f41e9d5abeb83?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQFN2FTF47%2F20240510%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240510T005739Z&X-Amz-Expires=86400&X-Amz-Signature=f973340182ec398b31ab415cb3db21978103a9eb3d791aa059e636e25d286954&X-Amz-SignedHeaders=host&partNumber=1&uploadId=ZCQ2k90SBXFFGyh4kGgmhiLPva8XmQWHK10JyZiwHOr4pHKBiTk9zWQw8xKyuSSgb8EUJOwMAIBisLFNsHymovjqB_Ij4IE6ACktTsypxAEYdhiORU8NiFCe1zcfWQzf&x-id=UploadPart (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:2426)')))"), '(Request ID: 7a371a9e-5d04-41cf-9ffd-5de486a13294)')' thrown while requesting PUT https://hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com/repos/d0/6a/d

events.out.tfevents.1715296908.5e969d5ca539.1905.0:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

In [None]:
# Epochs for the Japanese run.
EPOCHS = 2

# Reload tokenizer and model from the `lseely916/CMSC_473_mt5` checkpoint
# instead of the base model.
check="lseely916/CMSC_473_mt5"
tokenizer = AutoTokenizer.from_pretrained(check)
model = AutoModelForSeq2SeqLM.from_pretrained(check)

# Rebuild the collator that finetune() reads from the global `collator`.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=check)

print("tuning for japanese...")
finetune(EPOCHS, "CMSC_473_mt5", "ja")



model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

tuning for japanese...


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,2.8569,2.688671,6.3014,11.609
2,2.6682,2.644463,6.9704,11.638




events.out.tfevents.1715368926.78e9b3cd84ce.720.1:   0%|          | 0.00/27.5k [00:00<?, ?B/s]

In [None]:
# Epochs for the Chinese run.
EPOCHS = 2
check="lseely916/CMSC_473_mt5"

# Reload tokenizer and model from the `lseely916/CMSC_473_mt5` checkpoint
# instead of the base model.
tokenizer = AutoTokenizer.from_pretrained(check)
model = AutoModelForSeq2SeqLM.from_pretrained(check)

# Rebuild the collator for the freshly loaded tokenizer, as the Spanish and
# Japanese cells do; otherwise finetune() would use whichever collator an
# earlier cell left in the global `collator`.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=check)

print("tuning for chinese...")
finetune(EPOCHS, "CMSC_473_mt5", 'zh')

tokenizer_config.json:   0%|          | 0.00/861 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/416 [00:00<?, ?B/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


config.json:   0%|          | 0.00/808 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tuning for chinese...


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,2.4962,2.007097,7.3228,16.2395
2,2.3434,1.91368,7.8609,16.173


'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 1b7a14f3-799a-4943-bf08-d48011ba26d3)')' thrown while requesting PUT https://hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com/repos/3a/b1/3ab1aea760a3cec60e063fca35f50f46d840b39a1bd342a100afbd36817d1ada/32b80f856f99dbc95ff715937b377eaafe02a82c33172b4561e70ccd917a0be3?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQFN2FTF47%2F20240511%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240511T062254Z&X-Amz-Expires=86400&X-Amz-Signature=ff1228526fb9a844ca9c9dc6ec2e50821f9cbd06be25e90f70da83e4898e3355&X-Amz-SignedHeaders=host&partNumber=1&uploadId=8._P6_87YuJPtyRQxAT5wLBRoBvGgxTw.HmiqwPME0hcmiqvBtltRm0WmPzsa7_igdg.8fRBOVgMTcgS7xm189gZx.wJxpXmTKrgg_vph7mRs8H1vXdYkm10B6zzmR7T&x-id=UploadPart
Retrying in 1s [Retry 1/5].
'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')),

events.out.tfevents.1715406134.56c5842cf9f4.173.0:   0%|          | 0.00/27.5k [00:00<?, ?B/s]

In [None]:
# from transformers import T5Tokenizer, T5ForConditionalGeneration
# followed this tutorial: https://huggingface.co/docs/transformers/tasks/translation
from transformers import pipeline

# The pipeline task name has the form `translation_xx_to_yy`, where `xx` is the language of the input and `yy` is the language of the desired output.
# Examples of language codes: "en" for English, "fr" for French, "de" for German, "es" for Spanish, "zh" for Chinese; for instance, `translation_en_to_fr` translates English to French.
# The full list of languages is available at https://huggingface.co/languages
# Checkpoint id that test() passes to the translation pipeline.
check = "lseely916/CMSC_473_mt5"

def test(n, lang):
  """Print `n` randomly drawn test pairs for `lang` with the model's translation.

  Args:
    n: number of examples to draw (with replacement) and print.
    lang: key of `lang_dict` ('es', 'zh' or 'ja') naming the source language.

  Raises:
    KeyError: if `lang` is not a key of `lang_dict`.
  """
  source_name = lang_dict[lang]
  translator = pipeline(f"translation_{source_name}_to_en", model=check, max_length=256)

  # Training inputs were built as "translate <Language> to English: <text>",
  # so inference must use the same prefix to match what the model learned.
  prefix = f"translate {source_name} to English: "

  # The test set depends only on `lang`, so pick it once outside the loop.
  test_set = test_es if lang=='es' else test_zh if lang=='zh' else test_ja

  for _ in range(n):
    pair = test_set[random.randrange(0, len(test_set))]['translation']
    print('source:', pair[lang])
    print('target:', pair["en"])
    print('prediction:', translator(prefix + pair[lang])[0]['translation_text'])
    print()

# Spot-check ten random Chinese test examples.
test(10, 'zh')



source: C. 决议草案A/C.3/57/L.25和Rev.1以及文件A/C.3/57/L.72所载修正案
target: C. Draft resolution A/C.3/57/L.25 and Rev.1 and amendment contained in document A/C.3/57/L.72
prediction: C. Decision proposalA/C.3/57/L.25 and rev.1 and a documentA/C.3/57/L. 72 released a proposal for C.3/57/L. 72 released a proposal for C.3/57/L. 72 released a proposal for C.3/57/L. 72 released a proposal for C.3/57/L. 72 released a proposal for C.3/57/L. 72 released a proposal for C.3/57/L. 72 released a proposal for C.3/57/L. 72 released a proposal for C.3/57/L. 72 released a proposal for C.3/57/L. 72 released a proposal for C.3/57/L. 72 released a proposal for C.3/57/L. 72 released a proposal for C.3/57/L. 72 released a proposal for C.3/57/L. 72 released a proposal for C.3/57/L. 72 released a proposal for C.3/57/L. 72 released a proposal for C.3/57/L. 72

source: 甲替硝基苯胺(MNA;PNMA)
target: (a) Complete rotor assemblies:
prediction: MNA;PNMA)

source: 8.1 人权事务委员会根据当事各方依《任择议定书》第五条第1款规定提交的一切现有资料审议了来文。
target: 8.1 The Com