In [None]:
#!pip install datasets transformers sentencepiece --quiet

In [None]:
from google.colab import data_table; data_table.enable_dataframe_formatter()
import numpy as np; np.random.seed(123)
from accelerate import Accelerator
import pandas as pd
import datasets
import json
import gc
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoTokenizer
import numpy as np
import torch

In [None]:
from IPython.display import HTML, display

def set_css(info=None):
  """Display a ``<style>`` block that makes ``<pre>`` text wrap.

  Used as a ``pre_run_cell`` callback. IPython hands such callbacks an
  execution-info object, so the parameter is accepted (and ignored); its
  default keeps a plain ``set_css()`` call working.

  Args:
    info: Execution-info object supplied by IPython; unused.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))

# Run set_css before every cell execution so the style block is displayed again for each cell.
get_ipython().events.register('pre_run_cell', set_css)

def clear_cache():
  """Release cached CUDA memory that is no longer referenced.

  Does nothing when CUDA is unavailable. Rebinding a name inside this
  function cannot free an object the caller still references, so drop your
  own references (``del model``, ``del trainer``, ...) before calling.
  """
  if torch.cuda.is_available():
    # Collect unreachable objects first so tensors they hold go back to the
    # caching allocator before it is emptied.
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Load the raw training records into a DataFrame.
# Assumes Google Drive is already mounted at /content/drive — TODO confirm.
# The file is decoded as UTF-8 explicitly so the result does not depend on the
# runtime's default locale encoding.
with open("/content/drive/MyDrive/data/train_data_with_correct_answer.json", "r", encoding="utf-8") as json_file:
    dataset = pd.DataFrame(json.load(json_file))

In [None]:
# Preview the first two rows of the loaded frame.
dataset.head(2)

In [None]:
def read_data(data: pd.DataFrame):
    """Convert the raw question frame into a Hugging Face ``Dataset``.

    Every kept row becomes ``{"math": {"problem": ..., "answer": ...}}`` where
    ``problem`` is the row's ``Problem`` text, a newline, and the row's
    ``options`` text, and ``answer`` is the row's ``correct`` value. Rows whose
    ``options`` value equals the empty string are skipped.

    Args:
        data: Frame with ``Problem``, ``options`` and ``correct`` columns.
            ``Problem`` and ``options`` are assumed to hold strings because
            they are concatenated — TODO confirm against the data file.

    Returns:
        The ``datasets.Dataset`` built from a frame with an ``id`` column
        (0-based position among the kept rows) and a ``math`` column.
    """
    # Walk the three columns in lockstep so the emptiness check and the
    # concatenation both use the values of ONE row (comparing or adding the
    # whole column is an error), independent of the frame's index labels.
    rows = zip(data['Problem'], data['options'], data['correct'])
    records = [
        {"math": {"problem": problem + "\n" + options, "answer": answer}}
        for problem, options, answer in rows
        if options != ""
    ]
    print(f'total size of data is {len(records)}')
    # The positional index of the kept rows becomes the explicit "id" column.
    frame = pd.DataFrame(records).reset_index().rename(columns={'index': 'id'})
    return datasets.Dataset.from_pandas(frame)

In [None]:
# Build the training Dataset from the raw frame.
train_data = read_data(data=dataset)

In [None]:
# Checkpoint that is loaded below; the commented-out names are alternative checkpoints.
checkpoint="google/flan-t5-base"
# checkpoint="NghiemAbe/flan-t5-base-mathqa_v1"
# checkpoint="NghiemAbe/flan-t5-base-mathqa_v2"

# Tokenizer and seq2seq model are both loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of math records into model inputs with labels.

    Reads the ``"math"`` entries of ``examples``, tokenizes every entry's
    ``"problem"`` text as the input and its ``"answer"`` text as the target
    (both with ``max_length=512`` and truncation enabled), and returns the
    input encoding with the target token ids stored under ``"labels"``.
    """
    input_limit, label_limit = 512, 512

    problems = [record["problem"] for record in examples["math"]]
    answers = [record["answer"] for record in examples["math"]]

    # Inputs are encoded first, then the targets; only the targets' token ids
    # are kept.
    encoded = tokenizer(problems, max_length=input_limit, truncation=True)
    encoded["labels"] = tokenizer(answers, max_length=label_limit, truncation=True)["input_ids"]
    return encoded

In [None]:
# Tokenize the training data; batched=True means preprocess_function receives a batch of rows per call.
tokenized_datasets = train_data.map(preprocess_function, batched=True)

In [None]:
# Batch collator for seq2seq training, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Collect unreachable Python objects first so any GPU tensors they hold are
# handed back to PyTorch's caching allocator, then release the cached blocks.
gc.collect()
torch.cuda.empty_cache()

*NOTE:*

Các thông số TrainingArguments cho mỗi lần Finetune xem ở link:
- https://huggingface.co/NghiemAbe/flan-t5-base-mathqa_v1
- https://huggingface.co/NghiemAbe/flan-t5-base-mathqa_v2
- https://huggingface.co/NghiemAbe/flan-t5-base-mathqa_v3

In [None]:
# Training configuration for this run; output_dir points at a Google Drive folder.
args = Seq2SeqTrainingArguments(output_dir="/content/drive/MyDrive/flan-t5-base-mathqa_v1/",
                        do_train=True,
                        warmup_steps=100,
                        optim="adamw_torch",
                        per_device_train_batch_size=8,
                        gradient_accumulation_steps=1,
                        learning_rate=5e-5,
                        num_train_epochs=3,
                        predict_with_generate=True,
                        logging_steps=100,
                        save_steps=2000,
                        )


# Only a training split is passed in; no eval_dataset is given.
trainer = Seq2SeqTrainer(model=model,
                args=args,
                data_collator=data_collator,
                train_dataset=tokenized_datasets,
                tokenizer=tokenizer,
                )

accelerator = Accelerator()

# NOTE(review): the trainer above was already built with the un-prepared dataset, and
# Accelerator.prepare is documented for models/optimizers/dataloaders/schedulers —
# this call presumably returns both objects unchanged; confirm and drop it if so.
tokenized_datasets, trainer = accelerator.prepare(
     tokenized_datasets, trainer
      )

# Start fine-tuning.
trainer.train()

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from the notebook; no token is hardcoded in this file.
notebook_login()

In [None]:
# Push the trainer's model to the Hugging Face Hub.
trainer.push_to_hub()

In [None]:
# Drop the notebook's reference to the trainer.
# NOTE(review): the global `model` still references the network, so its memory
# is presumably not released by this alone — confirm.
del trainer

In [None]:
# Collect unreachable Python objects first so any GPU tensors they hold are
# handed back to PyTorch's caching allocator, then release the cached blocks.
gc.collect()
torch.cuda.empty_cache()