In [None]:
import re
import pandas as pd
import numpy as np

import jionlp as jio
from torch.optim.lr_scheduler import LambdaLR, StepLR, MultiStepLR, ExponentialLR, ReduceLROnPlateau 
from rouge_score import rouge_scorer
from datasets import load_metric
from torch.cuda import  amp
from tqdm import tqdm

# Read the monthly CSV files and combine them into one DataFrame


In [None]:
# Zero-padded month suffixes '01'..'12' that select the 2023 monthly CSV files.
month_list = [f'{month_number:02d}' for month_number in range(1, 13)]

# Read every month first, then concatenate once. Growing df_all inside the
# loop re-copies all previously loaded rows on each iteration.
monthly_frames = [
    pd.read_csv(f'./dataset/dataset_training/2023{month}.csv')
    for month in month_list
]
df_all = pd.concat(monthly_frames, axis=0)

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.tensorboard import SummaryWriter

# The tokenizer and the seq2seq weights are loaded from the same checkpoint identifier.
base_checkpoint = "original_model/small_helmet_418M"
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint)

# TensorBoard writer for this run's log directory.
writer = SummaryWriter('runs/small_helmet_418M')

# Hyperparameter settings

In [None]:
batch_size = 16        # per-device train/eval batch size; also reused as the chunk size for dataset.map
learning_rate = 1e-4   # intended optimizer learning rate
num_epochs = 10        # NOTE(review): not referenced by the visible training setup, which is step-based
max_length = 256       # token length that questions and answers are padded/truncated to

In [None]:
from datasets import Dataset

# Fixed seed so the train/test partition is identical on every run.
SPLIT_SEED = 42


def _normalize_text(text):
    """Clean one text field with jionlp, then collapse each whitespace run to a single space."""
    return re.sub(r'\s+', ' ', jio.clean_text(text))


# Keep only the two text columns, name them by their role in the seq2seq
# task (p_fact is the model input, p_claim is the target), and drop rows
# where either field is missing.
data = (
    df_all[['p_claim', 'p_fact']]
    .rename(columns={'p_claim': 'answer', 'p_fact': 'question'})
    .dropna()
    .reset_index(drop=True)
)
data['question'] = data['question'].apply(_normalize_text)
data['answer'] = data['answer'].apply(_normalize_text)

# Turn the frame into a Hugging Face dataset and hold out 20% for evaluation.
dataset = Dataset.from_pandas(data)
dataset = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

In [None]:
#tokenize the data
def process_data_to_model_inputs(batch):
    """Tokenize a batch of question/answer pairs into seq2seq training features.

    Args:
        batch: Mapping with "question" and "answer" entries holding the texts
            of one batch (as supplied by ``dataset.map(..., batched=True)``).

    Returns:
        The same ``batch`` object with four added entries:
        "input_ids" and "attention_mask" from the tokenized questions,
        "labels" from the tokenized answers with padded positions set to -100,
        and "decoder_attention_mask" from the tokenized answers.
    """
    # Value that the loss function skips; used to blank out label padding.
    ignore_index = -100

    # Both sides are padded/truncated to the module-level max_length so every
    # example has the same length.
    inputs = tokenizer(batch["question"], padding="max_length", truncation=True, max_length=max_length)
    outputs = tokenizer(batch["answer"], padding="max_length", truncation=True, max_length=max_length)

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["decoder_attention_mask"] = outputs.attention_mask

    # Replace padded label positions with the ignore index so padding does not
    # contribute to the loss. The attention mask (0 = padding) is used instead
    # of comparing against the pad token id, so a real token that shares an id
    # with the pad token is never masked.
    batch["labels"] = [
        [token_id if keep else ignore_index for token_id, keep in zip(token_ids, mask)]
        for token_ids, mask in zip(outputs.input_ids, outputs.attention_mask)
    ]

    return batch


# Tokenize every split in batches and drop the raw text columns, which are
# no longer needed once the token features exist.
raw_text_columns = ["question", "answer"]
dataset = dataset.map(
    process_data_to_model_inputs,
    remove_columns=raw_text_columns,
    batch_size=batch_size,
    batched=True,
)

In [None]:
# Configure and run training
from sklearn import metrics
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import EarlyStoppingCallback



# Step-based training configuration: evaluate every 500 steps, warm up for
# 500 steps, stop after 10000 steps, with fp16 mixed precision enabled.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Pass the learning rate from the hyperparameter cell; without this the
    # declared value is ignored and the library default is used instead.
    learning_rate=learning_rate,
    predict_with_generate=True,
    warmup_steps=500,
    eval_steps=500,
    # Training length is controlled by max_steps, which takes precedence over
    # an epoch count, so num_epochs is deliberately not passed here.
    max_steps=10000,
    fp16=True
)

# NOTE(review): this GradScaler is created but never handed to the trainer
# below; mixed precision is requested through fp16=True in training_args.
# Confirm nothing else uses `scaler` before removing it.
scaler = amp.GradScaler()

# Pick the splits up front so the trainer call reads as plain wiring.
train_split = dataset["train"]
eval_split = dataset["test"]

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

trainer.train()


In [None]:

# Write the fine-tuned weights and the tokenizer files to the same directory
# so the pair can be reloaded together.
trained_model_dir = 'trained_model/small_helmet_418M'
model.save_pretrained(trained_model_dir)
tokenizer.save_pretrained(trained_model_dir)
