In [1]:
import os
import json
import re
import pandas as pd
import numpy as np
import csv
import torch
import jionlp as jio
from torch.optim.lr_scheduler import LambdaLR, StepLR, MultiStepLR, ExponentialLR, ReduceLROnPlateau 
from rouge_score import rouge_scorer
from datasets import load_metric
from torch.cuda import  amp
from tqdm import tqdm

#read data from csv base on month and save in to  one datadrame


# jionlp - 微信公众号: JioNLP  Github: `https://github.com/dongrixinyu/JioNLP`.
# jiojio - `http://www.jionlp.com/jionlp_online/cws_pos` is available for online trial.


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
month_list = ['01','02','03','04','05','06','07','08','09','10','11','12']
for month in month_list:
    df = pd.read_csv(f'./dataset/dataset_training/2023{month}.csv')
    if month == '01':
        df_all = df
    else:
        df_all = pd.concat([df_all, df], axis=0)

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained("original_model/m2m100_418M")
model = AutoModelForSeq2SeqLM.from_pretrained("original_model/m2m100_418M")
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('runs/model5')

In [4]:
max_length = 256
num_epochs = 10
learning_rate = 1e-4
batch_size = 12

In [5]:
data = df_all[['p_claim', 'p_fact']]
data = data.rename(columns={'p_claim': 'answer', 'p_fact': 'question'})
data = data.dropna()
data = data.reset_index(drop=True)
data['question'] = data['question'].apply(lambda x: jio.clean_text(x))
data['answer'] = data['answer'].apply(lambda x: jio.clean_text(x))
data['question'] = data['question'].apply(lambda x: re.sub(r'\s+', ' ', x))
data['answer'] = data['answer'].apply(lambda x: re.sub(r'\s+', ' ', x))
#turn the data into a dataset
from datasets import Dataset
dataset = Dataset.from_pandas(data)
dataset = dataset.train_test_split(test_size=0.2)

In [6]:
#tokenize the data
def process_data_to_model_inputs(batch):

    # tokenize the inputs and labels
    inputs = tokenizer(batch["question"], padding="max_length", truncation=True, max_length=max_length)
    outputs = tokenizer(batch["answer"], padding="max_length", truncation=True, max_length=max_length)

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["labels"] = outputs.input_ids
    batch["decoder_attention_mask"] = outputs.attention_mask

    return batch


dataset = dataset.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=["question", "answer"]
)

Map: 100%|██████████| 42647/42647 [00:36<00:00, 1160.37 examples/s]
Map: 100%|██████████| 10662/10662 [00:08<00:00, 1232.15 examples/s]


In [7]:
#load the model
from sklearn import metrics
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import EarlyStoppingCallback



training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    warmup_steps=500,
    eval_steps=500,
    max_steps=8000,

    fp16=True

)

#use amp to train the model
scaler = amp.GradScaler()
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    
    )

trainer.train()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
 50%|█████     | 500/1000 [35:36<36:06,  4.33s/it]  

{'loss': 5.3047, 'grad_norm': 13.844526290893555, 'learning_rate': 1.24e-05, 'epoch': 0.14}


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5}


In [None]:

model.save_pretrained('trained_model/model5')
tokenizer.save_pretrained('trained_model/model5')
