In [None]:
import os
import json
import re
import pandas as pd
import numpy as np
import csv
import torch
import jionlp as jio
from torch.optim.lr_scheduler import LambdaLR, StepLR, MultiStepLR, ExponentialLR, ReduceLROnPlateau 
from rouge_score import rouge_scorer
from datasets import load_metric
from torch.cuda import  amp
from tqdm import tqdm
# Zero-padded month labels, '01' through '12'.
month_list = [f'{month:02d}' for month in range(1, 13)]

# Read the 202312 training CSV; the utf-8-sig codec strips a leading BOM if present.
data = pd.read_csv('dataset/dataset_training/202312.csv', encoding="utf-8-sig")
data.head()  # value is discarded here (not the last expression of the cell)
# with open('hyperparameters.json', 'r') as f:
#     hyperparameters = json.load(f)


from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Tokenizer and seq2seq model are both loaded from the same local checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained("original_model/m2m100_418M")
model = AutoModelForSeq2SeqLM.from_pretrained("original_model/m2m100_418M")


from torch.utils.tensorboard import SummaryWriter

# TensorBoard writer for this run; scalars are logged under runs/model5.
writer = SummaryWriter('runs/model5')

In [None]:
# Run configuration read by the dataset, training and evaluation cells.
batch_size = 4        # examples per DataLoader batch
learning_rate = 1e-5  # AdamW learning rate
num_epochs = 60       # passes over the training loader
max_length = 512      # tokenizer truncation/padding length and generation cap

In [None]:

# p_fact is used as the model input ("question"), p_claim as the target ("answer").
# Keep just those two columns, drop rows with a missing value and renumber the index.
data = (
    data[['p_claim', 'p_fact']]
    .rename(columns={'p_claim': 'answer', 'p_fact': 'question'})
    .dropna()
    .reset_index(drop=True)
)
data.head()

# Run jionlp's text cleaner on each column, then collapse every whitespace run to one space.
for column in ('question', 'answer'):
    cleaned = data[column].apply(jio.clean_text)
    data[column] = cleaned.apply(lambda text: re.sub(r'\s+', ' ', text))


In [None]:
from torch.utils.data import Dataset, DataLoader
class CustomDataset(Dataset):
    """Tokenized question -> answer pairs for teacher-forced seq2seq training.

    Each item is a dict of 1-D tensors of length ``max_length``:

    * ``input_ids`` / ``attention_mask`` -- the tokenized question.
    * ``labels`` -- the tokenized answer with padding replaced by -100 so the
      loss ignores those positions.
    * ``decoder_input_ids`` -- the tokenized answer shifted one position to the
      right behind a decoder start token. The decoder must never be fed the
      token it is asked to predict at the same position; feeding the unshifted
      target lets the model minimise the loss by copying its input.

    Args:
        data: Table with ``'question'`` and ``'answer'`` columns of strings.
        tokenizer: Callable tokenizer exposing ``pad_token_id`` (and
            ``eos_token_id`` when ``decoder_start_token_id`` is not given).
        max_length: Truncation and padding length for both sides.
        decoder_start_token_id: First decoder input token. Defaults to the
            tokenizer's EOS id, which is what M2M100 checkpoints use as the
            decoder start token -- for other architectures pass
            ``model.config.decoder_start_token_id`` explicitly.

    Raises:
        ValueError: If no decoder start token can be determined.
    """

    def __init__(self, data, tokenizer, max_length, decoder_start_token_id=None):
        self.inputs = data['question'].tolist()
        self.targets = data['answer'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

        if decoder_start_token_id is None:
            decoder_start_token_id = getattr(tokenizer, 'eos_token_id', None)
        if decoder_start_token_id is None:
            raise ValueError(
                'decoder_start_token_id was not given and the tokenizer has no eos_token_id'
            )
        self.decoder_start_token_id = decoder_start_token_id

    def __len__(self):
        return len(self.inputs)

    def _encode(self, text):
        """Tokenize one string, truncated/padded to ``max_length``, as a (1, max_length) batch."""
        return self.tokenizer(text, truncation=True, max_length=self.max_length, padding='max_length', return_tensors='pt')

    def __getitem__(self, index):
        encoding = self._encode(self.inputs[index])
        # squeeze(0) removes only the batch axis, so the result stays 1-D even for max_length == 1.
        input_ids = encoding.input_ids.squeeze(0)
        attention_mask = encoding.attention_mask.squeeze(0)

        target_ids = self._encode(self.targets[index]).input_ids.squeeze(0)

        # Teacher forcing: position t of the decoder input holds target token t-1.
        decoder_input_ids = torch.empty_like(target_ids)
        decoder_input_ids[0] = self.decoder_start_token_id
        decoder_input_ids[1:] = target_ids[:-1]

        # -100 is the index ignored by the cross-entropy loss.
        labels = target_ids.clone()
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'decoder_input_ids': decoder_input_ids,
            'labels': labels
        }
    
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
from sklearn.model_selection import train_test_split
train_data, test_data = (
    part.reset_index(drop=True)
    for part in train_test_split(data, test_size=0.2, random_state=42)
)

# Wrap each split so rows are tokenized on access.
train_dataset = CustomDataset(train_data, tokenizer, max_length)
test_dataset = CustomDataset(test_data, tokenizer, max_length)

# Only the training batches are shuffled; the evaluation loader keeps row order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)




In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

def train_model(model, train_loader, num_epochs, learning_rate):
    """Fine-tune a seq2seq model with AdamW, using mixed precision when CUDA is available.

    Args:
        model: Seq2seq model whose forward pass accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object with ``.loss``.
        train_loader: Iterable of batches (dicts of tensors) providing
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        num_epochs: Number of passes over ``train_loader``.
        learning_rate: AdamW learning rate.

    Side effects:
        Moves ``model`` to the selected device, leaves it in training mode, and
        every 100 steps prints the loss and logs it to the module-level
        TensorBoard ``writer``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Autocast and loss scaling only apply to CUDA; disabling them on CPU avoids
    # the warnings torch.cuda.amp emits there and makes the scaler a no-op.
    use_amp = device.type == 'cuda'

    model = model.to(device)
    # from_pretrained() returns the model in eval mode; enable dropout for training.
    model.train()

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    scaler = amp.GradScaler(enabled=use_amp)

    steps_per_epoch = len(train_loader)

    for epoch in range(num_epochs):
        for i, batch in enumerate(train_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()

            with amp.autocast(enabled=use_amp):
                # Only `labels` is passed: the model then builds its decoder inputs
                # by shifting the labels one position to the right. Any
                # 'decoder_input_ids' in the batch is deliberately ignored, because
                # feeding the decoder an unshifted copy of the target lets it read
                # the token it is supposed to predict.
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            if i % 100 == 0:
                print(f'Epoch: {epoch}, Loss: {loss.item()}')
                writer.add_scalar('training loss', loss.item(), epoch*steps_per_epoch + i)


# Start fine-tuning with the hyperparameters set in the configuration cell.
train_model(
    model=model,
    train_loader=train_loader,
    num_epochs=num_epochs,
    learning_rate=learning_rate,
)




In [None]:
def evaluate_model(model, test_loader):
    """Generate with beam search for every batch and score the output with ROUGE.

    Uses the module-level ``tokenizer`` for decoding and the module-level
    ``max_length`` as the generation length cap.

    Args:
        model: Seq2seq model exposing ``generate``.
        test_loader: Iterable of batches (dicts of tensors) providing
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``; label
            positions equal to -100 are treated as padding.

    Returns:
        The result of the ``rouge`` metric's ``compute(use_stemmer=True)``.

    Side effects:
        Moves ``model`` to the selected device and leaves it in eval mode.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()

    metric = load_metric('rouge')

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_length, num_beams=4, early_stopping=True)
            predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            # -100 is a loss-masking sentinel, not a vocabulary id. Put the pad id
            # back before decoding so skip_special_tokens can drop the padding.
            # Labels are only decoded, so they never need to leave the CPU.
            labels = batch['labels']
            labels = labels.masked_fill(labels == -100, tokenizer.pad_token_id)
            references = tokenizer.batch_decode(labels, skip_special_tokens=True)

            for prediction, reference in zip(predictions, references):
                metric.add(prediction=prediction, reference=reference)

    scores = metric.compute(use_stemmer=True)
    return scores

# Score the model on the held-out loader and print the ROUGE result.
scores = evaluate_model(model=model, test_loader=test_loader)
print(scores)

In [None]:

# Write the model and then the tokenizer into the same output directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained('trained_model/model5')


In [None]:

# # 
# metric = load_metric("sacrebleu")
# model.eval()
# predictions = []
# labels = []
# for batch in test_loader:
#     input_ids = batch['input_ids'].to(device)
#     attention_mask = batch['attention_mask'].to(device)
#     labels_ids = batch['labels'].to(device)
#     output = model.generate(input_ids, attention_mask=attention_mask)
#     predictions.extend(tokenizer.batch_decode(output, skip_special_tokens=True))
#     labels.extend(tokenizer.batch_decode(labels_ids, skip_special_tokens=True))

In [None]:
# scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
# scores = []
# for i in range(len(predictions)):
#     scores.append(scorer.score(predictions[i], labels[i]))

# rouge1 = [score['rouge1'].fmeasure for score in scores]
# rouge2 = [score['rouge2'].fmeasure for score in scores]
# rougeL = [score['rougeL'].fmeasure for score in scores]

# np.mean(rouge1), np.mean(rouge2), np.mean(rougeL)

In [None]:
# len(train_loader)