In [None]:
import json
import pandas as pd

# Names of the CSV columns holding the model input text and the target text.
INPUT_NAME = 'abstract'
OUTPUT_NAME = 'comments'

# Hugging Face checkpoint identifier shared by the tokenizer and the model cells.
checkpoint = "t5-small"

# Read the CSV, keep only the two columns of interest and discard every row
# that is missing either of them.
_text_columns = [INPUT_NAME, OUTPUT_NAME]
df = pd.read_csv('./data/data.csv').loc[:, _text_columns].dropna(axis=0, how='any')

# Parallel arrays: features[i] is the input text whose target is labels[i].
features, labels = (df[column].values for column in _text_columns)

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer that belongs to the checkpoint named in `checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
from transformers import AutoModelForSeq2SeqLM
# Instantiate the sequence-to-sequence model from the same `checkpoint` identifier.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
from transformers import DataCollatorForSeq2Seq
# Dynamic-padding collator for seq2seq batches.
# `model` must be the model *object* (not the checkpoint name string): the
# collator checks it for `prepare_decoder_input_ids_from_labels` in order to
# build `decoder_input_ids` from the labels. A plain string has no such method,
# so that step was silently skipped before.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# Tokenize inputs and targets in one call; `text_target` makes the tokenizer
# emit the target token ids alongside the input encodings. Sequences longer
# than 1024 tokens are truncated.
source_texts = list(features)
target_texts = list(labels)
model_inputs = tokenizer(
    source_texts,
    text_target=target_texts,
    truncation=True,
    max_length=1024,
)

In [None]:
from datasets import Dataset
# Wrap the tokenized columns in a Dataset, then hold out 20% of the rows as
# the "test" split (the remaining 80% become "train").
model_dataset = Dataset.from_dict(model_inputs).train_test_split(test_size=0.2)

In [None]:
from sklearn.metrics import r2_score
def _number_near_phrase(text, phrase):
    """Return the integer attached to the first word of ``text`` containing ``phrase``.

    The text is lower-cased and split on whitespace. The first word that
    contains ``phrase`` is located, and that word and then the word directly
    before it are checked for a purely numeric token (e.g. ``"7 figures"``).

    Returns ``None`` when no word contains the phrase or neither candidate
    word is a number.
    """
    words = text.lower().split()
    hit = next((i for i, word in enumerate(words) if phrase in word), None)
    if hit is None:
        return None
    # Inspect the matching word and its predecessor only. The lower bound is
    # clamped so that a match at position 0 never wraps around to words[-1].
    for i in range(hit, max(hit - 2, -1), -1):
        # isdecimal() guarantees int() can parse the token.
        if words[i].isdecimal():
            return int(words[i])
    return None


def phrase_num_metric(decoded_predictions,decoded_labels,phrase,result):
    """Compare the number quoted next to ``phrase`` in predictions and labels.

    For every (prediction, label) pair in which both texts mention ``phrase``
    with a number on or directly before the matching word, the two numbers
    are collected. Three entries are then written into ``result``:

    * ``f'{phrase}_r2'``       - R^2 of predicted vs. label numbers,
    * ``f'{phrase}_lab_avg'``  - mean of the label numbers,
    * ``f'{phrase}_pred_avg'`` - mean of the predicted numbers.

    Args:
        decoded_predictions: iterable of predicted strings.
        decoded_labels: iterable of reference strings, aligned with the predictions.
        phrase: lower-case substring to look for (matched against lower-cased text).
        result: dict updated in place with the three metrics.

    When no pair qualifies, all three metrics are NaN instead of raising.
    R^2 is also NaN when fewer than two pairs qualify, since it is undefined
    for a single sample.
    """
    preds = []
    labels = []
    for dec_pred, dec_lab in zip(decoded_predictions, decoded_labels):
        predicted_value = _number_near_phrase(dec_pred, phrase)
        label_value = _number_near_phrase(dec_lab, phrase)
        if predicted_value is not None and label_value is not None:
            preds.append(predicted_value)
            labels.append(label_value)

    pair_count = len(labels)
    undefined = float('nan')
    result[f'{phrase}_r2'] = r2_score(labels, preds) if pair_count >= 2 else undefined
    result[f'{phrase}_lab_avg'] = sum(labels) / pair_count if pair_count else undefined
    result[f'{phrase}_pred_avg'] = sum(preds) / pair_count if pair_count else undefined

In [None]:
def phrase_metric(decoded_predictions,decoded_labels,phrases,result):
    """Score how well predictions mention the same phrase group as the labels.

    A text counts as "positive" when its lower-cased form contains at least
    one of ``phrases``. Each (prediction, label) pair is classified as a true
    positive, false positive or false negative, and precision, recall and F1
    are written into ``result`` under the keys ``f'{phrases[0]}_Precision'``,
    ``f'{phrases[0]}_Recall'`` and ``f'{phrases[0]}_F1'``.

    Args:
        decoded_predictions: iterable of predicted strings.
        decoded_labels: iterable of reference strings, aligned with the predictions.
        phrases: non-empty sequence of lower-case substrings; the first one
            names the metric keys.
        result: dict updated in place.

    A metric whose denominator is zero is reported as 0.
    """
    tp = fp = fn = 0
    for dec_pred, dec_lab in zip(decoded_predictions, decoded_labels):
        pred_text = dec_pred.lower()
        label_text = dec_lab.lower()
        in_label = any(phrase in label_text for phrase in phrases)
        in_pred = any(phrase in pred_text for phrase in phrases)
        if in_label and in_pred:
            tp += 1
        elif in_label:
            fn += 1
        elif in_pred:
            fp += 1
        # Pairs where neither text mentions a phrase (true negatives) do not
        # enter precision/recall, so they are not counted.

    # Explicit zero-denominator checks replace the former bare `except:`
    # blocks, which would also have hidden unrelated errors.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

    key = phrases[0]
    result[f'{key}_Precision'] = precision
    result[f'{key}_Recall'] = recall
    result[f'{key}_F1'] = f1

In [None]:
def space_metric(decoded_predictions,decoded_labels,result):
    """Score how well predictions reproduce "single token" (space-free) labels.

    Here the positive class is a text that contains NO space character.
    Each (prediction, label) pair is classified accordingly, and precision,
    recall and F1 are written into ``result``. The keys are built from the
    space character itself, i.e. they are literally ``' _Precision'``,
    ``' _Recall'`` and ``' _F1'`` (with a leading space).

    Args:
        decoded_predictions: iterable of predicted strings.
        decoded_labels: iterable of reference strings, aligned with the predictions.
        result: dict updated in place.

    A metric whose denominator is zero is reported as 0.
    """
    phrases = (' ',)
    tp = fp = fn = 0
    for dec_pred, dec_lab in zip(decoded_predictions, decoded_labels):
        pred_is_spaceless = not any(phrase in dec_pred.lower() for phrase in phrases)
        label_is_spaceless = not any(phrase in dec_lab.lower() for phrase in phrases)
        if label_is_spaceless and pred_is_spaceless:
            tp += 1
        elif label_is_spaceless:
            fn += 1
        elif pred_is_spaceless:
            fp += 1
        # Pairs where both texts contain a space (true negatives) do not
        # enter precision/recall, so they are not counted.

    # Explicit zero-denominator checks replace the former bare `except:`
    # blocks, which would also have hidden unrelated errors.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

    result[f'{phrases[0]}_Precision'] = precision
    result[f'{phrases[0]}_Recall'] = recall
    result[f'{phrases[0]}_F1'] = f1

In [None]:
import numpy as np
import evaluate

# ROUGE scorer, loaded once at cell level and reused by compute_metrics below.
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Turn generated token ids and label ids into the evaluation metrics.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays as handed
            over by the trainer. ``predictions`` may also arrive as a tuple
            whose first element holds the generated ids.

    Returns:
        dict mapping metric name to a value rounded to 4 decimals: the ROUGE
        scores, the mean generated length (``gen_len``), and the phrase,
        space and phrase-number metrics computed on the decoded texts.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is an ignore/padding marker, not a vocabulary id, so the tokenizer
    # cannot decode it. Replace it with the pad token on both sides.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Generated length = number of non-padding tokens per prediction.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    # Each helper adds its own keys to `result` in place.
    phrase_metric(decoded_preds,decoded_labels,('figure',),result)
    phrase_metric(decoded_preds,decoded_labels,('page',),result)
    phrase_metric(decoded_preds,decoded_labels,('version',),result)
    phrase_metric(decoded_preds,decoded_labels,('publish','submit'),result)
    space_metric(decoded_preds,decoded_labels,result)
    phrase_num_metric(decoded_preds,decoded_labels,'figure',result)
    phrase_num_metric(decoded_preds,decoded_labels,'page',result)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
# Imported in this cell because it is only needed to probe for a GPU.
import torch

# Hyper-parameters for fine-tuning; evaluation runs once per epoch and uses
# generation (predict_with_generate) so compute_metrics receives token ids.
training_args = Seq2SeqTrainingArguments(
    output_dir="./model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=6,
    predict_with_generate=True,
    # Mixed-precision fp16 training needs a GPU; requesting it unconditionally
    # makes argument construction fail on CPU-only machines. Enable it only
    # when CUDA is actually available.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
    report_to='none'
)

# Wire model, data splits, tokenizer, collator and metric function together.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=model_dataset["train"],
    eval_dataset=model_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
from transformers import AutoTokenizer
# Sample abstract used to try out the fine-tuned model by hand.
text='A fully differential calculation in perturbative quantum chromodynamics is presented for the production of massive photon pairs at hadron colliders. All next-to-leading order perturbative contributions from quark-antiquark, gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as all-orders resummation of initial-state gluon radiation valid at next-to-next-to-leading logarithmic accuracy. The region of phase space is specified in which the calculation is most reliable. Good agreement is demonstrated with data from the Fermilab Tevatron, and predictions are made for more detailed tests with CDF and DO data. Predictions are shown for distributions of diphoton pairs produced at the energy of the Large Hadron Collider (LHC). Distributions of the diphoton pairs from the decay of a Higgs boson are contrasted with those produced from QCD processes at the LHC, showing that enhanced sensitivity to the signal can be obtained with judicious selection of events.'
# Tokenize to PyTorch tensors and keep only the token ids (no truncation is requested here).
inputs = tokenizer(text, return_tensors="pt").input_ids

In [None]:
# Move the model to the CPU before generating; `inputs` was created without an
# explicit device, so presumably it lives on the CPU as well.
model.to('cpu')
# Generate at most 100 new tokens with sampling disabled.
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)

In [None]:
# Decode the first generated sequence back to text, dropping special tokens;
# as the cell's last expression it is shown as the cell output.
tokenizer.decode(outputs[0], skip_special_tokens=True)