## Configuration

In [None]:
!pip install git_root

PROJECT_ROOT = None
in_colab = 'google.colab' in str(get_ipython())

if in_colab:
  print('Running on CoLab')
  PROJECT_ROOT = "/content/drive/MyDrive/DL4NLP/abstract-to-title-generation/"
  from google.colab import drive
  drive.mount('/content/drive')

else:
  print('Running on local machine')
  from git_root import git_root
  PROJECT_ROOT = git_root()

%cd {PROJECT_ROOT}
# install requirements
!pip install -r requirements.txt

# pull data only pulls changed data
!dvc pull

In [None]:
# Checkpoint name used to load the model and to derive the output directory name.
model_checkpoint = "gpt2"

In [None]:
from datasets import Dataset, load_dataset, load_metric
from transformers import GPT2LMHeadModel, DataCollatorForLanguageModeling, TrainingArguments, Trainer, GPT2Tokenizer
import pandas as pd
import nltk
import numpy as np

## Training

In [None]:
# Load the pretrained GPT-2 language-model head for the configured checkpoint.
model = GPT2LMHeadModel.from_pretrained(model_checkpoint)

In [None]:
    
def load_tokenizer(checkpoint="gpt2"):
    """Load a GPT-2 tokenizer and register the special tokens used in this notebook.

    Args:
        checkpoint: Name or path of the tokenizer checkpoint. Defaults to
            ``"gpt2"``, the value that used to be hard-coded, so existing
            ``load_tokenizer()`` calls behave exactly as before.

    Returns:
        A ``GPT2Tokenizer`` with ``<|startoftext|>`` as BOS, ``<|endoftext|>``
        as EOS, ``<pad>`` as padding token and ``<TITLE>`` as an additional
        special token.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
    special_tokens = {
        'bos_token': '<|startoftext|>',
        'eos_token': '<|endoftext|>',
        'pad_token': '<pad>',
        'additional_special_tokens': ['<TITLE>'],
    }
    tokenizer.add_special_tokens(special_tokens)
    return tokenizer

tokenizer = load_tokenizer()
# load_tokenizer() adds special tokens, so resize the embedding matrix to the
# tokenizer's new vocabulary size.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Load the training pairs; the first CSV column is used as the index.
# NOTE(review): the path is relative to the working directory set by `%cd` in the
# configuration cell — confirm that "../data" resolves to the intended folder.
df = pd.read_csv("../data/filtered/train_pairs.csv", index_col=0)
df

In [None]:
# Replace the loaded index with a fresh default index (modifies `df` in place).
df.reset_index(drop=True, inplace=True)

In [None]:
# Write the re-indexed frame back to disk under a separate file name.
df.to_csv("../data/filtered/1024_characters_pairs.csv")

In [None]:
# Drop the length columns; the cells below only read `abstract` and `title`.
df = df.drop(columns=["title_length", "abstract_length"])

In [None]:
# Positional split: the first 11864 rows are used for training, the next 100
# rows for validation.
# .copy() makes both frames independent of `df`, so adding columns to them
# later neither writes into a slice of `df` nor triggers pandas'
# SettingWithCopyWarning.
df_train = df.iloc[:11864].copy()
df_valid = df.iloc[11864:11964].copy()

In [None]:
def add_input(df):
    """Return a copy of ``df`` with an ``input`` column holding the training text.

    Each row's text is ``abstract + "<TITLE>" + title + "<|endoftext|>"``.

    Args:
        df: DataFrame with string columns ``abstract`` and ``title``.

    Returns:
        A new DataFrame with all original columns plus ``input``. The frame
        passed in is left untouched, which avoids pandas'
        SettingWithCopyWarning when ``df`` is a slice of another frame.
    """
    inputs = [
        abstract + "<TITLE>" + title + "<|endoftext|>"
        for abstract, title in zip(df.abstract.to_list(), df.title.to_list())
    ]
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(input=inputs)

In [None]:
# Add the concatenated `input` text column to the training split.
df_train = add_input(df_train)

In [None]:
# Add the concatenated `input` text column to the validation split.
df_valid = add_input(df_valid)

In [None]:
# Drop the raw text columns now that they are folded into `input`.
df_train = df_train.drop(columns=["title", "abstract"])
df_valid = df_valid.drop(columns=["title", "abstract"])

In [None]:
# Convert both pandas splits into `datasets.Dataset` objects.
train_dataset = Dataset.from_pandas(df_train)
valid_dataset = Dataset.from_pandas(df_valid)
# ROUGE metric object used by compute_metrics.
# NOTE(review): load_metric is deprecated in recent `datasets` releases in favour of
# the separate `evaluate` package — confirm against the version pinned in requirements.txt.
metric = load_metric("rouge")


In [None]:
max_input_length = 620


def preprocess_function(examples):
    """Tokenize the ``input`` texts of a batch and attach language-modeling labels.

    Every text is padded or truncated to exactly ``max_input_length`` tokens.
    The ``labels`` entry refers to the very same object as ``input_ids``
    (no copy is made).
    """
    encoded = tokenizer(
        examples["input"],
        truncation=True,
        max_length=max_input_length,
        padding="max_length",
    )
    # For causal language modeling the targets are the input tokens themselves.
    encoded["labels"] = encoded["input_ids"]
    return encoded


In [None]:
# Tokenize both splits; batched=True hands preprocess_function many rows per call.
train_dataset = train_dataset.map(preprocess_function, batched=True)
valid_dataset = valid_dataset.map(preprocess_function, batched=True)


In [None]:
from operator import indexOf
def get_max_len(train_dataset, valid_dataset, max_global_input_len):
    """Return the longest un-padded sequence length found in both datasets.

    The length of an example is the position of the first EOS token plus one
    (so the EOS itself is counted). When an example contains no EOS token at
    all — e.g. because truncation cut it off — its full ``input_ids`` length is
    used instead of raising ``ValueError``.

    Args:
        train_dataset: Iterable of examples, each with an ``input_ids`` sequence.
        valid_dataset: Iterable of examples, each with an ``input_ids`` sequence.
        max_global_input_len: Starting value for the running maximum.

    Returns:
        The maximum of ``max_global_input_len`` and all example lengths.
    """
    eos_token_id = tokenizer.eos_token_id
    for dataset in (train_dataset, valid_dataset):
        for example in dataset:
            input_ids = example["input_ids"]
            try:
                length = indexOf(input_ids, eos_token_id) + 1
            except ValueError:
                # No EOS present: the whole sequence is real content.
                length = len(input_ids)
            if length > max_global_input_len:
                max_global_input_len = length
    return max_global_input_len

# Longest sequence length (up to and including EOS) across both tokenized splits.
max_global_input_len = get_max_len(train_dataset, valid_dataset, 0)

In [None]:
batch_size = 2
# Use the last path component of the checkpoint name for the run directory.
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    # Output directory for checkpoints and logs.
    "output/gpt2/" + f"{model_name}-finetuned-lm_al_paper",
    evaluation_strategy = "epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    fp16=True,
    push_to_hub=False,
    # With batch_size = 2 this accumulates 2 * 8 = 16 examples per device per optimizer step.
    gradient_accumulation_steps=8,
    save_steps = 200,
    logging_steps = 185,
)

In [None]:
# mlm=False selects causal (next-token) language modeling instead of masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)





Example usage of the BERTScore metric from `datasets`, kept for reference (`compute_metrics` below uses ROUGE):

```
Examples:
    >>> predictions = ["hello there", "general kenobi"]
    >>> references = ["hello there", "general kenobi"]
    >>> bertscore = datasets.load_metric("bertscore")
    >>> results = bertscore.compute(predictions=predictions, references=references, lang="en")
    >>> print([round(v, 2) for v in results["f1"]])
    [1.0, 1.0]
```



In [None]:
nltk.download('punkt')
def compute_metrics(eval_pred):
    """Compute ROUGE F-measures (in percent) and the mean prediction length.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` may be token
            ids of shape (batch, seq), raw logits of shape (batch, seq, vocab),
            or a tuple whose first element is one of those. ``labels`` are
            token ids in which ``-100`` marks ignored positions.

    Returns:
        Dict mapping every key of the ROUGE result to its mid F-measure * 100,
        plus ``gen_len`` (mean number of non-pad prediction tokens), all
        rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Models that return several outputs hand over a tuple; the first entry
    # holds the logits / token ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    # A language-model head yields logits; reduce them to token ids so they can
    # be decoded. Inputs that already are 2-D token ids pass through unchanged.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)
    # Replace -100 in predictions and labels as we can't decode them.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Wire model, training arguments, both tokenized splits and the LM collator together.
# NOTE(review): compute_metrics is not passed here, so the ROUGE function defined
# in this notebook is never called during evaluation — confirm this is intended.
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning with the arguments configured in `args`.
trainer.train()

In [None]:
# Save the fine-tuned model weights and config.
# NOTE(review): only the model is saved here; the tokenizer with the added special
# tokens is not — confirm it is recreated with load_tokenizer() when reloading.
model.save_pretrained("./output/gpt2")

## Testing

In [None]:
# Load the held-out test pairs; the first CSV column is used as the index.
test_samples = pd.read_csv("../data/filtered/test_pairs.csv", index_col=0)
test_samples

In [None]:
# Plain Python lists of the test abstracts and their reference titles.
abstracts = test_samples.abstract.to_list()
titles = test_samples.title.to_list()

In [None]:
def _extract_title(decoded):
    """Return the text between the first ``<TITLE>`` marker and the next marker or EOS.

    Returns an empty string when ``decoded`` contains no ``<TITLE>`` marker.
    """
    _, marker, tail = decoded.partition("<TITLE>")
    if not marker:
        return ""
    return tail.split("<TITLE>")[0].split("<|endoftext|>")[0]


def creat_eval_pairs(model, tokenizer, abstracts, titles):
    """Generate candidate titles for every abstract with beam search.

    For each abstract the prompt ``abstract + "<TITLE>"`` is encoded and five
    beam-search candidates are generated. The candidates are concatenated into
    one string of the form ``"<TITLE>cand1<TITLE>cand2..."``.

    Args:
        model: Model exposing ``generate(...)``. Inputs are moved to
            ``model.device``; if the model has no ``device`` attribute,
            ``"cuda"`` is used as before.
        tokenizer: Tokenizer that is callable on a string and offers ``decode``.
        abstracts: Iterable of abstract strings.
        titles: Iterable of reference titles, aligned with ``abstracts``.

    Returns:
        Tuple ``(preds, titles)`` where ``preds`` is the list of concatenated
        candidate strings and ``titles`` is the argument passed in.
    """
    # Follow the model's own device instead of assuming a GPU is present.
    device = getattr(model, "device", "cuda")
    preds = []
    for abstract, title in zip(abstracts, titles):
        # truncation=True makes the 620-token cap explicit instead of relying
        # on the tokenizer's warning-and-fallback behaviour.
        # NOTE(review): if the abstract alone exceeds the cap, the trailing
        # "<TITLE>" prompt token is truncated away — confirm whether such
        # inputs occur in the test set.
        encoding = tokenizer(abstract + "<TITLE>", return_tensors="pt", truncation=True, max_length=620)
        title_ids = model.generate(
            input_ids=encoding["input_ids"].to(device),
            attention_mask=encoding["attention_mask"].to(device),
            max_length=1024,
            num_beams=5,
            num_return_sequences=5,
            repetition_penalty=2.0,
            length_penalty=10.0,
            early_stopping=True,
        )
        # A candidate without the "<TITLE>" marker becomes an empty string
        # rather than aborting the whole run with an IndexError.
        candidates = [_extract_title(tokenizer.decode(g)) for g in title_ids]
        preds.append("".join("<TITLE>" + candidate for candidate in candidates))
        # Periodic progress sample.
        if len(preds) % 500 == 0:
            print("original title: ", title)
            print("generated title: ", preds[-1:])
    return preds, titles

In [None]:
# Move the model to the GPU before generation.
model.to("cuda")

In [None]:
# Generate title candidates for every test abstract.
preds, titles = creat_eval_pairs(model, tokenizer, abstracts, titles)

In [None]:
# Pair each concatenated prediction string with its reference title.
pred_target_pairs = pd.DataFrame(list(zip(preds, titles)), columns=['predictions', 'targets'])

In [None]:
# Write the prediction/target table to gpt2.csv in the current working directory.
pred_target_pairs.to_csv("gpt2.csv")