## Configuration

In [None]:
!pip install git_root

PROJECT_ROOT = None
in_colab = 'google.colab' in str(get_ipython())

if in_colab:
    print('Running on CoLab')
    PROJECT_ROOT = "/content/drive/MyDrive/DL4NLP/abstract-to-title-generation/"
    from google.colab import drive

    drive.mount('/content/drive')

else:
    print('Running on local machine')
    from git_root import git_root

    PROJECT_ROOT = git_root()

% cd {PROJECT_ROOT}
# install requirements
!pip install -r requirements.txt
# pull data only pulls changed data
!dvc pull


In [None]:
model_checkpoint = 'facebook/bart-large-cnn'

In [None]:
!pip install tqdm -U

In [None]:
from datasets import Dataset, load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import pandas as pd
import nltk
import numpy as np

## Training

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
df = pd.read_csv("../data/filtered/train_pairs.csv", index_col=0)
df

In [None]:
# Renumber the rows 0..n-1 and discard the index that was read from the CSV.
df = df.reset_index(drop=True)

In [None]:
df.to_csv("./1024_characters_pairs.csv")

In [None]:
df = df.drop(columns=["title_length", "abstract_length", "token_len"])

In [None]:
# Positional split: the first 11864 rows are used for training and every
# remaining row for validation.
n_train_rows = 11864
df_train = df.iloc[:n_train_rows]
df_valid = df.iloc[n_train_rows:]

In [None]:
train_dataset = Dataset.from_pandas(df_train)
valid_dataset = Dataset.from_pandas(df_valid)
metric = load_metric("rouge")


In [None]:
# Token limits applied when tokenizing: abstracts are the model input,
# titles are the generation target.
max_input_length = 1024
max_target_length = 512

def preprocess_function(examples):
    """Tokenize a batch of abstract/title pairs into model inputs with labels.

    `examples` is indexed with the "abstract" and "title" keys. Abstracts are
    truncated to `max_input_length` tokens and titles to `max_target_length`
    tokens. Returns the abstract encoding with the title token ids stored
    under the "labels" key.
    """
    encoded = tokenizer(examples["abstract"], truncation=True, max_length=max_input_length)

    # Titles are tokenized inside the target-tokenizer context.
    with tokenizer.as_target_tokenizer():
        target_ids = tokenizer(examples["title"], truncation=True, max_length=max_target_length)["input_ids"]

    encoded["labels"] = target_ids
    return encoded


In [None]:
preprocess_function(valid_dataset[:2])

In [None]:
train_dataset = train_dataset.map(preprocess_function, batched=True)
valid_dataset = valid_dataset.map(preprocess_function, batched=True)

In [None]:
# Fine-tuning configuration for the seq2seq trainer.
batch_size = 4
# Keep only the part of the checkpoint id after the last "/".
model_name = model_checkpoint.rsplit("/", 1)[-1]
output_dir = f"{model_name}-finetuned-lm_al_paper"

args = Seq2SeqTrainingArguments(
    output_dir,
    # optimisation
    learning_rate=3e-4,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    fp16=True,
    # evaluation: run once per epoch, scoring generated text
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # checkpointing and logging
    save_steps=500,
    save_total_limit=2,
    logging_steps=185,
    push_to_hub=False,
)

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)




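Reference example of computing a metric loaded with `load_metric` (BERTScore is shown here; the cells below use ROUGE):

```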
Examples:
    >>> predictions = ["hello there", "general kenobi"]
    >>> references = ["hello there", "general kenobi"]
    >>> bertscore = datasets.load_metric("bertscore")
    >>> results = bertscore.compute(predictions=predictions, references=references, lang="en")
    >>> print([round(v, 2) for v in results["f1"]])
    [1.0, 1.0]
  
```



In [None]:
nltk.download('punkt')
def compute_metrics(eval_pred):
    """Score generated titles against the reference titles with ROUGE.

    Args:
        eval_pred: pair `(predictions, labels)` of token-id arrays. If
            `predictions` arrives as a tuple, its first element is used.

    Returns:
        dict mapping each ROUGE key to its mid F-measure scaled to 0-100,
        plus "gen_len" (mean number of non-pad tokens per prediction), all
        rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding/ignore marker, not a token id, so it cannot be decoded.
    # Replace it in predictions as well as labels; this also keeps such
    # positions out of the generated-length statistic below.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid F-measure of every ROUGE variant, as a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Wire the model, training arguments, tokenized datasets, batch collator and
# metric function into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

In [None]:
model.save_pretrained("../model/BART-CNN/")


## Testing

In [None]:
# Reload the fine-tuned model from the directory that `save_pretrained` wrote
# to. `from_pretrained` takes the model directory (or a hub id), not the path
# of the `pytorch_model.bin` weights file inside it.
model = AutoModelForSeq2SeqLM.from_pretrained("../model/BART-CNN/")

In [None]:

test_samples = pd.read_csv("../data/filtered/test_pairs.csv", index_col=0)
test_samples

In [None]:
abstracts = test_samples.abstract.to_list()
titles = test_samples.title.to_list()

In [None]:
def creat_eval_pairs(model, tokenizer, abstracts, titles, max_input_length=1024):
    """Generate candidate titles for every abstract and pair them with the references.

    Args:
        model: seq2seq model exposing `generate` and a `device` attribute.
        tokenizer: tokenizer exposing `encode_plus` and `decode`.
        abstracts: abstract strings to generate titles for.
        titles: reference titles, aligned with `abstracts`.
        max_input_length: abstracts are truncated to this many tokens. The
            default matches the 1024-token truncation applied when the
            training data is tokenized.

    Returns:
        `(preds, titles)`: `preds[i]` is the concatenation of the generated
        candidates for abstract `i`, each prefixed with "<TITLE>"; `titles`
        is returned unchanged.
    """
    # Put the inputs wherever the model lives instead of assuming a GPU.
    device = model.device
    preds = []
    for abstract, title in zip(abstracts, titles):
        encoding = tokenizer.encode_plus(
            abstract,
            return_tensors="pt",
            truncation=True,
            max_length=max_input_length,
        )
        title_ids = model.generate(
            input_ids=encoding["input_ids"].to(device),
            attention_mask=encoding["attention_mask"].to(device),
            max_length=30,
            num_beams=5,
            num_return_sequences=5,
            repetition_penalty=2.0,
            length_penalty=15.0,
            early_stopping=True,
        )
        candidates = [
            tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for g in title_ids
        ]
        preds.append("".join("<TITLE>" + candidate for candidate in candidates))
        # Print a sample every 500 abstracts as a progress indicator.
        if len(preds) % 500 == 0:
            print("original title: ", title)
            print("generated title: ", preds[-1:])
    return preds, titles

In [None]:
model.to("cuda")

In [None]:
preds, titles = creat_eval_pairs(model, tokenizer, abstracts, titles)

In [None]:
# One row per test sample: the concatenated generated candidates next to the
# reference title.
pair_rows = [(generated, reference) for generated, reference in zip(preds, titles)]
pred_target_pairs = pd.DataFrame(pair_rows, columns=['predictions', 'targets'])

In [None]:
pred_target_pairs.to_csv("../output/preds_targets_pairs/bart-large.csv")