## Configuration

In [None]:
!pip install git_root

PROJECT_ROOT = None
in_colab = 'google.colab' in str(get_ipython())

if in_colab:
  print('Running on CoLab')
  PROJECT_ROOT = "/content/drive/MyDrive/DL4NLP/abstract-to-title-generation/"
  from google.colab import drive
  drive.mount('/content/drive')

else:
  print('Running on local machine')
  from git_root import git_root
  PROJECT_ROOT = git_root()

%cd {PROJECT_ROOT}

# install requirements
!pip install -r requirements.txt

# pull data only pulls changed data
!dvc pull

Running on local machine
/Users/linusschwarz/source/uni/dl4nlp/project/abstract-to-title
Collecting sentencepiece
  Using cached sentencepiece-0.1.96.tar.gz (508 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting scipy
  Using cached scipy-1.8.1-cp310-cp310-macosx_12_0_arm64.whl (28.7 MB)
Collecting dvc[gdrive]
  Using cached dvc-2.10.2-py3-none-any.whl (401 kB)
Collecting colorama>=0.3.9
  Using cached colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting shortuuid>=0.5.0
  Using cached shortuuid-1.0.9-py3-none-any.whl (9.4 kB)
Collecting nanotime>=0.5.2
  Using cached nanotime-0.5.2.tar.gz (3.2 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting flufl.lock>=5
  Using cached flufl.lock-7.0-py3-none-any.whl (11 kB)
Collecting flatten-dict<1,>=0.4.1
  Using cached flatten_dict-0.4.2-py2.py3-none-any.whl (9.7 kB)
Collecting zc.lockfile>=1.2.1
  Using cached zc.lockfile-2.0-py2.py3-none-any.whl (9.7 kB)
Collecting dvc-render==0.0.5
  Using cached dvc_render-0.0

## Code section

In [None]:
# imports
from datasets import Dataset
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import pandas as pd

In [None]:
# Identifier of the pretrained checkpoint that the tokenizer and model are loaded from.
model_checkpoint = "facebook/bart-base"

In [None]:
# Pretrained seq2seq model loaded from the selected checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# Tokenizer loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# load test pairs
# NOTE(review): the file is named "Kopie von test_pairs.csv" ("Kopie von" is German
# for "copy of"), yet this frame is later split into training and validation sets.
# Confirm this is the intended input file.
# Print the project root as a quick sanity check of the path used below.
!echo $PROJECT_ROOT
# The first CSV column is used as the DataFrame index.
df = pd.read_csv(f"{PROJECT_ROOT}/data/filtered/Kopie von test_pairs.csv", index_col=0)
# Last expression of the cell: renders the loaded frame as cell output.
df

/content/drive/MyDrive/DL4NLP/abstract-to-title-generation/


Unnamed: 0,title,abstract,title_length,abstract_length
0,Learning Latent Semantic Annotations for Groun...,Previous work on grounded language learning di...,11,121
1,Partially Supervised Sense Disambiguation by L...,Supervised and semi-supervised sense disambigu...,13,140
2,Hawkes Processes for Continuous Time Sequence ...,Classification of temporal textual data sequen...,15,68
3,A Unified Single Scan Algorithm for Japanese B...,We describe an algorithm for Japanese analysis...,13,62
4,Generating Coherent Event Schemas at Scale,Chambers and Jurafsky (2009) demonstrated that...,6,127
...,...,...,...,...
5356,Bridging Information-Seeking Human Gaze and Ma...,"In this work, we analyze how human gaze during...",8,118
5357,Quantum-inspired Neural Network for Conversati...,We provide a novel perspective on conversation...,7,116
5358,The BQ Corpus: A Large-scale Domain-specific C...,This paper introduces the Bank Question (BQ) c...,13,174
5359,Doc2hash: Learning Discrete Latent variables f...,Learning to hash via generative model has beco...,8,131


In [None]:
# Replace the index read from the CSV with a fresh default index and discard the old one.
df = df.reset_index(drop=True)

In [None]:
# Remove the helper length columns. errors="ignore" skips columns that are
# absent, so the cell does not raise KeyError when the CSV lacks one of them
# or when the cell is re-run after the columns were already dropped.
df = df.drop(columns=["title_length", "abstract_length", "token_len"], errors="ignore")

In [None]:
# Positional split: the first 11864 rows form the training set, the rest the validation set.
n_train_rows = 11864
df_train = df.iloc[:n_train_rows]
df_valid = df.iloc[n_train_rows:]

In [None]:

# ROUGE metric object used by compute_metrics during evaluation.
metric = load_metric("rouge")

# Wrap both pandas splits as `Dataset` objects.
train_dataset, valid_dataset = (
    Dataset.from_pandas(split_frame) for split_frame in (df_train, df_valid)
)


In [None]:
# Render the training split as cell output for a visual check.
df_train

Unnamed: 0,title,abstract
0,Natural Image Bases to Represent Neuroimaging ...,Visual inspection of neuroimagery is susceptib...
1,Sluice Resolution without Hand-Crafted Feature...,Sluice resolution in English is the problem of...
2,Sentiment Adaptive End-to-End Dialog Systems,End-to-end learning framework is useful for bu...
3,User-Friendly Text Prediction For Translators,Text prediction is a form of interactive machi...
4,Aligning Sentences from Standard Wikipedia to ...,This work improves monolingual sentence alignm...
...,...,...
11859,Vine Parsing and Minimum Risk Reranking for Sp...,We describe our entry in the CoNLL-X shared ta...
11860,From Characters to Time Intervals: New Paradig...,This paper presents the first model for time n...
11861,Unsupervised Consonant-Vowel Prediction over H...,"In this paper, we present a solution to one as..."
11862,Variational Autoencoder with Arbitrary Conditi...,We propose a single neural probabilistic model...


In [None]:
# Token limits: abstracts are the model inputs, titles are the targets.
max_input_length, max_target_length = 1024, 512


def preprocess_function(examples):
    """Tokenize a batch of abstract/title pairs for seq2seq training.

    `examples` is indexed with the keys "abstract" and "title". Abstracts are
    tokenized with truncation at `max_input_length`. Titles are tokenized
    inside the tokenizer's target mode with truncation at `max_target_length`.

    Returns the abstract encoding with the titles' `input_ids` stored under
    the key "labels".
    """
    encoded_abstracts = tokenizer(
        examples["abstract"],
        truncation=True,
        max_length=max_input_length,
    )

    # Titles are the targets, so they are tokenized in target mode.
    with tokenizer.as_target_tokenizer():
        encoded_titles = tokenizer(
            examples["title"],
            truncation=True,
            max_length=max_target_length,
        )

    encoded_abstracts["labels"] = encoded_titles["input_ids"]
    return encoded_abstracts


In [None]:
# Smoke check: run the preprocessing on the first two validation rows and show the result.
preprocess_function(valid_dataset[:2])


{'input_ids': [[0, 1121, 2136, 1472, 2982, 3146, 1023, 9762, 6, 10, 467, 3881, 7, 3094, 5, 1472, 9, 10, 2136, 31, 37617, 1575, 4, 5454, 7926, 7, 745, 10, 239, 12, 12955, 2136, 1472, 2982, 3146, 1023, 9762, 467, 680, 5, 9600, 9, 27963, 414, 13, 42, 3685, 8, 9, 15924, 2051, 12, 6504, 7153, 1472, 41517, 4, 1216, 743, 10114, 5647, 31, 5, 754, 14, 5, 3685, 16, 145, 3032, 11, 13084, 31, 678, 2939, 9, 6885, 2982, 3146, 1023, 14133, 414, 4, 96, 42, 2225, 6, 52, 1701, 5, 1330, 3685, 9, 2136, 19850, 6, 147, 52, 2813, 7, 3094, 5, 4577, 19850, 9, 10, 2136, 31, 5377, 4, 166, 64, 304, 12980, 2777, 22997, 102, 25, 10, 739, 1787, 9, 8531, 16274, 414, 13, 42, 3685, 4, 166, 1455, 16964, 13, 15582, 5, 2136, 19850, 936, 8, 8085, 10, 1233, 3855, 81, 10, 18043, 467, 4, 166, 172, 311, 14, 5, 2136, 12, 48235, 467, 64, 28, 341, 7, 1477, 819, 15, 10, 30082, 20399, 179, 17985, 1253, 35019, 3685, 8, 64, 4296, 8, 12775, 3349, 4438, 5, 278, 9, 1984, 41762, 13, 10, 2136, 4, 2], [0, 170, 892, 5, 13879, 9, 16854, 11, 

In [None]:
# Tokenize both splits. With batched=True, preprocess_function receives
# batches of examples rather than single rows.
train_dataset = train_dataset.map(preprocess_function, batched=True)
valid_dataset = valid_dataset.map(preprocess_function, batched=True)


  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Per-device training batch size; with 8 accumulation steps the effective batch is 64.
batch_size = 8
# The last path component of the checkpoint id names the output directory.
model_name = model_checkpoint.rsplit("/", 1)[-1]
# NOTE(review): fp16=True presumably requires a CUDA device; confirm before running locally.
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-lm_al_paper",
    # optimisation
    learning_rate=3e-4,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=8,
    fp16=True,
    # evaluation
    evaluation_strategy="epoch",
    per_device_eval_batch_size=4,
    predict_with_generate=True,
    # checkpointing and logging
    save_steps=500,
    save_total_limit=2,
    logging_steps=185,
    push_to_hub=False,
)

In [None]:
# Batch collator handed to the Seq2SeqTrainer below; built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)





```
Examples:
    >>> predictions = ["hello there", "general kenobi"]
    >>> references = ["hello there", "general kenobi"]
    >>> bertscore = datasets.load_metric("bertscore")
    >>> results = bertscore.compute(predictions=predictions, references=references, lang="en")
    >>> print([round(v, 2) for v in results["f1"]])
    [1.0, 1.0]
  
```



In [None]:
import nltk
import numpy as np

# Fetch the NLTK "punkt" package before sentence tokenization is used below.
nltk.download('punkt')


def _one_sentence_per_line(texts):
    """Strip each text and join the sentences found by NLTK with newlines."""
    return ["\n".join(nltk.sent_tokenize(text.strip())) for text in texts]


def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation pass.

    `eval_pred` unpacks into (predictions, labels), both arrays of token ids.
    Returns a dict with each ROUGE mid F-measure scaled by 100 plus "gen_len",
    all rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    pad_id = tokenizer.pad_token_id

    pred_texts = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # -100 cannot be decoded, so it is replaced by the pad token id first.
    decodable_labels = np.where(labels != -100, labels, pad_id)
    label_texts = tokenizer.batch_decode(decodable_labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence.
    rouge = metric.compute(
        predictions=_one_sentence_per_line(pred_texts),
        references=_one_sentence_per_line(label_texts),
        use_stemmer=True,
    )

    scores = {}
    for name, aggregate in rouge.items():
        scores[name] = aggregate.mid.fmeasure * 100

    # Mean number of non-padding tokens per generated sequence.
    generated_lengths = [np.count_nonzero(row != pad_id) for row in predictions]
    scores["gen_len"] = np.mean(generated_lengths)

    rounded = {}
    for name, value in scores.items():
        rounded[name] = round(value, 4)
    return rounded

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Bundle model, arguments, data, collator and metric function into one trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Using amp fp16 backend


In [None]:
# Run fine-tuning; the returned TrainOutput is shown as cell output.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: abstract, title.
***** Running training *****
  Num examples = 11864
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 8
  Total optimization steps = 555


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
0,2.806,2.092597,44.0366,24.6408,39.5718,39.593,13.657
1,1.9659,1.956018,45.4715,25.9633,40.5348,40.5143,13.978
2,1.4999,1.939256,46.9783,26.8635,42.0319,42.0369,14.689


The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: abstract, title.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 4
The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: abstract, title.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 4
Saving model checkpoint to bart-base-finetuned-lm_al_paper/checkpoint-500
Configuration saved in bart-base-finetuned-lm_al_paper/checkpoint-500/config.json
Model weights saved in bart-base-finetuned-lm_al_paper/checkpoint-500/pytorch_model.bin
tokenizer config file saved in bart-base-finetuned-lm_al_paper/checkpoint-500/tokenizer_config.json
Special tokens file saved in bart-base-finetuned-lm_al_paper/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartF

TrainOutput(global_step=555, training_loss=2.090629948796453, metrics={'train_runtime': 775.9142, 'train_samples_per_second': 45.871, 'train_steps_per_second': 0.715, 'total_flos': 4491766526607360.0, 'train_loss': 2.090629948796453, 'epoch': 3.0})

In [None]:
# Save the fine-tuned model into the project's model/BART-base/ directory.
# Only the model is saved here; the tokenizer is not written to this directory.
model.save_pretrained(f"{PROJECT_ROOT}/model/BART-base/")


Configuration saved in ./output/bart/config.json
Model weights saved in ./output/bart/pytorch_model.bin


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# from_pretrained expects the directory written by save_pretrained (which
# holds config.json next to the weights), not the weights file itself.
# Pointing it at pytorch_model.bin fails with "not a valid JSON file".
model = AutoModelForSeq2SeqLM.from_pretrained(f"{PROJECT_ROOT}/model/BART-base/")
# The tokenizer is loaded from the original checkpoint, as in the training part.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

OSError: It looks like the config file at '/Users/linusschwarz/source/uni/dl4nlp/project/abstract-to-title/model/BART-base/pytorch_model.bin' is not a valid JSON file.

In [None]:
import pandas as pd


In [None]:
# Load the test pairs; the first CSV column becomes the DataFrame index.
test_samples = pd.read_csv(f"{PROJECT_ROOT}/data/filtered/test_pairs.csv", index_col=0)
# Last expression of the cell: renders the frame as cell output.
test_samples

Unnamed: 0,title,abstract,title_length,abstract_length
0,Learning Latent Semantic Annotations for Groun...,Previous work on grounded language learning di...,11,121
1,Partially Supervised Sense Disambiguation by L...,Supervised and semi-supervised sense disambigu...,13,140
2,Hawkes Processes for Continuous Time Sequence ...,Classification of temporal textual data sequen...,15,68
3,A Unified Single Scan Algorithm for Japanese B...,We describe an algorithm for Japanese analysis...,13,62
4,Generating Coherent Event Schemas at Scale,Chambers and Jurafsky (2009) demonstrated that...,6,127
...,...,...,...,...
5356,Bridging Information-Seeking Human Gaze and Ma...,"In this work, we analyze how human gaze during...",8,118
5357,Quantum-inspired Neural Network for Conversati...,We provide a novel perspective on conversation...,7,116
5358,The BQ Corpus: A Large-scale Domain-specific C...,This paper introduces the Bank Question (BQ) c...,13,174
5359,Doc2hash: Learning Discrete Latent variables f...,Learning to hash via generative model has beco...,8,131


In [None]:
# Parallel Python lists: the abstracts and the titles of the test samples.
abstracts = test_samples["abstract"].tolist()
titles = test_samples["title"].tolist()

In [None]:
# Move the model to the "cuda" device; the returned module is echoed as cell output.
model.to("cuda")

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        

In [None]:
def creat_eval_pairs(model, tokenizer, abstracts, titles):
  """Generate candidate titles for every abstract.

  For each abstract, beam search (5 beams) returns 5 candidate titles. The
  candidates are concatenated into one string, each prefixed with "<TITLE>".
  After every 500th abstract the reference title and the latest prediction
  are printed as a progress sample.

  Args:
    model: model exposing `generate(...)` and a `device` attribute.
    tokenizer: tokenizer exposing `encode_plus(...)` and `decode(...)`.
    abstracts: iterable of abstract strings.
    titles: iterable of reference titles, parallel to `abstracts`.

  Returns:
    A tuple `(preds, titles)`: the list of concatenated candidate strings and
    the `titles` argument, returned unchanged.
  """
  # Use the device the model lives on instead of assuming CUDA is available.
  device = model.device
  preds = []
  for abstract, title in zip(abstracts, titles):
    # truncation=True cuts over-long abstracts to the tokenizer's maximum
    # length, so they cannot exceed the model's position limit.
    encoding = tokenizer.encode_plus(abstract, return_tensors="pt", truncation=True)
    title_ids = model.generate(
        input_ids=encoding["input_ids"].to(device),
        attention_mask=encoding["attention_mask"].to(device),
        max_length=30,
        num_beams=5,
        num_return_sequences=5,
        repetition_penalty=2.0,
        length_penalty=10.0,
        early_stopping=True,
    )
    candidates = [
        tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for g in title_ids
    ]
    # Join all candidates into one string, each prefixed with the marker.
    preds.append("".join("<TITLE>" + candidate for candidate in candidates))
    if len(preds) % 500 == 0:
      print("original title: ", title)
      print("generated title: ", preds[-1:])
  return preds, titles

In [None]:
# Generate candidate titles for all test abstracts; prints a sample every 500 items.
preds, titles = creat_eval_pairs(model, tokenizer, abstracts, titles)

original title:  Paraphrase-Driven Learning for Open Question Answering
generated title:  ['<TITLE>Learning a Semantic Lexicon and Linear Ranking Function for Question Answering<TITLE>Learning Semantic Lexicons and Linear Ranking Functions for Question Answering<TITLE>Learning a Semantic Lexicon and Linear Ranking Function for Open-Domain Questions<TITLE>Learning Semantic Lexicon and Linear Ranking Functions for Question Answering<TITLE>Learning Semantic Lexicons for Question Answering']
original title:  Robustness and Generalization of Role Sets: PropBank vs. VerbNet
generated title:  ['<TITLE>Robustness and Generalization of PropBank and VerbNet Roles for Semantic Role Labeling<TITLE>Robustness and Generalization of Two Alternative Role Sets for Semantic Role Labeling<TITLE>Robustness and Generalization of Alternative Role Sets for Semantic Role Labeling<TITLE>Robustness and Generalization of PropBank Roles for Semantic Role Labeling<TITLE>Robustness and Generalization of Alternative

In [None]:
# One row per test sample: the concatenated candidate titles next to the reference title.
pair_rows = list(zip(preds, titles))
pred_target_pairs = pd.DataFrame(pair_rows, columns=['predictions', 'targets'])

In [None]:
# Write the prediction/reference pairs to CSV under the project's output folder.
# NOTE(review): presumably the output/preds_targets_pairs directory must already exist; confirm.
pred_target_pairs.to_csv(f"{PROJECT_ROOT}/output/preds_targets_pairs/bart-base.csv")