## Configuration

In [4]:
!pip install git_root

PROJECT_ROOT = None
in_colab = 'google.colab' in str(get_ipython())

if in_colab:
  print('Running on CoLab')
  PROJECT_ROOT = "/content/drive/MyDrive/DL4NLP/abstract-to-title-generation/"
  from google.colab import drive
  drive.mount('/content/drive')

else:
  print('Running on local machine')
  from git_root import git_root
  PROJECT_ROOT = git_root()

%cd {PROJECT_ROOT}

Collecting git_root
  Downloading git_root-0.1-py3-none-any.whl (2.5 kB)
Installing collected packages: git-root
Successfully installed git-root-0.1
Running on CoLab
Mounted at /content/drive
/content/drive/MyDrive/DL4NLP/abstract-to-title-generation


In [5]:
# install requirements
!pip install -r requirements.txt

Collecting datasets
  Downloading datasets-2.2.2-py3-none-any.whl (346 kB)
[K     |████████████████████████████████| 346 kB 5.6 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 50.3 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 50.3 MB/s 
Collecting dvc[gdrive]
  Downloading dvc-2.10.2-py3-none-any.whl (401 kB)
[K     |████████████████████████████████| 401 kB 49.3 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 50.4 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 44.0 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinu

In [8]:
# pull data only pulls changed data
%cd {PROJECT_ROOT}
!dvc pull

/content/drive/MyDrive/DL4NLP/abstract-to-title-generation
Transferring:   0% 0/1 [00:00<?, ?file/s{'info': ''}]
![A
  0%|          |40d83206bbedf13326333bb63cbbc9.dir 0.00/? [00:00<?,        ?B/s][A
40d83206bbedf13326333bb63cbbc9.dir:   0% 0.00/148 [00:00<?, ?B/s{'info': ''}]   [A
100% 148/148 [00:01<00:00, 81.8B/s{'info': ''}]                              [A
Transferring:   0% 0/1 [00:00<?, ?file/s{'info': ''}]
![A
  0%|          |ca4e70571a46a8cd5b14cebf93cc30.dir 0.00/? [00:00<?,        ?B/s][A
ca4e70571a46a8cd5b14cebf93cc30.dir:   0% 0.00/148 [00:00<?, ?B/s{'info': ''}]   [A
100% 148/148 [00:01<00:00, 87.0B/s{'info': ''}]                              [A
Transferring:   0% 0/1 [00:00<?, ?file/s{'info': ''}]
![A
  0%|          |79fb1e3826d48e27806105d0f9683d.dir 0.00/? [00:00<?,        ?B/s][A
79fb1e3826d48e27806105d0f9683d.dir:   0% 0.00/217 [00:00<?, ?B/s{'info': ''}]   [A
100% 217/217 [00:01<00:00, 128B/s{'info': ''}]                               [A
Transferring:   0

## Code section

In [7]:
# imports
from datasets import Dataset
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import pandas as pd

In [8]:
model_checkpoint = 'facebook/bart-base'

In [9]:
# create tokenizer from checkopoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# create pretrained model from checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/1.68k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/532M [00:00<?, ?B/s]

In [14]:
# load train pairs
!echo $PROJECT_ROOT
df = pd.read_csv(f"{PROJECT_ROOT}/data/filtered/Kopie von train_pairs.csv", index_col=0)
df

/content/drive/MyDrive/DL4NLP/abstract-to-title-generation/


Unnamed: 0,title,abstract,title_length,abstract_length
0,Natural Image Bases to Represent Neuroimaging ...,Visual inspection of neuroimagery is susceptib...,7,132
1,Sluice Resolution without Hand-Crafted Feature...,Sluice resolution in English is the problem of...,9,110
2,Learning Translation Models from Monolingual C...,Translation models often fail to generate good...,7,152
3,Sentiment Adaptive End-to-End Dialog Systems,End-to-end learning framework is useful for bu...,5,119
4,User-Friendly Text Prediction For Translators,Text prediction is a form of interactive machi...,5,134
...,...,...,...,...
21437,"Arabic Tokenization, Part-of-Speech Tagging an...",We present an approach to using a morphologica...,11,58
21438,Using Semantically Motivated Estimates to Help...,Research into the automatic acquisition of sub...,8,530
21439,A Mathematical Exploration of Why Language Mod...,"Autoregressive language models, pretrained usi...",11,194
21440,Do You Know That Florence Is Packed with Visit...,"When a speaker, Mary, asks ""Do you know that F...",15,174


In [15]:
df.reset_index(drop=True, inplace=True)

In [17]:
df = df.drop(columns=["title_length", "abstract_length"])
print(len(df))

21442


In [49]:
df_train = df[:11864]
df_valid = df[11864:]

3752


In [50]:
df_valid

Unnamed: 0,title,abstract
3752,Algorithms for Average Regret Minimization,"In this paper, we study a problem from the rea..."
3753,Scalable First-Order Methods for Robust MDPs,Robust Markov Decision Processes (MDPs) are a ...
3754,Rule-Based Anomaly Pattern Detection for Detec...,This paper presents an algorithm for performin...
3755,Leveraging Pre-trained Checkpoints for Sequenc...,Unsupervised pre-training of large neural mode...
3756,Continual Learning Through Synaptic Intelligence,While deep learning has led to remarkable adva...
...,...,...
5356,Bridging Information-Seeking Human Gaze and Ma...,"In this work, we analyze how human gaze during..."
5357,Quantum-inspired Neural Network for Conversati...,We provide a novel perspective on conversation...
5358,The BQ Corpus: A Large-scale Domain-specific C...,This paper introduces the Bank Question (BQ) c...
5359,Doc2hash: Learning Discrete Latent variables f...,Learning to hash via generative model has beco...


In [51]:
df_train

Unnamed: 0,title,abstract
0,Learning Latent Semantic Annotations for Groun...,Previous work on grounded language learning di...
1,Partially Supervised Sense Disambiguation by L...,Supervised and semi-supervised sense disambigu...
2,Hawkes Processes for Continuous Time Sequence ...,Classification of temporal textual data sequen...
3,A Unified Single Scan Algorithm for Japanese B...,We describe an algorithm for Japanese analysis...
4,Generating Coherent Event Schemas at Scale,Chambers and Jurafsky (2009) demonstrated that...
...,...,...
3747,Implicit Surfaces with Globally Regularised an...,We consider the problem of constructing a func...
3748,Watermarking the Outputs of Structured Predict...,We propose a general method to watermark and p...
3749,Specializing Word Embeddings for Similarity or...,We demonstrate the advantage of specializing s...
3750,Efficient Online Inference for Bayesian Nonpar...,We propose a pool-based non-parametric active ...


In [52]:
train_dataset = Dataset.from_pandas(df_train)
valid_dataset = Dataset.from_pandas(df_valid)
metric = load_metric("rouge")

In [53]:
max_input_length = 1024
max_target_length = 512

def preprocess_function(examples):
    model_inputs = tokenizer(examples["abstract"], max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["title"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [54]:
print(len(valid_dataset))
preprocess_function(valid_dataset[:2])


1609


{'input_ids': [[0, 1121, 42, 2225, 6, 52, 892, 10, 936, 31, 5, 18747, 9, 43630, 3961, 6971, 568, 442, 11, 61, 5, 724, 16, 7, 5163, 31, 10, 576, 278, 208, 9, 385, 12, 23944, 8720, 10, 3527, 23855, 37105, 208, 33533, 19, 43490, 9917, 4, 345, 1409, 6, 9917, 1797, 5, 24376, 37055, 9, 1434, 61, 74, 101, 7, 5163, 49, 2674, 7626, 31, 278, 208, 53, 122, 64, 129, 5163, 49, 2674, 7626, 31, 5, 37105, 208, 33533, 4, 14422, 173, 2061, 15, 8191, 154, 5, 4532, 9917, 61, 16, 3030, 30, 5, 144, 13865, 3018, 4, 166, 15393, 7, 1701, 5, 674, 9917, 1386, 61, 16, 3030, 30, 5, 6797, 9, 36, 879, 43, 298, 37055, 9, 70, 678, 1434, 4, 166, 311, 14, 42, 9917, 2450, 606, 19, 21453, 3611, 25, 2422, 14377, 42664, 61, 2386, 7, 12558, 46194, 16964, 4, 9870, 6, 52, 6581, 5, 9917, 34655, 25639, 31320, 936, 8, 2268, 21141, 9, 84, 16964, 7, 5, 682, 1850, 449, 12, 4950, 4903, 2450, 4, 1541, 26534, 775, 32, 7513, 19, 15491, 15, 10, 3143, 9, 16584, 19, 385, 62, 7, 262, 4, 2], [0, 18776, 4193, 1190, 1417, 30300, 19149, 293, 36

In [55]:
train_dataset = train_dataset.map(preprocess_function, batched=True)
valid_dataset = valid_dataset.map(preprocess_function, batched=True)


  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [56]:
batch_size = 8
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-lm_al_paper",
    evaluation_strategy = "epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    gradient_accumulation_steps=8,
    save_steps = 500,
    logging_steps = 185,
)

In [57]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)





```
Examples:
    >>> predictions = ["hello there", "general kenobi"]
    >>> references = ["hello there", "general kenobi"]
    >>> bertscore = datasets.load_metric("bertscore")
    >>> results = bertscore.compute(predictions=predictions, references=references, lang="en")
    >>> print([round(v, 2) for v in results["f1"]])
    [1.0, 1.0]
  
```



In [58]:
import nltk
import numpy as np
nltk.download('punkt')
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [59]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Using amp half precision backend


In [60]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: title, abstract. If title, abstract are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3752
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 8
  Total optimization steps = 174


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
0,No log,2.394839,43.0293,23.9707,38.9023,38.9081,13.9497
1,No log,2.286196,43.1251,24.114,38.8531,38.8597,14.0796
2,No log,2.255791,44.1794,24.9595,39.6825,39.6837,14.6314


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: title, abstract. If title, abstract are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1609
  Batch size = 4
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: title, abstract. If title, abstract are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1609
  Batch size = 4
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: title, abstract. If title, abstract are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *

TrainOutput(global_step=174, training_loss=2.281775112809806, metrics={'train_runtime': 2970.8868, 'train_samples_per_second': 3.789, 'train_steps_per_second': 0.059, 'total_flos': 2392939311759360.0, 'train_loss': 2.281775112809806, 'epoch': 2.99})

In [None]:
model.save_pretrained(f"{PROJECT_ROOT}/model/BART-base/")


Configuration saved in ./output/bart/config.json
Model weights saved in ./output/bart/pytorch_model.bin


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(f"{PROJECT_ROOT}/model/BART-base/pytorch_model.bin")
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

OSError: It looks like the config file at '/Users/linusschwarz/source/uni/dl4nlp/project/abstract-to-title/model/BART-base/pytorch_model.bin' is not a valid JSON file.

In [None]:
import pandas as pd


In [None]:
test_samples = pd.read_csv(f"{PROJECT_ROOT}/data/filtered/test_pairs.csv", index_col=0)
test_samples

Unnamed: 0,title,abstract,title_length,abstract_length
0,Learning Latent Semantic Annotations for Groun...,Previous work on grounded language learning di...,11,121
1,Partially Supervised Sense Disambiguation by L...,Supervised and semi-supervised sense disambigu...,13,140
2,Hawkes Processes for Continuous Time Sequence ...,Classification of temporal textual data sequen...,15,68
3,A Unified Single Scan Algorithm for Japanese B...,We describe an algorithm for Japanese analysis...,13,62
4,Generating Coherent Event Schemas at Scale,Chambers and Jurafsky (2009) demonstrated that...,6,127
...,...,...,...,...
5356,Bridging Information-Seeking Human Gaze and Ma...,"In this work, we analyze how human gaze during...",8,118
5357,Quantum-inspired Neural Network for Conversati...,We provide a novel perspective on conversation...,7,116
5358,The BQ Corpus: A Large-scale Domain-specific C...,This paper introduces the Bank Question (BQ) c...,13,174
5359,Doc2hash: Learning Discrete Latent variables f...,Learning to hash via generative model has beco...,8,131


In [None]:
abstracts = test_samples.abstract.to_list()
titles = test_samples.title.to_list()

In [None]:
model.to("cuda")

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        

In [None]:
def creat_eval_pairs(model, tokenizer, abstracts, titles):
  preds = []
  for abstract, title in zip(abstracts, titles):
    encoding = tokenizer.encode_plus(abstract, return_tensors = "pt")
    inputs = encoding["input_ids"].to("cuda")
    attention_masks = encoding["attention_mask"].to("cuda")
    title_ids = model.generate(
            input_ids = inputs,
            attention_mask = attention_masks,
            max_length = 30,
            num_beams = 5,
            num_return_sequences = 5,
            repetition_penalty=2.0, 
            length_penalty=10.0,
            early_stopping = True,
            )
    result = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in title_ids]
    s=""
    for t in result:
      s = s + "<TITLE>" + t
    preds.append(s)
    if len(preds) % 500 == 0:
      print("original title: ", title)
      print("generated title: ", preds[-1:])
  return preds, titles

In [None]:
preds, titles = creat_eval_pairs(model, tokenizer, abstracts, titles)

original title:  Paraphrase-Driven Learning for Open Question Answering
generated title:  ['<TITLE>Learning a Semantic Lexicon and Linear Ranking Function for Question Answering<TITLE>Learning Semantic Lexicons and Linear Ranking Functions for Question Answering<TITLE>Learning a Semantic Lexicon and Linear Ranking Function for Open-Domain Questions<TITLE>Learning Semantic Lexicon and Linear Ranking Functions for Question Answering<TITLE>Learning Semantic Lexicons for Question Answering']
original title:  Robustness and Generalization of Role Sets: PropBank vs. VerbNet
generated title:  ['<TITLE>Robustness and Generalization of PropBank and VerbNet Roles for Semantic Role Labeling<TITLE>Robustness and Generalization of Two Alternative Role Sets for Semantic Role Labeling<TITLE>Robustness and Generalization of Alternative Role Sets for Semantic Role Labeling<TITLE>Robustness and Generalization of PropBank Roles for Semantic Role Labeling<TITLE>Robustness and Generalization of Alternative

In [None]:
pred_target_pairs = pd.DataFrame(list(zip(preds, titles)), columns=['predictions', 'targets'])

In [None]:
pred_target_pairs.to_csv(f"{PROJECT_ROOT}/output/preds_targets_pairs/bart-base.csv")