## Configuration

In [45]:
!pip install git_root

PROJECT_ROOT = None
in_colab = 'google.colab' in str(get_ipython())

if in_colab:
  print('Running on CoLab')
  PROJECT_ROOT = "/content/drive/MyDrive/DL4NLP/abstract-to-title-generation"
  from google.colab import drive
  drive.mount('/content/drive')

else:
  print('Running on local machine')
  from git_root import git_root
  PROJECT_ROOT = git_root()

%cd {PROJECT_ROOT}

Running on CoLab
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/DL4NLP/abstract-to-title-generation


In [None]:
# install requirements
!pip install -r requirements.txt

Collecting datasets
  Downloading datasets-2.2.2-py3-none-any.whl (346 kB)
[K     |████████████████████████████████| 346 kB 5.6 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 50.3 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 50.3 MB/s 
Collecting dvc[gdrive]
  Downloading dvc-2.10.2-py3-none-any.whl (401 kB)
[K     |████████████████████████████████| 401 kB 49.3 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 50.4 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 44.0 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinu

In [None]:
# pull data only pulls changed data
%cd {PROJECT_ROOT}
!dvc pull

/content/drive/MyDrive/DL4NLP/abstract-to-title-generation
Transferring:   0% 0/1 [00:00<?, ?file/s{'info': ''}]
![A
  0%|          |40d83206bbedf13326333bb63cbbc9.dir 0.00/? [00:00<?,        ?B/s][A
40d83206bbedf13326333bb63cbbc9.dir:   0% 0.00/148 [00:00<?, ?B/s{'info': ''}]   [A
100% 148/148 [00:01<00:00, 81.8B/s{'info': ''}]                              [A
Transferring:   0% 0/1 [00:00<?, ?file/s{'info': ''}]
![A
  0%|          |ca4e70571a46a8cd5b14cebf93cc30.dir 0.00/? [00:00<?,        ?B/s][A
ca4e70571a46a8cd5b14cebf93cc30.dir:   0% 0.00/148 [00:00<?, ?B/s{'info': ''}]   [A
100% 148/148 [00:01<00:00, 87.0B/s{'info': ''}]                              [A
Transferring:   0% 0/1 [00:00<?, ?file/s{'info': ''}]
![A
  0%|          |79fb1e3826d48e27806105d0f9683d.dir 0.00/? [00:00<?,        ?B/s][A
79fb1e3826d48e27806105d0f9683d.dir:   0% 0.00/217 [00:00<?, ?B/s{'info': ''}]   [A
100% 217/217 [00:01<00:00, 128B/s{'info': ''}]                               [A
Transferring:   0

## Code section

In [None]:
# imports
from datasets import Dataset
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import pandas as pd

In [None]:
model_checkpoint = 'facebook/bart-base'

In [20]:
# create tokenizer from checkopoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# create pretrained model from checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [21]:
# load train pairs
!echo $PROJECT_ROOT
df = pd.read_csv(f"{PROJECT_ROOT}/data/filtered/Kopie von train_pairs.csv", index_col=0)
df

/content/drive/MyDrive/DL4NLP/abstract-to-title-generation/


Unnamed: 0,title,abstract,title_length,abstract_length
0,Natural Image Bases to Represent Neuroimaging ...,Visual inspection of neuroimagery is susceptib...,7,132
1,Sluice Resolution without Hand-Crafted Feature...,Sluice resolution in English is the problem of...,9,110
2,Learning Translation Models from Monolingual C...,Translation models often fail to generate good...,7,152
3,Sentiment Adaptive End-to-End Dialog Systems,End-to-end learning framework is useful for bu...,5,119
4,User-Friendly Text Prediction For Translators,Text prediction is a form of interactive machi...,5,134
...,...,...,...,...
21437,"Arabic Tokenization, Part-of-Speech Tagging an...",We present an approach to using a morphologica...,11,58
21438,Using Semantically Motivated Estimates to Help...,Research into the automatic acquisition of sub...,8,530
21439,A Mathematical Exploration of Why Language Mod...,"Autoregressive language models, pretrained usi...",11,194
21440,Do You Know That Florence Is Packed with Visit...,"When a speaker, Mary, asks ""Do you know that F...",15,174


In [None]:
df.reset_index(drop=True, inplace=True)

In [22]:
df = df.drop(columns=["title_length", "abstract_length"])

In [23]:
df_train = df[:11864]
df_valid = df[11864:]

In [24]:
df_valid

Unnamed: 0,title,abstract
11864,Spherical CNNs on Unstructured Grids,We present an efficient convolution kernel for...
11865,Soft Representation Learning for Sparse Transfer,Transfer learning is effective for improving t...
11866,Explicit and Implicit Syntactic Features for T...,Syntactic features are useful for many text cl...
11867,Adaptive Gradient Methods with Dynamic Bound o...,"Adaptive optimization methods such as ADAGRAD,..."
11868,Using Fast Weights to Improve Persistent Contr...,The most commonly used learning algorithm for ...
...,...,...
21437,"Arabic Tokenization, Part-of-Speech Tagging an...",We present an approach to using a morphologica...
21438,Using Semantically Motivated Estimates to Help...,Research into the automatic acquisition of sub...
21439,A Mathematical Exploration of Why Language Mod...,"Autoregressive language models, pretrained usi..."
21440,Do You Know That Florence Is Packed with Visit...,"When a speaker, Mary, asks ""Do you know that F..."


In [25]:
df_train

Unnamed: 0,title,abstract
0,Natural Image Bases to Represent Neuroimaging ...,Visual inspection of neuroimagery is susceptib...
1,Sluice Resolution without Hand-Crafted Feature...,Sluice resolution in English is the problem of...
2,Learning Translation Models from Monolingual C...,Translation models often fail to generate good...
3,Sentiment Adaptive End-to-End Dialog Systems,End-to-end learning framework is useful for bu...
4,User-Friendly Text Prediction For Translators,Text prediction is a form of interactive machi...
...,...,...
11859,On Fast Adversarial Robustness Adaptation in M...,Model-agnostic meta-learning (MAML) has emerge...
11860,Syntactical Analysis of the Weaknesses of Sent...,We carry out a syntactic analysis of two state...
11861,CEM-RL: Combining evolutionary and gradient-ba...,Deep neuroevolution and deep reinforcement lea...
11862,Semi-Markov Conditional Random Fields for Info...,We describe semi-Markov conditional random fie...


In [28]:
train_dataset = Dataset.from_pandas(df_train)
valid_dataset = Dataset.from_pandas(df_valid)
metric = load_metric("rouge")


Collecting rouge_score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Installing collected packages: rouge-score
Successfully installed rouge-score-0.0.4


In [29]:
max_input_length = 1024
max_target_length = 512

def preprocess_function(examples):
    model_inputs = tokenizer(examples["abstract"], max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["title"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [30]:
preprocess_function(valid_dataset[:2])


{'input_ids': [[0, 170, 1455, 41, 5693, 15380, 23794, 34751, 13, 30505, 23794, 337, 44304, 14641, 36, 16256, 29, 43, 15, 542, 25384, 4075, 38446, 634, 43797, 1538, 25406, 5990, 150, 5650, 15, 44787, 8724, 215, 25, 5730, 29762, 3156, 50, 33417, 8724, 4, 598, 42, 253, 6, 52, 3190, 9164, 15380, 23794, 45256, 19, 26956, 21092, 9, 25406, 5990, 14, 32, 19099, 30, 1532, 868, 17294, 4, 22248, 2617, 5990, 64, 28, 14146, 2319, 15, 542, 25384, 4075, 38446, 634, 65, 12, 4506, 6611, 6, 8, 1532, 868, 17294, 64, 28, 29854, 149, 2526, 124, 12, 27128, 1073, 1258, 4, 287, 10, 898, 6, 52, 6925, 2778, 5693, 26739, 4836, 14, 914, 50, 9980, 3899, 194, 12, 1116, 12, 627, 12, 2013, 1546, 41885, 11, 1110, 9, 819, 53, 19, 10, 3625, 2735, 346, 9, 1546, 17294, 4, 166, 10516, 84, 17194, 11, 41, 4935, 651, 9, 15491, 15, 10, 3143, 9, 3034, 3360, 8, 2147, 2866, 8558, 6, 217, 3989, 20257, 6, 2147, 6184, 2835, 1258, 6, 8, 32442, 808, 43606, 337, 2274, 46195, 2835, 1258, 4, 7806, 6, 52, 36, 134, 43, 1455, 10, 5808, 3480

In [31]:
train_dataset = train_dataset.map(preprocess_function, batched=True)
valid_dataset = valid_dataset.map(preprocess_function, batched=True)


  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

In [32]:
batch_size = 8
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-lm_al_paper",
    evaluation_strategy = "epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    gradient_accumulation_steps=8,
    save_steps = 500,
    logging_steps = 185,
)

In [33]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)





```
Examples:
    >>> predictions = ["hello there", "general kenobi"]
    >>> references = ["hello there", "general kenobi"]
    >>> bertscore = datasets.load_metric("bertscore")
    >>> results = bertscore.compute(predictions=predictions, references=references, lang="en")
    >>> print([round(v, 2) for v in results["f1"]])
    [1.0, 1.0]
  
```



In [34]:
import nltk
import numpy as np
nltk.download('punkt')
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [35]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Using amp half precision backend


In [36]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: abstract, __index_level_0__, title. If abstract, __index_level_0__, title are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 11864
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 8
  Total optimization steps = 555


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: ignored

In [None]:
model.save_pretrained(f"{PROJECT_ROOT}/model/BART-base/")


Configuration saved in ./output/bart/config.json
Model weights saved in ./output/bart/pytorch_model.bin


In [53]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained(f"{PROJECT_ROOT}/model/BART-base/config.json")
#tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(f"{PROJECT_ROOT}/model/BART-base/")

loading configuration file /content/drive/MyDrive/DL4NLP/abstract-to-title-generation/model/BART-base/config.json
Model config BartConfig {
  "_name_or_path": "/content/drive/MyDrive/DL4NLP/abstract-to-title-generation/model/BART-base/config.json",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2":

OSError: ignored

In [None]:
import pandas as pd


In [None]:
test_samples = pd.read_csv(f"{PROJECT_ROOT}/data/filtered/test_pairs.csv", index_col=0)
test_samples

Unnamed: 0,title,abstract,title_length,abstract_length
0,Learning Latent Semantic Annotations for Groun...,Previous work on grounded language learning di...,11,121
1,Partially Supervised Sense Disambiguation by L...,Supervised and semi-supervised sense disambigu...,13,140
2,Hawkes Processes for Continuous Time Sequence ...,Classification of temporal textual data sequen...,15,68
3,A Unified Single Scan Algorithm for Japanese B...,We describe an algorithm for Japanese analysis...,13,62
4,Generating Coherent Event Schemas at Scale,Chambers and Jurafsky (2009) demonstrated that...,6,127
...,...,...,...,...
5356,Bridging Information-Seeking Human Gaze and Ma...,"In this work, we analyze how human gaze during...",8,118
5357,Quantum-inspired Neural Network for Conversati...,We provide a novel perspective on conversation...,7,116
5358,The BQ Corpus: A Large-scale Domain-specific C...,This paper introduces the Bank Question (BQ) c...,13,174
5359,Doc2hash: Learning Discrete Latent variables f...,Learning to hash via generative model has beco...,8,131


In [None]:
abstracts = test_samples.abstract.to_list()
titles = test_samples.title.to_list()

In [None]:
model.to("cuda")

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        

In [None]:
def creat_eval_pairs(model, tokenizer, abstracts, titles):
  preds = []
  for abstract, title in zip(abstracts, titles):
    encoding = tokenizer.encode_plus(abstract, return_tensors = "pt")
    inputs = encoding["input_ids"].to("cuda")
    attention_masks = encoding["attention_mask"].to("cuda")
    title_ids = model.generate(
            input_ids = inputs,
            attention_mask = attention_masks,
            max_length = 30,
            num_beams = 5,
            num_return_sequences = 5,
            repetition_penalty=2.0, 
            length_penalty=10.0,
            early_stopping = True,
            )
    result = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in title_ids]
    s=""
    for t in result:
      s = s + "<TITLE>" + t
    preds.append(s)
    if len(preds) % 500 == 0:
      print("original title: ", title)
      print("generated title: ", preds[-1:])
  return preds, titles

In [None]:
preds, titles = creat_eval_pairs(model, tokenizer, abstracts, titles)

original title:  Paraphrase-Driven Learning for Open Question Answering
generated title:  ['<TITLE>Learning a Semantic Lexicon and Linear Ranking Function for Question Answering<TITLE>Learning Semantic Lexicons and Linear Ranking Functions for Question Answering<TITLE>Learning a Semantic Lexicon and Linear Ranking Function for Open-Domain Questions<TITLE>Learning Semantic Lexicon and Linear Ranking Functions for Question Answering<TITLE>Learning Semantic Lexicons for Question Answering']
original title:  Robustness and Generalization of Role Sets: PropBank vs. VerbNet
generated title:  ['<TITLE>Robustness and Generalization of PropBank and VerbNet Roles for Semantic Role Labeling<TITLE>Robustness and Generalization of Two Alternative Role Sets for Semantic Role Labeling<TITLE>Robustness and Generalization of Alternative Role Sets for Semantic Role Labeling<TITLE>Robustness and Generalization of PropBank Roles for Semantic Role Labeling<TITLE>Robustness and Generalization of Alternative

In [None]:
pred_target_pairs = pd.DataFrame(list(zip(preds, titles)), columns=['predictions', 'targets'])

In [None]:
pred_target_pairs.to_csv(f"{PROJECT_ROOT}/output/preds_targets_pairs/bart-base.csv")