## Load dataset

In [18]:
from tqdm import tqdm

In [10]:
with open("../_common/datasets/dialogpt/train_v2.txt") as file:
    train = file.read().splitlines()
    
with open("../_common/datasets/dialogpt/val_v2.txt") as file:
    valid = file.read().splitlines()

In [11]:
len(train)

28092

In [None]:
import re

In [None]:
for (i, text) in tqdm(enumerate(train), total=len(train)):
    if len(re.findall(" <SEP> ", text)) != 2:
        print(i)

In [None]:
len(train)

In [None]:
for (i, text) in tqdm(enumerate(valid), total=len(valid)):
    if len(re.findall(" <SEP> ", text)) != 2:
        print(i)

In [None]:
len(valid)

#### calculate optimal input lengths

In [1]:
import numpy as np
import torch

from tqdm.notebook import tqdm

In [2]:
from transformers import BartTokenizer, BartModel

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
model_name = 'theojolliffe/bart-cnn-science'

tokenizer = BartTokenizer.from_pretrained(model_name)
# model = BartModel.from_pretrained(model_name).to(device)

In [7]:
special_tokens_dict = {
    'sep_token': '<SEP>',
    'additional_special_tokens': ['<UTTERSEP>']
}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
# model.resize_token_embeddings(len(tokenizer))

In [12]:
num_tokens_input = []
num_tokens_output = []

toomuchtokens = []


for i, text in tqdm(enumerate(train), total=len(train)):
    input_text, output_text = text.split(" <INPUTEND> ")
    input_text = input_text.replace("<BOS> ", "").replace(" <EOS>", "")
    if len(tokenizer.encode(text)) > 1024:
        toomuchtokens.append(i)
    output_text = output_text.replace("<BOS> ", "").replace(" <EOS>", "")
    num_tokens_input.append(len(tokenizer.encode(input_text)))
    num_tokens_output.append(len(tokenizer.encode(output_text)))

  0%|          | 0/28092 [00:00<?, ?it/s]

In [15]:
np.mean(num_tokens_input), np.median(num_tokens_input), np.quantile(num_tokens_input, 0.95)

(148.88996867435569, 136.0, 276.0)

In [16]:
np.mean(num_tokens_output), np.median(num_tokens_output), np.quantile(num_tokens_output, 0.95)

(53.6047273245052, 43.0, 123.0)

#### change data format

In [None]:
import pandas as pd

In [None]:
questions = list()
answers = list()

for text in tqdm(train):
    input_text, output_text = text.split(" <INPUTEND> ")
    input_text = input_text[6:]
    output_text = output_text[:-6]
    
    if len(tokenizer.encode(input_text)) > 1800:
        continue
    
    questions.append(input_text)
    answers.append(output_text)

train_df = pd.DataFrame({"text": questions, "summary": answers})
train_df.to_csv("datasets/bart_response_data/train_ft_deberta.csv", index=False)
        
        
questions = list()
answers = list()

for text in tqdm(valid):
    input_text, output_text = text.split(" <INPUTEND> ")
    questions.append(input_text[6:])
    answers.append(output_text[:-6])

valid_df = pd.DataFrame({"text": questions, "summary": answers})
valid_df.to_csv("datasets/bart_response_data/valid_ft_deberta.csv", index=False)

In [None]:
test = pd.read_csv("datasets/bart_response_data/train_ft_deberta.csv")

In [None]:
test.text[90]

#### train

In [None]:
import os
import wandb

In [27]:
!CUDA_VISIBLE_DEVICES=0 python custom_bart_scripts/run_response_generation.py \
    --model_name_or_path="theojolliffe/bart-cnn-science" \
    --do_train \
    --do_eval \
    --report_to="wandb" \
    --evaluation_strategy="steps" \
    --weight_decay=0.01 \
    --logging_steps=1000 \
    --save_steps=1000 \
    --run_name="bart_cnn_science_4ep_2e05" \
    --train_file="../_common/datasets/bart_response_data/train_ft_deberta.csv" \
    --validation_file="../_common/datasets/bart_response_data/valid_ft_deberta.csv" \
    --output_dir="../_common/bart_response_generation/bart_cnn_science_4ep_2e05" \
    --per_device_train_batch_size=20 \
    --per_device_eval_batch_size=20 \
    --max_target_length=512 \
    --learning_rate=2e-05 \
    --num_train_epochs=5 \
    --overwrite_output_dir \
    --predict_with_generate

2023-04-14 12:07:11.995887: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-14 12:07:13.837836: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-04-14 12:07:13.837918: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
04/14/2023 12:07:17 - INFO - __main__ 

[INFO|tokenization_utils_base.py:907] 2023-04-14 12:07:18,346 >> Assigning <SEP> to the sep_token key of the tokenizer
[INFO|tokenization_utils_base.py:907] 2023-04-14 12:07:18,346 >> Assigning ['<INPUTEND>', '<UTTERSEP>'] to the additional_special_tokens key of the tokenizer
[INFO|modeling_utils.py:2403] 2023-04-14 12:07:18,393 >> loading weights file pytorch_model.bin from cache at /home/jovyan/.cache/huggingface/hub/models--theojolliffe--bart-cnn-science/snapshots/2b5c0e689642ef19663935c01d19a6881777c0d2/pytorch_model.bin
[INFO|configuration_utils.py:575] 2023-04-14 12:07:18,889 >> Generate config GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "length_penalty": 2.0,
  "max_length": 142,
  "min_length": 56,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.27.4"
}

[INFO|modeling_u

 14%|█████▊                                     | 10/74 [00:38<04:13,  3.96s/it][A
 15%|██████▍                                    | 11/74 [00:44<04:59,  4.75s/it][A
 16%|██████▉                                    | 12/74 [00:48<04:26,  4.30s/it][A
 18%|███████▌                                   | 13/74 [00:52<04:22,  4.30s/it][A
 19%|████████▏                                  | 14/74 [00:57<04:24,  4.41s/it][A
 20%|████████▋                                  | 15/74 [01:00<04:08,  4.22s/it][A
 22%|█████████▎                                 | 16/74 [01:05<04:07,  4.27s/it][A
 23%|█████████▉                                 | 17/74 [01:09<04:06,  4.33s/it][A
 24%|██████████▍                                | 18/74 [01:12<03:40,  3.94s/it][A
 26%|███████████                                | 19/74 [01:17<03:53,  4.25s/it][A
 27%|███████████▌                               | 20/74 [01:21<03:36,  4.00s/it][A
 28%|████████████▏                              | 21/74 [01:25<03:40,  4.16s

 14%|█████▊                                     | 10/74 [00:35<04:12,  3.95s/it][A
 15%|██████▍                                    | 11/74 [00:44<05:53,  5.61s/it][A
 16%|██████▉                                    | 12/74 [00:49<05:39,  5.47s/it][A
 18%|███████▌                                   | 13/74 [00:53<04:57,  4.88s/it][A
 19%|████████▏                                  | 14/74 [00:57<04:43,  4.72s/it][A
 20%|████████▋                                  | 15/74 [01:01<04:30,  4.58s/it][A
 22%|█████████▎                                 | 16/74 [01:06<04:25,  4.58s/it][A
 23%|█████████▉                                 | 17/74 [01:12<04:47,  5.04s/it][A
 24%|██████████▍                                | 18/74 [01:15<04:02,  4.33s/it][A
 26%|███████████                                | 19/74 [01:19<03:54,  4.27s/it][A
 27%|███████████▌                               | 20/74 [01:23<03:43,  4.13s/it][A
 28%|████████████▏                              | 21/74 [01:27<03:36,  4.09s

 11%|████▊                                       | 8/74 [00:28<04:46,  4.35s/it][A
 12%|█████▎                                      | 9/74 [00:33<04:40,  4.31s/it][A
 14%|█████▊                                     | 10/74 [00:37<04:43,  4.43s/it][A
 15%|██████▍                                    | 11/74 [00:43<04:58,  4.74s/it][A
 16%|██████▉                                    | 12/74 [00:48<04:57,  4.80s/it][A
 18%|███████▌                                   | 13/74 [00:53<04:53,  4.81s/it][A
 19%|████████▏                                  | 14/74 [00:57<04:48,  4.82s/it][A
 20%|████████▋                                  | 15/74 [01:01<04:27,  4.54s/it][A
 22%|█████████▎                                 | 16/74 [01:07<04:39,  4.82s/it][A
 23%|█████████▉                                 | 17/74 [01:12<04:38,  4.89s/it][A
 24%|██████████▍                                | 18/74 [01:15<03:57,  4.24s/it][A
 26%|███████████                                | 19/74 [01:18<03:44,  4.09s

[INFO|modeling_utils.py:1762] 2023-04-14 13:16:42,960 >> Model weights saved in ../_common/bart_response_generation/bart_cnn_science_4ep_2e05/pytorch_model.bin
[INFO|tokenization_utils_base.py:2163] 2023-04-14 13:16:42,971 >> tokenizer config file saved in ../_common/bart_response_generation/bart_cnn_science_4ep_2e05/tokenizer_config.json
[INFO|tokenization_utils_base.py:2170] 2023-04-14 13:16:42,978 >> Special tokens file saved in ../_common/bart_response_generation/bart_cnn_science_4ep_2e05/special_tokens_map.json
***** train metrics *****
  epoch                    =        5.0
  train_loss               =     1.5809
  train_runtime            = 1:09:11.84
  train_samples            =      28089
  train_samples_per_second =     33.827
  train_steps_per_second   =      1.692
04/14/2023 13:16:43 - INFO - __main__ - *** Evaluate ***
[INFO|trainer.py:3068] 2023-04-14 13:16:43,083 >> ***** Running Evaluation *****
[INFO|trainer.py:3070] 2023-04-14 13:16:43,083 >>   Num examples = 1473
[I

### Inference

In [28]:
import pickle
import re
import torch

from tqdm import tqdm
from transformers import BartTokenizer, BartForConditionalGeneration

In [29]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [30]:
model_name = "bart_cnn_science_4ep_2e05"

In [31]:
checkpoint = f"/home/jovyan/chatbot/_common/bart_response_generation/{model_name}/checkpoint-4000"

model = BartForConditionalGeneration.from_pretrained(checkpoint)
tokenizer = BartTokenizer.from_pretrained(checkpoint)

In [32]:
model.to(device)
model.eval();

In [37]:
def generate_top(text, num_beams=4,  max_source_len=512, max_target_length=700, top_k=50, top_p=1):
    inputs = tokenizer([text], return_tensors="pt").to(device)
    input_tensor = inputs["input_ids"]

    summary_ids = model.generate(
        input_tensor,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_length=512
    )
    pred = tokenizer.batch_decode(
        summary_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]
    return pred

In [40]:
def remove_special_tokens(s):
    for t in special_tokens_list:
        s = s.replace(t, '')
    return s.strip()

In [41]:
special_tokens_list = []
for el in tokenizer.special_tokens_map.values():
    if isinstance(el, list):
        special_tokens_list.extend(el)
    else:
        special_tokens_list.append(el)

In [38]:
with open('../_common/datasets/dialogpt/val_v2.txt', 'r') as f:
    test_data = f.read().split('\n')[:-1]

In [39]:
test_data[0]

'<BOS> What is V2V-PoseNet? <SEP> First, we convert 2D depth images to 3D volumetric forms by reprojecting the points in the 3D space and discretizing the continuous space. After voxelizing the 2D depth image, the V2V-PoseNet takes the 3D voxelized data as an input and estimates the per-voxel likelihood for each keypoint. The position of the highest likelihood response for each keypoint is identified and warped to the real world coordinate, which becomes the final result of our model. <SEP> <INPUTEND> V2V-PoseNet is a model designed for 3D pose estimation, which takes voxelized 3D data as input and estimates the per-voxel likelihood for each keypoint. It includes four kinds of building blocks: volumetric basic block, volumetric residual block, volumetric downsampling block, and volumetric upsampling block. <EOS>'

In [46]:
predictions = []
for text in tqdm(test_data):
    inp, target = text.split(' <INPUTEND> ')
    inp = inp.replace("<BOS> ", "").replace(" <EOS>", "")
    target = target.replace("<BOS> ", "").replace(" <EOS>", "")
    pred = generate_top(inp)
    predictions.append({'input': inp,
                        'target': remove_special_tokens(target),
                        'prediction': remove_special_tokens(pred)})

100%|██████████| 1473/1473 [1:01:38<00:00,  2.51s/it]


In [47]:
len(predictions)

1473

In [48]:
with open(f"../_common/bart_response_generation/predictions_{model_name}.pkl", 'wb') as f:
    pickle.dump(predictions, f)

### Calculate LM scores

In [49]:
import os
import numpy as np
import pandas as pd
import pickle
import string

from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge

In [50]:
results_paths = sorted([f for f in os.listdir('../_common/bart_response_generation') if ".pkl" in f and "ft" in f])

In [51]:
results_paths

['predictions_large_ft_deberta_3ep_5e5.pkl',
 'predictions_large_ft_deberta_3ep_5e5_16bs.pkl',
 'predictions_large_ft_deberta_5ep_1e5_20bs_wr01.pkl',
 'predictions_large_ft_deberta_5ep_8e5_20bs_ch4000.pkl']

In [52]:
for res_path in results_paths:
    print(res_path)
    with open('../_common/bart_response_generation/' + res_path, 'rb') as f:
        preds = pickle.load(f)
        
    rouge = Rouge(metrics=['rouge-n', 'rouge-l'],
                       max_n=2,
                       limit_length=False,
                       length_limit=3,
                       length_limit_type='words',
                       apply_avg=True,
                       apply_best=False,
                       alpha=0.5, # Default F1_score
                       weight_factor=1.2,
                       stemming=False)
    
    hyps, refs = [], []
    for i in range(len(preds)):
        hyps.append(preds[i]['prediction'])
        refs.append(preds[i]['target'])
        
    gen_ref = zip(hyps, refs)
    gen_ref = [_ for _ in gen_ref if not all(j in string.punctuation for j in _[1]) and not all(j in string.punctuation for j in _[0])]
    gens, refs  = zip(*gen_ref)
    
    #rouge_res = rouge.get_scores(gens, refs, avg=True, ignore_empty=False) #python-rouge
    rouge_res = rouge.get_scores(gens, refs) # py-rouge
    print()
    print('ROUGE-1:', round(100 * rouge_res['rouge-1']['f'], 2))
    print('ROUGE-2:', round(100 * rouge_res['rouge-2']['f'], 2))
    print('ROUGE-L:', round(100 * rouge_res['rouge-l']['f'], 2))
    
    print()
    for j in range(1, 5):
        weights=[0,0,0,0]
        for k in range(j):
            weights[k] = 1
        mean_bleu = 0
        for gen, ref in zip(gens, refs):
            mean_bleu += sentence_bleu([word_tokenize(ref)], word_tokenize(gen), weights=weights)
        mean_bleu /= len(gens)
        print(f'BLEU-{j}:', round(100 * mean_bleu, 2))  
    
    print('\n' + '-'*50 + '\n')

predictions_large_ft_deberta_3ep_5e5.pkl

ROUGE-1: 51.72
ROUGE-2: 36.25
ROUGE-L: 49.8



The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU-1: 42.33
BLEU-2: 17.83
BLEU-3: 9.21
BLEU-4: 5.89

--------------------------------------------------

predictions_large_ft_deberta_3ep_5e5_16bs.pkl

ROUGE-1: 52.78
ROUGE-2: 37.49
ROUGE-L: 51.21

BLEU-1: 43.73
BLEU-2: 19.83
BLEU-3: 10.82
BLEU-4: 7.16

--------------------------------------------------

predictions_large_ft_deberta_5ep_1e5_20bs_wr01.pkl

ROUGE-1: 51.79
ROUGE-2: 36.51
ROUGE-L: 50.11

BLEU-1: 42.36
BLEU-2: 18.41
BLEU-3: 9.64
BLEU-4: 6.14

--------------------------------------------------

predictions_large_ft_deberta_5ep_8e5_20bs_ch4000.pkl

ROUGE-1: 51.71
ROUGE-2: 36.37
ROUGE-L: 49.93

BLEU-1: 42.52
BLEU-2: 17.92
BLEU-3: 9.15
BLEU-4: 5.69

--------------------------------------------------

