### Get splits for BART training

In [1]:
import pandas as pd
import pickle
import re
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
# Separator string inserted between the concatenated source fields
# (read by get_base_bart_df).
sep_token = "</s>"

# Dataset variant suffix used in the bart_input/ file names.
ds_suff = "short"  # the other variant noted by the author: 'filt'

In [3]:
def get_base_bart_df(multi_df_path, columns, target):
    """Build a two-column BART input frame from a tab-separated dialogue file.

    Parameters
    ----------
    multi_df_path : str
        Path to a tab-separated file containing at least `columns` and `target`.
    columns : list of str
        Source columns; their values are concatenated in the given order.
    target : str
        Column copied through unchanged as the generation target.

    Returns
    -------
    pandas.DataFrame
        A column named ``'@'.join(columns)`` holding the source fields joined
        with ``f' {sep_token} '`` (``sep_token`` is the notebook-level
        separator), followed by the ``target`` column.
    """
    multi_df = pd.read_csv(multi_df_path, sep='\t')

    # Empty cells come back from read_csv as NaN (a float) and purely numeric
    # cells as numbers; str.join raises TypeError on either. Normalise to text
    # first. Missing fields become '' so every row keeps the same number of
    # separators and the field positions stay aligned.
    source_text = multi_df[columns].fillna('').astype(str)

    joiner = f' {sep_token} '
    joined = [joiner.join(row) for row in source_text.itertuples(index=False, name=None)]

    result_df = pd.DataFrame({'@'.join(columns): joined}, index=multi_df.index)
    result_df[target] = multi_df[target]
    return result_df

In [4]:
target_col = "response"
source_cols = ["history", "title", "grounding"]

# For each split, read the multi-column frame, collapse the source columns into
# a single text column and write the result next to the input file.
cols_tag = "-".join(source_cols)
for part in ('train', 'val'):
    in_path = f'bart_input/{part}_reddit_dial_df_{ds_suff}.csv'
    out_path = f'bart_input/{part}_reddit_dial_df_base_{ds_suff}__{cols_tag}__{target_col}.csv'
    base_df = get_base_bart_df(in_path, source_cols, target_col)
    base_df.to_csv(out_path, sep='\t', index=False)

In [5]:
# Load the multi-column training split for the currently selected dataset variant.
train_df_path = f'bart_input/train_reddit_dial_df_{ds_suff}.csv'
df = pd.read_csv(train_df_path, sep='\t')

In [6]:
# Display the raw 'history' string of the row at position 3 as a quick sanity check.
df['history'].values[3]

"<s1> <u1> <to:u1> I don’t think Henry is going to play a different DC character. His instagram post reads like he’s done with DC films, not that he’s going to be in something else just not as Superman. </s> <s2> <u2> <to:u1> At this point hasn't WB burnt the bridge? Toying with the character for years he finally gets welcomed back and now he gets kicked out again?? </s> <s1> <u3> <to:u2> Yup. They burnt that bridge into ashes at this point. It’s interesting to see the difference in tone between James Gunn’s and Henry’s statements. James makes it sound like it was positive. “We had a great meeting” he says. But Henry’s sounds very sad and disappointed. He most definitely did not describe it as a positive meeting. </s> <s3> <u4> <to:u1> What's james gonna say, that he completely fucked Henry over? </s> <s4> <u5> <to:u4> The Rock did that. It was up to Gunn to make the tough call, that’s all. </s> <s5> <u6> <to:u1> I don’t think Henry is going to play a different DC character. His instag

### Train base BART

In [1]:
# Per-column maximum lengths, chosen by hand (the author's note says they were
# derived from "the results above"). They are used below to derive
# max_source_length and max_target_length.
source_lens = dict(
    history=600,
    history_aug=600,
    history_amr=1024,
    history_discourse=40,
    addr_amr=300,
    response=160,
    response_aug=160,
    grounding=850,
    title=64,
)

In [2]:
# Optimisation settings passed to the training script.
n_epochs, learning_rate = 4, 3e-5
batch_size, gradient_accumulation_steps = 4, 2

# Target column (alternative used by the author: "response_aug").
target_col = "response"

# Source columns, in that order; per the author's note the last one will be
# truncated. Alternative used by the author: ["history_aug", "title", "grounding"].
source_cols = ["history", "title", "grounding"]
text_column = '@'.join(source_cols)

# The summed per-column budgets are capped at 1024.
max_source_length = min(1024, sum(source_lens[name] for name in source_cols))
max_target_length = source_lens[target_col]

In [3]:
# Train/validation inputs for the chosen source/target columns.
# NOTE(review): the variant is hard-coded to 'filt' here, while the data-prep
# section visible in this file writes files with ds_suff = 'short' — confirm the
# 'filt' files exist or adjust the name.
io_tag = f'{"-".join(source_cols)}__{target_col}'
train_fn = f'bart_input/train_reddit_dial_df_base_filt__{io_tag}.csv'
val_fn = f'bart_input/val_reddit_dial_df_base_filt__{io_tag}.csv'

In [4]:
# The output directory name encodes the effective batch size, epochs, learning
# rate and the source/target columns; the test section parses the columns back
# out of this name.
effective_batch_size = batch_size * gradient_accumulation_steps
checkpoint_path = (
    f"checkpoint/base_bart_bs{effective_batch_size}_{n_epochs}ep_lr{learning_rate}"
    f"__from:{'-'.join(source_cols)}___to:{target_col}"
)
checkpoint_path

'checkpoint/base_bart_bs8_4ep_lr3e-05__from:history-title-grounding___to:response'

In [None]:
# Fine-tune facebook/bart-base with custom_bart_scripts/run_summarization.py on GPU 0.
# The $name arguments are filled in by IPython from the notebook variables defined above.
# Before running: change the special tokens map path in run_summarization.py.
!CUDA_VISIBLE_DEVICES=0 python custom_bart_scripts/run_summarization.py \
    --model_name_or_path="facebook/bart-base" \
    --train_file=$train_fn \
    --validation_file=$val_fn \
    --text_column=$text_column \
    --summary_column=$target_col \
    --max_source_length=$max_source_length \
    --max_target_length=$max_target_length \
    --do_train \
    --do_eval \
    --per_device_train_batch_size=$batch_size \
    --per_device_eval_batch_size=$batch_size \
    --gradient_accumulation_steps=$gradient_accumulation_steps \
    --learning_rate=$learning_rate \
    --save_steps=80000 \
    --num_train_epochs=$n_epochs \
    --output_dir=$checkpoint_path \
    --overwrite_output_dir

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
06/14/2023 16:50:39 - INFO - __main__ - Training/evaluation parameters Seq2SeqTrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=None,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
generation_max_length=None,
generation_num_beams=None,
gradient_accumulation_steps=2,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_mo

[INFO|modeling_utils.py:1427] 2023-06-14 16:52:03,068 >> loading weights file https://huggingface.co/facebook/bart-base/resolve/main/pytorch_model.bin from cache at /home/aschernyavskiy/.cache/huggingface/transformers/486355ec722ef05fd480e999d4c763be56549ae930f6a3742ee721a5d2a05647.f2f355ad2775769afc60592b43a46d72ca548375e3a1d65f381a751e711cbadd
CUSTOM BART
[INFO|modeling_utils.py:1694] 2023-06-14 16:52:06,569 >> All model checkpoint weights were used when initializing BartForConditionalGeneration.

[INFO|modeling_utils.py:1703] 2023-06-14 16:52:06,569 >> All the weights of BartForConditionalGeneration were initialized from the model checkpoint at facebook/bart-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BartForConditionalGeneration for predictions without further training.
[INFO|tokenization_utils_base.py:888] 2023-06-14 16:52:06,574 >> Assigning ['<u10>', '<u11>', '<u12>', '<u13>', '<u14>', '<u15>', '<u16>', '<u17>', '<u18

Running tokenizer on validation dataset:   0%|            | 0/4 [00:00<?, ?ba/s]06/14/2023 16:52:14 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/aschernyavskiy/.cache/huggingface/datasets/csv/default-15b3ef04a8eeedeb/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-ea78b9989ee1fba3.arrow
Running tokenizer on validation dataset: 100%|████| 4/4 [00:18<00:00,  4.63s/ba]


### Test model

In [1]:
import pandas as pd
import pickle
import re
import string
from tqdm import tqdm

In [2]:
import warnings
# Suppresses all Python warnings for the rest of the session, so deprecation
# and library warnings will not be shown in later cells.
warnings.filterwarnings("ignore")

import torch
import numpy as np

In [3]:
import sys
# Put custom_bart_scripts/ first on the import path; presumably this is where
# modeling_custom_bart (imported in the next cell) lives.
sys.path.insert(0, 'custom_bart_scripts/')

In [4]:
from modeling_custom_bart import BartForConditionalGeneration
from transformers import BartTokenizer

In [5]:
def generate_top(text, num_beams=4,  max_source_len=1024, max_target_length=64, temperature=1.,
                 do_sample=False, top_k=50, top_p=1, num_return_sequences=1, force_words_ids=None):
    """Generate one or more cleaned-up outputs for a single source string.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Parameters
    ----------
    text : str
        Source text; truncated to ``max_source_len`` by the tokenizer.
    num_beams, do_sample, top_k, top_p, temperature, num_return_sequences
        Forwarded unchanged to ``model.generate``.
    max_source_len : int
        ``max_length`` passed to the tokenizer.
    max_target_length : int
        ``max_length`` passed to ``model.generate``.
    force_words_ids : optional
        Forwarded to ``model.generate`` only when given (previously this
        argument was accepted but ignored).

    Returns
    -------
    str or list of str
        A single string when exactly one sequence is produced, otherwise the
        list of decoded sequences. Whitespace runs are collapsed and the
        literal ``</s>``, ``<s>`` and ``<pad>`` markers are removed; other
        special tokens are kept.
    """
    inputs = tokenizer([text], max_length=max_source_len, return_tensors="pt", truncation=True, padding=True).to(device)

    generate_kwargs = dict(
        do_sample=do_sample,
        num_beams=num_beams,
        max_length=max_target_length,
        top_k=top_k,
        top_p=top_p,
        temperature=temperature,
        num_return_sequences=num_return_sequences,
    )
    # Only pass the constraint through when the caller supplied one, so the
    # default call is exactly what it was before.
    if force_words_ids is not None:
        generate_kwargs['force_words_ids'] = force_words_ids

    summary_ids = model.generate(inputs["input_ids"], **generate_kwargs)

    pred = tokenizer.batch_decode(summary_ids, clean_up_tokenization_spaces=False)
    pred = [re.sub(r'\s+', ' ', p).replace('</s>', '').replace('<s>', '').replace('<pad>', '').strip() for p in pred]
    if len(pred) == 1:
        return pred[0]
    return pred

In [6]:
# Checkpoint to evaluate. Other checkpoints the author has used:
#   'checkpoint/base_bart_copy1_bs8_7ep_lr3e-05__from:history_aug-title-grounding___to:response_aug'
#   'checkpoint/base_bart_bs8_4ep_lr3e-05__from:history-title___to:response'
model_name_or_path = 'checkpoint/base_bart_bs8_4ep_lr3e-05__from:history-title-grounding___to:response'

# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

In [7]:
# Load the tokenizer and the custom BART model from the same checkpoint directory.
tokenizer = BartTokenizer.from_pretrained(model_name_or_path)
model = BartForConditionalGeneration.from_pretrained(model_name_or_path)
# Switch off training mode before generating.
model = model.train(False)

CUSTOM BART


In [8]:
# Move the model to the selected device. This is the only expression in the
# cell, so the notebook displays the model's repr as the cell output.
model.to(device)

# Commented-out code (not executed): manually adding the dialogue special tokens
# and resizing the embeddings. NOTE(review): presumably unnecessary because the
# saved checkpoint already carries the extended vocabulary — confirm.
# with open('bart_input/special_tokens_map_reddit_dial.pkl', 'rb') as f:
#     special_tokens_dict = pickle.load(f)
# num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

# model.resize_token_embeddings(len(tokenizer))

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50373, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50373, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        

In [9]:
# Per-column maximum lengths, chosen by hand (the author's note says they were
# derived from "the results above"). The next cell uses them to compute
# max_source_length and max_target_length for generation.
source_lens = dict(
    history=600,
    history_aug=600,
    history_amr=1024,
    history_discourse=40,
    addr_amr=300,
    response=160,
    response_aug=160,
    grounding=850,
    title=64,
)

In [10]:
# Recover the column setup from the checkpoint directory name, which has the
# form "<run>__from:<src1>-<src2>-...___to:<target>".
run_name = model_name_or_path.split('/')[1]
name_parts = run_name.split('__')

# name_parts[1] is "from:<src1>-<src2>-..."; drop the 5-character "from:" prefix.
source_cols = name_parts[1][5:].split('-')
text_column = '@'.join(source_cols)

# Splitting "___to:" on "__" leaves a leading underscore, so the last piece is
# "_to:<target>"; drop those 4 characters.
target_col = name_parts[-1][4:]

# Not referenced anywhere else in the visible part of the notebook.
max_encoder_length = 512

# The summed per-column budgets are capped at 1024.
max_source_length = min(1024, sum(source_lens[name] for name in source_cols))
max_target_length = source_lens[target_col]

In [11]:
# Display the computed source length limit.
max_source_length

1024

### Print Examples

In [12]:
# Load the validation split (kept under the name test_data) for qualitative inspection.
# NOTE(review): this file name has no dataset-variant suffix ("base__"), whereas
# the full-prediction cell further down reads "base_short__" — confirm which
# file is intended.
val_fn = 'bart_input/val_reddit_dial_df_base__' + '-'.join(source_cols) + '__' + target_col + '.csv'
test_data = pd.read_csv(val_fn, sep='\t')

In [13]:
# Print every 10th validation example in [100, 200): the (clipped) input, the
# model prediction with num_beams=1, and the reference response.
example_separator = '\n' + '-'*70 + '\n'

for idx in range(100, 200, 10):
    input_text = test_data[text_column].values[idx]
    # Individual source fields, available for per-column inspection if needed.
    input_texts = input_text.split(' </s> ')

    pred = generate_top(
        input_text,
        num_beams=1,
        max_source_len=max_source_length,
        max_target_length=max_target_length,
    )

    # Index, input clipped to 3000 characters, then an empty line.
    print(idx, input_text[:3000], '', sep='\n')
    print('Prediction:', pred)
    print('\n\nGT:', test_data[target_col].values[idx])
    print(example_separator)

100
<s1> <u1> <to:u1> <init> <Negative> What about the people who have genuine concerns? Surely not everyone who voiced complaints were disrespectful and rude.Why piss away a good Superman to do a younger one? Why? I genuinely don't get it.I'm not going to harass or bully James Gunn or anyone else, but I am calling a spade a spade in that this Henry Cavill situation is a bunch of bullshit. </s> <s2> <u2> <to:u1> <question> <Neutral> What is there to not get? Henry is 40. The new Superman movie won’t be ready for a few years. Gunn is in charge, and his plan is to tell a story with a younger Superman. That’s all there is to it. </s> <s1> <u3> <to:u2> <question> <Neutral> Why a younger Superman? That's what I don't get. Why?Does Henry Cavill look like his body is going to wither away in 10 years? He is pretty healthy for a 40 year old. You could digitally deage him to play a younger Superman. </s> <s3> <u4> <to:u3> <answer> <Neutral> My guess It’s about longevity. Yes, they can digitally 

130
<s1> <u1> <to:u1> <init> <Neutral> This is the part of Superman that Snyder never understood. He was so obsessed with Superman’s power and the implications of it on the world, and neglected Superman’s HUMANITY. He may come from another planet but the entire point of Superman is that he is human, and a kind one at that.Snyder’s approach where he treated these characters as gods instead of people played a huge role in why audiences struggled to connect with them. There was far more attention paid to “what do people think of Superman” and not enough attention paid to “what does Superman do for people”I want James Gunn’s new Superman to focus far more on Clark Kent than the deity that is Superman, at least for his first appearance. Take a look at the first Captain America film. We spend most of it with Steve Rogers before the super-serum, and even then we can see his kindness, his bravery, and his loyalty to others. His “I don’t like bullies” moment was just the perfect way for audienc

170
<s1> <u1> <to:u1> <init> <Positive> The Suicide Squad and Peacemaker have beautiful scenes and moments. What ever Gunn does i trust him. </s> <s2> <u2> <to:u1> <answer> <Positive> Well, yeah. Don't get me wrong, I love the idea of Henry Cavill as Superman but James Gunn wrote better lines for a talking tree that only uses the words I, we, am, are, and Groot than Snyder did for one of the most important comics characters of all time. </s> <s3> <u3> <to:u1> <answer> <Positive> i love dialogues from the snyder trilogy : simple, classy and deep. a beautiful lie.no one stays good in this world. if there's one pourcent chance that he is our enemy, we have to take it as an absolute necessity. do you bleed ? you're will !!!!! you know what is the greatest lie ? that power can be innocent. lex luthor when he explain his motivation. superman was never real, it was just a dream from a farmer. must there be a Superman ? Men are still good, we fight, we kill, we betray each other but we can reb

In [20]:
# Count, over validation rows 100..299, how often the first whitespace-delimited
# token of the prediction (num_beams=1, no sampling) equals the first token of
# the reference.
num_same = 0
for idx in tqdm(range(100, 300)):
    input_text = test_data[text_column].values[idx]

    pred = generate_top(input_text,
                        num_beams=1,
                        max_source_len=max_source_length,
                        max_target_length=max_target_length)

    gt = test_data[target_col].values[idx]

    # An empty generation or a missing (NaN) reference has no first token;
    # indexing [0] would raise and abort the loop, so such rows simply count
    # as non-matches.
    pred_tokens = pred.split()
    gt_tokens = gt.split() if isinstance(gt, str) else []
    if pred_tokens and gt_tokens and pred_tokens[0] == gt_tokens[0]:
        num_same += 1

100%|██████████| 200/200 [01:06<00:00,  3.00it/s]


In [21]:
# Number of rows, out of the 200 evaluated in the previous cell, whose first token matched.
num_same

58

### Calculate predictions for full test

In [12]:
import pickle
from tqdm import tqdm

In [13]:
# Reload the validation split, this time the 'short' variant, for the full prediction run.
val_fn = 'bart_input/val_reddit_dial_df_base_short__{}__{}.csv'.format("-".join(source_cols), target_col)
test_data = pd.read_csv(val_fn, sep='\t')

In [None]:
# Generate one sampled prediction per row of test_data.
# NOTE(review): do_sample=True and no seed is set in the visible cells, so the
# predictions will differ between runs unless a seed is fixed elsewhere.
preds = []
failed_indices = []  # rows for which generation raised

for idx in tqdm(range(len(test_data))):
    input_text = test_data[text_column].values[idx]
    try:
        pred = generate_top(input_text,
                            num_beams=1,
                            max_source_len=max_source_length,
                            max_target_length=max_target_length,
                            do_sample=True)
    except Exception as exc:
        # Best effort: keep an empty placeholder so preds stays aligned with
        # test_data, but record and report the failure instead of hiding it.
        # Catching Exception (not a bare except) lets Ctrl-C interrupt the run.
        failed_indices.append(idx)
        tqdm.write(f'row {idx}: generation failed with {exc!r}')
        pred = ""

    preds.append(pred)

 93%|█████████▎| 14615/15716 [1:25:38<07:47,  2.35it/s]  

In [None]:
import os

# Store the predictions under predictions/, in a file named after the checkpoint.
preds_path = model_name_or_path.replace('checkpoint/', 'predictions/').replace('/checkpoint', '_') + '_short.pkl'

# Create the target directory first: otherwise open() fails when it does not
# exist yet and the results of the long generation run cannot be saved.
os.makedirs(os.path.dirname(preds_path) or '.', exist_ok=True)

with open(preds_path, 'wb') as f:
    pickle.dump(preds, f)

In [None]:
# Display the corresponding path under predictions_v2/ (nothing is written here).
model_name_or_path.replace('checkpoint/', 'predictions_v2/').replace('/checkpoint', '_') + '.pkl'