Data is constructed in "BART-Convokit"

In [14]:
import numpy as np
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Base checkpoint name; the tokenizer and model below are both loaded from it.
model_name_or_path = "t5-base"

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Tokenizer and model come from the same base checkpoint.
tokenizer = T5Tokenizer.from_pretrained(model_name_or_path)

# Loading the model here is only a check that the weights load and fit on the device.
model = T5ForConditionalGeneration.from_pretrained(model_name_or_path)
model = model.to(device)

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

In [15]:
# `pickle` is imported in this cell because no earlier cell imports it; without
# this the cell raises NameError on a fresh kernel (Restart & Run All).
import pickle

# Read the special-tokens mapping prepared for the Convokit data.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so only
# load this file if it comes from a trusted source.
with open('data/special_tokens_map_convokit.pkl', 'rb') as f:
    special_tokens_dict = pickle.load(f)

# Register the tokens with the tokenizer and keep the count it reports back.
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

In [16]:
# Base vocabulary size plus the count reported by add_special_tokens.
tokenizer.vocab_size + num_added_toks

32193

In [18]:
# Encode the structure tags (plus <unk>) to inspect the ids the tokenizer assigns them.
tokenizer.encode([
    '<negativereaction>', '<other>', '<appreciation>', '<unk>',
    '<elaboration>', '<answer>', '<question>', '<humor>',
    '<announcement>', '<agreement>', '<disagreement>',
])

[32100, 32101, 32102, 2, 32103, 32104, 32105, 32106, 32107, 32108, 32109, 1]

## Train Structure Generator

In [None]:
# Fine-tune t5-base as the structure generator: the "context" column is the
# input text and the "structure" column is the target sequence.
# Before running: change the special tokens map path in run_summarization.py.
# CUDA_VISIBLE_DEVICES=1 restricts the run to the GPU with index 1.
# Effective batch size is 2 (per-device batch size 1 x 2 gradient-accumulation steps).
# NOTE(review): --class_weights is presumably an option of the custom script
# (the log prints "CUSTOM T5 with class_weight=100.0") -- confirm its meaning there.
!CUDA_VISIBLE_DEVICES=1 python custom_t5_scripts_weights/run_summarization.py \
    --model_name_or_path="t5-base" \
    --train_file="data/train_structure_convokit.csv" \
    --validation_file="data/val_structure_convokit.csv" \
    --text_column="context" \
    --summary_column="structure" \
    --max_source_length=1024 \
    --max_target_length=64 \
    --do_train \
    --do_eval \
    --per_device_train_batch_size=1 \
    --per_device_eval_batch_size=1 \
    --gradient_accumulation_steps=2 \
    --learning_rate=2e-5 \
    --class_weights=100 \
    --save_steps=80000 \
    --num_train_epochs=5 \
    --output_dir="checkpoints/structure_custom_t5_convokit_bs_1_2_lr_2e5_ep_5_w_100" \
    --overwrite_output_dir

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
06/19/2022 20:15:18 - INFO - __main__ - Training/evaluation parameters Seq2SeqTrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=None,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
generation_max_length=None,
generation_num_beams=None,
gradient_accumulation_steps=2,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_mo

[INFO|modeling_utils.py:1427] 2022-06-19 20:15:24,416 >> loading weights file https://huggingface.co/t5-base/resolve/main/pytorch_model.bin from cache at /home/aschernyavskiy/.cache/huggingface/transformers/ab4e948915b067f5cb6e5105f6f85044fd717b133f43240db67899a8fc7b29a2.26934c75adf19ceac3c268b721ba353356b7609c45f5627550326f275a2163b4
CUSTOM T5 with class_weight=100.0
[INFO|modeling_utils.py:1694] 2022-06-19 20:15:30,638 >> All model checkpoint weights were used when initializing T5ForConditionalGeneration.

[INFO|modeling_utils.py:1703] 2022-06-19 20:15:30,639 >> All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at t5-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForConditionalGeneration for predictions without further training.
[INFO|tokenization_utils_base.py:888] 2022-06-19 20:15:30,644 >> Assigning ['<negativereaction>', '<other>', '<appreciation>', '<unk>', '<elaboration>', '<answ

Special tokens: <negativereaction> <other> <appreciation> <elaboration> <answer> <question> <humor> <announcement> <agreement> <disagreement>
[INFO|trainer.py:1244] 2022-06-19 20:15:32,074 >> ***** Running training *****
[INFO|trainer.py:1245] 2022-06-19 20:15:32,074 >>   Num examples = 81984
[INFO|trainer.py:1246] 2022-06-19 20:15:32,074 >>   Num Epochs = 5
[INFO|trainer.py:1247] 2022-06-19 20:15:32,074 >>   Instantaneous batch size per device = 1
[INFO|trainer.py:1248] 2022-06-19 20:15:32,074 >>   Total train batch size (w. parallel, distributed & accumulation) = 2
[INFO|trainer.py:1249] 2022-06-19 20:15:32,074 >>   Gradient Accumulation steps = 2
[INFO|trainer.py:1250] 2022-06-19 20:15:32,074 >>   Total optimization steps = 204960
{'loss': 8.7606, 'learning_rate': 1.9951209992193602e-05, 'epoch': 0.01}        
{'loss': 3.9152, 'learning_rate': 1.99024199843872e-05, 'epoch': 0.02}          
{'loss': 3.6102, 'learning_rate': 1.9853629976580797e-05, 'epoch': 0.04}        
{'loss': 3.38

## Test Model

In [None]:
import pandas as pd
import pickle
import re
import string
from tqdm import tqdm

In [None]:
import warnings
warnings.filterwarnings("ignore")

import torch
import numpy as np
from transformers import T5ForConditionalGeneration, T5Tokenizer
# Run inference on the first CUDA device when available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

In [None]:
# Fine-tuned checkpoint directory; same path as --output_dir of the training command.
model_name_or_path = (
    'checkpoints/'
    'structure_custom_t5_convokit_bs_1_2_lr_2e5_ep_5_w_100'
)

In [None]:
# Reload tokenizer and model from the fine-tuned checkpoint directory.
tokenizer = T5Tokenizer.from_pretrained(model_name_or_path)

model = T5ForConditionalGeneration.from_pretrained(model_name_or_path)
model.eval()  # same effect as .train(False): switch to inference mode
model = model.to(device)

In [None]:
def generate_top(text, num_beams=4,  max_source_len=1024, max_target_length=64, top_k=50, top_p=1):
    """Sample one output sequence for ``text`` and return it as a cleaned string.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: Source string; tokenized with truncation to ``max_source_len``.
        num_beams: Forwarded to ``model.generate`` as ``num_beams``.
        max_source_len: Maximum number of input tokens kept after truncation.
        max_target_length: Forwarded to ``model.generate`` as ``max_length``.
        top_k: Forwarded to ``model.generate`` as ``top_k``.
        top_p: Forwarded to ``model.generate`` as ``top_p``.

    Returns:
        The first decoded sequence with the ``<pad>``, ``</s>`` and ``<s>``
        markers removed, whitespace runs collapsed to single spaces and the
        ends stripped. All other special tokens are kept, because decoding
        does not skip special tokens.
    """
    inputs = tokenizer([text], max_length=max_source_len, return_tensors="pt",
                       truncation=True, padding=False).to(device)
    # do_sample=True makes the output stochastic; results differ between calls.
    summary_ids = model.generate(inputs["input_ids"], do_sample=True, num_beams=num_beams,
                                 max_length=max_target_length, top_k=top_k, top_p=top_p)
    pred = tokenizer.batch_decode(summary_ids, clean_up_tokenization_spaces=False)[0]

    # T5 starts decoding from its pad token, so the raw text begins with the
    # pad marker; remove it together with the end/start-of-sequence markers.
    # skip_special_tokens is not used here so the other special tokens survive.
    markers = ['</s>', '<s>']
    pad_token = getattr(tokenizer, 'pad_token', None)
    if pad_token:
        markers.append(pad_token)
    for marker in markers:
        pred = pred.replace(marker, '')

    # Collapse whitespace after marker removal so no double spaces are left behind.
    return re.sub(r'\s+', ' ', pred).strip()

In [None]:
# Evaluation reads the validation split (the file passed as --validation_file
# to the training command); it is parsed with a tab separator.
test_data = pd.read_csv(
    "data/val_structure_convokit.csv",
    sep='\t',
)

In [None]:
# Pull the input ("context") and reference ("structure") columns out of the frame.
X_test, y_test = (test_data[column].values for column in ('context', 'structure'))

In [None]:
# Generate one prediction per test context. A failing example is stored as
# 'err' so that `preds` keeps one entry per element of `X_test`.
preds = []
for i, text in tqdm(enumerate(X_test), total=len(X_test)):
    try:
        pred = generate_top(text, top_k=50, num_beams=1)
    except Exception as exc:
        # Catch Exception rather than using a bare `except:` so that a kernel
        # interrupt (KeyboardInterrupt) or SystemExit still stops this long loop.
        # Print the index together with the error so failures can be diagnosed.
        print(i, repr(exc))
        pred = 'err'
    preds.append([text, pred])

In [None]:
# Imported here because the notebook's import cells do not import `os`.
import os

# Name the output file after the checkpoint directory (without the 'checkpoints/' prefix).
predictions_path = 'predictions/{}.pkl'.format(model_name_or_path.replace('checkpoints/', ''))

# Create the target directory first so that a missing folder cannot make the
# dump fail after the long generation loop.
os.makedirs(os.path.dirname(predictions_path), exist_ok=True)

# Stored payload: [inputs, list of [input, prediction] pairs].
with open(predictions_path, 'wb') as f:
    pickle.dump([X_test, preds], f)