In [11]:
# the notebook's main objective is to filter and prepare the dataset to train a summarizer on it.
import os, sys
from pathlib import Path
# Working directory of the notebook and the data file expected next to it.
HOME = os.getcwd()
DATA_FOLDER = os.path.join(Path(HOME).parent, 'data')
data_path = os.path.join(DATA_FOLDER, 'filtered.tsv')

# Walk upwards from the working directory until a folder that contains 'src' is found.
current = HOME
while 'src' not in os.listdir(current):
    _parent = Path(current).parent
    # The parent of the filesystem root is the root itself; without this guard the
    # loop never terminates when the notebook is started outside the repository.
    if _parent == Path(current):
        raise FileNotFoundError(f"no ancestor of {HOME!r} contains a 'src' directory")
    current = _parent

PARENT_DIR = str(current)

# Make the repository root and its helper folders importable. The membership test
# keeps sys.path free of duplicates when the cell is executed more than once.
for _extra_path in (
    PARENT_DIR,
    os.path.join(PARENT_DIR, 'data_analysis'),
    os.path.join(PARENT_DIR, 'evaluation'),
    os.path.join(PARENT_DIR, 'text_processing'),
):
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)

In [12]:
# Checkpoint directory (relative to the repository root) that a later cell loads
# with AutoModelForSequenceClassification.
_CHECKPOINT_SUBPATH = ('src', 'models', 's2s', 'test-trainer', 'checkpoint-1000')
model_checkpoint = os.path.join(PARENT_DIR, *_CHECKPOINT_SUBPATH)

In [18]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F
from datasets import load_dataset

# Load the processed CSV as a dataset; `nrows=5000` is forwarded to the csv loader.
_processed_csv = os.path.join(PARENT_DIR, 'src', 'data', 'all_data_processed.csv')
sample = load_dataset('csv', data_files=_processed_csv, split='train', nrows=5000)

# Base seq2seq model and the tokenizer that belongs to the same checkpoint.
checkpoint = 'facebook/bart-base'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Sequence classifier restored from the local checkpoint directory.
toxic_classifier = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.


In [19]:
def prepare_labeled_data(batch):
    """Tokenize a batch of source/target text pairs for seq2seq training.

    Args:
        batch: mapping with a 'source' entry (input texts) and a 'target'
            entry (reference texts).

    Returns:
        The tokenizer output for the sources, extended with a 'labels' entry
        that holds the token ids of the tokenized targets.
    """
    encoded_sources = tokenizer(batch['source'], truncation=True)
    # `text_target=` asks the tokenizer to encode the texts as targets.
    encoded_targets = tokenizer(text_target=batch["target"], truncation=True)
    encoded_sources["labels"] = encoded_targets["input_ids"]
    return encoded_sources

In [20]:
from datasets import concatenate_datasets
from transformers import DataCollatorForSeq2Seq
from torch.utils.data import DataLoader

# Hold out the last tenth of the sample for validation. The sizes are derived from
# the dataset length (4500 / 500 for the 5000-row sample) so that a shorter CSV does
# not make `select` fail on out-of-range indices.
_n_val = len(sample) // 10
_n_train = len(sample) - _n_val
train_data = sample.select(range(_n_train))
val_data = sample.select(range(_n_train, len(sample)))

# Tokenize both splits and drop the raw text columns once they have been encoded.
train_data = train_data.map(prepare_labeled_data, batched=True).remove_columns(['source', 'target'])
val_data = val_data.map(prepare_labeled_data, batched=True).remove_columns(['source', 'target'])

# The collator is used as `collate_fn`, so batches are assembled (and padded) per batch.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

# Dataloaders for the custom training loop; only the training data is shuffled.
train_dl = DataLoader(dataset=train_data, batch_size=4, shuffle=True, collate_fn=data_collator)
val_dl = DataLoader(dataset=val_data, batch_size=4, shuffle=False, collate_fn=data_collator)

In [21]:
from torch import nn
from transformers import Trainer
from torch.nn.functional import softmax

class CustomTrainer(Trainer):
    """Trainer whose loss is the seq2seq loss plus a small toxicity penalty.

    The penalty is 0.05 times the mean softmax probability at class index 1 of the
    module-level `toxic_classifier`, evaluated on the greedy (argmax) predictions of
    the model. Relies on the module-level `tokenizer` for the padding token id.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the combined loss for one batch.

        Args:
            model: the seq2seq model being trained.
            inputs: keyword arguments for the model's forward pass; must include
                labels so that the model output carries a `.loss`.
            return_outputs: when True, return `(loss, model_output)`.
            num_items_in_batch: accepted (and ignored) so that Trainer versions
                which pass this keyword can call the override.

        Returns:
            The scalar loss, or `(loss, model_output)` when `return_outputs` is True.
        """
        model_output = model(**inputs)
        # extract the sequence to sequence loss
        s2s_loss = model_output.loss

        # Greedy token choice at every position.
        # NOTE(review): argmax is not differentiable, so the toxicity term below shifts
        # the reported loss but sends no gradient back into `model`.
        prediction_ids = model_output.logits.argmax(dim=-1)

        # Derive the mask from the ids themselves so it lives on the same device;
        # tensors created with torch.zeros/ones default to the CPU and break on GPU.
        attention_mask = (prediction_ids != tokenizer.pad_token_id).long()

        # Keep the classifier on the same device as the predictions (no-op if already there).
        toxic_classifier.to(prediction_ids.device)

        # The classifier is only scored here, so skip building its autograd graph.
        # NOTE(review): this feeds ids from `tokenizer` straight into the classifier;
        # presumably both share a vocabulary - confirm.
        with torch.no_grad():
            toxic_output = toxic_classifier(input_ids=prediction_ids, attention_mask=attention_mask)
            toxic_loss = torch.mean(softmax(toxic_output.logits, dim=1)[:, 1])

        loss = s2s_loss + 0.05 * toxic_loss
        return (loss, model_output) if return_outputs else loss
    

In [24]:
# Train model

from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Hyper-parameters of the baseline seq2seq fine-tuning run.
batch_size = 30
num_epochs = 10
learning_rate = 5e-5
warmup_steps = 500
weight_decay = 0.01

sc_training_args = Seq2SeqTrainingArguments(
    output_dir='seq_2_seq',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    do_train=True,
    do_eval=True,
    logging_steps=100,
    save_steps=100000,
    # NOTE(review): no evaluation strategy is set, so `eval_steps` presumably has
    # no effect on its own - confirm the intended evaluation schedule.
    eval_steps=10,
    overwrite_output_dir=True,
    warmup_steps=warmup_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    num_train_epochs=num_epochs,
    # Mixed precision needs a GPU; enable it only when CUDA is present so the cell
    # also runs on CPU-only machines, matching the device fallback used later on.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

# This run uses the stock Seq2SeqTrainer (plain seq2seq loss), not CustomTrainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=sc_training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator
)

In [25]:
# Run the training loop of the trainer built in the previous cell; the object it
# returns is shown as the cell output.
trainer.train()



You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
                                       
  0%|          | 0/150 [01:02<?, ?it/s]          

{'loss': 3.0458, 'learning_rate': 9.7e-06, 'epoch': 0.67}


                                       
100%|██████████| 150/150 [00:19<00:00,  7.86it/s]

{'train_runtime': 19.0937, 'train_samples_per_second': 235.679, 'train_steps_per_second': 7.856, 'train_loss': 2.8132533772786457, 'epoch': 1.0}





TrainOutput(global_step=150, training_loss=2.8132533772786457, metrics={'train_runtime': 19.0937, 'train_samples_per_second': 235.679, 'train_steps_per_second': 7.856, 'train_loss': 2.8132533772786457, 'epoch': 1.0})

In [31]:
# Test model
# Show one raw (source, target) pair. Despite the variable name, index 4700 falls in
# the 4500-4999 slice that was selected as validation data.
train_sample = sample[4700]
train_sample['source'], train_sample['target']

("it 's not a fucking chimney !", "it 's not a fireplace .")

In [40]:
# Qualitative check: for every 50th validation example print the source, the
# reference target and the model's generation.
for i in range(0, len(val_data), 50):
    # Fetch the example once instead of indexing the dataset for every field.
    example = val_data[i]
    input_ids = example['input_ids']
    attention_mask = example['attention_mask']
    labels = example['labels']

    print(f"source: {tokenizer.decode(input_ids, skip_special_tokens=True)}")
    print(f"target: {tokenizer.decode(labels, skip_special_tokens=True)}")

    # Send the inputs to whatever device the model currently lives on; a hard-coded
    # 'cuda' fails on CPU-only machines.
    outputs = model.generate(
        input_ids=torch.tensor(input_ids).unsqueeze(0).to(model.device),
        attention_mask=torch.tensor(attention_mask).unsqueeze(0).to(model.device),
        max_length=512,
        num_beams=5,
        early_stopping=True
    )

    print(f"generated :{tokenizer.decode(outputs[0], skip_special_tokens=True)}")
    print("#" * 100)

source: what's this shit your little crew buddy person's been talking about
target: i wonder what your friend person is talking about.
generated :what is this thing your little crew person person's been talking about
####################################################################################################
source: you think that shit is just bad luck
target: do you think it was bad luck
generated :you think that is just bad luck
####################################################################################################
source: if it's ellsworth apprising you of the widow, let him fucking continue and do likewise for me.
target: if person tells you about the widow...... then leave him to testify and do the same for me.
generated :if it's not the widow, let him continue and do likewise for me.
####################################################################################################
source: ay, don't be an idiot.
target: person, don't be crazy.
generated :don

In [35]:
from train import train_custom_seq2seq
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from torch.optim import Adam
from torch.optim.lr_scheduler import LinearLR

# Pretrained toxicity classifier and its tokenizer, both loaded from the same
# checkpoint id. This rebinds the `toxic_classifier` name.
_TOXIC_CLASSIFIER_CHECKPNT = 'SkolkovoInstitute/roberta_toxicity_classifier'
toxic_tokenizer = AutoTokenizer.from_pretrained(_TOXIC_CLASSIFIER_CHECKPNT)
toxic_classifier = AutoModelForSequenceClassification.from_pretrained(_TOXIC_CLASSIFIER_CHECKPNT)

# Training parameters for the custom loop.
# Learning rate: same value as the one used in pytorch_modular/pytorch_utilities.py
# (My_Kaggle_Repo).
optimizer = Adam(params=model.parameters(), lr=2 * 10 ** -5)
# Scale the learning rate linearly from factor 1 to factor 0.5 over 100 scheduler steps.
scheduler = LinearLR(
    optimizer,
    start_factor=1,
    end_factor=0.5,
    total_iters=100,
)

Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [36]:
# Pull one collated batch from the training dataloader and print the tensor shapes
# of its three entries on a single line.
b = next(iter(train_dl))
print(*(b[key].shape for key in ('input_ids', 'attention_mask', 'labels')))

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


torch.Size([4, 18]) torch.Size([4, 18]) torch.Size([4, 13])


In [37]:
# Smoke test: one forward pass over the sampled batch, on the GPU when one is available.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

model.to(device)
b = {name: tensor.to(device) for name, tensor in b.items()}
model(**b)

ValueError: Expected input batch_size (68) to match target batch_size (48).

In [19]:
# let's write a function to compute the summarization + toxicity loss
from src.evaluation.toxicity_classication import EvalutionSingletonInitializer
from torch.nn.functional import softmax
from typing import Union

def toxic_summary_model_loss(output_decoded: Union[str, list],
                             device,
                             return_tensor: bool=False) -> Union[float, torch.Tensor]:
    """Score decoded text with the project's toxicity classifier.

    The tokenizer and classifier are obtained from `EvalutionSingletonInitializer`.
    The score is the mean, over the batch, of the softmax probability at class
    index 1 (presumably the "toxic" class - confirm against the classifier's labels).

    Args:
        output_decoded: decoded text to score. It is handed directly to the toxicity
            tokenizer, so a string or a list of strings is expected, not a tensor
            of token ids.
        device: device on which the classifier is run.
        return_tensor: when True return the score as a tensor, otherwise as a float.

    Returns:
        The mean class-1 probability, as a float or as a 0-dim tensor.
    """
    singleton_obj = EvalutionSingletonInitializer()
    tc_tokenizer = singleton_obj.get_toxic_tokenizer()
    tc_classifier = singleton_obj.get_toxic_classifier()

    # The classifier is a fixed scorer: freeze its weights and switch off
    # training-only layers such as dropout so the score is deterministic.
    for p in tc_classifier.parameters():
        p.requires_grad_(False)
    tc_classifier.eval()
    tc_classifier.to(device)

    # tokenize, then move every tensor of the encoding to the target device
    model_input = tc_tokenizer(output_decoded, return_tensors='pt', padding=True, truncation=True)
    model_input = {k: v.to(device) for k, v in model_input.items()}

    # pass through the model
    output = tc_classifier(**model_input)
    loss = torch.mean(softmax(output.logits, dim=1)[:, 1])

    if return_tensor:
        # With frozen weights and integer inputs the score carries no autograd
        # history. Flagging it keeps `.backward()` callable on the result, but no
        # gradient reaches the model that produced the text.
        loss.requires_grad = True
        return loss

    return loss.item()


In [None]:
# Start the project's custom training routine with the objects prepared above.
train_custom_seq2seq(
    # data
    train_dataloader=train_dl,
    val_dataloader=val_dl,
    # seq2seq model and its tokenizer
    model=model,
    tokenizer=tokenizer,
    # toxicity scorer and the function that turns its output into a loss term
    toxic_tokenizer=toxic_tokenizer,
    toxic_classifier=toxic_classifier,
    toxicity_loss_function=toxic_summary_model_loss,
    toxicity_coeff=0.5,
    # optimisation
    optimizer=optimizer,
    scheduler=scheduler,
    # schedule and reporting
    num_epochs=2,
    report_per_epoch=1,
    log_dir=os.path.join(HOME, 'runs'),
)

        