# Imports and Data Loading

In [1]:
# the notebook's main objective is to filter and prepare the dataset to train a summarizer on it.

import os, sys
from pathlib import Path
HOME = os.getcwd()
DATA_FOLDER = os.path.join(Path(HOME).parent, 'data')
data_path = os.path.join(DATA_FOLDER, 'filtered.tsv')


In [2]:
current = HOME 
while 'src' not in os.listdir(current):
    current = Path(current).parent

sys.path.append(str(current))
sys.path.append(os.path.join(str(current), 'data_analysis'))
sys.path.append(os.path.join(str(current), 'evaluation'))
sys.path.append(os.path.join(str(current), 'text_processing'))

In [3]:
# let's load the data as a HF Dataset
from datasets import load_dataset
original_data = load_dataset('csv', data_files=os.path.join(DATA_FOLDER, 'fixed.csv'), split='train')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# let's start with removing all fields but 'source' and 'target'ArithmeticError
original_data = original_data.remove_columns(['source_tox', 'target_tox', 'similarity', 'lenght_diff'])
original_data

Dataset({
    features: ['source', 'target'],
    num_rows: 577777
})

# Original Dataset

In [5]:
sample = original_data.shuffle(seed=69).select(range(1000))
sample

Dataset({
    features: ['source', 'target'],
    num_rows: 1000
})

In [6]:
from src.text_processing import preprocess as pr
def process_text(text: str) -> str:
    return pr.no_extra_spaces(pr.no_extra_chars(pr.to_lower(text)))

In [7]:
# the first step is to process the data 
def process_sample_text(batch):
    """This function recieves  batch of samples from the original data. It returns a new batch where each
    'source' and 'target' text data will be processed using the function above
    """
    new_batch = dict([(k, [process_text(t) for t in v]) for k, v in batch.items()])
    return new_batch

sample = sample.map(process_sample_text, batched=True)

In [8]:
# the next step is to filter the dataset
def filter_data(sample):
    """This function receives  a batch of samples from the original data and filters those whose 'source' text is shorter than the 'target' text."""
    # first tokenize each 'source' and 'target' fields
    source = pr.tokenize(sample['source'], tokenizer_type='word')
    target = pr.tokenize(sample['target'], tokenizer_type='word')
    return len(source) > len(target)

sample = sample.filter(filter_data)

In [9]:
sample

Dataset({
    features: ['source', 'target'],
    num_rows: 458
})

# Auxiliary Dataset

In [12]:
from transformers import set_seed
# set the seed for reproducibility, set_seed(69)
aux_data = load_dataset('s-nlp/paradetox')['train'] # the dataset has only one split: 'train'
# shuffle the data
aux_data = aux_data.rename_column('en_toxic_comment', 'source').rename_column('en_neutral_comment', 'target').shuffle(seed=69)
# split the data
train_split, val_split, test_split = aux_data.select(range(2000, len(aux_data))), aux_data.select(range(1000)), aux_data.select(range(1000, 2000))

In [13]:
train_split, val_split, test_split

(Dataset({
     features: ['source', 'target'],
     num_rows: 17744
 }),
 Dataset({
     features: ['source', 'target'],
     num_rows: 1000
 }),
 Dataset({
     features: ['source', 'target'],
     num_rows: 1000
 }))

In [14]:
from datasets import concatenate_datasets
train_data, val_data, test_data = concatenate_datasets([train_split, sample]), val_split, test_split

# DataLoaders

In [15]:
# the next step is to tokenize the data  
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
CHECKPOINT = 't5-small'
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
TOKENIZER = AutoTokenizer.from_pretrained(CHECKPOINT)
MODEL = AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT).to(DEVICE)

In [16]:
TASK_PREFIX = 'summarize: '

def prepare_labeled_data(batch):
    # add the task predix to each sentence
    inputs = [TASK_PREFIX + doc for doc in batch["source"]]
    # tokenize 'x'
    model_inputs = TOKENIZER(inputs, truncation=True, max_length=1028)
    # tokenize 'y'  
    labels = TOKENIZER(text_target=batch["target"], truncation=True, max_length=124)
    # add it to the model's input
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [17]:
train_data = train_data.map(prepare_labeled_data, batched=True).remove_columns(['source', 'target'])
val_data = val_data.map(prepare_labeled_data, batched=True).remove_columns(['source', 'target'])

In [18]:
train_data, val_data
# let's choose a small portion of the data to experiment with
train_data = train_data.shuffle(seed=69).select(range(2000))

In [19]:
# create a DataCollator for padding for the sequence to sequence models
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer=TOKENIZER, model=CHECKPOINT)
# we are now ready to create the dataloader
from torch.utils.data import DataLoader
train_dl = DataLoader(dataset=train_data, batch_size=4, shuffle=True, collate_fn=data_collator)
val_dl = DataLoader(dataset=val_data, batch_size=4, shuffle=False, collate_fn=data_collator)

In [20]:
# make sure the data is loaded correctly
b1, b2 = next(iter(train_dl)), next(iter(val_dl))

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


# Model Training

## Training Code

In [21]:
from transformers import AutoModelForSequenceClassification

# let's build the training loop: first the train_per_epoch function
def train_per_epoch(train_dataloader, 
                    summary_model: AutoModelForSeq2SeqLM, # the summarization model to customize 
                    summary_tokenizer: AutoTokenizer, 
                    toxicity_loss_function: callable, 
                    toxicity_coeff: float,
                    optimizer: torch.optim, 
                    scheduler: torch.optim.lr_scheduler,
                    device: str,
                    ):
    
    # make sure to save the reesults
    summary_loss = 0
    toxic_loss = 0
    train_loss = 0

    for batch in train_dataloader:
        # first put all the data into the device
        batch = {k: v.to(device) for k, v in batch.items()}
        # the next step is to pass the data to the model 
        model_output = summary_model(**batch)
        # extract the sequence to sequence loss
        sq2sq_loss = model_output.loss 
        
        # save the sequence to sequence loss
        summary_loss += sq2sq_loss.item()

        # time to build the toxicity loss
        # 1. generate the output in textual form: this can be done through the generate model, so it expects different input
        batch_generate = {k: v for k, v in batch.items() if k != 'labels'}    
        output_decoded = summary_tokenizer.batch_decode(summary_model.generate(**batch_generate), skip_special_tokens=True)
        tc_loss = toxicity_loss_function(output_decoded, device=device)
        
        # save the toxicity loss
        toxic_loss += tc_loss.item()

        # the final loss is a linear a combination of the seq2seq loss and the classification loss
        final_loss = sq2sq_loss + tc_loss * toxicity_coeff
        train_loss += final_loss

        # the usual part of the training loop
        final_loss.backward() 

        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    
    # make sure to average the loss across the different batches
    summary_loss /= len(train_dataloader)
    toxic_loss /= len(train_dataloader)
    train_loss /= len(train_dataloader)
    
    
    # make sure to return the losses
    return {"summary_loss": summary_loss, "toxic_loss": toxic_loss, "train_loss": train_loss}


In [22]:
# let's define the validation epoch
def val_per_epoch(val_dataloader, 
                    summary_model: AutoModelForSeq2SeqLM, # the summarization model to customize 
                    summary_tokenizer: AutoTokenizer, 
                    toxicity_loss_function: callable, 
                    toxicity_coeff: float,
                    device: str
                    ):    
    summary_loss = 0
    toxic_loss = 0
    val_loss = 0

    for batch in val_dataloader:
        # first put all the data into the device
        batch = {k: v.to(device) for k, v in batch.items()}
        # the next step is to pass the data to the model 
        model_output = summary_model(**batch)
        # extract the sequence to sequence loss
        sq2sq_loss = model_output.loss 
        
        # save the sequence to sequence loss
        summary_loss += sq2sq_loss.item()

        # time to build the toxicity loss
        # 1. generate the output in textual form: this can be done through the generate model, so it expects different input
        batch_generate = {k: v for k, v in batch.items() if k != 'labels'}    
        output_decoded = summary_tokenizer.batch_decode(summary_model.generate(**batch_generate), skip_special_tokens=True)
        tc_loss = toxicity_loss_function(output_decoded, device=device)
        
        # save the toxicity loss
        toxic_loss += tc_loss.item()

        # the final loss is a linear a combination of the seq2seq loss and the classification loss
        final_loss = sq2sq_loss + tc_loss * toxicity_coeff
        val_loss += final_loss

    # make sure to average the loss across the different batches
    summary_loss /= len(val_dataloader)
    toxic_loss /= len(val_dataloader)
    train_loss /= len(val_dataloader)
    
    
    # make sure to return the losses
    return {"summary_loss": summary_loss, "toxic_loss": toxic_loss, "val_loss": train_loss}        


In [23]:
# time to write the entire training loop
from torch.utils.tensorboard import SummaryWriter
from src.training_utilities.exp_tracking import create_summary_writer
from typing import Dict, List, Tuple

def _set_summary_writer(writer: SummaryWriter,
                        train_losses_dict: Dict, 
                        val_losses_dict: Dict,
                        epoch:int) -> None:
    
    if writer is not None:
        # track the losses in the training stage
        writer.add_scalar(main_tag='Train_Losses', 
                          tag_scalar_dict=train_losses_dict, 
                          global_step=epoch)

        # track the losses in the evaluation stage
        writer.add_scalar(main_tag='Validation_Losses', 
                          tag_scalar_dict=val_losses_dict,
                          global_step=epoch)

        writer.close()


In [24]:
def report_results(train_losses_dict: Dict, 
                      val_losses_dict: Dict):
    
    print("Training losses")
    
    for k, v in train_losses_dict:
        print(f"{k}: {v}")

    for k, v in val_losses_dict:
        print(f"{k}: {v}")

    print("#" * 50)
    print()

In [25]:
from src.training_utilities.pytorch_utilities import save_model, cleanup

def train_custom_summarizer(train_dataloader: DataLoader,
                            val_dataloader: DataLoader, 
                            
                            summary_model: AutoModelForSeq2SeqLM, 
                            summary_tokenizer: AutoTokenizer,
                            
                            toxicity_loss_function: callable, 
                            toxicity_coeff: float,
                            
                            optimizer: torch.optim, 
                            scheduler: torch.optim.lr_scheduler,

                            num_epochs: int = 5,
                            device: str = None,
                            log_dir: str = None, 
                            report_per_epoch: int = 5, 
                            ) -> Tuple[Dict[str, List[float]]]:
    
    # make sure to cleanup the GPU memory before starting to traing
    cleanup()
    
    # set the default device
    device = ('cuda' if torch.cuda.is_available() else 'cpu') if device is None else device 
    
    # set the SummaryWriter for visualization
    writer, save_path  = (None, None) if log_dir is None else (create_summary_writer(log_dir, return_path=True))

    # save the results somewhere
    train_losses = {"summary_loss": [], "toxic_loss": [], "train_loss": []}
    val_losses = {"summary_loss": [], "toxic_loss": [], "val_loss": []}

    best_train_loss = float('inf')
    best_model = None

    for epoch in range(1, num_epochs + 1):
        # first the training part
        train_results = train_per_epoch(train_dataloader=train_dataloader, 
                                summary_model=summary_model,
                                summary_tokenizer=summary_tokenizer, 
                                toxicity_loss_function=toxicity_loss_function, 
                                toxicity_coeff=toxicity_coeff,
                                optimizer=optimizer, 
                                scheduler=scheduler, 
                                device=device
                                )
        
        val_results = val_per_epoch(val_dataloader=val_dataloader,
                                    summary_model=summary_model,
                                    summary_tokenizer=summary_tokenizer, 
                                    toxicity_loss_function=toxicity_loss_function, 
                                    toxicity_coeff=toxicity_coeff,
                                    device=device
                                    )

        # make sure to save the best model
        if best_model is None:
            best_model = summary_model

        best_model = summary_model if (train_results['train_loss'] < best_train_loss) else best_model
        # update 'best_train_loss' 
        best_train_loss = min([best_train_loss, train_results['train_loss']])

        # make sure to track the resutls
        for k, v in train_results:
            train_losses[k].append(v)

        for k, v in val_results:
            val_losses[k].append(v)            

        # add the results for visualization
        _set_summary_writer(writer, 
                            train_losses_dict=train_results, 
                            val_losses_dict=val_results)

        # report the results if needed  
        if epoch % report_per_epoch == 0: 
            report_results(train_losses_dict=train_results, val_losses_dict=val_results)

    # at the end save the model
    save_model(model=best_model, path=save_path)
    # return the results
    return train_losses, val_losses, best_model            


## Train utilities

In [26]:
from src.evaluation import toxicity_classication as tc
singleton_obj = tc.EvalutionSingletonInitializer()
tx_classifier, tx_tokenizer, tx_device = singleton_obj.get_toxic_classifier(), singleton_obj.get_toxic_tokenizer(), singleton_obj.get_device()
# let's define some of the training parameters
from torch.optim import Adam
from torch.optim.lr_scheduler import LinearLR

# lr: the same as the one used in the/home/ayhem18/DEV/My_Kaggle_Repo/pytorch_modular/pytorch_utilities.py
optimizer = Adam(MODEL.parameters(), lr=2 * 10 ** -5)
scheduler = LinearLR(optimizer=optimizer, start_factor=1, end_factor=0.5,total_iters=100)

Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [27]:
# let's write a function to compute the summarization + toxicity loss
from src.evaluation.toxicity_classication import EvalutionSingletonInitializer
from torch.nn.functional import softmax

def toxic_summary_model_loss(output_decoded: torch.Tensor, 
                             device):
    
    singleton_obj = EvalutionSingletonInitializer()
    tc_tokenizer, tc_classifier = singleton_obj.get_toxic_tokenizer(), singleton_obj.get_toxic_classifier()

    # make sure to freeze their parameters
    for p in tx_classifier.parameters():
        p.requires_grad = False

    tc_classifier.to(device)
    # tokenize
    model_input = tc_tokenizer(output_decoded, return_tensors='pt', padding=True, truncation=True)
    # set the input to the device
    model_input = {k: v.to(device) for k, v in model_input.items()}
    # pass through the model
    output = tc_classifier(**model_input)
    
    loss = torch.mean(softmax(output.logits, dim=1)[:, 1])

    loss.requires_grad = True

    return loss

In [28]:
# now the model is ready to train
_, _, best_model = train_custom_summarizer(train_dataloader=train_dl, 
                                            val_dataloader=val_dl,
                                            summary_model=MODEL,
                                            summary_tokenizer=TOKENIZER, 
                                            toxicity_loss_function=toxic_summary_model_loss,
                                            toxicity_coeff=0.5, 
                                            optimizer=optimizer, 
                                            scheduler=scheduler, 
                                            num_epochs=3,
                                            report_per_epoch=1,
                                            log_dir=os.path.join(HOME, 'runs')
                                            )

[INFO] Created SummaryWriter, saving to: /home/ayhem18/DEV/TextDetoxification/src/data_analysis/runs/experience_3...




OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacty of 7.79 GiB of which 15.75 MiB is free. Including non-PyTorch memory, this process has 7.76 GiB memory in use. Of the allocated memory 7.49 GiB is allocated by PyTorch, and 99.85 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF