# Imports and Data Loading

In [23]:
# the notebook's main objective is to filter and prepare the dataset to train a summarizer on it.
import os, sys
from pathlib import Path
# Resolve the notebook's working directory once; the data folder is a sibling
# of it (it lives in the parent directory of the working directory).
HOME = os.getcwd()
DATA_FOLDER = str(Path(HOME).parent / 'data')
data_path = str(Path(DATA_FOLDER) / 'filtered.tsv')

In [24]:
# Walk up from the working directory until we reach the directory that contains
# a `src` entry, so that `src.*` imports resolve from this notebook.
current = HOME
while 'src' not in os.listdir(current):
    if Path(current).parent == Path(current):
        # The parent of the filesystem root is the root itself: without this
        # guard the loop would spin forever when no ancestor contains `src`.
        raise FileNotFoundError(
            f"could not find a 'src' directory in '{HOME}' or in any of its parents"
        )
    current = Path(current).parent

sys.path.append(str(current))
# NOTE(review): the training log at the end of this notebook shows it running from
# <root>/src/data_analysis, which suggests these folders live under `src/` rather
# than directly under `current` -- confirm the three entries below point to real directories.
sys.path.append(os.path.join(str(current), 'data_analysis'))
sys.path.append(os.path.join(str(current), 'evaluation'))
sys.path.append(os.path.join(str(current), 'text_processing'))

In [25]:
# let's load the data as a HF Dataset
from datasets import load_dataset
# build the CSV path first, then ask the 'csv' builder for the 'train' split directly
fixed_csv_path = os.path.join(DATA_FOLDER, 'fixed.csv')
original_data = load_dataset('csv', data_files=fixed_csv_path, split='train')

In [26]:
# Keep only the 'source' and 'target' fields.
# The columns to drop are derived from the dataset itself instead of a hard-coded
# list, so re-running this cell (when the extra columns are already gone) is a
# no-op rather than an error about missing columns.
KEPT_COLUMNS = ('source', 'target')
extra_columns = [name for name in original_data.column_names if name not in KEPT_COLUMNS]
if extra_columns:
    original_data = original_data.remove_columns(extra_columns)
original_data

Dataset({
    features: ['source', 'target'],
    num_rows: 577777
})

# Original Dataset

In [27]:
# Work on a fixed random subset: shuffle deterministically, then keep the first 1000 rows.
shuffled_data = original_data.shuffle(seed=69)
sample = shuffled_data.select(range(1000))
sample

Dataset({
    features: ['source', 'target'],
    num_rows: 1000
})

In [28]:
from src.text_processing import preprocess as pr
def process_text(text: str) -> str:
    """Run one string through the project's text-cleaning helpers.

    The helpers are applied in this order: `pr.to_lower`, then
    `pr.no_extra_chars`, then `pr.no_extra_spaces`; the final result is returned.
    """
    lowered = pr.to_lower(text)
    without_extra_chars = pr.no_extra_chars(lowered)
    return pr.no_extra_spaces(without_extra_chars)

In [29]:
# the first step is to process the data 
def process_sample_text(batch):
    """Clean every text of every column in a batch.

    `batch` maps each column name to the list of its values (the layout produced by
    a batched `Dataset.map`). A new dict with the same keys is returned, where each
    value has been passed through `process_text`.
    """
    return {
        column: [process_text(text) for text in texts]
        for column, texts in batch.items()
    }


# clean the 'source' and 'target' texts batch by batch
sample = sample.map(function=process_sample_text, batched=True)

In [30]:
# the next step is to filter the dataset
def filter_data(sample):
    """Tell whether a single example should be kept.

    The function is used with a non-batched `Dataset.filter`, so `sample` is one
    example with a 'source' and a 'target' text. Both texts are tokenized with
    `pr.tokenize(..., tokenizer_type='word')`; the example is kept only when the
    source has strictly more tokens than the target (equal lengths are dropped).
    """
    n_source_tokens = len(pr.tokenize(sample['source'], tokenizer_type='word'))
    n_target_tokens = len(pr.tokenize(sample['target'], tokenizer_type='word'))
    return n_target_tokens < n_source_tokens

# keep only the examples whose source is longer (in tokens) than its target
sample = sample.filter(function=filter_data)

In [31]:
# display the sample after the length filter to see how many rows are left
sample

Dataset({
    features: ['source', 'target'],
    num_rows: 458
})

# Auxiliary Dataset

In [32]:
from transformers import set_seed
# set the seed for reproducibility (the call used to live only inside this comment,
# so the seed was never actually set)
set_seed(69)
aux_data = load_dataset('s-nlp/paradetox')['train'] # the dataset has only one split: 'train'
# rename the columns to the 'source' / 'target' convention used by the original data, then shuffle
aux_data = (
    aux_data
    .rename_column('en_toxic_comment', 'source')
    .rename_column('en_neutral_comment', 'target')
    .shuffle(seed=69)
)
# split the shuffled data: rows [0, 1000) -> validation, [1000, 2000) -> test, the rest -> train
train_split, val_split, test_split = aux_data.select(range(2000, len(aux_data))), aux_data.select(range(1000)), aux_data.select(range(1000, 2000))

In [33]:
# display the three splits to check their columns and sizes
train_split, val_split, test_split

(Dataset({
     features: ['source', 'target'],
     num_rows: 17744
 }),
 Dataset({
     features: ['source', 'target'],
     num_rows: 1000
 }),
 Dataset({
     features: ['source', 'target'],
     num_rows: 1000
 }))

In [34]:
from datasets import concatenate_datasets
# the training set is the auxiliary train split extended with the filtered sample of
# the original data; validation and test data come from the auxiliary dataset only
train_data = concatenate_datasets([train_split, sample])
val_data = val_split
test_data = test_split

# DataLoaders

In [35]:
# the next step is to tokenize the data  
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
CHECKPOINT = 't5-small'

# use the GPU whenever PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# tokenizer and model are both loaded from the same checkpoint
TOKENIZER = AutoTokenizer.from_pretrained(CHECKPOINT)
MODEL = AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT)
MODEL = MODEL.to(DEVICE)

In [36]:
TASK_PREFIX = 'summarize: '

def prepare_labeled_data(batch):
    """Tokenize a batch of 'source' / 'target' texts for sequence-to-sequence training.

    Every source text is prepended with `TASK_PREFIX` and tokenized (truncated to at
    most 1028 tokens). The targets are tokenized as `text_target`, and their
    'input_ids' are stored under the 'labels' key of the returned encoding.
    """
    # add the task prefix to each sentence
    prefixed_sources = []
    for doc in batch["source"]:
        prefixed_sources.append(TASK_PREFIX + doc)

    # tokenize the inputs ('x')
    encoded = TOKENIZER(prefixed_sources, truncation=True, max_length=1028)
    # tokenize the targets ('y')
    target_encoding = TOKENIZER(text_target=batch["target"], truncation=True)

    # attach the target ids to the model's input
    encoded["labels"] = target_encoding["input_ids"]
    return encoded

In [37]:
# tokenize both splits the same way and drop the raw text columns afterwards
train_data, val_data = (
    split.map(prepare_labeled_data, batched=True).remove_columns(['source', 'target'])
    for split in (train_data, val_data)
)

Map: 100%|██████████| 1458/1458 [00:00<00:00, 22508.19 examples/s]


In [39]:
# let's choose a small portion of the data to experiment with
# (never ask for more rows than the dataset actually holds)
train_data = train_data.shuffle(seed=69).select(range(min(500, len(train_data))))
# only a cell's last expression is rendered, so the inspection goes at the end
train_data, val_data

In [40]:
# create a DataCollator for padding for the sequence to sequence models
from transformers import DataCollatorForSeq2Seq
# The collator expects the model object itself (not the checkpoint name): it uses the
# model's `prepare_decoder_input_ids_from_labels` to build the decoder inputs, which
# a plain string cannot provide.
data_collator = DataCollatorForSeq2Seq(tokenizer=TOKENIZER, model=MODEL)
# we are now ready to create the dataloader
from torch.utils.data import DataLoader
# only the training batches are shuffled; validation keeps a fixed order
train_dl = DataLoader(dataset=train_data, batch_size=4, shuffle=True, collate_fn=data_collator)
val_dl = DataLoader(dataset=val_data, batch_size=4, shuffle=False, collate_fn=data_collator)

In [41]:
# make sure the data is loaded correctly: pull one batch out of each loader
b1 = next(iter(train_dl))
b2 = next(iter(val_dl))

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


# Model Training

## Training Code

## Train utilities

In [43]:
from src.evaluation import toxicity_classication as tc
# fetch the toxicity classifier, its tokenizer and its device from the project's singleton
singleton_obj = tc.EvalutionSingletonInitializer()
tx_classifier = singleton_obj.get_toxic_classifier()
tx_tokenizer = singleton_obj.get_toxic_tokenizer()
tx_device = singleton_obj.get_device()
# let's define some of the training parameters
from torch.optim import Adam
from torch.optim.lr_scheduler import LinearLR

# lr: the same value as the one used in the author's `pytorch_utilities.py` helper module
optimizer = Adam(MODEL.parameters(), lr=2e-5)
# the learning-rate factor goes linearly from 1 down to 0.5 over the first 100 scheduler steps
scheduler = LinearLR(optimizer=optimizer, start_factor=1, end_factor=0.5, total_iters=100)

In [49]:
# let's write a function to compute the summarization + toxicity loss
from src.evaluation.toxicity_classication import EvalutionSingletonInitializer
from torch.nn.functional import softmax
from typing import Union

def toxic_summary_model_loss(output_decoded: torch.Tensor, 
                             device,
                             return_tensor: bool=False) -> Union[float, torch.Tensor]:
    """Score a batch of generated texts with the toxicity classifier.

    The texts are tokenized with the classifier's own tokenizer, passed through the
    classifier, and the mean softmax probability of class index 1 is returned.

    Args:
        output_decoded: the texts to score. NOTE(review): annotated as a tensor, but the
            value is handed straight to the tokenizer, so it is presumably the decoded
            text (a string or a list of strings) -- confirm against the caller.
        device: device on which the classifier and its inputs are placed.
        return_tensor: when True the loss is returned as a tensor with
            `requires_grad=True`; otherwise as a Python float.

    Returns:
        The mean probability of class index 1 (presumably the 'toxic' class -- verify
        against the classifier's label mapping), as a float or a 0-dim tensor.
    """
    singleton_obj = EvalutionSingletonInitializer()
    tc_tokenizer, tc_classifier = singleton_obj.get_toxic_tokenizer(), singleton_obj.get_toxic_classifier()

    # Freeze the classifier fetched just above. The loop used to iterate over the
    # notebook-level `tx_classifier`, which made this function depend on another
    # cell having been executed first.
    for p in tc_classifier.parameters():
        p.requires_grad = False

    tc_classifier.to(device)
    # tokenize
    model_input = tc_tokenizer(output_decoded, return_tensors='pt', padding=True, truncation=True)
    # set the input to the device
    model_input = {k: v.to(device) for k, v in model_input.items()}
    # pass through the model
    output = tc_classifier(**model_input)

    # mean probability assigned to class index 1 over the batch
    loss = torch.mean(softmax(output.logits, dim=1)[:, 1])

    if return_tensor:
        # NOTE(review): with frozen parameters and freshly tokenized text as input, this
        # tensor has no graph connecting it to the model that generated the text; the
        # flag only lets `backward()` run on it, it does not route gradients upstream.
        loss.requires_grad = True
        return loss

    return loss.item()


In [56]:
from src.data_analysis.models import summarizer as ss
import src.training_utilities.exp_tracking as et
import importlib 
# pick up any edits made to the project modules since they were first imported
importlib.reload(ss)
importlib.reload(et)

# gather the training configuration in one place, then launch the training
training_kwargs = dict(
    train_dataloader=train_dl,
    val_dataloader=val_dl,
    summary_model=MODEL,
    summary_tokenizer=TOKENIZER,
    toxicity_loss_function=toxic_summary_model_loss,
    toxicity_coeff=0.5,
    optimizer=optimizer,
    scheduler=scheduler,
    num_epochs=2,
    report_per_epoch=1,
    log_dir=os.path.join(HOME, 'runs'),
)

# only the third returned value (the best model) is kept
_, _, best_model = ss.train_custom_summarizer(**training_kwargs)


[INFO] Created SummaryWriter, saving to: /home/ayhem18/DEV/TextDetoxification/src/data_analysis/runs/experience_6...


 50%|█████     | 1/2 [00:39<00:39, 39.56s/it]

Training losses
train_loss: 1.3837907037734984
loss: 1.1527388542890549
toxic_loss: 0.5399586112574761
##################################################



100%|██████████| 2/2 [01:19<00:00, 39.59s/it]

Training losses
train_loss: 1.375048008441925
loss: 1.1357569699287415
toxic_loss: 0.5365462037323451
##################################################




