# Data preprocessing. MUST BE MOVED TO src/data/...

## Reading

In [1]:
import pandas as pd
import torchmetrics as torchmetrics

extracted_dir = '../data/interim/'
tsv_path = extracted_dir + 'filtered.tsv'
tsv_file = pd.read_csv(tsv_path, sep='\t', index_col=0)

tsv_file.head()

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348


In [2]:
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

## Dataset

In [19]:
import torch
import nltk
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset
from torchtext.vocab import build_vocab_from_iterator
from collections import Counter

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)


class DeToxicityDataset(Dataset):
    def __init__(self, dataframe, to_remove_word_cnt=5, vocab = None, tox_diff=0.3):
        self.df = dataframe
        self._preprocess_sentences(to_remove_word_cnt, tox_diff)
        assert len(self.references) == len(self.translations)
        self.vocab = vocab or self._create_vocab()

    def _preprocess_sentences(self, to_remove_word_cnt, tox_diff):
        # Swap all ref with trn where toxicity level is greater in ref
        to_swap = self.df['ref_tox'] < self.df['trn_tox']
        self.df.loc[to_swap, ['reference', 'translation']] = self.df.loc[to_swap, ['translation', 'reference']].values
        self.df.loc[to_swap, ['ref_tox', 'trn_tox']] = self.df.loc[to_swap, ['trn_tox', 'ref_tox']].values

        # Delete all rows where difference between ref_tox and trn_tox is less than tox_diff
        self.df = self.df[self.df['ref_tox'] - self.df['trn_tox'] >= tox_diff]

        # Tokenize sentences
        self.df['tokenized_reference'] = self.df['reference'].apply(lambda text: word_tokenize(text))
        self.df['tokenized_translation'] = self.df['translation'].apply(lambda text: word_tokenize(text))

        # Collect all words and count their occurrence in sentences
        all_sent = self.df['tokenized_translation'].tolist() + self.df['tokenized_reference'].tolist()
        all_words = [word for sent in all_sent for word in sent]
        token_counts = Counter(all_words)

        # Remove all words which occur less or equal than 'to_remove_word_cnt'
        unique_words = set(all_words)
        for word in token_counts:
            if token_counts[word] <= to_remove_word_cnt:
                unique_words.remove(word)

        # Leave only approved words in tokenized sentences
        self.df['tokenized_reference'] = self.df['tokenized_reference'].apply(lambda tokens: [word for word in tokens if word in unique_words])
        self.df['tokenized_translation'] = self.df['tokenized_translation'].apply(lambda tokens: [word for word in tokens if word in unique_words])

        # self.df = self.df[self.df['tokenized_reference'].apply(lambda x: len(x) <= max_sent_len)]
        # self.df = self.df[self.df['tokenized_translation'].apply(lambda x: len(x) <= max_sent_len)]
        # self.df['tokenized_reference'] = self.df['tokenized_reference'].apply(lambda tokens: [special_symbols[2]] + tokens + [special_symbols[3]])
        # self.df['tokenized_translation'] = self.df['tokenized_translation'].apply(lambda tokens: [special_symbols[2]] + tokens + [special_symbols[3]])
        self.references = self.df['tokenized_reference'].tolist()
        self.translations = self.df['tokenized_translation'].tolist()

    def _create_vocab(self):
        # creates vocabulary that is used for encoding
        # the sequence of tokens (splitted sentence)
        vocab = build_vocab_from_iterator(self.references + self.translations, specials=special_symbols)
        vocab.set_default_index(UNK_IDX)
        return vocab

    def _get_reference(self, index: int) -> list:
        # retrieves sentence from dataset by index
        sent = self.references[index]
        return self.vocab(sent)

    def _get_translation(self, index: int) -> list:
        # retrieves translation from dataset by index
        sent = self.translations[index]
        return self.vocab(sent)

    def __len__(self) -> int:
        return len(self.references)

    def __getitem__(self, index) -> tuple[list, list]:
        return self._get_reference(index), self._get_translation(index)

## Dataset Creation

In [20]:
from sklearn.model_selection import train_test_split

VALIDATION_RATIO = 0.2
train_dataframe, val_dataframe = train_test_split(tsv_file, test_size=VALIDATION_RATIO, random_state=123)

In [21]:
train_dataset = DeToxicityDataset(train_dataframe)
val_dataset = DeToxicityDataset(val_dataframe, vocab=train_dataset.vocab)

In [18]:
train_dataset.vocab.lookup_tokens(train_dataset[0][1])

['<bos>',
 'everyone',
 'wants',
 'to',
 'know',
 'if',
 'you',
 "'re",
 'scared',
 '.',
 '<eos>']

## Dataloader

In [28]:
from torch.utils.data import DataLoader

batch_size = 128
max_size = 50

device = 'cuda' if torch.cuda.is_available() else 'cpu'

def collate_batch(batch: list):
    references_batch, translations_batch = [], []
    for _ref, _trn in batch:
        _ref, _trn = _ref[:max_size], _trn[:max_size]
        if len(_ref) < max_size:
            _ref = [PAD_IDX] * (max_size - len(_ref)) + _ref
        if len(_trn) < max_size:
            _trn = [PAD_IDX] * (max_size - len(_trn)) + _trn
        references_batch.append(torch.tensor(_ref))
        translations_batch.append(torch.tensor(_trn))

    return torch.stack(references_batch), torch.stack(translations_batch)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [29]:
# just to check that all shapes are correct

for batch in train_dataloader:
    inp, out = batch
    print(inp.shape)
    print(out.shape)
    break

torch.Size([128, 50])
torch.Size([128, 50])


## Model

In [58]:
import torch.nn as nn

class DeToxicTranslator(nn.Module):
    def __init__(self,  vocab_size, embedding_dim, lstm_dim, lstm_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, lstm_dim, num_layers=lstm_layers, batch_first=True)
        self.fc = nn.Sequential(
            # nn.Linear(lstm_dim * 2, lstm_dim),
            # nn.Dropout(0.25),
            # nn.ReLU(),
            nn.Linear(lstm_dim, ),
        )

    def forward(self, text):
        x = self.embedding(text)
        print(x.shape)
        x, hidden = self.lstm(x)
        print(x.shape)
        # x = self.fc(x)
        # print(x.shape)

        return x



## Metrics

In [59]:
# from torchmetrics.text.rouge import ROUGEScore
# from torchmetrics.text.bleu import BLEUScore
# # ALSO, WE CAN USE NLTK FOR THAT
# import nltk.translate.bleu_score as bleu
# from nltk.translate.meteor_score import single_meteor_score
#
# # IT'S WORKING METRICS BTW

## Train Loop

In [60]:
from tqdm import tqdm
# from torchmetrics.text.rouge import ROUGEScore
# from torchmetrics.text.bleu import BLEUScore

def train_one_epoch(
    model,
    loader,
    optimizer,
    loss_fn,
    epoch_num=-1
):
    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        desc=f"Epoch {epoch}: train",
        leave=True,
    )
    model.train()
    train_loss = 0.0
    total = 0
    for i, batch in loop:
        texts, labels = batch
        texts = texts.to(device)
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward pass
        outputs = model(texts).to(device)
        # loss calculation
        print(outputs.shape, labels.shape)
        loss = loss_fn(outputs.to(device), labels.to(device))
        # backward pass
        loss.backward()
        # optimizer run
        optimizer.step()
        train_loss += loss.item()
        total += len(batch)
        loop.set_postfix({"loss": train_loss/total})

In [61]:
INPUT_DIM = len(train_dataset.vocab)
EMBEDDING_DIM = 128
LSTM_DIM = 50
LSTM_LAYERS = 2

model = DeToxicTranslator(INPUT_DIM, EMBEDDING_DIM, LSTM_DIM, LSTM_LAYERS).to(device)

optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.CrossEntropyLoss()

In [62]:
best = -float('inf')
num_epochs = 10
for epoch in range(num_epochs):
    train_one_epoch(model, train_dataloader, optimizer, loss_fn, epoch_num=epoch)
    # best = val_one_epoch(model, val_dataloader, loss_fn, epoch, best_so_far=best)

Epoch 0: train:   0%|          | 0/3612 [00:00<?, ?it/s]


torch.Size([128, 50, 128])
torch.Size([128, 50, 100])
torch.Size([128, 50, 1])
torch.Size([128, 50, 1]) torch.Size([128, 50])


RuntimeError: Expected target size [128, 1], got [128, 50]