In [32]:
from datasets import load_dataset
import os, sys
from pathlib import Path
# All paths are resolved from the directory the notebook is started in:
# the input CSV lives in a `data` folder that sits next to that directory.
HOME = os.getcwd()
DATA_FOLDER = str(Path(HOME).parent / 'data')
data_path = str(Path(DATA_FOLDER) / 'fixed.csv')

# step1: load the dataset and drop the metric columns, which leaves only
# the 'source' and 'target' text columns
# ('lenght_diff' is spelled the way the code has always referred to it)
dataset = load_dataset("csv", data_files=data_path).remove_columns(
    ['similarity', 'lenght_diff', 'source_tox', 'target_tox']
)

In [33]:
# Locate the repository root: walk upwards from the notebook's working
# directory until a folder that contains `src` is found.
current = HOME
while 'src' not in os.listdir(current):
    if Path(current).parent == Path(current):
        # The parent of the filesystem root is the root itself, so without this
        # check a missing `src` folder would make the loop spin forever.
        raise FileNotFoundError(
            f"no 'src' directory found in {HOME} or any of its parents"
        )
    current = Path(current).parent

# Make the project packages importable. Entries that are already on sys.path
# are skipped so that re-running this cell does not pile up duplicates.
for _entry in (
    str(current),
    os.path.join(str(current), 'data_analysis'),
    os.path.join(str(current), 'evaluation'),
    os.path.join(str(current), 'text_processing'),
):
    if _entry not in sys.path:
        sys.path.append(_entry)

In [34]:
# Work on a small, reproducible subset: shuffle the train split with a fixed
# seed and keep the first 100 rows.
data_sample = (
    dataset['train']
    .shuffle(seed=69)
    .select(range(100))
)
data_sample

Dataset({
    features: ['source', 'target'],
    num_rows: 100
})

In [35]:
# step2: create the model and the tokenizer
import torch
from transformers import T5TokenizerFast, AutoModelForSeq2SeqLM
# Checkpoint to load, and the device to run it on (GPU when torch can see one).
CHECKPOINT = 't5-small'
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# Tokenizer and seq2seq model come from the same checkpoint; the model is
# moved to DEVICE right after loading.
tokenizer = T5TokenizerFast.from_pretrained(CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT)
model = model.to(DEVICE)

In [36]:
# step3: define the tokenization function applied to each batch of rows
TASK_PREFIX = 'summarize: '


def prepare_data(batch, split: str ='source'):
    """Tokenize one text column of a batch after prepending the task prefix.

    Args:
        batch: mapping from column name to a list of strings.
        split: name of the column to tokenize ('source' by default).

    Returns:
        Whatever the module-level ``tokenizer`` returns for the prefixed
        texts when called with ``truncation=True``.
    """
    prefixed_texts = []
    for text in batch[split]:
        prefixed_texts.append(TASK_PREFIX + text)
    return tokenizer(prefixed_texts, truncation=True)


In [37]:
# Tokenize both text columns of the sample in batched mode: once with the
# default column ('source') and once with the 'target' column.
source_tokenized = data_sample.map(
    prepare_data,
    batched=True,
)
target_tokenized = data_sample.map(
    lambda batch: prepare_data(batch, split='target'),
    batched=True,
)

In [38]:
def _drop_present(ds, columns):
    """Return ``ds`` without those of ``columns`` that it still contains."""
    present = [name for name in columns if name in ds.column_names]
    return ds.remove_columns(present) if present else ds


print(source_tokenized)
# the 'source' and 'target' columns are unnecessary for the 'source_tokenized' dataset
print(target_tokenized)
# 'source', 'target', and 'attention_mask' are unnecessary for the 'target_tokenized' dataset

# This cell rebinds its own inputs, so a second run would ask remove_columns
# for columns that are already gone; dropping only what is still present keeps
# the cell safe to re-run.
source_tokenized = _drop_present(source_tokenized, ['source', 'target'])
target_tokenized = _drop_present(target_tokenized, ['source', 'target', 'attention_mask'])

Dataset({
    features: ['source', 'target', 'input_ids', 'attention_mask'],
    num_rows: 100
})
Dataset({
    features: ['source', 'target', 'input_ids', 'attention_mask'],
    num_rows: 100
})


In [39]:
# Build two dataloaders over the tokenized sample: one for the source texts
# (fed to the model for predictions) and one for the target texts.
import torch
from torch.utils.data import DataLoader
from datasets import Dataset
from transformers import DataCollatorWithPadding

# Collator that pads the variable-length tokenized rows of a batch so they
# can be stacked into tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Both loaders use the same batch size and no shuffling, so batch i of
# source_dl and batch i of target_dl hold the same rows.
source_dl = DataLoader(
    source_tokenized,
    collate_fn=data_collator,
    batch_size=64,
    shuffle=False,
)
target_dl = DataLoader(
    target_tokenized,
    collate_fn=data_collator,
    batch_size=64,
    shuffle=False,
)

In [40]:
# Sanity check: pull the first batch from each loader to confirm that collation
# works. Only the value of the last expression (the target batch) is rendered
# as the cell output.
next(iter(source_dl))
next(iter(target_dl))

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[21603,    10,    27,  ...,     0,     0,     0],
        [21603,    10,  2087,  ...,     0,     0,     0],
        [21603,    10,   216,  ...,     0,     0,     0],
        ...,
        [21603,    10,  1563,  ...,     0,     0,     0],
        [21603,    10, 10855,  ...,     0,     0,     0],
        [21603,    10,   264,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [52]:
from src.evaluation import toxicity_classication as tc
import importlib
importlib.reload(tc)
import re

def _strip_task_prefix(text):
    """Return ``text`` without a leading TASK_PREFIX; any other occurrence is kept."""
    if text.startswith(TASK_PREFIX):
        return text[len(TASK_PREFIX):]
    return text


def build_dataset():
    """Generate summaries batch by batch and yield one row per sample.

    Iterates over ``source_dl`` and ``target_dl`` in lockstep, runs
    ``model.generate`` on each source batch, decodes the generated ids as well
    as the source and target ids back to text, and scores the generated text
    with ``tc.toxic_classification``.

    Yields:
        dict with the keys:
            "source"      - decoded source text without the leading task prefix
            "target"      - decoded target text without the leading task prefix
            "summary"     - text generated by the model for the source
            "summary_tox" - the value ``tc.toxic_classification`` returned for
                            that summary
    """
    for source_b, target_b in zip(source_dl, target_dl):
        # only the source batch is fed to the model; the target batch is just
        # decoded below so it can be stored alongside the prediction
        model_batch = {k: v.to(DEVICE) for k, v in source_b.items()}
        output = model.generate(**model_batch)

        output_decoded = tokenizer.batch_decode(output, skip_special_tokens=True)
        source = tokenizer.batch_decode(source_b['input_ids'], skip_special_tokens=True)
        target = tokenizer.batch_decode(target_b['input_ids'], skip_special_tokens=True)

        # the summary's toxicity classification is next:
        summary_tox = tc.toxic_classification(output_decoded)

        for text, source_text, target_text, tox in zip(output_decoded, source, target, summary_tox):
            # Strip the prefix literally and only at the start: a regex
            # substitution would also delete the same words wherever they
            # appear inside the sentence itself.
            yield {
                "source": _strip_task_prefix(source_text),
                "target": _strip_task_prefix(target_text),
                "summary": text,
                "summary_tox": tox,
            }

    
# Collect the rows yielded by build_dataset into a Dataset object.
summarized_dataset = Dataset.from_generator(build_dataset)

Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  
Generating train split: 100 examples [00:02, 42.95 examples/s]


In [53]:
# Display the generated dataset's columns and row count.
summarized_dataset

Dataset({
    features: ['source', 'target', 'summary', 'summary_tox'],
    num_rows: 100
})

In [54]:
# Save the dataset as 'summarized.csv' inside DATA_FOLDER; the integer shown
# as the cell output is the value returned by to_csv.
summarized_dataset.to_csv(os.path.join(DATA_FOLDER, 'summarized.csv'))

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 376.27ba/s]


15932

In [55]:
import pandas as pd
# Read the CSV written by the previous cell back into a pandas DataFrame.
df = pd.read_csv(os.path.join(DATA_FOLDER, 'summarized.csv'))

In [56]:
# Show the first rows of the reloaded data as a quick visual check.
df.head()

Unnamed: 0,source,target,summary,summary_tox
0,"I was away for too damn long, sir.","I've been away too long, sir.","i was away for too damn long, sir.",0.782886
1,Maybe they'il find your Ripper costume buried ...,maybe they'll find the Ripper costume buried n...,buried with your condor outfit.,0.005894
2,"shit, he's right there!",He's right there!,"shit, he's right there!",0.997899
3,moron.,Prick.,moron.,0.999411
4,If you can't get what you want with foreign ai...,"if you don't get what you want, through foreig...",if you can't get what you want with foreign ai...,6.7e-05
