In [5]:
from datasets import load_dataset
import os, sys
from pathlib import Path
# Paths are resolved relative to the directory the notebook is started from.
HOME = os.getcwd()
DATA_FOLDER = os.path.join(Path(HOME).parent, 'data')
data_path = os.path.join(DATA_FOLDER, 'fixed.csv')

# Walk up from the working directory until a folder that contains an entry
# named 'src' is found. The walk stops at the filesystem root (whose parent is
# itself) instead of looping forever when no such folder exists.
current = Path(HOME)
while 'src' not in os.listdir(current):
    if current.parent == current:
        raise FileNotFoundError(
            f"could not find a directory containing 'src' above {HOME}"
        )
    current = current.parent

# Make the located folder and its sub-packages importable. Entries already on
# sys.path are skipped so that re-running the cell does not pile up duplicates.
for extra_path in (
    str(current),
    os.path.join(str(current), 'data_analysis'),
    os.path.join(str(current), 'evaluation'),
    os.path.join(str(current), 'text_processing'),
):
    if extra_path not in sys.path:
        sys.path.append(extra_path)


In [6]:
# Load all_data_processed.csv as a single 'train' split.
from datasets import load_dataset
processed_csv = os.path.join(DATA_FOLDER, 'all_data_processed.csv')
all_data = load_dataset('csv', data_files=processed_csv, split='train')
# Drop every row whose 'source' or 'target' is not a string (e.g. None values).
all_data = all_data.filter(
    lambda row: all(isinstance(row[column], str) for column in ('source', 'target'))
)

Filter: 100%|██████████| 597521/597521 [00:01<00:00, 536047.55 examples/s]


In [7]:
# step 2: instantiate the tokenizer and the seq2seq model from the same checkpoint
import torch
from transformers import T5TokenizerFast, AutoModelForSeq2SeqLM

CHECKPOINT = 't5-small'
# run on the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

tokenizer = T5TokenizerFast.from_pretrained(CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT)
model = model.to(DEVICE)

In [8]:
from src.text_processing import preprocess as pr
# the next step is to filter the dataset
def filter_data(sample):
    """Predicate for `Dataset.filter`: keep a sample only when its 'source' is longer than its 'target'.

    `sample` is a single row exposing the 'source' and 'target' text fields
    (the function is used without `batched=True`). Lengths are measured as the
    number of tokens returned by `pr.tokenize(..., tokenizer_type='word')`.

    Returns True when the source has strictly more tokens than the target.
    """
    source_length, target_length = (
        len(pr.tokenize(sample[field], tokenizer_type='word'))
        for field in ('source', 'target')
    )
    return source_length > target_length

# keep only the samples accepted by filter_data, then persist them as CSV
summary_csv_path = os.path.join(DATA_FOLDER, 'summarized_data.csv')
summary_data = all_data.filter(function=filter_data)
summary_data.to_csv(summary_csv_path, index=False)

Filter: 100%|██████████| 597519/597519 [01:03<00:00, 9426.15 examples/s] 
Creating CSV from Arrow format: 100%|██████████| 277/277 [00:01<00:00, 245.36ba/s]


31406141

In [9]:
# step 3: tokenization helper
TASK_PREFIX = 'summarize: '
def prepare_data(batch, split: str ='source'):
    """Tokenize one text column of a batch after prepending the task prefix.

    Args:
        batch: mapping from column name to a list of strings (a batched sample).
        split: name of the column to tokenize ('source' by default).

    Returns:
        The output of `tokenizer` called on the prefixed texts with truncation enabled.
    """
    prefixed_texts = []
    for text in batch[split]:
        prefixed_texts.append(TASK_PREFIX + text)
    return tokenizer(prefixed_texts, truncation=True)

# tokenize the 'source' and the 'target' columns into two separate datasets
map_options = {'batched': True}
source_tokenized = summary_data.map(prepare_data, **map_options)
target_tokenized = summary_data.map(
    lambda rows: prepare_data(rows, split='target'), **map_options
)

Map: 100%|██████████| 276516/276516 [00:07<00:00, 39445.29 examples/s]
Map: 100%|██████████| 276516/276516 [00:06<00:00, 43389.16 examples/s]


In [10]:
# Inspect both tokenized datasets before trimming their columns.
print(source_tokenized)
print(target_tokenized)

# 'source' and 'target' are unnecessary for 'source_tokenized';
# 'source', 'target' and 'attention_mask' are unnecessary for 'target_tokenized'.
# Only columns that are still present are removed, so re-running this cell is
# harmless (remove_columns raises on a column that is already gone).
source_tokenized = source_tokenized.remove_columns(
    [col for col in ('source', 'target') if col in source_tokenized.column_names]
)
target_tokenized = target_tokenized.remove_columns(
    [
        col
        for col in ('source', 'target', 'attention_mask')
        if col in target_tokenized.column_names
    ]
)

Dataset({
    features: ['source', 'target', 'input_ids', 'attention_mask'],
    num_rows: 276516
})
Dataset({
    features: ['source', 'target', 'input_ids', 'attention_mask'],
    num_rows: 276516
})


In [11]:
# Two dataloaders: one over the tokenized sources (fed to the model for
# predictions) and one over the tokenized targets.
import torch
from transformers import DataCollatorWithPadding
from datasets import Dataset
from torch.utils.data import DataLoader

# the collator pads every batch using the tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# both loaders share the same settings; shuffle=False keeps them in the same order
loader_options = dict(batch_size=64, shuffle=False, collate_fn=data_collator)
source_dl = DataLoader(dataset=source_tokenized, **loader_options)
target_dl = DataLoader(dataset=target_tokenized, **loader_options)

In [None]:
from src.evaluation import toxicity_classication as tc
import importlib
importlib.reload(tc)
import re

def strip_task_prefix(text, prefix):
    """Return `text` without a leading `prefix`.

    Only a prefix at the very start is removed; later occurrences inside the
    text are kept, and text that does not start with `prefix` is returned
    unchanged.
    """
    if prefix and text.startswith(prefix):
        return text[len(prefix):]
    return text


def build_dataset():
    """Generate one record per sample with the model's summary and toxicity results.

    Iterates over `source_dl` and `target_dl` in lockstep, which relies on both
    loaders yielding samples in the same order (both are created with
    shuffle=False).

    Yields:
        dict with keys 'source', 'target' (decoded texts without the task
        prefix), 'summary' (decoded model output), 'summary_tox' and
        'source_tox' (the per-text results of `tc.toxic_classification`).
    """
    for source_b, target_b in zip(source_dl, target_dl):
        # move the source batch to the model's device; the target batch is only
        # decoded below and is never passed to the model
        model_batch = {k: v.to(DEVICE) for k, v in source_b.items()}
        # generate the summaries for the batch
        output = model.generate(**model_batch)
        output_decoded = tokenizer.batch_decode(output, skip_special_tokens=True)
        source = tokenizer.batch_decode(source_b['input_ids'], skip_special_tokens=True)
        target = tokenizer.batch_decode(target_b['input_ids'], skip_special_tokens=True)

        # classify the toxicity of both the generated summaries and the sources
        summary_tox = tc.toxic_classification(output_decoded)
        source_tox = tc.toxic_classification(source)
        for text, source_text, target_text, tox, s_tox in zip(output_decoded, source, target, summary_tox, source_tox):
            yield {
                # strip only the leading task prefix added by prepare_data; a
                # regex substitution would also delete it from inside the text
                "source": strip_task_prefix(source_text, TASK_PREFIX),
                "target": strip_task_prefix(target_text, TASK_PREFIX),
                "summary": text,
                "summary_tox": tox,
                "source_tox": s_tox,
            }

    
# materialize the records yielded by build_dataset into a Dataset
summarized_dataset = Dataset.from_generator(generator=build_dataset)

In [None]:
# write the generated summaries and toxicity results to the data folder
summarized_csv_path = os.path.join(DATA_FOLDER, 'summarized.csv')
summarized_dataset.to_csv(summarized_csv_path)