In [9]:
import pandas as pd
from tqdm import tqdm
import sys
sys.path.append('..')
from src.util import load_c4_dataset
from src.datasets import C4Dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
import numpy as np

In [4]:
# Announce the step, then resolve the checkpoint name and fetch its tokenizer.
# `tokenizer` is read as a module-level global by the helper functions below.
print("Loading Tokenizer...")
model_name = "google/t5-v1_1-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading Tokenizer...


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [21]:
def _span_corruption(text, corruption_rate=0.15, average_length_of_spans=3):
    """Build a span-corruption (input, target) pair from ``text``.

    The text is tokenized with the module-level ``tokenizer``. Roughly
    ``corruption_rate`` of the tokens are grouped into non-overlapping spans;
    each span is replaced in the input by one ``<extra_id_N>`` sentinel, and
    the target lists every sentinel followed by the tokens it replaced.

    Args:
        text: Raw string to corrupt.
        corruption_rate: Fraction of tokens to move into the target.
        average_length_of_spans: Mean of the Poisson draw for span lengths;
            also determines the number of spans
            (``int(total_tokens * corruption_rate) // average_length_of_spans``).

    Returns:
        ``(input_sequence, target_sequence)`` as detokenized strings. When the
        text is too short to hold even one span, the detokenized text is
        returned unchanged together with an empty target.

    Raises:
        ValueError: propagated from ``np.random.randint`` when the placement
            margin (twice the still-unplaced span tokens) does not fit in the
            text; this can happen for corruption rates above 0.5.
    """
    input_tokens = tokenizer.tokenize(text)
    total_tokens = len(input_tokens)

    target_corrupted = int(total_tokens * corruption_rate)
    total_spans = target_corrupted // average_length_of_spans

    if total_spans == 0:
        # Nothing to corrupt. Previously this fell through to
        # `total_tokens // total_spans` and raised ZeroDivisionError.
        return tokenizer.convert_tokens_to_string(input_tokens), ""

    max_span_length = total_tokens // total_spans
    span_lengths = np.random.poisson(average_length_of_spans, total_spans)
    span_lengths = np.clip(span_lengths, 1, max_span_length)

    # Nudge span lengths until they add up to the requested number of
    # corrupted tokens. A single pass can only move each span by one token,
    # so keep sweeping until the target is hit or no span can move further.
    total_corrupted_tokens = int(span_lengths.sum())
    while total_corrupted_tokens != target_corrupted:
        step = 1 if total_corrupted_tokens < target_corrupted else -1
        adjusted = False
        for i, current_length in enumerate(span_lengths):
            can_grow = step > 0 and current_length < max_span_length
            can_shrink = step < 0 and current_length > 1
            if not (can_grow or can_shrink):
                continue
            span_lengths[i] += step
            total_corrupted_tokens += step
            adjusted = True
            if total_corrupted_tokens == target_corrupted:
                break
        if not adjusted:
            # Every span is already at its bound; accept the closest total.
            break

    # Place spans left to right. Each start is drawn at or after the end of
    # the previous span, and the upper bound keeps twice the still-unplaced
    # span tokens free, so starts come out in ascending order without sorting.
    span_starts = []
    current_position = 0
    remaining = int(span_lengths.sum())  # tokens in spans not yet placed
    for length in span_lengths:
        if current_position >= total_tokens:
            break
        usable_length = min(length, total_tokens - current_position)
        start = np.random.randint(current_position, total_tokens - remaining * 2 + 1)
        span_starts.append(start)
        current_position = start + usable_length
        remaining -= length

    output_tokens = input_tokens.copy()
    # Flat list of tokens (sentinels and originals alike); no flattening step
    # is needed, so real tokens are never split into single characters.
    target_tokens = []
    removed = 0  # net number of tokens already dropped from output_tokens
    for sentinel_id, (start, length) in enumerate(zip(span_starts, span_lengths)):
        end = min(start + length, total_tokens)
        sentinel = f'<extra_id_{sentinel_id}>'
        target_tokens.append(sentinel)
        target_tokens.extend(input_tokens[start:end])
        # Indices shift left by the tokens removed for earlier spans.
        output_tokens[start - removed:end - removed] = [sentinel]
        removed += length - 1

    input_sequence = tokenizer.convert_tokens_to_string(output_tokens)
    target_sequence = tokenizer.convert_tokens_to_string(target_tokens)

    return input_sequence, target_sequence

In [22]:
import random
def _prefix_language_modeling(text):
    """Split ``text`` into a random prefix (input) and its continuation (target).

    The text is tokenized with the module-level ``tokenizer`` and cut at a
    uniformly random token boundary so that both sides are non-empty.

    Args:
        text: Raw string to split.

    Returns:
        ``(input_sequence, target_sequence)`` as detokenized strings. Texts
        with fewer than two tokens cannot be split; they are returned whole
        as the input with an empty target.
    """
    tokens = tokenizer.tokenize(text)
    if len(tokens) < 2:
        # random.randint(1, len(tokens) - 1) would be an empty range here and
        # raise ValueError, so return the degenerate pair instead.
        return tokenizer.convert_tokens_to_string(tokens), ""

    # randint is inclusive on both ends: at least one token on each side.
    split_point = random.randint(1, len(tokens) - 1)
    input_tokens = tokens[:split_point]
    target_tokens = tokens[split_point:]

    input_sequence = tokenizer.convert_tokens_to_string(input_tokens)
    target_sequence = tokenizer.convert_tokens_to_string(target_tokens)
    return input_sequence, target_sequence

In [23]:
# Smoke-test the span corruption on a short repeated sentence and show the
# corrupted input followed by its target, one per line.
input_sequence, decoder_sequence = _span_corruption(
    "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog."
)
print(input_sequence, decoder_sequence, sep="\n")

The<extra_id_0>s over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.
<extra_id_0> quick brown fox jump


In [5]:
# Filename template for the C4 English training files, relative to the
# notebook's directory. `{:05d}` is a zero-padded five-digit placeholder
# (presumably the shard index filled in by load_c4_dataset; confirm in src.util).
base_path = '../c4/en/c4-train.{:05d}-of-01024.json'

In [6]:
# Load the raw texts through the project helper.
# NOTE(review): the second argument is presumably the number of shard files to
# read (the progress bar in this cell's output counts 1/1); confirm in src.util.
list_of_texts = load_c4_dataset(base_path, 1)


Loading Dataframe with c4 Data...: 100%|██████████| 1/1 [00:09<00:00,  9.89s/it]


In [7]:
# NOTE(review): this cell repeats the tokenizer-loading cell near the top of
# the notebook; on a fresh top-to-bottom run only one of the two is needed.
print("Loading Tokenizer...")
model_name = "google/t5-v1_1-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading Tokenizer...


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [8]:
# Wrap the loaded texts and the tokenizer in the project's C4Dataset.
# Per this cell's output, construction runs a cleanup pass that removes texts
# with fewer than 3 corrupted tokens.
c4_dataset = C4Dataset(list_of_texts, tokenizer)

Cleanup Dataset: Remove texts with < 3 corrupted tokens: 100%|██████████| 356317/356317 [00:05<00:00, 63561.34it/s]
Cleaned 3478 Datapoints remaining 352839 Datapoints


In [13]:
# Serve the dataset in its stored order, 32 examples per batch, and keep an
# explicit iterator so single batches can be pulled by hand in later cells.
c4_dataloader_train = DataLoader(
    c4_dataset,
    shuffle=False,
    batch_size=32,
)
c4_iter = iter(c4_dataloader_train)

In [14]:
# Pull one batch from the iterator. Not idempotent: every re-run of this cell
# advances `c4_iter`, so `first_batch` is only the first batch on the first run.
first_batch = next(c4_iter)

In [15]:
# Split the batch into its two components (position 0 and position 1).
# NOTE(review): `input` shadows Python's builtin `input()` for the rest of the
# kernel session. Later cells read this name, so a rename has to be done
# notebook-wide rather than in this cell alone.
input, target = first_batch[0], first_batch[1]

In [25]:
# Encoder side: tokenize the batch inputs as padded, truncated PyTorch tensors.
tokenized_inputs = tokenizer(
    input,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
input_ids = tokenized_inputs['input_ids']
attention_mask = tokenized_inputs['attention_mask']

# Decoder side: the targets go through the same tokenizer settings.
# NOTE(review): padded label positions keep the pad token id; if these labels
# feed a loss, confirm whether padding should be masked out first.
tokenized_labels = tokenizer(
    target,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
labels = tokenized_labels['input_ids']

In [17]:
# Show the first raw text next to the first input/target pair of the batch,
# one item per line.
print(list_of_texts[0], input[0], target[0], sep="\n")

Beginners BBQ Class Taking Place in Missoula!
Do you want to get better at making delicious BBQ? You will have the opportunity, put this on your calendar now. Thursday, September 22nd join World Class BBQ Champion, Tony Balay from Lonestar Smoke Rangers. He will be teaching a beginner level class for everyone who wants to get better with their culinary skills.
He will teach you everything you need to know to compete in a KCBS BBQ competition, including techniques, recipes, timelines, meat selection and trimming, plus smoker and fire information.
The cost to be in the class is $35 per person, and for spectators it is free. Included in the cost will be either a t-shirt or apron and you will be tasting samples of each meat that is prepared.
Beginners BBQ Class Taking Place in Missoula! Do you want to get better at making delicious BBQ? You will<extra_id_0> put this on your calendar now. Thursday, September 22nd join World Class BBQ Champion, Tony Ba<extra_id_1>onestar Smoke Rangers. He wi