In [None]:
# default_exp data

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
%load_ext autoreload
%autoreload 2

# Data utils

In [None]:
#export
# Imports for data functions
import pandas as pd
import os
from reformer_fastai.tokenizers import ByteTextTokenizer
from fastai.text.all import *
from torch.utils.data import Dataset
import time
from tqdm import tqdm

In [None]:
#exports
def read_lines(path):
    """
    Read a text file and return its lines as a list of strings.

    Each line is returned exactly as the file iterator yields it, so it keeps
    its trailing newline (if any). No tokenization is performed here.
    :param path: path of the text file to read
    :return: a list with one string per line of the file
    :raises AssertionError: if `path` does not exist
    """
    assert os.path.exists(path)
    with open(path, 'r') as text_file:
        return list(text_file)


def convert_data_to_seq_length(df, seq_length=2**16):
    """
    Re-chunk tokenized samples into sequences of a fixed token length.

    All entries of ``df['tokenized']`` are concatenated in row order into one long token
    sequence, which is then cut into consecutive pieces of ``seq_length`` tokens. If the
    total length is not a multiple of ``seq_length``, the leftover tokens form one final,
    shorter sample.
    :param df: a pandas dataframe with a `tokenized` column holding the numericalized tokens of each text
    :param seq_length: the numericalized token sequence length to split the data into (must be positive)
    :return: a new dataframe with columns [tokenized, lens] holding the split samples and their lengths
    :raises ValueError: if `seq_length` is not positive
    """
    if seq_length <= 0:
        raise ValueError("seq_length must be a positive integer, got " + str(seq_length))
    concat_data = to_concat(list(df['tokenized']))
    n_seqs = len(concat_data)//seq_length
    # Collect plain records and build the frame once at the end: growing a DataFrame row by
    # row copies it on every step (quadratic), and `DataFrame.append` was removed in pandas 2.0.
    records = []
    for i in tqdm(range(n_seqs), desc="Splitting data", total=n_seqs):
        sample = concat_data[i*seq_length:(i+1)*seq_length]
        records.append({'tokenized': sample, 'lens': len(sample)})
    # Add last data sample which is the remainder
    sample = concat_data[n_seqs*seq_length:]
    if len(sample) > 0:
        records.append({'tokenized': sample, 'lens': len(sample)})
    # Passing `columns` keeps both columns present even when there are no records
    return pd.DataFrame(records, columns=['tokenized', 'lens'])


def read_and_prepare_data(data_path, seq_length=0):
    """
    Load a raw text file into a dataframe and tokenize every line of it.

    Splitting into train and validation sets is left to the caller.
    :param data_path: relative path to the raw data
    :param seq_length: if non-zero, re-chunk the tokenized data into samples of this many
                       tokens; the default (0) keeps one sample per line of the file
    :return: the prepared dataframe, including `lens` and `lens_cum_sum` columns
    """
    print("Reading data from path...")
    raw_lines = read_lines(data_path)  # one entry per line of the file
    df = pd.DataFrame({'text': raw_lines})
    print("Done!")

    # Short pause so the prints above do not get mixed into the progress bar output
    time.sleep(0.5)

    # ByteTextTokenizer set up with bos/eos markers enabled
    tokenizer = ByteTextTokenizer(is_lm=True, add_bos=True, add_eos=True)

    tqdm.pandas(desc="Tokenizing data")
    df['tokenized'] = df['text'].progress_map(lambda text: tokenizer(text))

    if seq_length == 0:
        # Default: keep the per-line samples as they are.
        # NOTE(review): `lens` here is the character count of the raw text, whereas the
        # re-chunked branch below stores token counts -- confirm this difference is intended.
        df['lens'] = df['text'].map(len)
    else:
        print("Sequence length has been added, splitting data to samples with sequence length " + str(seq_length))
        # Replace the per-line samples with fixed-length ones
        df = convert_data_to_seq_length(df, seq_length)
        print("Done!")

    df['lens_cum_sum'] = df.lens.cumsum()

    return df

In [None]:
test_text = 'hello world!'
test_df = pd.DataFrame({'text': [test_text]})
btt = ByteTextTokenizer(is_lm=True, add_bos=True, add_eos=True)
tokenized_test_text = btt(test_text)
assert len(test_df) == 1
assert len(test_df['text'][0]) == len(test_text)

tqdm.pandas(desc="tokenizing data")
test_df['tokenized'] = test_df['text'].progress_map(lambda x: btt(x))

n_tokens = len(tokenized_test_text)

# Split the df into a divisible length (2): only full-length samples, no remainder
assert n_tokens % 2 == 0
converted_test_df = convert_data_to_seq_length(test_df, 2)
assert len(converted_test_df) == n_tokens//2
assert converted_test_df['lens'].tolist() == [2]*(n_tokens//2)

# Split the df into a non-divisible length (5): full samples plus one shorter remainder.
# Check the exact row count and lengths; a bare `!=` would accept almost any wrong result.
assert n_tokens % 5 != 0
converted_test_df = convert_data_to_seq_length(test_df, 5)
assert len(converted_test_df) == n_tokens//5 + 1
assert converted_test_df['lens'].tolist() == [5]*(n_tokens//5) + [n_tokens % 5]
assert converted_test_df['lens'].sum() == n_tokens

  from pandas import Panel
tokenizing data: 100%|██████████| 1/1 [00:00<00:00, 1217.86it/s]
Splitting data: 100%|██████████| 7/7 [00:00<00:00, 376.76it/s]
Splitting data: 100%|██████████| 2/2 [00:00<00:00, 363.95it/s]


In [None]:
# Token ids of the single test row (the tokenizer above was created with add_bos/add_eos enabled)
test_df['tokenized'][0]

LMTensorText([  2, 107, 104, 111, 111, 114,  35, 122, 114, 117, 111, 103,  36,   1])

## Synthetic task

Data utils for the synthetic task of the Reformer paper. We want to create sequences of the form 0w0w, where w is a sequence of integers between 1 and 127 of some length, e.g. 08470847.
We create items on the fly instead of all items up front.

In [None]:
#export
class TwinSequence(Dataset):
    """
    Synthetic dataset of sequences shaped `0w0w`, generated on the fly.

    `w` is a run of random integers drawn from 1-127, so every item is made of two
    identical halves that each start with a 0. A fresh item is sampled on every
    access; the requested index is not used.
    :param sl: total length of each sequence, must be even
    :param len: number of items the dataset reports through `len()`
    """
    def __init__(self, sl=1024, len=100):
        assert sl%2 == 0
        self.sl, self.len = sl, len

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        half = torch.randint(1, 128, (self.sl//2,))  # sl//2 random values in [1, 127]
        half[0] = 0                                  # first slot becomes the 0 marker: `0w`
        seq = half.repeat(2)                         # two copies back to back: `0w0w`
        # Target is the input shifted one step left, with a 0 filling the last position
        target = torch.cat((seq[1:], seq.new_zeros(1)), -1)
        return seq, target

#export
def get_twinseq_dls(sl, train_sz=500, valid_sz=100, bs=64, shuffle=False, device='cuda'):
    """
    Build `DataLoaders` holding a training and a validation `TwinSequence` dataset.
    :param sl: sequence length of every generated item
    :param train_sz: number of items in the training dataset
    :param valid_sz: number of items in the validation dataset
    :param bs: batch size
    :param shuffle: forwarded to `DataLoaders.from_dsets`
    :param device: device forwarded to `DataLoaders.from_dsets`
    :return: the object returned by `DataLoaders.from_dsets`
    """
    # Forward the caller's `shuffle` and `device`. They used to be hard-coded to
    # False / 'cuda', so a request such as device='cpu' was silently ignored.
    return DataLoaders.from_dsets(TwinSequence(sl, train_sz),
                                  TwinSequence(sl, valid_sz),
                                  bs=bs, shuffle=shuffle, device=device)

In [None]:
# Request loaders on the CPU for length-10 sequences and look at the shape of one batch
dls=get_twinseq_dls(sl=10, device='cpu')
xb, yb = dls.one_batch()
xb.shape, yb.shape

(torch.Size([64, 10]), torch.Size([64, 10]))

In [None]:
# Show the first input/target pair of the batch as plain Python lists
inp, targ = xb[0].tolist(), yb[0].tolist()
inp, targ

([0, 24, 52, 68, 118, 0, 24, 52, 68, 118],
 [24, 52, 68, 118, 0, 24, 52, 68, 118, 0])

In [None]:
# The target must equal the input shifted one position to the left
assert all_equal(inp[1:], targ[:-1])

For the synthetic task we also have to mask the **first half** of the targets. The first part is just random integers, so it's impossible to learn anything from it. We set the tokens in the first part to a special index, -100, and later tell our loss function to ignore items with this value. This means that the only task the model can learn is to copy the first part of the input sequence. If we didn't mask the first part, the model would be penalized for poor performance on it, and would try to find a compromise.

In [None]:
#export
class MaskTargCallback(Callback):
    """
    Overwrite the first half of every target sequence with -100 before each batch.

    The half-way point is derived from the `sl` attribute of the training dataset.
    -100 is the marker value the loss function is meant to skip.
    """
    def before_batch(self):
        first_half = slice(None, self.dls.train_ds.sl//2)
        self.y[:, first_half] = -100

In [None]:
#hide
from nbdev.export import notebook2script; notebook2script()