In [2]:
import datasets
from datasets import load_dataset, load_metric

In [69]:
from tokenizers import Tokenizer
from transformers import PreTrainedTokenizer
from transformers import BatchEncoding
import numpy as np
import torch
import os
import sentencepiece as spm



class SentencePieceTokenizer(PreTrainedTokenizer):
    def __init__(self, prefix='wd5m_with_pad', max_tokenize_length=75, pad_to_max = False):
        super()
        path = os.path.join('data/sentencepiece', prefix + '.model')
        self.sp = spm.SentencePieceProcessor(model_file=path)
        self._pad_token_id = self.sp['<pad>']
        self._eos_token_id = self.sp['</s>']
        self.max_tokenize_length = max_tokenize_length
        self.pad_to_max = pad_to_max
        if self.pad_to_max == True:
            print('Max length padding enabled (needed for TPU)')

    def make_attention_mask(self, encode_output, max_len):
        # padding is with zeros
        attention_mask = np.zeros((len(encode_output), max_len), dtype=int)
        for i, seq in enumerate(encode_output):
            length = min(max_len, len(seq))
            attention_mask[i][:length] = np.ones((length), dtype=int)
        return attention_mask

    # returns BatchEncoding
    def __call__(self, text, padding=True, truncation=True, max_length=128, return_tensors="pt"):
        out = self.sp.encode(text)
        # out is a list of list, need to pad and add eos token
        for x in out:
            x.append(self._eos_token_id)

        if self.pad_to_max:
            max_len = max_length
        else:
            max_len = min(max([len(x) for x in out]), self.max_tokenize_length)

        attention_mask = self.make_attention_mask(out, max_len)
        input_ids = np.ones((len(out), max_len), dtype=int) * self._pad_token_id
        for i, seq in enumerate(out):
            length = min(max_len, len(seq))
            input_ids[i][:length] = seq[:length]

#         input_ids = torch.LongTensor(input_ids)
#         attention_mask = torch.LongTensor(attention_mask)
        data = {'input_ids': input_ids, 'attention_mask': attention_mask}
        return BatchEncoding(data)

    def tokenize_str(self, text):
        out = self.sp.encode(text)
        return out

    def batch_decode(self, input_ids, skip_special_tokens=True):
        input_ids = input_ids.tolist()
        #TODO: why need to do this?
        for i, x in enumerate(input_ids):
            if x[0] == 0:
                input_ids[i] = input_ids[i][1:]
        decoded = self.sp.decode(input_ids)
        out = []
        for s in decoded:
            s = s.replace('<pad>', '')
            out.append(s)
        return out

    @property
    def vocab_size(self) -> int:
        return len(self.sp)

    @property
    def pad_token_id(self) -> int:
        return self._pad_token_id

    @property
    def eos_token_id(self) -> int:
        return self._eos_token_id

In [143]:
# raw_datasets = load_dataset(
#                 'wmt16', 'ro-en', cache_dir='./cache_dir'
#         )

fname = '/scratche/home/apoorv/transformer-kgc/data/wikidata5m_v3/json_format/train.json'
raw_datasets = load_dataset('json', data_files=fname, cache_dir='./cache_dir')

Using custom data configuration default-2ab599deb35af74b


Downloading and preparing dataset json/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to ./cache_dir/json/default-2ab599deb35af74b/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset json downloaded and prepared to ./cache_dir/json/default-2ab599deb35af74b/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02. Subsequent calls will reuse this data.


In [133]:
column_names = raw_datasets["train"].column_names

In [134]:
column_names

['input', 'output']

In [135]:
len(raw_datasets["train"])

42687362

In [136]:
raw_datasets.keys(), raw_datasets['train'].keys()

AttributeError: 'Dataset' object has no attribute 'keys'

In [137]:
len(raw_datasets["train"])

42687362

In [138]:
train_dataset = raw_datasets["train"]

In [139]:
from transformers import T5TokenizerFast
# tokenizer = T5TokenizerFast.from_pretrained('t5-small')
tokenizer = SentencePieceTokenizer('sp_wd5m_v3', max_tokenize_length=30, pad_to_max=True)

Max length padding enabled (needed for TPU)


In [140]:
def preprocess_function(examples):
    inputs = examples['input']
    targets = examples['output']
    model_inputs = tokenizer(inputs, max_length=30, padding="max_length", truncation=True)

    # Setup the tokenizer for targets
    # with tokenizer.as_target_tokenizer():
    labels = tokenizer(targets, max_length=30, padding="max_length", truncation=True)

    labels["input_ids"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
#     print(model_inputs)
    return model_inputs

In [141]:
inputs = ['|TAIL| Obama||| position held']
tokenizer(inputs, max_length=30, padding="max_length", truncation=True)

{'input_ids': array([[31190,     4, 28944,     6,   914,     2,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3]]), 'attention_mask': array([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]])}

In [142]:
train_dataset = train_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=8,
                remove_columns=column_names,
            )











OSError: [Errno 28] Error writing bytes to file. Detail: [errno 28] No space left on device

In [None]:
train_dataset.map

In [79]:
len(train_dataset)

610320

In [80]:
train_dataset[0]

{'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'input_ids': [3229,
  361,
  266,
  2306,
  31250,
  30988,
  21405,
  2,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3],
 'labels': [1619,
  8854,
  31407,
  31192,
  720,
  2765,
  310,
  17578,
  31250,
  449,
  839,
  572,
  280,
  1692,
  8148,
  1727,
  511,
  31220,
  1122,
  5288,
  2,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100]}

In [88]:
train_dataset.map

<bound method Dataset.map of Dataset({
    features: ['translation'],
    num_rows: 610320
})>

In [90]:
import os
folder = '/scratche/home/apoorv/transformer-kgc/data/wikidata5m_v3/'
splits = ['train_small', 'valid', 'test']
data_files = [os.path.join(folder, x + '.txt') for x in splits]

dataset = load_dataset('tsv', data_files=data_files)

FileNotFoundError: Couldn't find file locally at tsv/tsv.py, or remotely at https://raw.githubusercontent.com/huggingface/datasets/1.6.2/datasets/tsv/tsv.py.
The file is also not present on the master branch on github.