In [3]:
!pip install torch


Collecting torch
  Downloading torch-1.7.1-cp37-cp37m-manylinux1_x86_64.whl (776.8 MB)
[K     |████████████████████████████████| 776.8 MB 16 kB/s 
Collecting typing-extensions
  Downloading typing_extensions-3.7.4.3-py3-none-any.whl (22 kB)
Installing collected packages: typing-extensions, torch
Successfully installed torch-1.7.1 typing-extensions-3.7.4.3


In [7]:
import tensorflow as tf
import numpy as np 
import gzip
import torch
# Read the first 95e6 bytes of enwik8; the handle is only needed for the read
# itself, so it is closed before any processing starts.
with gzip.open('../reformer-pytorch/examples/enwik8_simple/data/enwik8.gz') as file:
    readed=file.read(int(95e6))
#print(readed[:10000]) #raw string
# np.fromstring's binary mode is deprecated; np.frombuffer is the supported
# replacement. frombuffer returns a read-only view over the bytes object, so
# copy it to get a writable array like fromstring produced (torch.from_numpy
# is expected to warn on non-writable arrays).
X = np.frombuffer(readed, dtype=np.uint8).copy()
print(X)
print(readed[:10000].split()[0])
print((readed[:10000].split()[0]).decode("utf-8") )
print(X[:100])
print(X.shape)
# First 90e6 bytes are the training split, the remainder is validation.
trX, vaX = np.split(X, [int(90e6)])
print(len(vaX))
data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX)
print(data_val)

[ 60 109 101 ...  47  47 119]
b'<mediawiki'
<mediawiki
[ 60 109 101 100 105  97 119 105 107 105  32 120 109 108 110 115  61  34
 104 116 116 112  58  47  47 119 119 119  46 109 101 100 105  97 119 105
 107 105  46 111 114 103  47 120 109 108  47 101 120 112 111 114 116  45
  48  46  51  47  34  32 120 109 108 110 115  58 120 115 105  61  34 104
 116 116 112  58  47  47 119 119 119  46 119  51  46 111 114 103  47  50
  48  48  49  47  88  77  76  83  99 104]
(95000000,)
5000000
tensor([101, 114, 110,  ...,  47,  47, 119], dtype=torch.uint8)


In [42]:
# Demo: np.frombuffer maps each byte of the literal to one uint8 value.
print(np.frombuffer(b'abc cde',dtype=np.uint8)) # one integer per byte (for this ASCII text, the UTF-8 code units)

[ 97  98  99  32  99 100 101]


In [2]:

class TextSamplerDataset():
    """Random fixed-length window sampler over an indexable sequence.

    Every lookup returns ``seq_len + 1`` consecutive elements of ``data``
    starting at a uniformly random offset, so the item can be split into an
    input and a one-step-shifted target of ``seq_len`` elements each.

    Args:
        data: Sliceable sequence supporting ``len()`` (list, array, tensor).
        seq_len: Number of elements per training example; each returned
            window holds one more element than this.
    """

    def __init__(self, data, seq_len):
        super().__init__()
        self.data = data
        self.seq_len = seq_len

    def __getitem__(self, index):
        """Return a random window of ``seq_len + 1`` elements.

        ``index`` is ignored: sampling is random, the argument only exists
        to satisfy the sequence/dataset protocol.

        Raises:
            ValueError: If ``data`` holds fewer than ``seq_len + 1`` elements.
        """
        window = self.seq_len + 1
        # Valid start offsets are 0 .. len(data) - window inclusive. The
        # exclusive upper bound is therefore len(data) - window + 1; using a
        # smaller bound would never sample the final window and would fail
        # when data is exactly one window long.
        num_starts = len(self.data) - window + 1
        if num_starts <= 0:
            raise ValueError(
                "data has {} elements but a window of seq_len + 1 = {} is required".format(
                    len(self.data), window))
        rand_start = np.random.randint(0, num_starts)

        full_seq = self.data[rand_start: rand_start + window]
        return full_seq

    def __len__(self):
        """Nominal number of samples per pass: ``len(data) // seq_len``."""
        return len(self.data) // self.seq_len

## generate tokenizer

In [1]:
import tensorflow as tf
import numpy as np 
import gzip
from collections import Counter
import csv
import sentencepiece as spm
import os 

#_ROOT = os.path.abspath(os.path.dirname(__file__))

# Output locations: word-frequency TSV fed to SentencePiece, and the prefix
# under which SentencePiece writes its model files.
BPE_TSV_PATH ="bpe_spm.tsv"
BPE_MODEL_PATH = "bpe_model"

# Ids used for the [BOS] / [EOS] user-defined symbols registered below.
# NOTE(review): presumably [SEP],[BOS],[EOS] are numbered 2,3,4 after
# [PAD]=0 and [UNK]=1 -- confirm against the trained model.
BOS_ID = 3
EOS_ID = 4
# Minimum corpus frequency for a word to be written to the training TSV.
# (Previously set to 5 here and silently overridden to 15 further down; 15 is
# the value that was actually in effect.)
trsh = 15
vocab_size = 20000

# Only the read needs the file handle; close it before the heavy processing.
with gzip.open('../reformer-pytorch/examples/enwik8_simple/data/enwik8.gz') as file:
    readed=file.read(int(95e6)).decode("utf-8")
dataset = readed.lower().split()

# Count lowercase whitespace-delimited words, then drop the rare ones.
token_dict = Counter(dataset)
print(len(token_dict))
token_dict = Counter({word: count for word, count in token_dict.items() if count >= trsh})
print(len(token_dict))

print("finish token_dict")
#write vocab as tsv
# Explicit UTF-8 so the file does not depend on the platform default encoding.
with open(BPE_TSV_PATH, 'w', newline='', encoding='utf-8') as f_output:
    tsv_output = csv.writer(f_output, delimiter='\t')
    for word, count in token_dict.items():
        tsv_output.writerow([word, count])
print("finish write bpe tsv")
spmcmd = '--input={spm_input} --model_prefix={spm_model} --input_format=tsv --vocab_size={vocab_size} --user_defined_symbols=[SEP],[BOS],[EOS] --hard_vocab_limit=false --model_type=bpe --pad_id=0 --unk_id=1 --bos_id=-1 --eos_id=-1 --pad_piece=[PAD] --unk_piece=[UNK]'.format(
    spm_input=BPE_TSV_PATH, spm_model=BPE_MODEL_PATH, vocab_size=vocab_size)
spm.SentencePieceTrainer.train(spmcmd)
print("finish train bpe ")
   


1305260
50197
finish token_dict
finish write bpe tsv
finish train bpe 


In [94]:
# Load the SentencePiece model file at BPE_MODEL_PATH + ".model" and encode a
# sample sentence to ids as a quick sanity check.
s = spm.SentencePieceProcessor()
s.Load(BPE_MODEL_PATH + ".model")
print(s.encode_as_ids("hello my name is domyoung lee"))

[23113, 1213, 630, 57, 1139, 15204, 5814]


## Generate the tf.data.Dataset

In [8]:
# seq_length: ids per example (also the sampler's window size in words);
# batch_size: examples per batch.
seq_length = 1000
batch_size = 10
# Read the first 95e6 bytes of enwik8, decode as UTF-8 and split into
# lowercase whitespace-delimited words.
with gzip.open('../reformer-pytorch/examples/enwik8_simple/data/enwik8.gz') as file:
    readed=file.read(int(95e6)).decode("utf-8") 
    dataset = readed.lower().split()
    
# Random-window sampler over the word list: each item is seq_length + 1 words.
sampler_dataset = TextSamplerDataset(dataset,seq_length)
# Tokenizer loaded from the model file at BPE_MODEL_PATH + ".model".
s = spm.SentencePieceProcessor()
s.Load(BPE_MODEL_PATH + ".model")


def generator_fn(dataset, tokenizer, bs, seq_len):
    """Yield ``(inputs, targets)`` id arrays, each exactly ``seq_len`` long.

    For every item of ``dataset`` the words are joined with spaces, encoded
    with ``tokenizer.encode_as_ids`` and then padded with 0 or truncated to
    ``seq_len`` ids. ``targets`` is that id sequence; ``inputs`` is the same
    sequence shifted right by one position with ``BOS_ID`` in front.

    Args:
        dataset: Object supporting ``len()`` and integer indexing, where each
            item is an iterable of strings.
        tokenizer: Object exposing ``encode_as_ids(str) -> list of int``.
        bs: Unused; kept so existing callers keep working.
        seq_len: Length of each yielded array.

    Yields:
        Tuple ``(inputs, targets)`` of NumPy integer arrays of length
        ``seq_len``.
    """
    for i in range(len(dataset)):
        line = ' '.join(dataset[i])
        encoded_id = list(tokenizer.encode_as_ids(line))
        # Pad whenever the encoding is shorter than seq_len. The previous
        # `< seq_len - 1` test skipped an encoding of exactly seq_len - 1 ids,
        # which then produced arrays one element too short.
        shortfall = seq_len - len(encoded_id)
        if shortfall > 0:
            encoded_id = encoded_id + [0] * shortfall
        else:
            encoded_id = encoded_id[:seq_len]
        inputs = np.array([BOS_ID] + encoded_id[:-1])
        targets = np.array(encoded_id)
        yield inputs, targets
        
    
        

In [9]:
# Instantiate the example generator. generator_fn accepts the batch size
# argument but does not use it, so examples are yielded one at a time.
generator = generator_fn(sampler_dataset,s,batch_size,seq_length)


In [10]:
# Peek at the first (inputs, targets) pair only. This consumes one element of
# `generator`, so re-running the cell shows the next pair.
for x in generator:
    print(x)
    print(x[0].shape,x[1].shape)
    break

(array([    3,   167,    62,  5331,    71, 20461, 11727,   691, 29963,
         515,   154,    28,  4410,   515,   165, 21785,  4410,   103,
          57,     6,  4085,    71,  3589,   114,  7192,     6,  1996,
          93,   995,     6,  2781,  1511, 11727,   579,    72,     6,
         165, 21785,    82,  3813,   100,    98,    17,  1082, 18972,
          41, 15794, 21979,   177,   355,   498,    41,   451, 29973,
        1267, 29942,  5687, 29961,    11,  5885,   397,     9,  2243,
         309,  6035,   243,  1704,    62,   355,    93,   541,  5634,
          25, 11534, 29962, 14048,    29,  1350,    41,  1201,   100,
          25,     6,   317, 13104,  1681, 29962,  2634,   870,   309,
        2613, 13803, 29963,   144,  2830,   144,   514,  2533,   687,
       29968,    32,    78,    21,   482,    66, 18609, 29962,  5713,
       29969,   165, 29946,   597,    44,    72,     6,   896,   555,
       29983,    96,  7776,  4107, 29962,   103,  8478,   702,  2840,
        4781, 29969

In [13]:
import functools

# Wrap generator_fn as a tf.data pipeline of (inputs, targets) int64 vectors
# of length seq_length, grouped into batches of batch_size.
d = tf.data.Dataset.from_generator(
    functools.partial(
        generator_fn,
        dataset=sampler_dataset,
        tokenizer=s,
        bs=batch_size,
        seq_len=seq_length,
    ),
    output_types=(tf.int64, tf.int64),
    output_shapes=([seq_length], [seq_length]),
).batch(batch_size)


In [14]:
# Pull a single batch from the pipeline and print it as a sanity check.
for i in d:
    print(i)
    break

(<tf.Tensor: shape=(10, 1000), dtype=int64, numpy=
array([[    3,    75,  2656, ...,   171, 29953, 29944],
       [    3,    33,  6485, ..., 29985,   366,   894],
       [    3, 15325, 29965, ...,   126, 26757, 20740],
       ...,
       [    3,   345,  2047, ...,   398,  3379, 29963],
       [    3,  6201, 29963, ..., 19657, 29963,  3193],
       [    3,   869,   371, ..., 29969,   146,   219]])>, <tf.Tensor: shape=(10, 1000), dtype=int64, numpy=
array([[   75,  2656, 29963, ..., 29953, 29944,  1209],
       [   33,  6485, 29957, ...,   366,   894, 29963],
       [15325, 29965, 29946, ..., 26757, 20740, 29989],
       ...,
       [  345,  2047,   221, ...,  3379, 29963,   278],
       [ 6201, 29963,  5461, ..., 29963,  3193, 29982],
       [  869,   371,   137, ...,   146,   219, 29969]])>)
