In [59]:
import tensorflow as tf
import numpy as np
from ftfy import fix_text
import sentencepiece as spm
from collections import Counter
import datetime
import pandas as pd
import tqdm
import glob
import csv        

In [83]:
text_files = glob.glob("../data/scraped/*.txt") 
PROCESS_DATA_PATH = "../data/processed.txt"
BPE_TSV_PATH = "../data/bpe_spm.tsv'"
BPE_MODEL_PATH = "../data/bpe_model"
VOCAB_SIZE = 32000
TF_RECORDS = "../data/tf_records/"
BOS_ID = 3
EOS_ID = 4

In [88]:
file_writer = open(PROCESS_DATA_PATH, "w")
for file_name in tqdm.tqdm(text_files):
    fr = open(file_name,'r')
    file_writer.writelines([fix_text(line, normalization='NFKC') for line in fr.readlines()])
    fr.close
file_writer.close()

100%|██████████| 746/746 [00:04<00:00, 150.24it/s]


In [21]:
token_dict = Counter()
with open(PROCESS_DATA_PATH,'r') as fr:
    for line in tqdm.tqdm(fr):
        token_dict.update(line.split())


30411it [00:00, 143923.24it/s]


In [24]:
with open(BPE_TSV_PATH, 'w', newline='') as f_output:
    tsv_output = csv.writer(f_output, delimiter='\t')
    for word in token_dict:
        tsv_output.writerow([word, token_dict[word]])

In [None]:
spm_model = "/disk4/snm2.0_2019/data/edge_corpus/edge_spm"

In [29]:
spmcmd = '--input={spm_input} --model_prefix={spm_model} --input_format=tsv --vocab_size={vocab_size} --user_defined_symbols=[SEP],[BOS],[EOS] --hard_vocab_limit=false --model_type=bpe --pad_id=0 --unk_id=1 --bos_id=-1 --eos_id=-1 --pad_piece=[PAD] --unk_piece=[UNK]'.format(spm_input=BPE_TSV_PATH, spm_model=BPE_MODEL_PATH, vocab_size=VOCAB_SIZE)


In [32]:
spm.SentencePieceTrainer.train(spmcmd)

True

In [41]:
s = spm.SentencePieceProcessor()
s.Load(BPE_MODEL_PATH + ".model")

True

In [51]:
s.encode_as_pieces("[EOS]")

['▁', '[EOS]']

In [66]:
MIN_SEQ_LEN = 10
MAX_SEQ_LEN = 512
per_file_limit = 100000
filename = TF_RECORDS + str(datetime.datetime.now().timestamp()) + ".tfrecord"
tf_writer = tf.io.TFRecordWriter(filename)
doc_counts = 0

with open(PROCESS_DATA_PATH,'r') as f:
    for line in f:
        encoded_id = s.encode_as_ids(line)
        if len(encoded_id) < MAX_SEQ_LEN and len(encoded_id) > 10:
            inputs = np.array([BOS_ID] + encoded_id)
            targets = np.array(encoded_id + [EOS_ID])
            
            example = serialize_example(inputs, targets)
            tf_writer.write(example)
            doc_counts +=1
        if doc_counts >= per_file_limit:
            tf_writer.write(example)
            doc_counts = 0
            tf_writer.close()
            filename = output_dir + str(datetime.datetime.now().timestamp()) + ".tfrecord"
            tf_writer = tf.io.TFRecordWriter(filename)    

In [58]:
def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

def serialize_example(inputs, targets):
    feature = {
        'inputs': _int64_feature(inputs),
        'targets': _int64_feature(targets)
    }
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()

In [56]:
serialize_example([2,3])

b'\n\x12\n\x10\n\x06inputs\x12\x06\x1a\x04\n\x02\x02\x03'

In [76]:
def parse_example(serialized_example):
    data_fields = {
        "inputs": tf.io.VarLenFeature(tf.int64),
        "targets": tf.io.VarLenFeature(tf.int64)
    }
    parsed = tf.io.parse_single_example(serialized_example, data_fields)
    inputs = tf.sparse.to_dense(parsed["inputs"])
    targets = tf.sparse.to_dense(parsed["targets"])

    inputs = tf.cast(inputs, tf.int32)
    targets = tf.cast(targets, tf.int32)

    return inputs, targets

def input_fn(tf_recods, batch_size=32, padded_shapes= ([-1], [-1]), epoch=10, buffer_size=10000):
    if type(tf_recods) is str:
        tf_recods = [tf_recods]
    dataset = tf.data.TFRecordDataset(tf_recods, buffer_size=10000)
    dataset = dataset.shuffle(buffer_size=buffer_size)

    dataset = dataset.map(parse_example)
    dataset = dataset.padded_batch(batch_size, padded_shapes=padded_shapes)
    dataset = dataset.repeat(epoch)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return dataset

In [86]:
dataset = input_fn(glob.glob((TF_RECORDS + "*.tfrecord")))

['../data/tf_records/1566726380.168402.tfrecord']


In [87]:
for inp, tar in dataset:
    print(inp)
    print(tar)
    break

tf.Tensor(
[[    3  3419 31904 ...     0     0     0]
 [    3   100   483 ...     0     0     0]
 [    3    64 31904 ...     0     0     0]
 ...
 [    3   108  1438 ...     0     0     0]
 [    3   100 14810 ...     0     0     0]
 [    3   101 31904 ...     0     0     0]], shape=(32, 339), dtype=int32)
tf.Tensor(
[[ 3419 31904 31883 ...     0     0     0]
 [  100   483  4482 ...     0     0     0]
 [   64 31904 31890 ...     0     0     0]
 ...
 [  108  1438  1694 ...     0     0     0]
 [  100 14810  6112 ...     0     0     0]
 [  101 31904  3620 ...     0     0     0]], shape=(32, 339), dtype=int32)
