In [26]:
import os
import string
import time

import matplotlib.pyplot as plt
import tensorflow as tf
from decouple import config
from dotenv import load_dotenv
from pprint import pprint

from utils import load_tokenizer

# Load variables from a .env file (python-dotenv) before any settings are read.
load_dotenv()

True

In [4]:
# Filesystem locations, resolved through python-decouple's config().
DATASET_PATH = config("DATASET_PATH")  # directory whose files make up the raw corpus
TRAIN_DATASET_PATH = config("TRAIN_DATASET_PATH")  # where the built tf.data dataset is saved
TOKENIZER_DATA_PATH = config("TOKENIZER_DATA_PATH")  # handed to load_tokenizer()

# Dataset-shaping parameters, parsed as integers.
BLOCK_SIZE = config("BLOCK_SIZE", cast=int)  # tokens per example window, e.g. 100
BATCH_SIZE = config("BATCH_SIZE", cast=int)  # examples per batch, e.g. 12
BUFFER_SIZE = config("BUFFER_SIZE", cast=int)  # shuffle buffer size, e.g. 1000

## Raw Data

In [6]:
# loading tokenizer from the saved model path
tokenizer = load_tokenizer(TOKENIZER_DATA_PATH)

# TODO: Avoid loading the whole text in memory
# Read every file in DATASET_PATH. The listing is sorted because os.listdir()
# returns entries in arbitrary order, which would otherwise make the
# concatenated corpus (and the dataset built from it) differ between runs.
documents = []
for filename in sorted(os.listdir(DATASET_PATH)):
    with open(os.path.join(DATASET_PATH, filename), "r", encoding='utf-8') as f:
        documents.append(f.read())

# raw_data: the files concatenated back to back.
# single_string: the same text with the tokenizer's EOS token appended after
# each file (including the last one).
# Joining once avoids re-copying an ever-growing string on every `+=`.
raw_data = ''.join(documents)
single_string = ''.join(doc + tokenizer.eos_token for doc in documents)

# The per-file copies are no longer needed; release them to limit memory use.
del documents




#### Check presence of bad characters

In [33]:
# Characters treated as "clean": ASCII letters (lowercase, then uppercase)
# followed by the decimal digits.
accepted_chars = [*string.ascii_letters, *string.digits]
print(accepted_chars)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


In [29]:
# Histograms used by the character / word sanity checks.
# Only bad_tokenized_char_map is filled in this cell; the word maps are filled
# by the "bad words" cell, and the two full character maps stay empty unless
# the commented-out loops below are re-enabled.
raw_char_map = {}
tokenized_char_map = {}
raw_word_map = {}
tokenized_word_map = {}
bad_tokenized_char_map = {}
# for c in raw_data:
#     if c not in raw_char_map:
#         raw_char_map[c]  = 0
#     raw_char_map[c] += 1

# for c in single_string:
#     if c not in tokenized_char_map:
#         tokenized_char_map[c] = 0
#     tokenized_char_map[c] += 1

# Count every character of the EOS-joined corpus that is not in accepted_chars.
# Membership is tested against a set: the corpus is scanned one character at a
# time, and `c not in accepted_chars` on the list would rescan the whole list
# for each of them.
accepted_char_set = set(accepted_chars)
for c in single_string:
    if c not in accepted_char_set:
        bad_tokenized_char_map[c] = bad_tokenized_char_map.get(c, 0) + 1
        
    

In [30]:
# Show each non-accepted character with its occurrence count.
pprint(bad_tokenized_char_map)

{'\n': 38776,
 ' ': 46386498,
 '!': 1937,
 '"': 15120,
 '#': 112,
 '$': 7343,
 '%': 11365,
 '&': 881,
 "'": 1590471,
 '(': 302,
 ')': 320,
 '*': 132,
 '+': 183,
 ',': 376754,
 '-': 184756,
 '.': 594363,
 '/': 27593,
 '0': 197714,
 '1': 114146,
 '2': 109295,
 '3': 36388,
 '4': 37358,
 '5': 48092,
 '6': 31464,
 '7': 26157,
 '8': 28787,
 '9': 43312,
 ':': 54333,
 ';': 786,
 '<': 23253,
 '=': 5,
 '>': 226376,
 '?': 56343,
 '@': 60,
 'A': 2846188,
 'B': 627057,
 'C': 1016021,
 'D': 1261593,
 'E': 4094712,
 'F': 640342,
 'G': 812800,
 'H': 1888432,
 'I': 2887485,
 'J': 143062,
 'K': 361707,
 'L': 1301793,
 'M': 822111,
 'N': 2439447,
 'O': 2708600,
 'P': 716213,
 'Q': 24676,
 'R': 1950126,
 'S': 2216479,
 'T': 3503084,
 'U': 1011740,
 'V': 362212,
 'W': 813863,
 'X': 50985,
 'Y': 784455,
 'Z': 25675,
 '[': 10033,
 ']': 10022,
 '^': 4,
 '_': 154,
 '~': 5,
 '\xa0': 228,
 '¬': 1,
 '\xad': 7,
 '®': 3,
 '°': 3,
 '´': 8,
 'º': 1,
 '¾': 2,
 'À': 3,
 'Á': 5,
 'È': 2,
 'É': 40,
 'Ï': 11,
 'Ñ': 18,
 '

#### Check presence of bad words

In [None]:
# Tally space-delimited "words" for both corpora: the plain concatenation
# (raw_data) and the EOS-joined text (single_string). Splitting is on the
# single space character only, so newlines stay attached to their words and
# consecutive spaces produce empty-string entries.
for word_map, text in ((raw_word_map, raw_data), (tokenized_word_map, single_string)):
    for word in text.split(' '):
        word_map[word] = word_map.get(word, 0) + 1

In [None]:
# Print every distinct word of the EOS-joined corpus with its count.
# NOTE(review): this prints the whole map; consider showing only a slice if the vocabulary is large.
pprint(tokenized_word_map)

## Tokenization

In [None]:
# Each example is split into inputs (all but the last token) and labels (all
# but the first token), so a block needs at least two tokens to be usable.
if BLOCK_SIZE < 2:
    raise ValueError(f"BLOCK_SIZE must be at least 2, got {BLOCK_SIZE}")

print('Tokenizing dataset...')
# perf_counter() is a monotonic clock, so the measured duration cannot be
# skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
string_tokenized = tokenizer.encode(single_string)
tokenization_time = execution_time = time.perf_counter() - start_time
print("Finished in:", execution_time)

# Cut the token stream into consecutive, non-overlapping windows of BLOCK_SIZE
# tokens; a trailing partial window is dropped.
examples = [
    string_tokenized[i:i + BLOCK_SIZE]
    for i in range(0, len(string_tokenized) - BLOCK_SIZE + 1, BLOCK_SIZE)
]
if not examples:
    # Fail loudly instead of silently saving an empty dataset.
    raise ValueError(
        f"Corpus has {len(string_tokenized)} tokens, fewer than BLOCK_SIZE={BLOCK_SIZE}; "
        "no training example can be built"
    )

# Next-token prediction pairs: labels are the inputs shifted left by one token.
inputs = [ex[:-1] for ex in examples]
labels = [ex[1:] for ex in examples]

dataset = tf.data.Dataset.from_tensor_slices((inputs, labels))
# NOTE(review): the dataset is shuffled and batched before being saved, so the
# saved artifact stores already-batched elements — confirm this is what the
# training code expects.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
dataset.save(TRAIN_DATASET_PATH)