In [1]:
#Do all necessary imports

import tensorflow as tf
import tensorflow_datasets as tfds
import os

  from ._conv import register_converters as _register_converters


In [128]:
# Import Data

# Difficulty sub-directories of the dataset folder and the task files read from each of them
DIR_NAMES = ['train-easy', 'train-medium', 'train-hard']
FILE_NAMES = ['algebra__linear_1d.txt']

# Number of (input_sentence, answer) pairs held in the shuffle buffer
BUFFER_SIZE = 50000

# The data is read from the "Dataset" folder located in the parent of the working directory
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
dataset_dir = os.path.join(parent_dir, 'Dataset')

# Based on https://www.tensorflow.org/tutorials/load_data/text

# Read in all files which are in FILE_NAMES (one line-wise dataset per file and difficulty)

labeled_data_sets = []


for file_name in FILE_NAMES:
    for dir_name in DIR_NAMES:
        file_path = os.path.join(dataset_dir, dir_name, file_name)
        lines_dataset = tf.data.TextLineDataset(file_path)
        labeled_data_sets.append(lines_dataset)

# Group the lines of every file as batches of two (input_sentence, answer).
# Pairing is done per file and an unpaired trailing line is dropped, so an odd line
# count in one file can neither shift the pairing of the files concatenated after it
# nor produce a final batch that has no answer.

paired_data_sets = [
    lines_dataset.batch(2, drop_remainder=True) for lines_dataset in labeled_data_sets
]

# Concatenate all File Data to one Big File Data

all_labeled_data = paired_data_sets[0]
for paired_dataset in paired_data_sets[1:]:
    all_labeled_data = all_labeled_data.concatenate(paired_dataset)

# Make two independent Tensors as Tuple (Not needed)

# all_labeled_data = all_labeled_data.map(lambda x: (x[0], x[1]))

# Shuffle the Data (the order is fixed after the first pass, it is not reshuffled per iteration)

all_labeled_data = all_labeled_data.shuffle(BUFFER_SIZE, reshuffle_each_iteration=False)


# One Hot Encode the Characters
# https://stackoverflow.com/questions/49370940/one-hot-encoding-characters

alphabet = " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789*+-.=/()?,'>:<!{}"

# Lookup table that maps every char of alphabet to a unique int based on its position.
# Built once here instead of on every call of convert_to_onehot_tensor.
_CHAR_TO_INT = {char: index for index, char in enumerate(alphabet)}


def convert_to_onehot_tensor(data):
    """One-hot encode a scalar string tensor character by character.

    Args:
        data: Eager scalar string tensor (as handed over by ``tf.py_function``)
            holding UTF-8 encoded text.

    Returns:
        ``tf.int32`` tensor of shape ``(number_of_characters, len(alphabet))``.
        Row ``i`` contains a single 1 at the alphabet position of the i-th
        character. An empty string yields a tensor of shape
        ``(0, len(alphabet))``.

    Raises:
        KeyError: If the text contains a character that is not in ``alphabet``.
    """
    text = data.numpy().decode("utf-8")

    # Replaces every char in the text with the mapped int
    indices = [_CHAR_TO_INT[char] for char in text]

    # The explicit int32 dtype keeps an empty index list from being converted to a
    # float tensor, which tf.one_hot would reject.
    indices = tf.constant(indices, dtype=tf.int32)

    # Replaces every int by a one-hot row with size of the alphabet
    return tf.one_hot(indices, depth=len(alphabet), dtype=tf.int32)

# Map to Python Function from Tensorflow

def one_hot_encode_map(x):
    """Map an (input_sentence, answer) string pair to a pair of one-hot tensors.

    Args:
        x: String tensor whose first two entries are the input sentence and
            its answer.

    Returns:
        Tuple ``(encoded_input, encoded_answer)`` of ``tf.int32`` tensors, each
        with static shape ``(None, len(alphabet))``.
    """
    def encode(text):
        encoded = tf.py_function(convert_to_onehot_tensor, inp=[text], Tout=tf.int32)
        # tf.py_function returns tensors without static shape information; restore the
        # known rank and alphabet axis so downstream ops can rely on them.
        encoded.set_shape([None, len(alphabet)])
        return encoded

    return (encode(x[0]), encode(x[1]))
        
#Map all Datapoints to One Hot Labeled Datapoints Character Wise
    
all_labeled_data = all_labeled_data.map(one_hot_encode_map)


# Print out one Sample Inputs to see Format
# take(1) limits the iteration to the first element, no manual counter/break needed

for x, y in all_labeled_data.take(1):
    print(x)
    print(y)

tf.Tensor(
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]], shape=(30, 80), dtype=int32)
tf.Tensor(
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
  0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0]], shape=(2, 80), dtype=int32)


In [71]:
# Show the list of line datasets collected in labeled_data_sets
print(labeled_data_sets)

[<TextLineDatasetV2 shapes: (), types: tf.string>, <TextLineDatasetV2 shapes: (), types: tf.string>, <TextLineDatasetV2 shapes: (), types: tf.string>]
