<a href="https://colab.research.google.com/github/annamaartensson/dd2424project/blob/issue%2F1/process_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [191]:
import tensorflow as tf
import numpy as np
import time
import pathlib
import os

# Load data

In [192]:
cache_dir = './tmp'
dataset_file_name = 'pg31100.txt'
dataset_file_origin = 'https://www.gutenberg.org/cache/epub/31100/pg31100.txt'

dataset_file_path = tf.keras.utils.get_file(
    fname=dataset_file_name,
    origin=dataset_file_origin,
    cache_dir=pathlib.Path(cache_dir).absolute()
)

print(dataset_file_path)

/tmp/.keras/datasets/pg31100.txt


In [193]:
# Reading the database file.
text = open(dataset_file_path, mode='r').read()
Persuasion = text[1437:468297] # starting from Persuasion
Northanger_Abbey = text[468297:901707]
Mansfield_Park = text[901707:1784972]
Emma = text[1784972:2668012]
Lady_Susan = text[2668012:2795312]
Love_and_friendship = text[2795312:2980261]
Pride_and_predjudice = text[2980261:3665048]
Sense_and_sensibility = text[3682008:4355100]
entire_text = text[1437:4355100]
books = [Persuasion, Northanger_Abbey, Mansfield_Park, Emma, Lady_Susan, Love_and_friendship, Pride_and_predjudice, Sense_and_sensibility]

print('Length of text: {} characters'.format(len(entire_text)))

Length of text: 4353663 characters


In [194]:
def find_indexes(start, end, book):
  t = text[start:end]
  for i in range(len(t)):
    if t[i:i+len(book)]==book:
      print(book,": ", i+start)

In [195]:
def find_chapters(book):
  index=0
  chapters=[]
  for i in range(len(book)):
    if book[i:i+len("Chapter")]=="Chapter" or book[i:i+len("CHAPTER")]=="CHAPTER":
      chapters.append(book[index:i])
      index=i
  return chapters

In [196]:
chapters_emma = find_chapters(Emma)

In [197]:
# One-hot encoding look-up
vocab = sorted(set(entire_text))
char_to_ind = tf.keras.layers.StringLookup(vocabulary = list(vocab), mask_token = None)
ind_to_char = tf.keras.layers.StringLookup(vocabulary = char_to_ind.get_vocabulary(), invert = True, mask_token = None)

# Splitting into training, verification, testing

In [198]:
seq_length=25
K = len(vocab)

In [199]:
# one hot encoding matrices
training_text = text[1437:2980261] # Persuasion --> Love and friendship
training_matrix = tf.transpose(tf.one_hot(char_to_ind(training_text), K))
validation_matrix = tf.transpose(tf.one_hot(char_to_ind(Pride_and_predjudice), K))
testing_matrix = tf.transpose(tf.one_hot(char_to_ind(Sense_and_sensibility), K))

# Dicionary for seeing how many words are correctly spelled

In [254]:
import re
Austen_dict = {}
no_spec=re.sub("\&|\[|\]|\_|!|\?|\*|\.|,|\(|\)|;|:|[0-9]+|\"|\'","", entire_text)
no_enter = re.sub("\n|-"," ", no_spec)
Austen_dict = {word.lower() for word in no_enter.split(" ")}

In [256]:
print(len(Austen_dict))

14850


In [247]:
print(vocab)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


# Doing it the way they did in the Shakespeare example

In [200]:
def split_input_target(chunk):
    X = chunk[:-1]
    Y = chunk[1:]
    return X, Y

In [201]:
def reading_data(text_as_int, seq_length):
  char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
  sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)
  dataset = sequences.map(split_input_target)
  return dataset

In [202]:
# Map characters to their indices in vocabulary.
char_to_ind = {char: index for index, char in enumerate(vocab)}
# Map character indices to characters from vacabulary.
ind_to_char = np.array(vocab)

In [203]:
# Convert chars in text to indices.
training_as_int = np.array([char_to_ind[char] for char in training_text])
validation_as_int = np.array([char_to_ind[char] for char in Pride_and_predjudice])
testing_as_int = np.array([char_to_ind[char] for char in Sense_and_sensibility])

In [204]:
training_set = reading_data(training_as_int, seq_length)
validation_set = reading_data(validation_as_int, seq_length)
testing_set = reading_data(testing_as_int, seq_length)

In [205]:
# Batch size.
BATCH_SIZE = 64

# Buffer size to shuffle the dataset (TF data is designed to work
# with possibly infinite sequences, so it doesn't attempt to shuffle
# the entire sequence in memory. Instead, it maintains a buffer in
# which it shuffles elements).
BUFFER_SIZE = 10000

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

dataset

<_BatchDataset element_spec=(TensorSpec(shape=(64, 64, 64, 25), dtype=tf.int64, name=None), TensorSpec(shape=(64, 64, 64, 25), dtype=tf.int64, name=None))>