<a href="https://colab.research.google.com/github/annamaartensson/dd2424project/blob/issue%2F1/process_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import platform
import time
import pathlib
import os

# Load data

In [47]:
# Download settings: local cache folder, target file name, and source URL
# of the Project Gutenberg text file.
cache_dir = './tmp'
dataset_file_name = 'pg31100.txt'
dataset_file_origin = 'https://www.gutenberg.org/cache/epub/31100/pg31100.txt'

# Resolve the cache folder to an absolute path before handing it to Keras.
absolute_cache_dir = pathlib.Path(cache_dir).absolute()

# Fetch the file and keep the path it is stored under.
dataset_file_path = tf.keras.utils.get_file(
    fname=dataset_file_name,
    origin=dataset_file_origin,
    cache_dir=absolute_cache_dir,
)

print(dataset_file_path)

Downloading data from https://www.gutenberg.org/cache/epub/31100/pg31100.txt
/tmp/.keras/datasets/pg31100.txt


In [155]:
# Read the dataset file.
# - The context manager closes the file handle as soon as the text is read.
# - The encoding is explicit so the character offsets below do not depend on
#   the platform's default encoding. Plain 'utf-8' (not 'utf-8-sig') keeps the
#   leading '\ufeff' as the first character of `text`.
with open(dataset_file_path, mode='r', encoding='utf-8') as dataset_file:
    text = dataset_file.read()

# Hard-coded character offsets into `text` for each work in the collection.
Persuasion = text[1437:468297] # starting from Persuasion
Northanger_Abbey = text[468297:901707]
Mansfield_Park = text[901707:1784972]
Emma = text[1784972:2668012]
Lady_Susan = text[2668012:2795312]
Love_and_friendship = text[2795312:2980261]
Pride_and_predjudice = text[2980261:3665048]
# NOTE(review): every other slice starts where the previous one ends, but this
# one starts at 3682008 while the previous slice ends at 3665048 -- confirm the
# skipped characters are intentionally excluded.
Sense_and_sensibility = text[3682008:4355100]
# Everything from the start of Persuasion to the end of the last work.
entire_text = text[1437:4355100]
books = [Persuasion, Northanger_Abbey, Mansfield_Park, Emma, Lady_Susan, Love_and_friendship, Pride_and_predjudice, Sense_and_sensibility]

print('Length of text: {} characters'.format(len(text)))

Length of text: 4373619 characters


In [95]:
# Peek at a 60-character window of the text: turn newlines into spaces, then
# split on single spaces (consecutive separators therefore yield '' tokens).
window = text[87140:87200]
t = window.replace("\n", " ").split(" ")
print(t)
print(len(t))

['s', 'in', 'this', 'country."', '', 'Anne', 'hoped', 'she', 'had', 'outlived', 'the', 'age', 'of', '']
14


In [111]:
def find_indexes(start, end, book):
  """Print the position in `text` of every occurrence of `book` within text[start:end].

  Args:
    start: Index into the notebook-level `text` where the search window begins.
    end: Index into `text` where the search window ends (exclusive).
    book: The string to search for.

  Returns:
    None. Each match is printed as the search string followed by the index of
    the match measured from the beginning of `text`.
  """
  if not book:
    # An empty pattern would "match" at every position; report nothing instead.
    return
  t = text[start:end]
  # str.find does the scanning natively instead of slicing a new substring at
  # every position. Resuming one character after each hit keeps overlapping
  # matches, exactly like a position-by-position scan.
  i = t.find(book)
  while i != -1:
    print(book,": ", i+start)
    i = t.find(book, i + 1)

In [142]:
# Search the tail of the file (from offset 2980261 up to 4373619) for the
# closing Project Gutenberg marker line.
find_indexes(
    start=2980261,
    end=4373619,
    book="END OF THE PROJECT GUTENBERG EBOOK THE COMPLETE PROJECT GUTENBERG WORKS OF JANE AUSTEN",
)

END OF THE PROJECT GUTENBERG EBOOK THE COMPLETE PROJECT GUTENBERG WORKS OF JANE AUSTEN :  4355105


In [150]:
def find_chapters(book):
  """Split `book` into segments at every "Chapter" / "CHAPTER" heading.

  Args:
    book: Full text of one book as a string.

  Returns:
    A list of strings. When at least one heading is found, the first element
    is whatever precedes the first heading (possibly an empty string), every
    following element starts at a heading and runs up to the next one, and the
    last element runs to the end of `book`; joined together the elements
    reproduce `book`. When no heading is found the list is empty.
  """
  headings = ("Chapter", "CHAPTER")
  index=0
  chapters=[]
  for i in range(len(book)):
    # startswith with an offset checks both spellings without building slices.
    if book.startswith(headings, i):
      chapters.append(book[index:i])
      index=i
  if chapters:
    # The loop only closes a segment when the *next* heading is seen, so the
    # final chapter has to be added explicitly or it would be dropped.
    chapters.append(book[index:])
  return chapters

In [151]:
# Split the text of Emma into segments at its chapter headings.
chapters_emma = find_chapters(book=Emma)

In [6]:
# Vocabulary: every distinct character that occurs in the file, in sorted order.
vocab = list(set(text))
vocab.sort()

print('{} unique characters'.format(len(vocab)))
print('vocab:', vocab)

97 unique characters
vocab: ['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '^', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '}', 'à', 'ê', '—', '‘', '’', '“', '”', '•', '™', '\ufeff']


In [9]:
# Lookup table from each character to its position in the vocabulary.
char_to_ind = dict(zip(vocab, range(len(vocab))))
# Reverse lookup: array whose i-th entry is the i-th vocabulary character.
ind_to_char = np.array(vocab)
print(ind_to_char)

['\n' ' ' '!' '"' '#' '$' '%' '&' "'" '(' ')' '*' ',' '-' '.' '/' '0' '1'
 '2' '3' '4' '5' '6' '7' '8' '9' ':' ';' '?' 'A' 'B' 'C' 'D' 'E' 'F' 'G'
 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W' 'X' 'Y'
 'Z' '[' ']' '^' '_' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm'
 'n' 'o' 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z' '{' '}' 'à' 'ê' '—'
 '‘' '’' '“' '”' '•' '™' '\ufeff']


In [12]:
# Encode the whole text as an array of vocabulary indices, one per character.
encoded_chars = list(map(char_to_ind.__getitem__, text))
text_as_int = np.array(encoded_chars)
print('text_as_int length: {}'.format(len(text_as_int)))
print('{} --> {}'.format(repr(text[:15]), repr(text_as_int[:15])))

text_as_int length: 711352
'\ufeffThe Project Gu' --> array([96, 48, 66, 63,  1, 44, 76, 73, 68, 63, 61, 78,  1, 35, 79])


# Splitting into training, validation, and test sets

In [13]:
# Sequence length setting used when building the dataset.
seq_length = 25

In [16]:
def split_input_target(chunk):
    """Turn one chunk into an (input, target) pair shifted by one position.

    The input is `chunk` without its last element and the target is `chunk`
    without its first element, so target[i] is the element following input[i].
    """
    return chunk[:-1], chunk[1:]

In [21]:
def reading_data(text_as_int, seq_length):
  """Build a tf.data pipeline of (input, target) pairs from `text_as_int`.

  The elements of `text_as_int` are grouped into consecutive chunks of
  seq_length + 1 items (an incomplete final chunk is dropped), and each chunk
  is passed through `split_input_target`.

  Args:
    text_as_int: Data to slice element by element into a dataset.
    seq_length: Each chunk holds seq_length + 1 elements.

  Returns:
    The mapped tf.data dataset.
  """
  return (
      tf.data.Dataset.from_tensor_slices(text_as_int)
      .batch(seq_length + 1, drop_remainder=True)
      .map(split_input_target)
  )

In [19]:
# Build the (input, target) dataset in this cell so it runs on a fresh kernel
# instead of relying on a `dataset` variable left over from an earlier session.
dataset = reading_data(text_as_int, seq_length)

# Show the first example: the sizes of both sequences and their decoded text.
for input_example, target_example in dataset.take(1):
    print('Input sequence size:', repr(len(input_example.numpy())))
    print('Target sequence size:', repr(len(target_example.numpy())))
    print()
    print('Input:', repr(''.join(ind_to_char[input_example.numpy()])))
    print('Target:', repr(''.join(ind_to_char[target_example.numpy()])))

Input sequence size: 25
Target sequence size: 25

Input: '\ufeffThe Project Gutenberg eB'
Target: 'The Project Gutenberg eBo'


In [20]:
# Batch size.
BATCH_SIZE = 64

# Buffer size to shuffle the dataset (TF data is designed to work
# with possibly infinite sequences, so it doesn't attempt to shuffle
# the entire sequence in memory. Instead, it maintains a buffer in
# which it shuffles elements).
BUFFER_SIZE = 10000

# Rebuild the sequence dataset from its source rather than reassigning
# `dataset = dataset.shuffle(...)`: that form batches an already batched
# dataset when the cell is run a second time.
dataset = (
    reading_data(text_as_int, seq_length)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

<_BatchDataset element_spec=(TensorSpec(shape=(64, 25), dtype=tf.int64, name=None), TensorSpec(shape=(64, 25), dtype=tf.int64, name=None))>