In [11]:
import time
from collections import namedtuple

import numpy as np
import tensorflow as tf

In [12]:
with open('anna.txt', 'r') as f:
    text=f.read()
vocab = set(text)

vocab_to_int = {c: i for i, c in enumerate(vocab)}
int_to_vocab = dict(enumerate(vocab))

chars = np.array([vocab_to_int[c] for c in text], dtype=np.int32)

In [13]:
print(len(vocab))

83


In [None]:
def split_data(chars, batch_size, num_steps, split_frac=0.9):
    """ 
    Split character data into training and validation sets, inputs and targets for each set.
    
    Arguments
    ---------
    chars: character array
    batch_size: Size of examples in each of batch
    num_steps: Number of sequence steps to keep in the input and pass to the network
    split_frac: Fraction of batches to keep in the training set
    
    
    Returns train_x, train_y, val_x, val_y
    """
    print("size (chars)",len(chars))
    print("batch_size: ",batch_size, " num_steps: ",num_steps, " split_frac: ",split_frac)
    
    slice_size = batch_size * num_steps
    print("slice_size: ",slice_size)
    
    n_batches = int(len(chars) / slice_size)
    print("num_batches: ",n_batches)
    
    # Drop the last few characters to make only full batches
    x = chars[: n_batches*slice_size]
    print("x: ",x, "len: ",len(x))
    
    y = chars[1: n_batches*slice_size + 1]
    print("y: ",y, "len: ",len(y))
    
    # Split the data into batch_size slices, then stack them into a 2D matrix 
    x = np.stack(np.split(x, batch_size))
    print("x: \n",x, " shape: ",x.shape)
    
    y = np.stack(np.split(y, batch_size))
    print("y: \n",y, " shape: ",y.shape)
    # Now x and y are arrays with dimensions batch_size x n_batches*num_steps
    
    # Split into training and validation sets, keep the virst split_frac batches for training
    split_idx = int(n_batches*split_frac)
    print("split_size: ",split_idx)
    print("reduced_x_y for training: ",split_idx*num_steps)
    
    train_x, train_y= x[:, :split_idx*num_steps], y[:, :split_idx*num_steps]
    print("train_x: \n",train_x," shape: ",train_x.shape)
    print("train_y: \n",train_y," shape: ",train_y.shape)
    
    val_x, val_y = x[:, split_idx*num_steps:], y[:, split_idx*num_steps:]
    print("val_x: \n",train_x," shape: ",val_x.shape)
    print("val_y: \n",train_y," shape: ",val_y.shape)
    
    return train_x, train_y, val_x, val_y

In [29]:
#Now I'll make my data sets and we can check out what's going on here. Here I'm going to use a batch size of 10 and 50 sequence steps.
train_x, train_y, val_x, val_y = split_data(chars, batch_size=10000, num_steps=50)

size (chars) 1985223
batch_size:  10000  num_steps:  50  split_frac:  0.9
slice_size:  500000
num_batches:  3
x:  [72 52 81 ..., 74 60  7] len:  1500000
y:  [52 81 11 ..., 60  7 69] len:  1500000
x: 
 [[72 52 81 ..., 59  3 43]
 [50 78 59 ..., 76 60 39]
 [52 78 30 ..., 14 59 81]
 ..., 
 [30 81 59 ..., 40 43 20]
 [ 4 32 39 ..., 73 59 81]
 [20 14 59 ..., 74 60  7]]  shape:  (10000, 150)
y: 
 [[52 81 11 ...,  3 43 50]
 [78 59 52 ..., 60 39 52]
 [78 30 59 ..., 59 81 20]
 ..., 
 [81 59  4 ..., 43 20  4]
 [32 39 18 ..., 59 81 20]
 [14 59 13 ..., 60  7 69]]  shape:  (10000, 150)
split_size:  2
reduced_x_y for training:  100
train_x: 
 [[72 52 81 ..., 52 43 20]
 [50 78 59 ..., 59 21 60]
 [52 78 30 ..., 52 81 14]
 ..., 
 [30 81 59 ...,  2 43 11]
 [ 4 32 39 ..., 43 14 59]
 [20 14 59 ...,  4 60 59]]  shape:  (10000, 100)
train_y: 
 [[52 81 11 ..., 43 20 21]
 [78 59 52 ..., 21 60 40]
 [78 30 59 ..., 81 14 59]
 ..., 
 [81 59  4 ..., 43 11  4]
 [32 39 18 ..., 14 59 81]
 [14 59 13 ..., 60 59 76]]  sha

In [27]:
np.save("train_x.npy",train_x)
np.save("train_y.npy",train_y)

In [21]:
def get_train_batches(train_x, train_y, batch_size):
    for i in range(0, train_x.shape[0], batch_size):
        print("getting: ",i," to ", i+batch_size)
        yield train_x[i : i+batch_size], train_y[i : i+batch_size]

### Defining the below so that other files can use it for generating texts

In [30]:
import pickle

In [31]:
output = open('vocab_to_int.txt','ab+')

In [32]:
pickle.dump(vocab_to_int,output)

In [33]:
output.close()

In [34]:
output2 = open('int_to_vocab.txt','ab+')

In [35]:
pickle.dump(int_to_vocab, output2)

In [36]:
output2.close()