In [1]:
import time
from collections import namedtuple

import numpy as np
import tensorflow as tf

In [13]:
with open('anna_small.txt', 'r') as f:
    text=f.read()
vocab = set(text)

vocab_to_int = {c: i for i, c in enumerate(vocab)}
int_to_vocab = dict(enumerate(vocab))

chars = np.array([vocab_to_int[c] for c in text], dtype=np.int32)

In [14]:
print(len(vocab))

49


In [7]:
def split_data(chars, batch_size, num_steps, split_frac=0.9):
    """ 
    Split character data into training and validation sets, inputs and targets for each set.
    
    Arguments
    ---------
    chars: character array
    batch_size: Size of examples in each of batch
    num_steps: Number of sequence steps to keep in the input and pass to the network
    split_frac: Fraction of batches to keep in the training set
    
    
    Returns train_x, train_y, val_x, val_y
    """
    print("size (chars)",len(chars))
    print("batch_size: ",batch_size, " num_steps: ",num_steps, " split_frac: ",split_frac)
    
    slice_size = batch_size * num_steps
    print("slice_size: ",slice_size)
    
    n_batches = int(len(chars) / slice_size)
    print("num_batches: ",n_batches)
    
    # Drop the last few characters to make only full batches
    x = chars[: n_batches*slice_size]
    print("x: ",x, "len: ",len(x))
    
    y = chars[1: n_batches*slice_size + 1]
    print("y: ",y, "len: ",len(y))
    
    # Split the data into batch_size slices, then stack them into a 2D matrix 
    x = np.stack(np.split(x, batch_size))
    print("x: \n",x, " shape: ",x.shape)
    
    y = np.stack(np.split(y, batch_size))
    print("y: \n",y, " shape: ",y.shape)
    # Now x and y are arrays with dimensions batch_size x n_batches*num_steps
    
    # Split into training and validation sets, keep the virst split_frac batches for training
    split_idx = int(n_batches*split_frac)
    print("split_size: ",split_idx)
    print("reduced_x_y for training: ",split_idx*num_steps)
    
    train_x, train_y= x[:, :split_idx*num_steps], y[:, :split_idx*num_steps]
    print("train_x: \n",train_x," shape: ",train_x.shape)
    print("train_y: \n",train_y," shape: ",train_y.shape)
    
    val_x, val_y = x[:, split_idx*num_steps:], y[:, split_idx*num_steps:]
    print("val_x: \n",train_x," shape: ",val_x.shape)
    print("val_y: \n",train_y," shape: ",val_y.shape)
    
    return train_x, train_y, val_x, val_y

#Now I'll make my data sets and we can check out what's going on here. Here I'm going to use a batch size of 10 and 50 sequence steps.
train_x, train_y, val_x, val_y = split_data(chars, batch_size=10, num_steps=50)

size (chars) 2906
batch_size:  10  num_steps:  50  split_frac:  0.9
slice_size:  500
num_batches:  5
x:  [36 37 21 ..., 47 32 20] len:  2500
y:  [37 21 48 ..., 32 20  0] len:  2500
x: 
 [[36 37 21 ...,  0 37  0]
 [47 31 32 ..., 32 15 20]
 [21 48 21 ..., 42 32 15]
 ..., 
 [42 32 47 ...,  9  0 15]
 [15  9 21 ...,  0 15 32]
 [ 0 47 15 ..., 47 32 20]]  shape:  (10, 250)
y: 
 [[37 21 48 ..., 37  0 47]
 [31 32  0 ..., 15 20 21]
 [48 21 32 ..., 32 15 20]
 ..., 
 [32 47 22 ...,  0 15 15]
 [ 9 21 32 ..., 15 32  0]
 [47 15 22 ..., 32 20  0]]  shape:  (10, 250)
split_size:  4
reduced_x_y for training:  200
train_x: 
 [[36 37 21 ..., 47 47 22]
 [47 31 32 ..., 47 40  7]
 [21 48 21 ..., 15 20 21]
 ..., 
 [42 32 47 ...,  5 18 15]
 [15  9 21 ..., 11 22 48]
 [ 0 47 15 ..., 28 21  9]]  shape:  (10, 200)
train_y: 
 [[37 21 48 ..., 47 22 18]
 [31 32  0 ..., 40  7  0]
 [48 21 32 ..., 20 21 32]
 ..., 
 [32 47 22 ..., 18 15 32]
 [ 9 21 32 ..., 22 48 21]
 [47 15 22 ..., 21  9 15]]  shape:  (10, 200)
val_x: 
 

In [8]:
print("train x: ",train_x.shape)
print("train y: ",train_y.shape)
print("valid x: ",val_x.shape)
print("valid y: ",val_y.shape)

train x:  (10, 200)
train y:  (10, 200)
valid x:  (10, 50)
valid y:  (10, 50)


In [9]:
np.save("train_x.npy",train_x)
np.save("train_y.npy",train_y)

In [11]:
def get_train_batches(train_x, train_y, batch_size):
    for i in range(0, train_x.shape[0], batch_size):
        print("getting: ",i," to ", i+batch_size)
        yield train_x[i : i+batch_size], train_y[i : i+batch_size]

In [12]:
for x, y in get_train_batches(train_x, train_y, 2):
    print("x: ",x)
    print("y: ",y)

getting:  0  to  2
x:  [[36 37 21 48 35 15 20  0 47 31 32 23 45 40 32  0 47 32  7 22 47 28 18 40
   0 22 47 32  0 47 32 15 20 21 32 24  5  9 22 47 40 25 35 40 34 32 20 22
  18 40 21 27 32 39 20 21 32 23  0 28 21 32 20 45 38 46 38  0 40  7 22 37
  21 48 21 38 32 15 20 45 15 32 15 20 21 32 20 18 40  5 45 47 38 32 23 45
  40 32  7 45 48 48 35  0 47 31 32 22 47 32 45 47 32  0 47 15 48  0 31 18
  21 32 23  0 15 20 32 45 32 41 48 21 47  7 20 46 31  0 48  9 42 32 23 20
  22 32 20 45 38 32  5 21 21 47 32 45 32 31 22 37 21 48 47 21 40 40 32  0
  47 32 15 20 21  0 48 32 28 45 11  0  9 35 42 32 45 47 38 32 40 20 21 32
  20 45 38 32 45 47 47 22]
 [47 31 32  0 47 32 15 20 21 32 40 45 11 21 32 20 22 18 40 21 32 23  0 15
  20 32 20  0 11 27 46 39 20  0 40 32 12 22 40  0 15  0 22 47 32 22 28 32
  45 28 28 45  0 48 40 32 20 45 38 32 47 22 23 32  9 45 40 15 21 38 32 15
  20 48 21 21 32 38 45 35 40 42 32 45 47 38 32 47 22 15 32 22 47  9 35 32
  15 20 21 46 20 18 40  5 45 47 38 32 45 47 38 32 23  0 28 21 