# CharLSTM

- CharLSTM is a simple LSTM network that generates output documents based on the probability of character-level sequences in the training data.
- The CharLSTM dictionary is a character-level dictionary.
- The CharLSTM language model is created by tokenizing the input file into characters.

# Objective
- Create a dataset that downloads and prepares the data
- Create a char-level dataset using gluonnlp pre-processing capabilities
- Create an LSTM network using gluon.rnn
- Train the model

# Imports

In [37]:
import gluonnlp as nlp
import numpy as np
from mxnet import gluon, autograd
from mxnet import nd
import mxnet as mx
from gluonnlp.data.utils import slice_sequence
from mxnet.gluon.data import SimpleDataset
from gluonnlp.data.utils import _get_home_dir
from gluonnlp.data.registry import register
import os
import time
import math

# Download and Prepare Data
In this section we:
- Derive a class from SimpleDataset
- Download the source of the Time Machine book from an Internet location
- Strip the text down to the core text by removing all legal information that does not support training a language model based on the book.
- Split the data into training and test sets and save them locally

```python
@register(segment=['train', 'test'])
class TimeMachineDataLoader(SimpleDataset):
    """Character-level dataset over the text of the Time Machine book.

    Constructing an instance prepares the data files under ``root`` through
    ``_get_data`` and then loads the requested segment through ``_read_data``.

    Parameters
    ----------
    segment : str, default 'train'
        Split to load; must be a key of ``_data_file`` ('train' or 'test').
    root : str
        Directory holding the data files; it is created if it does not exist.
    """
    def __init__(self, segment='train',
                 root=os.path.join(_get_home_dir(), 'data', 'word_generator')):
        # Maps each segment to (file name, hash); the hash slot is left empty.
        self._data_file = {'train': ('train.txt', ''),
                          'test': ('test.txt', '')}
        root = os.path.expanduser(root)
        if not os.path.isdir(root):
            os.makedirs(root)
        self._root = root
        self._segment = segment
        # The files must exist before their path is resolved and read.
        self._get_data()
        self._file_path = self._get_file_path()

        super(TimeMachineDataLoader, self).__init__(self._read_data())

    # Must live at class level so that instances actually expose `file_path`.
    @property
    def file_path(self):
        """Path of the text file backing the loaded segment."""
        return self._file_path
```

When keyword arguments are passed to `register`, they are checked to be valid keyword arguments for the registered
Dataset class constructor and are saved in the registry.


```python
    def _get_data(self):
        """Download the raw book if it is missing and write both split files.

        The raw text is trimmed to the core book text, then the first two
        thirds go to ``train.txt`` and the remaining third to ``test.txt``,
        both inside ``self._root``.
        """
        root = self._root
        raw_path = os.path.join(root, '35.txt')
        # Key the download on the raw file itself: it is what gets read below,
        # regardless of whether the segment files already exist.
        if not os.path.exists(raw_path):
            # `download` is provided by mxnet.gluon.utils.
            gluon.utils.download('http://archive.org/stream/thetimemachine00035gut/35.txt', path=root)
        with open(raw_path) as f:
            raw_data = f.read()
        # Fixed character offsets that cut away everything around the book text.
        raw_data = raw_data[44332: -24182]
        raw_data_val = raw_data[-len(raw_data)//3:]
        raw_data = raw_data[:2*len(raw_data)//3]
        with open(os.path.join(root, 'train.txt'), 'w+') as output_file:
            output_file.write(raw_data)

        with open(os.path.join(root, 'test.txt'), 'w+') as output_file:
            output_file.write(raw_data_val)
                

    def _read_data(self):
        """Return the text of the current segment as a list of single characters."""
        segment_path = os.path.join(self._root, self._segment+'.txt')
        with open(segment_path) as handle:
            return list(handle.read())
    
    def _get_file_path(self):
        """Return the path of the current segment's file.

        Raises FileNotFoundError when that file does not exist.
        """
        file_name = self._data_file[self._segment][0]
        candidate = os.path.join(self._root, file_name)
        if os.path.exists(candidate):
            return candidate
        raise FileNotFoundError
```

In [18]:
@register(segment=['train', 'test'])
class TimeMachineDataLoader(SimpleDataset):
    """Character-level dataset over the text of the Time Machine book.

    On construction the raw book is downloaded into ``root`` (unless it is
    already there), trimmed to the core text and split into ``train.txt``
    (first two thirds) and ``test.txt`` (last third). The requested segment
    is then loaded as a list of single characters.

    Parameters
    ----------
    segment : str, default 'train'
        Split to load; must be a key of ``_data_file`` ('train' or 'test').
    root : str
        Directory holding the downloaded book and the split files; it is
        created if it does not exist.
    """

    # Location of the raw text and the file name it is stored under in `root`.
    _SOURCE_URL = 'http://archive.org/stream/thetimemachine00035gut/35.txt'
    _RAW_FILE_NAME = '35.txt'
    # Fixed character offsets that cut away everything around the book text.
    _CONTENT_START = 44332
    _CONTENT_END = -24182

    def __init__(self, segment='train',
                 root=os.path.join(_get_home_dir(), 'data', 'word_generator')):
        # Maps each segment to (file name, hash); the hash slot is left empty.
        self._data_file = {'train': ('train.txt', ''),
                          'test': ('test.txt', '')}
        root = os.path.expanduser(root)
        os.makedirs(root, exist_ok=True)
        self._root = root
        self._segment = segment
        # The files must exist before their path is resolved and read.
        self._get_data()
        self._file_path = self._get_file_path()

        super(TimeMachineDataLoader, self).__init__(self._read_data())

    @property
    def file_path(self):
        """Path of the text file backing the loaded segment."""
        return self._file_path

    def _get_data(self):
        """Download the raw book if it is missing and write both split files."""
        root = self._root
        raw_path = os.path.join(root, self._RAW_FILE_NAME)
        # Key the download on the raw file itself: it is what gets read below,
        # regardless of whether the segment files already exist.
        if not os.path.exists(raw_path):
            # `download` is provided by mxnet.gluon.utils; given a directory it
            # stores the file under the last component of the URL.
            gluon.utils.download(self._SOURCE_URL, path=root)
        with open(raw_path) as f:
            raw_data = f.read()[self._CONTENT_START:self._CONTENT_END]

        # First two thirds for training, the remainder for testing. The single
        # split point yields the same two pieces as slicing from both ends.
        split_at = 2 * len(raw_data) // 3
        with open(os.path.join(root, 'train.txt'), 'w') as output_file:
            output_file.write(raw_data[:split_at])
        with open(os.path.join(root, 'test.txt'), 'w') as output_file:
            output_file.write(raw_data[split_at:])

    def _read_data(self):
        """Return the text of the current segment as a list of single characters."""
        with open(os.path.join(self._root, self._segment+'.txt')) as f:
            return list(f.read())

    def _get_file_path(self):
        """Return the path of the current segment's file.

        Raises
        ------
        FileNotFoundError
            If the segment file does not exist under ``root``.
        """
        data_file_name, _ = self._data_file[self._segment]
        path = os.path.join(self._root, data_file_name)
        if not os.path.exists(path):
            raise FileNotFoundError('Segment file not found: {}'.format(path))
        return path
        


  return register_(class_)


# DataSet and Batch Generator
In this section we create a dataset for iterating over the data:
- The dataset is derived from SimpleDataset and provides us with dataset capabilities
- The class is capable of batchifying our data and returning a vectorized version of the book in batches of the requested size
- The class returns a Back Propagation Through Time (BPTT) batchifier that is, in essence, our data iterator: it returns NDArrays as an enumerable object that includes the data and the target for training.

In [19]:
class CharLevelDataSet(SimpleDataset):
    """Wraps a sequence of tokens and turns it into language-model batches.

    Parameters
    ----------
    dataset
        Sliceable collection of tokens (``dataset[:]`` is what gets batched).
    tokenizer
        Stored on the instance; none of the methods of this class use it.
    """

    # NOTE(review): the default tokenizer is constructed once, when this class
    # is defined, even though it is never used here. Consider a lazy default.
    def __init__(self, dataset, tokenizer=nlp.data.SpacyTokenizer('en')):
        # Initialise the SimpleDataset base as well, so the functionality this
        # class inherits from it has its state set up.
        super(CharLevelDataSet, self).__init__(dataset)
        self._tokenizer = tokenizer
        self._dataset = dataset

    @property
    def dataset(self):
        """The wrapped dataset, exactly as passed to the constructor."""
        return self._dataset

    def batchify(self, vocab, batch_size):
        """Map the tokens to indices and lay them out in ``batch_size`` columns.

        Trailing tokens that do not fill a complete row are dropped. The
        result has ``len(data) // batch_size`` rows and ``batch_size`` columns.
        """
        data = self._dataset[:]
        sample_len = len(data) // batch_size
        return mx.nd.array(vocab[data[:sample_len * batch_size]]).reshape((batch_size, -1)).T

    def bptt_batchify(self, bptt, vocab, batch_size):
        """Return a dataset of ``(data, target)`` pairs for truncated BPTT.

        The batchified array is cut into windows of ``bptt + 1`` rows that
        overlap by one row; within each window the target is the data shifted
        forward by one row.
        """
        data = self.batchify(vocab, batch_size)
        batches = slice_sequence(data, bptt+1, overlap=1)
        return SimpleDataset(batches).transform(lambda x: (x[:min(len(x)-1, bptt), :], x[1:, :]))
        

    
            

## Alternative method
The above code was written for teaching purposes. Normally you would inherit a single dataset class from `gluonnlp.data.LanguageModelDataset`, which provides `batchify` and `bptt_batchify` through inheritance.
For more information please check:
1. `_WikiText`, and `WikiText2` classes in [github](https://github.com/dmlc/gluon-nlp/blob/master/gluonnlp/data/language_model.py)
2. `Sentiment` class in [github](https://github.com/dmlc/gluon-nlp/blob/master/gluonnlp/data/sentiment.py) or [gluonnlp API Docs](https://gluon-nlp.mxnet.io/api/data.html#transforms) 

## Data Transformation
In order to create a character-level LSTM, you would need to pass `tokenizer=lambda s: s`. For a word-level tokenizer, leave the tokenizer at its default. You can also use one of the supported tokenizers, which can be imported from `gluonnlp.data`:
- NLTKMosesTokenizer
- SpacyTokenizer 
- JiebaTokenizer
- NLTKStanfordSegmenter
for more information and source code please check [github](https://github.com/dmlc/gluon-nlp/blob/master/gluonnlp/data/transforms.py)

## More on ```gluonnlp.data.utils.slice_sequence()```

In [20]:
# Slice the same nine-element list into windows of three with two different overlaps.
a = list(range(9))
overlap_one = slice_sequence(a, length=3, overlap=1)
overlap_two = slice_sequence(a, length=3, overlap=2)
print("length = 3, overlap = 1: {}".format(overlap_one))
print("length = 3, overlap = 2: {}".format(overlap_two))

length = 3, overlap = 1: [[0, 1, 2], [2, 3, 4], [4, 5, 6], [6, 7, 8]]
length = 3, overlap = 2: [[0, 1, 2], [1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6], [5, 6, 7], [6, 7, 8]]


In [21]:
# The same slicing applied to a 5x5 NDArray.
a = nd.random_normal(shape=(5,5))
slices = slice_sequence(a, length=3, overlap=1)
print("a.shape= {}".format(a.shape))
print("length = 3, overlap = 1: {}".format(slices))

a.shape= (5, 5)
length = 3, overlap = 1: [
[[ 0.32510808 -1.30023408  0.3679345   1.45342624  0.24154152]
 [ 0.47898006  0.96885103 -1.02182448 -0.06812762 -0.31868345]
 [-0.17634277  0.35655284  0.74419165  0.77874237  0.60878229]]
<NDArray 3x5 @cpu(0)>, 
[[-0.17634277  0.35655284  0.74419165  0.77874237  0.60878229]
 [ 1.0741756   0.06642842  0.84869862 -0.80038017 -0.16882208]
 [ 0.93632793  0.35744399  0.77932847 -1.01030731 -0.39157307]]
<NDArray 3x5 @cpu(0)>]


# Hyperparameters and EnvironmentVariables

In [22]:
# Model sizes
num_embd = 256
num_hidden = 512
num_layers = 3

# Optimizer settings
opt = 'sgd'
lr = .001
momentum = .9
wd = 0

# Devices: use up to 16 GPUs, falling back to the CPU when none is available.
# An empty context list would break both `ctx[0]` and
# `batch_size // len(context)` further down.
num_gpus = min(16, mx.context.num_gpus())
ctx = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()]

# Training loop settings
batch_size = 64
grad_clip = 0.25     # threshold passed to clip_global_norm in the training loop
log_interval = 200   # number of batches between progress messages

# Used to build the file name of saved parameters
model_name="CharLSTM"
dataset_name="TimeMachine"

`mx.context.num_gpus()` was only recently added to `mx.context` and does not exist in MXNet 1.2.


# Data Pipeline
1. Download and split the data file.
2. Instantiate dataset objects from ```CharLevelDataSet``` in order to be able to create batches.
3. Create the vocabulary by instantiating a gluonnlp.Vocab object

In [23]:
#Downloading and splitting data files
# Download the book if necessary and load both character-level segments
data_root = '../data/text_generator'
train_dataset = TimeMachineDataLoader(segment='train', root=data_root)
test_dataset = TimeMachineDataLoader(segment='test', root=data_root)

# Wrap the segments in CharLevelDataSet, which supports batchifying
train_data = CharLevelDataSet(train_dataset)
test_data = CharLevelDataSet(test_dataset)

# Build the vocabulary from the characters of both segments,
# without padding, end-of-sequence or beginning-of-sequence tokens
all_chars = train_dataset[:] + test_dataset[:]
vocab = nlp.vocab.Vocab(nlp.data.Counter(all_chars),
                        padding_token=None,
                        eos_token=None,
                        bos_token=None)

# Replace each wrapper with its iterable BPTT batches of (data, target) pairs
bptt_len = 129
train_data = train_data.bptt_batchify(bptt=bptt_len, vocab=vocab, batch_size=batch_size)
test_data = test_data.bptt_batchify(bptt=bptt_len, vocab=vocab, batch_size=batch_size)

In [24]:
# Show the first (data, target) pair of each split.
first_train_batch = train_data[0]
first_test_batch = test_data[0]
print("traindata: {}\n\ntestdata: {}".format(first_train_batch, first_test_batch))

traindata: (
[[ 11.   5.   2. ...,   1.   1.   2.]
 [ 17.  19.  22. ...,  25.  11.  11.]
 [ 42.   1.  28. ...,   1.   2.  22.]
 ..., 
 [ 23.   4.   5. ...,   1.  10.   8.]
 [  1.  12.  12. ...,  24.   2.   1.]
 [ 49.   6.  20. ...,   2.   2.  26.]]
<NDArray 129x64 @cpu(0)>, 
[[ 17.  19.  22. ...,  25.  11.  11.]
 [ 42.   1.  28. ...,   1.   2.  22.]
 [  7.   4.   1. ...,  15.   2.   1.]
 ..., 
 [  1.  12.  12. ...,  24.   2.   1.]
 [ 49.   6.  20. ...,   2.   2.  26.]
 [  6.   5.   1. ...,  19.   8.   2.]]
<NDArray 129x64 @cpu(0)>)

testdata: (
[[ 13.   5.   9. ...,   5.   1.   2.]
 [ 20.   3.   2. ...,   1.  11.   1.]
 [  1.   1.   1. ...,  19.   7.   4.]
 ..., 
 [ 12.  14.   2. ...,  12.   5.   3.]
 [  4.  19.  17. ...,   7.  11.   3.]
 [  3.   9.  16. ...,   5.   1.   2.]]
<NDArray 129x64 @cpu(0)>, 
[[ 20.   3.   2. ...,   1.  11.   1.]
 [  1.   1.   1. ...,  19.   7.   4.]
 [ 13.   7.   7. ...,  12.   8.   5.]
 ..., 
 [  4.  19.  17. ...,   7.  11.   3.]
 [  3.   9.  16. ...,   5. 

# Model

In [25]:
class LSTMModel(gluon.Block):
    """Embedding, multi-layer LSTM and dense decoder over the vocabulary.

    Dropout is applied to the embeddings and to the LSTM output; the same
    dropout rate is also passed to the LSTM layer itself.
    """

    def __init__(self, vocab_size, num_embd, num_hidden, num_layers, dropout=.5, **kwargs):
        super().__init__(**kwargs)
        with self.name_scope():
            self.drop = gluon.nn.Dropout(dropout)
            self.encoder = gluon.nn.Embedding(
                vocab_size, num_embd, weight_initializer=mx.init.Uniform(.1))
            self.lstm = gluon.rnn.LSTM(
                hidden_size=num_hidden,
                num_layers=num_layers,
                dropout=dropout,
                input_size=num_embd)
            self.decoder = gluon.nn.Dense(units=vocab_size, in_units=num_hidden)
        # Kept so that forward() can flatten the LSTM output for the decoder.
        self.num_hidden = num_hidden

    def forward(self, inputs, hidden):
        """Run one batch through the network.

        Returns the decoder output, computed on the LSTM output reshaped to
        ``(-1, num_hidden)``, together with the new hidden state.
        """
        embedded = self.drop(self.encoder(inputs))
        lstm_out, hidden = self.lstm(embedded, hidden)
        flattened = self.drop(lstm_out).reshape((-1, self.num_hidden))
        return self.decoder(flattened), hidden

    def begin_state(self, *args, **kwargs):
        """Forward all arguments to the LSTM layer's ``begin_state``."""
        return self.lstm.begin_state(*args, **kwargs)

        


## Creating the Model

In [26]:
# Build the network from the hyperparameters defined earlier rather than
# repeating their values, so the two cannot drift apart.
model = LSTMModel(vocab_size=len(vocab.idx_to_token), num_embd=num_embd,
                  num_hidden=num_hidden, num_layers=num_layers)
model.collect_params()

lstmmodel0_ (
  Parameter lstmmodel0_embedding0_weight (shape=(68, 256), dtype=float32)
  Parameter lstmmodel0_lstm0_l0_i2h_weight (shape=(2048, 256), dtype=<class 'numpy.float32'>)
  Parameter lstmmodel0_lstm0_l0_h2h_weight (shape=(2048, 512), dtype=<class 'numpy.float32'>)
  Parameter lstmmodel0_lstm0_l0_i2h_bias (shape=(2048,), dtype=<class 'numpy.float32'>)
  Parameter lstmmodel0_lstm0_l0_h2h_bias (shape=(2048,), dtype=<class 'numpy.float32'>)
  Parameter lstmmodel0_lstm0_l1_i2h_weight (shape=(2048, 512), dtype=<class 'numpy.float32'>)
  Parameter lstmmodel0_lstm0_l1_h2h_weight (shape=(2048, 512), dtype=<class 'numpy.float32'>)
  Parameter lstmmodel0_lstm0_l1_i2h_bias (shape=(2048,), dtype=<class 'numpy.float32'>)
  Parameter lstmmodel0_lstm0_l1_h2h_bias (shape=(2048,), dtype=<class 'numpy.float32'>)
  Parameter lstmmodel0_lstm0_l2_i2h_weight (shape=(2048, 512), dtype=<class 'numpy.float32'>)
  Parameter lstmmodel0_lstm0_l2_h2h_weight (shape=(2048, 512), dtype=<class 'numpy.float32

## RNN Support in ```gluon.rnn```

### Recurrent Layers
Recurrent layers can be used in the ```Sequential``` block together with other nn layer types.

|Supported Feature|Description|Notes|
|:-|:-|:-|
|RNN|Applies a multi-layer Elman RNN with tanh or ReLU non-linearity to an input sequence. In order to make a layer **bi-directional** you would need to change the default behaviour by passing `bidirectional=True` to the constructor.|[docs](https://mxnet.incubator.apache.org/api/python/gluon/rnn.html#mxnet.gluon.rnn.RNN)|
|LSTM|Applies a multi-layer long short-term memory (LSTM) RNN to an input sequence. In order to make a layer **bi-directional** you would need to change the default behaviour by passing `bidirectional=True` to the constructor.|[docs](https://mxnet.incubator.apache.org/api/python/gluon/rnn.html#mxnet.gluon.rnn.LSTM)|
|GRU|Applies a multi-layer gated recurrent unit (GRU) RNN to an input sequence.|[docs](https://mxnet.incubator.apache.org/api/python/gluon/rnn.html#mxnet.gluon.rnn.GRU)|

### Recurrent Cells
In order to gain fine-grained control over implementation of your network you can use cells.

|Supported Feature|Description|Notes|
|:-|:-|:-|
|RNNCell|Elman RNN recurrent neural network cell.|[docs](https://mxnet.incubator.apache.org/api/python/gluon/rnn.html#mxnet.gluon.rnn.RNNCell)|
|LSTM Cell|Long-Short Term Memory (LSTM) network cell.|[docs](https://mxnet.incubator.apache.org/api/python/gluon/rnn.html#mxnet.gluon.rnn.LSTMCell)|
|GRU Cell|Gated Recurrent Unit (GRU) network cell.|[docs](https://mxnet.incubator.apache.org/api/python/gluon/rnn.html#mxnet.gluon.rnn.GRUCell)|
|SequentialRNNCell|Sequentially stacking multiple RNN cells.|[docs](https://mxnet.incubator.apache.org/api/python/gluon/rnn.html#mxnet.gluon.rnn.SequentialRNNCell)|
|DropoutCell|Applies dropout on a cell|[docs](https://mxnet.incubator.apache.org/api/python/gluon/rnn.html#mxnet.gluon.rnn.DropoutCell)|
|ResidualCell|Adds residual connections|[docs](https://mxnet.incubator.apache.org/api/python/gluon/rnn.html#mxnet.gluon.rnn.ResidualCell)|

## Model Initialization

In [27]:
# Initialise every parameter with Xavier on all configured devices,
# re-initialising parameters that already hold values.
params = model.collect_params()
params.initialize(mx.init.Xavier(), ctx=ctx, force_reinit=True)

## Model Optimizer

In [28]:
# Take the optimizer name and weight decay from the hyperparameter cell
# (`opt`, `wd`) instead of repeating their values here.
trainer = gluon.Trainer(model.collect_params(), opt,
                        {'learning_rate': lr, 'momentum': momentum, 'wd': wd})

## Objective Function

In [29]:
# Softmax cross-entropy between the decoder output and the target indices;
# evaluate() and train() read this module-level `loss`.
loss = gluon.loss.SoftmaxCrossEntropyLoss()

# Training

## Detaching Gradients
We need to detach gradient for truncated BPTT

In [30]:
def detach(hidden):
    """Detach a hidden state, or every state in a nested tuple/list of states.

    A tuple or list is processed recursively and always comes back as a list;
    anything else is returned as the result of its own ``detach()`` call.
    """
    if not isinstance(hidden, (tuple, list)):
        return hidden.detach()
    return [detach(state) for state in hidden]

## Evaluation

In [31]:
def evaluate(model, data_source, batch_size, ctx):
    """Return the average loss of `model` over all batches of `data_source`.

    The hidden state starts from zeros on `ctx`, is carried from batch to
    batch and is detached after every batch. The per-element losses of all
    batches are summed and divided by the total number of loss elements.
    """
    loss_sum = 0.0
    element_count = 0
    hidden = model.begin_state(batch_size=batch_size, func=mx.nd.zeros, ctx=ctx)
    for data, target in data_source:
        data = data.as_in_context(ctx)
        # Flatten the targets to one dimension, as passed to the loss below.
        flat_target = target.reshape((-1, )).as_in_context(ctx)
        output, hidden = model(data, hidden)
        hidden = detach(hidden)
        batch_loss = loss(output, flat_target)
        loss_sum += mx.nd.sum(batch_loss).asscalar()
        element_count += batch_loss.size
    return loss_sum / element_count

In [32]:
# Loss of the freshly initialised model on the test batches, computed on the first device.
evaluate(model=model, data_source=test_data, batch_size=batch_size,ctx=ctx[0])

4.2192995191808551

## Training Loop

```python
def train(model, train_data, val_data, test_data, epochs, lr, context):
...
    for epoch in range(epochs):
    ...
        hiddens = [model.begin_state(batch_size//len(context), func=mx.nd.zeros, ctx=ctx) 
                   for ctx in context]
        for i, (data, target) in enumerate(train_data):
            data_list = gluon.utils.split_and_load(data, context, batch_axis=1, even_split=True)
            target_list = gluon.utils.split_and_load(target, context, batch_axis=1, even_split=True)
            hiddens = detach(hiddens)
            L = 0
            Ls = []
            with autograd.record():
                for j, (X, y, h) in enumerate(zip(data_list, target_list, hiddens)):
                    output, h = model(X, h)
                    batch_L = loss(output, y.reshape(-1,))
                    L = L + batch_L.as_in_context(context[0]) / X.size
                    Ls.append(batch_L / X.size)
                    hiddens[j] = h
            L.backward()
            grads = [p.grad(x.context) for p in parameters for x in data_list]
            gluon.utils.clip_global_norm(grads, grad_clip)
            trainer.step(1)
            total_L += sum([mx.nd.sum(l).asscalar() for l in Ls])
...    

```

In [33]:
def train(model, train_data, val_data, test_data, epochs, lr, context):
    """Train `model` with truncated BPTT, checkpointing on validation improvement.

    Each batch is split along axis 1 across the devices in `context`. The
    per-device losses are accumulated on `context[0]`, gradients are clipped
    with `grad_clip` and one trainer step is taken. After every epoch the
    model is evaluated on `val_data`: when the validation loss improves, the
    model is evaluated on `test_data` and its parameters are saved; otherwise
    the learning rate is multiplied by 0.25.

    Reads the module-level `batch_size`, `loss`, `trainer`, `grad_clip`,
    `log_interval`, `model_name` and `dataset_name`.

    Parameters
    ----------
    model : the network to train; must provide `begin_state`.
    train_data, val_data, test_data : datasets of (data, target) batches.
    epochs : number of passes over `train_data`.
    lr : starting learning rate, used as the base for the decay.
    context : list of devices to train on; must not be empty.
    """
    best_val = float("Inf")
    start_train_time = time.time()
    parameters = model.collect_params().values()
    for epoch in range(epochs):
        total_L = 0.0
        start_epoch_time = time.time()
        start_log_interval_time = time.time()
        # One hidden state per device, each sized for that device's share of the batch.
        hiddens = [model.begin_state(batch_size//len(context), func=mx.nd.zeros, ctx=ctx)
                   for ctx in context]
        for i, (data, target) in enumerate(train_data):
            data_list = gluon.utils.split_and_load(data, context,
                                                   batch_axis=1, even_split=True)
            target_list = gluon.utils.split_and_load(target, context,
                                                     batch_axis=1, even_split=True)
            # Truncated BPTT: do not backpropagate into the previous batch.
            hiddens = detach(hiddens)
            L = 0
            Ls = []
            with autograd.record():
                for j, (X, y, h) in enumerate(zip(data_list, target_list, hiddens)):
                    output, h = model(X, h)
                    batch_L = loss(output, y.reshape(-1,))
                    L = L + batch_L.as_in_context(context[0]) / X.size
                    Ls.append(batch_L / X.size)
                    hiddens[j] = h
            L.backward()
            grads = [p.grad(x.context) for p in parameters for x in data_list]
            gluon.utils.clip_global_norm(grads, grad_clip)

            trainer.step(1)

            # NOTE(review): this adds up the per-device mean losses without
            # dividing by the number of devices - confirm that is intended
            # when training on more than one device.
            total_L += sum([mx.nd.sum(l).asscalar() for l in Ls])

            if i % log_interval == 0 and i > 0:
                cur_L = total_L / log_interval
                print('[Epoch %d Batch %d/%d] loss %.2f, ppl %.2f, '
                      'throughput %.2f samples/s'%(
                    epoch, i, len(train_data), cur_L, math.exp(cur_L),
                    batch_size * log_interval / (time.time() - start_log_interval_time)))
                total_L = 0.0
                start_log_interval_time = time.time()

        # Wait for all pending computation before timing the epoch.
        mx.nd.waitall()

        print('[Epoch %d] throughput %.2f samples/s'%(
                    epoch, len(train_data)*batch_size / (time.time() - start_epoch_time)))
        val_L = evaluate(model, val_data, batch_size, context[0])
        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f'%(
            epoch, time.time()-start_epoch_time, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            test_L = evaluate(model, test_data, batch_size, context[0])
            checkpoint_path = '../model/{}_{}-{}.params'.format(model_name, dataset_name, epoch)
            # Make sure the target directory exists so that a missing
            # '../model' folder does not abort training at the first checkpoint.
            os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
            model.save_parameters(checkpoint_path)
            print('test loss %.2f, test ppl %.2f'%(test_L, math.exp(test_L)))
        else:
            # No improvement on the validation set: shrink the learning rate.
            lr = lr*0.25
            print('Learning rate now %f'%(lr))
            trainer.set_learning_rate(lr)

    print('Total training throughput %.2f samples/s'%(
                            (batch_size * len(train_data) * epochs) /
                            (time.time() - start_train_time)))
    

In [39]:
# Train for 3 epochs; the test batches are passed as the validation set as well.
train(model=model, train_data=train_data, val_data=test_data, test_data=test_data, epochs=3, lr=lr, context=ctx)

[Epoch 0] throughput 865.52 samples/s
[Epoch 0] time cost 1.21s, valid loss 4.17, valid ppl 64.41
test loss 4.17, test ppl 64.41
[Epoch 1] throughput 859.77 samples/s
[Epoch 1] time cost 1.21s, valid loss 4.14, valid ppl 62.96
test loss 4.14, test ppl 62.96
[Epoch 2] throughput 857.14 samples/s
[Epoch 2] time cost 1.22s, valid loss 4.12, valid ppl 61.55
test loss 4.12, test ppl 61.55
Total training throughput 553.28 samples/s
