diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index beb797b289e2..355233385cbf 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -36,6 +36,9 @@ export(mx.io.MNISTIter) export(mx.io.arrayiter) export(mx.io.extract) export(mx.kv.create) +export(mx.lstm) +export(mx.lstm.forward) +export(mx.lstm.inference) export(mx.metric.accuracy) export(mx.metric.custom) export(mx.metric.logger) diff --git a/R-package/R/io.R b/R-package/R/io.R index cde2b4c032f1..10298fbaf056 100644 --- a/R-package/R/io.R +++ b/R-package/R/io.R @@ -41,7 +41,14 @@ mx.io.arrayiter <- function(data, label, batch.size=128, shuffle=FALSE) { if (shuffle) { - unif.rnds <- as.array(mx.runif(c(length(label)), ctx=mx.cpu())); + shape <- dim(data) + if (is.null(shape)) { + num.data <- length(data) + } else { + ndim <- length(shape) + num.data <- shape[[ndim]] + } + unif.rnds <- as.array(mx.runif(c(num.data), ctx=mx.cpu())); } else { unif.rnds <- as.array(0) } diff --git a/R-package/R/lstm.R b/R-package/R/lstm.R new file mode 100644 index 000000000000..3fcd0e831751 --- /dev/null +++ b/R-package/R/lstm.R @@ -0,0 +1,582 @@ +# lstm cell symbol +lstm <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dropout=0) { + if (dropout > 0) + indata <- mx.symbol.Dropout(data=indata, p=dropout) + i2h <- mx.symbol.FullyConnected(data=indata, + weight=param$i2h.weight, + bias=param$i2h.bias, + num.hidden=num.hidden * 4, + name=paste0("t", seqidx, ".l", layeridx, ".i2h")) + h2h <- mx.symbol.FullyConnected(data=prev.state$h, + weight=param$h2h.weight, + bias=param$h2h.bias, + num.hidden=num.hidden * 4, + name=paste0("t", seqidx, ".l", layeridx, ".h2h")) + gates <- i2h + h2h + slice.gates <- mx.symbol.SliceChannel(gates, num.outputs=4, + name=paste0("t", seqidx, ".l", layeridx, ".slice")) + + in.gate <- mx.symbol.Activation(slice.gates[[1]], act.type="sigmoid") + in.transform <- mx.symbol.Activation(slice.gates[[2]], act.type="tanh") + forget.gate <- mx.symbol.Activation(slice.gates[[3]], 
act.type="sigmoid") + out.gate <- mx.symbol.Activation(slice.gates[[4]], act.type="sigmoid") + next.c <- (forget.gate * prev.state$c) + (in.gate * in.transform) + next.h <- out.gate * mx.symbol.Activation(next.c, act.type="tanh") + + return (list(c=next.c, h=next.h)) +} + +# unrolled lstm network +lstm.unroll <- function(num.lstm.layer, seq.len, input.size, + num.hidden, num.embed, num.label, dropout=0.) { + + embed.weight <- mx.symbol.Variable("embed.weight") + cls.weight <- mx.symbol.Variable("cls.weight") + cls.bias <- mx.symbol.Variable("cls.bias") + + param.cells <- lapply(1:num.lstm.layer, function(i) { + cell <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")), + i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")), + h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")), + h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias"))) + return (cell) + }) + last.states <- lapply(1:num.lstm.layer, function(i) { + state <- list(c=mx.symbol.Variable(paste0("l", i, ".init.c")), + h=mx.symbol.Variable(paste0("l", i, ".init.h"))) + return (state) + }) + + # embeding layer + label <- mx.symbol.Variable("label") + data <- mx.symbol.Variable("data") + embed <- mx.symbol.Embedding(data=data, input_dim=input.size, + weight=embed.weight, output_dim=num.embed, name="embed") + wordvec <- mx.symbol.SliceChannel(data=embed, num_outputs=seq.len, squeeze_axis=1) + + last.hidden <- list() + for (seqidx in 1:seq.len) { + + hidden = wordvec[[seqidx]] + + # stack lstm + for (i in 1:num.lstm.layer) { + dp <- ifelse(i==1, 0, dropout) + next.state <- lstm(num.hidden, indata=hidden, + prev.state=last.states[[i]], + param=param.cells[[i]], + seqidx=seqidx, layeridx=i, + dropout=dp) + hidden <- next.state$h + last.states[[i]] <- next.state + } + # decoder + if (dropout > 0) + hidden <- mx.symbol.Dropout(data=hidden, p=dropout) + last.hidden <- c(last.hidden, hidden) + } + last.hidden$dim <- 0 + last.hidden$num.args <- seq.len + concat 
<-mxnet:::mx.varg.symbol.Concat(last.hidden) + fc <- mx.symbol.FullyConnected(data=concat, + weight=cls.weight, + bias=cls.bias, + num.hidden=num.label) + + label <- mx.symbol.transpose(data=label) + label <- mx.symbol.Reshape(data=label, target.shape=c(0)) + + loss.all <- mx.symbol.SoftmaxOutput(data=fc, label=label, name="sm") + return (loss.all) +} + +lstm.inference.symbol <- function(num.lstm.layer, input.size, + num.hidden, num.embed, num.label, dropout=0.) { + seqidx <- 0 + embed.weight <- mx.symbol.Variable("embed.weight") + cls.weight <- mx.symbol.Variable("cls.weight") + cls.bias <- mx.symbol.Variable("cls.bias") + + param.cells <- lapply(1:num.lstm.layer, function(i) { + cell <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")), + i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")), + h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")), + h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias"))) + return (cell) + }) + last.states <- lapply(1:num.lstm.layer, function(i) { + state <- list(c=mx.symbol.Variable(paste0("l", i, ".init.c")), + h=mx.symbol.Variable(paste0("l", i, ".init.h"))) + return (state) + }) + + # embeding layer + data <- mx.symbol.Variable("data") + hidden <- mx.symbol.Embedding(data=data, input_dim=input.size, + weight=embed.weight, output_dim=num.embed, name="embed") + + # stack lstm + for (i in 1:num.lstm.layer) { + dp <- ifelse(i==1, 0, dropout) + next.state <- lstm(num.hidden, indata=hidden, + prev.state=last.states[[i]], + param=param.cells[[i]], + seqidx=seqidx, layeridx=i, + dropout=dp) + hidden <- next.state$h + last.states[[i]] <- next.state + } + # decoder + if (dropout > 0) + hidden <- mx.symbol.Dropout(data=hidden, p=dropout) + + fc <- mx.symbol.FullyConnected(data=hidden, num_hidden=num.label, + weight=cls.weight, bias=cls.bias, name='pred') + sm <- mx.symbol.SoftmaxOutput(data=fc, name='sm') + unpack.c <- lapply(1:num.lstm.layer, function(i) { + state <- last.states[[i]] + state.c <- 
mx.symbol.BlockGrad(state$c, name=paste0("l", i, ".last.c")) + return (state.c) + }) + unpack.h <- lapply(1:num.lstm.layer, function(i) { + state <- last.states[[i]] + state.h <- mx.symbol.BlockGrad(state$h, name=paste0("l", i, ".last.h")) + return (state.h) + }) + + list.all <- c(sm, unpack.c, unpack.h) + return (mx.symbol.Group(list.all)) +} + +is.param.name <- function(name) { + return (grepl('weight$', name) || grepl('bias$', name) || + grepl('gamma$', name) || grepl('beta$', name) ) +} + +# Initialize parameters +mx.model.init.params.rnn <- function(symbol, input.shape, initializer, ctx) { + if (!is.mx.symbol(symbol)) stop("symbol need to be MXSymbol") + slist <- symbol$infer.shape(input.shape) + if (is.null(slist)) stop("Not enough information to get shapes") + arg.params <- mx.init.create(initializer, slist$arg.shapes, ctx, skip.unknown=TRUE) + aux.params <- mx.init.create(initializer, slist$aux.shapes, ctx, skip.unknown=FALSE) + return(list(arg.params=arg.params, aux.params=aux.params)) +} + +# Initialize the data iter +mx.model.init.iter.rnn <- function(X, y, batch.size, is.train) { + if (is.MXDataIter(X)) return(X) + shape <- dim(X) + if (is.null(shape)) { + num.data <- length(X) + } else { + ndim <- length(shape) + num.data <- shape[[ndim]] + } + if (is.null(y)) { + if (is.train) stop("Need to provide parameter y for training with R arrays.") + y <- c(1:num.data) * 0 + } + + batch.size <- min(num.data, batch.size) + + return(mx.io.arrayiter(X, y, batch.size=batch.size, shuffle=is.train)) +} + +# set up rnn model with lstm cells +setup.rnn.model <- function(rnn.sym, ctx, + num.lstm.layer, seq.len, + num.hidden, num.embed, num.label, + batch.size, input.size, + initializer=mx.init.uniform(0.01), + dropout=0) { + + arg.names <- rnn.sym$arguments + input.shapes <- list() + for (name in arg.names) { + if (grepl('init.c$', name) || grepl('init.h$', name)) { + input.shapes[[name]] <- c(num.hidden, batch.size) + } + else if (grepl('data$', name) || 
grepl('label$', name) ) { + if (seq.len == 1) { + input.shapes[[name]] <- c(batch.size) + } else { + input.shapes[[name]] <- c(seq.len, batch.size) + } + } + } + params <- mx.model.init.params.rnn(rnn.sym, input.shapes, initializer, mx.cpu()) + args <- input.shapes + args$symbol <- rnn.sym + args$ctx <- ctx + args$grad.req <- "add" + rnn.exec <- do.call(mx.simple.bind, args) + + mx.exec.update.arg.arrays(rnn.exec, params$arg.params, match.name=TRUE) + mx.exec.update.aux.arrays(rnn.exec, params$aux.params, match.name=TRUE) + + grad.arrays <- list() + for (name in names(rnn.exec$ref.grad.arrays)) { + if (is.param.name(name)) + grad.arrays[[name]] <- rnn.exec$ref.arg.arrays[[name]]*0 + } + mx.exec.update.grad.arrays(rnn.exec, grad.arrays, match.name=TRUE) + + return (list(rnn.exec=rnn.exec, symbol=rnn.sym, + num.lstm.layer=num.lstm.layer, num.hidden=num.hidden, + seq.len=seq.len, batch.size=batch.size, + num.embed=num.embed)) + +} + + +calc.nll <- function(seq.label.probs, batch.size) { + nll = - sum(log(seq.label.probs)) / batch.size + return (nll) +} + +get.label <- function(label, ctx) { + label <- as.array(label) + seq.len <- dim(label)[[1]] + batch.size <- dim(label)[[2]] + sm.label <- array(0, dim=c(seq.len*batch.size)) + for (seqidx in 1:seq.len) { + sm.label[((seqidx-1)*batch.size+1) : (seqidx*batch.size)] <- label[seqidx,] + } + return (mx.nd.array(sm.label, ctx)) +} + + + +train.lstm <- function(model, train.data, eval.data, + num.round, update.period, + optimizer='sgd', ctx=mx.ctx.default(), ...) { + m <- model + seq.len <- m$seq.len + batch.size <- m$batch.size + num.lstm.layer <- m$num.lstm.layer + num.hidden <- m$num.hidden + + opt <- mx.opt.create(optimizer, rescale.grad=(1/batch.size), ...) 
+ + updater <- mx.opt.get.updater(opt, m$rnn.exec$ref.arg.arrays) + epoch.counter <- 0 + log.period <- max(as.integer(1000 / seq.len), 1) + last.perp <- 10000000.0 + + for (iteration in 1:num.round) { + nbatch <- 0 + train.nll <- 0 + # reset states + init.states <- list() + for (i in 1:num.lstm.layer) { + init.states[[paste0("l", i, ".init.c")]] <- mx.nd.zeros(c(num.hidden, batch.size)) + init.states[[paste0("l", i, ".init.h")]] <- mx.nd.zeros(c(num.hidden, batch.size)) + } + mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE) + + tic <- Sys.time() + + train.data$reset() + + while (train.data$iter.next()) { + # set rnn input + rnn.input <- train.data$value() + mx.exec.update.arg.arrays(m$rnn.exec, rnn.input, match.name=TRUE) + + mx.exec.forward(m$rnn.exec, is.train=TRUE) + seq.label.probs <- mx.nd.choose.element.0index(m$rnn.exec$ref.outputs[["sm_output"]], get.label(m$rnn.exec$ref.arg.arrays[["label"]], ctx)) + + mx.exec.backward(m$rnn.exec) + init.states <- list() + for (i in 1:num.lstm.layer) { + init.states[[paste0("l", i, ".init.c")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0 + init.states[[paste0("l", i, ".init.h")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 + } + mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE) + # update epoch counter + epoch.counter <- epoch.counter + 1 + if (epoch.counter %% update.period == 0) { + # the gradient of initial c and inital h should be zero + init.grad <- list() + for (i in 1:num.lstm.layer) { + init.grad[[paste0("l", i, ".init.c")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0 + init.grad[[paste0("l", i, ".init.h")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 + } + mx.exec.update.grad.arrays(m$rnn.exec, init.grad, match.name=TRUE) + + arg.blocks <- updater(m$rnn.exec$ref.arg.arrays, m$rnn.exec$ref.grad.arrays) + + mx.exec.update.arg.arrays(m$rnn.exec, arg.blocks, skip.null=TRUE) + + grad.arrays <- list() + for (name in 
names(m$rnn.exec$ref.grad.arrays)) { + if (is.param.name(name)) + grad.arrays[[name]] <- m$rnn.exec$ref.grad.arrays[[name]]*0 + } + mx.exec.update.grad.arrays(m$rnn.exec, grad.arrays, match.name=TRUE) + + } + + train.nll <- train.nll + calc.nll(as.array(seq.label.probs), batch.size) + + nbatch <- nbatch + seq.len + if ((epoch.counter %% log.period) == 0) { + cat(paste0("Epoch [", epoch.counter, + "] Train: NLL=", train.nll / nbatch, + ", Perp=", exp(train.nll / nbatch), "\n")) + } + } + train.data$reset() + # end of training loop + toc <- Sys.time() + cat(paste0("Iter [", iteration, + "] Train: Time: ", as.numeric(toc - tic, units="secs"), + " sec, NLL=", train.nll / nbatch, + ", Perp=", exp(train.nll / nbatch), "\n")) + + if (!is.null(eval.data)) { + val.nll <- 0.0 + # validation set, reset states + init.states <- list() + for (i in 1:num.lstm.layer) { + init.states[[paste0("l", i, ".init.c")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0 + init.states[[paste0("l", i, ".init.h")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 + } + mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE) + + eval.data$reset() + nbatch <- 0 + while (eval.data$iter.next()) { + # set rnn input + rnn.input <- eval.data$value() + mx.exec.update.arg.arrays(m$rnn.exec, rnn.input, match.name=TRUE) + mx.exec.forward(m$rnn.exec, is.train=FALSE) + # probability of each label class, used to evaluate nll + seq.label.probs <- mx.nd.choose.element.0index(m$rnn.exec$ref.outputs[["sm_output"]], get.label(m$rnn.exec$ref.arg.arrays[["label"]], ctx)) + # transfer the states + init.states <- list() + for (i in 1:num.lstm.layer) { + init.states[[paste0("l", i, ".init.c")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0 + init.states[[paste0("l", i, ".init.h")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 + } + mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE) + val.nll <- val.nll + calc.nll(as.array(seq.label.probs), 
batch.size) + nbatch <- nbatch + seq.len + } + eval.data$reset() + perp <- exp(val.nll / nbatch) + cat(paste0("Iter [", iteration, + "] Val: NLL=", val.nll / nbatch, + ", Perp=", exp(val.nll / nbatch), "\n")) + } + } + + return (m) +} + + +check.data <- function(data, batch.size, is.train) { + if (!is.null(data) && !is.list(data) && !is.mx.dataiter(data)) { + stop("The dataset should be either a mx.io.DataIter or a R list") + } + if (is.list(data)) { + if (is.null(data$data) || is.null(data$label)){ + stop("Please provide dataset as list(data=R.array, label=R.array)") + } + data <- mx.model.init.iter.rnn(data$data, data$label, batch.size=batch.size, is.train = is.train) + } + if (!is.null(data) && !data$iter.next()) { + data$reset() + if (!data$iter.next()) stop("Empty input") + } + return (data) +} + +#' Training LSTM Unrolled Model +#' +#' @param train.data mx.io.DataIter or list(data=R.array, label=R.array) +#' The Training set. +#' @param eval.data mx.io.DataIter or list(data=R.array, label=R.array), optional +#' The validation set used for validation evaluation during the progress. +#' @param num.lstm.layer integer +#' The number of the layer of lstm. +#' @param seq.len integer +#' The length of the input sequence. +#' @param num.hidden integer +#' The number of hidden nodes. +#' @param num.embed integer +#' The output dim of embedding. +#' @param num.label integer +#' The number of labels. +#' @param batch.size integer +#' The batch size used for R array training. +#' @param input.size integer +#' The input dim of one-hot encoding of embedding +#' @param ctx mx.context, optional +#' The device used to perform training. +#' @param num.round integer, default=10 +#' The number of iterations over training data to train the model. +#' @param update.period integer, default=1 +#' The number of iterations to update parameters during training period. +#' @param initializer initializer object. default=mx.init.uniform(0.01) +#' The initialization scheme for parameters. 
+#' @param dropout float, default=0 +#' A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer. +#' @param optimizer string, default="sgd" +#' The optimization method. +#' @param ... other parameters passing to \code{mx.lstm}/. +#' @return model A trained lstm unrolled model. +#' +#' @export +mx.lstm <- function(train.data, eval.data=NULL, + num.lstm.layer, seq.len, + num.hidden, num.embed, num.label, + batch.size, input.size, + ctx=mx.ctx.default(), + num.round=10, update.period=1, + initializer=mx.init.uniform(0.01), + dropout=0, optimizer='sgd', + ...) { + # check data and change data into iterator + train.data <- check.data(train.data, batch.size, TRUE) + eval.data <- check.data(eval.data, batch.size, FALSE) + + # get unrolled lstm symbol + rnn.sym <- lstm.unroll(num.lstm.layer=num.lstm.layer, + num.hidden=num.hidden, + seq.len=seq.len, + input.size=input.size, + num.embed=num.embed, + num.label=num.label, + dropout=dropout) + # set up lstm model + model <- setup.rnn.model(rnn.sym=rnn.sym, + ctx=ctx, + num.lstm.layer=num.lstm.layer, + seq.len=seq.len, + num.hidden=num.hidden, + num.embed=num.embed, + num.label=num.label, + batch.size=batch.size, + input.size=input.size, + initializer=initializer, + dropout=dropout) + + # train lstm model + model <- train.lstm(model, train.data, eval.data, + num.round=num.round, + update.period=update.period, + ctx=ctx, + ...) + # change model into MXFeedForwardModel + model <- list(symbol=model$symbol, arg.params=model$rnn.exec$ref.arg.arrays, aux.params=model$rnn.exec$ref.aux.arrays) + return(structure(model, class="MXFeedForwardModel")) +} + + +#' Create a LSTM Inference Model +#' +#' @param num.lstm.layer integer +#' The number of the layer of lstm. +#' @param input.size integer +#' The input dim of one-hot encoding of embedding +#' @param num.hidden integer +#' The number of hidden nodes. +#' @param num.embed integer +#' The output dim of embedding. 
+#' @param num.label integer +#' The number of labels. +#' @param batch.size integer +#' The batch size used for R array training. +#' @param arg.params list +#' Model parameter, list of name to NDArray of net's weights. +#' @param ctx mx.context, optional +#' The device used to perform inference. +#' @param dropout float, default=0 +#' A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer. +#' @return model a lstm inference model. +#' +#' @export +mx.lstm.inference <- function(num.lstm.layer, + input.size, + num.hidden, + num.embed, + num.label, + batch.size=1, + arg.params, + ctx=mx.cpu(), + dropout=0.) { + sym <- lstm.inference.symbol(num.lstm.layer, + input.size, + num.hidden, + num.embed, + num.label, + dropout) + + seq.len <- 1 + # set up lstm model + model <- setup.rnn.model(rnn.sym=sym, + ctx=ctx, + num.lstm.layer=num.lstm.layer, + seq.len=seq.len, + num.hidden=num.hidden, + num.embed=num.embed, + num.label=num.label, + batch.size=batch.size, + input.size=input.size, + initializer=mx.init.uniform(0.01), + dropout=dropout) + arg.names <- names(model$rnn.exec$ref.arg.arrays) + for (k in names(arg.params)) { + if ((k %in% arg.names) && is.param.name(k) ) { + rnn.input <- list() + rnn.input[[k]] <- arg.params[[k]] + mx.exec.update.arg.arrays(model$rnn.exec, rnn.input, match.name=TRUE) + } + } + init.states <- list() + for (i in 1:num.lstm.layer) { + init.states[[paste0("l", i, ".init.c")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0 + init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 + } + mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE) + + return (model) +} + +#' Using forward function to predict in lstm inference model +#' +#' @param model lstm model +#' A Lstm inference model +#' @param input.data, array.matrix +#' The input data for forward function +#' @param new.seq boolean, default=FALSE +#' Whether the input is 
the start of a new sequence +#' +#' @return result A list(prob=prob, model=model) containing the result probability of each label and the model. +#' +#' @export + +mx.lstm.forward <- function(model, input.data, new.seq=FALSE) { + if (new.seq == TRUE) { + init.states <- list() + for (i in 1:model$num.lstm.layer) { + init.states[[paste0("l", i, ".init.c")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0 + init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 + } + mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE) + } + dim(input.data) <- c(model$batch.size) + data <- list(data=mx.nd.array(input.data)) + mx.exec.update.arg.arrays(model$rnn.exec, data, match.name=TRUE) + mx.exec.forward(model$rnn.exec, is.train=FALSE) + init.states <- list() + for (i in 1:model$num.lstm.layer) { + init.states[[paste0("l", i, ".init.c")]] <- model$rnn.exec$ref.outputs[[paste0("l", i, ".last.c_output")]] + init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.outputs[[paste0("l", i, ".last.h_output")]] + } + mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE) + prob <- model$rnn.exec$ref.outputs[["sm_output"]] + return (list(prob=prob, model=model)) +} diff --git a/R-package/man/mx.lstm.Rd b/R-package/man/mx.lstm.Rd new file mode 100644 index 000000000000..32d3c8b58da4 --- /dev/null +++ b/R-package/man/mx.lstm.Rd @@ -0,0 +1,66 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lstm.R +\name{mx.lstm} +\alias{mx.lstm} +\title{Training LSTM Unrolled Model} +\usage{ +mx.lstm(train.data, eval.data = NULL, num.lstm.layer, seq.len, num.hidden, + num.embed, num.label, batch.size, input.size, ctx = mx.ctx.default(), + num.round = 10, update.period = 1, initializer = mx.init.uniform(0.01), + dropout = 0, optimizer = "sgd", ...) 
+} +\arguments{ +\item{train.data}{mx.io.DataIter or list(data=R.array, label=R.array) +The Training set.} + +\item{eval.data}{mx.io.DataIter or list(data=R.array, label=R.array), optional +The validation set used for validation evaluation during the progress.} + +\item{num.lstm.layer}{integer +The number of the layer of lstm.} + +\item{seq.len}{integer +The length of the input sequence.} + +\item{num.hidden}{integer +The number of hidden nodes.} + +\item{num.embed}{integer +The output dim of embedding.} + +\item{num.label}{integer +The number of labels.} + +\item{batch.size}{integer +The batch size used for R array training.} + +\item{input.size}{integer +The input dim of one-hot encoding of embedding} + +\item{ctx}{mx.context, optional +The device used to perform training.} + +\item{num.round}{integer, default=10 +The number of iterations over training data to train the model.} + +\item{update.period}{integer, default=1 +The number of iterations to update parameters during training period.} + +\item{initializer}{initializer object. default=mx.init.uniform(0.01) +The initialization scheme for parameters.} + +\item{dropout}{float, default=0 +A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.} + +\item{optimizer}{string, default="sgd" +The optimization method.} + +\item{...}{other parameters passing to \code{mx.lstm}/.} +} +\value{ +model A trained lstm unrolled model. 
+} +\description{ +Training LSTM Unrolled Model +} + diff --git a/R-package/man/mx.lstm.forward.Rd b/R-package/man/mx.lstm.forward.Rd new file mode 100644 index 000000000000..3fa654f7c901 --- /dev/null +++ b/R-package/man/mx.lstm.forward.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lstm.R +\name{mx.lstm.forward} +\alias{mx.lstm.forward} +\title{Using forward function to predict in lstm inference model} +\usage{ +mx.lstm.forward(model, input.data, new.seq = FALSE) +} +\arguments{ +\item{model}{lstm model +A Lstm inference model} + +\item{input.data, }{array.matrix +The input data for forward function} + +\item{new.seq}{boolean, default=FALSE +Whether the input is the start of a new sequence} +} +\value{ +result A list(prob=prob, model=model) containing the result probability of each label and the model. +} +\description{ +Using forward function to predict in lstm inference model +} + diff --git a/R-package/man/mx.lstm.inference.Rd b/R-package/man/mx.lstm.inference.Rd new file mode 100644 index 000000000000..af572ee28590 --- /dev/null +++ b/R-package/man/mx.lstm.inference.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lstm.R +\name{mx.lstm.inference} +\alias{mx.lstm.inference} +\title{Create a LSTM Inference Model} +\usage{ +mx.lstm.inference(num.lstm.layer, input.size, num.hidden, num.embed, num.label, + batch.size = 1, arg.params, ctx = mx.cpu(), dropout = 0) +} +\arguments{ +\item{num.lstm.layer}{integer +The number of the layer of lstm.} + +\item{input.size}{integer +The input dim of one-hot encoding of embedding} + +\item{num.hidden}{integer +The number of hidden nodes.} + +\item{num.embed}{integer +The output dim of embedding.} + +\item{num.label}{integer +The number of labels.} + +\item{batch.size}{integer +The batch size used for R array training.} + +\item{arg.params}{list +The batch size used for R array training.} + +\item{ctx}{mx.context, optional 
+Model parameter, list of name to NDArray of net's weights.} + +\item{dropout}{float, default=0 +A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.} +} +\value{ +model a lstm inference model. +} +\description{ +Create a LSTM Inference Model +} + diff --git a/R-package/src/io.cc b/R-package/src/io.cc index 8da9fbba839c..27a25382cd0d 100644 --- a/R-package/src/io.cc +++ b/R-package/src/io.cc @@ -54,14 +54,14 @@ ArrayDataIter::ArrayDataIter(const Rcpp::NumericVector& data, RLOG_FATAL << "Data and label shape in-consistent"; } } - - std::vector order(label.size()); + num_data = lshape[lshape.size() - 1]; + std::vector order(num_data); for (size_t i = 0; i < order.size(); ++i) { order[i] = i; } if (shuffle) { - RCHECK(unif_rnds.size() == label.size()); + RCHECK(unif_rnds.size() == num_data); for (size_t i = order.size() - 1; i != 0; --i) { size_t idx = static_cast(unif_rnds[i] * (i + 1)); if (idx < i) { @@ -110,7 +110,7 @@ void ArrayDataIter::Convert(const Rcpp::NumericVector& src, } Rcpp::List ArrayDataIter::Value() const { - RCHECK(counter_ != 0 && counter_ <= label_.size()) + RCHECK(counter_ != 0 && counter_ <= num_data) << "Read Iter at end or before iter.next is called"; return Rcpp::List::create( Rcpp::Named("data") = data_[counter_ - 1].RObject(), diff --git a/R-package/src/io.h b/R-package/src/io.h index 8a68ec7d30df..b3032351fe7a 100644 --- a/R-package/src/io.h +++ b/R-package/src/io.h @@ -132,12 +132,15 @@ class ArrayDataIter : public DataIter { size_t counter_; /*! \brief number of pad instances*/ size_t num_pad_; + /*! \brief number of data */ + size_t num_data; /*! \brief The data list of each batch */ std::vector data_; /*! \brief The data list of each batch */ std::vector label_; }; + /*! 
\brief The DataIterCreate functions to be invoked */ class DataIterCreateFunction : public ::Rcpp::CppFunction { public: @@ -177,6 +180,8 @@ class DataIterCreateFunction : public ::Rcpp::CppFunction { // name of the function std::string name_; }; + + } // namespace R } // namespace mxnet diff --git a/R-package/tests/testthat/test_lstm.R b/R-package/tests/testthat/test_lstm.R new file mode 100644 index 000000000000..24b1a59636dc --- /dev/null +++ b/R-package/tests/testthat/test_lstm.R @@ -0,0 +1,52 @@ +require(mxnet) + +context("lstm models") + +get.nll <- function(s) { + pat <- ".*\\NLL=(.+), Perp=.*" + nll <- sub(pat, "\\1", s) + return (as.numeric(nll)) +} + +test_that("training error decreasing", { + + # Set basic network parameters. + batch.size = 2 + seq.len = 2 + num.hidden = 1 + num.embed = 2 + num.lstm.layer = 2 + num.round = 5 + learning.rate= 0.1 + wd=0.00001 + clip_gradient=1 + update.period = 1 + vocab=17 + + X.train <- list(data=array(1:16, dim=c(2,8)), label=array(2:17, dim=c(2,8))) + + s <- capture.output(model <- mx.lstm( X.train, + ctx=mx.cpu(), + num.round=num.round, + update.period=update.period, + num.lstm.layer=num.lstm.layer, + seq.len=seq.len, + num.hidden=num.hidden, + num.embed=num.embed, + num.label=vocab, + batch.size=batch.size, + input.size=vocab, + initializer=mx.init.uniform(0.01), + learning.rate=learning.rate, + wd=wd, + clip_gradient=clip_gradient)) + + prev.nll <- 10000000.0 + for (r in s) { + nll <- get.nll(r) + expect_true(prev.nll >= nll) + prev.nll <- nll + + } + +}) \ No newline at end of file diff --git a/R-package/vignettes/CharRnnModel.Rmd b/R-package/vignettes/CharRnnModel.Rmd new file mode 100644 index 000000000000..9066d60f7513 --- /dev/null +++ b/R-package/vignettes/CharRnnModel.Rmd @@ -0,0 +1,276 @@ +Char RNN Example +============================================= + +This example aims to show how to use lstm model to build a char level language model, and generate text from it. 
We use a tiny Shakespeare text for demo purposes.

Data can be found at https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare.

Preface
-------
This tutorial is written in Rmarkdown.
- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.org/en/latest/package/r/CharRnnModel.html)
- You can download the Rmarkdown source from [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/CharRnnModel.Rmd)

Load Data
---------
First of all, load in the data and preprocess it.
```{r}
require(mxnet)
```
Set basic network parameters.
```{r}
batch.size = 32
seq.len = 32
num.hidden = 256
num.embed = 256
num.lstm.layer = 2
num.round = 3
learning.rate = 0.1
wd = 0.00001
clip_gradient = 1
update.period = 1
```
Download the data.
```{r}
download.data <- function(data_dir) {
    dir.create(data_dir, showWarnings = FALSE)
    dest <- paste0(data_dir, 'input.txt')
    if (!file.exists(dest)) {
        # NOTE(review): the original passed method='wget', which fails on any
        # system without the wget binary; the default method is portable.
        download.file(url='https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt',
                      destfile=dest)
    }
}
```
Make a dictionary from the text.
```{r}
make.dict <- function(text, max.vocab=10000) {
    text <- strsplit(text, '')
    dic <- list()
    idx <- 1
    for (c in text[[1]]) {
        if (!(c %in% names(dic))) {
            dic[[c]] <- idx
            idx <- idx + 1
        }
    }
    # Reserve the last slot for characters beyond the vocabulary cap.
    if (length(dic) == max.vocab - 1)
        dic[["UNKNOWN"]] <- idx
    cat(paste0("Total unique char: ", length(dic), "\n"))
    return (dic)
}
```
Transfer the text into a data feature.
```{r}
make.data <- function(file.path, seq.len=32, max.vocab=10000, dic=NULL) {
    fi <- file(file.path, "r")
    text <- paste(readLines(fi), collapse="\n")
    close(fi)

    if (is.null(dic))
        dic <- make.dict(text, max.vocab)
    lookup.table <- list()
    for (c in names(dic)) {
        idx <- dic[[c]]
        lookup.table[[idx]] <- c
    }

    char.lst <- strsplit(text, '')[[1]]
    num.seq <- as.integer(length(char.lst) / seq.len)
    char.lst <- char.lst[1:(num.seq * seq.len)]
    data <- array(0, dim=c(seq.len, num.seq))
    idx <- 1
    for (i in 1:num.seq) {
        for (j in 1:seq.len) {
            # Character ids are 0-based for the embedding layer.
            if (char.lst[idx] %in% names(dic))
                data[j, i] <- dic[[ char.lst[idx] ]]-1
            else {
                data[j, i] <- dic[["UNKNOWN"]]-1
            }
            idx <- idx + 1
        }
    }
    return (list(data=data, dic=dic, lookup.table=lookup.table))
}
```
Drop the tail of the text so the number of sequences is a multiple of batch.size.
```{r}
drop.tail <- function(X, batch.size) {
    shape <- dim(X)
    nstep <- as.integer(shape[2] / batch.size)
    return (X[, 1:(nstep * batch.size)])
}
```
Get the label of X: each label is the next character in the sequence.
```{r}
get.label <- function(X) {
    label <- array(0, dim=dim(X))
    d <- dim(X)[1]
    w <- dim(X)[2]
    for (i in 0:(w-1)) {
        for (j in 1:d) {
            # Linear (column-major) indexing shifts every entry by one
            # position, wrapping around at the end.
            label[i*d+j] <- X[(i*d+j)%%(w*d)+1]
        }
    }
    return (label)
}
```
Get the training data and the eval data.
```{r}
download.data("./data/")
ret <- make.data("./data/input.txt", seq.len=seq.len)
X <- ret$data
dic <- ret$dic
lookup.table <- ret$lookup.table

vocab <- length(dic)

shape <- dim(X)
train.val.fraction <- 0.9
size <- shape[2]

X.train.data <- X[, 1:as.integer(size * train.val.fraction)]
X.val.data <- X[, -(1:as.integer(size * train.val.fraction))]
X.train.data <- drop.tail(X.train.data, batch.size)
X.val.data <- drop.tail(X.val.data, batch.size)

X.train.label <- get.label(X.train.data)
X.val.label <- get.label(X.val.data)

X.train <- list(data=X.train.data, label=X.train.label)
X.val <- list(data=X.val.data, label=X.val.label)
```

Training Model
--------------
In `mxnet`, we have a function called `mx.lstm` so that users can build a general lstm model.

```{r}
model <- mx.lstm(X.train, X.val,
                 ctx=mx.cpu(),
                 num.round=num.round,
                 update.period=update.period,
                 num.lstm.layer=num.lstm.layer,
                 seq.len=seq.len,
                 num.hidden=num.hidden,
                 num.embed=num.embed,
                 num.label=vocab,
                 batch.size=batch.size,
                 input.size=vocab,
                 initializer=mx.init.uniform(0.1),
                 learning.rate=learning.rate,
                 wd=wd,
                 clip_gradient=clip_gradient)
```
Setting the parameters ctx=mx.gpu(0) and num.round=5 produces the following result.
```
Epoch [31] Train: NLL=3.47213018872144, Perp=32.2052727363657
...
Epoch [961] Train: NLL=2.32060007657895, Perp=10.181782322355
Iter [1] Train: Time: 186.397065639496 sec, NLL=2.31135356537961, Perp=10.0880702804858
Iter [1] Val: NLL=1.94184484060012, Perp=6.97160060607419
Epoch [992] Train: NLL=1.84784553299322, Perp=6.34613225095329
...
Epoch [1953] Train: NLL=1.70175791172558, Perp=5.48357857093351
Iter [2] Train: Time: 188.929051160812 sec, NLL=1.70103940328978, Perp=5.47963998859367
Iter [2] Val: NLL=1.74979316010449, Perp=5.75341251767988
...
Epoch [2914] Train: NLL=1.54738185300295, Perp=4.69915099483974
Iter [3] Train: Time: 185.425321578979 sec, NLL=1.54604189517013, Perp=4.69285854740519
Iter [3] Val: NLL=1.67780240235925, Perp=5.35377758479576
Epoch [2945] Train: NLL=1.48868466087876, Perp=4.43126307034767
...
Iter [4] Train: Time: 185.487086296082 sec, NLL=1.4744973925858, Perp=4.36883940994296
Iter [4] Val: NLL=1.64488167325603, Perp=5.18039689118454
Epoch [3937] Train: NLL=1.46355541021581, Perp=4.32129622881604
...
Epoch [4898] Train: NLL=1.42900458455642, Perp=4.17454171976281
Iter [5] Train: Time: 185.070136785507 sec, NLL=1.42909226256273, Perp=4.17490775130428
Iter [5] Val: NLL=1.62716655804022, Perp=5.08943365437187
```
Inference from model
--------------------
Helper functions for random sampling.
```{r}
# Cumulative distribution of a (possibly unnormalized) weight vector.
cdf <- function(weights) {
    # Vectorized: equivalent to accumulating weight by weight in a loop.
    return (cumsum(weights) / sum(weights))
}

# Binary search: index of the first entry of `cdf` that is >= x.
search.val <- function(cdf, x) {
    l <- 1
    r <- length(cdf)
    while (l <= r) {
        m <- as.integer((l+r)/2)
        if (cdf[m] < x) {
            l <- m+1
        } else {
            r <- m-1
        }
    }
    return (l)
}

# Draw one index at random with probability proportional to the weights.
choice <- function(weights) {
    cdf.vals <- cdf(as.array(weights))
    x <- runif(1)
    idx <- search.val(cdf.vals, x)
    return (idx)
}
```
We can use random output or fixed output by choosing the largest probability.
```{r}
make.output <- function(prob, sample=FALSE, temperature=1.) {
    if (!sample) {
        idx <- which.max(as.array(prob))
    }
    else {
        # Temperature-scaled sampling on clipped probabilities.
        scale_prob <- mx.nd.clip(prob, 1e-6, 1 - 1e-6)
        rescale <- mx.nd.exp(mx.nd.log(scale_prob) / temperature)
        rescale <- rescale / (as.array(mx.nd.sum(rescale))[1])
        idx <- choice(rescale)
    }
    return (idx)
}
```

In `mxnet`, we have a function called `mx.lstm.inference` so that users can build an inference from the lstm model and then use the function `mx.lstm.forward` to get the forward output from the inference.
Build the inference from the model.
```{r}
infer.model <- mx.lstm.inference(num.lstm.layer=num.lstm.layer,
                                 input.size=vocab,
                                 num.hidden=num.hidden,
                                 num.embed=num.embed,
                                 num.label=vocab,
                                 arg.params=model$arg.params,
                                 ctx=mx.cpu())
```
Generate a sequence of 75 chars using the function `mx.lstm.forward`.
```
start <- 'a'
seq.len <- 75
random.sample <- TRUE

last.id <- dic[[start]]
out <- "a"
for (i in (1:(seq.len-1))) {
    input <- c(last.id-1)
    ret <- mx.lstm.forward(infer.model, input, FALSE)
    infer.model <- ret$model
    prob <- ret$prob
    last.id <- make.output(prob, random.sample)
    out <- paste0(out, lookup.table[[last.id]])
}
cat (paste0(out, "\n"))
```
The result:
```
ah not a drobl greens
Settled asing lately sistering sounted to their hight
```

---
<!-- docs/packages/r/CharRnnModel.Rmd — a second copy of this tutorial shipped with the docs. -->

Char RNN Example
=============================================

This example aims to show how to use the lstm model to build a char-level language model, and generate text from it. We use a tiny Shakespeare text for demo purposes.

Data can be found at https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare.

Preface
-------
This tutorial is written in Rmarkdown.
- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.org/en/latest/package/r/CharRnnModel.html)
- You can download the Rmarkdown source from [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/CharRnnModel.Rmd)

Load Data
---------
First of all, load in the data and preprocess it.
```{r}
require(mxnet)
```
Set basic network parameters.
```{r}
batch.size = 32
seq.len = 32
num.hidden = 256
num.embed = 256
num.lstm.layer = 2
num.round = 3
learning.rate = 0.1
wd = 0.00001
clip_gradient = 1
update.period = 1
```
Download the data.
```{r}
download.data <- function(data_dir) {
    dir.create(data_dir, showWarnings = FALSE)
    dest <- paste0(data_dir, 'input.txt')
    if (!file.exists(dest)) {
        # NOTE(review): the original passed method='wget', which fails on any
        # system without the wget binary; the default method is portable.
        download.file(url='https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt',
                      destfile=dest)
    }
}
```
Make a dictionary from the text.
```{r}
make.dict <- function(text, max.vocab=10000) {
    text <- strsplit(text, '')
    dic <- list()
    idx <- 1
    for (c in text[[1]]) {
        if (!(c %in% names(dic))) {
            dic[[c]] <- idx
            idx <- idx + 1
        }
    }
    # Reserve the last slot for characters beyond the vocabulary cap.
    if (length(dic) == max.vocab - 1)
        dic[["UNKNOWN"]] <- idx
    cat(paste0("Total unique char: ", length(dic), "\n"))
    return (dic)
}
```
Transfer the text into a data feature.
```{r}
make.data <- function(file.path, seq.len=32, max.vocab=10000, dic=NULL) {
    fi <- file(file.path, "r")
    text <- paste(readLines(fi), collapse="\n")
    close(fi)

    if (is.null(dic))
        dic <- make.dict(text, max.vocab)
    lookup.table <- list()
    for (c in names(dic)) {
        idx <- dic[[c]]
        lookup.table[[idx]] <- c
    }

    char.lst <- strsplit(text, '')[[1]]
    num.seq <- as.integer(length(char.lst) / seq.len)
    char.lst <- char.lst[1:(num.seq * seq.len)]
    data <- array(0, dim=c(seq.len, num.seq))
    idx <- 1
    for (i in 1:num.seq) {
        for (j in 1:seq.len) {
            # Character ids are 0-based for the embedding layer.
            if (char.lst[idx] %in% names(dic))
                data[j, i] <- dic[[ char.lst[idx] ]]-1
            else {
                data[j, i] <- dic[["UNKNOWN"]]-1
            }
            idx <- idx + 1
        }
    }
    return (list(data=data, dic=dic, lookup.table=lookup.table))
}
```
Drop the tail of the text so the number of sequences is a multiple of batch.size.
```{r}
drop.tail <- function(X, batch.size) {
    shape <- dim(X)
    nstep <- as.integer(shape[2] / batch.size)
    return (X[, 1:(nstep * batch.size)])
}
```
Get the label of X: each label is the next character in the sequence.
```{r}
get.label <- function(X) {
    label <- array(0, dim=dim(X))
    d <- dim(X)[1]
    w <- dim(X)[2]
    for (i in 0:(w-1)) {
        for (j in 1:d) {
            # Linear (column-major) indexing shifts every entry by one
            # position, wrapping around at the end.
            label[i*d+j] <- X[(i*d+j)%%(w*d)+1]
        }
    }
    return (label)
}
```
Get the training data and the eval data.
```{r}
download.data("./data/")
ret <- make.data("./data/input.txt", seq.len=seq.len)
X <- ret$data
dic <- ret$dic
lookup.table <- ret$lookup.table

vocab <- length(dic)

shape <- dim(X)
train.val.fraction <- 0.9
size <- shape[2]

X.train.data <- X[, 1:as.integer(size * train.val.fraction)]
X.val.data <- X[, -(1:as.integer(size * train.val.fraction))]
X.train.data <- drop.tail(X.train.data, batch.size)
X.val.data <- drop.tail(X.val.data, batch.size)

X.train.label <- get.label(X.train.data)
X.val.label <- get.label(X.val.data)

X.train <- list(data=X.train.data, label=X.train.label)
X.val <- list(data=X.val.data, label=X.val.label)
```

Training Model
--------------
In `mxnet`, we have a function called `mx.lstm` so that users can build a general lstm model.

```{r}
model <- mx.lstm(X.train, X.val,
                 ctx=mx.cpu(),
                 num.round=num.round,
                 update.period=update.period,
                 num.lstm.layer=num.lstm.layer,
                 seq.len=seq.len,
                 num.hidden=num.hidden,
                 num.embed=num.embed,
                 num.label=vocab,
                 batch.size=batch.size,
                 input.size=vocab,
                 initializer=mx.init.uniform(0.1),
                 learning.rate=learning.rate,
                 wd=wd,
                 clip_gradient=clip_gradient)
```
Setting the parameters ctx=mx.gpu(0) and num.round=5 produces the following result.
```
Epoch [31] Train: NLL=3.47213018872144, Perp=32.2052727363657
...
Epoch [961] Train: NLL=2.32060007657895, Perp=10.181782322355
Iter [1] Train: Time: 186.397065639496 sec, NLL=2.31135356537961, Perp=10.0880702804858
Iter [1] Val: NLL=1.94184484060012, Perp=6.97160060607419
Epoch [992] Train: NLL=1.84784553299322, Perp=6.34613225095329
...
Epoch [1953] Train: NLL=1.70175791172558, Perp=5.48357857093351
Iter [2] Train: Time: 188.929051160812 sec, NLL=1.70103940328978, Perp=5.47963998859367
Iter [2] Val: NLL=1.74979316010449, Perp=5.75341251767988
...
Epoch [2914] Train: NLL=1.54738185300295, Perp=4.69915099483974
Iter [3] Train: Time: 185.425321578979 sec, NLL=1.54604189517013, Perp=4.69285854740519
Iter [3] Val: NLL=1.67780240235925, Perp=5.35377758479576
Epoch [2945] Train: NLL=1.48868466087876, Perp=4.43126307034767
...
Iter [4] Train: Time: 185.487086296082 sec, NLL=1.4744973925858, Perp=4.36883940994296
Iter [4] Val: NLL=1.64488167325603, Perp=5.18039689118454
Epoch [3937] Train: NLL=1.46355541021581, Perp=4.32129622881604
...
Epoch [4898] Train: NLL=1.42900458455642, Perp=4.17454171976281
Iter [5] Train: Time: 185.070136785507 sec, NLL=1.42909226256273, Perp=4.17490775130428
Iter [5] Val: NLL=1.62716655804022, Perp=5.08943365437187
```
Inference from model
--------------------
Helper functions for random sampling.
```{r}
# Cumulative distribution of a (possibly unnormalized) weight vector.
cdf <- function(weights) {
    # Vectorized: equivalent to accumulating weight by weight in a loop.
    return (cumsum(weights) / sum(weights))
}

# Binary search: index of the first entry of `cdf` that is >= x.
search.val <- function(cdf, x) {
    l <- 1
    r <- length(cdf)
    while (l <= r) {
        m <- as.integer((l+r)/2)
        if (cdf[m] < x) {
            l <- m+1
        } else {
            r <- m-1
        }
    }
    return (l)
}

# Draw one index at random with probability proportional to the weights.
choice <- function(weights) {
    cdf.vals <- cdf(as.array(weights))
    x <- runif(1)
    idx <- search.val(cdf.vals, x)
    return (idx)
}
```
We can use random output or fixed output by choosing the largest probability.
```{r}
make.output <- function(prob, sample=FALSE, temperature=1.) {
    if (!sample) {
        idx <- which.max(as.array(prob))
    }
    else {
        # Temperature-scaled sampling on clipped probabilities.
        scale_prob <- mx.nd.clip(prob, 1e-6, 1 - 1e-6)
        rescale <- mx.nd.exp(mx.nd.log(scale_prob) / temperature)
        rescale <- rescale / (as.array(mx.nd.sum(rescale))[1])
        idx <- choice(rescale)
    }
    return (idx)
}
```

In `mxnet`, we have a function called `mx.lstm.inference` so that users can build an inference from the lstm model and then use the function `mx.lstm.forward` to get the forward output from the inference.
Build the inference from the model.
```{r}
infer.model <- mx.lstm.inference(num.lstm.layer=num.lstm.layer,
                                 input.size=vocab,
                                 num.hidden=num.hidden,
                                 num.embed=num.embed,
                                 num.label=vocab,
                                 arg.params=model$arg.params,
                                 ctx=mx.cpu())
```
Generate a sequence of 75 chars using the function `mx.lstm.forward`.
```
start <- 'a'
seq.len <- 75
random.sample <- TRUE

last.id <- dic[[start]]
out <- "a"
for (i in (1:(seq.len-1))) {
    input <- c(last.id-1)
    ret <- mx.lstm.forward(infer.model, input, FALSE)
    infer.model <- ret$model
    prob <- ret$prob
    last.id <- make.output(prob, random.sample)
    out <- paste0(out, lookup.table[[last.id]])
}
cat (paste0(out, "\n"))
```
The result:
```
ah not a drobl greens
Settled asing lately sistering sounted to their hight
```

---
<!-- docs/packages/r/index.md — the tutorial index gains this entry: -->

* [Character Language Model using RNN Model](CharRnnModel.Rmd)

---
<!-- example/rnn/char_lstm.R — removed in this change; its content is kept below for reference. -->

# Char LSTM Example.
# This example aims to show how to use lstm to build a char-level language
# model, and generate text from it. We use a tiny Shakespeare text for demo
# purposes.
# Data can be found at https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare.
#
# If running for the first time, download the data by running the following
# command: sh get_ptb_data.sh

require(mxnet)
source("lstm.R")

# Basic network parameters.
batch.size = 32
seq.len = 32
num.hidden = 256
num.embed = 256
num.lstm.layer = 2
num.round = 21
learning.rate = 0.01
wd = 0.00001
clip_gradient = 1
update.period = 1

# Build a char -> integer-id dictionary from the text.
make.dict <- function(text, max.vocab=10000) {
    text <- strsplit(text, '')
    dic <- list()
    idx <- 1
    for (c in text[[1]]) {
        if (!(c %in% names(dic))) {
            dic[[c]] <- idx
            idx <- idx + 1
        }
    }
    # Reserve the last slot for out-of-vocabulary characters.
    if (length(dic) == max.vocab - 1)
        dic[["UNKNOWN"]] <- idx
    cat(paste0("Total unique char: ", length(dic), "\n"))
    return (dic)
}

# Transfer the text into a (batch.size x num.batch) id matrix.
# NOTE(review): the 'seq.lenth' (sic) argument is unused by the body; it is
# kept to preserve the original call signature.
make.batch <- function(file.path, batch.size=32, seq.lenth=32, max.vocab=10000, dic=NULL) {
    fi <- file(file.path, "r")
    text <- paste(readLines(fi), collapse="\n")
    close(fi)

    if (is.null(dic))
        dic <- make.dict(text, max.vocab)
    lookup.table <- list()
    for (c in names(dic)) {
        idx <- dic[[c]]
        lookup.table[[idx]] <- c
    }

    char.lst <- strsplit(text, '')[[1]]
    num.batch <- as.integer(length(char.lst) / batch.size)
    char.lst <- char.lst[1:(num.batch * batch.size)]
    data <- array(0, dim=c(batch.size, num.batch))
    idx <- 1
    for (j in 1:batch.size) {
        for (i in 1:num.batch) {
            if (char.lst[idx] %in% names(dic))
                data[j, i] <- dic[[ char.lst[idx] ]]
            else {
                data[j, i] <- dic[["UNKNOWN"]]
            }
            idx <- idx + 1
        }
    }
    return (list(data=data, dic=dic, lookup.table=lookup.table))
}

# Drop the tail so the number of columns is a multiple of seq.len.
drop.tail <- function(X, seq.len) {
    shape <- dim(X)
    nstep <- as.integer(shape[2] / seq.len)
    return (X[, 1:(nstep * seq.len)])
}

ret <- make.batch("./data/input.txt", batch.size=batch.size, seq.lenth=seq.len)
X <- ret$data
dic <- ret$dic
lookup.table <- ret$lookup.table

vocab <- length(dic)

shape <- dim(X)
train.val.fraction <- 0.9
size <- shape[2]
X.train <- X[, 1:as.integer(size * train.val.fraction)]
X.val <- X[, -(1:as.integer(size * train.val.fraction))]
X.train <- drop.tail(X.train, seq.len)
X.val <- drop.tail(X.val, seq.len)

# Set up the LSTM model.
model <- setup.rnn.model(ctx=mx.gpu(0),
                         num.lstm.layer=num.lstm.layer,
                         seq.len=seq.len,
                         num.hidden=num.hidden,
                         num.embed=num.embed,
                         num.label=vocab,
                         batch.size=batch.size,
                         input.size=vocab,
                         initializer=mx.init.uniform(0.1),
                         dropout=0.)

# Train the LSTM model.
train.lstm(model, X.train, X.val,
           num.round=num.round,
           half.life=3,
           update.period=update.period,
           learning.rate=learning.rate,
           wd=wd,
           clip_gradient=clip_gradient)

# ---------------------------------------------------------------------------
# example/rnn/lstm.R — removed in this change; its content is kept below.
# ---------------------------------------------------------------------------

require(mxnet)

# LSTM cell symbol: one step of one layer.
lstm <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dropout=0) {
    if (dropout > 0)
        indata <- mx.symbol.Dropout(data=indata, p=dropout)
    i2h <- mx.symbol.FullyConnected(data=indata,
                                    weight=param$i2h.weight,
                                    bias=param$i2h.bias,
                                    num.hidden=num.hidden * 4,
                                    name=paste0("t", seqidx, ".l", layeridx, ".i2h"))
    h2h <- mx.symbol.FullyConnected(data=prev.state$h,
                                    weight=param$h2h.weight,
                                    bias=param$h2h.bias,
                                    num.hidden=num.hidden * 4,
                                    name=paste0("t", seqidx, ".l", layeridx, ".h2h"))
    gates <- i2h + h2h
    # The 4*num.hidden outputs split into input/transform/forget/output gates.
    slice.gates <- mx.symbol.SliceChannel(gates, num.outputs=4,
                                          name=paste0("t", seqidx, ".l", layeridx, ".slice"))

    in.gate <- mx.symbol.Activation(slice.gates[[1]], act.type="sigmoid")
    in.transform <- mx.symbol.Activation(slice.gates[[2]], act.type="tanh")
    forget.gate <- mx.symbol.Activation(slice.gates[[3]], act.type="sigmoid")
    out.gate <- mx.symbol.Activation(slice.gates[[4]], act.type="sigmoid")
    next.c <- (forget.gate * prev.state$c) + (in.gate * in.transform)
    next.h <- out.gate * mx.symbol.Activation(next.c, act.type="tanh")

    return (list(c=next.c, h=next.h))
}

# Unrolled LSTM network over seq.len time steps.
lstm.unroll <- function(num.lstm.layer, seq.len, input.size,
                        num.hidden, num.embed, num.label, dropout=0.) {
    embed.weight <- mx.symbol.Variable("embed.weight")
    cls.weight <- mx.symbol.Variable("cls.weight")
    cls.bias <- mx.symbol.Variable("cls.bias")
    param.cells <- list()
    last.states <- list()
    for (i in 1:num.lstm.layer) {
        param.cells[[i]] <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")),
                                 i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")),
                                 h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")),
                                 h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias")))
        state <- list(c=mx.symbol.Variable(paste0("l", i, ".init.c")),
                      h=mx.symbol.Variable(paste0("l", i, ".init.h")))
        last.states[[i]] <- state
    }

    last.hidden <- list()
    label <- mx.symbol.Variable("label")
    for (seqidx in 1:seq.len) {
        # Embedding layer for this time step.
        data <- mx.symbol.Variable(paste0("t", seqidx, ".data"))
        hidden <- mx.symbol.Embedding(data=data, weight=embed.weight,
                                      input.dim=input.size,
                                      output.dim=num.embed,
                                      name=paste0("t", seqidx, ".embed"))

        # Stack the lstm layers.
        for (i in 1:num.lstm.layer) {
            # BUG FIX(review): the original tested `i == 0`, which is never
            # true for this 1-based loop, so dropout was wrongly applied to
            # every layer. The intent (matching the replacement
            # R-package/R/lstm.R, which uses ifelse(i==1, 0, dropout)) is to
            # skip dropout on the first layer.
            if (i == 1) {
                dp <- 0
            }
            else {
                dp <- dropout
            }
            next.state <- lstm(num.hidden, indata=hidden,
                               prev.state=last.states[[i]],
                               param=param.cells[[i]],
                               seqidx=seqidx, layeridx=i,
                               dropout=dp)
            hidden <- next.state$h
            last.states[[i]] <- next.state
        }
        # Decoder dropout.
        if (dropout > 0)
            hidden <- mx.symbol.Dropout(data=hidden, p=dropout)
        last.hidden <- c(last.hidden, hidden)
    }
    last.hidden$dim <- 0
    last.hidden$num.args <- seq.len
    concat <- mxnet:::mx.varg.symbol.Concat(last.hidden)
    fc <- mx.symbol.FullyConnected(data=concat,
                                   weight=cls.weight,
                                   bias=cls.bias,
                                   num.hidden=num.label)
    loss.all <- mx.symbol.SoftmaxOutput(data=fc, label=label, name="sm")
    # Expose the final cell/hidden states (gradient-blocked) so the trainer
    # can feed them back in as the next call's init states.
    unpack.c <- list()
    unpack.h <- list()
    for (i in 1:num.lstm.layer) {
        state <- last.states[[i]]
        state <- list(c=mx.symbol.BlockGrad(state$c, name=paste0("l", i, ".last.c")),
                      h=mx.symbol.BlockGrad(state$h, name=paste0("l", i, ".last.h")))
        last.states[[i]] <- state
        unpack.c <- c(unpack.c, state$c)
        unpack.h <- c(unpack.h, state$h)
    }
    list.all <- c(loss.all, unpack.c, unpack.h)

    return (mx.symbol.Group(list.all))
}

# TRUE when `name` is a learnable-parameter argument name.
is.param.name <- function(name) {
    return (grepl('weight$', name) || grepl('bias$', name) ||
            grepl('gamma$', name) || grepl('beta$', name))
}

# Initialize arg/aux parameter arrays from inferred shapes.
mx.model.init.params <- function(symbol, input.shape, initializer, ctx) {
    if (!is.mx.symbol(symbol)) stop("symbol need to be MXSymbol")
    slist <- symbol$infer.shape(input.shape)
    if (is.null(slist)) stop("Not enough information to get shapes")
    arg.params <- mx.init.create(initializer, slist$arg.shapes, ctx, skip.unknown=TRUE)
    aux.params <- mx.init.create(initializer, slist$aux.shapes, ctx, skip.unknown=FALSE)
    return(list(arg.params=arg.params, aux.params=aux.params))
}

# Set up an rnn model with lstm cells: bind an executor whose parameter
# gradients accumulate ("add") and start with all gradients zeroed.
setup.rnn.model <- function(ctx,
                            num.lstm.layer, seq.len,
                            num.hidden, num.embed, num.label,
                            batch.size, input.size,
                            initializer=mx.init.uniform(0.01),
                            dropout=0) {
    rnn.sym <- lstm.unroll(num.lstm.layer=num.lstm.layer,
                           num.hidden=num.hidden,
                           seq.len=seq.len,
                           input.size=input.size,
                           num.embed=num.embed,
                           num.label=num.label,
                           dropout=dropout)
    arg.names <- rnn.sym$arguments
    input.shapes <- list()
    for (name in arg.names) {
        if (grepl('init.c$', name) || grepl('init.h$', name)) {
            input.shapes[[name]] <- c(num.hidden, batch.size)
        }
        else if (grepl('data$', name)) {
            input.shapes[[name]] <- c(batch.size)
        }
    }

    params <- mx.model.init.params(rnn.sym, input.shapes, initializer, ctx)

    args <- input.shapes
    args$symbol <- rnn.sym
    args$ctx <- ctx
    # Gradients accumulate across the update.period mini-steps.
    args$grad.req <- "add"
    rnn.exec <- do.call(mx.simple.bind, args)

    mx.exec.update.arg.arrays(rnn.exec, params$arg.params, match.name=TRUE)
    mx.exec.update.aux.arrays(rnn.exec, params$aux.params, match.name=TRUE)

    grad.arrays <- list()
    for (name in names(rnn.exec$ref.grad.arrays)) {
        if (is.param.name(name))
            grad.arrays[[name]] <- rnn.exec$ref.arg.arrays[[name]]*0
    }
    mx.exec.update.grad.arrays(rnn.exec, grad.arrays, match.name=TRUE)

    return (list(rnn.exec=rnn.exec, symbol=rnn.sym,
                 num.lstm.layer=num.lstm.layer, num.hidden=num.hidden,
                 seq.len=seq.len, batch.size=batch.size,
                 num.embed=num.embed))
}

# Slice seq.len columns of X starting at `begin` into per-step data arrays
# plus the flattened next-character labels.
get.rnn.inputs <- function(m, X, begin) {
    seq.len <- m$seq.len
    batch.size <- m$batch.size
    seq.labels <- array(0, dim=c(seq.len*batch.size))
    seq.data <- list()
    for (seqidx in 1:seq.len) {
        idx <- (begin + seqidx - 1) %% dim(X)[2] + 1
        next.idx <- (begin + seqidx) %% dim(X)[2] + 1
        x <- X[, idx]
        y <- X[, next.idx]

        seq.data[[paste0("t", seqidx, ".data")]] <- mx.nd.array(as.array(x))
        seq.labels[((seqidx-1)*batch.size+1) : (seqidx*batch.size)] <- y
    }
    seq.data$label <- mx.nd.array(seq.labels)
    return (seq.data)
}

# Negative log-likelihood of the chosen-label probabilities, normalized by
# the batch size (number of rows of X).
calc.nll <- function(seq.label.probs, X, begin) {
    nll = - sum(log(seq.label.probs)) / length(X[,1])
    return (nll)
}

# Train the bound executor on X.train.batch, evaluating on X.val.batch after
# every round. Additional ... arguments are forwarded to mx.opt.create.
# NOTE(review): half.life and max.grad.norm are accepted but not used by this
# implementation; kept for call compatibility.
train.lstm <- function(model, X.train.batch, X.val.batch,
                       num.round, update.period,
                       optimizer='sgd', half.life=2, max.grad.norm = 5.0, ...) {
    X.train.batch.shape <- dim(X.train.batch)
    X.val.batch.shape <- dim(X.val.batch)
    cat(paste0("Training with train.shape=(", paste0(X.train.batch.shape, collapse=","), ")"), "\n")
    cat(paste0("Training with val.shape=(", paste0(X.val.batch.shape, collapse=","), ")"), "\n")

    m <- model
    seq.len <- m$seq.len
    batch.size <- m$batch.size
    num.lstm.layer <- m$num.lstm.layer
    num.hidden <- m$num.hidden

    opt <- mx.opt.create(optimizer, rescale.grad=(1/batch.size), ...)
    updater <- mx.opt.get.updater(opt, m$rnn.exec$ref.arg.arrays)
    epoch.counter <- 0
    log.period <- max(as.integer(1000 / seq.len), 1)

    for (iteration in 1:num.round) {
        nbatch <- 0
        train.nll <- 0
        # Reset the recurrent states at the start of each round.
        init.states <- list()
        for (i in 1:num.lstm.layer) {
            init.states[[paste0("l", i, ".init.c")]] <- mx.nd.zeros(c(num.hidden, batch.size))
            init.states[[paste0("l", i, ".init.h")]] <- mx.nd.zeros(c(num.hidden, batch.size))
        }
        mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE)

        tic <- Sys.time()

        stopifnot(dim(X.train.batch)[[2]] %% seq.len == 0)
        stopifnot(dim(X.val.batch)[[2]] %% seq.len == 0)

        for (begin in seq(1, dim(X.train.batch)[2], seq.len)) {
            # Feed this window of the data.
            rnn.input <- get.rnn.inputs(m, X.train.batch, begin=begin)
            mx.exec.update.arg.arrays(m$rnn.exec, rnn.input, match.name=TRUE)

            mx.exec.forward(m$rnn.exec, is.train=TRUE)
            # Probability of each true label, used to evaluate nll.
            seq.label.probs <- mx.nd.choose.element.0index(m$rnn.exec$outputs[["sm_output"]], m$rnn.exec$arg.arrays[["label"]])
            mx.exec.backward(m$rnn.exec)
            # Carry the final states over to the next window.
            init.states <- list()
            for (i in 1:num.lstm.layer) {
                init.states[[paste0("l", i, ".init.c")]] <- m$rnn.exec$outputs[[paste0("l", i, ".last.c_output")]]
                init.states[[paste0("l", i, ".init.h")]] <- m$rnn.exec$outputs[[paste0("l", i, ".last.h_output")]]
            }
            mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE)
            # Update the epoch counter.
            epoch.counter <- epoch.counter + 1
            if (epoch.counter %% update.period == 0) {
                # The gradient of the initial c and initial h must be zero so
                # they are not treated as learnable parameters.
                init.grad <- list()
                for (i in 1:num.lstm.layer) {
                    init.grad[[paste0("l", i, ".init.c")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0
                    init.grad[[paste0("l", i, ".init.h")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0
                }
                mx.exec.update.grad.arrays(m$rnn.exec, init.grad, match.name=TRUE)

                arg.blocks <- updater(m$rnn.exec$ref.arg.arrays, m$rnn.exec$ref.grad.arrays)
                mx.exec.update.arg.arrays(m$rnn.exec, arg.blocks, skip.null=TRUE)

                # Zero the accumulated parameter gradients (grad.req is "add").
                grad.arrays <- list()
                for (name in names(m$rnn.exec$ref.grad.arrays)) {
                    if (is.param.name(name))
                        grad.arrays[[name]] <- m$rnn.exec$ref.grad.arrays[[name]]*0
                }
                mx.exec.update.grad.arrays(m$rnn.exec, grad.arrays, match.name=TRUE)
            }

            train.nll <- train.nll + calc.nll(as.array(seq.label.probs), X.train.batch, begin=begin)

            nbatch <- begin + seq.len
            if ((epoch.counter %% log.period) == 0) {
                cat(paste0("Epoch [", epoch.counter,
                           "] Train: NLL=", train.nll / nbatch,
                           ", Perp=", exp(train.nll / nbatch), "\n"))
            }
        }
        # End of the training loop for this round.
        toc <- Sys.time()
        cat(paste0("Iter [", iteration,
                   "] Train: Time: ", as.numeric(toc - tic, units="secs"),
                   " sec, NLL=", train.nll / nbatch,
                   ", Perp=", exp(train.nll / nbatch), "\n"))

        val.nll <- 0.0
        # Validation pass: reset states first.
        init.states <- list()
        for (i in 1:num.lstm.layer) {
            init.states[[paste0("l", i, ".init.c")]] <- mx.nd.zeros(c(num.hidden, batch.size))
            init.states[[paste0("l", i, ".init.h")]] <- mx.nd.zeros(c(num.hidden, batch.size))
        }
        mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE)

        for (begin in seq(1, dim(X.val.batch)[2], seq.len)) {
            rnn.input <- get.rnn.inputs(m, X.val.batch, begin=begin)
            mx.exec.update.arg.arrays(m$rnn.exec, rnn.input, match.name=TRUE)
            mx.exec.forward(m$rnn.exec, is.train=FALSE)
            seq.label.probs <- mx.nd.choose.element.0index(m$rnn.exec$outputs[["sm_output"]], m$rnn.exec$arg.arrays[["label"]])
            # Carry the final states over to the next window.
            init.states <- list()
            for (i in 1:num.lstm.layer) {
                init.states[[paste0("l", i, ".init.c")]] <- m$rnn.exec$outputs[[paste0("l", i, ".last.c_output")]]
                init.states[[paste0("l", i, ".init.h")]] <- m$rnn.exec$outputs[[paste0("l", i, ".last.h_output")]]
            }
            mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE)
            val.nll <- val.nll + calc.nll(as.array(seq.label.probs), X.val.batch, begin=begin)
        }
        nbatch <- dim(X.val.batch)[2]
        cat(paste0("Iter [", iteration,
                   "] Val: NLL=", val.nll / nbatch,
                   ", Perp=", exp(val.nll / nbatch), "\n"))
    }
}