From 2e9a8393a38f1cdd254fde756ef21ebc6f23726e Mon Sep 17 00:00:00 2001 From: ziyeqinghan Date: Fri, 3 Jun 2016 18:29:39 +0800 Subject: [PATCH 1/8] modify mx.io.arrayiter to let label support multi-dimension --- R-package/R/io.R | 9 ++++++++- {example/rnn => R-package/R}/lstm.R | 0 R-package/src/io.cc | 10 +++++----- R-package/src/io.h | 5 +++++ 4 files changed, 18 insertions(+), 6 deletions(-) rename {example/rnn => R-package/R}/lstm.R (100%) diff --git a/R-package/R/io.R b/R-package/R/io.R index cde2b4c032f1..10298fbaf056 100644 --- a/R-package/R/io.R +++ b/R-package/R/io.R @@ -41,7 +41,14 @@ mx.io.arrayiter <- function(data, label, batch.size=128, shuffle=FALSE) { if (shuffle) { - unif.rnds <- as.array(mx.runif(c(length(label)), ctx=mx.cpu())); + shape <- dim(data) + if (is.null(shape)) { + num.data <- length(data) + } else { + ndim <- length(shape) + num.data <- shape[[ndim]] + } + unif.rnds <- as.array(mx.runif(c(num.data), ctx=mx.cpu())); } else { unif.rnds <- as.array(0) } diff --git a/example/rnn/lstm.R b/R-package/R/lstm.R similarity index 100% rename from example/rnn/lstm.R rename to R-package/R/lstm.R diff --git a/R-package/src/io.cc b/R-package/src/io.cc index 8da9fbba839c..237b9d4bbf74 100644 --- a/R-package/src/io.cc +++ b/R-package/src/io.cc @@ -54,14 +54,14 @@ ArrayDataIter::ArrayDataIter(const Rcpp::NumericVector& data, RLOG_FATAL << "Data and label shape in-consistent"; } } - - std::vector order(label.size()); + num_data = lshape[lshape.size() - 1]; + std::vector order(num_data); for (size_t i = 0; i < order.size(); ++i) { order[i] = i; } if (shuffle) { - RCHECK(unif_rnds.size() == label.size()); + RCHECK(unif_rnds.size() == num_data); for (size_t i = order.size() - 1; i != 0; --i) { size_t idx = static_cast(unif_rnds[i] * (i + 1)); if (idx < i) { @@ -110,7 +110,7 @@ void ArrayDataIter::Convert(const Rcpp::NumericVector& src, } Rcpp::List ArrayDataIter::Value() const { - RCHECK(counter_ != 0 && counter_ <= label_.size()) + RCHECK(counter_ != 0 && 
counter_ <= num_data) << "Read Iter at end or before iter.next is called"; return Rcpp::List::create( Rcpp::Named("data") = data_[counter_ - 1].RObject(), @@ -126,7 +126,7 @@ bool ArrayDataIter::Next() { } int ArrayDataIter::NumPad() const { - if (counter_ == label_.size()) { + if (counter_ == num_data) { return static_cast(num_pad_); } else { return 0; diff --git a/R-package/src/io.h b/R-package/src/io.h index 8a68ec7d30df..b3032351fe7a 100644 --- a/R-package/src/io.h +++ b/R-package/src/io.h @@ -132,12 +132,15 @@ class ArrayDataIter : public DataIter { size_t counter_; /*! \brief number of pad instances*/ size_t num_pad_; + /*! \brief number of data */ + size_t num_data; /*! \brief The data list of each batch */ std::vector data_; /*! \brief The data list of each batch */ std::vector label_; }; + /*! \brief The DataIterCreate functions to be invoked */ class DataIterCreateFunction : public ::Rcpp::CppFunction { public: @@ -177,6 +180,8 @@ class DataIterCreateFunction : public ::Rcpp::CppFunction { // name of the function std::string name_; }; + + } // namespace R } // namespace mxnet From aa2d32fb37290d04450c9f68fcc2000cbee56319 Mon Sep 17 00:00:00 2001 From: ziyeqinghan Date: Fri, 3 Jun 2016 18:58:41 +0800 Subject: [PATCH 2/8] refactor lstm model to some high-level reusable API in R --- R-package/NAMESPACE | 3 + R-package/R/lstm.R | 483 ++++++++++++++++++++++------- R-package/man/mx.lstm.Rd | 66 ++++ R-package/man/mx.lstm.forward.Rd | 25 ++ R-package/man/mx.lstm.inference.Rd | 44 +++ example/rnn/char_lstm.R | 113 ------- 6 files changed, 513 insertions(+), 221 deletions(-) create mode 100644 R-package/man/mx.lstm.Rd create mode 100644 R-package/man/mx.lstm.forward.Rd create mode 100644 R-package/man/mx.lstm.inference.Rd delete mode 100644 example/rnn/char_lstm.R diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index beb797b289e2..355233385cbf 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -36,6 +36,9 @@ export(mx.io.MNISTIter) 
export(mx.io.arrayiter) export(mx.io.extract) export(mx.kv.create) +export(mx.lstm) +export(mx.lstm.forward) +export(mx.lstm.inference) export(mx.metric.accuracy) export(mx.metric.custom) export(mx.metric.logger) diff --git a/R-package/R/lstm.R b/R-package/R/lstm.R index 5caaf0213b95..87caed1eeecf 100644 --- a/R-package/R/lstm.R +++ b/R-package/R/lstm.R @@ -24,7 +24,7 @@ lstm <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dropou out.gate <- mx.symbol.Activation(slice.gates[[4]], act.type="sigmoid") next.c <- (forget.gate * prev.state$c) + (in.gate * in.transform) next.h <- out.gate * mx.symbol.Activation(next.c, act.type="tanh") - + return (list(c=next.c, h=next.h)) } @@ -47,20 +47,22 @@ lstm.unroll <- function(num.lstm.layer, seq.len, input.size, last.states[[i]] <- state } - last.hidden <- list() + + # embeding layer label <- mx.symbol.Variable("label") + data <- mx.symbol.Variable("data") + embed <- mx.symbol.Embedding(data=data, input_dim=input.size, + weight=embed.weight, output_dim=num.embed, name="embed") + wordvec <- mx.symbol.SliceChannel(data=embed, num_outputs=seq.len, squeeze_axis=1) + + last.hidden <- list() for (seqidx in 1:seq.len) { - # embeding layer - data <- mx.symbol.Variable(paste0("t", seqidx, ".data")) - - hidden <- mx.symbol.Embedding(data=data, weight=embed.weight, - input.dim=input.size, - output.dim=num.embed, - name=paste0("t", seqidx, ".embed")) - + + hidden = wordvec[[seqidx]] + # stack lstm for (i in 1:num.lstm.layer) { - if (i==0) { + if (i == 1) { dp <- 0 } else { @@ -69,13 +71,13 @@ lstm.unroll <- function(num.lstm.layer, seq.len, input.size, next.state <- lstm(num.hidden, indata=hidden, prev.state=last.states[[i]], param=param.cells[[i]], - seqidx=seqidx, layeridx=i, + seqidx=seqidx, layeridx=i, dropout=dp) hidden <- next.state$h last.states[[i]] <- next.state } # decoder - if (dropout > 0) + if (dropout > 0) hidden <- mx.symbol.Dropout(data=hidden, p=dropout) last.hidden <- c(last.hidden, hidden) } @@ -86,28 
+88,80 @@ lstm.unroll <- function(num.lstm.layer, seq.len, input.size, weight=cls.weight, bias=cls.bias, num.hidden=num.label) + + label <- mx.symbol.transpose(data=label) + label <- mx.symbol.Reshape(data=label, target.shape=c(0)) + loss.all <- mx.symbol.SoftmaxOutput(data=fc, label=label, name="sm") - unpack.c <- list() - unpack.h <- list() + return (loss.all) +} + +lstm.inference.symbol <- function(num.lstm.layer, input.size, + num.hidden, num.embed, num.label, dropout=0.) { + seqidx <- 0 + embed.weight <- mx.symbol.Variable("embed.weight") + cls.weight <- mx.symbol.Variable("cls.weight") + cls.bias <- mx.symbol.Variable("cls.bias") + param.cells <- list() + last.states <- list() for (i in 1:num.lstm.layer) { - state <- last.states[[i]] - state <- list(c=mx.symbol.BlockGrad(state$c, name=paste0("l", i, ".last.c")), - h=mx.symbol.BlockGrad(state$h, name=paste0("l", i, ".last.h" ))) + param.cells[[i]] <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")), + i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")), + h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")), + h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias"))) + state <- list(c=mx.symbol.Variable(paste0("l", i, ".init.c")), + h=mx.symbol.Variable(paste0("l", i, ".init.h"))) last.states[[i]] <- state - unpack.c <- c(unpack.c, state$c) - unpack.h <- c(unpack.h, state$h) } - list.all <- c(loss.all, unpack.c, unpack.h) - return (mx.symbol.Group(list.all)) + # embeding layer + data <- mx.symbol.Variable("data") + hidden <- mx.symbol.Embedding(data=data, input_dim=input.size, + weight=embed.weight, output_dim=num.embed, name="embed") + + # stack lstm + for (i in 1:num.lstm.layer) { + if (i == 1) { + dp <- 0 + } + else { + dp <- dropout + } + next.state <- lstm(num.hidden, indata=hidden, + prev.state=last.states[[i]], + param=param.cells[[i]], + seqidx=seqidx, layeridx=i, + dropout=dp) + hidden <- next.state$h + last.states[[i]] <- next.state + } + # decoder + if (dropout 
> 0) + hidden <- mx.symbol.Dropout(data=hidden, p=dropout) + + fc <- mx.symbol.FullyConnected(data=hidden, num_hidden=num.label, + weight=cls.weight, bias=cls.bias, name='pred') + sm <- mx.symbol.SoftmaxOutput(data=fc, name='sm') + output <- list() + output <- c(output, sm) + for (i in 1:num.lstm.layer) { + state <- last.states[[i]] + state <- list(c=mx.symbol.BlockGrad(state$c, name=paste0("l", i, ".last.c")), + h=mx.symbol.BlockGrad(state$h, name=paste0("l", i, ".last.h" ))) + last.states[[i]] <- state + output <- c(output, state$c) + output <- c(output, state$h) + } + return (mx.symbol.Group(output)) } is.param.name <- function(name) { - return (grepl('weight$', name) || grepl('bias$', name) || + return (grepl('weight$', name) || grepl('bias$', name) || grepl('gamma$', name) || grepl('beta$', name) ) } -mx.model.init.params <- function(symbol, input.shape, initializer, ctx) { +# Initialize parameters +mx.model.init.params.rnn <- function(symbol, input.shape, initializer, ctx) { if (!is.mx.symbol(symbol)) stop("symbol need to be MXSymbol") slist <- symbol$infer.shape(input.shape) if (is.null(slist)) stop("Not enough information to get shapes") @@ -116,34 +170,49 @@ mx.model.init.params <- function(symbol, input.shape, initializer, ctx) { return(list(arg.params=arg.params, aux.params=aux.params)) } +# Initialize the data iter +mx.model.init.iter.rnn <- function(X, y, batch.size, is.train) { + if (is.MXDataIter(X)) return(X) + shape <- dim(data) + if (is.null(shape)) { + num.data <- length(X) + } else { + ndim <- length(shape) + num.data <- shape[[ndim]] + } + if (is.null(y)) { + if (is.train) stop("Need to provide parameter y for training with R arrays.") + y <- c(1:num.data) * 0 + } + + batch.size <- min(num.data, batch.size) + + return(mx.io.arrayiter(X, y, batch.size=batch.size, shuffle=is.train)) +} + # set up rnn model with lstm cells -setup.rnn.model <- function(ctx, +setup.rnn.model <- function(rnn.sym, ctx, num.lstm.layer, seq.len, num.hidden, num.embed, 
num.label, batch.size, input.size, - initializer=mx.init.uniform(0.01), + initializer=mx.init.uniform(0.01), dropout=0) { - rnn.sym <- lstm.unroll(num.lstm.layer=num.lstm.layer, - num.hidden=num.hidden, - seq.len=seq.len, - input.size=input.size, - num.embed=num.embed, - num.label=num.label, - dropout=dropout) arg.names <- rnn.sym$arguments input.shapes <- list() for (name in arg.names) { if (grepl('init.c$', name) || grepl('init.h$', name)) { input.shapes[[name]] <- c(num.hidden, batch.size) } - else if (grepl('data$', name)) { - input.shapes[[name]] <- c(batch.size) + else if (grepl('data$', name) || grepl('label$', name) ) { + if (seq.len == 1) { + input.shapes[[name]] <- c(batch.size) + } else { + input.shapes[[name]] <- c(seq.len, batch.size) + } } } - - params <- mx.model.init.params(rnn.sym, input.shapes, initializer, ctx) - + params <- mx.model.init.params.rnn(rnn.sym, input.shapes, initializer, mx.cpu()) args <- input.shapes args$symbol <- rnn.sym args$ctx <- ctx @@ -161,45 +230,34 @@ setup.rnn.model <- function(ctx, mx.exec.update.grad.arrays(rnn.exec, grad.arrays, match.name=TRUE) return (list(rnn.exec=rnn.exec, symbol=rnn.sym, - num.lstm.layer=num.lstm.layer, num.hidden=num.hidden, + num.lstm.layer=num.lstm.layer, num.hidden=num.hidden, seq.len=seq.len, batch.size=batch.size, num.embed=num.embed)) } -get.rnn.inputs <- function(m, X, begin) { - seq.len <- m$seq.len - batch.size <- m$batch.size - seq.labels <- array(0, dim=c(seq.len*batch.size)) - seq.data <- list() - for (seqidx in 1:seq.len) { - idx <- (begin + seqidx - 1) %% dim(X)[2] + 1 - next.idx <- (begin + seqidx) %% dim(X)[2] + 1 - x <- X[, idx] - y <- X[, next.idx] +calc.nll <- function(seq.label.probs, batch.size) { + nll = - sum(log(seq.label.probs)) / batch.size + return (nll) +} - seq.data[[paste0("t", seqidx, ".data")]] <- mx.nd.array(as.array(x)) - seq.labels[((seqidx-1)*batch.size+1) : (seqidx*batch.size)] <- y +get.label <- function(label, ctx) { + label <- as.array(label) + seq.len <- 
dim(label)[[1]] + batch.size <- dim(label)[[2]] + sm.label <- array(0, dim=c(seq.len*batch.size)) + for (seqidx in 1:seq.len) { + sm.label[((seqidx-1)*batch.size+1) : (seqidx*batch.size)] <- label[seqidx,] } - seq.data$label <- mx.nd.array(seq.labels) - return (seq.data) + return (mx.nd.array(sm.label, ctx)) } -calc.nll <- function(seq.label.probs, X, begin) { - nll = - sum(log(seq.label.probs)) / length(X[,1]) - return (nll) -} -train.lstm <- function(model, X.train.batch, X.val.batch, +train.lstm <- function(model, train.data, eval.data, num.round, update.period, - optimizer='sgd', half.life=2, max.grad.norm = 5.0, ...) { - X.train.batch.shape <- dim(X.train.batch) - X.val.batch.shape <- dim(X.val.batch) - cat(paste0("Training with train.shape=(", paste0(X.train.batch.shape, collapse=","), ")"), "\n") - cat(paste0("Training with val.shape=(", paste0(X.val.batch.shape, collapse=","), ")"), "\n") - + optimizer='sgd', ctx=mx.ctx.default(), ...) { m <- model seq.len <- m$seq.len batch.size <- m$batch.size @@ -222,33 +280,30 @@ train.lstm <- function(model, X.train.batch, X.val.batch, init.states[[paste0("l", i, ".init.c")]] <- mx.nd.zeros(c(num.hidden, batch.size)) init.states[[paste0("l", i, ".init.h")]] <- mx.nd.zeros(c(num.hidden, batch.size)) } - mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE) + mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE) tic <- Sys.time() - stopifnot(dim(X.train.batch)[[2]] %% seq.len == 0) - stopifnot(dim(X.val.batch)[[2]] %% seq.len == 0) + train.data$reset() - for (begin in seq(1, dim(X.train.batch)[2], seq.len)) { + while (train.data$iter.next()) { # set rnn input - rnn.input <- get.rnn.inputs(m, X.train.batch, begin=begin) - mx.exec.update.arg.arrays(m$rnn.exec, rnn.input, match.name=TRUE) + rnn.input <- train.data$value() + mx.exec.update.arg.arrays(m$rnn.exec, rnn.input, match.name=TRUE) mx.exec.forward(m$rnn.exec, is.train=TRUE) - # probability of each label class, used to evaluate nll - 
seq.label.probs <- mx.nd.choose.element.0index(m$rnn.exec$outputs[["sm_output"]], m$rnn.exec$arg.arrays[["label"]]) + seq.label.probs <- mx.nd.choose.element.0index(m$rnn.exec$ref.outputs[["sm_output"]], get.label(m$rnn.exec$ref.arg.arrays[["label"]], ctx)) + mx.exec.backward(m$rnn.exec) - # transfer the states init.states <- list() for (i in 1:num.lstm.layer) { - init.states[[paste0("l", i, ".init.c")]] <- m$rnn.exec$outputs[[paste0("l", i, ".last.c_output")]] - init.states[[paste0("l", i, ".init.h")]] <- m$rnn.exec$outputs[[paste0("l", i, ".last.h_output")]] + init.states[[paste0("l", i, ".init.c")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0 + init.states[[paste0("l", i, ".init.h")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 } - mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE) + mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE) # update epoch counter epoch.counter <- epoch.counter + 1 if (epoch.counter %% update.period == 0) { - # the gradient of initial c and inital h should be zero init.grad <- list() for (i in 1:num.lstm.layer) { @@ -270,53 +325,265 @@ train.lstm <- function(model, X.train.batch, X.val.batch, } - train.nll <- train.nll + calc.nll(as.array(seq.label.probs), X.train.batch, begin=begin) + train.nll <- train.nll + calc.nll(as.array(seq.label.probs), batch.size) - nbatch <- begin + seq.len + nbatch <- nbatch + seq.len if ((epoch.counter %% log.period) == 0) { - cat(paste0("Epoch [", epoch.counter, - "] Train: NLL=", train.nll / nbatch, + cat(paste0("Epoch [", epoch.counter, + "] Train: NLL=", train.nll / nbatch, ", Perp=", exp(train.nll / nbatch), "\n")) } } + train.data$reset() # end of training loop toc <- Sys.time() - cat(paste0("Iter [", iteration, + cat(paste0("Iter [", iteration, "] Train: Time: ", as.numeric(toc - tic, units="secs"), " sec, NLL=", train.nll / nbatch, ", Perp=", exp(train.nll / nbatch), "\n")) - val.nll <- 0.0 - # validation set, reset states - 
init.states <- list() - for (i in 1:num.lstm.layer) { - init.states[[paste0("l", i, ".init.c")]] <- mx.nd.zeros(c(num.hidden, batch.size)) - init.states[[paste0("l", i, ".init.h")]] <- mx.nd.zeros(c(num.hidden, batch.size)) - } - mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE) - - for (begin in seq(1, dim(X.val.batch)[2], seq.len)) { - # set rnn input - rnn.input <- get.rnn.inputs(m, X.val.batch, begin=begin) - mx.exec.update.arg.arrays(m$rnn.exec, rnn.input, match.name=TRUE) - mx.exec.forward(m$rnn.exec, is.train=FALSE) - # probability of each label class, used to evaluate nll - seq.label.probs <- mx.nd.choose.element.0index(m$rnn.exec$outputs[["sm_output"]], m$rnn.exec$arg.arrays[["label"]]) - # transfer the states + if (!is.null(eval.data)) { + val.nll <- 0.0 + # validation set, reset states init.states <- list() for (i in 1:num.lstm.layer) { - init.states[[paste0("l", i, ".init.c")]] <- m$rnn.exec$outputs[[paste0("l", i, ".last.c_output")]] - init.states[[paste0("l", i, ".init.h")]] <- m$rnn.exec$outputs[[paste0("l", i, ".last.h_output")]] + init.states[[paste0("l", i, ".init.c")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0 + init.states[[paste0("l", i, ".init.h")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 } mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE) - val.nll <- val.nll + calc.nll(as.array(seq.label.probs), X.val.batch, begin=begin) + + eval.data$reset() + nbatch <- 0 + while (eval.data$iter.next()) { + # set rnn input + rnn.input <- eval.data$value() + mx.exec.update.arg.arrays(m$rnn.exec, rnn.input, match.name=TRUE) + mx.exec.forward(m$rnn.exec, is.train=FALSE) + # probability of each label class, used to evaluate nll + seq.label.probs <- mx.nd.choose.element.0index(m$rnn.exec$ref.outputs[["sm_output"]], get.label(m$rnn.exec$ref.arg.arrays[["label"]], ctx)) + # transfer the states + init.states <- list() + for (i in 1:num.lstm.layer) { + init.states[[paste0("l", i, ".init.c")]] 
<- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0 + init.states[[paste0("l", i, ".init.h")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 + } + mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE) + val.nll <- val.nll + calc.nll(as.array(seq.label.probs), batch.size) + nbatch <- nbatch + seq.len + } + eval.data$reset() + perp <- exp(val.nll / nbatch) + cat(paste0("Iter [", iteration, + "] Val: NLL=", val.nll / nbatch, + ", Perp=", exp(val.nll / nbatch), "\n")) } - nbatch <- dim(X.val.batch)[2] - perp <- exp(val.nll / nbatch) - cat(paste0("Iter [", iteration, - "] Val: NLL=", val.nll / nbatch, - ", Perp=", exp(val.nll / nbatch), "\n")) + } + + return (m) +} +check.data <- function(data, batch.size, is.train) { + if (!is.null(data) && !is.list(data) && !is.mx.dataiter(data)) { + stop("The dataset should be either a mx.io.DataIter or a R list") + } + if (is.list(data)) { + if (is.null(data$data) || is.null(data$label)){ + stop("Please provide dataset as list(data=R.array, label=R.array)") + } + data <- mx.model.init.iter.rnn(data$data, data$label, batch.size=batch.size, is.train = is.train) } -} \ No newline at end of file + if (!is.null(data) && !data$iter.next()) { + data$reset() + if (!data$iter.next()) stop("Empty input") + } + return (data) +} + +#' Training LSTM Unrolled Model +#' +#' @param train.data mx.io.DataIter or list(data=R.array, label=R.array) +#' The Training set. +#' @param eval.data mx.io.DataIter or list(data=R.array, label=R.array), optional +#' The validation set used for validation evaluation during the progress. +#' @param num.lstm.layer integer +#' The number of the layer of lstm. +#' @param seq.len integer +#' The length of the input sequence. +#' @param num.hidden integer +#' The number of hidden nodes. +#' @param num.embed integer +#' The output dim of embedding. +#' @param num.label integer +#' The number of labels. +#' @param batch.size integer +#' The batch size used for R array training. 
+#' @param input.size integer +#' The input dim of one-hot encoding of embedding +#' @param ctx mx.context, optional +#' The device used to perform training. +#' @param num.round integer, default=10 +#' The number of iterations over training data to train the model. +#' @param update.period integer, default=1 +#' The number of iterations to update parameters during training period. +#' @param initializer initializer object. default=mx.init.uniform(0.01) +#' The initialization scheme for parameters. +#' @param dropout float, default=0 +#' A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer. +#' @param optimizer string, default="sgd" +#' The optimization method. +#' @param ... other parameters passed to \code{train.lstm}. +#' @return model A trained lstm unrolled model. +#' +#' @export +mx.lstm <- function(train.data, eval.data=NULL, + num.lstm.layer, seq.len, + num.hidden, num.embed, num.label, + batch.size, input.size, + ctx=mx.ctx.default(), + num.round=10, update.period=1, + initializer=mx.init.uniform(0.01), + dropout=0, optimizer='sgd', + ...) { + # check data and change data into iterator + train.data <- check.data(train.data, batch.size, TRUE) + eval.data <- check.data(eval.data, batch.size, FALSE) + + # get unrolled lstm symbol + rnn.sym <- lstm.unroll(num.lstm.layer=num.lstm.layer, + num.hidden=num.hidden, + seq.len=seq.len, + input.size=input.size, + num.embed=num.embed, + num.label=num.label, + dropout=dropout) + # set up lstm model + model <- setup.rnn.model(rnn.sym=rnn.sym, + ctx=ctx, + num.lstm.layer=num.lstm.layer, + seq.len=seq.len, + num.hidden=num.hidden, + num.embed=num.embed, + num.label=num.label, + batch.size=batch.size, + input.size=input.size, + initializer=initializer, + dropout=dropout) + + # train lstm model + model <- train.lstm(model, train.data, eval.data, + num.round=num.round, + update.period=update.period, + ctx=ctx, + ...) 
+ # change model into MXFeedForwardModel + model <- list(symbol=model$symbol, arg.params=model$rnn.exec$ref.arg.arrays, aux.params=model$rnn.exec$ref.aux.arrays) + return(structure(model, class="MXFeedForwardModel")) +} + + +#' Create a LSTM Inference Model +#' +#' @param num.lstm.layer integer +#' The number of the layer of lstm. +#' @param input.size integer +#' The input dim of one-hot encoding of embedding +#' @param num.hidden integer +#' The number of hidden nodes. +#' @param num.embed integer +#' The output dim of embedding. +#' @param num.label integer +#' The number of labels. +#' @param batch.size integer +#' The batch size used for R array training. +#' @param arg.params list +#' Model parameter, list of name to NDArray of net's weights. +#' @param ctx mx.context, optional +#' The device used to perform inference. +#' @param dropout float, default=0 +#' A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer. +#' @return model a lstm inference model. +#' +#' @export +mx.lstm.inference <- function(num.lstm.layer, + input.size, + num.hidden, + num.embed, + num.label, + batch.size=1, + arg.params, + ctx=mx.cpu(), + dropout=0.) 
{ + sym <- lstm.inference.symbol(num.lstm.layer, + input.size, + num.hidden, + num.embed, + num.label, + dropout) + + seq.len <- 1 + # set up lstm model + model <- setup.rnn.model(rnn.sym=sym, + ctx=ctx, + num.lstm.layer=num.lstm.layer, + seq.len=seq.len, + num.hidden=num.hidden, + num.embed=num.embed, + num.label=num.label, + batch.size=batch.size, + input.size=input.size, + initializer=mx.init.uniform(0.01), + dropout=dropout) + arg.names <- names(model$rnn.exec$ref.arg.arrays) + for (k in names(arg.params)) { + if ((k %in% arg.names) && is.param.name(k) ) { + rnn.input <- list() + rnn.input[[k]] <- arg.params[[k]] + mx.exec.update.arg.arrays(model$rnn.exec, rnn.input, match.name=TRUE) + } + } + init.states <- list() + for (i in 1:num.lstm.layer) { + init.states[[paste0("l", i, ".init.c")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0 + init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 + } + mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE) + + return (model) +} + +#' Using forward function to predict in lstm inference model +#' +#' @param model lstm model +#' A Lstm inference model +#' @param input.data, array.matrix +#' The input data for forward function +#' @param new.seq boolean, default=FALSE +#' Whether the input is the start of a new sequence +#' +#' @return result A list(prob=prob, model=model) containing the result probability of each label and the model. 
+#' +#' @export + +mx.lstm.forward <- function(model, input.data, new.seq=FALSE) { + if (new.seq == TRUE) { + init.states <- list() + for (i in 1:num.lstm.layer) { + init.states[[paste0("l", i, ".init.c")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0 + init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 + } + mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE) + } + dim(input.data) <- c(model$batch.size) + data <- list(data=mx.nd.array(input.data)) + mx.exec.update.arg.arrays(model$rnn.exec, data, match.name=TRUE) + mx.exec.forward(model$rnn.exec, is.train=FALSE) + init.states <- list() + for (i in 1:num.lstm.layer) { + init.states[[paste0("l", i, ".init.c")]] <- model$rnn.exec$ref.outputs[[paste0("l", i, ".last.c_output")]] + init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.outputs[[paste0("l", i, ".last.h_output")]] + } + mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE) + prob <- model$rnn.exec$ref.outputs[["sm_output"]] + return (list(prob=prob, model=model)) +} diff --git a/R-package/man/mx.lstm.Rd b/R-package/man/mx.lstm.Rd new file mode 100644 index 000000000000..32d3c8b58da4 --- /dev/null +++ b/R-package/man/mx.lstm.Rd @@ -0,0 +1,66 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lstm.R +\name{mx.lstm} +\alias{mx.lstm} +\title{Training LSTM Unrolled Model} +\usage{ +mx.lstm(train.data, eval.data = NULL, num.lstm.layer, seq.len, num.hidden, + num.embed, num.label, batch.size, input.size, ctx = mx.ctx.default(), + num.round = 10, update.period = 1, initializer = mx.init.uniform(0.01), + dropout = 0, optimizer = "sgd", ...) 
+} +\arguments{ +\item{train.data}{mx.io.DataIter or list(data=R.array, label=R.array) +The Training set.} + +\item{eval.data}{mx.io.DataIter or list(data=R.array, label=R.array), optional +The validation set used for validation evaluation during the progress.} + +\item{num.lstm.layer}{integer +The number of the layer of lstm.} + +\item{seq.len}{integer +The length of the input sequence.} + +\item{num.hidden}{integer +The number of hidden nodes.} + +\item{num.embed}{integer +The output dim of embedding.} + +\item{num.label}{integer +The number of labels.} + +\item{batch.size}{integer +The batch size used for R array training.} + +\item{input.size}{integer +The input dim of one-hot encoding of embedding} + +\item{ctx}{mx.context, optional +The device used to perform training.} + +\item{num.round}{integer, default=10 +The number of iterations over training data to train the model.} + +\item{update.period}{integer, default=1 +The number of iterations to update parameters during training period.} + +\item{initializer}{initializer object. default=mx.init.uniform(0.01) +The initialization scheme for parameters.} + +\item{dropout}{float, default=0 +A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.} + +\item{optimizer}{string, default="sgd" +The optimization method.} + +\item{...}{other parameters passing to \code{mx.lstm}/.} +} +\value{ +model A trained lstm unrolled model. 
+} +\description{ +Training LSTM Unrolled Model +} + diff --git a/R-package/man/mx.lstm.forward.Rd b/R-package/man/mx.lstm.forward.Rd new file mode 100644 index 000000000000..3fa654f7c901 --- /dev/null +++ b/R-package/man/mx.lstm.forward.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lstm.R +\name{mx.lstm.forward} +\alias{mx.lstm.forward} +\title{Using forward function to predict in lstm inference model} +\usage{ +mx.lstm.forward(model, input.data, new.seq = FALSE) +} +\arguments{ +\item{model}{lstm model +A Lstm inference model} + +\item{input.data, }{array.matrix +The input data for forward function} + +\item{new.seq}{boolean, default=FALSE +Whether the input is the start of a new sequence} +} +\value{ +result A list(prob=prob, model=model) containing the result probability of each label and the model. +} +\description{ +Using forward function to predict in lstm inference model +} + diff --git a/R-package/man/mx.lstm.inference.Rd b/R-package/man/mx.lstm.inference.Rd new file mode 100644 index 000000000000..af572ee28590 --- /dev/null +++ b/R-package/man/mx.lstm.inference.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lstm.R +\name{mx.lstm.inference} +\alias{mx.lstm.inference} +\title{Create a LSTM Inference Model} +\usage{ +mx.lstm.inference(num.lstm.layer, input.size, num.hidden, num.embed, num.label, + batch.size = 1, arg.params, ctx = mx.cpu(), dropout = 0) +} +\arguments{ +\item{num.lstm.layer}{integer +The number of the layer of lstm.} + +\item{input.size}{integer +The input dim of one-hot encoding of embedding} + +\item{num.hidden}{integer +The number of hidden nodes.} + +\item{num.embed}{integer +The output dim of embedding.} + +\item{num.label}{integer +The number of labels.} + +\item{batch.size}{integer +The batch size used for R array training.} + +\item{arg.params}{list +The batch size used for R array training.} + +\item{ctx}{mx.context, optional 
+Model parameter, list of name to NDArray of net's weights.} + +\item{dropout}{float, default=0 +A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.} +} +\value{ +model a lstm inference model. +} +\description{ +Create a LSTM Inference Model +} + diff --git a/example/rnn/char_lstm.R b/example/rnn/char_lstm.R deleted file mode 100644 index 26d08b262b20..000000000000 --- a/example/rnn/char_lstm.R +++ /dev/null @@ -1,113 +0,0 @@ -# Char LSTM Example. - -# This example aims to show how to use lstm to build a char level language model, and generate text from it. We use a tiny shakespeare text for demo purpose. -# Data can be found at https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare. - -# If running for the first time, download the data by running the following commands: sh get_ptb_data.sh - -require(mxnet) -source("lstm.R") - -# Set basic network parameters. -batch.size = 32 -seq.len = 32 -num.hidden = 256 -num.embed = 256 -num.lstm.layer = 2 -num.round = 21 -learning.rate= 0.01 -wd=0.00001 -clip_gradient=1 -update.period = 1 - -# Make dictionary from text -make.dict <- function(text, max.vocab=10000) { - text <- strsplit(text, '') - dic <- list() - idx <- 1 - for (c in text[[1]]) { - if (!(c %in% names(dic))) { - dic[[c]] <- idx - idx <- idx + 1 - } - } - if (length(dic) == max.vocab - 1) - dic[["UNKNOWN"]] <- idx - cat(paste0("Total unique char: ", length(dic), "\n")) - return (dic) -} - -# Transfer text into data batch -make.batch <- function(file.path, batch.size=32, seq.lenth=32, max.vocab=10000, dic=NULL) { - fi <- file(file.path, "r") - text <- paste(readLines(fi), collapse="\n") - close(fi) - - if (is.null(dic)) - dic <- make.dict(text, max.vocab) - lookup.table <- list() - for (c in names(dic)) { - idx <- dic[[c]] - lookup.table[[idx]] <- c - } - - char.lst <- strsplit(text, '')[[1]] - num.batch <- as.integer(length(char.lst) / batch.size) - char.lst <- char.lst[1:(num.batch * batch.size)] - data <- 
array(0, dim=c(batch.size, num.batch)) - idx <- 1 - for (j in 1:batch.size) { - for (i in 1:num.batch) { - if (char.lst[idx] %in% names(dic)) - data[j, i] <- dic[[ char.lst[idx] ]] - else { - data[j, i] <- dic[["UNKNOWN"]] - } - idx <- idx + 1 - } - } - return (list(data=data, dic=dic, lookup.table=lookup.table)) -} - -# Move tail text -drop.tail <- function(X, seq.len) { - shape <- dim(X) - nstep <- as.integer(shape[2] / seq.len) - return (X[, 1:(nstep * seq.len)]) -} - -ret <- make.batch("./data/input.txt", batch.size=batch.size, seq.lenth=seq.len) -X <- ret$data -dic <- ret$dic -lookup.table <- ret$lookup.table - -vocab <- length(dic) - -shape <- dim(X) -train.val.fraction <- 0.9 -size <- shape[2] -X.train <- X[, 1:as.integer(size * train.val.fraction)] -X.val <- X[, -(1:as.integer(size * train.val.fraction))] -X.train <- drop.tail(X.train, seq.len) -X.val <- drop.tail(X.val, seq.len) - -# Set up LSTM model -model <- setup.rnn.model(ctx=mx.gpu(0), - num.lstm.layer=num.lstm.layer, - seq.len=seq.len, - num.hidden=num.hidden, - num.embed=num.embed, - num.label=vocab, - batch.size=batch.size, - input.size=vocab, - initializer=mx.init.uniform(0.1), - dropout=0.) 
- -# Train LSTM model -train.lstm(model, X.train, X.val, - num.round=num.round, - half.life=3, - update.period=update.period, - learning.rate=learning.rate, - wd=wd, - clip_gradient=clip_gradient) From 6842303cc5b741881f908976fdd312f3e5e8e046 Mon Sep 17 00:00:00 2001 From: ziyeqinghan Date: Fri, 3 Jun 2016 19:02:28 +0800 Subject: [PATCH 3/8] add the rnn example and documentations of Char-RNN model in R --- R-package/vignettes/CharRnnModel.Rmd | 277 +++++++++++++++++++++++++++ docs/packages/r/CharRnnModel.Rmd | 277 +++++++++++++++++++++++++++ docs/packages/r/index.md | 1 + 3 files changed, 555 insertions(+) create mode 100644 R-package/vignettes/CharRnnModel.Rmd create mode 100644 docs/packages/r/CharRnnModel.Rmd diff --git a/R-package/vignettes/CharRnnModel.Rmd b/R-package/vignettes/CharRnnModel.Rmd new file mode 100644 index 000000000000..ab56cfaaad3c --- /dev/null +++ b/R-package/vignettes/CharRnnModel.Rmd @@ -0,0 +1,277 @@ +Char RNN Example +============================================= + +This example aims to show how to use lstm model to build a char level language model, and generate text from it. We use a tiny shakespeare text for demo purpose. + +Data can be found at https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare. + +Preface +------- +This tutorial is written in Rmarkdown. +- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.org/en/latest/package/r/CharRnnModel.html) +- You can find the download the Rmarkdown source from [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/CharRnnModel.Rmd) + +Load Data +--------- +First of all, load in the data and preprocess it. +```{r} +require(mxnet) +``` +Set basic network parameters. +```{r} +batch.size = 32 +seq.len = 32 +num.hidden = 256 +num.embed = 256 +num.lstm.layer = 2 +num.round = 5 +learning.rate= 0.1 +wd=0.00001 +clip_gradient=1 +update.period = 1 +``` +download the data. 
+```{r} +download.data <- function(data_dir) { + dir.create(data_dir, showWarnings = FALSE) + setwd(data_dir) + if (!file.exists('input.txt')) { + download.file(url='https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt', + destfile='input.txt', method='wget') + } + setwd("..") +} +``` +Make dictionary from text. +```{r} +make.dict <- function(text, max.vocab=10000) { + text <- strsplit(text, '') + dic <- list() + idx <- 1 + for (c in text[[1]]) { + if (!(c %in% names(dic))) { + dic[[c]] <- idx + idx <- idx + 1 + } + } + if (length(dic) == max.vocab - 1) + dic[["UNKNOWN"]] <- idx + cat(paste0("Total unique char: ", length(dic), "\n")) + return (dic) +} +``` +Transfer text into data feature. +```{r} +make.data <- function(file.path, seq.len=32, max.vocab=10000, dic=NULL) { + fi <- file(file.path, "r") + text <- paste(readLines(fi), collapse="\n") + close(fi) + + if (is.null(dic)) + dic <- make.dict(text, max.vocab) + lookup.table <- list() + for (c in names(dic)) { + idx <- dic[[c]] + lookup.table[[idx]] <- c + } + + char.lst <- strsplit(text, '')[[1]] + num.seq <- as.integer(length(char.lst) / seq.len) + char.lst <- char.lst[1:(num.seq * seq.len)] + data <- array(0, dim=c(seq.len, num.seq)) + idx <- 1 + for (i in 1:num.seq) { + for (j in 1:seq.len) { + if (char.lst[idx] %in% names(dic)) + data[j, i] <- dic[[ char.lst[idx] ]]-1 + else { + data[j, i] <- dic[["UNKNOWN"]]-1 + } + idx <- idx + 1 + } + } + return (list(data=data, dic=dic, lookup.table=lookup.table)) +} +``` +Move tail text. 
+```{r} +drop.tail <- function(X, batch.size) { + shape <- dim(X) + nstep <- as.integer(shape[2] / batch.size) + return (X[, 1:(nstep * batch.size)]) +} +``` +get the label of X +```{r} +get.label <- function(X) { + label <- array(0, dim=dim(X)) + d <- dim(X)[1] + w <- dim(X)[2] + for (i in 0:(w-1)) { + for (j in 1:d) { + label[i*d+j] <- X[(i*d+j)%%(w*d)+1] + } + } + return (label) +} +``` +get training data and eval data +```{r} +download.data("./data/") +ret <- make.data("./data/input.txt", seq.len=seq.len) +X <- ret$data +dic <- ret$dic +lookup.table <- ret$lookup.table + +vocab <- length(dic) + +shape <- dim(X) +train.val.fraction <- 0.9 +size <- shape[2] + +X.train.data <- X[, 1:as.integer(size * train.val.fraction)] +X.val.data <- X[, -(1:as.integer(size * train.val.fraction))] +X.train.data <- drop.tail(X.train.data, batch.size) +X.val.data <- drop.tail(X.val.data, batch.size) + +X.train.label <- get.label(X.train.data) +X.val.label <- get.label(X.val.data) + +X.train <- list(data=X.train.data, label=X.train.label) +X.val <- list(data=X.val.data, label=X.val.label) +``` + +Training Model +-------------- +In `mxnet`, we have a function called `mx.lstm` so that users can build a general lstm model. + +```{r} +model <- mx.lstm(X.train, X.val, + ctx=mx.gpu(0), + num.round=num.round, + update.period=update.period, + num.lstm.layer=num.lstm.layer, + seq.len=seq.len, + num.hidden=num.hidden, + num.embed=num.embed, + num.label=vocab, + batch.size=batch.size, + input.size=vocab, + initializer=mx.init.uniform(0.1), + learning.rate=learning.rate, + wd=wd, + clip_gradient=clip_gradient) + +``` +``` +Epoch [31] Train: NLL=3.47213018872144, Perp=32.2052727363657 +... +Epoch [961] Train: NLL=2.32060007657895, Perp=10.181782322355 +Iter [1] Train: Time: 186.397065639496 sec, NLL=2.31135356537961, Perp=10.0880702804858 +Iter [1] Val: NLL=1.94184484060012, Perp=6.97160060607419 +Epoch [992] Train: NLL=1.84784553299322, Perp=6.34613225095329 +... 
+Epoch [1953] Train: NLL=1.70175791172558, Perp=5.48357857093351 +Iter [2] Train: Time: 188.929051160812 sec, NLL=1.70103940328978, Perp=5.47963998859367 +Iter [2] Val: NLL=1.74979316010449, Perp=5.75341251767988 +... +Epoch [2914] Train: NLL=1.54738185300295, Perp=4.69915099483974 +Iter [3] Train: Time: 185.425321578979 sec, NLL=1.54604189517013, Perp=4.69285854740519 +Iter [3] Val: NLL=1.67780240235925, Perp=5.35377758479576 +Epoch [2945] Train: NLL=1.48868466087876, Perp=4.43126307034767 +... +Iter [4] Train: Time: 185.487086296082 sec, NLL=1.4744973925858, Perp=4.36883940994296 +Iter [4] Val: NLL=1.64488167325603, Perp=5.18039689118454 +Epoch [3937] Train: NLL=1.46355541021581, Perp=4.32129622881604 +... +Epoch [4898] Train: NLL=1.42900458455642, Perp=4.17454171976281 +Iter [5] Train: Time: 185.070136785507 sec, NLL=1.42909226256273, Perp=4.17490775130428 +Iter [5] Val: NLL=1.62716655804022, Perp=5.08943365437187 + +``` +Inference from model +-------------------- +helper function for random sample. +```{r} +cdf <- function(weights) { + total <- sum(weights) + result <- c() + cumsum <- 0 + for (w in weights) { + cumsum <- cumsum+w + result <- c(result, cumsum / total) + } + return (result) +} + +search.val <- function(cdf, x) { + l <- 1 + r <- length(cdf) + while (l <= r) { + m <- as.integer((l+r)/2) + if (cdf[m] < x) { + l <- m+1 + } else { + r <- m-1 + } + } + return (l) +} +choice <- function(weights) { + cdf.vals <- cdf(as.array(weights)) + x <- runif(1) + idx <- search.val(cdf.vals, x) + return (idx) +} +``` +we can use random output or fixed output by choosing largest probability. +```{r} +make.output <- function(prob, sample=FALSE, temperature=1.) 
{ + if (!sample) { + idx <- which.max(as.array(prob)) + } + else { + scale_prob <- mx.nd.clip(prob, 1e-6, 1 - 1e-6) + rescale <- mx.nd.exp(mx.nd.log(scale_prob) / temperature) + rescale <- rescale / (as.array(mx.nd.sum(rescale))[1]) + idx <- choice(rescale) + } + return (idx) + +} +``` + +In `mxnet`, we have a function called `mx.lstm.inference` so that users can build a inference from lstm model and then use function `mx.lstm.forward` to get forward output from the inference. +Build inference from model. +```{r} +infer.model <- mx.lstm.inference(num.lstm.layer=num.lstm.layer, + input.size=vocab, + num.hidden=num.hidden, + num.embed=num.embed, + num.label=vocab, + arg.params=model$arg.params, + ctx=mx.gpu(0)) +``` +generate a sequence of 75 chars using function `mx.lstm.forward`. +``` +start <- 'a' +seq.len <- 75 +random.sample <- TRUE + +last.id <- dic[[start]] +out <- "a" +for (i in (1:(seq.len-1))) { + input <- c(last.id-1) + ret <- mx.lstm.forward(infer.model, input, FALSE) + infer.model <- ret$model + prob <- ret$prob + last.id <- make.output(prob, random.sample) + out <- paste0(out, lookup.table[[last.id]]) +} +cat (paste0(out, "\n")) +``` +The result: +``` +ah not a drobl greens +Settled asing lately sistering sounted to their hight +``` \ No newline at end of file diff --git a/docs/packages/r/CharRnnModel.Rmd b/docs/packages/r/CharRnnModel.Rmd new file mode 100644 index 000000000000..ab56cfaaad3c --- /dev/null +++ b/docs/packages/r/CharRnnModel.Rmd @@ -0,0 +1,277 @@ +Char RNN Example +============================================= + +This example aims to show how to use lstm model to build a char level language model, and generate text from it. We use a tiny shakespeare text for demo purpose. + +Data can be found at https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare. + +Preface +------- +This tutorial is written in Rmarkdown. 
+- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.org/en/latest/package/r/CharRnnModel.html) +- You can find the download the Rmarkdown source from [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/CharRnnModel.Rmd) + +Load Data +--------- +First of all, load in the data and preprocess it. +```{r} +require(mxnet) +``` +Set basic network parameters. +```{r} +batch.size = 32 +seq.len = 32 +num.hidden = 256 +num.embed = 256 +num.lstm.layer = 2 +num.round = 5 +learning.rate= 0.1 +wd=0.00001 +clip_gradient=1 +update.period = 1 +``` +download the data. +```{r} +download.data <- function(data_dir) { + dir.create(data_dir, showWarnings = FALSE) + setwd(data_dir) + if (!file.exists('input.txt')) { + download.file(url='https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt', + destfile='input.txt', method='wget') + } + setwd("..") +} +``` +Make dictionary from text. +```{r} +make.dict <- function(text, max.vocab=10000) { + text <- strsplit(text, '') + dic <- list() + idx <- 1 + for (c in text[[1]]) { + if (!(c %in% names(dic))) { + dic[[c]] <- idx + idx <- idx + 1 + } + } + if (length(dic) == max.vocab - 1) + dic[["UNKNOWN"]] <- idx + cat(paste0("Total unique char: ", length(dic), "\n")) + return (dic) +} +``` +Transfer text into data feature. 
+```{r} +make.data <- function(file.path, seq.len=32, max.vocab=10000, dic=NULL) { + fi <- file(file.path, "r") + text <- paste(readLines(fi), collapse="\n") + close(fi) + + if (is.null(dic)) + dic <- make.dict(text, max.vocab) + lookup.table <- list() + for (c in names(dic)) { + idx <- dic[[c]] + lookup.table[[idx]] <- c + } + + char.lst <- strsplit(text, '')[[1]] + num.seq <- as.integer(length(char.lst) / seq.len) + char.lst <- char.lst[1:(num.seq * seq.len)] + data <- array(0, dim=c(seq.len, num.seq)) + idx <- 1 + for (i in 1:num.seq) { + for (j in 1:seq.len) { + if (char.lst[idx] %in% names(dic)) + data[j, i] <- dic[[ char.lst[idx] ]]-1 + else { + data[j, i] <- dic[["UNKNOWN"]]-1 + } + idx <- idx + 1 + } + } + return (list(data=data, dic=dic, lookup.table=lookup.table)) +} +``` +Move tail text. +```{r} +drop.tail <- function(X, batch.size) { + shape <- dim(X) + nstep <- as.integer(shape[2] / batch.size) + return (X[, 1:(nstep * batch.size)]) +} +``` +get the label of X +```{r} +get.label <- function(X) { + label <- array(0, dim=dim(X)) + d <- dim(X)[1] + w <- dim(X)[2] + for (i in 0:(w-1)) { + for (j in 1:d) { + label[i*d+j] <- X[(i*d+j)%%(w*d)+1] + } + } + return (label) +} +``` +get training data and eval data +```{r} +download.data("./data/") +ret <- make.data("./data/input.txt", seq.len=seq.len) +X <- ret$data +dic <- ret$dic +lookup.table <- ret$lookup.table + +vocab <- length(dic) + +shape <- dim(X) +train.val.fraction <- 0.9 +size <- shape[2] + +X.train.data <- X[, 1:as.integer(size * train.val.fraction)] +X.val.data <- X[, -(1:as.integer(size * train.val.fraction))] +X.train.data <- drop.tail(X.train.data, batch.size) +X.val.data <- drop.tail(X.val.data, batch.size) + +X.train.label <- get.label(X.train.data) +X.val.label <- get.label(X.val.data) + +X.train <- list(data=X.train.data, label=X.train.label) +X.val <- list(data=X.val.data, label=X.val.label) +``` + +Training Model +-------------- +In `mxnet`, we have a function called `mx.lstm` so that 
users can build a general lstm model. + +```{r} +model <- mx.lstm(X.train, X.val, + ctx=mx.gpu(0), + num.round=num.round, + update.period=update.period, + num.lstm.layer=num.lstm.layer, + seq.len=seq.len, + num.hidden=num.hidden, + num.embed=num.embed, + num.label=vocab, + batch.size=batch.size, + input.size=vocab, + initializer=mx.init.uniform(0.1), + learning.rate=learning.rate, + wd=wd, + clip_gradient=clip_gradient) + +``` +``` +Epoch [31] Train: NLL=3.47213018872144, Perp=32.2052727363657 +... +Epoch [961] Train: NLL=2.32060007657895, Perp=10.181782322355 +Iter [1] Train: Time: 186.397065639496 sec, NLL=2.31135356537961, Perp=10.0880702804858 +Iter [1] Val: NLL=1.94184484060012, Perp=6.97160060607419 +Epoch [992] Train: NLL=1.84784553299322, Perp=6.34613225095329 +... +Epoch [1953] Train: NLL=1.70175791172558, Perp=5.48357857093351 +Iter [2] Train: Time: 188.929051160812 sec, NLL=1.70103940328978, Perp=5.47963998859367 +Iter [2] Val: NLL=1.74979316010449, Perp=5.75341251767988 +... +Epoch [2914] Train: NLL=1.54738185300295, Perp=4.69915099483974 +Iter [3] Train: Time: 185.425321578979 sec, NLL=1.54604189517013, Perp=4.69285854740519 +Iter [3] Val: NLL=1.67780240235925, Perp=5.35377758479576 +Epoch [2945] Train: NLL=1.48868466087876, Perp=4.43126307034767 +... +Iter [4] Train: Time: 185.487086296082 sec, NLL=1.4744973925858, Perp=4.36883940994296 +Iter [4] Val: NLL=1.64488167325603, Perp=5.18039689118454 +Epoch [3937] Train: NLL=1.46355541021581, Perp=4.32129622881604 +... +Epoch [4898] Train: NLL=1.42900458455642, Perp=4.17454171976281 +Iter [5] Train: Time: 185.070136785507 sec, NLL=1.42909226256273, Perp=4.17490775130428 +Iter [5] Val: NLL=1.62716655804022, Perp=5.08943365437187 + +``` +Inference from model +-------------------- +helper function for random sample. 
+```{r} +cdf <- function(weights) { + total <- sum(weights) + result <- c() + cumsum <- 0 + for (w in weights) { + cumsum <- cumsum+w + result <- c(result, cumsum / total) + } + return (result) +} + +search.val <- function(cdf, x) { + l <- 1 + r <- length(cdf) + while (l <= r) { + m <- as.integer((l+r)/2) + if (cdf[m] < x) { + l <- m+1 + } else { + r <- m-1 + } + } + return (l) +} +choice <- function(weights) { + cdf.vals <- cdf(as.array(weights)) + x <- runif(1) + idx <- search.val(cdf.vals, x) + return (idx) +} +``` +we can use random output or fixed output by choosing largest probability. +```{r} +make.output <- function(prob, sample=FALSE, temperature=1.) { + if (!sample) { + idx <- which.max(as.array(prob)) + } + else { + scale_prob <- mx.nd.clip(prob, 1e-6, 1 - 1e-6) + rescale <- mx.nd.exp(mx.nd.log(scale_prob) / temperature) + rescale <- rescale / (as.array(mx.nd.sum(rescale))[1]) + idx <- choice(rescale) + } + return (idx) + +} +``` + +In `mxnet`, we have a function called `mx.lstm.inference` so that users can build a inference from lstm model and then use function `mx.lstm.forward` to get forward output from the inference. +Build inference from model. +```{r} +infer.model <- mx.lstm.inference(num.lstm.layer=num.lstm.layer, + input.size=vocab, + num.hidden=num.hidden, + num.embed=num.embed, + num.label=vocab, + arg.params=model$arg.params, + ctx=mx.gpu(0)) +``` +generate a sequence of 75 chars using function `mx.lstm.forward`. 
+``` +start <- 'a' +seq.len <- 75 +random.sample <- TRUE + +last.id <- dic[[start]] +out <- "a" +for (i in (1:(seq.len-1))) { + input <- c(last.id-1) + ret <- mx.lstm.forward(infer.model, input, FALSE) + infer.model <- ret$model + prob <- ret$prob + last.id <- make.output(prob, random.sample) + out <- paste0(out, lookup.table[[last.id]]) +} +cat (paste0(out, "\n")) +``` +The result: +``` +ah not a drobl greens +Settled asing lately sistering sounted to their hight +``` \ No newline at end of file diff --git a/docs/packages/r/index.md b/docs/packages/r/index.md index 35e1fe7642b0..ef427abc4899 100644 --- a/docs/packages/r/index.md +++ b/docs/packages/r/index.md @@ -20,6 +20,7 @@ Tutorials * [Handwritten Digits Classification Competition](mnistCompetition.md) * [Tutorial on NDArray and Symbol](ndarrayAndSymbolTutorial.md) * [Tutorial on Callback Functions](CallbackFunctionTutorial.md) +* [Character Language Model using RNN Model](CharRnnModel.Rmd) Resources --------- From 7662da67ae4925c4b84d4a364b28311cab57bdb8 Mon Sep 17 00:00:00 2001 From: ziyeqinghan Date: Sun, 5 Jun 2016 20:15:23 +0800 Subject: [PATCH 4/8] modify some codes of lstm model in R --- R-package/R/lstm.R | 73 +++++++++++++--------------- R-package/vignettes/CharRnnModel.Rmd | 4 +- docs/packages/r/CharRnnModel.Rmd | 4 +- 3 files changed, 37 insertions(+), 44 deletions(-) diff --git a/R-package/R/lstm.R b/R-package/R/lstm.R index 87caed1eeecf..3fcd0e831751 100644 --- a/R-package/R/lstm.R +++ b/R-package/R/lstm.R @@ -1,5 +1,3 @@ -require(mxnet) - # lstm cell symbol lstm <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dropout=0) { if (dropout > 0) @@ -35,18 +33,19 @@ lstm.unroll <- function(num.lstm.layer, seq.len, input.size, embed.weight <- mx.symbol.Variable("embed.weight") cls.weight <- mx.symbol.Variable("cls.weight") cls.bias <- mx.symbol.Variable("cls.bias") - param.cells <- list() - last.states <- list() - for (i in 1:num.lstm.layer) { - param.cells[[i]] <- list(i2h.weight = 
mx.symbol.Variable(paste0("l", i, ".i2h.weight")), - i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")), - h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")), - h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias"))) + + param.cells <- lapply(1:num.lstm.layer, function(i) { + cell <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")), + i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")), + h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")), + h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias"))) + return (cell) + }) + last.states <- lapply(1:num.lstm.layer, function(i) { state <- list(c=mx.symbol.Variable(paste0("l", i, ".init.c")), h=mx.symbol.Variable(paste0("l", i, ".init.h"))) - last.states[[i]] <- state - } - + return (state) + }) # embeding layer label <- mx.symbol.Variable("label") @@ -62,12 +61,7 @@ lstm.unroll <- function(num.lstm.layer, seq.len, input.size, # stack lstm for (i in 1:num.lstm.layer) { - if (i == 1) { - dp <- 0 - } - else { - dp <- dropout - } + dp <- ifelse(i==1, 0, dropout) next.state <- lstm(num.hidden, indata=hidden, prev.state=last.states[[i]], param=param.cells[[i]], @@ -102,17 +96,19 @@ lstm.inference.symbol <- function(num.lstm.layer, input.size, embed.weight <- mx.symbol.Variable("embed.weight") cls.weight <- mx.symbol.Variable("cls.weight") cls.bias <- mx.symbol.Variable("cls.bias") - param.cells <- list() - last.states <- list() - for (i in 1:num.lstm.layer) { - param.cells[[i]] <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")), + + param.cells <- lapply(1:num.lstm.layer, function(i) { + cell <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")), i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")), h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")), h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias"))) + return (cell) + }) + last.states <- lapply(1:num.lstm.layer, function(i) { state <- 
list(c=mx.symbol.Variable(paste0("l", i, ".init.c")), h=mx.symbol.Variable(paste0("l", i, ".init.h"))) - last.states[[i]] <- state - } + return (state) + }) # embeding layer data <- mx.symbol.Variable("data") @@ -121,12 +117,7 @@ lstm.inference.symbol <- function(num.lstm.layer, input.size, # stack lstm for (i in 1:num.lstm.layer) { - if (i == 1) { - dp <- 0 - } - else { - dp <- dropout - } + dp <- ifelse(i==1, 0, dropout) next.state <- lstm(num.hidden, indata=hidden, prev.state=last.states[[i]], param=param.cells[[i]], @@ -142,17 +133,19 @@ lstm.inference.symbol <- function(num.lstm.layer, input.size, fc <- mx.symbol.FullyConnected(data=hidden, num_hidden=num.label, weight=cls.weight, bias=cls.bias, name='pred') sm <- mx.symbol.SoftmaxOutput(data=fc, name='sm') - output <- list() - output <- c(output, sm) - for (i in 1:num.lstm.layer) { + unpack.c <- lapply(1:num.lstm.layer, function(i) { state <- last.states[[i]] - state <- list(c=mx.symbol.BlockGrad(state$c, name=paste0("l", i, ".last.c")), - h=mx.symbol.BlockGrad(state$h, name=paste0("l", i, ".last.h" ))) - last.states[[i]] <- state - output <- c(output, state$c) - output <- c(output, state$h) - } - return (mx.symbol.Group(output)) + state.c <- mx.symbol.BlockGrad(state$c, name=paste0("l", i, ".last.c")) + return (state.c) + }) + unpack.h <- lapply(1:num.lstm.layer, function(i) { + state <- last.states[[i]] + state.h <- mx.symbol.BlockGrad(state$h, name=paste0("l", i, ".last.h")) + return (state.h) + }) + + list.all <- c(sm, unpack.c, unpack.h) + return (mx.symbol.Group(list.all)) } is.param.name <- function(name) { diff --git a/R-package/vignettes/CharRnnModel.Rmd b/R-package/vignettes/CharRnnModel.Rmd index ab56cfaaad3c..faebf54f9fd2 100644 --- a/R-package/vignettes/CharRnnModel.Rmd +++ b/R-package/vignettes/CharRnnModel.Rmd @@ -147,7 +147,7 @@ In `mxnet`, we have a function called `mx.lstm` so that users can build a genera ```{r} model <- mx.lstm(X.train, X.val, - ctx=mx.gpu(0), + ctx=mx.cpu(), 
num.round=num.round, update.period=update.period, num.lstm.layer=num.lstm.layer, @@ -250,7 +250,7 @@ infer.model <- mx.lstm.inference(num.lstm.layer=num.lstm.layer, num.embed=num.embed, num.label=vocab, arg.params=model$arg.params, - ctx=mx.gpu(0)) + ctx=mx.cpu()) ``` generate a sequence of 75 chars using function `mx.lstm.forward`. ``` diff --git a/docs/packages/r/CharRnnModel.Rmd b/docs/packages/r/CharRnnModel.Rmd index ab56cfaaad3c..faebf54f9fd2 100644 --- a/docs/packages/r/CharRnnModel.Rmd +++ b/docs/packages/r/CharRnnModel.Rmd @@ -147,7 +147,7 @@ In `mxnet`, we have a function called `mx.lstm` so that users can build a genera ```{r} model <- mx.lstm(X.train, X.val, - ctx=mx.gpu(0), + ctx=mx.cpu(), num.round=num.round, update.period=update.period, num.lstm.layer=num.lstm.layer, @@ -250,7 +250,7 @@ infer.model <- mx.lstm.inference(num.lstm.layer=num.lstm.layer, num.embed=num.embed, num.label=vocab, arg.params=model$arg.params, - ctx=mx.gpu(0)) + ctx=mx.cpu()) ``` generate a sequence of 75 chars using function `mx.lstm.forward`. ``` From 72724a774f59d9502823661c2c054f3a079a4323 Mon Sep 17 00:00:00 2001 From: ziyeqinghan Date: Mon, 6 Jun 2016 20:40:48 +0800 Subject: [PATCH 5/8] add unit-test for lstm model in R --- R-package/tests/testthat/test_lstm.R | 52 ++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 R-package/tests/testthat/test_lstm.R diff --git a/R-package/tests/testthat/test_lstm.R b/R-package/tests/testthat/test_lstm.R new file mode 100644 index 000000000000..24b1a59636dc --- /dev/null +++ b/R-package/tests/testthat/test_lstm.R @@ -0,0 +1,52 @@ +require(mxnet) + +context("lstm models") + +get.nll <- function(s) { + pat <- ".*\\NLL=(.+), Perp=.*" + nll <- sub(pat, "\\1", s) + return (as.numeric(nll)) +} + +test_that("training error decreasing", { + + # Set basic network parameters. 
+ batch.size = 2 + seq.len = 2 + num.hidden = 1 + num.embed = 2 + num.lstm.layer = 2 + num.round = 5 + learning.rate= 0.1 + wd=0.00001 + clip_gradient=1 + update.period = 1 + vocab=17 + + X.train <- list(data=array(1:16, dim=c(2,8)), label=array(2:17, dim=c(2,8))) + + s <- capture.output(model <- mx.lstm( X.train, + ctx=mx.cpu(), + num.round=num.round, + update.period=update.period, + num.lstm.layer=num.lstm.layer, + seq.len=seq.len, + num.hidden=num.hidden, + num.embed=num.embed, + num.label=vocab, + batch.size=batch.size, + input.size=vocab, + initializer=mx.init.uniform(0.01), + learning.rate=learning.rate, + wd=wd, + clip_gradient=clip_gradient)) + + prev.nll <- 10000000.0 + for (r in s) { + nll <- get.nll(r) + expect_true(prev.nll >= nll) + prev.nll <- nll + + } + +}) \ No newline at end of file From acaef5f63a6b554d7cfacd823bc6d7263b030ec9 Mon Sep 17 00:00:00 2001 From: ziyeqinghan Date: Mon, 6 Jun 2016 22:43:00 +0800 Subject: [PATCH 6/8] modify the code in CharRnnModel.Rmd to unchange the working directory --- R-package/vignettes/CharRnnModel.Rmd | 6 ++---- docs/packages/r/CharRnnModel.Rmd | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/R-package/vignettes/CharRnnModel.Rmd b/R-package/vignettes/CharRnnModel.Rmd index faebf54f9fd2..00f1f4c9f23e 100644 --- a/R-package/vignettes/CharRnnModel.Rmd +++ b/R-package/vignettes/CharRnnModel.Rmd @@ -34,12 +34,10 @@ download the data. ```{r} download.data <- function(data_dir) { dir.create(data_dir, showWarnings = FALSE) - setwd(data_dir) - if (!file.exists('input.txt')) { + if (!file.exists(paste0(data_dir,'input.txt'))) { download.file(url='https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt', - destfile='input.txt', method='wget') + destfile=paste0(data_dir,'input.txt'), method='wget') } - setwd("..") } ``` Make dictionary from text. 
diff --git a/docs/packages/r/CharRnnModel.Rmd b/docs/packages/r/CharRnnModel.Rmd index faebf54f9fd2..00f1f4c9f23e 100644 --- a/docs/packages/r/CharRnnModel.Rmd +++ b/docs/packages/r/CharRnnModel.Rmd @@ -34,12 +34,10 @@ download the data. ```{r} download.data <- function(data_dir) { dir.create(data_dir, showWarnings = FALSE) - setwd(data_dir) - if (!file.exists('input.txt')) { + if (!file.exists(paste0(data_dir,'input.txt'))) { download.file(url='https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt', - destfile='input.txt', method='wget') + destfile=paste0(data_dir,'input.txt'), method='wget') } - setwd("..") } ``` Make dictionary from text. From e59eadebbf40d859d5d6e11f1de36496429676b2 Mon Sep 17 00:00:00 2001 From: ziyeqinghan Date: Tue, 7 Jun 2016 08:48:35 +0800 Subject: [PATCH 7/8] change num.round smaller to decrease the running time --- R-package/vignettes/CharRnnModel.Rmd | 3 ++- docs/packages/r/CharRnnModel.Rmd | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/R-package/vignettes/CharRnnModel.Rmd b/R-package/vignettes/CharRnnModel.Rmd index 00f1f4c9f23e..9066d60f7513 100644 --- a/R-package/vignettes/CharRnnModel.Rmd +++ b/R-package/vignettes/CharRnnModel.Rmd @@ -24,7 +24,7 @@ seq.len = 32 num.hidden = 256 num.embed = 256 num.lstm.layer = 2 -num.round = 5 +num.round = 3 learning.rate= 0.1 wd=0.00001 clip_gradient=1 @@ -161,6 +161,7 @@ model <- mx.lstm(X.train, X.val, clip_gradient=clip_gradient) ``` +Setting the parameters ctx=mx.gpu(0) and num.round=5 can get the following result. ``` Epoch [31] Train: NLL=3.47213018872144, Perp=32.2052727363657 ... 
diff --git a/docs/packages/r/CharRnnModel.Rmd b/docs/packages/r/CharRnnModel.Rmd index 00f1f4c9f23e..9066d60f7513 100644 --- a/docs/packages/r/CharRnnModel.Rmd +++ b/docs/packages/r/CharRnnModel.Rmd @@ -24,7 +24,7 @@ seq.len = 32 num.hidden = 256 num.embed = 256 num.lstm.layer = 2 -num.round = 5 +num.round = 3 learning.rate= 0.1 wd=0.00001 clip_gradient=1 @@ -161,6 +161,7 @@ model <- mx.lstm(X.train, X.val, clip_gradient=clip_gradient) ``` +Setting the parameters ctx=mx.gpu(0) and num.round=5 can get the following result. ``` Epoch [31] Train: NLL=3.47213018872144, Perp=32.2052727363657 ... From 921bc29c789fc48972a31fdc54febabd7649ce26 Mon Sep 17 00:00:00 2001 From: ziyeqinghan Date: Tue, 7 Jun 2016 14:36:06 +0800 Subject: [PATCH 8/8] fix a bug on mx.io.arrayiter --- R-package/src/io.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/src/io.cc b/R-package/src/io.cc index 237b9d4bbf74..27a25382cd0d 100644 --- a/R-package/src/io.cc +++ b/R-package/src/io.cc @@ -126,7 +126,7 @@ bool ArrayDataIter::Next() { } int ArrayDataIter::NumPad() const { - if (counter_ == num_data) { + if (counter_ == label_.size()) { return static_cast(num_pad_); } else { return 0;