diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index beb797b289e2..355233385cbf 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -36,6 +36,9 @@ export(mx.io.MNISTIter) export(mx.io.arrayiter) export(mx.io.extract) export(mx.kv.create) +export(mx.lstm) +export(mx.lstm.forward) +export(mx.lstm.inference) export(mx.metric.accuracy) export(mx.metric.custom) export(mx.metric.logger) diff --git a/R-package/R/io.R b/R-package/R/io.R index cde2b4c032f1..10298fbaf056 100644 --- a/R-package/R/io.R +++ b/R-package/R/io.R @@ -41,7 +41,14 @@ mx.io.arrayiter <- function(data, label, batch.size=128, shuffle=FALSE) { if (shuffle) { - unif.rnds <- as.array(mx.runif(c(length(label)), ctx=mx.cpu())); + shape <- dim(data) + if (is.null(shape)) { + num.data <- length(data) + } else { + ndim <- length(shape) + num.data <- shape[[ndim]] + } + unif.rnds <- as.array(mx.runif(c(num.data), ctx=mx.cpu())); } else { unif.rnds <- as.array(0) } diff --git a/R-package/R/lstm.R b/R-package/R/lstm.R new file mode 100644 index 000000000000..3fcd0e831751 --- /dev/null +++ b/R-package/R/lstm.R @@ -0,0 +1,582 @@ +# lstm cell symbol +lstm <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dropout=0) { + if (dropout > 0) + indata <- mx.symbol.Dropout(data=indata, p=dropout) + i2h <- mx.symbol.FullyConnected(data=indata, + weight=param$i2h.weight, + bias=param$i2h.bias, + num.hidden=num.hidden * 4, + name=paste0("t", seqidx, ".l", layeridx, ".i2h")) + h2h <- mx.symbol.FullyConnected(data=prev.state$h, + weight=param$h2h.weight, + bias=param$h2h.bias, + num.hidden=num.hidden * 4, + name=paste0("t", seqidx, ".l", layeridx, ".h2h")) + gates <- i2h + h2h + slice.gates <- mx.symbol.SliceChannel(gates, num.outputs=4, + name=paste0("t", seqidx, ".l", layeridx, ".slice")) + + in.gate <- mx.symbol.Activation(slice.gates[[1]], act.type="sigmoid") + in.transform <- mx.symbol.Activation(slice.gates[[2]], act.type="tanh") + forget.gate <- mx.symbol.Activation(slice.gates[[3]], 
act.type="sigmoid") + out.gate <- mx.symbol.Activation(slice.gates[[4]], act.type="sigmoid") + next.c <- (forget.gate * prev.state$c) + (in.gate * in.transform) + next.h <- out.gate * mx.symbol.Activation(next.c, act.type="tanh") + + return (list(c=next.c, h=next.h)) +} + +# unrolled lstm network +lstm.unroll <- function(num.lstm.layer, seq.len, input.size, + num.hidden, num.embed, num.label, dropout=0.) { + + embed.weight <- mx.symbol.Variable("embed.weight") + cls.weight <- mx.symbol.Variable("cls.weight") + cls.bias <- mx.symbol.Variable("cls.bias") + + param.cells <- lapply(1:num.lstm.layer, function(i) { + cell <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")), + i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")), + h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")), + h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias"))) + return (cell) + }) + last.states <- lapply(1:num.lstm.layer, function(i) { + state <- list(c=mx.symbol.Variable(paste0("l", i, ".init.c")), + h=mx.symbol.Variable(paste0("l", i, ".init.h"))) + return (state) + }) + + # embeding layer + label <- mx.symbol.Variable("label") + data <- mx.symbol.Variable("data") + embed <- mx.symbol.Embedding(data=data, input_dim=input.size, + weight=embed.weight, output_dim=num.embed, name="embed") + wordvec <- mx.symbol.SliceChannel(data=embed, num_outputs=seq.len, squeeze_axis=1) + + last.hidden <- list() + for (seqidx in 1:seq.len) { + + hidden = wordvec[[seqidx]] + + # stack lstm + for (i in 1:num.lstm.layer) { + dp <- ifelse(i==1, 0, dropout) + next.state <- lstm(num.hidden, indata=hidden, + prev.state=last.states[[i]], + param=param.cells[[i]], + seqidx=seqidx, layeridx=i, + dropout=dp) + hidden <- next.state$h + last.states[[i]] <- next.state + } + # decoder + if (dropout > 0) + hidden <- mx.symbol.Dropout(data=hidden, p=dropout) + last.hidden <- c(last.hidden, hidden) + } + last.hidden$dim <- 0 + last.hidden$num.args <- seq.len + concat 
<-mxnet:::mx.varg.symbol.Concat(last.hidden) + fc <- mx.symbol.FullyConnected(data=concat, + weight=cls.weight, + bias=cls.bias, + num.hidden=num.label) + + label <- mx.symbol.transpose(data=label) + label <- mx.symbol.Reshape(data=label, target.shape=c(0)) + + loss.all <- mx.symbol.SoftmaxOutput(data=fc, label=label, name="sm") + return (loss.all) +} + +lstm.inference.symbol <- function(num.lstm.layer, input.size, + num.hidden, num.embed, num.label, dropout=0.) { + seqidx <- 0 + embed.weight <- mx.symbol.Variable("embed.weight") + cls.weight <- mx.symbol.Variable("cls.weight") + cls.bias <- mx.symbol.Variable("cls.bias") + + param.cells <- lapply(1:num.lstm.layer, function(i) { + cell <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")), + i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")), + h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")), + h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias"))) + return (cell) + }) + last.states <- lapply(1:num.lstm.layer, function(i) { + state <- list(c=mx.symbol.Variable(paste0("l", i, ".init.c")), + h=mx.symbol.Variable(paste0("l", i, ".init.h"))) + return (state) + }) + + # embeding layer + data <- mx.symbol.Variable("data") + hidden <- mx.symbol.Embedding(data=data, input_dim=input.size, + weight=embed.weight, output_dim=num.embed, name="embed") + + # stack lstm + for (i in 1:num.lstm.layer) { + dp <- ifelse(i==1, 0, dropout) + next.state <- lstm(num.hidden, indata=hidden, + prev.state=last.states[[i]], + param=param.cells[[i]], + seqidx=seqidx, layeridx=i, + dropout=dp) + hidden <- next.state$h + last.states[[i]] <- next.state + } + # decoder + if (dropout > 0) + hidden <- mx.symbol.Dropout(data=hidden, p=dropout) + + fc <- mx.symbol.FullyConnected(data=hidden, num_hidden=num.label, + weight=cls.weight, bias=cls.bias, name='pred') + sm <- mx.symbol.SoftmaxOutput(data=fc, name='sm') + unpack.c <- lapply(1:num.lstm.layer, function(i) { + state <- last.states[[i]] + state.c <- 
mx.symbol.BlockGrad(state$c, name=paste0("l", i, ".last.c")) + return (state.c) + }) + unpack.h <- lapply(1:num.lstm.layer, function(i) { + state <- last.states[[i]] + state.h <- mx.symbol.BlockGrad(state$h, name=paste0("l", i, ".last.h")) + return (state.h) + }) + + list.all <- c(sm, unpack.c, unpack.h) + return (mx.symbol.Group(list.all)) +} + +is.param.name <- function(name) { + return (grepl('weight$', name) || grepl('bias$', name) || + grepl('gamma$', name) || grepl('beta$', name) ) +} + +# Initialize parameters +mx.model.init.params.rnn <- function(symbol, input.shape, initializer, ctx) { + if (!is.mx.symbol(symbol)) stop("symbol need to be MXSymbol") + slist <- symbol$infer.shape(input.shape) + if (is.null(slist)) stop("Not enough information to get shapes") + arg.params <- mx.init.create(initializer, slist$arg.shapes, ctx, skip.unknown=TRUE) + aux.params <- mx.init.create(initializer, slist$aux.shapes, ctx, skip.unknown=FALSE) + return(list(arg.params=arg.params, aux.params=aux.params)) +} + +# Initialize the data iter +mx.model.init.iter.rnn <- function(X, y, batch.size, is.train) { + if (is.MXDataIter(X)) return(X) + shape <- dim(X) + if (is.null(shape)) { + num.data <- length(X) + } else { + ndim <- length(shape) + num.data <- shape[[ndim]] + } + if (is.null(y)) { + if (is.train) stop("Need to provide parameter y for training with R arrays.") + y <- c(1:num.data) * 0 + } + + batch.size <- min(num.data, batch.size) + + return(mx.io.arrayiter(X, y, batch.size=batch.size, shuffle=is.train)) +} + +# set up rnn model with lstm cells +setup.rnn.model <- function(rnn.sym, ctx, + num.lstm.layer, seq.len, + num.hidden, num.embed, num.label, + batch.size, input.size, + initializer=mx.init.uniform(0.01), + dropout=0) { + + arg.names <- rnn.sym$arguments + input.shapes <- list() + for (name in arg.names) { + if (grepl('init.c$', name) || grepl('init.h$', name)) { + input.shapes[[name]] <- c(num.hidden, batch.size) + } + else if (grepl('data$', name) || 
grepl('label$', name) ) { + if (seq.len == 1) { + input.shapes[[name]] <- c(batch.size) + } else { + input.shapes[[name]] <- c(seq.len, batch.size) + } + } + } + params <- mx.model.init.params.rnn(rnn.sym, input.shapes, initializer, mx.cpu()) + args <- input.shapes + args$symbol <- rnn.sym + args$ctx <- ctx + args$grad.req <- "add" + rnn.exec <- do.call(mx.simple.bind, args) + + mx.exec.update.arg.arrays(rnn.exec, params$arg.params, match.name=TRUE) + mx.exec.update.aux.arrays(rnn.exec, params$aux.params, match.name=TRUE) + + grad.arrays <- list() + for (name in names(rnn.exec$ref.grad.arrays)) { + if (is.param.name(name)) + grad.arrays[[name]] <- rnn.exec$ref.arg.arrays[[name]]*0 + } + mx.exec.update.grad.arrays(rnn.exec, grad.arrays, match.name=TRUE) + + return (list(rnn.exec=rnn.exec, symbol=rnn.sym, + num.lstm.layer=num.lstm.layer, num.hidden=num.hidden, + seq.len=seq.len, batch.size=batch.size, + num.embed=num.embed)) + +} + + +calc.nll <- function(seq.label.probs, batch.size) { + nll = - sum(log(seq.label.probs)) / batch.size + return (nll) +} + +get.label <- function(label, ctx) { + label <- as.array(label) + seq.len <- dim(label)[[1]] + batch.size <- dim(label)[[2]] + sm.label <- array(0, dim=c(seq.len*batch.size)) + for (seqidx in 1:seq.len) { + sm.label[((seqidx-1)*batch.size+1) : (seqidx*batch.size)] <- label[seqidx,] + } + return (mx.nd.array(sm.label, ctx)) +} + + + +train.lstm <- function(model, train.data, eval.data, + num.round, update.period, + optimizer='sgd', ctx=mx.ctx.default(), ...) { + m <- model + seq.len <- m$seq.len + batch.size <- m$batch.size + num.lstm.layer <- m$num.lstm.layer + num.hidden <- m$num.hidden + + opt <- mx.opt.create(optimizer, rescale.grad=(1/batch.size), ...) 
+ + updater <- mx.opt.get.updater(opt, m$rnn.exec$ref.arg.arrays) + epoch.counter <- 0 + log.period <- max(as.integer(1000 / seq.len), 1) + last.perp <- 10000000.0 + + for (iteration in 1:num.round) { + nbatch <- 0 + train.nll <- 0 + # reset states + init.states <- list() + for (i in 1:num.lstm.layer) { + init.states[[paste0("l", i, ".init.c")]] <- mx.nd.zeros(c(num.hidden, batch.size)) + init.states[[paste0("l", i, ".init.h")]] <- mx.nd.zeros(c(num.hidden, batch.size)) + } + mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE) + + tic <- Sys.time() + + train.data$reset() + + while (train.data$iter.next()) { + # set rnn input + rnn.input <- train.data$value() + mx.exec.update.arg.arrays(m$rnn.exec, rnn.input, match.name=TRUE) + + mx.exec.forward(m$rnn.exec, is.train=TRUE) + seq.label.probs <- mx.nd.choose.element.0index(m$rnn.exec$ref.outputs[["sm_output"]], get.label(m$rnn.exec$ref.arg.arrays[["label"]], ctx)) + + mx.exec.backward(m$rnn.exec) + init.states <- list() + for (i in 1:num.lstm.layer) { + init.states[[paste0("l", i, ".init.c")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0 + init.states[[paste0("l", i, ".init.h")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 + } + mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE) + # update epoch counter + epoch.counter <- epoch.counter + 1 + if (epoch.counter %% update.period == 0) { + # the gradient of initial c and inital h should be zero + init.grad <- list() + for (i in 1:num.lstm.layer) { + init.grad[[paste0("l", i, ".init.c")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0 + init.grad[[paste0("l", i, ".init.h")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 + } + mx.exec.update.grad.arrays(m$rnn.exec, init.grad, match.name=TRUE) + + arg.blocks <- updater(m$rnn.exec$ref.arg.arrays, m$rnn.exec$ref.grad.arrays) + + mx.exec.update.arg.arrays(m$rnn.exec, arg.blocks, skip.null=TRUE) + + grad.arrays <- list() + for (name in 
names(m$rnn.exec$ref.grad.arrays)) { + if (is.param.name(name)) + grad.arrays[[name]] <- m$rnn.exec$ref.grad.arrays[[name]]*0 + } + mx.exec.update.grad.arrays(m$rnn.exec, grad.arrays, match.name=TRUE) + + } + + train.nll <- train.nll + calc.nll(as.array(seq.label.probs), batch.size) + + nbatch <- nbatch + seq.len + if ((epoch.counter %% log.period) == 0) { + cat(paste0("Epoch [", epoch.counter, + "] Train: NLL=", train.nll / nbatch, + ", Perp=", exp(train.nll / nbatch), "\n")) + } + } + train.data$reset() + # end of training loop + toc <- Sys.time() + cat(paste0("Iter [", iteration, + "] Train: Time: ", as.numeric(toc - tic, units="secs"), + " sec, NLL=", train.nll / nbatch, + ", Perp=", exp(train.nll / nbatch), "\n")) + + if (!is.null(eval.data)) { + val.nll <- 0.0 + # validation set, reset states + init.states <- list() + for (i in 1:num.lstm.layer) { + init.states[[paste0("l", i, ".init.c")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0 + init.states[[paste0("l", i, ".init.h")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 + } + mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE) + + eval.data$reset() + nbatch <- 0 + while (eval.data$iter.next()) { + # set rnn input + rnn.input <- eval.data$value() + mx.exec.update.arg.arrays(m$rnn.exec, rnn.input, match.name=TRUE) + mx.exec.forward(m$rnn.exec, is.train=FALSE) + # probability of each label class, used to evaluate nll + seq.label.probs <- mx.nd.choose.element.0index(m$rnn.exec$ref.outputs[["sm_output"]], get.label(m$rnn.exec$ref.arg.arrays[["label"]], ctx)) + # transfer the states + init.states <- list() + for (i in 1:num.lstm.layer) { + init.states[[paste0("l", i, ".init.c")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0 + init.states[[paste0("l", i, ".init.h")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 + } + mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE) + val.nll <- val.nll + calc.nll(as.array(seq.label.probs), 
batch.size) + nbatch <- nbatch + seq.len + } + eval.data$reset() + perp <- exp(val.nll / nbatch) + cat(paste0("Iter [", iteration, + "] Val: NLL=", val.nll / nbatch, + ", Perp=", exp(val.nll / nbatch), "\n")) + } + } + + return (m) +} + + +check.data <- function(data, batch.size, is.train) { + if (!is.null(data) && !is.list(data) && !is.mx.dataiter(data)) { + stop("The dataset should be either a mx.io.DataIter or a R list") + } + if (is.list(data)) { + if (is.null(data$data) || is.null(data$label)){ + stop("Please provide dataset as list(data=R.array, label=R.array)") + } + data <- mx.model.init.iter.rnn(data$data, data$label, batch.size=batch.size, is.train = is.train) + } + if (!is.null(data) && !data$iter.next()) { + data$reset() + if (!data$iter.next()) stop("Empty input") + } + return (data) +} + +#' Training LSTM Unrolled Model +#' +#' @param train.data mx.io.DataIter or list(data=R.array, label=R.array) +#' The Training set. +#' @param eval.data mx.io.DataIter or list(data=R.array, label=R.array), optional +#' The validation set used for validation evaluation during the progress. +#' @param num.lstm.layer integer +#' The number of the layer of lstm. +#' @param seq.len integer +#' The length of the input sequence. +#' @param num.hidden integer +#' The number of hidden nodes. +#' @param num.embed integer +#' The output dim of embedding. +#' @param num.label integer +#' The number of labels. +#' @param batch.size integer +#' The batch size used for R array training. +#' @param input.size integer +#' The input dim of one-hot encoding of embedding +#' @param ctx mx.context, optional +#' The device used to perform training. +#' @param num.round integer, default=10 +#' The number of iterations over training data to train the model. +#' @param update.period integer, default=1 +#' The number of iterations to update parameters during training period. +#' @param initializer initializer object. default=mx.init.uniform(0.01) +#' The initialization scheme for parameters. 
+#' @param dropout float, default=0 +#' A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer. +#' @param optimizer string, default="sgd" +#' The optimization method. +#' @param ... other parameters passing to \code{mx.lstm}/. +#' @return model A trained lstm unrolled model. +#' +#' @export +mx.lstm <- function(train.data, eval.data=NULL, + num.lstm.layer, seq.len, + num.hidden, num.embed, num.label, + batch.size, input.size, + ctx=mx.ctx.default(), + num.round=10, update.period=1, + initializer=mx.init.uniform(0.01), + dropout=0, optimizer='sgd', + ...) { + # check data and change data into iterator + train.data <- check.data(train.data, batch.size, TRUE) + eval.data <- check.data(eval.data, batch.size, FALSE) + + # get unrolled lstm symbol + rnn.sym <- lstm.unroll(num.lstm.layer=num.lstm.layer, + num.hidden=num.hidden, + seq.len=seq.len, + input.size=input.size, + num.embed=num.embed, + num.label=num.label, + dropout=dropout) + # set up lstm model + model <- setup.rnn.model(rnn.sym=rnn.sym, + ctx=ctx, + num.lstm.layer=num.lstm.layer, + seq.len=seq.len, + num.hidden=num.hidden, + num.embed=num.embed, + num.label=num.label, + batch.size=batch.size, + input.size=input.size, + initializer=initializer, + dropout=dropout) + + # train lstm model + model <- train.lstm(model, train.data, eval.data, + num.round=num.round, + update.period=update.period, + ctx=ctx, + ...) + # change model into MXFeedForwardModel + model <- list(symbol=model$symbol, arg.params=model$rnn.exec$ref.arg.arrays, aux.params=model$rnn.exec$ref.aux.arrays) + return(structure(model, class="MXFeedForwardModel")) +} + + +#' Create a LSTM Inference Model +#' +#' @param num.lstm.layer integer +#' The number of the layer of lstm. +#' @param input.size integer +#' The input dim of one-hot encoding of embedding +#' @param num.hidden integer +#' The number of hidden nodes. +#' @param num.embed integer +#' The output dim of embedding. 
+#' @param num.label integer +#' The number of labels. +#' @param batch.size integer +#' The batch size used for R array training. +#' @param arg.params list +#' Model parameter, list of name to NDArray of net's weights. +#' @param ctx mx.context, optional +#' The device used to perform inference. +#' @param dropout float, default=0 +#' A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer. +#' @return model a lstm inference model. +#' +#' @export +mx.lstm.inference <- function(num.lstm.layer, + input.size, + num.hidden, + num.embed, + num.label, + batch.size=1, + arg.params, + ctx=mx.cpu(), + dropout=0.) { + sym <- lstm.inference.symbol(num.lstm.layer, + input.size, + num.hidden, + num.embed, + num.label, + dropout) + + seq.len <- 1 + # set up lstm model + model <- setup.rnn.model(rnn.sym=sym, + ctx=ctx, + num.lstm.layer=num.lstm.layer, + seq.len=seq.len, + num.hidden=num.hidden, + num.embed=num.embed, + num.label=num.label, + batch.size=batch.size, + input.size=input.size, + initializer=mx.init.uniform(0.01), + dropout=dropout) + arg.names <- names(model$rnn.exec$ref.arg.arrays) + for (k in names(arg.params)) { + if ((k %in% arg.names) && is.param.name(k) ) { + rnn.input <- list() + rnn.input[[k]] <- arg.params[[k]] + mx.exec.update.arg.arrays(model$rnn.exec, rnn.input, match.name=TRUE) + } + } + init.states <- list() + for (i in 1:num.lstm.layer) { + init.states[[paste0("l", i, ".init.c")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0 + init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 + } + mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE) + + return (model) +} + +#' Using forward function to predict in lstm inference model +#' +#' @param model lstm model +#' A Lstm inference model +#' @param input.data, array.matrix +#' The input data for forward function +#' @param new.seq boolean, default=FALSE +#' Whether the input is 
the start of a new sequence +#' +#' @return result A list(prob=prob, model=model) containing the result probability of each label and the model. +#' +#' @export + +mx.lstm.forward <- function(model, input.data, new.seq=FALSE) { + if (new.seq == TRUE) { + init.states <- list() + for (i in 1:model$num.lstm.layer) { + init.states[[paste0("l", i, ".init.c")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0 + init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 + } + mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE) + } + dim(input.data) <- c(model$batch.size) + data <- list(data=mx.nd.array(input.data)) + mx.exec.update.arg.arrays(model$rnn.exec, data, match.name=TRUE) + mx.exec.forward(model$rnn.exec, is.train=FALSE) + init.states <- list() + for (i in 1:model$num.lstm.layer) { + init.states[[paste0("l", i, ".init.c")]] <- model$rnn.exec$ref.outputs[[paste0("l", i, ".last.c_output")]] + init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.outputs[[paste0("l", i, ".last.h_output")]] + } + mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE) + prob <- model$rnn.exec$ref.outputs[["sm_output"]] + return (list(prob=prob, model=model)) +} diff --git a/R-package/man/mx.lstm.Rd b/R-package/man/mx.lstm.Rd new file mode 100644 index 000000000000..32d3c8b58da4 --- /dev/null +++ b/R-package/man/mx.lstm.Rd @@ -0,0 +1,66 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lstm.R +\name{mx.lstm} +\alias{mx.lstm} +\title{Training LSTM Unrolled Model} +\usage{ +mx.lstm(train.data, eval.data = NULL, num.lstm.layer, seq.len, num.hidden, + num.embed, num.label, batch.size, input.size, ctx = mx.ctx.default(), + num.round = 10, update.period = 1, initializer = mx.init.uniform(0.01), + dropout = 0, optimizer = "sgd", ...) 
+} +\arguments{ +\item{train.data}{mx.io.DataIter or list(data=R.array, label=R.array) +The Training set.} + +\item{eval.data}{mx.io.DataIter or list(data=R.array, label=R.array), optional +The validation set used for validation evaluation during the progress.} + +\item{num.lstm.layer}{integer +The number of the layer of lstm.} + +\item{seq.len}{integer +The length of the input sequence.} + +\item{num.hidden}{integer +The number of hidden nodes.} + +\item{num.embed}{integer +The output dim of embedding.} + +\item{num.label}{integer +The number of labels.} + +\item{batch.size}{integer +The batch size used for R array training.} + +\item{input.size}{integer +The input dim of one-hot encoding of embedding} + +\item{ctx}{mx.context, optional +The device used to perform training.} + +\item{num.round}{integer, default=10 +The number of iterations over training data to train the model.} + +\item{update.period}{integer, default=1 +The number of iterations to update parameters during training period.} + +\item{initializer}{initializer object. default=mx.init.uniform(0.01) +The initialization scheme for parameters.} + +\item{dropout}{float, default=0 +A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.} + +\item{optimizer}{string, default="sgd" +The optimization method.} + +\item{...}{other parameters passing to \code{mx.lstm}/.} +} +\value{ +model A trained lstm unrolled model. 
+} +\description{ +Training LSTM Unrolled Model +} + diff --git a/R-package/man/mx.lstm.forward.Rd b/R-package/man/mx.lstm.forward.Rd new file mode 100644 index 000000000000..3fa654f7c901 --- /dev/null +++ b/R-package/man/mx.lstm.forward.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lstm.R +\name{mx.lstm.forward} +\alias{mx.lstm.forward} +\title{Using forward function to predict in lstm inference model} +\usage{ +mx.lstm.forward(model, input.data, new.seq = FALSE) +} +\arguments{ +\item{model}{lstm model +A Lstm inference model} + +\item{input.data, }{array.matrix +The input data for forward function} + +\item{new.seq}{boolean, default=FALSE +Whether the input is the start of a new sequence} +} +\value{ +result A list(prob=prob, model=model) containing the result probability of each label and the model. +} +\description{ +Using forward function to predict in lstm inference model +} + diff --git a/R-package/man/mx.lstm.inference.Rd b/R-package/man/mx.lstm.inference.Rd new file mode 100644 index 000000000000..af572ee28590 --- /dev/null +++ b/R-package/man/mx.lstm.inference.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lstm.R +\name{mx.lstm.inference} +\alias{mx.lstm.inference} +\title{Create a LSTM Inference Model} +\usage{ +mx.lstm.inference(num.lstm.layer, input.size, num.hidden, num.embed, num.label, + batch.size = 1, arg.params, ctx = mx.cpu(), dropout = 0) +} +\arguments{ +\item{num.lstm.layer}{integer +The number of the layer of lstm.} + +\item{input.size}{integer +The input dim of one-hot encoding of embedding} + +\item{num.hidden}{integer +The number of hidden nodes.} + +\item{num.embed}{integer +The output dim of embedding.} + +\item{num.label}{integer +The number of labels.} + +\item{batch.size}{integer +The batch size used for R array training.} + +\item{arg.params}{list +The batch size used for R array training.} + +\item{ctx}{mx.context, optional 
+Model parameter, list of name to NDArray of net's weights.} + +\item{dropout}{float, default=0 +A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.} +} +\value{ +model a lstm inference model. +} +\description{ +Create a LSTM Inference Model +} + diff --git a/R-package/src/io.cc b/R-package/src/io.cc index 8da9fbba839c..27a25382cd0d 100644 --- a/R-package/src/io.cc +++ b/R-package/src/io.cc @@ -54,14 +54,14 @@ ArrayDataIter::ArrayDataIter(const Rcpp::NumericVector& data, RLOG_FATAL << "Data and label shape in-consistent"; } } - - std::vector order(label.size()); + num_data = lshape[lshape.size() - 1]; + std::vector order(num_data); for (size_t i = 0; i < order.size(); ++i) { order[i] = i; } if (shuffle) { - RCHECK(unif_rnds.size() == label.size()); + RCHECK(unif_rnds.size() == num_data); for (size_t i = order.size() - 1; i != 0; --i) { size_t idx = static_cast(unif_rnds[i] * (i + 1)); if (idx < i) { @@ -110,7 +110,7 @@ void ArrayDataIter::Convert(const Rcpp::NumericVector& src, } Rcpp::List ArrayDataIter::Value() const { - RCHECK(counter_ != 0 && counter_ <= label_.size()) + RCHECK(counter_ != 0 && counter_ <= num_data) << "Read Iter at end or before iter.next is called"; return Rcpp::List::create( Rcpp::Named("data") = data_[counter_ - 1].RObject(), diff --git a/R-package/src/io.h b/R-package/src/io.h index 8a68ec7d30df..b3032351fe7a 100644 --- a/R-package/src/io.h +++ b/R-package/src/io.h @@ -132,12 +132,15 @@ class ArrayDataIter : public DataIter { size_t counter_; /*! \brief number of pad instances*/ size_t num_pad_; + /*! \brief number of data */ + size_t num_data; /*! \brief The data list of each batch */ std::vector data_; /*! \brief The data list of each batch */ std::vector label_; }; + /*! 
\brief The DataIterCreate functions to be invoked */ class DataIterCreateFunction : public ::Rcpp::CppFunction { public: @@ -177,6 +180,8 @@ class DataIterCreateFunction : public ::Rcpp::CppFunction { // name of the function std::string name_; }; + + } // namespace R } // namespace mxnet diff --git a/R-package/tests/testthat/test_lstm.R b/R-package/tests/testthat/test_lstm.R new file mode 100644 index 000000000000..24b1a59636dc --- /dev/null +++ b/R-package/tests/testthat/test_lstm.R @@ -0,0 +1,52 @@ +require(mxnet) + +context("lstm models") + +get.nll <- function(s) { + pat <- ".*\\NLL=(.+), Perp=.*" + nll <- sub(pat, "\\1", s) + return (as.numeric(nll)) +} + +test_that("training error decreasing", { + + # Set basic network parameters. + batch.size = 2 + seq.len = 2 + num.hidden = 1 + num.embed = 2 + num.lstm.layer = 2 + num.round = 5 + learning.rate= 0.1 + wd=0.00001 + clip_gradient=1 + update.period = 1 + vocab=17 + + X.train <- list(data=array(1:16, dim=c(2,8)), label=array(2:17, dim=c(2,8))) + + s <- capture.output(model <- mx.lstm( X.train, + ctx=mx.cpu(), + num.round=num.round, + update.period=update.period, + num.lstm.layer=num.lstm.layer, + seq.len=seq.len, + num.hidden=num.hidden, + num.embed=num.embed, + num.label=vocab, + batch.size=batch.size, + input.size=vocab, + initializer=mx.init.uniform(0.01), + learning.rate=learning.rate, + wd=wd, + clip_gradient=clip_gradient)) + + prev.nll <- 10000000.0 + for (r in s) { + nll <- get.nll(r) + expect_true(prev.nll >= nll) + prev.nll <- nll + + } + +}) \ No newline at end of file diff --git a/R-package/vignettes/CharRnnModel.Rmd b/R-package/vignettes/CharRnnModel.Rmd new file mode 100644 index 000000000000..9066d60f7513 --- /dev/null +++ b/R-package/vignettes/CharRnnModel.Rmd @@ -0,0 +1,276 @@ +Char RNN Example +============================================= + +This example aims to show how to use lstm model to build a char level language model, and generate text from it. 
We use a tiny Shakespeare text for demo purposes.

Data can be found at https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare.

Preface
-------
This tutorial is written in Rmarkdown.
- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.org/en/latest/package/r/CharRnnModel.html)
- You can download the Rmarkdown source from [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/CharRnnModel.Rmd)

Load Data
---------
First of all, load in the data and preprocess it.
```{r}
require(mxnet)
```
Set basic network parameters.
```{r}
batch.size = 32
seq.len = 32
num.hidden = 256
num.embed = 256
num.lstm.layer = 2
num.round = 3
learning.rate = 0.1
wd = 0.00001
clip_gradient = 1
update.period = 1
```
Download the data.
```{r}
download.data <- function(data_dir) {
    dir.create(data_dir, showWarnings = FALSE)
    dest <- paste0(data_dir, 'input.txt')
    if (!file.exists(dest)) {
        # NOTE(review): the original passed method='wget', which fails on any
        # system without the wget binary; the default method is portable.
        download.file(url='https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt',
                      destfile=dest)
    }
}
```
Make a dictionary from the text.
```{r}
make.dict <- function(text, max.vocab=10000) {
    text <- strsplit(text, '')
    dic <- list()
    idx <- 1
    for (c in text[[1]]) {
        if (!(c %in% names(dic))) {
            dic[[c]] <- idx
            idx <- idx + 1
        }
    }
    # Reserve the last slot for characters beyond the vocabulary cap.
    if (length(dic) == max.vocab - 1)
        dic[["UNKNOWN"]] <- idx
    cat(paste0("Total unique char: ", length(dic), "\n"))
    return (dic)
}
```
Transfer the text into a data feature.
```{r}
make.data <- function(file.path, seq.len=32, max.vocab=10000, dic=NULL) {
    fi <- file(file.path, "r")
    text <- paste(readLines(fi), collapse="\n")
    close(fi)

    if (is.null(dic))
        dic <- make.dict(text, max.vocab)
    lookup.table <- list()
    for (c in names(dic)) {
        idx <- dic[[c]]
        lookup.table[[idx]] <- c
    }

    char.lst <- strsplit(text, '')[[1]]
    num.seq <- as.integer(length(char.lst) / seq.len)
    char.lst <- char.lst[1:(num.seq * seq.len)]
    data <- array(0, dim=c(seq.len, num.seq))
    idx <- 1
    for (i in 1:num.seq) {
        for (j in 1:seq.len) {
            # Character ids are 0-based for the embedding layer.
            if (char.lst[idx] %in% names(dic))
                data[j, i] <- dic[[ char.lst[idx] ]]-1
            else {
                data[j, i] <- dic[["UNKNOWN"]]-1
            }
            idx <- idx + 1
        }
    }
    return (list(data=data, dic=dic, lookup.table=lookup.table))
}
```
Drop the tail of the text so the number of sequences is a multiple of batch.size.
```{r}
drop.tail <- function(X, batch.size) {
    shape <- dim(X)
    nstep <- as.integer(shape[2] / batch.size)
    return (X[, 1:(nstep * batch.size)])
}
```
Get the label of X: each label is the next character in the sequence.
```{r}
get.label <- function(X) {
    label <- array(0, dim=dim(X))
    d <- dim(X)[1]
    w <- dim(X)[2]
    for (i in 0:(w-1)) {
        for (j in 1:d) {
            # Linear (column-major) indexing shifts every entry by one
            # position, wrapping around at the end.
            label[i*d+j] <- X[(i*d+j)%%(w*d)+1]
        }
    }
    return (label)
}
```
Get the training data and the eval data.
```{r}
download.data("./data/")
ret <- make.data("./data/input.txt", seq.len=seq.len)
X <- ret$data
dic <- ret$dic
lookup.table <- ret$lookup.table

vocab <- length(dic)

shape <- dim(X)
train.val.fraction <- 0.9
size <- shape[2]

X.train.data <- X[, 1:as.integer(size * train.val.fraction)]
X.val.data <- X[, -(1:as.integer(size * train.val.fraction))]
X.train.data <- drop.tail(X.train.data, batch.size)
X.val.data <- drop.tail(X.val.data, batch.size)

X.train.label <- get.label(X.train.data)
X.val.label <- get.label(X.val.data)

X.train <- list(data=X.train.data, label=X.train.label)
X.val <- list(data=X.val.data, label=X.val.label)
```

Training Model
--------------
In `mxnet`, we have a function called `mx.lstm` so that users can build a general lstm model.

```{r}
model <- mx.lstm(X.train, X.val,
                 ctx=mx.cpu(),
                 num.round=num.round,
                 update.period=update.period,
                 num.lstm.layer=num.lstm.layer,
                 seq.len=seq.len,
                 num.hidden=num.hidden,
                 num.embed=num.embed,
                 num.label=vocab,
                 batch.size=batch.size,
                 input.size=vocab,
                 initializer=mx.init.uniform(0.1),
                 learning.rate=learning.rate,
                 wd=wd,
                 clip_gradient=clip_gradient)
```
Setting the parameters ctx=mx.gpu(0) and num.round=5 produces the following result.
```
Epoch [31] Train: NLL=3.47213018872144, Perp=32.2052727363657
...
Epoch [961] Train: NLL=2.32060007657895, Perp=10.181782322355
Iter [1] Train: Time: 186.397065639496 sec, NLL=2.31135356537961, Perp=10.0880702804858
Iter [1] Val: NLL=1.94184484060012, Perp=6.97160060607419
Epoch [992] Train: NLL=1.84784553299322, Perp=6.34613225095329
...
Epoch [1953] Train: NLL=1.70175791172558, Perp=5.48357857093351
Iter [2] Train: Time: 188.929051160812 sec, NLL=1.70103940328978, Perp=5.47963998859367
Iter [2] Val: NLL=1.74979316010449, Perp=5.75341251767988
...
Epoch [2914] Train: NLL=1.54738185300295, Perp=4.69915099483974
Iter [3] Train: Time: 185.425321578979 sec, NLL=1.54604189517013, Perp=4.69285854740519
Iter [3] Val: NLL=1.67780240235925, Perp=5.35377758479576
Epoch [2945] Train: NLL=1.48868466087876, Perp=4.43126307034767
...
Iter [4] Train: Time: 185.487086296082 sec, NLL=1.4744973925858, Perp=4.36883940994296
Iter [4] Val: NLL=1.64488167325603, Perp=5.18039689118454
Epoch [3937] Train: NLL=1.46355541021581, Perp=4.32129622881604
...
Epoch [4898] Train: NLL=1.42900458455642, Perp=4.17454171976281
Iter [5] Train: Time: 185.070136785507 sec, NLL=1.42909226256273, Perp=4.17490775130428
Iter [5] Val: NLL=1.62716655804022, Perp=5.08943365437187
```
Inference from model
--------------------
Helper functions for random sampling.
```{r}
# Cumulative distribution of a (possibly unnormalized) weight vector.
cdf <- function(weights) {
    # Vectorized: equivalent to accumulating weight by weight in a loop.
    return (cumsum(weights) / sum(weights))
}

# Binary search: index of the first entry of `cdf` that is >= x.
search.val <- function(cdf, x) {
    l <- 1
    r <- length(cdf)
    while (l <= r) {
        m <- as.integer((l+r)/2)
        if (cdf[m] < x) {
            l <- m+1
        } else {
            r <- m-1
        }
    }
    return (l)
}

# Draw one index at random with probability proportional to the weights.
choice <- function(weights) {
    cdf.vals <- cdf(as.array(weights))
    x <- runif(1)
    idx <- search.val(cdf.vals, x)
    return (idx)
}
```
We can use random output or fixed output by choosing the largest probability.
```{r}
make.output <- function(prob, sample=FALSE, temperature=1.) {
    if (!sample) {
        idx <- which.max(as.array(prob))
    }
    else {
        # Temperature-scaled sampling on clipped probabilities.
        scale_prob <- mx.nd.clip(prob, 1e-6, 1 - 1e-6)
        rescale <- mx.nd.exp(mx.nd.log(scale_prob) / temperature)
        rescale <- rescale / (as.array(mx.nd.sum(rescale))[1])
        idx <- choice(rescale)
    }
    return (idx)
}
```

In `mxnet`, we have a function called `mx.lstm.inference` so that users can build an inference from the lstm model and then use the function `mx.lstm.forward` to get the forward output from the inference.
Build the inference from the model.
```{r}
infer.model <- mx.lstm.inference(num.lstm.layer=num.lstm.layer,
                                 input.size=vocab,
                                 num.hidden=num.hidden,
                                 num.embed=num.embed,
                                 num.label=vocab,
                                 arg.params=model$arg.params,
                                 ctx=mx.cpu())
```
Generate a sequence of 75 chars using the function `mx.lstm.forward`.
```
start <- 'a'
seq.len <- 75
random.sample <- TRUE

last.id <- dic[[start]]
out <- "a"
for (i in (1:(seq.len-1))) {
    input <- c(last.id-1)
    ret <- mx.lstm.forward(infer.model, input, FALSE)
    infer.model <- ret$model
    prob <- ret$prob
    last.id <- make.output(prob, random.sample)
    out <- paste0(out, lookup.table[[last.id]])
}
cat (paste0(out, "\n"))
```
The result:
```
ah not a drobl greens
Settled asing lately sistering sounted to their hight
```

---
<!-- docs/packages/r/CharRnnModel.Rmd — a second copy of this tutorial shipped with the docs. -->

Char RNN Example
=============================================

This example aims to show how to use the lstm model to build a char-level language model, and generate text from it. We use a tiny Shakespeare text for demo purposes.

Data can be found at https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare.

Preface
-------
This tutorial is written in Rmarkdown.
- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.org/en/latest/package/r/CharRnnModel.html)
- You can download the Rmarkdown source from [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/CharRnnModel.Rmd)

Load Data
---------
First of all, load in the data and preprocess it.
```{r}
require(mxnet)
```
Set basic network parameters.
```{r}
batch.size = 32
seq.len = 32
num.hidden = 256
num.embed = 256
num.lstm.layer = 2
num.round = 3
learning.rate = 0.1
wd = 0.00001
clip_gradient = 1
update.period = 1
```
Download the data.
```{r}
download.data <- function(data_dir) {
    dir.create(data_dir, showWarnings = FALSE)
    dest <- paste0(data_dir, 'input.txt')
    if (!file.exists(dest)) {
        # NOTE(review): the original passed method='wget', which fails on any
        # system without the wget binary; the default method is portable.
        download.file(url='https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt',
                      destfile=dest)
    }
}
```
Make a dictionary from the text.
```{r}
make.dict <- function(text, max.vocab=10000) {
    text <- strsplit(text, '')
    dic <- list()
    idx <- 1
    for (c in text[[1]]) {
        if (!(c %in% names(dic))) {
            dic[[c]] <- idx
            idx <- idx + 1
        }
    }
    # Reserve the last slot for characters beyond the vocabulary cap.
    if (length(dic) == max.vocab - 1)
        dic[["UNKNOWN"]] <- idx
    cat(paste0("Total unique char: ", length(dic), "\n"))
    return (dic)
}
```
Transfer the text into a data feature.
```{r}
make.data <- function(file.path, seq.len=32, max.vocab=10000, dic=NULL) {
    fi <- file(file.path, "r")
    text <- paste(readLines(fi), collapse="\n")
    close(fi)

    if (is.null(dic))
        dic <- make.dict(text, max.vocab)
    lookup.table <- list()
    for (c in names(dic)) {
        idx <- dic[[c]]
        lookup.table[[idx]] <- c
    }

    char.lst <- strsplit(text, '')[[1]]
    num.seq <- as.integer(length(char.lst) / seq.len)
    char.lst <- char.lst[1:(num.seq * seq.len)]
    data <- array(0, dim=c(seq.len, num.seq))
    idx <- 1
    for (i in 1:num.seq) {
        for (j in 1:seq.len) {
            # Character ids are 0-based for the embedding layer.
            if (char.lst[idx] %in% names(dic))
                data[j, i] <- dic[[ char.lst[idx] ]]-1
            else {
                data[j, i] <- dic[["UNKNOWN"]]-1
            }
            idx <- idx + 1
        }
    }
    return (list(data=data, dic=dic, lookup.table=lookup.table))
}
```
Drop the tail of the text so the number of sequences is a multiple of batch.size.
```{r}
drop.tail <- function(X, batch.size) {
    shape <- dim(X)
    nstep <- as.integer(shape[2] / batch.size)
    return (X[, 1:(nstep * batch.size)])
}
```
Get the label of X: each label is the next character in the sequence.
```{r}
get.label <- function(X) {
    label <- array(0, dim=dim(X))
    d <- dim(X)[1]
    w <- dim(X)[2]
    for (i in 0:(w-1)) {
        for (j in 1:d) {
            # Linear (column-major) indexing shifts every entry by one
            # position, wrapping around at the end.
            label[i*d+j] <- X[(i*d+j)%%(w*d)+1]
        }
    }
    return (label)
}
```
Get the training data and the eval data.
```{r}
download.data("./data/")
ret <- make.data("./data/input.txt", seq.len=seq.len)
X <- ret$data
dic <- ret$dic
lookup.table <- ret$lookup.table

vocab <- length(dic)

shape <- dim(X)
train.val.fraction <- 0.9
size <- shape[2]

X.train.data <- X[, 1:as.integer(size * train.val.fraction)]
X.val.data <- X[, -(1:as.integer(size * train.val.fraction))]
X.train.data <- drop.tail(X.train.data, batch.size)
X.val.data <- drop.tail(X.val.data, batch.size)

X.train.label <- get.label(X.train.data)
X.val.label <- get.label(X.val.data)

X.train <- list(data=X.train.data, label=X.train.label)
X.val <- list(data=X.val.data, label=X.val.label)
```

Training Model
--------------
In `mxnet`, we have a function called `mx.lstm` so that users can build a general lstm model.

```{r}
model <- mx.lstm(X.train, X.val,
                 ctx=mx.cpu(),
                 num.round=num.round,
                 update.period=update.period,
                 num.lstm.layer=num.lstm.layer,
                 seq.len=seq.len,
                 num.hidden=num.hidden,
                 num.embed=num.embed,
                 num.label=vocab,
                 batch.size=batch.size,
                 input.size=vocab,
                 initializer=mx.init.uniform(0.1),
                 learning.rate=learning.rate,
                 wd=wd,
                 clip_gradient=clip_gradient)
```
Setting the parameters ctx=mx.gpu(0) and num.round=5 produces the following result.
```
Epoch [31] Train: NLL=3.47213018872144, Perp=32.2052727363657
...
Epoch [961] Train: NLL=2.32060007657895, Perp=10.181782322355
Iter [1] Train: Time: 186.397065639496 sec, NLL=2.31135356537961, Perp=10.0880702804858
Iter [1] Val: NLL=1.94184484060012, Perp=6.97160060607419
Epoch [992] Train: NLL=1.84784553299322, Perp=6.34613225095329
...
Epoch [1953] Train: NLL=1.70175791172558, Perp=5.48357857093351
Iter [2] Train: Time: 188.929051160812 sec, NLL=1.70103940328978, Perp=5.47963998859367
Iter [2] Val: NLL=1.74979316010449, Perp=5.75341251767988
...
Epoch [2914] Train: NLL=1.54738185300295, Perp=4.69915099483974
Iter [3] Train: Time: 185.425321578979 sec, NLL=1.54604189517013, Perp=4.69285854740519
Iter [3] Val: NLL=1.67780240235925, Perp=5.35377758479576
Epoch [2945] Train: NLL=1.48868466087876, Perp=4.43126307034767
...
Iter [4] Train: Time: 185.487086296082 sec, NLL=1.4744973925858, Perp=4.36883940994296
Iter [4] Val: NLL=1.64488167325603, Perp=5.18039689118454
Epoch [3937] Train: NLL=1.46355541021581, Perp=4.32129622881604
...
Epoch [4898] Train: NLL=1.42900458455642, Perp=4.17454171976281
Iter [5] Train: Time: 185.070136785507 sec, NLL=1.42909226256273, Perp=4.17490775130428
Iter [5] Val: NLL=1.62716655804022, Perp=5.08943365437187
```
Inference from model
--------------------
Helper functions for random sampling.
```{r}
# Cumulative distribution of a (possibly unnormalized) weight vector.
cdf <- function(weights) {
    # Vectorized: equivalent to accumulating weight by weight in a loop.
    return (cumsum(weights) / sum(weights))
}

# Binary search: index of the first entry of `cdf` that is >= x.
search.val <- function(cdf, x) {
    l <- 1
    r <- length(cdf)
    while (l <= r) {
        m <- as.integer((l+r)/2)
        if (cdf[m] < x) {
            l <- m+1
        } else {
            r <- m-1
        }
    }
    return (l)
}

# Draw one index at random with probability proportional to the weights.
choice <- function(weights) {
    cdf.vals <- cdf(as.array(weights))
    x <- runif(1)
    idx <- search.val(cdf.vals, x)
    return (idx)
}
```
We can use random output or fixed output by choosing the largest probability.
```{r}
make.output <- function(prob, sample=FALSE, temperature=1.) {
    if (!sample) {
        idx <- which.max(as.array(prob))
    }
    else {
        # Temperature-scaled sampling on clipped probabilities.
        scale_prob <- mx.nd.clip(prob, 1e-6, 1 - 1e-6)
        rescale <- mx.nd.exp(mx.nd.log(scale_prob) / temperature)
        rescale <- rescale / (as.array(mx.nd.sum(rescale))[1])
        idx <- choice(rescale)
    }
    return (idx)
}
```

In `mxnet`, we have a function called `mx.lstm.inference` so that users can build an inference from the lstm model and then use the function `mx.lstm.forward` to get the forward output from the inference.
Build the inference from the model.
```{r}
infer.model <- mx.lstm.inference(num.lstm.layer=num.lstm.layer,
                                 input.size=vocab,
                                 num.hidden=num.hidden,
                                 num.embed=num.embed,
                                 num.label=vocab,
                                 arg.params=model$arg.params,
                                 ctx=mx.cpu())
```
Generate a sequence of 75 chars using the function `mx.lstm.forward`.
```
start <- 'a'
seq.len <- 75
random.sample <- TRUE

last.id <- dic[[start]]
out <- "a"
for (i in (1:(seq.len-1))) {
    input <- c(last.id-1)
    ret <- mx.lstm.forward(infer.model, input, FALSE)
    infer.model <- ret$model
    prob <- ret$prob
    last.id <- make.output(prob, random.sample)
    out <- paste0(out, lookup.table[[last.id]])
}
cat (paste0(out, "\n"))
```
The result:
```
ah not a drobl greens
Settled asing lately sistering sounted to their hight
```

---
<!-- docs/packages/r/index.md — the tutorial index gains this entry: -->

* [Character Language Model using RNN Model](CharRnnModel.Rmd)

---
<!-- example/rnn/char_lstm.R — removed in this change; its content is kept below for reference. -->

# Char LSTM Example.
# This example aims to show how to use lstm to build a char-level language
# model, and generate text from it. We use a tiny Shakespeare text for demo
# purposes.
# Data can be found at https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare.
#
# If running for the first time, download the data by running the following
# command: sh get_ptb_data.sh

require(mxnet)
source("lstm.R")

# Basic network parameters.
batch.size = 32
seq.len = 32
num.hidden = 256
num.embed = 256
num.lstm.layer = 2
num.round = 21
learning.rate = 0.01
wd = 0.00001
clip_gradient = 1
update.period = 1

# Build a char -> integer-id dictionary from the text.
make.dict <- function(text, max.vocab=10000) {
    text <- strsplit(text, '')
    dic <- list()
    idx <- 1
    for (c in text[[1]]) {
        if (!(c %in% names(dic))) {
            dic[[c]] <- idx
            idx <- idx + 1
        }
    }
    # Reserve the last slot for out-of-vocabulary characters.
    if (length(dic) == max.vocab - 1)
        dic[["UNKNOWN"]] <- idx
    cat(paste0("Total unique char: ", length(dic), "\n"))
    return (dic)
}

# Transfer the text into a (batch.size x num.batch) id matrix.
# NOTE(review): the 'seq.lenth' (sic) argument is unused by the body; it is
# kept to preserve the original call signature.
make.batch <- function(file.path, batch.size=32, seq.lenth=32, max.vocab=10000, dic=NULL) {
    fi <- file(file.path, "r")
    text <- paste(readLines(fi), collapse="\n")
    close(fi)

    if (is.null(dic))
        dic <- make.dict(text, max.vocab)
    lookup.table <- list()
    for (c in names(dic)) {
        idx <- dic[[c]]
        lookup.table[[idx]] <- c
    }

    char.lst <- strsplit(text, '')[[1]]
    num.batch <- as.integer(length(char.lst) / batch.size)
    char.lst <- char.lst[1:(num.batch * batch.size)]
    data <- array(0, dim=c(batch.size, num.batch))
    idx <- 1
    for (j in 1:batch.size) {
        for (i in 1:num.batch) {
            if (char.lst[idx] %in% names(dic))
                data[j, i] <- dic[[ char.lst[idx] ]]
            else {
                data[j, i] <- dic[["UNKNOWN"]]
            }
            idx <- idx + 1
        }
    }
    return (list(data=data, dic=dic, lookup.table=lookup.table))
}

# Drop the tail so the number of columns is a multiple of seq.len.
drop.tail <- function(X, seq.len) {
    shape <- dim(X)
    nstep <- as.integer(shape[2] / seq.len)
    return (X[, 1:(nstep * seq.len)])
}

ret <- make.batch("./data/input.txt", batch.size=batch.size, seq.lenth=seq.len)
X <- ret$data
dic <- ret$dic
lookup.table <- ret$lookup.table

vocab <- length(dic)

shape <- dim(X)
train.val.fraction <- 0.9
size <- shape[2]
X.train <- X[, 1:as.integer(size * train.val.fraction)]
X.val <- X[, -(1:as.integer(size * train.val.fraction))]
X.train <- drop.tail(X.train, seq.len)
X.val <- drop.tail(X.val, seq.len)

# Set up the LSTM model.
model <- setup.rnn.model(ctx=mx.gpu(0),
                         num.lstm.layer=num.lstm.layer,
                         seq.len=seq.len,
                         num.hidden=num.hidden,
                         num.embed=num.embed,
                         num.label=vocab,
                         batch.size=batch.size,
                         input.size=vocab,
                         initializer=mx.init.uniform(0.1),
                         dropout=0.)

# Train the LSTM model.
train.lstm(model, X.train, X.val,
           num.round=num.round,
           half.life=3,
           update.period=update.period,
           learning.rate=learning.rate,
           wd=wd,
           clip_gradient=clip_gradient)

# ---------------------------------------------------------------------------
# example/rnn/lstm.R — removed in this change; its content is kept below.
# ---------------------------------------------------------------------------

require(mxnet)

# LSTM cell symbol: one step of one layer.
lstm <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dropout=0) {
    if (dropout > 0)
        indata <- mx.symbol.Dropout(data=indata, p=dropout)
    i2h <- mx.symbol.FullyConnected(data=indata,
                                    weight=param$i2h.weight,
                                    bias=param$i2h.bias,
                                    num.hidden=num.hidden * 4,
                                    name=paste0("t", seqidx, ".l", layeridx, ".i2h"))
    h2h <- mx.symbol.FullyConnected(data=prev.state$h,
                                    weight=param$h2h.weight,
                                    bias=param$h2h.bias,
                                    num.hidden=num.hidden * 4,
                                    name=paste0("t", seqidx, ".l", layeridx, ".h2h"))
    gates <- i2h + h2h
    # The 4*num.hidden outputs split into input/transform/forget/output gates.
    slice.gates <- mx.symbol.SliceChannel(gates, num.outputs=4,
                                          name=paste0("t", seqidx, ".l", layeridx, ".slice"))

    in.gate <- mx.symbol.Activation(slice.gates[[1]], act.type="sigmoid")
    in.transform <- mx.symbol.Activation(slice.gates[[2]], act.type="tanh")
    forget.gate <- mx.symbol.Activation(slice.gates[[3]], act.type="sigmoid")
    out.gate <- mx.symbol.Activation(slice.gates[[4]], act.type="sigmoid")
    next.c <- (forget.gate * prev.state$c) + (in.gate * in.transform)
    next.h <- out.gate * mx.symbol.Activation(next.c, act.type="tanh")

    return (list(c=next.c, h=next.h))
}

# Unrolled LSTM network over seq.len time steps.
lstm.unroll <- function(num.lstm.layer, seq.len, input.size,
                        num.hidden, num.embed, num.label, dropout=0.) {
    embed.weight <- mx.symbol.Variable("embed.weight")
    cls.weight <- mx.symbol.Variable("cls.weight")
    cls.bias <- mx.symbol.Variable("cls.bias")
    param.cells <- list()
    last.states <- list()
    for (i in 1:num.lstm.layer) {
        param.cells[[i]] <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")),
                                 i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")),
                                 h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")),
                                 h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias")))
        state <- list(c=mx.symbol.Variable(paste0("l", i, ".init.c")),
                      h=mx.symbol.Variable(paste0("l", i, ".init.h")))
        last.states[[i]] <- state
    }

    last.hidden <- list()
    label <- mx.symbol.Variable("label")
    for (seqidx in 1:seq.len) {
        # Embedding layer for this time step.
        data <- mx.symbol.Variable(paste0("t", seqidx, ".data"))
        hidden <- mx.symbol.Embedding(data=data, weight=embed.weight,
                                      input.dim=input.size,
                                      output.dim=num.embed,
                                      name=paste0("t", seqidx, ".embed"))

        # Stack the lstm layers.
        for (i in 1:num.lstm.layer) {
            # BUG FIX(review): the original tested `i == 0`, which is never
            # true for this 1-based loop, so dropout was wrongly applied to
            # every layer. The intent (matching the replacement
            # R-package/R/lstm.R, which uses ifelse(i==1, 0, dropout)) is to
            # skip dropout on the first layer.
            if (i == 1) {
                dp <- 0
            }
            else {
                dp <- dropout
            }
            next.state <- lstm(num.hidden, indata=hidden,
                               prev.state=last.states[[i]],
                               param=param.cells[[i]],
                               seqidx=seqidx, layeridx=i,
                               dropout=dp)
            hidden <- next.state$h
            last.states[[i]] <- next.state
        }
        # Decoder dropout.
        if (dropout > 0)
            hidden <- mx.symbol.Dropout(data=hidden, p=dropout)
        last.hidden <- c(last.hidden, hidden)
    }
    last.hidden$dim <- 0
    last.hidden$num.args <- seq.len
    concat <- mxnet:::mx.varg.symbol.Concat(last.hidden)
    fc <- mx.symbol.FullyConnected(data=concat,
                                   weight=cls.weight,
                                   bias=cls.bias,
                                   num.hidden=num.label)
    loss.all <- mx.symbol.SoftmaxOutput(data=fc, label=label, name="sm")
    # Expose the final cell/hidden states (gradient-blocked) so the trainer
    # can feed them back in as the next call's init states.
    unpack.c <- list()
    unpack.h <- list()
    for (i in 1:num.lstm.layer) {
        state <- last.states[[i]]
        state <- list(c=mx.symbol.BlockGrad(state$c, name=paste0("l", i, ".last.c")),
                      h=mx.symbol.BlockGrad(state$h, name=paste0("l", i, ".last.h")))
        last.states[[i]] <- state
        unpack.c <- c(unpack.c, state$c)
        unpack.h <- c(unpack.h, state$h)
    }
    list.all <- c(loss.all, unpack.c, unpack.h)

    return (mx.symbol.Group(list.all))
}

# TRUE when `name` is a learnable-parameter argument name.
is.param.name <- function(name) {
    return (grepl('weight$', name) || grepl('bias$', name) ||
            grepl('gamma$', name) || grepl('beta$', name))
}

# Initialize arg/aux parameter arrays from inferred shapes.
mx.model.init.params <- function(symbol, input.shape, initializer, ctx) {
    if (!is.mx.symbol(symbol)) stop("symbol need to be MXSymbol")
    slist <- symbol$infer.shape(input.shape)
    if (is.null(slist)) stop("Not enough information to get shapes")
    arg.params <- mx.init.create(initializer, slist$arg.shapes, ctx, skip.unknown=TRUE)
    aux.params <- mx.init.create(initializer, slist$aux.shapes, ctx, skip.unknown=FALSE)
    return(list(arg.params=arg.params, aux.params=aux.params))
}

# Set up an rnn model with lstm cells: bind an executor whose parameter
# gradients accumulate ("add") and start with all gradients zeroed.
setup.rnn.model <- function(ctx,
                            num.lstm.layer, seq.len,
                            num.hidden, num.embed, num.label,
                            batch.size, input.size,
                            initializer=mx.init.uniform(0.01),
                            dropout=0) {
    rnn.sym <- lstm.unroll(num.lstm.layer=num.lstm.layer,
                           num.hidden=num.hidden,
                           seq.len=seq.len,
                           input.size=input.size,
                           num.embed=num.embed,
                           num.label=num.label,
                           dropout=dropout)
    arg.names <- rnn.sym$arguments
    input.shapes <- list()
    for (name in arg.names) {
        if (grepl('init.c$', name) || grepl('init.h$', name)) {
            input.shapes[[name]] <- c(num.hidden, batch.size)
        }
        else if (grepl('data$', name)) {
            input.shapes[[name]] <- c(batch.size)
        }
    }

    params <- mx.model.init.params(rnn.sym, input.shapes, initializer, ctx)

    args <- input.shapes
    args$symbol <- rnn.sym
    args$ctx <- ctx
    # Gradients accumulate across the update.period mini-steps.
    args$grad.req <- "add"
    rnn.exec <- do.call(mx.simple.bind, args)

    mx.exec.update.arg.arrays(rnn.exec, params$arg.params, match.name=TRUE)
    mx.exec.update.aux.arrays(rnn.exec, params$aux.params, match.name=TRUE)

    grad.arrays <- list()
    for (name in names(rnn.exec$ref.grad.arrays)) {
        if (is.param.name(name))
            grad.arrays[[name]] <- rnn.exec$ref.arg.arrays[[name]]*0
    }
    mx.exec.update.grad.arrays(rnn.exec, grad.arrays, match.name=TRUE)

    return (list(rnn.exec=rnn.exec, symbol=rnn.sym,
                 num.lstm.layer=num.lstm.layer, num.hidden=num.hidden,
                 seq.len=seq.len, batch.size=batch.size,
                 num.embed=num.embed))
}

# Slice seq.len columns of X starting at `begin` into per-step data arrays
# plus the flattened next-character labels.
get.rnn.inputs <- function(m, X, begin) {
    seq.len <- m$seq.len
    batch.size <- m$batch.size
    seq.labels <- array(0, dim=c(seq.len*batch.size))
    seq.data <- list()
    for (seqidx in 1:seq.len) {
        idx <- (begin + seqidx - 1) %% dim(X)[2] + 1
        next.idx <- (begin + seqidx) %% dim(X)[2] + 1
        x <- X[, idx]
        y <- X[, next.idx]

        seq.data[[paste0("t", seqidx, ".data")]] <- mx.nd.array(as.array(x))
        seq.labels[((seqidx-1)*batch.size+1) : (seqidx*batch.size)] <- y
    }
    seq.data$label <- mx.nd.array(seq.labels)
    return (seq.data)
}

# Negative log-likelihood of the chosen-label probabilities, normalized by
# the batch size (number of rows of X).
calc.nll <- function(seq.label.probs, X, begin) {
    nll = - sum(log(seq.label.probs)) / length(X[,1])
    return (nll)
}

# Train the bound executor on X.train.batch, evaluating on X.val.batch after
# every round. Additional ... arguments are forwarded to mx.opt.create.
# NOTE(review): half.life and max.grad.norm are accepted but not used by this
# implementation; kept for call compatibility.
train.lstm <- function(model, X.train.batch, X.val.batch,
                       num.round, update.period,
                       optimizer='sgd', half.life=2, max.grad.norm = 5.0, ...) {
    X.train.batch.shape <- dim(X.train.batch)
    X.val.batch.shape <- dim(X.val.batch)
    cat(paste0("Training with train.shape=(", paste0(X.train.batch.shape, collapse=","), ")"), "\n")
    cat(paste0("Training with val.shape=(", paste0(X.val.batch.shape, collapse=","), ")"), "\n")

    m <- model
    seq.len <- m$seq.len
    batch.size <- m$batch.size
    num.lstm.layer <- m$num.lstm.layer
    num.hidden <- m$num.hidden

    opt <- mx.opt.create(optimizer, rescale.grad=(1/batch.size), ...)
    updater <- mx.opt.get.updater(opt, m$rnn.exec$ref.arg.arrays)
    epoch.counter <- 0
    log.period <- max(as.integer(1000 / seq.len), 1)

    for (iteration in 1:num.round) {
        nbatch <- 0
        train.nll <- 0
        # Reset the recurrent states at the start of each round.
        init.states <- list()
        for (i in 1:num.lstm.layer) {
            init.states[[paste0("l", i, ".init.c")]] <- mx.nd.zeros(c(num.hidden, batch.size))
            init.states[[paste0("l", i, ".init.h")]] <- mx.nd.zeros(c(num.hidden, batch.size))
        }
        mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE)

        tic <- Sys.time()

        stopifnot(dim(X.train.batch)[[2]] %% seq.len == 0)
        stopifnot(dim(X.val.batch)[[2]] %% seq.len == 0)

        for (begin in seq(1, dim(X.train.batch)[2], seq.len)) {
            # Feed this window of the data.
            rnn.input <- get.rnn.inputs(m, X.train.batch, begin=begin)
            mx.exec.update.arg.arrays(m$rnn.exec, rnn.input, match.name=TRUE)

            mx.exec.forward(m$rnn.exec, is.train=TRUE)
            # Probability of each true label, used to evaluate nll.
            seq.label.probs <- mx.nd.choose.element.0index(m$rnn.exec$outputs[["sm_output"]], m$rnn.exec$arg.arrays[["label"]])
            mx.exec.backward(m$rnn.exec)
            # Carry the final states over to the next window.
            init.states <- list()
            for (i in 1:num.lstm.layer) {
                init.states[[paste0("l", i, ".init.c")]] <- m$rnn.exec$outputs[[paste0("l", i, ".last.c_output")]]
                init.states[[paste0("l", i, ".init.h")]] <- m$rnn.exec$outputs[[paste0("l", i, ".last.h_output")]]
            }
            mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE)
            # Update the epoch counter.
            epoch.counter <- epoch.counter + 1
            if (epoch.counter %% update.period == 0) {
                # The gradient of the initial c and initial h must be zero so
                # they are not treated as learnable parameters.
                init.grad <- list()
                for (i in 1:num.lstm.layer) {
                    init.grad[[paste0("l", i, ".init.c")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0
                    init.grad[[paste0("l", i, ".init.h")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0
                }
                mx.exec.update.grad.arrays(m$rnn.exec, init.grad, match.name=TRUE)

                arg.blocks <- updater(m$rnn.exec$ref.arg.arrays, m$rnn.exec$ref.grad.arrays)
                mx.exec.update.arg.arrays(m$rnn.exec, arg.blocks, skip.null=TRUE)

                # Zero the accumulated parameter gradients (grad.req is "add").
                grad.arrays <- list()
                for (name in names(m$rnn.exec$ref.grad.arrays)) {
                    if (is.param.name(name))
                        grad.arrays[[name]] <- m$rnn.exec$ref.grad.arrays[[name]]*0
                }
                mx.exec.update.grad.arrays(m$rnn.exec, grad.arrays, match.name=TRUE)
            }

            train.nll <- train.nll + calc.nll(as.array(seq.label.probs), X.train.batch, begin=begin)

            nbatch <- begin + seq.len
            if ((epoch.counter %% log.period) == 0) {
                cat(paste0("Epoch [", epoch.counter,
                           "] Train: NLL=", train.nll / nbatch,
                           ", Perp=", exp(train.nll / nbatch), "\n"))
            }
        }
        # End of the training loop for this round.
        toc <- Sys.time()
        cat(paste0("Iter [", iteration,
                   "] Train: Time: ", as.numeric(toc - tic, units="secs"),
                   " sec, NLL=", train.nll / nbatch,
                   ", Perp=", exp(train.nll / nbatch), "\n"))

        val.nll <- 0.0
        # Validation pass: reset states first.
        init.states <- list()
        for (i in 1:num.lstm.layer) {
            init.states[[paste0("l", i, ".init.c")]] <- mx.nd.zeros(c(num.hidden, batch.size))
            init.states[[paste0("l", i, ".init.h")]] <- mx.nd.zeros(c(num.hidden, batch.size))
        }
        mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE)

        for (begin in seq(1, dim(X.val.batch)[2], seq.len)) {
            rnn.input <- get.rnn.inputs(m, X.val.batch, begin=begin)
            mx.exec.update.arg.arrays(m$rnn.exec, rnn.input, match.name=TRUE)
            mx.exec.forward(m$rnn.exec, is.train=FALSE)
            seq.label.probs <- mx.nd.choose.element.0index(m$rnn.exec$outputs[["sm_output"]], m$rnn.exec$arg.arrays[["label"]])
            # Carry the final states over to the next window.
            init.states <- list()
            for (i in 1:num.lstm.layer) {
                init.states[[paste0("l", i, ".init.c")]] <- m$rnn.exec$outputs[[paste0("l", i, ".last.c_output")]]
                init.states[[paste0("l", i, ".init.h")]] <- m$rnn.exec$outputs[[paste0("l", i, ".last.h_output")]]
            }
            mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE)
            val.nll <- val.nll + calc.nll(as.array(seq.label.probs), X.val.batch, begin=begin)
        }
        nbatch <- dim(X.val.batch)[2]
        cat(paste0("Iter [", iteration,
                   "] Val: NLL=", val.nll / nbatch,
                   ", Perp=", exp(val.nll / nbatch), "\n"))
    }
}