From 10647782d491887c6952b671ca90ed220c8b0ff0 Mon Sep 17 00:00:00 2001
From: Liang Zhang
Date: Mon, 9 Jan 2012 22:58:28 +0000
Subject: [PATCH] add more tutorial text

---
 doc/quick-start.tex                        | 23 ++++++----
 doc/tutorial.tex                           | 44 ++++++++++---------
 src/R/BST.R                                |  3 +-
 .../multicontext_model/regression-test-0.R |  6 +--
 4 files changed, 44 insertions(+), 32 deletions(-)

diff --git a/doc/quick-start.tex b/doc/quick-start.tex
index 50fa7f5..33558d8 100644
--- a/doc/quick-start.tex
+++ b/doc/quick-start.tex
@@ -2,7 +2,7 @@ \subsection{Quick Start}
 \label{sec:bst-quick-start}
 
-In this section, we describe how to fit BST models using this package without much need for familiarity of R or deep understanding of the code. Before you run the sample code, please make sure you are in the top-level directory of the installed code, i.e. by using Linux command {\tt ls}, you should be able to see files "LICENSE" and "README".
+In this section, we describe how to fit BST models using this package without much familiarity with R or a deep understanding of the code. Before you run the sample code, please make sure you are in the top-level directory of the installed code; i.e., by using the Linux command {\tt ls}, you should be able to see the files ``LICENSE'' and ``README''.
 
 \parahead{Step 1}
 Read training and test observation tables ({\tt obs.train} and {\tt obs.test}), their corresponding observation feature tables ({\tt x\_obs.train} and {\tt x\_obs.test}), the source feature table ({\tt x\_src}), the destination feature table ({\tt x\_dst}) and the edge context feature table ({\tt x\_ctx}) from the corresponding files. Note that if you replace these tables with your data, you must not change the column names.
 Assuming we use the dense format of the feature files, a sample code can be
@@ -36,19 +36,26 @@ \subsection{Quick Start}
 {\small\begin{verbatim}
 >source("src/R/BST.R");
 \end{verbatim}}
-Then we run a simple latent factor model using the following command
+Then we can run a simple latent factor model without any features using the following command
+{\small\begin{verbatim}
+>ans = fit.bst(obs.train=obs.train, obs.test=obs.test,
+   out.dir = "/tmp/unit-test/simulated-mtx-uvw-10K", model.name="uvw",
+   nFactors=3, nIter=10);
+\end{verbatim}}
+Or, with all the feature files:
 {\small\begin{verbatim}
 >ans = fit.bst(obs.train=obs.train, obs.test=obs.test, x_obs.train=x_obs.train,
-   x_obs.test=x_obs.test, x_src=x_src, x_dst=x_dst, x_ctx=x_ctx,
-   out.dir = "/tmp/unit-test/simulated-mtx-uvw-10K",
-   model.name="uvw", nFactors=3, nIter=10);
+   x_obs.test=x_obs.test, x_src=x_src, x_dst=x_dst, x_ctx=x_ctx,
+   out.dir = "/tmp/unit-test/simulated-mtx-uvw-10K",
+   model.name="uvw", nFactors=3, nIter=10);
 \end{verbatim}}
 Basically we put all the loaded data sets as input of the function, specify the output directory prefix as {\tt /tmp/unit-test/simulated-mtx-uvw-10K}, and run model {\tt uvw}. Note that the model name is quite arbitrary, and the final output directory for model {\tt uvw} is {\tt /tmp/unit-test/simulated-mtx-uvw-10K\_uvw}. For model {\tt uvw}, we use 3 factors and run 10 EM iterations.
 
 \parahead{Step 3}
-Once Step 2 is finished, we have the predicted values of the response variable $y$, since we have the test data as input of the function (If we do not have test data, we can simply omit the {\tt obs.test} and {\tt x.obs.test} option, and the final output would only have model parameters without prediction results). Check out the file {\tt prediction} inside the output directory (In our example, {\tt /tmp/unit-test/simulated-mtx-uvw-10K\_uvw/prediction} is the filename). The file has two columns: the original observed $y$ and the predicted $y$ (the {\tt pred\_y} column).
-Standard metrics such as complete data log-likelihood and RMSE (root mean squared error) have been generated in the file {\tt summary}. Check Section \ref{sec:output} for more details.
+Once Step 2 is finished, we have the predicted values of the response variable $y$, since we have the test data as input of the function (if we do not have test data, we can simply omit the {\tt obs.test} and {\tt x\_obs.test} options, and the final output would only have model parameters without prediction results). Check out the file {\tt prediction} inside the output directory (in our example, {\tt /tmp/unit-test/simulated-mtx-uvw-10K\_uvw/prediction} is the filename). The file has two columns: the original observed $y$ and the predicted $y$ (the {\tt pred\_y} column). Standard metrics such as complete data log-likelihood and RMSE (root mean squared error) have been generated in the file {\tt summary}. Check Section \ref{sec:model-output} for more details.
+
+Please note that the predicted values of $y$ for model {\tt uvw} can also be found at {\tt ans\$pred.y\$uvw}.
-Please note that the predicted values of $y$ for model {\tt uvw} can also be found at {\tt ans$pred.y$uvw}.
 
 \parahead{Run multiple models simultaneously}
 We actually are able to run multiple BST models simultaneously using the following command
 {\small\begin{verbatim}
@@ -78,7 +85,7 @@ \subsection{Quick Start}
 \begin{itemize}
 \item {\tt rm.self.link}: Whether to remove self-edges. If {\tt src.dst.same=TRUE}, you can choose to remove observations with ${\tt src\_id} = {\tt dst\_id}$ by setting {\tt rm.self.link=FALSE}. Otherwise, {\tt rm.self.link} should be set to {\tt FALSE}. The default of {\tt rm.self.link} is FALSE.
 \item {\tt add.intercept}: Whether you want to add an intercept to each feature matrix. If {\tt add.intercept=TRUE}, a column of all 1s will be added to every feature matrix. The default of {\tt add.intercept} is TRUE.
-\item {\tt has.gamma} specifies whether to include $\gamma_k$ in the model specified in Eq~\ref{eq:uvw-model} or not. If {\tt has.gamm=FALSE}, $\gamma_k$ will be disabled or removed from the model. For the default, {\tt has.gamma} is set as FALSE unless the training response data {\tt obs.train} do not have any source or destination context, but have edge context.
+\item {\tt has.gamma} specifies whether to include $\gamma_k$ in the model specified in Equation~\ref{eq:uvw-model} or not. If {\tt has.gamma=FALSE}, $\gamma_k$ will be disabled or removed from the model. By default, {\tt has.gamma} is set to FALSE unless the training response data {\tt obs.train} do not have any source or destination context, but do have edge context.
 \item {\tt reg.algo} and {\tt reg.control} specify how the regression priors will to be fitted. If they are set to {\tt NULL} (default), R's basic linear regression function {\tt lm} will be used to fit the prior regression coefficients $\bm{g}, \bm{d}, \bm{h}, G, D$ and $H$. Currently, we only support two other algorithms {\tt "GLMNet"} and {\tt "RandomForest"}. Therefore, {\tt reg.algo} can only have three types of values: NULL, {\tt "GLMNet"} and {\tt "RandomForest"} (both are strings). Notice that if {\tt "RandomForest"} is used, the regression priors become nonlinear; see~\cite{gmf:recsys11} for more information.
 \item {\tt nBurnin} is the number of burn-in samples per E-step. The default is 10\% of {\tt nSamplesPerIter}.
 \item {\tt init.params} is a list of the initial values of all the variance component parameters.
 The default values of {\tt init.params} is
diff --git a/doc/tutorial.tex b/doc/tutorial.tex
index 029b92b..3dbe66c 100644
--- a/doc/tutorial.tex
+++ b/doc/tutorial.tex
@@ -3,6 +3,7 @@
 \usepackage{amsmath}
 \usepackage{amssymb}
 \usepackage{bm}
+\usepackage{comment}
 
 \newcommand{\parahead}[1]{\vspace{0.15in}\noindent{\bf #1:}}
 
 \begin{document}
@@ -10,7 +11,7 @@
 \author{Bee-Chung Chen}
 \maketitle
 
-This paper describes how you can fit latent factor models (e.g., \cite{rlfm:kdd09,bst:kdd11,gmf:recsys11}) using the open source package developed in Yahoo! Labs.
+This tutorial describes how you can fit latent factor models (e.g., \cite{rlfm:kdd09,bst:kdd11,gmf:recsys11}) using the open source package developed at Yahoo! Labs.
 
 {\small\begin{verbatim}
 Stable repository: https://github.com/yahoo/Latent-Factor-Models
@@ -29,10 +30,11 @@ \subsection{Install R}
 Alternatively, you can install R using linux's package management software. In this case, please install {\tt r-base}, {\tt r-base-core}, {\tt r-base-dev}, {\tt r-recommended}.
 
-After installing R, enter R by simply typing {\tt R} and install the following R packages: {\tt Matrix} and {\tt glmnet}. Notice that these two packages are not required if you do not need to handle sparse feature vectors or matrices. To install these R packages, use the following commands in R.
+After installing R, enter R by simply typing {\tt R} and install the following R packages: {\tt Matrix}, {\tt glmnet} and {\tt randomForest}. Note that the R packages {\tt glmnet} and {\tt randomForest} are not required unless you want to use them in the regression priors of the model (the parameter {\tt reg.algo} in {\tt control} of {\tt fit.bst}). To install these R packages, use the following commands in R.
 {\small\begin{verbatim}
 > install.packages("Matrix");
 > install.packages("glmnet");
+> install.packages("randomForest");
 \end{verbatim}}
 \noindent Make sure that you can run R by simply typing {\tt R}.
 Otherwise, please use alias to point {\tt R} to your R executable file. This is required for {\tt make} to work properly.
@@ -77,16 +79,16 @@ \subsection{Model}
 \parahead{Regression Priors}
 The priors of the latent factors are specified in the following:
 \begin{align}
-\alpha_{ip} & \sim \mathcal{N}(\bm{g}_{p}^\prime \bm{x}_{i} + q_{p} \alpha_i, ~\sigma_{\alpha,p}^2),
+\alpha_{ip} & \sim \mathcal{N}(\bm{g}_{p}(\bm{x}_{i}) + q_{p} \alpha_i, ~\sigma_{\alpha,p}^2),
 ~~~~ \alpha_i \sim \mathcal{N}(0, 1) \label{eq:alpha} \\
-\beta_{jq} & \sim \mathcal{N}(\bm{d}_{q}^\prime \bm{x}_{j} + r_{q} \beta_j, ~\sigma_{\beta,q}^2),
+\beta_{jq} & \sim \mathcal{N}(\bm{d}_{q}(\bm{x}_{j}) + r_{q} \beta_j, ~\sigma_{\beta,q}^2),
 ~~~~ \beta_j \sim \mathcal{N}(0, 1) \label{eq:beta} \\
-\gamma_{k} & \sim \mathcal{N}(\bm{h}' \bm{x}_k, \,\sigma_{\gamma}^2 I), \\
-\bm{u}_{i} & \sim \mathcal{N}(G' \bm{x}_i, \,\sigma_{u}^2 I), ~~~
-\bm{v}_{j} \sim \mathcal{N}(D' \bm{x}_j, \,\sigma_{v}^2 I), ~~~
-\bm{w}_{k} \sim \mathcal{N}(H' \bm{x}_k, \,\sigma_{w}^2 I), \label{eq:uvw}
+\gamma_{k} & \sim \mathcal{N}(\bm{h}(\bm{x}_k), \,\sigma_{\gamma}^2 I), \\
+\bm{u}_{i} & \sim \mathcal{N}(G(\bm{x}_i), \,\sigma_{u}^2 I), ~~~
+\bm{v}_{j} \sim \mathcal{N}(D(\bm{x}_j), \,\sigma_{v}^2 I), ~~~
+\bm{w}_{k} \sim \mathcal{N}(H(\bm{x}_k), \,\sigma_{w}^2 I), \label{eq:uvw}
 \end{align}
-where $\bm{g}_p$, $q_p$, $\bm{d}_q$, $r_q$, $G$, $D$ and $H$ are regression coefficient vectors and matrices. These regression coefficients will be learned from data and provide the ability to make predictions for users or items that do not appear in training data. The factors of these new users or items will be predicted based on their features through regression.
+where $\bm{g}_p(\cdot)$, $\bm{d}_q(\cdot)$, $\bm{h}(\cdot)$, $G(\cdot)$, $D(\cdot)$ and $H(\cdot)$ are regression functions that can either be linear regression coefficient vectors/matrices or non-linear regression models such as random forests, and $q_p$ and $r_q$ are scalar regression coefficients.
+These regression functions will be learned from data and provide the ability to make predictions for users or items that do not appear in training data. The factors of these new users or items will be predicted based on their features through regression.
 
 \subsection{Toy Dataset}
@@ -112,9 +114,9 @@ \subsection{Toy Dataset}
 \begin{enumerate}
 \item {\tt src\_id}: Source node ID (e.g., user $i$).
 \item {\tt dst\_id}: Destination node ID (e.g., item $j$).
-\item {\tt src\_context}: Source context ID (e.g., source context $p$).
-\item {\tt dst\_context}: Destination context ID (e.g., destination context $q$).
-\item {\tt ctx\_id}: Edge context ID (e.g., edge context $k$).
+\item {\tt src\_context}: Source context ID (e.g., source context $p$), an optional column.
+\item {\tt dst\_context}: Destination context ID (e.g., destination context $q$), an optional column.
+\item {\tt ctx\_id}: Edge context ID (e.g., edge context $k$), an optional column.
 \item {\tt y}: Response (e.g., the rating that user $i$ gives item $j$ in context $(k,p,q)$).
 \end{enumerate}
 Note that all of the above IDs can be numbers or character strings.
@@ -128,14 +130,14 @@ \subsection{Toy Dataset}
               "dst_context", "ctx_id", "y");
 \end{verbatim}
 }
-It is important to note that the {\bf column names} of an observation table have to be exactly {\tt src\_id}, {\tt dst\_id}, {\tt src\_context}, {\tt dst\_context}, {\tt ctx\_id} and {\tt y}. The model fitting code does not recognize other names.
+It is important to note that the {\bf column names} of an observation table have to be exactly {\tt src\_id}, {\tt dst\_id}, {\tt src\_context}, {\tt dst\_context}, {\tt ctx\_id} and {\tt y}. The model fitting code does not recognize other names. Also, note that {\tt src\_context}, {\tt dst\_context} and {\tt ctx\_id} are optional columns; i.e., a dataset with only the 3 columns {\tt src\_id}, {\tt dst\_id} and {\tt y} still runs, and the model then becomes the RLFM model introduced in \cite{rlfm:kdd09}.
 
 \parahead{Source, Destination and Context Features}
-The features vectors of source nodes ($\bm{x}_i$), destination nodes ($\bm{x}_j$), edge contexts ($\bm{x}_k$) and training and test observations ($\bm{x}_{ijk}$) are in \\
+The feature vectors of source nodes ($\bm{x}_i$), destination nodes ($\bm{x}_j$), edge contexts ($\bm{x}_k$) and training and test observations ($\bm{x}_{ijk}$) are in \\
 \indent{\tt {\it type}-feature-user.txt}, \\
 \indent{\tt {\it type}-feature-item.txt}, \\
 \indent{\tt {\it type}-feature-ctxt.txt}, \\
-where {\it type} = "dense" for the dense format and {\it type} = "sparse" for the sparse format.
+where {\it type} = {\tt "dense"} for the dense format and {\it type} = {\tt "sparse"} for the sparse format.
 For the dense format, take {\tt dense-feature-user.txt} for example. The first column is {\tt src\_id} (the {\tt src\_id} column in the observation table refers to this column to get the feature vector of the source node for each observation). It is important to note that the {\bf name of the first column} has to be exactly {\tt src\_id}. The rest of the columns specify the feature values and the column names can be arbitrary.
@@ -158,7 +160,7 @@ \subsection{Toy Dataset}
 The features vectors of training and test observations ($\bm{x}_{ijk}$) are in\\
 \indent{\tt {\it type}-feature-obs-train.txt}, \\
 \indent{\tt {\it type}-feature-obs-test.txt}, \\
-where {\it type} = "dense" for the dense format and {\it type} = "sparse" for the sparse format.
+where {\it type} = {\tt "dense"} for the dense format and {\it type} = {\tt "sparse"} for the sparse format.
 For the dense format, take {\tt dense-feature-obs-train.txt} for example. The $n$th line specifies the feature vector of observation on the $n$th line of {\tt obs-train.txt}. Since there is a line-by-line correspondence, there is no need to have an ID column. Each column in this file represents a feature and the column names can be arbitrary.
@@ -180,10 +182,12 @@ \subsection{Toy Dataset}
 
 \subsection{Model Fitting Details}
 \label{sec:fitting}
+In this section, we take a deeper look at this package without using the wrapper function {\tt fit.bst}; it thus provides more detail and insight into how to use this package to fit your own problems. All the R code in this section can be
+found in Example 1 in {\tt src/R/examples/tutorial-BST.R}. For succinctness, we ignore some R commands in the following description.
-See Example 1 in {\tt src/R/examples/tutorial-BST.R} for the R script. For succinctness, we ignore some R commands in the following description.
+\parahead{Step 1} Read all the data sets in the same way as described in Step 1 in Section \ref{sec:bst-quick-start}.
-\parahead{Step 1}
+\begin{comment}
 Read training and test observation tables ({\tt obs.train} and {\tt obs.test}), their corresponding observation feature tables ({\tt x\_obs.train} and {\tt x\_obs.test}), the source feature table ({\tt x\_src}), the destination feature table ({\tt x\_dst}) and the edge context feature table ({\tt x\_ctx}) from the corresponding files. Note that if you replace these tables with your data, you must not change the column names.
 {\small\begin{verbatim}
 input.dir = "test-data/multicontext_model/simulated-mtx-uvw-10K"
@@ -209,7 +213,7 @@ \subsection{Model Fitting Details}
                   sep="\t", header=FALSE, as.is=TRUE);
 names(x_ctx)[1] = "ctx_id";
 \end{verbatim}}
-
+\end{comment}
 \parahead{Step 2}
 Index the training and test data. Functions {\tt indexData} and {\tt indexTestData} (defined in {\tt rc/R/model/multicontext\_model\_utils.R}) convert the input data tables into the right data structure. In particular, they replace the original IDs ({\tt src\_id}, {\tt dst\_id}, {\tt src\_context}, {\tt dst\_context} and {\tt ctx\_id}) by consecutive index numbers, and convert feature tables (data frames) into feature matrices.
 {\small\begin{verbatim}
 data.train = indexData(
@@ -323,7 +327,7 @@ \subsection{Model Fitting Details}
 \begin{itemize}
 \item {\tt nSamples}, {\tt nBurnIn} and {\tt nIter} determine how long the procedure will run. In the above example, the procedure runs 10 EM iterations. In each iteration, it draws 220 Gibbs samples, where the first 20 samples are burn-in samples (which are thrown away) and the rest 200 samples are used to compute the Monte Carlo means in the E-step of this iteration. In our experience, 10-20 EM iterations with 100-200 samples per iteration are usually sufficient.
 \item {\tt reg.algo} and {\tt reg.control} specify how the regression priors will to be fitted. If they are set to {\tt NULL}, R's basic linear regression function {\tt lm} will be used to fit the prior regression coefficients $\bm{g}, \bm{d}, \bm{h}, G, D$ and $H$. Currently, we only support two other algorithms {\tt GLMNet} and {\tt RandomForest}. Notice that if {\tt RandomForest} is used, the regression priors become nonlinear; see~\cite{gmf:recsys11} for more information.
-\item {\tt out.level} and {\tt out.dir} specify what and where the fitting procedure will output. If {\tt out.level} > 0, each model specified in {\tt setting} (i.e., each row in the {\tt setting} table) will be output to a separate directory. The output directory name of the $m$th model is
+\item {\tt out.level} and {\tt out.dir} specify what and where the fitting procedure will output. If {\tt out.level} $> 0$, each model specified in {\tt setting} (i.e., each row in the {\tt setting} table) will be output to a separate directory.
+The output directory name of the $m$th model is
 {\small\begin{verbatim}
    paste(out.dir, "_", setting$name[m], sep="")
 \end{verbatim}}
diff --git a/src/R/BST.R b/src/R/BST.R
index 2cf40b6..bc15eca 100644
--- a/src/R/BST.R
+++ b/src/R/BST.R
@@ -36,8 +36,9 @@ fit.bst <- function(
 	if (!is.null(obs.test)) {
 		if (is.null(obs.test$src_id) || is.null(obs.test$dst_id) || is.null(obs.test$y)) stop("obs.test must have src_id, dst_id, and response y");
 	}
+	if (is.null(x_obs.train) && !is.null(x_obs.test)) stop("x_obs.train does not exist while x_obs.test is used!");
-	if (is.null(x_obs.test) && !is.null(x_obs.train)) stop("x_obs.test does not exist while x_obs.train is used!");
+	if (is.null(x_obs.test) && !is.null(x_obs.train) && !is.null(obs.test)) stop("x_obs.test does not exist while x_obs.train is used!");
 	if (!is.null(x_obs.train) && !is.null(x_obs.test)) {
 		if (ncol(x_obs.train)!=ncol(x_obs.test)) stop("ncol(x_obs.train)!=ncol(x_obs.test)! The number of features for training and test should be exactly the same!");
 	}
diff --git a/src/unit-test/multicontext_model/regression-test-0.R b/src/unit-test/multicontext_model/regression-test-0.R
index ba9fd14..d071f57 100644
--- a/src/unit-test/multicontext_model/regression-test-0.R
+++ b/src/unit-test/multicontext_model/regression-test-0.R
@@ -19,11 +19,11 @@ x_ctx = read.table(paste(input.dir,"/dense-feature-ctxt.txt",sep=""), sep="\t",
 names(x_ctx)[1] = "ctx_id";
 
 # (2) Call BST
-ans = fit.bst(obs.train=obs.train, obs.test=obs.test, x_obs.train=x_obs.train, x_obs.test=x_obs.test, x_src=x_src, x_dst=x_dst, x_ctx=x_ctx,
-              out.dir = "/tmp/unit-test/simulated-mtx-uvw-10K", model.name=c("uvw1", "uvw2"), nFactors=c(1,2), nIter=10);
-#ans = fit.bst(obs.train=obs.train, x_obs.train=x_obs.train, x_src=x_src, x_dst=x_dst, x_ctx=x_ctx,
+ans = fit.bst(obs.train=obs.train, obs.test=obs.test, out.dir = "/tmp/unit-test/simulated-mtx-uvw-10K", model.name=c("uvw1", "uvw2"), nFactors=c(1,2), nIter=10);
+#ans = fit.bst(obs.train=obs.train,
+              obs.test=obs.test, x_obs.train=x_obs.train, x_obs.test=x_obs.test, x_src=x_src, x_dst=x_dst, x_ctx=x_ctx,
 #              out.dir = "/tmp/unit-test/simulated-mtx-uvw-10K", model.name=c("uvw1", "uvw2"), nFactors=c(1,2), nIter=10);
+
 # (3) Compare to the reference run
 warnings()
 out.dir = "/tmp/unit-test/simulated-mtx-uvw-10K"
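To try this change locally, a format-patch email like the one above can be applied with {\tt git am}, which both applies the diff and records the commit with the original author and message. The following is a minimal sketch, not from the source: the patch filename and the checkout path {\tt Latent-Factor-Models} are assumptions.

```shell
# Sketch: apply this patch email and re-run the affected regression test.
# Assumed: the patch is saved as 0001-add-more-tutorial-text.patch and the
# repository is checked out in ./Latent-Factor-Models (names are assumptions).
cd Latent-Factor-Models                    # top-level dir: contains LICENSE, README
git am ../0001-add-more-tutorial-text.patch   # apply diff + commit metadata
R CMD BATCH src/unit-test/multicontext_model/regression-test-0.R  # re-run the test
```

If the patch does not apply cleanly against your checkout, {\tt git am --abort} restores the previous state, and {\tt git apply --check} can be used first to test applicability without committing.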