Permalink
Browse files

Fix issue #2 and #6

  • Loading branch information...
1 parent c3c1d89 commit 77a115ff88418e2bff3e770b83f7bff4d20e1bf3 Bee-Chung Chen committed Jan 7, 2012
Showing with 244 additions and 53 deletions.
  1. BIN doc/tutorial.pdf
  2. +11 −7 doc/tutorial.tex
  3. +152 −24 src/R/examples/tutorial-BST.R
  4. +77 −19 src/R/model/multicontext_model_EM.R
  5. +4 −3 src/R/model/util.R
View
Binary file not shown.
View
@@ -298,17 +298,14 @@ \subsection{Model Fitting Details}
{\small\begin{verbatim}
out.dir = "/tmp/unit-test/simulated-mtx-uvw-10K";
ans = run.multicontext(
- obs=data.train$obs, # training observation table
- feature=data.train$feature, # training feature matrices
+ data.train=data.train, # training data
+ data.test=data.test, # test data (optional)
setting=setting, # setting specified in Step 3
nSamples=200, # number of Gibbs samples in each E-step
nBurnIn=20, # number of burn-in samples for the Gibbs sampler
nIter=10, # number of EM iterations
- test.obs=data.test$obs, # test observation table (optional)
- test.feature=data.test$feature, # test feature matrices (optional)
reg.algo=NULL, # regression algorithm; see below
reg.control=NULL, # control parameters for the regression algorithm
- IDs=data.test$IDs, # ID mappings (optional)
out.level=1, # see below
out.dir=out.dir, # output directory
out.overwrite=TRUE, # whether to overwrite the output directory
@@ -369,7 +366,7 @@ \subsection{Output}
{\small\begin{verbatim}
load(paste(out.dir,"_uvw2/model.last",sep=""));
\end{verbatim}}
-\noindent After loading, the fitted prior parameters are in object {\tt param} and the fitted latent factors are in object {\tt factor}. Also, the object {\tt IDs} contains the ID mappings described in Step~2 of Section~\ref{sec:fitting}.
+\noindent After loading, the fitted prior parameters are in object {\tt param} and the fitted latent factors are in object {\tt factor}. Also, the object {\tt data.train} contains the ID mappings described in Step~2 of Section~\ref{sec:fitting}, which are needed when you index a new test dataset. Notice that {\tt data.train} does not contain the actual data, but just meta-information.
\subsection{Prediction}
@@ -380,7 +377,14 @@ \subsection{Prediction}
obs=data.test$obs, feature=data.test$feature, is.logistic=FALSE
);
\end{verbatim}}
-\noindent Now, {\tt pred\$pred.y} contains the predicted response for {\tt data.test\$obs}. Notice that the test data {\tt data.test} was created by call {\tt indexTestData} in Step 2 of Section~\ref{sec:fitting}.
+\noindent Now, {\tt pred\$pred.y} contains the predicted response for {\tt data.test\$obs}. Notice that the test data {\tt data.test} was created by calling {\tt indexTestData} in Step 2 of Section~\ref{sec:fitting}. If you have new test data, you can use the following command to index the new test data.
+{\small\begin{verbatim}
+data.test = indexTestData(
+ data.train=data.train, obs=obs.test,
+ x_obs=x_obs.test, x_src=x_src, x_dst=x_dst, x_ctx=x_ctx
+);
+\end{verbatim}}
+\noindent where {\tt obs.test}, {\tt x\_obs.test}, {\tt x\_src}, {\tt x\_dst} and {\tt x\_ctx} contain new data in the same format as described in Step 2 of Section~\ref{sec:fitting}.
\subsection{Other Examples}
@@ -77,14 +77,12 @@ source("src/R/model/multicontext_model_EM.R");
set.seed(2);
out.dir = "/tmp/unit-test/simulated-mtx-uvw-10K";
ans = run.multicontext(
- obs=data.train$obs, # Observation table
- feature=data.train$feature, # Features
- setting=setting, # Model setting
- nSamples=200, # Number of samples drawn in each E-step: could be a vector of size nIter.
- nBurnIn=20, # Number of burn-in draws before take samples for the E-step: could be a vector of size nIter.
- nIter=10, # Number of EM iterations
- test.obs=data.test$obs, # Test data: Observations for testing (optional)
- test.feature=data.test$feature, # Features for testing (optional)
+ data.train=data.train, # training data
+ data.test=data.test, # test data (optional)
+ setting=setting, # Model setting
+ nSamples=200, # Number of samples drawn in each E-step: could be a vector of size nIter.
+ nBurnIn=20, # Number of burn-in draws before taking samples for the E-step: could be a vector of size nIter.
+ nIter=10, # Number of EM iterations
approx.interaction=TRUE, # predict E[uv] as E[u]E[v].
reg.algo=NULL, # The regression algorithm to be used in the M-step (NULL => linear regression)
 reg.control=NULL, # The control parameter for reg.algo
@@ -93,7 +91,6 @@ ans = run.multicontext(
var_v=1, var_u=1, var_w=1, var_y=NULL,
relative.to.var_y=FALSE, var_alpha_global=1, var_beta_global=1,
# others
- IDs=data.test$IDs,
out.level=1, # out.level=1: Save the factor & parameter values to out.dir/model.last and out.dir/model.minTestLoss
out.dir=out.dir, # out.level=2: Save the factor & parameter values of each iteration i to out.dir/model.i
out.overwrite=TRUE, # whether to overwrite the output directory if it exists
@@ -204,22 +201,19 @@ source("src/R/model/GLMNet.R");
set.seed(2);
out.dir = "/tmp/tutorial-BST/example-2";
ans = run.multicontext(
- obs=data.train$obs, # Observation table
- feature=data.train$feature, # Features
+ data.train=data.train, # training data
+ data.test=data.test, # test data (optional)
setting=setting, # Model setting
nSamples=200, # Number of samples drawn in each E-step: could be a vector of size nIter.
 nBurnIn=20, # Number of burn-in draws before taking samples for the E-step: could be a vector of size nIter.
nIter=10, # Number of EM iterations
- test.obs=data.test$obs, # Test data: Observations for testing (optional)
- test.feature=data.test$feature, # Features for testing (optional)
approx.interaction=TRUE, # predict E[uv] as E[u]E[v].
reg.algo=GLMNet, # The regression algorithm to be used in the M-step (NULL => linear regression)
# initialization parameters
var_alpha=1, var_beta=1, var_gamma=1,
var_v=1, var_u=1, var_w=1, var_y=NULL,
relative.to.var_y=FALSE, var_alpha_global=1, var_beta_global=1,
# others
- IDs=data.test$IDs,
out.level=1, # out.level=1: Save the factor & parameter values to out.dir/model.last and out.dir/model.minTestLoss
out.dir=out.dir, # out.level=2: Save the factor & parameter values of each iteration i to out.dir/model.i
out.overwrite=TRUE, # whether to overwrite the output directory if it exists
@@ -312,15 +306,12 @@ source("src/R/model/GLMNet.R");
out.dir = "/tmp/tutorial-BST/example-3_uvw2";
set.seed(2);
ans = fit.multicontext(
- obs=data.train$obs, # Observation table
- feature=data.train$feature, # Features
+ data.train=data.train, # training data
+ data.test=data.test, # test data (optional)
init.model=model, # Initial model = list(factor, param)
nSamples=200, # Number of samples drawn in each E-step: could be a vector of size nIter.
 nBurnIn=20, # Number of burn-in draws before taking samples for the E-step: could be a vector of size nIter.
nIter=5, # Number of EM iterations
- test.obs=data.test$obs, # Test data: Observations for testing
- test.feature=data.test$feature, # Features for testing
- IDs=data.test$IDs,
is.logistic=FALSE,
out.level=1, # out.level=1: Save the factor & parameter values to out.dir/model.last and out.dir/model.minTestLoss
out.dir=out.dir, # out.level=2: Save the factor & parameter values of each iteration i to out.dir/model.i
@@ -422,22 +413,19 @@ source("src/R/model/GLMNet.R");
set.seed(2);
out.dir = "/tmp/tutorial-BST/example-4";
ans = run.multicontext(
- obs=data.train$obs, # Observation table
- feature=data.train$feature, # Features
+ data.train=data.train, # training data
+ data.test=data.test, # test data (optional)
setting=setting, # Model setting
nSamples=200, # Number of samples drawn in each E-step: could be a vector of size nIter.
 nBurnIn=20, # Number of burn-in draws before taking samples for the E-step: could be a vector of size nIter.
nIter=10, # Number of EM iterations
- test.obs=data.test$obs, # Test data: Observations for testing (optional)
- test.feature=data.test$feature, # Features for testing (optional)
approx.interaction=TRUE, # predict E[uv] as E[u]E[v].
reg.algo=GLMNet, # The regression algorithm to be used in the M-step (NULL => linear regression)
# initialization parameters
var_alpha=1, var_beta=1, var_gamma=1,
var_v=1, var_u=1, var_w=1, var_y=NULL,
relative.to.var_y=FALSE, var_alpha_global=1, var_beta_global=1,
# others
- IDs=data.test$IDs,
out.level=1, # out.level=1: Save the factor & parameter values to out.dir/model.last and out.dir/model.minTestLoss
out.dir=out.dir, # out.level=2: Save the factor & parameter values of each iteration i to out.dir/model.i
out.overwrite=TRUE, # whether to overwrite the output directory if it exists
@@ -447,3 +435,143 @@ ans = run.multicontext(
ridge.lambda=1, # Add diag(lambda) to X'X in linear regression
rnd.seed.init=0, rnd.seed.fit=1
);
+
+
+###
+### Example 5: Fit the BST model with dense features
+### Do not give the test data to the fitting procedure,
+### and then later load the test data, index it (in the correct way)
+### and predict the response in the test data.
+###
+library(Matrix);
+dyn.load("lib/c_funcs.so");
+source("src/R/c_funcs.R");
+source("src/R/util.R");
+source("src/R/model/util.R");
+source("src/R/model/multicontext_model_utils.R");
+set.seed(0);
+
+# (1) Read only the training data (NOT the test data)
+input.dir = "test-data/multicontext_model/simulated-mtx-uvw-10K"
+# (1.1) Training observations and observation features
+obs.train = read.table(paste(input.dir,"/obs-train.txt",sep=""),
+ sep="\t", header=FALSE, as.is=TRUE);
+names(obs.train) = c("src_id", "dst_id", "src_context",
+ "dst_context", "ctx_id", "y");
+x_obs.train = read.table(paste(input.dir,"/dense-feature-obs-train.txt",
+ sep=""), sep="\t", header=FALSE, as.is=TRUE);
+# (1.2) User/item/context features
+x_src = read.table(paste(input.dir,"/dense-feature-user.txt",sep=""),
+ sep="\t", header=FALSE, as.is=TRUE);
+names(x_src)[1] = "src_id";
+x_dst = read.table(paste(input.dir,"/dense-feature-item.txt",sep=""),
+ sep="\t", header=FALSE, as.is=TRUE);
+names(x_dst)[1] = "dst_id";
+x_ctx = read.table(paste(input.dir,"/dense-feature-ctxt.txt",sep=""),
+ sep="\t", header=FALSE, as.is=TRUE);
+names(x_ctx)[1] = "ctx_id";
+
+# (2) Index the training data: Put the input data into the right form
+# Convert IDs into numeric indices and
+# Convert some data frames into matrices
+data.train = indexData(
+ obs=obs.train, src.dst.same=FALSE, rm.self.link=FALSE,
+ x_obs=x_obs.train, x_src=x_src, x_dst=x_dst, x_ctx=x_ctx,
+ add.intercept=TRUE
+);
+
+# (3) Setup the model(s) to be fitted
+setting = data.frame(
+ name = c("uvw1", "uvw2"),
+ nFactors = c( 1, 2), # number of interaction factors
+ has.u = c( TRUE, TRUE), # whether to use u_i' v_j or v_i' v_j
+ has.gamma = c( FALSE, FALSE), # whether to include gamma_k in the model
+ nLocalFactors = c( 0, 0), # just set to 0
+ is.logistic = c( FALSE, FALSE) # whether to use the logistic response model
+);
+
+# (4) Run the fitting code without supplying the test data
+# See src/R/model/multicontext_model_EM.R: run.multicontext() for details
+dyn.load("lib/c_funcs.so");
+source("src/R/c_funcs.R");
+source("src/R/util.R");
+source("src/R/model/util.R");
+source("src/R/model/multicontext_model_genData.R");
+source("src/R/model/multicontext_model_utils.R");
+source("src/R/model/multicontext_model_MStep.R");
+source("src/R/model/multicontext_model_EM.R");
+set.seed(2);
+out.dir = "/tmp/tutorial-BST/example-5";
+ans = run.multicontext(
+ data.train=data.train, # training data
+ setting=setting, # Model setting
+ nSamples=200, # Number of samples drawn in each E-step: could be a vector of size nIter.
+ nBurnIn=20, # Number of burn-in draws before taking samples for the E-step: could be a vector of size nIter.
+ nIter=10, # Number of EM iterations
+ approx.interaction=TRUE, # predict E[uv] as E[u]E[v].
+ reg.algo=NULL, # The regression algorithm to be used in the M-step (NULL => linear regression)
+ reg.control=NULL, # The control parameter for reg.algo
+ # initialization parameters
+ var_alpha=1, var_beta=1, var_gamma=1,
+ var_v=1, var_u=1, var_w=1, var_y=NULL,
+ relative.to.var_y=FALSE, var_alpha_global=1, var_beta_global=1,
+ # others
+ out.level=1, # out.level=1: Save the factor & parameter values to out.dir/model.last and out.dir/model.minTestLoss
+ out.dir=out.dir, # out.level=2: Save the factor & parameter values of each iteration i to out.dir/model.i
+ out.overwrite=TRUE, # whether to overwrite the output directory if it exists
+ debug=0, # Set to 0 to disable internal sanity checking; Set to 100 for most detailed sanity checking
+ verbose=1, # Set to 0 to disable console output; Set to 100 to print everything to the console
+ verbose.M=2,
+ ridge.lambda=1, # Add diag(lambda) to X'X in linear regression
+ rnd.seed.init=0, rnd.seed.fit=1
+);
+
+# Quit from R and enter R again.
+
+out.dir = "/tmp/tutorial-BST/example-5";
+# Check the output
+read.table(paste(out.dir,"_uvw2/summary",sep=""), header=TRUE, sep="\t", as.is=TRUE);
+
+# Load the model
+load(paste(out.dir,"_uvw2/model.last",sep=""));
+# It loads param, factor, data.train
+str(param);
+str(factor);
+str(data.train); # this does not include the actual data!!
+
+# Read the test data
+input.dir = "test-data/multicontext_model/simulated-mtx-uvw-10K"
+obs.test = read.table(paste(input.dir,"/obs-test.txt",sep=""),
+ sep="\t", header=FALSE, as.is=TRUE);
+names(obs.test) = c("src_id", "dst_id", "src_context",
+ "dst_context", "ctx_id", "y");
+x_obs.test = read.table(paste(input.dir,"/dense-feature-obs-test.txt",
+ sep=""), sep="\t", header=FALSE, as.is=TRUE);
+x_src = read.table(paste(input.dir,"/dense-feature-user.txt",sep=""),
+ sep="\t", header=FALSE, as.is=TRUE);
+names(x_src)[1] = "src_id";
+x_dst = read.table(paste(input.dir,"/dense-feature-item.txt",sep=""),
+ sep="\t", header=FALSE, as.is=TRUE);
+names(x_dst)[1] = "dst_id";
+x_ctx = read.table(paste(input.dir,"/dense-feature-ctxt.txt",sep=""),
+ sep="\t", header=FALSE, as.is=TRUE);
+names(x_ctx)[1] = "ctx_id";
+
+# Index the test data
+dyn.load("lib/c_funcs.so");
+source("src/R/c_funcs.R");
+source("src/R/util.R");
+source("src/R/model/util.R");
+source("src/R/model/multicontext_model_utils.R");
+data.test = indexTestData(
+ data.train=data.train, obs=obs.test,
+ x_obs=x_obs.test, x_src=x_src, x_dst=x_dst, x_ctx=x_ctx
+);
+
+# Make prediction
+pred = predict.multicontext(
+ model=list(factor=factor, param=param),
+ obs=data.test$obs, feature=data.test$feature, is.logistic=FALSE
+);
+# Now, pred$pred.y contains the predicted rating for data.test$obs
+str(pred);
Oops, something went wrong. Retry.

0 comments on commit 77a115f

Please sign in to comment.