initial sfpca implementation

alexpghayes · Mar 19, 2018 · accd2e6 · accd2e6
1 parent 58c6e57
commit accd2e6
Show file tree

Hide file tree

Showing 23 changed files with 1,274 additions and 32 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -0,0 +1,4 @@
+^.*\.Rproj$
+^\.Rproj\.user$
+^LICENSE\.md$
+^README\.Rmd$
diff --git a/.gitignore b/.gitignore
@@ -1,33 +1,3 @@
-# History files
+.Rproj.user
 .Rhistory
-.Rapp.history
-
-# Session Data files
 .RData
-
-# Example code in package build process
-*-Ex.R
-
-# Output files from R CMD build
-/*.tar.gz
-
-# Output files from R CMD check
-/*.Rcheck/
-
-# RStudio files
-.Rproj.user/
-
-# produced vignettes
-vignettes/*.html
-vignettes/*.pdf
-
-# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
-.httr-oauth
-
-# knitr and R markdown default cache directories
-/*_cache/
-/cache/
-
-# Temporary files created by R markdown
-*.utf8.md
-*.knit.md
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,22 @@
+Package: moma
+Title: Application for GSOC MoMA Project
+Version: 0.0.0.9000
+Authors@R: person("Alex", "Hayes", email = "alexpghayes@gmail.com", role = c("aut", "cre"))
+Description: Computes the L1-penalized rank 1 sparse and functional matrix
+    factorization described in Sparse and Functional Principal Components
+    Analysis (Allen, 2013). Package is Alex's Hayes application to the
+    Google Summer of Code program.
+Depends: R (>= 3.4.1)
+License: MIT + file LICENSE
+Encoding: UTF-8
+LazyData: true
+Suggests: 
+    testthat
+RoxygenNote: 6.0.1
+Imports: 
+    irlba,
+    Rcpp,
+    RcppArmadillo
+LinkingTo: 
+    Rcpp,
+    RcppArmadillo
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,2 @@
+YEAR: 2018
+COPYRIGHT HOLDER: Alex Hayes
diff --git a/LICENSE.md b/LICENSE.md
@@ -0,0 +1,21 @@
+# MIT License
+
+Copyright (c) 2018 Alex Hayes
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,6 @@
+# Generated by roxygen2: do not edit by hand
+
+export(sfpca)
+export(sfpca_r)
+importFrom(Rcpp,sourceCpp)
+useDynLib(moma, .registration = TRUE)
diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -0,0 +1,7 @@
+# Generated by using Rcpp::compileAttributes() -> do not edit by hand
+# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+sfpca_arma <- function(X, lambda_u, lambda_v, alpha_u, alpha_v, Omega_u, Omega_v) {
+    .Call(`_moma_sfpca_arma`, X, lambda_u, lambda_v, alpha_u, alpha_v, Omega_u, Omega_v)
+}
+
diff --git a/R/sfpca.R b/R/sfpca.R
@@ -0,0 +1,125 @@
+#' @useDynLib moma, .registration = TRUE
+#' @importFrom Rcpp sourceCpp
+NULL
+
+zeros_like <- function(x) rep(0, length(x))  # vectors only
+op_norm <- function(x, A) sqrt(t(x) %*% A %*% x)  # operator norm
+l2_norm <- function(x) sqrt(sum(x^2))
+prox_l1 <- function(y, lambda) sign(y) * pmax(abs(y) - lambda, 0)
+
+max_eigenval <- function(X) {
+  # add 0.01 to agree with MATLAB implementation
+  max(eigen(X, symmetric = FALSE, only.values = TRUE)$values) + 0.01
+}
+
+#' Compute rank 1 sparse and functional principal components
+#'
+#' @param X A data matrix. Considered n x p by convention.
+#' @param lambda_u Sparsity parameter for left singular vectors.
+#' @param lambda_v Sparsity parameter for right singular vectors.
+#' @param alpha_u Smoothness parameter for left singular vectors.
+#' @param alpha_v Smoothness parameter for right singular vectors.
+#' @param Omega_u Roughness penalty matrix (n x n) for left singular vectors.
+#'   Must be positive semi-definite.
+#' @param Omega_v Roughness penalty matrix (p x p) for right singular vectors.
+#'   Must be positive semi-definite.
+#'
+#' @return List with three named elements: left singular vector u, eigenval d,
+#'   and right singular vector v.
+#' @export
+sfpca <- function(X, lambda_u = 0, lambda_v = 0, alpha_u = 0, alpha_v = 0,
+                  Omega_u = diag(nrow(X)), Omega_v = diag(ncol(X))) {
+
+  # R wrapper around C++ implementation for lazily argument evaluation and
+  # convenient argument checking
+
+  stopifnot(lambda_u >= 0)
+  stopifnot(lambda_v >= 0)
+  stopifnot(alpha_u >= 0)
+  stopifnot(alpha_v >= 0)
+  stopifnot(all(svd(Omega_u)$d >= 0))  # check if positive semi-definite
+  stopifnot(all(svd(Omega_v)$d >= 0))
+
+  sfpca_arma(X, lambda_u, lambda_v, alpha_u, alpha_v, Omega_u, Omega_v)
+}
+
+#' Compute rank 1 sparse and functional principal components
+#'
+#' @param X A data matrix. Considered n x p by convention.
+#' @param lambda_u Sparsity parameter for left singular vectors.
+#' @param lambda_v Sparsity parameter for right singular vectors.
+#' @param alpha_u Smoothness parameter for left singular vectors.
+#' @param alpha_v Smoothness parameter for right singular vectors.
+#' @param Omega_u Roughness penalty matrix (n x n) for left singular vectors.
+#'   Must be positive semi-definite.
+#' @param Omega_v Roughness penalty matrix (p x p) for right singular vectors.
+#'   Must be positive semi-definite.
+#'
+#' @return List with three named elements: left singular vector u, eigenval d,
+#'   and right singular vector v.
+#' @export
+sfpca_r <- function(X, lambda_u = 0, lambda_v = 0, alpha_u = 0, alpha_v = 0,
+                        Omega_u = diag(nrow(X)), Omega_v = diag(ncol(X))) {
+
+  stopifnot(lambda_u >= 0)
+  stopifnot(lambda_v >= 0)
+  stopifnot(alpha_u >= 0)
+  stopifnot(alpha_v >= 0)
+  stopifnot(all(svd(Omega_u)$d >= 0))  # check if positive semi-definite
+  stopifnot(all(svd(Omega_v)$d >= 0))
+
+  # some conventions so the following code matches the paper:
+  #   - u: left singular vectors, or related to
+  #   - v: right singular vectors, or related to
+  #   - d: eigenvalues
+  #   - vectors are lowercase (u, v), matrices uppercase (X, Omega_u)
+  #   - underscores indicate subscripts, LaTeX style
+  #   - X is an n x p matrix
+
+  n <- nrow(X)
+  p <- ncol(X)
+
+  # initialize u, v to rank one SVD solution
+
+  tsvd <- irlba::irlba(X, 1, 1)
+  u <- tsvd$u  # n x 1 matrix
+  v <- tsvd$v  # p x 1 matrix
+
+  # multiplication by n and p here is to agree with the MATLAB implementation
+  S_u <- diag(n) + alpha_u * Omega_u * n
+  S_v <- diag(p) + alpha_v * Omega_v * p
+
+  L_u <- max_eigenval(S_u)
+  L_v <- max_eigenval(S_v)
+
+  tol <- 1e-6
+  delta_u <- tol + 1
+  delta_v <- tol + 1
+
+  while (delta_u > tol & delta_v > tol) {
+
+    while (delta_u > tol) {
+      old_u <- u
+      u <- prox_l1(u + (X %*% v - S_u %*% u) / L_u, lambda_u / L_u)
+      u_norm <- as.numeric(op_norm(u, S_u))
+      u <- if (u_norm > 0) u / u_norm else zeros_like(u)
+      delta_u <- l2_norm(u - old_u)
+    }
+
+    # same as u case except place with v, and add a single transposition
+    while (delta_v > tol) {
+      old_v <- v
+      v <- prox_l1(v + (t(X) %*% u - S_v %*% v) / L_v, lambda_v / L_v)
+      v_norm <- as.numeric(op_norm(v, S_v))
+      v <- if (v_norm > 0) v / v_norm else zeros_like(v)
+      delta_v <- l2_norm(v - old_v)
+    }
+  }
+
+  # normalize eigenvectors and calculate eigenvalue
+  u <- if (l2_norm(u) > 0) u / l2_norm(u) else u
+  v <- if (l2_norm(v) > 0) v / l2_norm(v) else v
+  d <- as.numeric(t(u) %*% X %*% v)
+
+  list(u = u, d = d, v = v)
+}
diff --git a/README.Rmd b/README.Rmd
@@ -0,0 +1,45 @@
+---
+output: github_document
+---
+
+<!-- README.md is generated from README.Rmd. Please edit that file -->
+
+```{r setup, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>",
+  out.width = "100%"
+)
+```
+
+# GSOC Modern Multivariate Analysis
+
+**This application is in progress and is not yet complete**
+
+So far I have:
+  - Implemented L1 penalized rank 1 SFPCA algorithm in R
+  - Implemented L1 penalized rank 1 SFPCA algorithm in C++ using Armadillo
+
+Next steps:
+  - Legitimate tests based on the provided MATLAB code
+  - Make sure to pass R CMD check
+  - Some general code cleanup
+
+At this point both the R and C++ code runs and I get the same values when on the `iris` dataset:
+
+```{r}
+library(moma)
+
+X <- as.matrix(iris[, 1:4])
+sfpca(X, 10, 4)  
+sfpca_r(X, 10, 4)
+```
+
+The results match up with a standard rank 1 SVD when no penalties are included.
+
+```{r}
+irlba::irlba(X, 1, 1)
+sfpca(X)
+```
+
+