From ddcb6144ad369e6c066d095280d42150d318e49e Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Mon, 19 May 2014 19:58:59 -0700 Subject: [PATCH 01/34] MAHOUT-1500: Implement H2O backend for Mahout Scala DSL Barebone only, no logic yet. Compiles, tests fail with NotImplementedError Signed-off-by: Anand Avati --- h2o/pom.xml | 223 ++++++++++++++++++ .../apache/mahout/h2obindings/H2OContext.java | 31 +++ .../h2obindings/H2ODistributedContext.scala | 27 +++ .../apache/mahout/h2obindings/H2OEngine.scala | 36 +++ .../h2obindings/drm/CheckpointedDrmH2O.scala | 30 +++ .../mahout/h2obindings/drm/H2OBCast.scala | 24 ++ .../apache/mahout/h2obindings/package.scala | 24 ++ .../test/LoggerConfiguration.scala | 13 + .../h2obindings/test/MahoutLocalContext.scala | 29 +++ .../math/decompositions/MathSuite.scala | 212 +++++++++++++++++ pom.xml | 2 + 11 files changed, 651 insertions(+) create mode 100644 h2o/pom.xml create mode 100644 h2o/src/main/java/org/apache/mahout/h2obindings/H2OContext.java create mode 100644 h2o/src/main/scala/org/apache/mahout/h2obindings/H2ODistributedContext.scala create mode 100644 h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala create mode 100644 h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala create mode 100644 h2o/src/main/scala/org/apache/mahout/h2obindings/drm/H2OBCast.scala create mode 100644 h2o/src/main/scala/org/apache/mahout/h2obindings/package.scala create mode 100644 h2o/src/test/scala/org/apache/mahout/h2obindings/test/LoggerConfiguration.scala create mode 100644 h2o/src/test/scala/org/apache/mahout/h2obindings/test/MahoutLocalContext.scala create mode 100644 h2o/src/test/scala/org/apache/mahout/math/decompositions/MathSuite.scala diff --git a/h2o/pom.xml b/h2o/pom.xml new file mode 100644 index 0000000000..11a5443740 --- /dev/null +++ b/h2o/pom.xml @@ -0,0 +1,223 @@ + + + + + + 4.0.0 + + + org.apache.mahout + mahout + 1.0-SNAPSHOT + ../pom.xml + + + mahout-h2o + Mahout H2O backend + + H2O Backend 
for Mahout DSL + + + jar + + + + oss.sonatype.org-snapshot + http://oss.sonatype.org/content/repositories/snapshots + + false + + + true + + + + + + + + sonatype + https://oss.sonatype.org/content/groups/public + + true + + + + + + install + + + + + org.codehaus.mojo + build-helper-maven-plugin + + + add-source + generate-sources + + add-source + + + + ${project.build.directory}/generated-sources/mahout + + + + + add-test-source + generate-sources + + add-test-source + + + + ${project.build.directory}/generated-test-sources/mahout + + + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + + test-jar + + package + + + + + + maven-javadoc-plugin + + + + maven-source-plugin + + + + org.scala-tools + maven-scala-plugin + + + scala-compile-first + process-resources + + add-source + compile + + + + + compile + testCompile + + + + + src/main/scala + + -Xms64m + -Xmx1024m + + + + + + + + + + + + + + + + + + org.scalatest + scalatest-maven-plugin + 1.0-M2 + + ${project.build.directory}/scalatest-reports + . + WDF TestSuite.txt + + + + test + + test + + + + + + + + + + + + org.apache.mahout + mahout-math-scala + ${project.version} + + + + org.apache.mahout + mahout-math-scala + tests + test + + + + org.apache.mahout + mahout-math + ${project.version} + + + + + + + + ai.h2o + h2o-core + ${h2o.version} + + + + + org.scalatest + scalatest_2.10 + 2.0 + test + + + + diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OContext.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OContext.java new file mode 100644 index 0000000000..7502a2909e --- /dev/null +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OContext.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.h2obindings; + +import water.H2O; + +public class H2OContext { + String masterURL; + + public H2OContext(String _masterURL) { + masterURL = _masterURL; + + H2O.main(new String[]{"-name", _masterURL}); + H2O.joinOthers(); + } +} diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2ODistributedContext.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2ODistributedContext.scala new file mode 100644 index 0000000000..416c95031c --- /dev/null +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2ODistributedContext.scala @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.h2obindings + +import org.apache.mahout.math.drm._ + +class H2ODistributedContext(val masterUrl: String) extends DistributedContext { + + def close(): Unit = return + + val engine: DistributedEngine = H2OEngine +} diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala new file mode 100644 index 0000000000..75f4614e87 --- /dev/null +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.h2obindings + +import scala.reflect._ +import org.apache.mahout.math._ +import org.apache.mahout.math.drm._ + +object H2OEngine extends DistributedEngine { + def colMeans[K:ClassTag](drm: CheckpointedDrm[K]): Vector = ??? + def colSums[K:ClassTag](drm: CheckpointedDrm[K]): Vector = ??? + def norm[K: ClassTag](drm: CheckpointedDrm[K]): Double = ??? + def drmBroadcast(m: Matrix)(implicit dc: DistributedContext): BCast[Matrix] = ??? + def drmBroadcast(v: Vector)(implicit dc: DistributedContext): BCast[Vector] = ??? 
+ def drmFromHDFS(path: String, parMin: Int = 0)(implicit dc: DistributedContext): CheckpointedDrm[_] = ??? + def drmParallelizeEmpty(nrow: Int, ncol: Int, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[Int] = ??? + def drmParallelizeEmptyLong(nrow: Long, ncol: Int, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[Long] = ??? + def drmParallelizeWithRowIndices(m: Matrix, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[Int] = ??? + def drmParallelizeWithRowLabels(m: Matrix, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[String] = ??? + def toPhysical[K:ClassTag](plan: DrmLike[K], ch: CacheHint.CacheHint): CheckpointedDrm[K] = ??? +} diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala new file mode 100644 index 0000000000..1dfda12639 --- /dev/null +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala @@ -0,0 +1,30 @@ +package org.apache.mahout.h2obindings.drm + +import org.apache.mahout.math.{SparseMatrix, DenseMatrix, Matrix, Vector} +import math._ +import org.apache.mahout.math.scalabindings._ +import RLikeOps._ +import org.apache.mahout.math.drm._ +import org.apache.mahout.h2obindings._ + +import water._ +import water.fvec._ + +import scala.reflect._ + +/** H2O-specific optimizer-checkpointed DRM. */ +class CheckpointedDrmH2O[K: ClassTag]( + val frame: Frame +) extends CheckpointedDrm[K] { + + def collect: Matrix = ??? + def uncache(): Unit = ??? + def writeDRM(path: String): Unit = ??? + + + def checkpoint(cacheHint: CacheHint.CacheHint): CheckpointedDrm[K] = ??? + protected[mahout] val context: DistributedContext = ??? + def ncol: Int = ??? + def nrow: Long = ??? + protected[mahout] def partitioningTag: Long = ??? 
+} diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/H2OBCast.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/H2OBCast.scala new file mode 100644 index 0000000000..1b0baac8c1 --- /dev/null +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/H2OBCast.scala @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.h2obindings.drm + +import org.apache.mahout.math.drm.BCast + +class H2OBCast[T] extends BCast[T] with Serializable { + def value: T = ??? +} diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/package.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/package.scala new file mode 100644 index 0000000000..79d9c5b2aa --- /dev/null +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/package.scala @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout + +package object h2obindings { + def mahoutH2OContext(masterURL: String): H2ODistributedContext = { + new H2ODistributedContext(masterURL) + } +} diff --git a/h2o/src/test/scala/org/apache/mahout/h2obindings/test/LoggerConfiguration.scala b/h2o/src/test/scala/org/apache/mahout/h2obindings/test/LoggerConfiguration.scala new file mode 100644 index 0000000000..6444ece61b --- /dev/null +++ b/h2o/src/test/scala/org/apache/mahout/h2obindings/test/LoggerConfiguration.scala @@ -0,0 +1,13 @@ +package org.apache.mahout.h2obindings.test + +import org.scalatest.Suite +import org.apache.log4j.{Level, Logger, BasicConfigurator} + +trait LoggerConfiguration extends org.apache.mahout.test.LoggerConfiguration { + this: Suite => + + override protected def beforeAll(): Unit = { + super.beforeAll() + Logger.getLogger("org.apache.mahout.h2obindings").setLevel(Level.DEBUG) + } +} diff --git a/h2o/src/test/scala/org/apache/mahout/h2obindings/test/MahoutLocalContext.scala b/h2o/src/test/scala/org/apache/mahout/h2obindings/test/MahoutLocalContext.scala new file mode 100644 index 0000000000..f08c613225 --- /dev/null +++ b/h2o/src/test/scala/org/apache/mahout/h2obindings/test/MahoutLocalContext.scala @@ -0,0 +1,29 @@ +package org.apache.mahout.h2obindings.test + +import org.scalatest.Suite +import org.apache.mahout.h2obindings._ +import org.apache.mahout.test.MahoutSuite +import org.apache.mahout.math.drm.DistributedContext + +trait MahoutLocalContext extends MahoutSuite with LoggerConfiguration { + this: Suite => + + protected implicit var 
mahoutCtx: DistributedContext = _ + + override protected def beforeEach() { + super.beforeEach() + + mahoutCtx = mahoutH2OContext("local") + } + + override protected def afterEach() { + if (mahoutCtx != null) { + try { + mahoutCtx.close() + } finally { + mahoutCtx = null + } + } + super.afterEach() + } +} diff --git a/h2o/src/test/scala/org/apache/mahout/math/decompositions/MathSuite.scala b/h2o/src/test/scala/org/apache/mahout/math/decompositions/MathSuite.scala new file mode 100644 index 0000000000..7040fd32bd --- /dev/null +++ b/h2o/src/test/scala/org/apache/mahout/math/decompositions/MathSuite.scala @@ -0,0 +1,212 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.math.decompositions + +import org.scalatest.{Matchers, FunSuite} +import org.apache.mahout.h2obindings.test.MahoutLocalContext +import org.apache.mahout.math._ +import drm._ +import scalabindings._ +import RLikeOps._ +import RLikeDrmOps._ +import org.apache.mahout.h2obindings._ +import org.apache.mahout.common.RandomUtils +import scala.math._ + +class MathSuite extends FunSuite with Matchers with MahoutLocalContext { + + test("thin distributed qr") { + + val inCoreA = dense( + (1, 2, 3, 4), + (2, 3, 4, 5), + (3, -4, 5, 6), + (4, 5, 6, 7), + (8, 6, 7, 8) + ) + + val A = drmParallelize(inCoreA, numPartitions = 2) + val (drmQ, inCoreR) = dqrThin(A, checkRankDeficiency = false) + + // Assert optimizer still knows Q and A are identically partitioned + drmQ.partitioningTag should equal(A.partitioningTag) + +// drmQ.rdd.partitions.size should be(A.rdd.partitions.size) + + // Should also be zippable +// drmQ.rdd.zip(other = A.rdd) + + val inCoreQ = drmQ.collect + + printf("A=\n%s\n", inCoreA) + printf("Q=\n%s\n", inCoreQ) + printf("R=\n%s\n", inCoreR) + + val (qControl, rControl) = qr(inCoreA) + printf("qControl=\n%s\n", qControl) + printf("rControl=\n%s\n", rControl) + + // Validate with Cholesky + val ch = chol(inCoreA.t %*% inCoreA) + printf("A'A=\n%s\n", inCoreA.t %*% inCoreA) + printf("L:\n%s\n", ch.getL) + + val rControl2 = (ch.getL cloned).t + val qControl2 = ch.solveRight(inCoreA) + printf("qControl2=\n%s\n", qControl2) + printf("rControl2=\n%s\n", rControl2) + + // Housholder approach seems to be a little bit more stable + (rControl - inCoreR).norm should be < 1E-5 + (qControl - inCoreQ).norm should be < 1E-5 + + // Assert identicity with in-core Cholesky-based -- this should be tighter. 
+ (rControl2 - inCoreR).norm should be < 1E-10 + (qControl2 - inCoreQ).norm should be < 1E-10 + + // Assert orhtogonality: + // (a) Q[,j] dot Q[,j] == 1.0 for all j + // (b) Q[,i] dot Q[,j] == 0.0 for all i != j + for (col <- 0 until inCoreQ.ncol) + ((inCoreQ(::, col) dot inCoreQ(::, col)) - 1.0).abs should be < 1e-10 + for (col1 <- 0 until inCoreQ.ncol - 1; col2 <- col1 + 1 until inCoreQ.ncol) + (inCoreQ(::, col1) dot inCoreQ(::, col2)).abs should be < 1e-10 + + + } + + test("dssvd - the naive-est - q=0") { + dssvdNaive(q = 0) + } + + test("ddsvd - naive - q=1") { + dssvdNaive(q = 1) + } + + test("ddsvd - naive - q=2") { + dssvdNaive(q = 2) + } + + + def dssvdNaive(q: Int) { + val inCoreA = dense( + (1, 2, 3, 4), + (2, 3, 4, 5), + (3, -4, 5, 6), + (4, 5, 6, 7), + (8, 6, 7, 8) + ) + val drmA = drmParallelize(inCoreA, numPartitions = 2) + + val (drmU, drmV, s) = dssvd(drmA, k = 4, q = q) + val (inCoreU, inCoreV) = (drmU.collect, drmV.collect) + + printf("U:\n%s\n", inCoreU) + printf("V:\n%s\n", inCoreV) + printf("Sigma:\n%s\n", s) + + (inCoreA - (inCoreU %*%: diagv(s)) %*% inCoreV.t).norm should be < 1E-5 + } + + test("dspca") { + + val rnd = RandomUtils.getRandom + + // Number of points + val m = 500 + // Length of actual spectrum + val spectrumLen = 40 + + val spectrum = dvec((0 until spectrumLen).map(x => 300.0 * exp(-x) max 1e-3)) + printf("spectrum:%s\n", spectrum) + + val (u, _) = qr(new SparseRowMatrix(m, spectrumLen) := + ((r, c, v) => if (rnd.nextDouble() < 0.2) 0 else rnd.nextDouble() + 5.0)) + + // PCA Rotation matrix -- should also be orthonormal. + val (tr, _) = qr(Matrices.symmetricUniformView(spectrumLen, spectrumLen, rnd.nextInt) - 10.0) + + val input = (u %*%: diagv(spectrum)) %*% tr.t + val drmInput = drmParallelize(m = input, numPartitions = 2) + + // Calculate just first 10 principal factors and reduce dimensionality. 
+ // Since we assert just validity of the s-pca, not stochastic error, we bump p parameter to + // ensure to zero stochastic error and assert only functional correctness of the method's pca- + // specific additions. + val k = 10 + + // Calculate just first 10 principal factors and reduce dimensionality. + var (drmPCA, _, s) = dspca(A = drmInput, k = 10, p = spectrumLen, q = 1) + // Un-normalized pca data: + drmPCA = drmPCA %*% diagv(s) + + val pca = drmPCA.checkpoint(CacheHint.NONE).collect + + // Of course, once we calculated the pca, the spectrum is going to be different since our originally + // generated input was not centered. So here, we'd just brute-solve pca to verify + val xi = input.colMeans() + for (r <- 0 until input.nrow) input(r, ::) -= xi + var (pcaControl, _, sControl) = svd(m = input) + pcaControl = (pcaControl %*%: diagv(sControl))(::, 0 until k) + + printf("pca:\n%s\n", pca(0 until 10, 0 until 10)) + printf("pcaControl:\n%s\n", pcaControl(0 until 10, 0 until 10)) + + (pca(0 until 10, 0 until 10).norm - pcaControl(0 until 10, 0 until 10).norm).abs should be < 1E-5 + + } + + test("als") { + + val rnd = RandomUtils.getRandom + + // Number of points + val m = 500 + val n = 500 + + // Length of actual spectrum + val spectrumLen = 40 + + // Create singluar values with decay + val spectrum = dvec((0 until spectrumLen).map(x => 300.0 * exp(-x) max 1e-3)) + printf("spectrum:%s\n", spectrum) + + // Create A as an ideal input + val inCoreA = (qr(Matrices.symmetricUniformView(m, spectrumLen, 1234))._1 %*%: diagv(spectrum)) %*% + qr(Matrices.symmetricUniformView(n, spectrumLen, 2345))._1.t + val drmA = drmParallelize(inCoreA, numPartitions = 2) + + // Decompose using ALS + val (drmU, drmV, rmse) = als(drmInput = drmA, k = 20).toTuple + val inCoreU = drmU.collect + val inCoreV = drmV.collect + + val predict = inCoreU %*% inCoreV.t + + printf("Control block:\n%s\n", inCoreA(0 until 3, 0 until 3)) + printf("ALS factorized approximation block:\n%s\n", predict(0 
until 3, 0 until 3)) + + val err = (inCoreA - predict).norm + printf ("norm of residuals %f\n",err) + printf ("train iteration rmses: %s\n", rmse) + + err should be < 1e-2 + + } + +} diff --git a/pom.xml b/pom.xml index ef9ae03ebd..ccef99daa8 100644 --- a/pom.xml +++ b/pom.xml @@ -110,6 +110,7 @@ 2.10 2.10.4 1.0.1 + 0.1.0-SNAPSHOT Jira @@ -700,6 +701,7 @@ math-scala spark spark-shell + h2o From 0ddc36dd601b5844ad640d14637e6538c13cfeb3 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Wed, 11 Jun 2014 17:07:10 -0700 Subject: [PATCH 02/34] MAHOUT-1500: Implement non-operator parts of H2O bindings Signed-off-by: Anand Avati --- h2o/pom.xml | 7 + .../apache/mahout/h2obindings/H2OHelper.java | 283 ++++++++++++++++++ .../mahout/h2obindings/drm/H2OBCast.java | 109 +++++++ .../h2obindings/H2ODistributedContext.scala | 1 + .../apache/mahout/h2obindings/H2OEngine.scala | 59 +++- .../h2obindings/drm/CheckpointedDrmH2O.scala | 25 +- .../mahout/h2obindings/drm/H2OBCast.scala | 24 -- 7 files changed, 463 insertions(+), 45 deletions(-) create mode 100644 h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java create mode 100644 h2o/src/main/java/org/apache/mahout/h2obindings/drm/H2OBCast.java delete mode 100644 h2o/src/main/scala/org/apache/mahout/h2obindings/drm/H2OBCast.scala diff --git a/h2o/pom.xml b/h2o/pom.xml index 11a5443740..711295d091 100644 --- a/h2o/pom.xml +++ b/h2o/pom.xml @@ -188,6 +188,13 @@ ${project.version} + + + org.apache.mahout + mahout-mrlegacy + ${project.version} + + org.apache.mahout mahout-math-scala diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java new file mode 100644 index 0000000000..6be6810a7a --- /dev/null +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.h2obindings; + +import org.apache.mahout.math.*; + +import water.*; +import water.fvec.*; +import water.util.FrameUtils; + +import java.io.File; +import java.io.IOException; + +public class H2OHelper { + + /* + Is the matrix sparse? If the number of missing elements is + 32 x times the number of present elements, treat it as sparse + */ + private static boolean is_sparse (Frame frame) { + long rows = frame.numRows(); + long cols = frame.numCols(); + + + class MRTaskNZ extends MRTask { + long _sparselen; + public void map(Chunk chks[]) { + for (Chunk chk : chks) { + _sparselen += chk.sparseLen(); + } + } + public void reduce(MRTaskNZ other) { + _sparselen += other._sparselen; + } + } + + long sparselen = new MRTaskNZ().doAll(frame)._sparselen; + + return (((rows * cols) / (sparselen + 1)) > 32); + } + + /* + Extract a Matrix from a Frame. Create either Sparse or + Dense Matrix depending on number of missing elements + in Frame. 
+ */ + public static Matrix matrix_from_frame (Frame frame) { + Matrix m; + + if (is_sparse (frame)) + m = new SparseMatrix ((int)frame.numRows(), frame.numCols()); + else + m = new DenseMatrix ((int)frame.numRows(), frame.numCols()); + + int c = 0; + for (Vec v : frame.vecs()) { + for (int r = 0; r < frame.numRows(); r++) { + double d = 0.0; + if (!v.isNA(r) && ((d = v.at(r)) != 0.0)) + m.setQuick(r, c, d); + } + c++; + } + return m; + } + + /* Calculate Means of elements in a column, and return + as a vector. + + H2O precalculates means in a Vec, and a Vec corresponds + to a column. + */ + public static Vector colMeans (Frame frame) { + double means[] = new double[frame.numCols()]; + for (int i = 0; i < frame.numCols(); i++) + means[i] = frame.vecs()[i].mean(); + return new DenseVector(means); + } + + /* Calculate Sum of all elements in a column, and + return as a Vector + + Run an MRTask Job to add up sums in @_sums + + WARNING: Vulnerable to overflow. No way around it. + */ + public static Vector colSums (Frame frame) { + class MRTaskSum extends MRTask { + public double _sums[]; + public void map(Chunk chks[]) { + _sums = new double[chks.length]; + + for (int c = 0; c < chks.length; c++) { + for (int r = 0; r < chks[c]._len; r++) { + _sums[c] += chks[c].at0(r); + } + } + } + public void reduce(MRTaskSum other) { + for (int i = 0; i < _sums.length; i++) + _sums[i] += other._sums[i]; + } + } + return new DenseVector(new MRTaskSum().doAll(frame)._sums); + } + + + /* Calculate Sum of squares of all elements in the Matrix + + WARNING: Vulnerable to overflow. No way around it. 
+ */ + public static double sumSqr (Frame frame) { + class MRTaskSumSqr extends MRTask { + public double _sumSqr; + public void map(Chunk chks[]) { + for (int c = 0; c < chks.length; c++) { + for (int r = 0; r < chks[c]._len; r++) { + _sumSqr += (chks[c].at0(r) * chks[c].at0(r)); + } + } + } + public void reduce(MRTaskSumSqr other) { + _sumSqr += other._sumSqr; + } + } + return new MRTaskSumSqr().doAll(frame)._sumSqr; + } + + /* Calculate Sum of all elements in a column, and + return as a Vector + + Run an MRTask Job to add up sums in @_sums + + WARNING: Vulnerable to overflow. No way around it. + */ + public static Vector nonZeroCnt (Frame frame) { + class MRTaskNonZero extends MRTask { + public double _sums[]; + public void map(Chunk chks[]) { + _sums = new double[chks.length]; + + for (int c = 0; c < chks.length; c++) { + for (int r = 0; r < chks[c]._len; r++) { + if ((long)chks[c].at0(r) != 0) + _sums[c] ++; + } + } + } + public void reduce(MRTaskNonZero other) { + for (int i = 0; i < _sums.length; i++) + _sums[i] += other._sums[i]; + } + } + return new DenseVector(new MRTaskNonZero().doAll(frame)._sums); + } + + + public static Frame frame_from_file (String path) throws IOException { + return FrameUtils.parseFrame(null, new File(path)); + } + + + private static int chunk_size (long nrow, int ncol, int parts_hint) { + int chunk_sz; + + if (parts_hint < 1) + parts_hint = 1; + + chunk_sz = (int) (((nrow - 1) / parts_hint) + 1); + if (parts_hint < 2) { + if (chunk_sz < 1e3) + chunk_sz = (int)1e3; + } + + if (chunk_sz > 1e6) + chunk_sz = (int)1e6; + + return chunk_sz; + } + + private static int next_chunks(NewChunk ncs[], int cidx, long r, int chunk_sz, AppendableVec avs[], Futures fs) { + if ((r % chunk_sz) != 0) + return cidx; + for (int i = 0; i < ncs.length; i++) { + if (ncs[i] != null) + ncs[i].close(fs); + ncs[i] = new NewChunk (avs[i], cidx); + } + return cidx + 1; + } + /* Ingest a Matrix into an H2O Frame. 
H2O Frame is the "backing" + data structure behind CheckpointedDrm. Steps: + + - @cols is the number of columsn in the Matrix + - An H2O Vec represents an H2O Column. + - Create @cols number of Vec's. + - Load data into Vecs by routing them through NewChunks + */ + public static Frame frame_from_matrix (Matrix m, int parts_hint) { + int cols = m.columnSize(); + Vec.VectorGroup vg = new Vec.VectorGroup(); + Key keys[] = vg.addVecs(cols); + AppendableVec avs[] = new AppendableVec[cols]; + Vec vecs[] = new Vec[cols]; + NewChunk ncs[] = new NewChunk[cols]; + int chunk_sz = chunk_size (m.rowSize(), m.columnSize(), parts_hint); + Futures fs = new Futures(); + int cidx = 0; + + for (int c = 0; c < cols; c++) + avs[c] = new AppendableVec(keys[c]); + + long r = 0; + for (MatrixSlice row : m) { + cidx = next_chunks (ncs, cidx, r, chunk_sz, avs, fs); + /* Detect entire sparse rows */ + while (r < row.index()) { + for (NewChunk nc : ncs) + nc.addNum(0.0); + r++; + cidx = next_chunks (ncs, cidx, r, chunk_sz, avs, fs); + } + int c = 0; + for (Vector.Element element : row.nonZeroes()) { + while (c < element.index()) + /* Detect sparse column elements within a row */ + ncs[c++].addNum(0.0); + ncs[c++].addNum(element.get()); + } + r++; + } + + for (int c = 0; c < cols; c++) { + ncs[c].close(fs); + vecs[c] = avs[c].close(fs); + } + fs.blockForPending(); + + return new Frame(vecs); + } + + public static Frame empty_frame (long nrow, int ncol, int parts_hint) { + int chunk_sz = chunk_size (nrow, ncol, parts_hint); + int nchunks = (int) ((nrow - 1) / chunk_sz) + 1; /* Final number of Chunks per Vec */ + Futures fs = new Futures(); + Vec.VectorGroup vg = new Vec.VectorGroup(); + Key keys[] = vg.addVecs(ncol); + long espc[] = new long[nchunks+1]; + for (int i = 0; i < nchunks; i++) + espc[i] = i * chunk_sz; + espc[nchunks] = nrow; + final Vec[] vecs = new Vec[ncol]; + for (int i = 0; i < vecs.length; i++) + vecs[i] = new Vec(keys[i], espc); + new MRTask() { + protected void setupLocal() 
{ + for (Vec v : vecs) { + for (int i = 0; i < v.nChunks(); i++) { + Key k = v.chunkKey(i); + if (k.home()) DKV.put(k, new C0LChunk(0L, v.chunkLen(i)), _fs); + } + } + for(Vec v : vecs) if(v._key.home()) DKV.put(v._key, v, _fs); + } + }.doAllNodes(); + return new Frame(vecs); + } +} diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/drm/H2OBCast.java b/h2o/src/main/java/org/apache/mahout/h2obindings/drm/H2OBCast.java new file mode 100644 index 0000000000..8e9e70cd03 --- /dev/null +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/drm/H2OBCast.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.h2obindings.drm; + +import org.apache.mahout.math.drm.BCast; +import org.apache.mahout.math.Matrix; +import org.apache.mahout.math.Vector; +import org.apache.mahout.math.MatrixWritable; +import org.apache.mahout.math.VectorWritable; +import java.io.Serializable; +import java.io.ByteArrayOutputStream; +import java.io.ByteArrayInputStream; +import java.io.ObjectOutputStream; +import java.io.ObjectInputStream; + +/* Handle Matrix and Vector separately so that we can live with + just importing MatrixWritable and VectorWritable. 
+ + We could collapse the two into a single method using Writable, + but then we would have to import org.apache.hadoop.Writable, + pick a hadoop distribution in pom.xml etc. Instead let + mahout-mrlegacy solve that transitively for us. +*/ + +public class H2OBCast implements BCast, Serializable { + transient T obj; + byte buf[]; + boolean is_matrix; + + public H2OBCast(T o) { + obj = o; + + if (o instanceof Matrix) { + buf = serializeMatrix((Matrix)o); + is_matrix = true; + } else if (o instanceof Vector) { + buf = serializeVector((Vector)o); + } else { + throw new IllegalArgumentException("Only Matrix or Vector supported for now"); + } + } + + public T value() { + if (obj == null) + obj = deserialize(buf); + return obj; + } + + private byte[] serializeMatrix(Matrix m) { + MatrixWritable w = new MatrixWritable(m); + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + try { + ObjectOutputStream oos = new ObjectOutputStream(bos); + w.write(oos); + oos.close(); + } catch (java.io.IOException e) { + return null; + } + return bos.toByteArray(); + } + + private byte[] serializeVector(Vector v) { + VectorWritable w = new VectorWritable(v); + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + try { + ObjectOutputStream oos = new ObjectOutputStream(bos); + w.write(oos); + oos.close(); + } catch (java.io.IOException e) { + return null; + } + return bos.toByteArray(); + } + + private T deserialize(byte buf[]) { + T ret = null; + ByteArrayInputStream bis = new ByteArrayInputStream(buf); + try { + ObjectInputStream ois = new ObjectInputStream(bis); + if (is_matrix) { + MatrixWritable w = new MatrixWritable(); + w.readFields(ois); + ret = (T) w.get(); + } else { + VectorWritable w = new VectorWritable(); + w.readFields(ois); + ret = (T) w.get(); + } + } catch (java.io.IOException e) { + System.out.println("Caught exception: " + e); + } + return ret; + } +} diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2ODistributedContext.scala 
b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2ODistributedContext.scala index 416c95031c..40289d7504 100644 --- a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2ODistributedContext.scala +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2ODistributedContext.scala @@ -20,6 +20,7 @@ package org.apache.mahout.h2obindings import org.apache.mahout.math.drm._ class H2ODistributedContext(val masterUrl: String) extends DistributedContext { + val h2octx = new H2OContext("local"); def close(): Unit = return diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala index 75f4614e87..85bd8f2548 100644 --- a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala @@ -20,17 +20,54 @@ package org.apache.mahout.h2obindings import scala.reflect._ import org.apache.mahout.math._ import org.apache.mahout.math.drm._ +import org.apache.mahout.math.drm.logical._ + +import org.apache.mahout.h2obindings.drm._ + +import water._ +import water.fvec._ object H2OEngine extends DistributedEngine { - def colMeans[K:ClassTag](drm: CheckpointedDrm[K]): Vector = ??? - def colSums[K:ClassTag](drm: CheckpointedDrm[K]): Vector = ??? - def norm[K: ClassTag](drm: CheckpointedDrm[K]): Double = ??? - def drmBroadcast(m: Matrix)(implicit dc: DistributedContext): BCast[Matrix] = ??? - def drmBroadcast(v: Vector)(implicit dc: DistributedContext): BCast[Vector] = ??? - def drmFromHDFS(path: String, parMin: Int = 0)(implicit dc: DistributedContext): CheckpointedDrm[_] = ??? - def drmParallelizeEmpty(nrow: Int, ncol: Int, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[Int] = ??? - def drmParallelizeEmptyLong(nrow: Long, ncol: Int, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[Long] = ??? 
- def drmParallelizeWithRowIndices(m: Matrix, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[Int] = ??? - def drmParallelizeWithRowLabels(m: Matrix, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[String] = ??? - def toPhysical[K:ClassTag](plan: DrmLike[K], ch: CacheHint.CacheHint): CheckpointedDrm[K] = ??? + def colMeans[K:ClassTag](drm: CheckpointedDrm[K]): Vector = + H2OHelper.colMeans (drm.frame) + + def colSums[K:ClassTag](drm: CheckpointedDrm[K]): Vector = + H2OHelper.colSums (drm.frame) + + def norm[K: ClassTag](drm: CheckpointedDrm[K]): Double = + H2OHelper.sumSqr (drm.frame) + + def numNonZeroElementsPerColumn[K: ClassTag](drm: CheckpointedDrm[K]): Vector = + H2OHelper.nonZeroCnt (drm.frame) + + def drmBroadcast(m: Matrix)(implicit dc: DistributedContext): BCast[Matrix] = + new H2OBCast(m) + + def drmBroadcast(v: Vector)(implicit dc: DistributedContext): BCast[Vector] = + new H2OBCast(v) + + /* XXX - H2O parser does not support seqfile */ + def drmFromHDFS(path: String, parMin: Int = 0)(implicit dc: DistributedContext): CheckpointedDrm[_] = + new CheckpointedDrmH2O (H2OHelper.frame_from_file (path), dc) + + def drmParallelizeEmpty(nrow: Int, ncol: Int, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[Int] = + new CheckpointedDrmH2O (H2OHelper.empty_frame (nrow, ncol, numPartitions), dc) + + def drmParallelizeEmptyLong(nrow: Long, ncol: Int, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[Long] = + new CheckpointedDrmH2O (H2OHelper.empty_frame (nrow, ncol, numPartitions), dc) + + def drmParallelizeWithRowIndices(m: Matrix, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[Int] = + new CheckpointedDrmH2O (H2OHelper.frame_from_matrix (m, numPartitions), dc) + + def drmParallelizeWithRowLabels(m: Matrix, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[String] = + new CheckpointedDrmH2O (H2OHelper.frame_from_matrix (m, 
numPartitions), dc) + + def toPhysical[K:ClassTag](plan: DrmLike[K], ch: CacheHint.CacheHint): CheckpointedDrm[K] = + new CheckpointedDrmH2O (tr2phys (plan), plan.context) + + // H2O specific + + private def tr2phys[K: ClassTag](oper: DrmLike[K]): Frame = ??? + + implicit def cp2cph2o[K:ClassTag](drm: CheckpointedDrm[K]): CheckpointedDrmH2O[K] = drm.asInstanceOf[CheckpointedDrmH2O[K]] } diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala index 1dfda12639..0c067474a5 100644 --- a/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala @@ -1,7 +1,6 @@ package org.apache.mahout.h2obindings.drm -import org.apache.mahout.math.{SparseMatrix, DenseMatrix, Matrix, Vector} -import math._ +import org.apache.mahout.math.{Matrix, Vector} import org.apache.mahout.math.scalabindings._ import RLikeOps._ import org.apache.mahout.math.drm._ @@ -14,17 +13,23 @@ import scala.reflect._ /** H2O-specific optimizer-checkpointed DRM. */ class CheckpointedDrmH2O[K: ClassTag]( - val frame: Frame + val frame: Frame, + protected[mahout] val context: DistributedContext ) extends CheckpointedDrm[K] { - def collect: Matrix = ??? - def uncache(): Unit = ??? + /* XXX: Row index not supported. Numerical index generated on the fly (for mapBlock etc.) */ + def collect: Matrix = H2OHelper.matrix_from_frame(frame) + /* XXX: call frame.remove */ + def uncache(): Unit = return + /* XXX: H2O does not support seqfile format yet */ def writeDRM(path: String): Unit = ??? - def checkpoint(cacheHint: CacheHint.CacheHint): CheckpointedDrm[K] = ??? - protected[mahout] val context: DistributedContext = ??? - def ncol: Int = ??? - def nrow: Long = ??? - protected[mahout] def partitioningTag: Long = ??? 
+ def checkpoint(cacheHint: CacheHint.CacheHint): CheckpointedDrm[K] = this + + def ncol: Int = frame.numCols + + def nrow: Long = frame.numRows + + protected[mahout] def partitioningTag: Long = frame.vecs()(0).group.hashCode } diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/H2OBCast.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/H2OBCast.scala deleted file mode 100644 index 1b0baac8c1..0000000000 --- a/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/H2OBCast.scala +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.h2obindings.drm - -import org.apache.mahout.math.drm.BCast - -class H2OBCast[T] extends BCast[T] with Serializable { - def value: T = ??? 
-} From 97ca53d7dd0d19ac73ee95866942c89134643372 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Mon, 16 Jun 2014 18:30:13 -0700 Subject: [PATCH 03/34] MAHOUT-1500: Implement non Linear Algebra ops in H2O backend i.e, mapBlock() and rowRange() Provide dummy methods for all operators Signed-off-by: Anand Avati --- .../mahout/h2obindings/H2OBlockMatrix.java | 103 ++++++++++++++++++ .../apache/mahout/h2obindings/ops/ABt.java | 28 +++++ .../apache/mahout/h2obindings/ops/AewB.java | 28 +++++ .../mahout/h2obindings/ops/AewScalar.java | 28 +++++ .../org/apache/mahout/h2obindings/ops/At.java | 28 +++++ .../apache/mahout/h2obindings/ops/AtA.java | 28 +++++ .../apache/mahout/h2obindings/ops/AtB.java | 28 +++++ .../apache/mahout/h2obindings/ops/Atx.java | 30 +++++ .../org/apache/mahout/h2obindings/ops/Ax.java | 30 +++++ .../mahout/h2obindings/ops/MapBlock.java | 65 +++++++++++ .../apache/mahout/h2obindings/ops/Par.java | 28 +++++ .../mahout/h2obindings/ops/RowRange.java | 48 ++++++++ .../h2obindings/ops/TimesRightMatrix.java | 30 +++++ .../apache/mahout/h2obindings/H2OEngine.scala | 35 ++++-- .../h2obindings/ops/MapBlockHelper.scala | 42 +++++++ 15 files changed, 572 insertions(+), 7 deletions(-) create mode 100644 h2o/src/main/java/org/apache/mahout/h2obindings/H2OBlockMatrix.java create mode 100644 h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java create mode 100644 h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java create mode 100644 h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java create mode 100644 h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java create mode 100644 h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java create mode 100644 h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java create mode 100644 h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java create mode 100644 h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java create mode 100644 
h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java create mode 100644 h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java create mode 100644 h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java create mode 100644 h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java create mode 100644 h2o/src/main/scala/org/apache/mahout/h2obindings/ops/MapBlockHelper.scala diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OBlockMatrix.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OBlockMatrix.java new file mode 100644 index 0000000000..3a6a68354d --- /dev/null +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OBlockMatrix.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.h2obindings; + +import org.apache.mahout.math.Matrix; +import org.apache.mahout.math.Vector; +import org.apache.mahout.math.AbstractMatrix; +import org.apache.mahout.math.DenseMatrix; +import org.apache.mahout.math.SparseMatrix; + +import water.fvec.*; + +/* + * A Matrix implementation to represent a vertical Block of DRM. 
+ * + * Creation of the matrix is an O(1) operation with negligible + * overhead, and will remain so as long as the matrix is only + * read from (no modifications). + * + * On the first modification, create a copy on write Matrix and + * all further operations happen on this cow matrix. + * + * The benefit is, mapBlock() closures which never modify the + * input matrix save on the copy overhead. + */ +public class H2OBlockMatrix extends AbstractMatrix { + Chunk _chks[]; + Matrix cow; /* Copy on Write */ + + public H2OBlockMatrix(Chunk chks[]) { + super(chks[0]._len, chks.length); + _chks = chks; + } + + private void cow() { + if (cow != null) + return; + + if (_chks[0].isSparse()) + cow = new SparseMatrix(_chks[0]._len, _chks.length); + else + cow = new DenseMatrix(_chks[0]._len, _chks.length); + + for (int c = 0; c < _chks.length; c++) { + for (int r = 0; r < _chks[0]._len; r++) { + cow.setQuick(r, c, _chks[c].at0(r)); + } + } + } + + public void setQuick(int row, int col, double val) { + cow(); + cow.setQuick (row, col, val); + } + + public Matrix like(int nrow, int ncol) { + if (_chks[0].isSparse()) + return new SparseMatrix(nrow, ncol); + else + return new DenseMatrix(nrow, ncol); + } + + public Matrix like() { + if (_chks[0].isSparse()) + return new SparseMatrix(rowSize(), columnSize()); + else + return new DenseMatrix(rowSize(), columnSize()); + } + + public double getQuick(int row, int col) { + if (cow != null) + return cow.getQuick(row, col); + else + return _chks[col].at0(row); + } + + public Matrix assignRow(int row, Vector v) { + cow(); + cow.assignRow(row, v); + return cow; + } + + public Matrix assignColumn(int col, Vector v) { + cow(); + cow.assignColumn(col, v); + return cow; + } +} diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java new file mode 100644 index 0000000000..4c5ab5572f --- /dev/null +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java @@ 
-0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.h2obindings.ops; + +import water.*; +import water.fvec.*; + +public class ABt { + /* Calculate AB' */ + public static Frame ABt(Frame A, Frame B) { + return null; + } +} diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java new file mode 100644 index 0000000000..227c7139a2 --- /dev/null +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.h2obindings.ops; + +import water.*; +import water.fvec.*; + +public class AewB { + /* Element-wise DRM-DRM operations */ + public static Frame AewB(Frame A, Frame B, String op) { + return null; + } +} diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java new file mode 100644 index 0000000000..cf39c41bd7 --- /dev/null +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.h2obindings.ops; + +import water.*; +import water.fvec.*; + +public class AewScalar { + /* Element-wise DRM-scalar operations */ + public static Frame AewScalar(Frame A, double s, String op) { + return null; + } +} diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java new file mode 100644 index 0000000000..29c9a86eae --- /dev/null +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.h2obindings.ops; + +import water.*; +import water.fvec.*; + +public class At { + /* Calculate A' (transpose) */ + public static Frame At(Frame A) { + return null; + } +} diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java new file mode 100644 index 0000000000..9b31daf501 --- /dev/null +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.h2obindings.ops; + +import water.*; +import water.fvec.*; + +public class AtA { + /* Calculate A'A */ + public static Frame AtA(Frame A) { + return null; + } +} diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java new file mode 100644 index 0000000000..375d93b903 --- /dev/null +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.h2obindings.ops; + +import water.*; +import water.fvec.*; + +public class AtB { + /* Calculate A'B */ + public static Frame AtB(Frame A, Frame B) { + return null; + } +} diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java new file mode 100644 index 0000000000..eaa647ccb3 --- /dev/null +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.h2obindings.ops; + +import org.apache.mahout.math.Vector; + +import water.*; +import water.fvec.*; + +public class Atx { + /* Calculate A'x (where x is an in-core Vector) */ + public static Frame Atx(Frame A, Vector x) { + return null; + } +} diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java new file mode 100644 index 0000000000..3cb70ca122 --- /dev/null +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.h2obindings.ops; + +import org.apache.mahout.math.Vector; + +import water.*; +import water.fvec.*; + +public class Ax { + /* Calculate Ax (where x is an in-core Vector) */ + public static Frame Ax(Frame A, Vector x) { + return null; + } +} diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java new file mode 100644 index 0000000000..96e6c746e8 --- /dev/null +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.h2obindings.ops; + +import org.apache.mahout.math.Matrix; +import org.apache.mahout.h2obindings.H2OBlockMatrix; + +import water.*; +import water.fvec.*; +import java.io.Serializable; + +import scala.reflect.ClassTag; + +public class MapBlock { + public static Frame exec(Frame A, int ncol, Object bmf, final ClassTag k, final ClassTag r) { + class MRTaskBMF extends MRTask { + Serializable _bmf; + MRTaskBMF(Object bmf) { + /* BlockMapFun does not implement Serializable, + but Scala closures are _always_ Serializable. + + So receive the object as a plain Object (else + compilation fails) and typecast it with the conviction + that Scala always tags the actually generated + closure functions with Serializable. + */ + _bmf = (Serializable)bmf; + } + + private Matrix blockify (Chunk chks[]) { + return new H2OBlockMatrix(chks); + } + + private void deblockify (Matrix out, NewChunk ncs[]) { + // assert (out.colSize() == ncs.length) + for (int c = 0; c < out.columnSize(); c++) { + for (int r = 0; r < out.rowSize(); r++) { + ncs[c].addNum(out.getQuick(r, c)); + } + } + } + + public void map(Chunk chks[], NewChunk ncs[]) { + deblockify(MapBlockHelper.exec(_bmf, blockify(chks), chks[0]._start, k, r), ncs); + // assert chks[i]._len == ncs[j]._len + } + } + return new MRTaskBMF(bmf).doAll(ncol, A).outputFrame(A.names(), A.domains()); + } +} diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java new file mode 100644 index 0000000000..2b3f1e3ec2 --- /dev/null +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.h2obindings.ops; + +import water.*; +import water.fvec.*; + +public class Par { + public static Frame exec(Frame A, int min, int exact) { + /* XXX: re-org Frame */ + return A; + } +} diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java new file mode 100644 index 0000000000..9c960eb1bb --- /dev/null +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.h2obindings.ops; + +import scala.collection.immutable.Range; + +import water.*; +import water.fvec.*; + +public class RowRange { + /* Filter operation */ + public static Frame RowRange(Frame A, Range r) { + class MRTaskFilter extends MRTask { + Range _r; + MRTaskFilter(Range r) { + _r = r; + } + public void map(Chunk chks[], NewChunk ncs[]) { + if (chks[0]._start > _r.end() || (chks[0]._start + chks[0]._len) < _r.start()) + return; + + for (int r = 0; r < chks[0]._len; r++) { + if (!_r.contains (chks[0]._start + r)) + continue; + + for (int c = 0; c < chks.length; c++) + ncs[c].addNum(chks[c].at0(r)); + } + } + } + return new MRTaskFilter(r).doAll(A.numCols(), A).outputFrame(A.names(), A.domains()); + } +} diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java new file mode 100644 index 0000000000..5eccb4975b --- /dev/null +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.h2obindings.ops; + +import org.apache.mahout.math.Matrix; + +import water.*; +import water.fvec.*; + +public class TimesRightMatrix { + /* Multiply with in-core Matrix */ + public static Frame TimesRightMatrix(Frame A, Matrix m) { + return null; + } +} diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala index 85bd8f2548..ddd567567c 100644 --- a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala @@ -21,7 +21,7 @@ import scala.reflect._ import org.apache.mahout.math._ import org.apache.mahout.math.drm._ import org.apache.mahout.math.drm.logical._ - +import org.apache.mahout.h2obindings.ops._ import org.apache.mahout.h2obindings.drm._ import water._ @@ -51,23 +51,44 @@ object H2OEngine extends DistributedEngine { new CheckpointedDrmH2O (H2OHelper.frame_from_file (path), dc) def drmParallelizeEmpty(nrow: Int, ncol: Int, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[Int] = - new CheckpointedDrmH2O (H2OHelper.empty_frame (nrow, ncol, numPartitions), dc) + new CheckpointedDrmH2O[Int] (H2OHelper.empty_frame (nrow, ncol, numPartitions), dc) def drmParallelizeEmptyLong(nrow: Long, ncol: Int, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[Long] = - new CheckpointedDrmH2O (H2OHelper.empty_frame (nrow, ncol, numPartitions), dc) + new CheckpointedDrmH2O[Long] (H2OHelper.empty_frame (nrow, ncol, numPartitions), dc) def drmParallelizeWithRowIndices(m: Matrix, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[Int] = - new CheckpointedDrmH2O (H2OHelper.frame_from_matrix (m, numPartitions), dc) + new CheckpointedDrmH2O[Int] (H2OHelper.frame_from_matrix (m, numPartitions), dc) def drmParallelizeWithRowLabels(m: Matrix, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[String] = - new 
CheckpointedDrmH2O (H2OHelper.frame_from_matrix (m, numPartitions), dc) + new CheckpointedDrmH2O[String] (H2OHelper.frame_from_matrix (m, numPartitions), dc) def toPhysical[K:ClassTag](plan: DrmLike[K], ch: CacheHint.CacheHint): CheckpointedDrm[K] = - new CheckpointedDrmH2O (tr2phys (plan), plan.context) + new CheckpointedDrmH2O[K] (tr2phys (plan), plan.context) // H2O specific - private def tr2phys[K: ClassTag](oper: DrmLike[K]): Frame = ??? + private def tr2phys[K: ClassTag](oper: DrmLike[K]): Frame = { + oper match { + case OpAtAnyKey(_) => + throw new IllegalArgumentException("\"A\" must be Int-keyed in this A.t expression.") + case op@OpAt(a) => At.At(tr2phys(a)(op.classTagA)) + case op@OpABt(a, b) => ABt.ABt(tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB)) + case op@OpAtB(a, b) => AtB.AtB(tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB)) + case op@OpAtA(a) => AtA.AtA(tr2phys(a)(op.classTagA)) + case op@OpAx(a, v) => Ax.Ax(tr2phys(a)(op.classTagA), v) + case op@OpAtx(a, v) => Atx.Atx(tr2phys(a)(op.classTagA), v) + case op@OpAewB(a, b, opId) => AewB.AewB(tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB), opId) + case op@OpAewScalar(a, s, opId) => AewScalar.AewScalar(tr2phys(a)(op.classTagA), s, opId) + case op@OpRowRange(a, r) => RowRange.RowRange(tr2phys(a)(op.classTagA), r) + case op@OpTimesRightMatrix(a, m) => TimesRightMatrix.TimesRightMatrix(tr2phys(a)(op.classTagA), m) + // Custom operators, we just execute them + case blockOp: OpMapBlock[K, _] => MapBlock.exec(tr2phys(blockOp.A)(blockOp.classTagA), blockOp.ncol, blockOp.bmf, blockOp.classTagA, blockOp.classTagK) + case op@OpPar(a, m, e) => Par.exec(tr2phys(a)(op.classTagA), m, e) + case cp: CheckpointedDrm[K] => cp.frame + case _ => throw new IllegalArgumentException("Internal:Optimizer has no exec policy for operator %s." 
+ .format(oper)) + } + } implicit def cp2cph2o[K:ClassTag](drm: CheckpointedDrm[K]): CheckpointedDrmH2O[K] = drm.asInstanceOf[CheckpointedDrmH2O[K]] } diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/ops/MapBlockHelper.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/ops/MapBlockHelper.scala new file mode 100644 index 0000000000..d53fe22187 --- /dev/null +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/ops/MapBlockHelper.scala @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.h2obindings.ops + +import org.apache.mahout.math.Matrix +import org.apache.mahout.math.drm.BlockMapFunc +import scala.reflect.ClassTag + +object MapBlockHelper { + def exec[K: ClassTag, R: ClassTag](bmf: Object, in: Matrix, startlong: Long): Matrix = { + val i = implicitly[ClassTag[Int]] + val l = implicitly[ClassTag[Long]] + val s = implicitly[ClassTag[String]] + + val inarray = implicitly[ClassTag[K]] match { + case `i` => val startint: Int = startlong.asInstanceOf[Int] + startint until (startint + in.rowSize) toArray + case `l` => startlong until (startlong + in.rowSize) toArray + case `s` => new Array[String](in.rowSize) + } + + val _bmf = bmf.asInstanceOf[BlockMapFunc[K,R]] + val out = _bmf((inarray.asInstanceOf[Array[K]], in)) + out._2 + } +} + From 9b265a7acb6f3249a77752648d905a580f359bc9 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Fri, 20 Jun 2014 00:32:30 -0700 Subject: [PATCH 04/34] MAHOUT-1500: Implement Linear Algebra ops in H2O backend - A' - A'A - A'B - AB' - A (element-wise) B - A (element-wise) Scalar - AinCoreB - Ax - Atx All MathSuite tests are passing. 
Signed-off-by: Anand Avati --- .../apache/mahout/h2obindings/H2OHelper.java | 1 + .../apache/mahout/h2obindings/ops/ABt.java | 24 +++++++- .../apache/mahout/h2obindings/ops/AewB.java | 29 ++++++++- .../mahout/h2obindings/ops/AewScalar.java | 29 ++++++++- .../org/apache/mahout/h2obindings/ops/At.java | 18 +++++- .../apache/mahout/h2obindings/ops/AtA.java | 22 ++++++- .../apache/mahout/h2obindings/ops/AtB.java | 22 ++++++- .../apache/mahout/h2obindings/ops/Atx.java | 29 ++++++++- .../org/apache/mahout/h2obindings/ops/Ax.java | 17 +++++- .../h2obindings/ops/TimesRightMatrix.java | 61 ++++++++++++++++++- 10 files changed, 236 insertions(+), 16 deletions(-) diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java index 6be6810a7a..b17fb8a713 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java @@ -179,6 +179,7 @@ private static int chunk_size (long nrow, int ncol, int parts_hint) { int chunk_sz; if (parts_hint < 1) + /* XXX: calculate based on cloud size and # of cpu */ parts_hint = 1; chunk_sz = (int) (((nrow - 1) / parts_hint) + 1); diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java index 4c5ab5572f..6764dea58d 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java @@ -17,12 +17,32 @@ package org.apache.mahout.h2obindings.ops; +import org.apache.mahout.h2obindings.H2OHelper; + import water.*; import water.fvec.*; public class ABt { /* Calculate AB' */ - public static Frame ABt(Frame A, Frame B) { - return null; + public static Frame ABt(final Frame A, final Frame B) { + /* XXX - make ABt similar to A */ + Frame ABt = H2OHelper.empty_frame (A.numRows(), (int)B.numRows(), 0); + + class MRTaskABt extends MRTask { + public void 
map(Chunk chks[]) { + long start = chks[0]._start; + for (int c = 0; c < chks.length; c++) { + for (int r = 0; r < chks[0]._len; r++) { + double v = 0; + for (int i = 0; i < A.vecs().length; i++) { + v += (A.vecs()[i].at(start+r) * B.vecs()[i].at(c)); + } + chks[c].set0(r, v); + } + } + } + } + new MRTaskABt().doAll(ABt); + return ABt; } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java index 227c7139a2..836783beb5 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java @@ -17,12 +17,37 @@ package org.apache.mahout.h2obindings.ops; +import org.apache.mahout.h2obindings.H2OHelper; + import water.*; import water.fvec.*; public class AewB { /* Element-wise DRM-DRM operations */ - public static Frame AewB(Frame A, Frame B, String op) { - return null; + public static Frame AewB(final Frame A, final Frame B, final String op) { + class MRTaskAewB extends MRTask { + private double opfn (String op, double a, double b) { + if (a == 0.0 && b == 0.0) + return 0.0; + if (op.equals("+")) + return a + b; + else if (op.equals("-")) + return a - b; + else if (op.equals("*")) + return a * b; + else if (op.equals("/")) + return a / b; + return 0.0; + } + public void map(Chunk chks[], NewChunk ncs[]) { + long start = chks[0]._start; + for (int c = 0; c < chks.length; c++) { + for (int r = 0; r < chks[0]._len; r++) { + ncs[c].addNum(opfn(op, chks[c].at0(r), B.vecs()[c].at(start+r))); + } + } + } + } + return new MRTaskAewB().doAll(A.numCols(), A).outputFrame(A.names(), A.domains()); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java index cf39c41bd7..ec4aeaecce 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java 
@@ -17,12 +17,37 @@ package org.apache.mahout.h2obindings.ops; +import org.apache.mahout.h2obindings.H2OHelper; + import water.*; import water.fvec.*; public class AewScalar { /* Element-wise DRM-DRM operations */ - public static Frame AewScalar(Frame A, double s, String op) { - return null; + public static Frame AewScalar(final Frame A, final double s, final String op) { + class MRTaskAewScalar extends MRTask { + private double opfn (String op, double a, double b) { + if (a == 0.0 && b == 0.0) + return 0.0; + if (op.equals("+")) + return a + b; + else if (op.equals("-")) + return a - b; + else if (op.equals("*")) + return a * b; + else if (op.equals("/")) + return a / b; + return 0.0; + } + public void map(Chunk chks[], NewChunk ncs[]) { + long start = chks[0]._start; + for (int c = 0; c < chks.length; c++) { + for (int r = 0; r < chks[0]._len; r++) { + ncs[c].addNum(opfn(op, chks[c].at0(r), s)); + } + } + } + } + return new MRTaskAewScalar().doAll(A.numCols(), A).outputFrame(A.names(), A.domains()); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java index 29c9a86eae..1f131a4022 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java @@ -17,12 +17,26 @@ package org.apache.mahout.h2obindings.ops; +import org.apache.mahout.h2obindings.H2OHelper; + import water.*; import water.fvec.*; public class At { /* Calculate A' (transpose) */ - public static Frame At(Frame A) { - return null; + public static Frame At(final Frame A) { + Frame At = H2OHelper.empty_frame (A.numCols(), (int)A.numRows(), 0); + class MRTaskAt extends MRTask { + public void map(Chunk chks[]) { + long start = chks[0]._start; + for (int c = 0; c < chks.length; c++) { + for (int r = 0; r < chks[0]._len; r++) { + chks[c].set0(r, A.vecs()[(int)(start+r)].at(c)); + } + } + } + } + new MRTaskAt().doAll(At); + return At; } } diff --git 
a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java index 9b31daf501..125c348fef 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java @@ -17,12 +17,30 @@ package org.apache.mahout.h2obindings.ops; +import org.apache.mahout.h2obindings.H2OHelper; + import water.*; import water.fvec.*; public class AtA { /* Calculate A'A */ - public static Frame AtA(Frame A) { - return null; + public static Frame AtA(final Frame A) { + Frame AtA = H2OHelper.empty_frame (A.numCols(), A.numCols(), 0); + class MRTaskAtA extends MRTask { + public void map(Chunk chks[]) { + long start = chks[0]._start; + for (int c = 0; c < chks.length; c++) { + for (int r = 0; r < chks[0]._len; r++) { + double v = 0; + for (int i = 0; i < A.numRows(); i++) { + v += (A.vecs()[(int)(start+r)].at(i) * A.vecs()[c].at(i)); + } + chks[c].set0(r, v); + } + } + } + } + new MRTaskAtA().doAll(AtA); + return AtA; } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java index 375d93b903..d720ee2aa4 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java @@ -17,12 +17,30 @@ package org.apache.mahout.h2obindings.ops; +import org.apache.mahout.h2obindings.H2OHelper; + import water.*; import water.fvec.*; public class AtB { /* Calculate A'B */ - public static Frame AtB(Frame A, Frame B) { - return null; + public static Frame AtB(final Frame A, final Frame B) { + Frame AtB = H2OHelper.empty_frame (A.numCols(), B.numCols(), 0); + class MRTaskAtB extends MRTask { + public void map(Chunk chks[]) { + long start = chks[0]._start; + for (int c = 0; c < chks.length; c++) { + for (int r = 0; r < chks[0]._len; r++) { + double v = 0; + for (int i = 0; i < A.numRows(); i++) { + v += 
(A.vecs()[(int)(start+r)].at(i) * B.vecs()[c].at(i)); + } + chks[c].set0(r, v); + } + } + } + } + new MRTaskAtB().doAll(AtB); + return AtB; } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java index eaa647ccb3..ab8d987f1a 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java @@ -18,6 +18,11 @@ package org.apache.mahout.h2obindings.ops; import org.apache.mahout.math.Vector; +import org.apache.mahout.math.DenseVector; +import org.apache.mahout.math.Matrix; +import org.apache.mahout.math.DenseMatrix; +import org.apache.mahout.h2obindings.H2OHelper; +import org.apache.mahout.h2obindings.drm.H2OBCast; import water.*; import water.fvec.*; @@ -25,6 +30,28 @@ public class Atx { /* Calculate A'x (where x is an in-core Vector) */ public static Frame Atx(Frame A, Vector x) { - return null; + final H2OBCast bx = new H2OBCast(x); + class MRTaskAtx extends MRTask { + double _atx[]; + public void map(Chunk chks[]) { + Vector x = bx.value(); + long start = chks[0]._start; + _atx = new double[chks.length]; + for (int r = 0; r < chks[0]._len; r++) { + double d = x.getQuick((int)start + r); + for (int c = 0; c < chks.length; c++) { + _atx[c] += (chks[c].at0(r) * d); + } + } + } + public void reduce(MRTaskAtx other) { + for (int i = 0; i < _atx.length; i++) + _atx[i] += other._atx[i]; + } + } + Vector v = new DenseVector(new MRTaskAtx().doAll(A)._atx); + Matrix m = new DenseMatrix(A.numCols(), 1); + m.assignColumn(0, v); + return H2OHelper.frame_from_matrix(m, 0); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java index 3cb70ca122..873436021e 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java @@ -18,6 +18,8 @@ package 
org.apache.mahout.h2obindings.ops; import org.apache.mahout.math.Vector; +import org.apache.mahout.h2obindings.H2OHelper; +import org.apache.mahout.h2obindings.drm.H2OBCast; import water.*; import water.fvec.*; @@ -25,6 +27,19 @@ public class Ax { /* Calculate Ax (where x is an in-core Vector) */ public static Frame Ax(Frame A, Vector x) { - return null; + final H2OBCast bx = new H2OBCast(x); + class MRTaskAx extends MRTask { + public void map(Chunk chks[], NewChunk nc) { + Vector x = bx.value(); + for (int r = 0; r < chks[0]._len; r++) { + double v = 0; + for (int c = 0; c < chks.length; c++) { + v += (chks[c].at0(r) * x.getQuick(c)); + } + nc.addNum(v); + } + } + } + return new MRTaskAx().doAll(1, A).outputFrame(A.names(), A.domains()); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java index 5eccb4975b..4957bb60b0 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java @@ -18,13 +18,70 @@ package org.apache.mahout.h2obindings.ops; import org.apache.mahout.math.Matrix; +import org.apache.mahout.math.Vector; +import org.apache.mahout.math.DiagonalMatrix; +import org.apache.mahout.h2obindings.H2OHelper; +import org.apache.mahout.h2obindings.drm.H2OBCast; import water.*; import water.fvec.*; public class TimesRightMatrix { + + private static Frame AinCoreB_diagonal(final Frame A, Vector d) { + final H2OBCast bd = new H2OBCast(d); + /* XXX: create AinCore like A */ + Frame AinCoreB = H2OHelper.empty_frame (A.numRows(), d.size(), 0); + + + class MRTaskAinCoreB extends MRTask { + public void map(Chunk chks[]) { + Vector D = bd.value(); + long start = chks[0]._start; + for (int c = 0; c < ncs.length; c++) { + for (int r = 0; r < chks[0]._len; r++) { + double v = (A.vecs()[c].at(start+r) * D.getQuick(c)); + chks[c].set0(r, v); + } + } + } + } + new 
MRTaskAinCoreB().doAll(AinCoreB); + return AinCoreB; + } + + private static Frame AinCoreB_common(final Frame A, Matrix b) { + final H2OBCast bb = new H2OBCast(b); + /* XXX: create AinCore like A */ + Frame AinCoreB = H2OHelper.empty_frame (A.numRows(), b.columnSize(), 0); + + class MRTaskAinCoreB extends MRTask { + public void map(Chunk chks[]) { + Matrix B = bb.value(); + long start = chks[0]._start; + for (int c = 0; c < ncs.length; c++) { + for (int r = 0; r < chks[0]._len; r++) { + double v = 0; + for (int i = 0; i < chks.length; i++) { + v += (A.vecs()[i].at(start+r) * B.getQuick(i, c)); + } + chks[c].set0(r, v); + } + } + } + } + new MRTaskAinCoreB().doAll(AinCoreB); + return AinCoreB; + } + /* Multiple with in-core Matrix */ - public static Frame TimesRightMatrix(Frame A, Matrix m) { - return null; + public static Frame TimesRightMatrix(Frame A, Matrix B) { + Frame AinCoreB; + if (B instanceof DiagonalMatrix) + AinCoreB = AinCoreB_diagonal(A, B.viewDiagonal()); + else + AinCoreB = AinCoreB_common(A, B); + + return AinCoreB; } } From a36d8bde11429bb9ae339c71086fb7fc2c0c0a9f Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Fri, 4 Jul 2014 19:01:19 -0700 Subject: [PATCH 05/34] MAHOUT-1500: Add String key support to DRM Strings are stored in a Vec along with Frame in CheckpointedDrm Signed-off-by: Anand Avati --- .../apache/mahout/h2obindings/H2OHelper.java | 56 +++++++++++++++---- .../apache/mahout/h2obindings/ops/ABt.java | 23 ++++---- .../apache/mahout/h2obindings/ops/AewB.java | 11 +++- .../mahout/h2obindings/ops/AewScalar.java | 9 ++- .../org/apache/mahout/h2obindings/ops/At.java | 6 +- .../apache/mahout/h2obindings/ops/AtA.java | 6 +- .../apache/mahout/h2obindings/ops/AtB.java | 10 +++- .../apache/mahout/h2obindings/ops/Atx.java | 4 +- .../org/apache/mahout/h2obindings/ops/Ax.java | 8 ++- .../mahout/h2obindings/ops/MapBlock.java | 26 +++++++-- .../apache/mahout/h2obindings/ops/Par.java | 5 +- .../mahout/h2obindings/ops/RowRange.java | 31 +++++++++- 
.../h2obindings/ops/TimesRightMatrix.java | 32 +++++------ .../apache/mahout/h2obindings/H2OEngine.scala | 27 ++++++--- .../h2obindings/drm/CheckpointedDrmH2O.scala | 9 ++- .../h2obindings/ops/MapBlockHelper.scala | 19 ++++++- 16 files changed, 206 insertions(+), 76 deletions(-) diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java index b17fb8a713..09a64685a9 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java @@ -25,6 +25,11 @@ import java.io.File; import java.io.IOException; +import java.util.Map; +import java.util.HashMap; +import java.util.Arrays; + +import scala.Tuple2; public class H2OHelper { @@ -59,7 +64,7 @@ public void reduce(MRTaskNZ other) { Dense Matrix depending on number of missing elements in Frame. */ - public static Matrix matrix_from_frame (Frame frame) { + public static Matrix matrix_from_frame (Frame frame, Vec labels) { Matrix m; if (is_sparse (frame)) @@ -76,6 +81,14 @@ public static Matrix matrix_from_frame (Frame frame) { } c++; } + + if (labels != null) { + HashMap map = new HashMap(); + for (long i = 0; i < labels.length(); i++) { + map.put(labels.atStr(i), (int)i); + } + m.setRowLabelBindings(map); + } return m; } @@ -174,6 +187,18 @@ public static Frame frame_from_file (String path) throws IOException { return FrameUtils.parseFrame(null, new File(path)); } + private static Map reverse_map(Map map) { + if (map == null) + return null; + + Map rmap = new HashMap(); + + for(Map.Entry entry : map.entrySet()) { + rmap.put(entry.getValue(),entry.getKey()); + } + + return rmap; + } private static int chunk_size (long nrow, int ncol, int parts_hint) { int chunk_sz; @@ -212,18 +237,21 @@ private static int next_chunks(NewChunk ncs[], int cidx, long r, int chunk_sz, A - Create @cols number of Vec's. 
- Load data into Vecs by routing them through NewChunks */ - public static Frame frame_from_matrix (Matrix m, int parts_hint) { + public static Tuple2 frame_from_matrix (Matrix m, int parts_hint) { + Map map = m.getRowLabelBindings(); + Map rmap = reverse_map(map); int cols = m.columnSize(); + int nvecs = cols + (map != null ? 1 : 0); Vec.VectorGroup vg = new Vec.VectorGroup(); - Key keys[] = vg.addVecs(cols); - AppendableVec avs[] = new AppendableVec[cols]; - Vec vecs[] = new Vec[cols]; - NewChunk ncs[] = new NewChunk[cols]; + Key keys[] = vg.addVecs(nvecs); + AppendableVec avs[] = new AppendableVec[nvecs]; + Vec vecs[] = new Vec[nvecs]; + NewChunk ncs[] = new NewChunk[nvecs]; int chunk_sz = chunk_size (m.rowSize(), m.columnSize(), parts_hint); Futures fs = new Futures(); int cidx = 0; - for (int c = 0; c < cols; c++) + for (int c = 0; c < nvecs; c++) avs[c] = new AppendableVec(keys[c]); long r = 0; @@ -231,8 +259,10 @@ public static Frame frame_from_matrix (Matrix m, int parts_hint) { cidx = next_chunks (ncs, cidx, r, chunk_sz, avs, fs); /* Detect entire sparse rows */ while (r < row.index()) { - for (NewChunk nc : ncs) - nc.addNum(0.0); + for (int i = 0; i < cols; i++) + ncs[i].addNum(0.0); + if (nvecs != cols) + ncs[nvecs-1].addStr(null); r++; cidx = next_chunks (ncs, cidx, r, chunk_sz, avs, fs); } @@ -243,16 +273,20 @@ public static Frame frame_from_matrix (Matrix m, int parts_hint) { ncs[c++].addNum(0.0); ncs[c++].addNum(element.get()); } + if (rmap != null) + ncs[nvecs-1].addStr(rmap.get(r)); r++; } - for (int c = 0; c < cols; c++) { + for (int c = 0; c < nvecs; c++) { ncs[c].close(fs); vecs[c] = avs[c].close(fs); } fs.blockForPending(); - return new Frame(vecs); + Frame fr = new Frame(Arrays.copyOfRange(vecs,0,cols)); + Vec labels = (rmap != null) ? 
vecs[nvecs-1] : null; + return new Tuple2(fr,labels); } public static Frame empty_frame (long nrow, int ncol, int parts_hint) { diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java index 6764dea58d..79fd8122c9 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java @@ -21,28 +21,29 @@ import water.*; import water.fvec.*; +import scala.Tuple2; public class ABt { /* Calculate AB' */ - public static Frame ABt(final Frame A, final Frame B) { - /* XXX - make ABt similar to A */ - Frame ABt = H2OHelper.empty_frame (A.numRows(), (int)B.numRows(), 0); + public static Tuple2 ABt(Tuple2 TA, Tuple2 TB) { + Frame A = TA._1(); + Vec VA = TA._2(); + final Frame B = TB._1(); class MRTaskABt extends MRTask { - public void map(Chunk chks[]) { - long start = chks[0]._start; - for (int c = 0; c < chks.length; c++) { + public void map(Chunk chks[], NewChunk ncs[]) { + for (int c = 0; c < ncs.length; c++) { for (int r = 0; r < chks[0]._len; r++) { double v = 0; - for (int i = 0; i < A.vecs().length; i++) { - v += (A.vecs()[i].at(start+r) * B.vecs()[i].at(c)); + for (int i = 0; i < chks.length; i++) { + v += (chks[i].at0(r) * B.vecs()[i].at(c)); } - chks[c].set0(r, v); + ncs[c].addNum(v); } } } } - new MRTaskABt().doAll(ABt); - return ABt; + Frame ABt = new MRTaskABt().doAll((int)B.numRows(),A).outputFrame(null,null); + return new Tuple2(ABt, VA); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java index 836783beb5..7413018e79 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java @@ -21,10 +21,15 @@ import water.*; import water.fvec.*; +import scala.Tuple2; public class AewB { /* Element-wise DRM-DRM operations */ - public static Frame 
AewB(final Frame A, final Frame B, final String op) { + public static Tuple2 AewB(Tuple2 AT, Tuple2 BT, final String op) { + final Frame A = AT._1(); + final Frame B = BT._1(); + Vec VA = AT._2(); + class MRTaskAewB extends MRTask { private double opfn (String op, double a, double b) { if (a == 0.0 && b == 0.0) @@ -48,6 +53,8 @@ public void map(Chunk chks[], NewChunk ncs[]) { } } } - return new MRTaskAewB().doAll(A.numCols(), A).outputFrame(A.names(), A.domains()); + Frame AewB = new MRTaskAewB().doAll(A.numCols(), A).outputFrame(A.names(), A.domains()); + + return new Tuple2(AewB, VA); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java index ec4aeaecce..fec495f48c 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java @@ -21,10 +21,14 @@ import water.*; import water.fvec.*; +import scala.Tuple2; public class AewScalar { /* Element-wise DRM-DRM operations */ - public static Frame AewScalar(final Frame A, final double s, final String op) { + public static Tuple2 AewScalar(final Tuple2 TA, final double s, final String op) { + Frame A = TA._1(); + Vec VA = TA._2(); + class MRTaskAewScalar extends MRTask { private double opfn (String op, double a, double b) { if (a == 0.0 && b == 0.0) @@ -48,6 +52,7 @@ public void map(Chunk chks[], NewChunk ncs[]) { } } } - return new MRTaskAewScalar().doAll(A.numCols(), A).outputFrame(A.names(), A.domains()); + Frame AewScalar = new MRTaskAewScalar().doAll(A.numCols(), A).outputFrame(A.names(), A.domains()); + return new Tuple2(AewScalar, VA); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java index 1f131a4022..1c15c4bc82 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java +++ 
b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java @@ -21,10 +21,12 @@ import water.*; import water.fvec.*; +import scala.Tuple2; public class At { /* Calculate A' (transpose) */ - public static Frame At(final Frame A) { + public static Tuple2 At(Tuple2 T) { + final Frame A = T._1(); Frame At = H2OHelper.empty_frame (A.numCols(), (int)A.numRows(), 0); class MRTaskAt extends MRTask { public void map(Chunk chks[]) { @@ -37,6 +39,6 @@ public void map(Chunk chks[]) { } } new MRTaskAt().doAll(At); - return At; + return new Tuple2(At,null); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java index 125c348fef..aaf0e5fe83 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java @@ -21,10 +21,12 @@ import water.*; import water.fvec.*; +import scala.Tuple2; public class AtA { /* Calculate A'A */ - public static Frame AtA(final Frame A) { + public static Tuple2 AtA(Tuple2 TA) { + final Frame A = TA._1(); Frame AtA = H2OHelper.empty_frame (A.numCols(), A.numCols(), 0); class MRTaskAtA extends MRTask { public void map(Chunk chks[]) { @@ -41,6 +43,6 @@ public void map(Chunk chks[]) { } } new MRTaskAtA().doAll(AtA); - return AtA; + return new Tuple2(AtA,null); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java index d720ee2aa4..46f8468769 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java @@ -21,11 +21,16 @@ import water.*; import water.fvec.*; +import scala.Tuple2; public class AtB { /* Calculate A'B */ - public static Frame AtB(final Frame A, final Frame B) { + public static Tuple2 AtB(Tuple2 TA, Tuple2 TB) { + final Frame A = TA._1(); + final Frame B = TB._1(); + Frame AtB = H2OHelper.empty_frame (A.numCols(), 
B.numCols(), 0); + class MRTaskAtB extends MRTask { public void map(Chunk chks[]) { long start = chks[0]._start; @@ -40,7 +45,8 @@ public void map(Chunk chks[]) { } } } + new MRTaskAtB().doAll(AtB); - return AtB; + return new Tuple2(AtB,null); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java index ab8d987f1a..8091b1b5c1 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java @@ -26,10 +26,12 @@ import water.*; import water.fvec.*; +import scala.Tuple2; public class Atx { /* Calculate A'x (where x is an in-core Vector) */ - public static Frame Atx(Frame A, Vector x) { + public static Tuple2 Atx(Tuple2 TA, Vector x) { + Frame A = TA._1(); final H2OBCast bx = new H2OBCast(x); class MRTaskAtx extends MRTask { double _atx[]; diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java index 873436021e..7d1694e9c8 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java @@ -23,10 +23,13 @@ import water.*; import water.fvec.*; +import scala.Tuple2; public class Ax { /* Calculate Ax (where x is an in-core Vector) */ - public static Frame Ax(Frame A, Vector x) { + public static Tuple2 Ax(Tuple2 TA, Vector x) { + Frame A = TA._1(); + Vec VA = TA._2(); final H2OBCast bx = new H2OBCast(x); class MRTaskAx extends MRTask { public void map(Chunk chks[], NewChunk nc) { @@ -40,6 +43,7 @@ public void map(Chunk chks[], NewChunk nc) { } } } - return new MRTaskAx().doAll(1, A).outputFrame(A.names(), A.domains()); + Frame Ax = new MRTaskAx().doAll(1, A).outputFrame(A.names(), A.domains()); + return new Tuple2(Ax, VA); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java 
b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java index 96e6c746e8..98263af511 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java @@ -23,14 +23,21 @@ import water.*; import water.fvec.*; import java.io.Serializable; +import java.util.Arrays; import scala.reflect.ClassTag; +import scala.Tuple2; public class MapBlock { - public static Frame exec(Frame A, int ncol, Object bmf, final ClassTag k, final ClassTag r) { + public static Tuple2 exec(Tuple2 AT, int ncol, Object bmf, final boolean is_r_str, + final ClassTag k, final ClassTag r) { + Frame A = AT._1(); + Vec VA = AT._2(); + class MRTaskBMF extends MRTask { Serializable _bmf; - MRTaskBMF(Object bmf) { + Vec _labels; + MRTaskBMF(Object bmf, Vec labels) { /* BlockMapFun does not implement Serializable, but Scala closures are _always_ Serializable. @@ -40,6 +47,7 @@ So receive the object as a plain Object (else closure functions with Serializable. */ _bmf = (Serializable)bmf; + _labels = labels; } private Matrix blockify (Chunk chks[]) { @@ -56,10 +64,20 @@ private void deblockify (Matrix out, NewChunk ncs[]) { } public void map(Chunk chks[], NewChunk ncs[]) { - deblockify(MapBlockHelper.exec(_bmf, blockify(chks), chks[0]._start, k, r), ncs); + long start = chks[0]._start; + NewChunk nclabel = is_r_str ? ncs[ncs.length-1] : null; + deblockify(MapBlockHelper.exec(_bmf, blockify(chks), start, _labels, nclabel, k, r), ncs); // assert chks[i]._len == ncs[j]._len } } - return new MRTaskBMF(bmf).doAll(ncol, A).outputFrame(A.names(), A.domains()); + + int ncol_res = ncol + (is_r_str ? 
1 : 0); + Frame fmap = new MRTaskBMF(bmf, VA).doAll(ncol_res, A).outputFrame(null, null); + Vec vmap = null; + if (is_r_str) { + vmap = fmap.vecs()[ncol]; + fmap = new Frame(Arrays.copyOfRange(fmap.vecs(), 0, ncol)); + } + return new Tuple2(fmap,vmap); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java index 2b3f1e3ec2..31b1c11c23 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java @@ -19,10 +19,11 @@ import water.*; import water.fvec.*; +import scala.Tuple2; public class Par { - public static Frame exec(Frame A, int min, int exact) { + public static Tuple2 exec(Tuple2 TA, int min, int exact) { /* XXX: re-org Frame */ - return A; + return TA; } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java index 9c960eb1bb..56c75f81b8 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java @@ -21,10 +21,14 @@ import water.*; import water.fvec.*; +import scala.Tuple2; public class RowRange { /* Filter operation */ - public static Frame RowRange(Frame A, Range r) { + public static Tuple2 RowRange(Tuple2 TA, Range r) { + Frame A = TA._1(); + Vec VA = TA._2(); + class MRTaskFilter extends MRTask { Range _r; MRTaskFilter(Range r) { @@ -43,6 +47,29 @@ public void map(Chunk chks[], NewChunk ncs[]) { } } } - return new MRTaskFilter(r).doAll(A.numCols(), A).outputFrame(A.names(), A.domains()); + Frame Arr = new MRTaskFilter(r).doAll(A.numCols(), A).outputFrame(A.names(), A.domains()); + Vec Vrr = null; + if (VA != null) { + class MRTaskStrFilter extends MRTask { + Range _r; + MRTaskStrFilter(Range r) { + _r = r; + } + public void map(Chunk chk, NewChunk nc) { + if (chk._start > _r.end() || (chk._start + chk._len) < 
_r.start()) + return; + + for (int r = 0; r < chk._len; r++) { + if (!_r.contains (chk._start + r)) + continue; + + nc.addStr(chk.atStr0(r)); + } + } + } + Vrr = new MRTaskStrFilter(r).doAll(1, VA).outputFrame(null,null).vecs()[0]; + } + + return new Tuple2(Arr,Vrr); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java index 4957bb60b0..e268655712 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java @@ -25,63 +25,57 @@ import water.*; import water.fvec.*; +import scala.Tuple2; public class TimesRightMatrix { private static Frame AinCoreB_diagonal(final Frame A, Vector d) { final H2OBCast bd = new H2OBCast(d); - /* XXX: create AinCore like A */ - Frame AinCoreB = H2OHelper.empty_frame (A.numRows(), d.size(), 0); - class MRTaskAinCoreB extends MRTask { - public void map(Chunk chks[]) { + public void map(Chunk chks[], NewChunk ncs[]) { Vector D = bd.value(); - long start = chks[0]._start; for (int c = 0; c < ncs.length; c++) { for (int r = 0; r < chks[0]._len; r++) { - double v = (A.vecs()[c].at(start+r) * D.getQuick(c)); - chks[c].set0(r, v); + double v = (chks[c].at0(r) * D.getQuick(c)); + ncs[c].addNum(v); } } } } - new MRTaskAinCoreB().doAll(AinCoreB); - return AinCoreB; + return new MRTaskAinCoreB().doAll(d.size(), A).outputFrame(null,null); } private static Frame AinCoreB_common(final Frame A, Matrix b) { final H2OBCast bb = new H2OBCast(b); - /* XXX: create AinCore like A */ - Frame AinCoreB = H2OHelper.empty_frame (A.numRows(), b.columnSize(), 0); class MRTaskAinCoreB extends MRTask { - public void map(Chunk chks[]) { + public void map(Chunk chks[], NewChunk ncs[]) { Matrix B = bb.value(); - long start = chks[0]._start; for (int c = 0; c < ncs.length; c++) { for (int r = 0; r < chks[0]._len; r++) { double v = 0; for (int i = 0; i < 
chks.length; i++) { - v += (A.vecs()[i].at(start+r) * B.getQuick(i, c)); + v += (chks[i].at0(r) * B.getQuick(i, c)); } - chks[c].set0(r, v); + ncs[c].addNum(v); } } } } - new MRTaskAinCoreB().doAll(AinCoreB); - return AinCoreB; + return new MRTaskAinCoreB().doAll(b.columnSize(), A).outputFrame(null,null); } /* Multiple with in-core Matrix */ - public static Frame TimesRightMatrix(Frame A, Matrix B) { + public static Tuple2 TimesRightMatrix(Tuple2 TA, Matrix B) { + Frame A = TA._1(); + Vec VA = TA._2(); Frame AinCoreB; if (B instanceof DiagonalMatrix) AinCoreB = AinCoreB_diagonal(A, B.viewDiagonal()); else AinCoreB = AinCoreB_common(A, B); - return AinCoreB; + return new Tuple2(AinCoreB, VA); } } diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala index ddd567567c..bdc3a7b6b9 100644 --- a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala @@ -56,18 +56,26 @@ object H2OEngine extends DistributedEngine { def drmParallelizeEmptyLong(nrow: Long, ncol: Int, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[Long] = new CheckpointedDrmH2O[Long] (H2OHelper.empty_frame (nrow, ncol, numPartitions), dc) - def drmParallelizeWithRowIndices(m: Matrix, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[Int] = - new CheckpointedDrmH2O[Int] (H2OHelper.frame_from_matrix (m, numPartitions), dc) + def drmParallelizeWithRowIndices(m: Matrix, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[Int] = { + val (frame, labels) = H2OHelper.frame_from_matrix (m, numPartitions) + // assert labels == null + new CheckpointedDrmH2O[Int] (frame, labels, dc) + } - def drmParallelizeWithRowLabels(m: Matrix, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[String] = - new CheckpointedDrmH2O[String] (H2OHelper.frame_from_matrix (m, numPartitions), 
dc) + def drmParallelizeWithRowLabels(m: Matrix, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[String] = { + val (frame, labels) = H2OHelper.frame_from_matrix (m, numPartitions) + // assert labels != null + new CheckpointedDrmH2O[String] (frame, labels, dc) + } - def toPhysical[K:ClassTag](plan: DrmLike[K], ch: CacheHint.CacheHint): CheckpointedDrm[K] = - new CheckpointedDrmH2O[K] (tr2phys (plan), plan.context) + def toPhysical[K:ClassTag](plan: DrmLike[K], ch: CacheHint.CacheHint): CheckpointedDrm[K] = { + val (frame, labels) = tr2phys (plan) + new CheckpointedDrmH2O[K] (frame, labels, plan.context) + } // H2O specific - private def tr2phys[K: ClassTag](oper: DrmLike[K]): Frame = { + private def tr2phys[K: ClassTag](oper: DrmLike[K]): (Frame, Vec) = { oper match { case OpAtAnyKey(_) => throw new IllegalArgumentException("\"A\" must be Int-keyed in this A.t expression.") @@ -82,9 +90,10 @@ object H2OEngine extends DistributedEngine { case op@OpRowRange(a, r) => RowRange.RowRange(tr2phys(a)(op.classTagA), r) case op@OpTimesRightMatrix(a, m) => TimesRightMatrix.TimesRightMatrix(tr2phys(a)(op.classTagA), m) // Custom operators, we just execute them - case blockOp: OpMapBlock[K, _] => MapBlock.exec(tr2phys(blockOp.A)(blockOp.classTagA), blockOp.ncol, blockOp.bmf, blockOp.classTagA, blockOp.classTagK) + case blockOp: OpMapBlock[K, _] => MapBlock.exec(tr2phys(blockOp.A)(blockOp.classTagA), blockOp.ncol, blockOp.bmf, + (blockOp.classTagK == implicitly[ClassTag[String]]), blockOp.classTagA, blockOp.classTagK) case op@OpPar(a, m, e) => Par.exec(tr2phys(a)(op.classTagA), m, e) - case cp: CheckpointedDrm[K] => cp.frame + case cp: CheckpointedDrm[K] => (cp.frame, cp.labels) case _ => throw new IllegalArgumentException("Internal:Optimizer has no exec policy for operator %s." 
.format(oper)) } diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala index 0c067474a5..1e442ac810 100644 --- a/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala @@ -14,11 +14,14 @@ import scala.reflect._ /** H2O-specific optimizer-checkpointed DRM. */ class CheckpointedDrmH2O[K: ClassTag]( val frame: Frame, + val labels: Vec, protected[mahout] val context: DistributedContext ) extends CheckpointedDrm[K] { - /* XXX: Row index not supported. Numerical index generated on the fly (for mapBlock etc.) */ - def collect: Matrix = H2OHelper.matrix_from_frame(frame) + def this(frame: Frame, context: DistributedContext) = + this(frame, null, context) + + def collect: Matrix = H2OHelper.matrix_from_frame(frame, labels) /* XXX: call frame.remove */ def uncache(): Unit = return /* XXX: H2O does not support seqfile format yet */ @@ -31,5 +34,5 @@ class CheckpointedDrmH2O[K: ClassTag]( def nrow: Long = frame.numRows - protected[mahout] def partitioningTag: Long = frame.vecs()(0).group.hashCode + protected[mahout] def partitioningTag: Long = frame.anyVec.group.hashCode } diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/ops/MapBlockHelper.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/ops/MapBlockHelper.scala index d53fe22187..3967652582 100644 --- a/h2o/src/main/scala/org/apache/mahout/h2obindings/ops/MapBlockHelper.scala +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/ops/MapBlockHelper.scala @@ -21,8 +21,10 @@ import org.apache.mahout.math.Matrix import org.apache.mahout.math.drm.BlockMapFunc import scala.reflect.ClassTag +import water.fvec.{Vec,NewChunk} + object MapBlockHelper { - def exec[K: ClassTag, R: ClassTag](bmf: Object, in: Matrix, startlong: Long): Matrix = { + def exec[K: ClassTag, R: ClassTag](bmf: Object, in: 
Matrix, startlong: Long, labels: Vec, nclabel: NewChunk): Matrix = { val i = implicitly[ClassTag[Int]] val l = implicitly[ClassTag[Long]] val s = implicitly[ClassTag[String]] @@ -31,11 +33,24 @@ object MapBlockHelper { case `i` => val startint: Int = startlong.asInstanceOf[Int] startint until (startint + in.rowSize) toArray case `l` => startlong until (startlong + in.rowSize) toArray - case `s` => new Array[String](in.rowSize) + case `s` => { + val arr = new Array[String](in.rowSize) + for (i <- 0 to in.rowSize) { + arr(i) = labels.atStr(i+startlong) + } + arr + } } val _bmf = bmf.asInstanceOf[BlockMapFunc[K,R]] val out = _bmf((inarray.asInstanceOf[Array[K]], in)) + + implicitly[ClassTag[R]] match { + case `s` => for (str <- out._1) { + nclabel.addStr(str.asInstanceOf[String]) + } + case _ => Unit + } out._2 } } From 286127b352308b33b8b764169dd819c29e8c4dc0 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Mon, 7 Jul 2014 00:48:30 -0700 Subject: [PATCH 06/34] MAHOUT-1500: Implement Par() operator Signed-off-by: Anand Avati --- .../apache/mahout/h2obindings/H2OHelper.java | 25 ++++++++----- .../org/apache/mahout/h2obindings/ops/At.java | 2 +- .../apache/mahout/h2obindings/ops/AtA.java | 2 +- .../apache/mahout/h2obindings/ops/AtB.java | 2 +- .../apache/mahout/h2obindings/ops/Atx.java | 2 +- .../apache/mahout/h2obindings/ops/Par.java | 37 ++++++++++++++++++- .../apache/mahout/h2obindings/H2OEngine.scala | 8 ++-- 7 files changed, 58 insertions(+), 20 deletions(-) diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java index 09a64685a9..bde5ed0cbb 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java @@ -200,22 +200,27 @@ private static Map reverse_map(Map map) { return rmap; } - private static int chunk_size (long nrow, int ncol, int parts_hint) { + private static int chunk_size (long nrow, int 
ncol, int min, int exact) { int chunk_sz; + int parts_hint = Math.max(min, exact); if (parts_hint < 1) /* XXX: calculate based on cloud size and # of cpu */ - parts_hint = 1; + parts_hint = 4; chunk_sz = (int) (((nrow - 1) / parts_hint) + 1); - if (parts_hint < 2) { - if (chunk_sz < 1e3) - chunk_sz = (int)1e3; - } + if (exact > 0) + return chunk_sz; if (chunk_sz > 1e6) chunk_sz = (int)1e6; + if (min > 0) + return chunk_sz; + + if (chunk_sz < 1e3) + chunk_sz = (int)1e3; + return chunk_sz; } @@ -237,7 +242,7 @@ private static int next_chunks(NewChunk ncs[], int cidx, long r, int chunk_sz, A - Create @cols number of Vec's. - Load data into Vecs by routing them through NewChunks */ - public static Tuple2 frame_from_matrix (Matrix m, int parts_hint) { + public static Tuple2 frame_from_matrix (Matrix m, int min_hint, int exact_hint) { Map map = m.getRowLabelBindings(); Map rmap = reverse_map(map); int cols = m.columnSize(); @@ -247,7 +252,7 @@ public static Tuple2 frame_from_matrix (Matrix m, int parts_hint) { AppendableVec avs[] = new AppendableVec[nvecs]; Vec vecs[] = new Vec[nvecs]; NewChunk ncs[] = new NewChunk[nvecs]; - int chunk_sz = chunk_size (m.rowSize(), m.columnSize(), parts_hint); + int chunk_sz = chunk_size (m.rowSize(), m.columnSize(), min_hint, exact_hint); Futures fs = new Futures(); int cidx = 0; @@ -289,8 +294,8 @@ public static Tuple2 frame_from_matrix (Matrix m, int parts_hint) { return new Tuple2(fr,labels); } - public static Frame empty_frame (long nrow, int ncol, int parts_hint) { - int chunk_sz = chunk_size (nrow, ncol, parts_hint); + public static Frame empty_frame (long nrow, int ncol, int min_hint, int exact_hint) { + int chunk_sz = chunk_size (nrow, ncol, min_hint, exact_hint); int nchunks = (int) ((nrow - 1) / chunk_sz) + 1; /* Final number of Chunks per Vec */ Futures fs = new Futures(); Vec.VectorGroup vg = new Vec.VectorGroup(); diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java 
b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java index 1c15c4bc82..e0285a5c05 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java @@ -27,7 +27,7 @@ public class At { /* Calculate A' (transpose) */ public static Tuple2 At(Tuple2 T) { final Frame A = T._1(); - Frame At = H2OHelper.empty_frame (A.numCols(), (int)A.numRows(), 0); + Frame At = H2OHelper.empty_frame (A.numCols(), (int)A.numRows(), -1, -1); class MRTaskAt extends MRTask { public void map(Chunk chks[]) { long start = chks[0]._start; diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java index aaf0e5fe83..aee55bd13b 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java @@ -27,7 +27,7 @@ public class AtA { /* Calculate A'A */ public static Tuple2 AtA(Tuple2 TA) { final Frame A = TA._1(); - Frame AtA = H2OHelper.empty_frame (A.numCols(), A.numCols(), 0); + Frame AtA = H2OHelper.empty_frame (A.numCols(), A.numCols(), -1, -1); class MRTaskAtA extends MRTask { public void map(Chunk chks[]) { long start = chks[0]._start; diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java index 46f8468769..f0822cd598 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java @@ -29,7 +29,7 @@ public static Tuple2 AtB(Tuple2 TA, Tuple2 TB) final Frame A = TA._1(); final Frame B = TB._1(); - Frame AtB = H2OHelper.empty_frame (A.numCols(), B.numCols(), 0); + Frame AtB = H2OHelper.empty_frame (A.numCols(), B.numCols(), -1, -1); class MRTaskAtB extends MRTask { public void map(Chunk chks[]) { diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java 
b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java index 8091b1b5c1..2bdf076911 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java @@ -54,6 +54,6 @@ public void reduce(MRTaskAtx other) { Vector v = new DenseVector(new MRTaskAtx().doAll(A)._atx); Matrix m = new DenseMatrix(A.numCols(), 1); m.assignColumn(0, v); - return H2OHelper.frame_from_matrix(m, 0); + return H2OHelper.frame_from_matrix(m, -1, -1); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java index 31b1c11c23..7a6bbd21ca 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java @@ -20,10 +20,43 @@ import water.*; import water.fvec.*; import scala.Tuple2; +import org.apache.mahout.h2obindings.H2OHelper; public class Par { public static Tuple2 exec(Tuple2 TA, int min, int exact) { - /* XXX: re-org Frame */ - return TA; + final Frame frin = TA._1(); + final Vec vin = TA._2(); + Frame frout = H2OHelper.empty_frame (frin.numRows(), frin.numCols(), min, exact); + Vec vout = null; + + class MRParVecTask extends MRTask { + public void map(Chunk chks[], NewChunk nc) { + Vec vins[] = frin.vecs(); + for (int r = 0; r < chks[0]._len; r++) { + for (int c = 0; c < chks.length; c++) { + chks[c].set0(r, vins[c].at(chks[0]._start + r)); + } + nc.addStr(vin.atStr(chks[0]._start + r)); + } + } + } + + class MRParTask extends MRTask { + public void map(Chunk chks[]) { + Vec vins[] = frin.vecs(); + for (int r = 0; r < chks[0]._len; r++) { + for (int c = 0; c < chks.length; c++) { + chks[c].set0(r, vins[c].at(chks[0]._start + r)); + } + } + } + } + + if (vout != null) { + vout = new MRParVecTask().doAll(1, frout).outputFrame(null, null).anyVec(); + } else { + new MRParTask().doAll(frout); + } + return new Tuple2 (frout, vout); } } diff --git 
a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala index bdc3a7b6b9..ec556fd63d 100644 --- a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala @@ -51,19 +51,19 @@ object H2OEngine extends DistributedEngine { new CheckpointedDrmH2O (H2OHelper.frame_from_file (path), dc) def drmParallelizeEmpty(nrow: Int, ncol: Int, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[Int] = - new CheckpointedDrmH2O[Int] (H2OHelper.empty_frame (nrow, ncol, numPartitions), dc) + new CheckpointedDrmH2O[Int] (H2OHelper.empty_frame (nrow, ncol, numPartitions, -1), dc) def drmParallelizeEmptyLong(nrow: Long, ncol: Int, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[Long] = - new CheckpointedDrmH2O[Long] (H2OHelper.empty_frame (nrow, ncol, numPartitions), dc) + new CheckpointedDrmH2O[Long] (H2OHelper.empty_frame (nrow, ncol, numPartitions, -1), dc) def drmParallelizeWithRowIndices(m: Matrix, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[Int] = { - val (frame, labels) = H2OHelper.frame_from_matrix (m, numPartitions) + val (frame, labels) = H2OHelper.frame_from_matrix (m, numPartitions, -1) // assert labels == null new CheckpointedDrmH2O[Int] (frame, labels, dc) } def drmParallelizeWithRowLabels(m: Matrix, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[String] = { - val (frame, labels) = H2OHelper.frame_from_matrix (m, numPartitions) + val (frame, labels) = H2OHelper.frame_from_matrix (m, numPartitions, -1) // assert labels != null new CheckpointedDrmH2O[String] (frame, labels, dc) } From 655f677f656bc4cf29f1294f50e4b81dcc243f85 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Mon, 7 Jul 2014 15:08:12 -0700 Subject: [PATCH 07/34] MAHOUT-1500: Implement Cbind() operator Signed-off-by: Anand Avati --- 
.../apache/mahout/h2obindings/ops/Cbind.java | 76 +++++++++++++++++++ .../apache/mahout/h2obindings/H2OEngine.scala | 2 + 2 files changed, 78 insertions(+) create mode 100644 h2o/src/main/java/org/apache/mahout/h2obindings/ops/Cbind.java diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Cbind.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Cbind.java new file mode 100644 index 0000000000..71f2a9fff8 --- /dev/null +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Cbind.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.h2obindings.ops; + +import water.*; +import water.fvec.*; +import scala.Tuple2; +import org.apache.mahout.h2obindings.H2OHelper; + +public class Cbind { + private static Tuple2 zip(final Frame fra, final Vec va, final Frame frb, final Vec vb) { + Vec vecs[] = new Vec[fra.vecs().length + frb.vecs().length]; + int d = 0; + for (Vec vfra : fra.vecs()) + vecs[d++] = vfra; + for (Vec vfrb : frb.vecs()) + vecs[d++] = vfrb; + Frame fr = new Frame(vecs); + return new Tuple2 (fr, va); + } + + private static Tuple2 join(final Frame fra, final Vec va, final Frame frb, final Vec vb) { + Vec bvecs[] = new Vec[frb.vecs().length]; + + for (int i = 0; i < bvecs.length; i++) + bvecs[i] = fra.anyVec().makeZero(); + + new MRTask() { + public void map(Chunk chks[]) { + long start = chks[0]._start; + for (int r = 0; r < chks[0]._len; r++) { + for (int c = 0; c < chks.length; c++) { + // assert va.atStr(start+r) == vb.atStr(start+r) + chks[c].set0(r, frb.vecs()[c].at(start + r)); + } + } + } + }.doAll(bvecs); + + Vec vecs[] = new Vec[fra.vecs().length + frb.vecs().length]; + int d = 0; + for (Vec vfra : fra.vecs()) + vecs[d++] = vfra; + for (Vec vfrb : bvecs) + vecs[d++] = vfrb; + Frame fr = new Frame(vecs); + return new Tuple2 (fr, va); + } + + public static Tuple2 Cbind(Tuple2 TA, Tuple2 TB) { + Frame fra = TA._1(); + Vec va = TA._2(); + Frame frb = TB._1(); + Vec vb = TB._2(); + + if (fra.anyVec().group() == frb.anyVec().group()) + return zip(fra, va, frb, vb); + else + return join(fra, va, frb, vb); + } +} diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala index ec556fd63d..3b3f511218 100644 --- a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala @@ -86,6 +86,8 @@ object H2OEngine extends DistributedEngine { case op@OpAx(a, v) => Ax.Ax(tr2phys(a)(op.classTagA), v) case 
op@OpAtx(a, v) => Atx.Atx(tr2phys(a)(op.classTagA), v) case op@OpAewB(a, b, opId) => AewB.AewB(tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB), opId) + // Non arithmetic + case op@OpCbind(a, b) => Cbind.Cbind(tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB)) case op@OpAewScalar(a, s, opId) => AewScalar.AewScalar(tr2phys(a)(op.classTagA), s, opId) case op@OpRowRange(a, r) => RowRange.RowRange(tr2phys(a)(op.classTagA), r) case op@OpTimesRightMatrix(a, m) => TimesRightMatrix.TimesRightMatrix(tr2phys(a)(op.classTagA), m) From 922da861fb94731c21482809329b7f9fad3ffb53 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Mon, 7 Jul 2014 20:02:25 -0700 Subject: [PATCH 08/34] MAHOUT-1500: use accessor methods instead of direct field access Signed-off-by: Anand Avati --- .../apache/mahout/h2obindings/H2OBlockMatrix.java | 8 ++++---- .../org/apache/mahout/h2obindings/H2OHelper.java | 6 +++--- .../java/org/apache/mahout/h2obindings/ops/ABt.java | 2 +- .../java/org/apache/mahout/h2obindings/ops/AewB.java | 4 ++-- .../org/apache/mahout/h2obindings/ops/AewScalar.java | 4 ++-- .../java/org/apache/mahout/h2obindings/ops/At.java | 4 ++-- .../java/org/apache/mahout/h2obindings/ops/AtA.java | 4 ++-- .../java/org/apache/mahout/h2obindings/ops/AtB.java | 4 ++-- .../java/org/apache/mahout/h2obindings/ops/Atx.java | 4 ++-- .../java/org/apache/mahout/h2obindings/ops/Ax.java | 2 +- .../org/apache/mahout/h2obindings/ops/Cbind.java | 4 ++-- .../org/apache/mahout/h2obindings/ops/MapBlock.java | 2 +- .../java/org/apache/mahout/h2obindings/ops/Par.java | 10 +++++----- .../org/apache/mahout/h2obindings/ops/RowRange.java | 12 ++++++------ .../mahout/h2obindings/ops/TimesRightMatrix.java | 4 ++-- 15 files changed, 37 insertions(+), 37 deletions(-) diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OBlockMatrix.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OBlockMatrix.java index 3a6a68354d..11a57b172f 100644 --- 
a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OBlockMatrix.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OBlockMatrix.java @@ -43,7 +43,7 @@ public class H2OBlockMatrix extends AbstractMatrix { Matrix cow; /* Copy on Write */ public H2OBlockMatrix(Chunk chks[]) { - super(chks[0]._len, chks.length); + super(chks[0].len(), chks.length); _chks = chks; } @@ -52,12 +52,12 @@ private void cow() { return; if (_chks[0].isSparse()) - cow = new SparseMatrix(_chks[0]._len, _chks.length); + cow = new SparseMatrix(_chks[0].len(), _chks.length); else - cow = new DenseMatrix(_chks[0]._len, _chks.length); + cow = new DenseMatrix(_chks[0].len(), _chks.length); for (int c = 0; c < _chks.length; c++) { - for (int r = 0; r < _chks[0]._len; r++) { + for (int r = 0; r < _chks[0].len(); r++) { cow.setQuick(r, c, _chks[c].at0(r)); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java index bde5ed0cbb..70bdf9b266 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java @@ -119,7 +119,7 @@ public void map(Chunk chks[]) { _sums = new double[chks.length]; for (int c = 0; c < chks.length; c++) { - for (int r = 0; r < chks[c]._len; r++) { + for (int r = 0; r < chks[c].len(); r++) { _sums[c] += chks[c].at0(r); } } @@ -142,7 +142,7 @@ class MRTaskSumSqr extends MRTask { public double _sumSqr; public void map(Chunk chks[]) { for (int c = 0; c < chks.length; c++) { - for (int r = 0; r < chks[c]._len; r++) { + for (int r = 0; r < chks[c].len(); r++) { _sumSqr += (chks[c].at0(r) * chks[c].at0(r)); } } @@ -168,7 +168,7 @@ public void map(Chunk chks[]) { _sums = new double[chks.length]; for (int c = 0; c < chks.length; c++) { - for (int r = 0; r < chks[c]._len; r++) { + for (int r = 0; r < chks[c].len(); r++) { if ((long)chks[c].at0(r) != 0) _sums[c] ++; } diff --git 
a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java index 79fd8122c9..4389bec1a7 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java @@ -33,7 +33,7 @@ public static Tuple2 ABt(Tuple2 TA, Tuple2 TB) class MRTaskABt extends MRTask { public void map(Chunk chks[], NewChunk ncs[]) { for (int c = 0; c < ncs.length; c++) { - for (int r = 0; r < chks[0]._len; r++) { + for (int r = 0; r < chks[0].len(); r++) { double v = 0; for (int i = 0; i < chks.length; i++) { v += (chks[i].at0(r) * B.vecs()[i].at(c)); diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java index 7413018e79..4e0468f3b7 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java @@ -45,9 +45,9 @@ else if (op.equals("/")) return 0.0; } public void map(Chunk chks[], NewChunk ncs[]) { - long start = chks[0]._start; + long start = chks[0].start(); for (int c = 0; c < chks.length; c++) { - for (int r = 0; r < chks[0]._len; r++) { + for (int r = 0; r < chks[0].len(); r++) { ncs[c].addNum(opfn(op, chks[c].at0(r), B.vecs()[c].at(start+r))); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java index fec495f48c..42c17c42be 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java @@ -44,9 +44,9 @@ else if (op.equals("/")) return 0.0; } public void map(Chunk chks[], NewChunk ncs[]) { - long start = chks[0]._start; + long start = chks[0].start(); for (int c = 0; c < chks.length; c++) { - for (int r = 0; r < chks[0]._len; r++) { + for (int r = 0; r < chks[0].len(); r++) { ncs[c].addNum(opfn(op, 
chks[c].at0(r), s)); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java index e0285a5c05..cba5c5ef68 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java @@ -30,9 +30,9 @@ public static Tuple2 At(Tuple2 T) { Frame At = H2OHelper.empty_frame (A.numCols(), (int)A.numRows(), -1, -1); class MRTaskAt extends MRTask { public void map(Chunk chks[]) { - long start = chks[0]._start; + long start = chks[0].start(); for (int c = 0; c < chks.length; c++) { - for (int r = 0; r < chks[0]._len; r++) { + for (int r = 0; r < chks[0].len(); r++) { chks[c].set0(r, A.vecs()[(int)(start+r)].at(c)); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java index aee55bd13b..a099f9eda7 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java @@ -30,9 +30,9 @@ public static Tuple2 AtA(Tuple2 TA) { Frame AtA = H2OHelper.empty_frame (A.numCols(), A.numCols(), -1, -1); class MRTaskAtA extends MRTask { public void map(Chunk chks[]) { - long start = chks[0]._start; + long start = chks[0].start(); for (int c = 0; c < chks.length; c++) { - for (int r = 0; r < chks[0]._len; r++) { + for (int r = 0; r < chks[0].len(); r++) { double v = 0; for (int i = 0; i < A.numRows(); i++) { v += (A.vecs()[(int)(start+r)].at(i) * A.vecs()[c].at(i)); diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java index f0822cd598..f30f3f78b6 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java @@ -33,9 +33,9 @@ public static Tuple2 AtB(Tuple2 TA, Tuple2 TB) class MRTaskAtB extends MRTask { public void map(Chunk chks[]) 
{ - long start = chks[0]._start; + long start = chks[0].start(); for (int c = 0; c < chks.length; c++) { - for (int r = 0; r < chks[0]._len; r++) { + for (int r = 0; r < chks[0].len(); r++) { double v = 0; for (int i = 0; i < A.numRows(); i++) { v += (A.vecs()[(int)(start+r)].at(i) * B.vecs()[c].at(i)); diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java index 2bdf076911..7d6f0a4419 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java @@ -37,9 +37,9 @@ class MRTaskAtx extends MRTask { double _atx[]; public void map(Chunk chks[]) { Vector x = bx.value(); - long start = chks[0]._start; + long start = chks[0].start(); _atx = new double[chks.length]; - for (int r = 0; r < chks[0]._len; r++) { + for (int r = 0; r < chks[0].len(); r++) { double d = x.getQuick((int)start + r); for (int c = 0; c < chks.length; c++) { _atx[c] += (chks[c].at0(r) * d); diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java index 7d1694e9c8..7242f57681 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java @@ -34,7 +34,7 @@ public static Tuple2 Ax(Tuple2 TA, Vector x) { class MRTaskAx extends MRTask { public void map(Chunk chks[], NewChunk nc) { Vector x = bx.value(); - for (int r = 0; r < chks[0]._len; r++) { + for (int r = 0; r < chks[0].len(); r++) { double v = 0; for (int c = 0; c < chks.length; c++) { v += (chks[c].at0(r) * x.getQuick(c)); diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Cbind.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Cbind.java index 71f2a9fff8..e26635d678 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Cbind.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Cbind.java @@ -42,8 
+42,8 @@ private static Tuple2 join(final Frame fra, final Vec va, final Fram new MRTask() { public void map(Chunk chks[]) { - long start = chks[0]._start; - for (int r = 0; r < chks[0]._len; r++) { + long start = chks[0].start(); + for (int r = 0; r < chks[0].len(); r++) { for (int c = 0; c < chks.length; c++) { // assert va.atStr(start+r) == vb.atStr(start+r) chks[c].set0(r, frb.vecs()[c].at(start + r)); diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java index 98263af511..76e062bb9f 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java @@ -64,7 +64,7 @@ private void deblockify (Matrix out, NewChunk ncs[]) { } public void map(Chunk chks[], NewChunk ncs[]) { - long start = chks[0]._start; + long start = chks[0].start(); NewChunk nclabel = is_r_str ? ncs[ncs.length-1] : null; deblockify(MapBlockHelper.exec(_bmf, blockify(chks), start, _labels, nclabel, k, r), ncs); // assert chks[i]._len == ncs[j]._len diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java index 7a6bbd21ca..a8eb13a86c 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java @@ -32,11 +32,11 @@ public static Tuple2 exec(Tuple2 TA, int min, int exact) { class MRParVecTask extends MRTask { public void map(Chunk chks[], NewChunk nc) { Vec vins[] = frin.vecs(); - for (int r = 0; r < chks[0]._len; r++) { + for (int r = 0; r < chks[0].len(); r++) { for (int c = 0; c < chks.length; c++) { - chks[c].set0(r, vins[c].at(chks[0]._start + r)); + chks[c].set0(r, vins[c].at(chks[0].start() + r)); } - nc.addStr(vin.atStr(chks[0]._start + r)); + nc.addStr(vin.atStr(chks[0].start() + r)); } } } @@ -44,9 +44,9 @@ public void map(Chunk chks[], NewChunk nc) { class 
MRParTask extends MRTask { public void map(Chunk chks[]) { Vec vins[] = frin.vecs(); - for (int r = 0; r < chks[0]._len; r++) { + for (int r = 0; r < chks[0].len(); r++) { for (int c = 0; c < chks.length; c++) { - chks[c].set0(r, vins[c].at(chks[0]._start + r)); + chks[c].set0(r, vins[c].at(chks[0].start() + r)); } } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java index 56c75f81b8..6e9c09b249 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java @@ -35,11 +35,11 @@ class MRTaskFilter extends MRTask { _r = r; } public void map(Chunk chks[], NewChunk ncs[]) { - if (chks[0]._start > _r.end() || (chks[0]._start + chks[0]._len) < _r.start()) + if (chks[0].start() > _r.end() || (chks[0].start() + chks[0].len()) < _r.start()) return; - for (int r = 0; r < chks[0]._len; r++) { - if (!_r.contains (chks[0]._start + r)) + for (int r = 0; r < chks[0].len(); r++) { + if (!_r.contains (chks[0].start() + r)) continue; for (int c = 0; c < chks.length; c++) @@ -56,11 +56,11 @@ class MRTaskStrFilter extends MRTask { _r = r; } public void map(Chunk chk, NewChunk nc) { - if (chk._start > _r.end() || (chk._start + chk._len) < _r.start()) + if (chk.start() > _r.end() || (chk.start() + chk.len()) < _r.start()) return; - for (int r = 0; r < chk._len; r++) { - if (!_r.contains (chk._start + r)) + for (int r = 0; r < chk.len(); r++) { + if (!_r.contains (chk.start() + r)) continue; nc.addStr(chk.atStr0(r)); diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java index e268655712..bbd55e4262 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java @@ -36,7 +36,7 @@ class 
MRTaskAinCoreB extends MRTask { public void map(Chunk chks[], NewChunk ncs[]) { Vector D = bd.value(); for (int c = 0; c < ncs.length; c++) { - for (int r = 0; r < chks[0]._len; r++) { + for (int r = 0; r < chks[0].len(); r++) { double v = (chks[c].at0(r) * D.getQuick(c)); ncs[c].addNum(v); } @@ -53,7 +53,7 @@ class MRTaskAinCoreB extends MRTask { public void map(Chunk chks[], NewChunk ncs[]) { Matrix B = bb.value(); for (int c = 0; c < ncs.length; c++) { - for (int r = 0; r < chks[0]._len; r++) { + for (int r = 0; r < chks[0].len(); r++) { double v = 0; for (int i = 0; i < chks.length; i++) { v += (chks[i].at0(r) * B.getQuick(i, c)); From 3ebc839bd120fcaff90f912a343841ccb316e104 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Mon, 7 Jul 2014 19:50:35 -0700 Subject: [PATCH 09/34] MAHOUT-1500: mahout h2o-node CLI for aux worker processes Signed-off-by: Anand Avati --- bin/mahout | 18 +++++++++++++++ h2o/pom.xml | 22 +++++++++++++++++++ .../apache/mahout/h2obindings/H2OContext.java | 2 +- .../h2obindings/H2ODistributedContext.scala | 2 +- .../h2obindings/test/MahoutLocalContext.scala | 2 +- 5 files changed, 43 insertions(+), 3 deletions(-) diff --git a/bin/mahout b/bin/mahout index d0623f1209..27acd9fa50 100755 --- a/bin/mahout +++ b/bin/mahout @@ -92,6 +92,10 @@ if [ "$MAHOUT_CORE" != "" ]; then IS_CORE=1 fi +if [ "$1" == "h2o-node" ]; then + H2O=1 +fi + # some directories THIS_DIR=`dirname "$THIS"` MAHOUT_HOME=`cd "$THIS_DIR/.." 
; pwd` @@ -165,6 +169,15 @@ then CLASSPATH=${CLASSPATH}:$f; done + if [ "$H2O" == "1" ]; then + for f in $MAHOUT_HOME/mrlegacy/target/mahout-mrlegacy-*.jar; do + CLASSPATH=${CLASSPATH}:$f; + done + + for f in $MAHOUT_HOME/h2o/target/mahout-h2o-*.jar; do + CLASSPATH=${CLASSPATH}:$f; + done + fi # add spark-shell -- if we requested shell or other spark CLI driver if [ "$SPARK" == "1" ]; then @@ -209,6 +222,7 @@ else #CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/mrlegacy/src/main/resources CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/spark/target/classes CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/spark-shell/target/classes + CLASSPATH=${CLASSPATH}:$MAHOUT_HOME/h2o/target/classes fi # add development dependencies to CLASSPATH @@ -240,6 +254,10 @@ case "$1" in shift "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "org.apache.mahout.drivers.ItemSimilarityDriver" "$@" ;; + (h2o-node) + shift + "$JAVA" $JAVA_HEAP_MAX -classpath "$CLASSPATH" "water.H2O" -md5skip "$@" -name mah2out + ;; (*) # default log directory & file diff --git a/h2o/pom.xml b/h2o/pom.xml index 711295d091..13c59338f9 100644 --- a/h2o/pom.xml +++ b/h2o/pom.xml @@ -110,6 +110,28 @@ + + maven-assembly-plugin + + + jar-with-dependencies + + + + water.H2O + + + + + + package + + single + + + + + maven-javadoc-plugin diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OContext.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OContext.java index 7502a2909e..2ee9e45dec 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OContext.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OContext.java @@ -25,7 +25,7 @@ public class H2OContext { public H2OContext(String _masterURL) { masterURL = _masterURL; - H2O.main(new String[]{"-name", _masterURL}); + H2O.main(new String[]{"-md5skip", "-name", _masterURL}); H2O.joinOthers(); } } diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2ODistributedContext.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2ODistributedContext.scala index 
40289d7504..e5622293ae 100644 --- a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2ODistributedContext.scala +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2ODistributedContext.scala @@ -20,7 +20,7 @@ package org.apache.mahout.h2obindings import org.apache.mahout.math.drm._ class H2ODistributedContext(val masterUrl: String) extends DistributedContext { - val h2octx = new H2OContext("local"); + val h2octx = new H2OContext(masterUrl); def close(): Unit = return diff --git a/h2o/src/test/scala/org/apache/mahout/h2obindings/test/MahoutLocalContext.scala b/h2o/src/test/scala/org/apache/mahout/h2obindings/test/MahoutLocalContext.scala index f08c613225..21735ab87e 100644 --- a/h2o/src/test/scala/org/apache/mahout/h2obindings/test/MahoutLocalContext.scala +++ b/h2o/src/test/scala/org/apache/mahout/h2obindings/test/MahoutLocalContext.scala @@ -13,7 +13,7 @@ trait MahoutLocalContext extends MahoutSuite with LoggerConfiguration { override protected def beforeEach() { super.beforeEach() - mahoutCtx = mahoutH2OContext("local") + mahoutCtx = mahoutH2OContext("mah2out") } override protected def afterEach() { From 28fb049043b2741167b49cea6385cb451a8e3120 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Tue, 8 Jul 2014 19:32:27 -0700 Subject: [PATCH 10/34] MAHOUT-1500: refactor frame_from_matrix() to reuse empty_frame() Signed-off-by: Anand Avati --- .../apache/mahout/h2obindings/H2OHelper.java | 81 ++++++------------- 1 file changed, 26 insertions(+), 55 deletions(-) diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java index 70bdf9b266..4b22166b76 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java @@ -224,16 +224,6 @@ private static int chunk_size (long nrow, int ncol, int min, int exact) { return chunk_sz; } - private static int next_chunks(NewChunk ncs[], int cidx, long r, int chunk_sz, 
AppendableVec avs[], Futures fs) { - if ((r % chunk_sz) != 0) - return cidx; - for (int i = 0; i < ncs.length; i++) { - if (ncs[i] != null) - ncs[i].close(fs); - ncs[i] = new NewChunk (avs[i], cidx); - } - return cidx + 1; - } /* Ingest a Matrix into an H2O Frame. H2O Frame is the "backing" data structure behind CheckpointedDrm. Steps: @@ -243,55 +233,36 @@ private static int next_chunks(NewChunk ncs[], int cidx, long r, int chunk_sz, A - Load data into Vecs by routing them through NewChunks */ public static Tuple2 frame_from_matrix (Matrix m, int min_hint, int exact_hint) { + Frame frame = empty_frame (m.rowSize(), m.columnSize(), min_hint, exact_hint); + Vec labels = null; + Vec.Writer writers[] = new Vec.Writer[m.columnSize()]; + Futures closer = new Futures(); + + for (int i = 0; i < writers.length; i++) + writers[i] = frame.vecs()[i].open(); + + for (int r = 0; r < m.rowSize(); r++) + for (int c = 0; c < m.columnSize(); c++) + writers[c].set(r, m.getQuick(r, c)); + + for (int c = 0; c < m.columnSize(); c++) + writers[c].close(closer); + Map map = m.getRowLabelBindings(); - Map rmap = reverse_map(map); - int cols = m.columnSize(); - int nvecs = cols + (map != null ? 
1 : 0); - Vec.VectorGroup vg = new Vec.VectorGroup(); - Key keys[] = vg.addVecs(nvecs); - AppendableVec avs[] = new AppendableVec[nvecs]; - Vec vecs[] = new Vec[nvecs]; - NewChunk ncs[] = new NewChunk[nvecs]; - int chunk_sz = chunk_size (m.rowSize(), m.columnSize(), min_hint, exact_hint); - Futures fs = new Futures(); - int cidx = 0; - - for (int c = 0; c < nvecs; c++) - avs[c] = new AppendableVec(keys[c]); - - long r = 0; - for (MatrixSlice row : m) { - cidx = next_chunks (ncs, cidx, r, chunk_sz, avs, fs); - /* Detect entire sparse rows */ - while (r < row.index()) { - for (int i = 0; i < cols; i++) - ncs[i].addNum(0.0); - if (nvecs != cols) - ncs[nvecs-1].addStr(null); - r++; - cidx = next_chunks (ncs, cidx, r, chunk_sz, avs, fs); - } - int c = 0; - for (Vector.Element element : row.nonZeroes()) { - while (c < element.index()) - /* Detect sparse column elements within a row */ - ncs[c++].addNum(0.0); - ncs[c++].addNum(element.get()); - } - if (rmap != null) - ncs[nvecs-1].addStr(rmap.get(r)); - r++; - } + if (map != null) { + labels = frame.anyVec().makeZero(); + Vec.Writer writer = labels.open(); + Map rmap = reverse_map(map); + + for (long r = 0; r < m.rowSize(); r++) + writer.set(r, rmap.get(r)); - for (int c = 0; c < nvecs; c++) { - ncs[c].close(fs); - vecs[c] = avs[c].close(fs); + writer.close(closer); } - fs.blockForPending(); - Frame fr = new Frame(Arrays.copyOfRange(vecs,0,cols)); - Vec labels = (rmap != null) ? 
vecs[nvecs-1] : null; - return new Tuple2(fr,labels); + closer.blockForPending(); + + return new Tuple2(frame,labels); } public static Frame empty_frame (long nrow, int ncol, int min_hint, int exact_hint) { From 872d7a4827e3e2c13859c3aed0f9ba0588784382 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Tue, 8 Jul 2014 18:59:38 -0700 Subject: [PATCH 11/34] MAHOUT-1500: SequenceFile persistence (read/write) support Signed-off-by: Anand Avati --- .../apache/mahout/h2obindings/H2OHdfs.java | 148 ++++++++++++++++++ .../apache/mahout/h2obindings/H2OHelper.java | 8 +- .../apache/mahout/h2obindings/H2OEngine.scala | 7 +- .../h2obindings/drm/CheckpointedDrmH2O.scala | 3 +- 4 files changed, 154 insertions(+), 12 deletions(-) create mode 100644 h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java new file mode 100644 index 0000000000..70866ed054 --- /dev/null +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.h2obindings; + +import java.io.IOException; +import java.net.URI; + +import scala.Tuple2; + +import water.fvec.*; +import water.*; + +import org.apache.mahout.math.Vector; +import org.apache.mahout.math.DenseVector; +import org.apache.mahout.math.SequentialAccessSparseVector; +import org.apache.mahout.math.VectorWritable; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.util.ReflectionUtils; + + + +public class H2OHdfs { + public static Tuple2 drm_from_file (String filename, int parMin) { + long rows = 0; + int cols = 0; + Frame frame = null; + Vec labels = null; + + SequenceFile.Reader reader = null; + try { + String uri = filename; + Configuration conf = new Configuration(); + Path path = new Path(uri); + FileSystem fs = FileSystem.get(URI.create(uri), conf); + Vec.Writer writers[]; + Vec.Writer labelwriter = null; + + reader = new SequenceFile.Reader(fs, path, conf); + + if (reader.getValueClass() != VectorWritable.class) { + System.out.println("ValueClass in file " + filename + "must be VectorWritable, but found " + reader.getValueClassName()); + return null; + } + + Writable key = (Writable) + ReflectionUtils.newInstance(reader.getKeyClass(), conf); + VectorWritable value = (VectorWritable) + ReflectionUtils.newInstance(reader.getValueClass(), conf); + + long start = reader.getPosition(); + while (reader.next(key, value)) { + if (cols == 0) { + Vector v = value.get(); + cols = v.size(); + } + rows++; + } + reader.seek(start); + + frame = H2OHelper.empty_frame (rows, cols, parMin, -1); + writers = new Vec.Writer[cols]; + for (int i = 0; i < writers.length; i++) + writers[i] = frame.vecs()[i].open(); + + if (reader.getKeyClass() == 
Text.class) { + labels = frame.anyVec().makeZero(); + labelwriter = labels.open(); + } + + long r = 0; + while (reader.next(key, value)) { + Vector v = value.get(); + for (int c = 0; c < v.size(); c++) + writers[c].set(r, v.getQuick(c)); + if (labels != null) + labelwriter.set(r, ((Text)key).toString()); + r++; + } + + Futures fus = new Futures(); + for (Vec.Writer w : writers) + w.close(fus); + if (labelwriter != null) + labelwriter.close(fus); + fus.blockForPending(); + } catch (java.io.IOException e) { + return null; + } finally { + IOUtils.closeStream(reader); + } + return new Tuple2(frame, labels); + } + + public static void drm_to_file (String filename, Frame frame, Vec labels) throws java.io.IOException { + String uri = filename; + Configuration conf = new Configuration(); + Path path = new Path(uri); + FileSystem fs = FileSystem.get(URI.create(uri), conf); + SequenceFile.Writer writer = null; + boolean is_sparse = H2OHelper.is_sparse(frame); + + if (labels != null) + writer = SequenceFile.createWriter(fs, conf, path, Text.class, VectorWritable.class); + else + writer = SequenceFile.createWriter(fs, conf, path, IntWritable.class, VectorWritable.class); + + for (long r = 0; r < frame.anyVec().length(); r++) { + Vector v = null; + if (is_sparse) + v = new SequentialAccessSparseVector(frame.numCols()); + else + v = new DenseVector(frame.numCols()); + + for (int c = 0; c < frame.numCols(); c++) + v.setQuick(c, frame.vecs()[c].at(r)); + + if (labels != null) + writer.append(new Text(labels.atStr(r)), new VectorWritable(v)); + else + writer.append(new IntWritable((int)r), new VectorWritable(v)); + } + + writer.close(); + } +} diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java index 4b22166b76..7f65431a6d 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java @@ -21,7 +21,6 @@ import 
water.*; import water.fvec.*; -import water.util.FrameUtils; import java.io.File; import java.io.IOException; @@ -37,7 +36,7 @@ public class H2OHelper { Is the matrix sparse? If the number of missing elements is 32 x times the number of present elements, treat it as sparse */ - private static boolean is_sparse (Frame frame) { + public static boolean is_sparse (Frame frame) { long rows = frame.numRows(); long cols = frame.numCols(); @@ -182,11 +181,6 @@ public void reduce(MRTaskNonZero other) { return new DenseVector(new MRTaskNonZero().doAll(frame)._sums); } - - public static Frame frame_from_file (String path) throws IOException { - return FrameUtils.parseFrame(null, new File(path)); - } - private static Map reverse_map(Map map) { if (map == null) return null; diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala index 3b3f511218..f97121d7a0 100644 --- a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala @@ -46,9 +46,10 @@ object H2OEngine extends DistributedEngine { def drmBroadcast(v: Vector)(implicit dc: DistributedContext): BCast[Vector] = new H2OBCast(v) - /* XXX - H2O parser does not support seqfile */ - def drmFromHDFS(path: String, parMin: Int = 0)(implicit dc: DistributedContext): CheckpointedDrm[_] = - new CheckpointedDrmH2O (H2OHelper.frame_from_file (path), dc) + def drmFromHDFS(path: String, parMin: Int = 0)(implicit dc: DistributedContext): CheckpointedDrm[_] = { + val (frame, labels) = H2OHdfs.drm_from_file (path, parMin) + new CheckpointedDrmH2O (frame, labels, dc) + } def drmParallelizeEmpty(nrow: Int, ncol: Int, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[Int] = new CheckpointedDrmH2O[Int] (H2OHelper.empty_frame (nrow, ncol, numPartitions, -1), dc) diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala 
b/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala index 1e442ac810..74ffe787a5 100644 --- a/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala @@ -24,9 +24,8 @@ class CheckpointedDrmH2O[K: ClassTag]( def collect: Matrix = H2OHelper.matrix_from_frame(frame, labels) /* XXX: call frame.remove */ def uncache(): Unit = return - /* XXX: H2O does not support seqfile format yet */ - def writeDRM(path: String): Unit = ??? + def writeDRM(path: String): Unit = H2OHdfs.drm_to_file (path, frame, labels) def checkpoint(cacheHint: CacheHint.CacheHint): CheckpointedDrm[K] = this From 90be82e409e4cebf1b57ddaf46cb8ac56707719c Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Wed, 9 Jul 2014 00:40:58 -0700 Subject: [PATCH 12/34] MAHOUT-1500: Add tests for distributed operations Mostly copy tests from sparkbindings with minor/trivial changes Signed-off-by: Anand Avati --- .../h2obindings/drm/DrmLikeOpsSuite.scala | 93 ++++ .../mahout/h2obindings/drm/DrmLikeSuite.scala | 68 +++ .../h2obindings/drm/RLikeDrmOpsSuite.scala | 487 ++++++++++++++++++ .../mahout/h2obindings/ops/ABtSuite.scala | 50 ++ .../mahout/h2obindings/ops/AewBSuite.scala | 91 ++++ .../mahout/h2obindings/ops/AtASuite.scala | 50 ++ .../mahout/h2obindings/ops/AtSuite.scala | 46 ++ .../math/cf/CooccurrenceAnalysisSuite.scala | 235 +++++++++ 8 files changed, 1120 insertions(+) create mode 100644 h2o/src/test/scala/org/apache/mahout/h2obindings/drm/DrmLikeOpsSuite.scala create mode 100644 h2o/src/test/scala/org/apache/mahout/h2obindings/drm/DrmLikeSuite.scala create mode 100644 h2o/src/test/scala/org/apache/mahout/h2obindings/drm/RLikeDrmOpsSuite.scala create mode 100644 h2o/src/test/scala/org/apache/mahout/h2obindings/ops/ABtSuite.scala create mode 100644 h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AewBSuite.scala create mode 100644 
h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AtASuite.scala create mode 100644 h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AtSuite.scala create mode 100644 h2o/src/test/scala/org/apache/mahout/math/cf/CooccurrenceAnalysisSuite.scala diff --git a/h2o/src/test/scala/org/apache/mahout/h2obindings/drm/DrmLikeOpsSuite.scala b/h2o/src/test/scala/org/apache/mahout/h2obindings/drm/DrmLikeOpsSuite.scala new file mode 100644 index 0000000000..68bf017752 --- /dev/null +++ b/h2o/src/test/scala/org/apache/mahout/h2obindings/drm/DrmLikeOpsSuite.scala @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.h2obindings.drm + +import org.apache.mahout.math._ +import scalabindings._ +import drm._ +import RLikeOps._ +import RLikeDrmOps._ +import org.apache.mahout.h2obindings._ +import org.scalatest.FunSuite +import org.apache.mahout.h2obindings.test.MahoutLocalContext + +/** Tests for DrmLikeOps */ +class DrmLikeOpsSuite extends FunSuite with MahoutLocalContext { + + test("mapBlock") { + + val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6)) + val A = drmParallelize(m = inCoreA, numPartitions = 2) + val B = A.mapBlock(/* Inherit width */) { + case (keys, block) => keys -> (block += 1.0) + } + + val inCoreB = B.collect + val inCoreBControl = inCoreA + 1.0 + + println(inCoreB) + + // Assert they are the same + (inCoreB - inCoreBControl).norm should be < 1E-10 + + } + + test("col range") { + val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6)) + val A = drmParallelize(m = inCoreA, numPartitions = 2) + val B = A(::, 1 to 2) + val inCoreB = B.collect + val inCoreBControl = inCoreA(::, 1 to 2) + + println(inCoreB) + + // Assert they are the same + (inCoreB - inCoreBControl).norm should be < 1E-10 + + } + + test("row range") { + + val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6)) + val A = drmParallelize(m = inCoreA, numPartitions = 2) + val B = A(1 to 2, ::) + val inCoreB = B.collect + val inCoreBControl = inCoreA(1 to 2, ::) + + println(inCoreB) + + // Assert they are the same + (inCoreB - inCoreBControl).norm should be < 1E-10 + + } + + test("col, row range") { + + val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6)) + val A = drmParallelize(m = inCoreA, numPartitions = 2) + val B = A(1 to 2, 1 to 2) + val inCoreB = B.collect + val inCoreBControl = inCoreA(1 to 2, 1 to 2) + + println(inCoreB) + + // Assert they are the same + (inCoreB - inCoreBControl).norm should be < 1E-10 + + } +} diff --git a/h2o/src/test/scala/org/apache/mahout/h2obindings/drm/DrmLikeSuite.scala 
b/h2o/src/test/scala/org/apache/mahout/h2obindings/drm/DrmLikeSuite.scala new file mode 100644 index 0000000000..98778a8abd --- /dev/null +++ b/h2o/src/test/scala/org/apache/mahout/h2obindings/drm/DrmLikeSuite.scala @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.h2obindings.drm + +import org.scalatest.FunSuite +import org.apache.mahout.math._ +import scalabindings._ +import drm._ +import RLikeOps._ +import RLikeDrmOps._ +import org.apache.mahout.h2obindings.test.MahoutLocalContext + + +/** + * DRMLike tests + */ +class DrmLikeSuite extends FunSuite with MahoutLocalContext { + + + test("DRM DFS i/o (local)") { + + val uploadPath = "UploadedDRM" + + val inCoreA = dense((1, 2, 3), (3, 4, 5)) + val drmA = drmParallelize(inCoreA) + + drmA.writeDRM(path = uploadPath) + + println(inCoreA) + + // Load back from hdfs + val drmB = drmFromHDFS(path = uploadPath) + + // Collect back into in-core + val inCoreB = drmB.collect + + // Print out to see what it is we collected: + println(inCoreB) + } + + test("DRM parallelizeEmpty") { + + val drmEmpty = drmParallelizeEmpty(100, 50) + + // collect back into in-core + val inCoreEmpty = drmEmpty.collect + + //print out to see what it is we collected: + println(inCoreEmpty) + printf("drm nrow:%d, ncol:%d\n", drmEmpty.nrow, drmEmpty.ncol) + printf("in core nrow:%d, ncol:%d\n", inCoreEmpty.nrow, inCoreEmpty.ncol) + } +} diff --git a/h2o/src/test/scala/org/apache/mahout/h2obindings/drm/RLikeDrmOpsSuite.scala b/h2o/src/test/scala/org/apache/mahout/h2obindings/drm/RLikeDrmOpsSuite.scala new file mode 100644 index 0000000000..dd36f1ecbe --- /dev/null +++ b/h2o/src/test/scala/org/apache/mahout/h2obindings/drm/RLikeDrmOpsSuite.scala @@ -0,0 +1,487 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.h2obindings.drm + +import org.scalatest.{Matchers, FunSuite} +import org.apache.mahout.math._ +import decompositions._ +import scalabindings._ +import drm._ +import RLikeOps._ +import RLikeDrmOps._ +import org.apache.mahout.h2obindings._ +import test.MahoutLocalContext +import scala.collection.mutable.ArrayBuffer +import org.apache.mahout.math.Matrices +import org.apache.mahout.h2obindings.{H2OEngine, ops} +import org.apache.mahout.math.drm.logical.{OpAtx, OpAtB, OpAtA} +import scala.util.Random + +/** R-like DRM DSL operation tests */ +class RLikeDrmOpsSuite extends FunSuite with Matchers with MahoutLocalContext { + + import RLikeOps._ + + val epsilon = 1E-5 + + test("A.t") { + + val inCoreA = dense((1, 2, 3), (3, 4, 5)) + + val A = drmParallelize(inCoreA) + + val inCoreAt = A.t.collect + + // Assert first norm of difference is less than error margin. 
+ (inCoreAt - inCoreA.t).norm should be < epsilon + + } + + test("C = A %*% B") { + + val inCoreA = dense((1, 2), (3, 4)) + val inCoreB = dense((3, 5), (4, 6)) + + val A = drmParallelize(inCoreA, numPartitions = 2) + val B = drmParallelize(inCoreB, numPartitions = 2) + + // Actual + val inCoreCControl = inCoreA %*% inCoreB + + // Distributed operation + val C = A %*% B + val inCoreC = C.collect + println(inCoreC) + + (inCoreC - inCoreCControl).norm should be < 1E-10 + + // We also should be able to collect via implicit checkpoint + val inCoreC2 = C.collect + println(inCoreC2) + + (inCoreC2 - inCoreCControl).norm should be < 1E-10 + + } + + test("C = A %*% B mapBlock {}") { + + val inCoreA = dense((1, 2), (3, 4)) + val inCoreB = dense((3, 5), (4, 6)) + + val A = drmParallelize(inCoreA, numPartitions = 2).checkpoint() + val B = drmParallelize(inCoreB, numPartitions = 2).checkpoint() + + // Actual + val inCoreCControl = inCoreA %*% inCoreB + + A.colSums() + B.colSums() + + + val x = drmBroadcast(dvec(0, 0)) + val x2 = drmBroadcast(dvec(0, 0)) + // Distributed operation + val C = (B.t %*% A.t).t.mapBlock() { + case (keys, block) => + for (row <- 0 until block.nrow) block(row, ::) += x.value + x2 + keys -> block + } + + val inCoreC = C checkpoint CacheHint.NONE collect; + println(inCoreC) + + (inCoreC - inCoreCControl).norm should be < 1E-10 + + // We also should be able to collect via implicit checkpoint + val inCoreC2 = C.collect + println(inCoreC2) + + (inCoreC2 - inCoreCControl).norm should be < 1E-10 + + val inCoreQ = dqrThin(C)._1.collect + + printf("Q=\n%s\n", inCoreQ) + + // Assert unit-orthogonality + ((inCoreQ(::, 0) dot inCoreQ(::, 0)) - 1.0).abs should be < 1e-10 + (inCoreQ(::, 0) dot inCoreQ(::, 1)).abs should be < 1e-10 + + } + + test("C = A %*% B incompatible B keys") { + + val inCoreA = dense((1, 2), (3, 4)) + val inCoreB = dense((3, 5), (4, 6)) + + val A = drmParallelize(inCoreA, numPartitions = 2) + val B = drmParallelize(inCoreB, numPartitions = 2) + 
// Re-key B into DrmLike[String] instead of [Int] + .mapBlock()({ + case (keys, block) => keys.map(_.toString) -> block + }) + + val C = A %*% B + + intercept[IllegalArgumentException] { + // This plan must not compile + C.checkpoint() + } + } + + test("C = At %*% B , join") { + + val inCoreA = dense((1, 2), (3, 4), (-3, -5)) + val inCoreB = dense((3, 5), (4, 6), (0, 1)) + + val A = drmParallelize(inCoreA, numPartitions = 2) + val B = drmParallelize(inCoreB, numPartitions = 2) + + val C = A.t %*% B + + H2OEngine.optimizerRewrite(C) should equal(OpAtB[Int](A, B)) + + val inCoreC = C.collect + val inCoreControlC = inCoreA.t %*% inCoreB + + (inCoreC - inCoreControlC).norm should be < 1E-10 + + } + + test("C = At %*% B , join, String-keyed") { + + val inCoreA = dense((1, 2), (3, 4), (-3, -5)) + val inCoreB = dense((3, 5), (4, 6), (0, 1)) + + val A = drmParallelize(inCoreA, numPartitions = 2) + .mapBlock()({ + case (keys, block) => keys.map(_.toString) -> block + }) + + val B = drmParallelize(inCoreB, numPartitions = 2) + .mapBlock()({ + case (keys, block) => keys.map(_.toString) -> block + }) + + val C = A.t %*% B + + H2OEngine.optimizerRewrite(C) should equal(OpAtB[String](A, B)) + + val inCoreC = C.collect + val inCoreControlC = inCoreA.t %*% inCoreB + + (inCoreC - inCoreControlC).norm should be < 1E-10 + + } + + test("C = At %*% B , zippable, String-keyed") { + + val inCoreA = dense((1, 2), (3, 4), (-3, -5)) + + val A = drmParallelize(inCoreA, numPartitions = 2) + .mapBlock()({ + case (keys, block) => keys.map(_.toString) -> block + }) + + val B = A + 1.0 + + val C = A.t %*% B + + H2OEngine.optimizerRewrite(C) should equal(OpAtB[String](A, B)) + + val inCoreC = C.collect + val inCoreControlC = inCoreA.t %*% (inCoreA + 1.0) + + (inCoreC - inCoreControlC).norm should be < 1E-10 + + } + + test("C = A %*% inCoreB") { + + val inCoreA = dense((1, 2, 3), (3, 4, 5), (4, 5, 6), (5, 6, 7)) + val inCoreB = dense((3, 5, 7, 10), (4, 6, 9, 10), (5, 6, 7, 7)) + + val A = 
drmParallelize(inCoreA, numPartitions = 2) + val C = A %*% inCoreB + + val inCoreC = C.collect + val inCoreCControl = inCoreA %*% inCoreB + + println(inCoreC) + (inCoreC - inCoreCControl).norm should be < 1E-10 + + } + + test("C = inCoreA %*%: B") { + + val inCoreA = dense((1, 2, 3), (3, 4, 5), (4, 5, 6), (5, 6, 7)) + val inCoreB = dense((3, 5, 7, 10), (4, 6, 9, 10), (5, 6, 7, 7)) + + val B = drmParallelize(inCoreB, numPartitions = 2) + val C = inCoreA %*%: B + + val inCoreC = C.collect + val inCoreCControl = inCoreA %*% inCoreB + + println(inCoreC) + (inCoreC - inCoreCControl).norm should be < 1E-10 + + } + + test("C = A.t %*% A") { + val inCoreA = dense((1, 2, 3), (3, 4, 5), (4, 5, 6), (5, 6, 7)) + val A = drmParallelize(m = inCoreA, numPartitions = 2) + + val AtA = A.t %*% A + + // Assert optimizer detects square + H2OEngine.optimizerRewrite(action = AtA) should equal(OpAtA(A)) + + val inCoreAtA = AtA.collect + val inCoreAtAControl = inCoreA.t %*% inCoreA + + (inCoreAtA - inCoreAtAControl).norm should be < 1E-10 + } + + test("C = A.t %*% A fat non-graph") { + val inCoreA = Matrices.uniformView(400, 550, 1234) + val A = drmParallelize(m = inCoreA, numPartitions = 2) + + val AtA = A.t %*% A + + // Assert optimizer detects square + H2OEngine.optimizerRewrite(action = AtA) should equal(OpAtA(A)) + + val inCoreAtA = AtA.collect + val inCoreAtAControl = inCoreA.t %*% inCoreA + + (inCoreAtA - inCoreAtAControl).norm should be < 1E-10 + } + + + test("C = A.t %*% A non-int key") { + val inCoreA = dense((1, 2, 3), (3, 4, 5), (4, 5, 6), (5, 6, 7)) + val AintKeyd = drmParallelize(m = inCoreA, numPartitions = 2) + val A = AintKeyd.mapBlock() { + case (keys, block) => keys.map(_.toString) -> block + } + + val AtA = A.t %*% A + + // Assert optimizer detects square + H2OEngine.optimizerRewrite(action = AtA) should equal(OpAtA(A)) + + val inCoreAtA = AtA.collect + val inCoreAtAControl = inCoreA.t %*% inCoreA + + (inCoreAtA - inCoreAtAControl).norm should be < 1E-10 + } + + 
test("C = A + B") { + + val inCoreA = dense((1, 2), (3, 4)) + val inCoreB = dense((3, 5), (4, 6)) + + val A = drmParallelize(inCoreA, numPartitions = 2) + val B = drmParallelize(inCoreB, numPartitions = 2) + + val C = A + B + val inCoreC = C.collect + + // Actual + val inCoreCControl = inCoreA + inCoreB + + (inCoreC - inCoreCControl).norm should be < 1E-10 + } + + test("C = A + B, identically partitioned") { + + val inCoreA = dense((1, 2, 3), (3, 4, 5), (5, 6, 7)) + + val A = drmParallelize(inCoreA, numPartitions = 2) + + printf("A.nrow=%d.\n", A.nrow) + + // Create B which would be identically partitioned to A. mapBlock() by default will do the trick. + val B = A.mapBlock() { + case (keys, block) => + val bBlock = block.like() := ((r, c, v) => util.Random.nextDouble()) + keys -> bBlock + } + // Prevent repeated computation non-determinism + .checkpoint() + + val inCoreB = B.collect + + printf("A=\n%s\n", inCoreA) + printf("B=\n%s\n", inCoreB) + + val C = A + B + + val inCoreC = C.collect + + printf("C=\n%s\n", inCoreC) + + // Actual + val inCoreCControl = inCoreA + inCoreB + + (inCoreC - inCoreCControl).norm should be < 1E-10 + } + + + test("C = A + B side test 1") { + + val inCoreA = dense((1, 2), (3, 4)) + val inCoreB = dense((3, 5), (4, 6)) + + val A = drmParallelize(inCoreA, numPartitions = 2) + val B = drmParallelize(inCoreB, numPartitions = 2) + + val C = A + B + val inCoreC = C.collect + + val inCoreD = (A + B).collect + + // Actual + val inCoreCControl = inCoreA + inCoreB + + (inCoreC - inCoreCControl).norm should be < 1E-10 + (inCoreD - inCoreCControl).norm should be < 1E-10 + } + + test("C = A + B side test 2") { + + val inCoreA = dense((1, 2), (3, 4)) + val inCoreB = dense((3, 5), (4, 6)) + + val A = drmParallelize(inCoreA, numPartitions = 2).checkpoint() + val B = drmParallelize(inCoreB, numPartitions = 2) + + val C = A + B + val inCoreC = C.collect + + val inCoreD = (A + B).collect + + // Actual + val inCoreCControl = inCoreA + inCoreB + + (inCoreC - 
inCoreCControl).norm should be < 1E-10 + (inCoreD - inCoreCControl).norm should be < 1E-10 + } + + test("C = A + B side test 3") { + + val inCoreA = dense((1, 2), (3, 4)) + val inCoreB = dense((3, 5), (4, 6)) + + val B = drmParallelize(inCoreB, numPartitions = 2) + // val A = (drmParallelize(inCoreA, numPartitions = 2) + B).checkpoint(CacheHint.MEMORY_ONLY_SER) + val A = (drmParallelize(inCoreA, numPartitions = 2) + B).checkpoint(CacheHint.MEMORY_ONLY) + + val C = A + B + val inCoreC = C.collect + + val inCoreD = (A + B).collect + + // Actual + val inCoreCControl = inCoreA + inCoreB * 2.0 + + (inCoreC - inCoreCControl).norm should be < 1E-10 + (inCoreD - inCoreCControl).norm should be < 1E-10 + } + + test("Ax") { + val inCoreA = dense( + (1, 2), + (3, 4), + (20, 30) + ) + val x = dvec(10, 3) + + val drmA = drmParallelize(inCoreA, numPartitions = 2) + + val ax = (drmA %*% x).collect(::, 0) + + ax should equal(inCoreA %*% x) + } + + test("A'x") { + val inCoreA = dense( + (1, 2), + (3, 4), + (20, 30) + ) + val x = dvec(10, 3, 4) + + val drmA = drmParallelize(inCoreA, numPartitions = 2) + + H2OEngine.optimizerRewrite(drmA.t %*% x) should equal(OpAtx(drmA, x)) + + val atx = (drmA.t %*% x).collect(::, 0) + + atx should equal(inCoreA.t %*% x) + } + + test("colSums, colMeans") { + val inCoreA = dense( + (1, 2), + (3, 4), + (20, 30) + ) + val drmA = drmParallelize(inCoreA, numPartitions = 2) + + drmA.colSums() should equal(inCoreA.colSums()) + drmA.colMeans() should equal(inCoreA.colMeans()) + } + + test("numNonZeroElementsPerColumn") { + val inCoreA = dense( + (0, 2), + (3, 0), + (0, -30) + + ) + val drmA = drmParallelize(inCoreA, numPartitions = 2) + + drmA.numNonZeroElementsPerColumn() should equal(inCoreA.numNonZeroElementsPerColumn()) + } + + test("C = A cbind B, cogroup") { + + val inCoreA = dense((1, 2), (3, 4)) + val inCoreB = dense((3, 5), (4, 6)) + val controlC = dense((1, 2, 3, 5), (3, 4, 4, 6)) + + val A = drmParallelize(inCoreA, numPartitions = 2).checkpoint() 
+ val B = drmParallelize(inCoreB, numPartitions = 2).checkpoint() + + (A.cbind(B) -: controlC).norm should be < 1e-10 + + } + + test("C = A cbind B, zip") { + + val inCoreA = dense((1, 2), (3, 4)) + val controlC = dense((1, 2, 2, 3), (3, 4, 4, 5)) + + val A = drmParallelize(inCoreA, numPartitions = 2).checkpoint() + + (A.cbind(A + 1.0) -: controlC).norm should be < 1e-10 + + } + +} diff --git a/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/ABtSuite.scala b/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/ABtSuite.scala new file mode 100644 index 0000000000..c154985e72 --- /dev/null +++ b/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/ABtSuite.scala @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.h2obindings.ops + +import org.apache.mahout.h2obindings.test.MahoutLocalContext +import org.scalatest.FunSuite +import org.apache.mahout.math.scalabindings._ +import org.apache.mahout.math.drm._ +import org.apache.mahout.h2obindings._ +import org.apache.mahout.h2obindings.drm._ +import RLikeOps._ +import RLikeDrmOps._ +import org.apache.mahout.math.drm._ + +/** Tests for AB' operator algorithms */ +class ABtSuite extends FunSuite with MahoutLocalContext { + + test("ABt") { + val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5)) + val inCoreB = dense((3, 4, 5), (5, 6, 7)) + val A = drmParallelize(m = inCoreA, numPartitions = 3) + val B = drmParallelize(m = inCoreB, numPartitions = 2) + + val ABt = A %*% B.t + + val inCoreMControl = inCoreA %*% inCoreB.t + val inCoreM = ABt.collect + + assert((inCoreM - inCoreMControl).norm < 1E-5) + + println(inCoreM) + + } + +} diff --git a/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AewBSuite.scala b/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AewBSuite.scala new file mode 100644 index 0000000000..3a3347e15d --- /dev/null +++ b/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AewBSuite.scala @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.h2obindings.ops + +import org.scalatest.FunSuite +import org.apache.mahout.h2obindings.test.MahoutLocalContext +import org.apache.mahout.math.scalabindings._ +import RLikeOps._ +import org.apache.mahout.math.drm._ +import RLikeDrmOps._ +import org.apache.mahout.math.drm.logical._ + +/** Elementwise matrix operation tests */ +class AewBSuite extends FunSuite with MahoutLocalContext { + + test("A * B Hadamard") { + val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (7, 8, 9)) + val inCoreB = dense((3, 4, 5), (5, 6, 7), (0, 0, 0), (9, 8, 7)) + val A = drmParallelize(m = inCoreA, numPartitions = 2) + val B = drmParallelize(m = inCoreB) + + val M = A * B + + val inCoreM = M.collect + val inCoreMControl = inCoreA * inCoreB + + assert((inCoreM - inCoreMControl).norm < 1E-10) + + } + + test("A + B Elementwise") { + val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (7, 8, 9)) + val inCoreB = dense((3, 4, 5), (5, 6, 7), (0, 0, 0), (9, 8, 7)) + val A = drmParallelize(m = inCoreA, numPartitions = 2) + val B = drmParallelize(m = inCoreB) + + val M = A + B + + val inCoreM = M.collect + val inCoreMControl = inCoreA + inCoreB + + assert((inCoreM - inCoreMControl).norm < 1E-10) + + } + + test("A - B Elementwise") { + val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (7, 8, 9)) + val inCoreB = dense((3, 4, 5), (5, 6, 7), (0, 0, 0), (9, 8, 7)) + val A = drmParallelize(m = inCoreA, numPartitions = 2) + val B = drmParallelize(m = inCoreB) + + val M = A - B + + val inCoreM = M.collect + val inCoreMControl = inCoreA - inCoreB + + assert((inCoreM - inCoreMControl).norm < 1E-10) + + } + + test("A / B Elementwise") { + val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 0), (7, 8, 9)) + val inCoreB = dense((3, 4, 5), (5, 6, 7), (10, 20, 30), (9, 8, 7)) + val A = drmParallelize(m = inCoreA, numPartitions = 2) + val B = drmParallelize(m = inCoreB) + 
+ val M = A / B + + val inCoreM = M.collect + val inCoreMControl = inCoreA / inCoreB + + assert((inCoreM - inCoreMControl).norm < 1E-10) + + } + +} diff --git a/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AtASuite.scala b/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AtASuite.scala new file mode 100644 index 0000000000..47cf14e438 --- /dev/null +++ b/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AtASuite.scala @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.h2obindings.ops + +import org.scalatest.FunSuite +import org.apache.mahout.h2obindings.test.MahoutLocalContext +import org.apache.mahout.math.scalabindings._ +import org.apache.mahout.math.drm._ +import org.apache.mahout.h2obindings._ +import org.apache.mahout.h2obindings.drm._ +import RLikeOps._ +import RLikeDrmOps._ +import org.apache.mahout.math.drm._ + +/** Tests for {@link XtX} */ +class AtASuite extends FunSuite with MahoutLocalContext { + + test("AtA slim") { + + val inCoreA = dense((1, 2), (2, 3)) + val drmA = drmParallelize(inCoreA) + + val M = drmA.t %*% drmA + val inCoreAtA = M.collect + println(inCoreAtA) + + val expectedAtA = inCoreA.t %*% inCoreA + println(expectedAtA) + + assert(expectedAtA === inCoreAtA) + + } + + +} diff --git a/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AtSuite.scala b/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AtSuite.scala new file mode 100644 index 0000000000..98dbe8ea46 --- /dev/null +++ b/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AtSuite.scala @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.h2obindings.ops + +import org.scalatest.FunSuite +import org.apache.mahout.h2obindings.test.MahoutLocalContext +import org.apache.mahout.math.scalabindings._ +import org.apache.mahout.math.drm._ +import org.apache.mahout.h2obindings._ +import org.apache.mahout.h2obindings.drm._ +import RLikeOps._ +import RLikeDrmOps._ +import org.apache.mahout.math.drm._ + +/** Tests for A' algorithms */ +class AtSuite extends FunSuite with MahoutLocalContext { + + test("At") { + val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5)) + val A = drmParallelize(m = inCoreA, numPartitions = 2) + + val AtDrm = A.t + val inCoreAt = AtDrm.collect + val inCoreControlAt = inCoreA.t + + println(inCoreAt) + assert((inCoreAt - inCoreControlAt).norm < 1E-5) + + + } +} diff --git a/h2o/src/test/scala/org/apache/mahout/math/cf/CooccurrenceAnalysisSuite.scala b/h2o/src/test/scala/org/apache/mahout/math/cf/CooccurrenceAnalysisSuite.scala new file mode 100644 index 0000000000..3624f5a87b --- /dev/null +++ b/h2o/src/test/scala/org/apache/mahout/math/cf/CooccurrenceAnalysisSuite.scala @@ -0,0 +1,235 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.cf + +import org.apache.mahout.math.drm._ +import org.apache.mahout.math.scalabindings.{MatrixOps, _} +import org.apache.mahout.h2obindings.test.MahoutLocalContext +import org.apache.mahout.test.MahoutSuite +import org.scalatest.FunSuite + +/* values +A = +1 1 0 0 0 +0 0 1 1 0 +0 0 0 0 1 +1 0 0 1 0 + +B = +1 1 1 1 0 +1 1 1 1 0 +0 0 1 0 1 +1 1 0 1 0 + */ + +class CooccurrenceAnalysisSuite extends FunSuite with MahoutSuite with MahoutLocalContext { + + // correct cooccurrence with LLR + final val matrixLLRCoocAtAControl = dense( + (0.0, 1.7260924347106847, 0.0, 0.0, 0.0), + (1.7260924347106847, 0.0, 0.0, 0.0, 0.0), + (0.0, 0.0, 0.0, 1.7260924347106847, 0.0), + (0.0, 0.0, 1.7260924347106847, 0.0, 0.0), + (0.0, 0.0, 0.0, 0.0, 0.0)) + + // correct cross-cooccurrence with LLR + final val matrixLLRCoocBtAControl = dense( + (1.7260924347106847, 0.6795961471815897, 0.6795961471815897, 1.7260924347106847, 0.0), + (1.7260924347106847, 0.6795961471815897, 0.6795961471815897, 1.7260924347106847, 0.0), + (1.7260924347106847, 0.6795961471815897, 0.6795961471815897, 1.7260924347106847, 0.6795961471815897), + (1.7260924347106847, 0.6795961471815897, 0.6795961471815897, 1.7260924347106847, 0.0), + (0.0, 0.0, 0.0, 0.0, 4.498681156950466)) + + + + test("cooccurrence [A'A], [B'A] boolbean data using LLR") { + val a = dense( + (1, 1, 0, 0, 0), + (0, 0, 1, 1, 0), + (0, 0, 0, 0, 1), + (1, 0, 0, 1, 0)) + + val b = dense( + (1, 1, 1, 1, 0), + (1, 1, 1, 1, 0), + (0, 0, 1, 0, 1), + (1, 1, 0, 1, 0)) + + val drmA = drmParallelize(m = a, numPartitions = 2) + val drmB = drmParallelize(m = b, numPartitions = 2) + + //self similarity + val drmCooc = CooccurrenceAnalysis.cooccurrences(drmARaw = drmA, randomSeed = 1, drmBs = Array(drmB)) + val matrixSelfCooc = drmCooc(0).checkpoint().collect + val diffMatrix = matrixSelfCooc.minus(matrixLLRCoocAtAControl) + var n = (new MatrixOps(m = diffMatrix)).norm + n should be < 1E-10 + + //cross similarity + val matrixCrossCooc 
= drmCooc(1).checkpoint().collect + val diff2Matrix = matrixCrossCooc.minus(matrixLLRCoocBtAControl) + n = (new MatrixOps(m = diff2Matrix)).norm + n should be < 1E-10 + + } + + test("cooccurrence [A'A], [B'A] double data using LLR") { + val a = dense( + (100000.0D, 1.0D, 0.0D, 0.0D, 0.0D), + ( 0.0D, 0.0D, 10.0D, 1.0D, 0.0D), + ( 0.0D, 0.0D, 0.0D, 0.0D, 1000.0D), + ( 1.0D, 0.0D, 0.0D, 10.0D, 0.0D)) + + val b = dense( + (10000.0D, 100.0D, 1000.0D, 1.0D, 0.0D), + ( 10.0D, 1.0D, 10000000.0D, 10.0D, 0.0D), + ( 0.0D, 0.0D, 1000.0D, 0.0D, 100.0D), + ( 100.0D, 1.0D, 0.0D, 100000.0D, 0.0D)) + + val drmA = drmParallelize(m = a, numPartitions = 2) + val drmB = drmParallelize(m = b, numPartitions = 2) + + //self similarity + val drmCooc = CooccurrenceAnalysis.cooccurrences(drmARaw = drmA, drmBs = Array(drmB)) + val matrixSelfCooc = drmCooc(0).checkpoint().collect + val diffMatrix = matrixSelfCooc.minus(matrixLLRCoocAtAControl) + var n = (new MatrixOps(m = diffMatrix)).norm + n should be < 1E-10 + + //cross similarity + val matrixCrossCooc = drmCooc(1).checkpoint().collect + val diff2Matrix = matrixCrossCooc.minus(matrixLLRCoocBtAControl) + n = (new MatrixOps(m = diff2Matrix)).norm + n should be < 1E-10 + } + + test("cooccurrence [A'A], [B'A] integer data using LLR") { + val a = dense( + ( 1000, 10, 0, 0, 0), + ( 0, 0, -10000, 10, 0), + ( 0, 0, 0, 0, 100), + (10000, 0, 0, 1000, 0)) + + val b = dense( + ( 100, 1000, -10000, 10000, 0), + (10000, 1000, 100, 10, 0), + ( 0, 0, 10, 0, -100), + ( 10, 100, 0, 1000, 0)) + + val drmA = drmParallelize(m = a, numPartitions = 2) + val drmB = drmParallelize(m = b, numPartitions = 2) + + //self similarity + val drmCooc = CooccurrenceAnalysis.cooccurrences(drmARaw = drmA, drmBs = Array(drmB)) + //var cp = drmSelfCooc(0).checkpoint() + //cp.writeDRM("/tmp/cooc-spark/")//to get values written + val matrixSelfCooc = drmCooc(0).checkpoint().collect + val diffMatrix = matrixSelfCooc.minus(matrixLLRCoocAtAControl) + var n = (new MatrixOps(m = 
diffMatrix)).norm + n should be < 1E-10 + + //cross similarity + val matrixCrossCooc = drmCooc(1).checkpoint().collect + val diff2Matrix = matrixCrossCooc.minus(matrixLLRCoocBtAControl) + n = (new MatrixOps(m = diff2Matrix)).norm + n should be < 1E-10 + } + + test("LLR calc") { + val A = dense( + (1, 1, 0, 0, 0), + (0, 0, 1, 1, 0), + (0, 0, 0, 0, 1), + (1, 0, 0, 1, 0)) + + val AtA = A.transpose().times(A) + + /* AtA is: + 0 => {0:2.0,1:1.0,3:1.0} + 1 => {0:1.0,1:1.0} + 2 => {2:1.0,3:1.0} + 3 => {0:1.0,2:1.0,3:2.0} + 4 => {4:1.0} + + val AtAd = dense( + (2, 1, 0, 1, 0), + (1, 1, 0, 0, 0), + (0, 0, 1, 1, 0), + (1, 0, 1, 2, 0), + (0, 0, 0, 0, 1)) + + val AtAdNoSelfCooc = dense( + (0, 1, 0, 1, 0), + (1, 0, 0, 0, 0), + (0, 0, 0, 1, 0), + (1, 0, 1, 0, 0), + (0, 0, 0, 0, 0)) + + for (MatrixSlice row : cooccurrence) { + for (Vector.Element element : row.vector().nonZeroes()) { + long k11 = (long) element.get();// = 1 + long k12 = (long) (rowSums.get(row.index()) - k11);// = 0 + long k21 = (long) (colSums.get(element.index()) - k11);// = 1 + long k22 = (long) (total - k11 - k12 - k21);// = 2 + double score = LogLikelihood.rootLogLikelihoodRatio(k11, k12, k21, k22); + element.set(score); + } + } + + for some reason the hadoop version returns the following + return 1.0 - 1.0 / (1.0 + logLikelihood); + so not a pure llr or root llr + + */ + + //item (1,0) + val numInteractionsWithAandB = 1L + val numInteractionsWithA = 1L + val numInteractionsWithB = 2L + val numInteractions = 6l + + val llr = CooccurrenceAnalysis.logLikelihoodRatio(numInteractionsWithA, numInteractionsWithB, numInteractionsWithAandB, numInteractions) + + assert(llr == 2.6341457841558764) // value calculated by hadoop itemsimilairty + } + + test("downsampling by number per row") { + val a = dense( + (1, 1, 1, 1, 0), + (1, 1, 1, 1, 1), + (0, 0, 0, 0, 1), + (1, 1, 0, 1, 0)) + val drmA: DrmLike[Int] = drmParallelize(m = a, numPartitions = 2) + + val downSampledDrm = 
CooccurrenceAnalysis.sampleDownAndBinarize(drmA, 0xdeadbeef, 4) + //count non-zero values, should be == 7 + var numValues = 0 + val m = downSampledDrm.collect + val it = m.iterator() + while (it.hasNext) { + val v = it.next().vector() + val nonZeroIt = v.nonZeroes().iterator() + while (nonZeroIt.hasNext) { + numValues += 1 + nonZeroIt.next() + } + } + + assert(numValues == 8) //Don't change the random seed or this may fail. + } +} From ecac400236990b02abe03ca73d5f4288422fce0d Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Wed, 9 Jul 2014 03:00:26 -0700 Subject: [PATCH 13/34] MAHOUT-1500: add README Signed-off-by: Anand Avati --- h2o/README.md | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 h2o/README.md diff --git a/h2o/README.md b/h2o/README.md new file mode 100644 index 0000000000..9f647b3f7e --- /dev/null +++ b/h2o/README.md @@ -0,0 +1,65 @@ +# Introduction + +This document demonstrates the integration between Mahout (http://mahout.apache.org) and H2O (http://www.h2o.ai). The integration provides a H2O backend to the Mahout algebra DSL (similar to the Spark backend.) + +## Setup + +Since the integration is still in the early stages, the demonstration will be using git and source (rather than a pre-built binary distribution.) + + sh:~$ git clone git://github.com/apache/mahout + sh:~$ cd mahout + sh:~/mahout$ git checkout -b MAHOUT-1500 + sh:~/mahout$ git pull git://github.com/avati/mahout MAHOUT-1500 + sh:~/mahout$ mvn -DskipTests install package + +The last step (mvn package) is necessary only because we are working off the source repository and do not yet use binary distributions of either Mahout or H2O. + +The integration depends on h2o-core maven artifact. 
This can either be fetched automatically through sonatype, or can be installed locally from source (run 'gradle install -x test' in http://github.com/0xdata/h2o-dev)
+
+## Test
+
+The integration with H2O can be used in either a local mode (single node) or a clustered mode.
+
+### Simple (single node/local) test
+
+Testing in local mode is pretty straightforward. Just run 'mvn test' as shown below.
+
+    sh:~/mahout$ cd h2o
+    sh:~/mahout/h2o$ mvn test
+    ...
+    ...
+    All tests passed.
+    ...
+    sh:~/mahout/h2o$
+
+### Distributed test
+
+H2O is fundamentally a peer-to-peer system. H2O nodes join together to form a cloud on which high performance distributed math can be executed. Each node joins a cloud of a given name. Multiple clouds can exist on the same network at the same time as long as their names are different. Multiple nodes can exist on the same server as well (even belonging to the same cloud.)
+
+The Mahout H2O integration is fit into this model by having N-1 "worker" nodes and one driver node, all belonging to the same cloud name. The default cloud name used for the integration is "mah2out". Clouds have to be spun up per task/job.
+
+First bring up worker nodes:
+
+    host-1:~/mahout$ ./bin/mahout h2o-node
+    ...
+    .. INFO: Cloud of size 1 formed [/W.X.Y.Z:54321]
+
+Similarly,
+
+    host-2:~/mahout$ ./bin/mahout h2o-node
+    ...
+    .. INFO: Cloud of size 2 formed [/A.B.C.D:54322]
+
+... and so on. For the purpose of testing multiple (even all) instances can be run on the same system too.
+
+The nodes discover each other over a multicast channel and establish consensus with Paxos. Next, start the driver just like running in local mode.
+
+    host-N:~/mahout/h2o$ mvn test
+    ...
+    .. INFO: Cloud of size 3 formed [/E.F.G.H:54323]
+    ...
+    All tests passed.
+    ...
+    host-N:~/mahout/h2o$
+
+The workers have to be restarted when the driver node terminates (automating this is a future task.)
\ No newline at end of file From 68f2a441360e87776f80b68150c1badf23563cbe Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Thu, 10 Jul 2014 20:59:03 -0700 Subject: [PATCH 14/34] MAHOUT-1500: cleanup - add code comments - style fixes - minor cleanups Signed-off-by: Anand Avati --- .../mahout/h2obindings/H2OBlockMatrix.java | 2 +- .../apache/mahout/h2obindings/H2OContext.java | 4 ++ .../apache/mahout/h2obindings/H2OHdfs.java | 5 +- .../apache/mahout/h2obindings/H2OHelper.java | 51 ++++++++----- .../apache/mahout/h2obindings/ops/ABt.java | 41 +++++++---- .../apache/mahout/h2obindings/ops/AewB.java | 60 ++++++++++------ .../mahout/h2obindings/ops/AewScalar.java | 57 +++++++++------ .../org/apache/mahout/h2obindings/ops/At.java | 33 ++++++--- .../apache/mahout/h2obindings/ops/AtA.java | 36 +++++++--- .../apache/mahout/h2obindings/ops/AtB.java | 35 ++++++--- .../apache/mahout/h2obindings/ops/Atx.java | 24 ++++++- .../org/apache/mahout/h2obindings/ops/Ax.java | 39 ++++++---- .../apache/mahout/h2obindings/ops/Cbind.java | 66 ++++++++++------- .../mahout/h2obindings/ops/MapBlock.java | 26 ++++++- .../apache/mahout/h2obindings/ops/Par.java | 69 +++++++++++------- .../mahout/h2obindings/ops/RowRange.java | 71 ++++++++++--------- .../h2obindings/ops/TimesRightMatrix.java | 61 ++++++++++------ .../apache/mahout/h2obindings/H2OEngine.scala | 3 +- .../h2obindings/drm/CheckpointedDrmH2O.scala | 3 +- 19 files changed, 449 insertions(+), 237 deletions(-) diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OBlockMatrix.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OBlockMatrix.java index 11a57b172f..10dd74a2ce 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OBlockMatrix.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OBlockMatrix.java @@ -23,7 +23,7 @@ import org.apache.mahout.math.DenseMatrix; import org.apache.mahout.math.SparseMatrix; -import water.fvec.*; +import water.fvec.Chunk; /* * A Matrix implementation to represent a 
vertical Block of DRM. diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OContext.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OContext.java index 2ee9e45dec..1307ef8424 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OContext.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OContext.java @@ -22,6 +22,10 @@ public class H2OContext { String masterURL; + /* @masterURL should actually be the cloud name (name of cluster) to which + all the H2O worker nodes "join into". This is not a hostname or IP address + of a server, but a string which all cluster members agree on. + */ public H2OContext(String _masterURL) { masterURL = _masterURL; diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java index 70866ed054..2783cb2c1d 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java @@ -22,8 +22,9 @@ import scala.Tuple2; -import water.fvec.*; -import water.*; +import water.fvec.Frame; +import water.fvec.Vec; +import water.Futures; import org.apache.mahout.math.Vector; import org.apache.mahout.math.DenseVector; diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java index 7f65431a6d..399c593c94 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java @@ -17,10 +17,21 @@ package org.apache.mahout.h2obindings; -import org.apache.mahout.math.*; - -import water.*; -import water.fvec.*; +import org.apache.mahout.math.Matrix; +import org.apache.mahout.math.Vector; +import org.apache.mahout.math.DenseMatrix; +import org.apache.mahout.math.SparseMatrix; +import org.apache.mahout.math.DenseVector; + +import water.MRTask; +import water.Futures; +import water.Key; +import water.DKV; +import 
water.fvec.Frame; +import water.fvec.Vec; +import water.fvec.Chunk; +import water.fvec.NewChunk; +import water.fvec.C0LChunk; import java.io.File; import java.io.IOException; @@ -36,11 +47,11 @@ public class H2OHelper { Is the matrix sparse? If the number of missing elements is 32 x times the number of present elements, treat it as sparse */ - public static boolean is_sparse (Frame frame) { + public static boolean is_sparse(Frame frame) { long rows = frame.numRows(); long cols = frame.numCols(); - + /* MRTask to aggregate precalculated per-chunk sparse lengths */ class MRTaskNZ extends MRTask { long _sparselen; public void map(Chunk chks[]) { @@ -63,7 +74,7 @@ public void reduce(MRTaskNZ other) { Dense Matrix depending on number of missing elements in Frame. */ - public static Matrix matrix_from_frame (Frame frame, Vec labels) { + public static Matrix matrix_from_frame(Frame frame, Vec labels) { Matrix m; if (is_sparse (frame)) @@ -72,6 +83,7 @@ public static Matrix matrix_from_frame (Frame frame, Vec labels) { m = new DenseMatrix ((int)frame.numRows(), frame.numCols()); int c = 0; + /* Fill matrix, column at a time */ for (Vec v : frame.vecs()) { for (int r = 0; r < frame.numRows(); r++) { double d = 0.0; @@ -81,6 +93,7 @@ public static Matrix matrix_from_frame (Frame frame, Vec labels) { c++; } + /* If string keyed, set the stings as rowlabels */ if (labels != null) { HashMap map = new HashMap(); for (long i = 0; i < labels.length(); i++) { @@ -97,7 +110,7 @@ public static Matrix matrix_from_frame (Frame frame, Vec labels) { H2O precalculates means in a Vec, and a Vec corresponds to a column. */ - public static Vector colMeans (Frame frame) { + public static Vector colMeans(Frame frame) { double means[] = new double[frame.numCols()]; for (int i = 0; i < frame.numCols(); i++) means[i] = frame.vecs()[i].mean(); @@ -111,7 +124,7 @@ public static Vector colMeans (Frame frame) { WARNING: Vulnerable to overflow. No way around it. 
*/ - public static Vector colSums (Frame frame) { + public static Vector colSums(Frame frame) { class MRTaskSum extends MRTask { public double _sums[]; public void map(Chunk chks[]) { @@ -136,7 +149,7 @@ public void reduce(MRTaskSum other) { WARNING: Vulnerable to overflow. No way around it. */ - public static double sumSqr (Frame frame) { + public static double sumSqr(Frame frame) { class MRTaskSumSqr extends MRTask { public double _sumSqr; public void map(Chunk chks[]) { @@ -160,7 +173,7 @@ public void reduce(MRTaskSumSqr other) { WARNING: Vulnerable to overflow. No way around it. */ - public static Vector nonZeroCnt (Frame frame) { + public static Vector nonZeroCnt(Frame frame) { class MRTaskNonZero extends MRTask { public double _sums[]; public void map(Chunk chks[]) { @@ -181,6 +194,7 @@ public void reduce(MRTaskNonZero other) { return new DenseVector(new MRTaskNonZero().doAll(frame)._sums); } + /* Convert String->Integer map to Integer->String map */ private static Map reverse_map(Map map) { if (map == null) return null; @@ -194,7 +208,7 @@ private static Map reverse_map(Map map) { return rmap; } - private static int chunk_size (long nrow, int ncol, int min, int exact) { + private static int chunk_size(long nrow, int ncol, int min, int exact) { int chunk_sz; int parts_hint = Math.max(min, exact); @@ -220,18 +234,15 @@ private static int chunk_size (long nrow, int ncol, int min, int exact) { /* Ingest a Matrix into an H2O Frame. H2O Frame is the "backing" data structure behind CheckpointedDrm. Steps: - - - @cols is the number of columsn in the Matrix - - An H2O Vec represents an H2O Column. - - Create @cols number of Vec's. 
- - Load data into Vecs by routing them through NewChunks */ - public static Tuple2 frame_from_matrix (Matrix m, int min_hint, int exact_hint) { + public static Tuple2 frame_from_matrix(Matrix m, int min_hint, int exact_hint) { + /* First create an empty (0-filled) frame of the required dimensions */ Frame frame = empty_frame (m.rowSize(), m.columnSize(), min_hint, exact_hint); Vec labels = null; Vec.Writer writers[] = new Vec.Writer[m.columnSize()]; Futures closer = new Futures(); + /* "open" vectors for writing efficiently in bulk */ for (int i = 0; i < writers.length; i++) writers[i] = frame.vecs()[i].open(); @@ -242,8 +253,10 @@ public static Tuple2 frame_from_matrix (Matrix m, int min_hint, int e for (int c = 0; c < m.columnSize(); c++) writers[c].close(closer); + /* If string labeled matrix, create aux Vec */ Map map = m.getRowLabelBindings(); if (map != null) { + /* label vector must be similarly partitioned like the Frame */ labels = frame.anyVec().makeZero(); Vec.Writer writer = labels.open(); Map rmap = reverse_map(map); @@ -259,7 +272,7 @@ public static Tuple2 frame_from_matrix (Matrix m, int min_hint, int e return new Tuple2(frame,labels); } - public static Frame empty_frame (long nrow, int ncol, int min_hint, int exact_hint) { + public static Frame empty_frame(long nrow, int ncol, int min_hint, int exact_hint) { int chunk_sz = chunk_size (nrow, ncol, min_hint, exact_hint); int nchunks = (int) ((nrow - 1) / chunk_sz) + 1; /* Final number of Chunks per Vec */ Futures fs = new Futures(); diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java index 4389bec1a7..74fbbc4a86 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java @@ -19,8 +19,12 @@ import org.apache.mahout.h2obindings.H2OHelper; -import water.*; -import water.fvec.*; +import water.MRTask; +import water.fvec.Frame; +import 
water.fvec.Vec; +import water.fvec.Chunk; +import water.fvec.NewChunk; + import scala.Tuple2; public class ABt { @@ -29,21 +33,32 @@ public static Tuple2 ABt(Tuple2 TA, Tuple2 TB) Frame A = TA._1(); Vec VA = TA._2(); final Frame B = TB._1(); + int ABt_cols = (int)B.numRows(); + + /* ABt is written into ncs[] with an MRTask on A, and therefore will + be similarly partitioned as A. - class MRTaskABt extends MRTask { - public void map(Chunk chks[], NewChunk ncs[]) { - for (int c = 0; c < ncs.length; c++) { - for (int r = 0; r < chks[0].len(); r++) { - double v = 0; - for (int i = 0; i < chks.length; i++) { - v += (chks[i].at0(r) * B.vecs()[i].at(c)); + chks.length == A.numCols() (== B.numCols()) + ncs.length == ABt_cols (B.numRows()) + */ + Frame ABt = new MRTask() { + public void map(Chunk chks[], NewChunk ncs[]) { + int chunk_size = chks[0].len(); + Vec B_vecs[] = B.vecs(); + + for (int c = 0; c < ncs.length; c++) { + for (int r = 0; r < chunk_size; r++) { + double v = 0; + for (int i = 0; i < chks.length; i++) { + v += (chks[i].at0(r) * B_vecs[i].at(c)); + } + ncs[c].addNum(v); } - ncs[c].addNum(v); } } - } - } - Frame ABt = new MRTaskABt().doAll((int)B.numRows(),A).outputFrame(null,null); + }.doAll(ABt_cols, A).outputFrame(null, null); + + /* Carry forward labels of A blindly into ABt */ return new Tuple2(ABt, VA); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java index 4e0468f3b7..cba0d7dbc9 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java @@ -19,8 +19,12 @@ import org.apache.mahout.h2obindings.H2OHelper; -import water.*; -import water.fvec.*; +import water.MRTask; +import water.fvec.Frame; +import water.fvec.Vec; +import water.fvec.Chunk; +import water.fvec.NewChunk; + import scala.Tuple2; public class AewB { @@ -29,32 +33,42 @@ public static Tuple2 AewB(Tuple2 AT, Tuple2 BT, 
final Frame A = AT._1(); final Frame B = BT._1(); Vec VA = AT._2(); + int AewB_cols = A.numCols(); + + /* AewB is written into ncs[] with an MRTask on A, and therefore will + be similarly partitioned as A. - class MRTaskAewB extends MRTask { - private double opfn (String op, double a, double b) { - if (a == 0.0 && b == 0.0) + B may or may not be similarly partitioned as A, but must have the + same dimensions of A. + */ + Frame AewB = new MRTask() { + private double opfn(String op, double a, double b) { + if (a == 0.0 && b == 0.0) + return 0.0; + if (op.equals("+")) + return a + b; + else if (op.equals("-")) + return a - b; + else if (op.equals("*")) + return a * b; + else if (op.equals("/")) + return a / b; return 0.0; - if (op.equals("+")) - return a + b; - else if (op.equals("-")) - return a - b; - else if (op.equals("*")) - return a * b; - else if (op.equals("/")) - return a / b; - return 0.0; - } - public void map(Chunk chks[], NewChunk ncs[]) { - long start = chks[0].start(); - for (int c = 0; c < chks.length; c++) { - for (int r = 0; r < chks[0].len(); r++) { - ncs[c].addNum(opfn(op, chks[c].at0(r), B.vecs()[c].at(start+r))); + } + public void map(Chunk chks[], NewChunk ncs[]) { + int chunk_size = chks[0].len(); + Vec B_vecs[] = B.vecs(); + long start = chks[0].start(); + + for (int c = 0; c < chks.length; c++) { + for (int r = 0; r < chunk_size; r++) { + ncs[c].addNum(opfn(op, chks[c].at0(r), B_vecs[c].at(start + r))); + } } } - } - } - Frame AewB = new MRTaskAewB().doAll(A.numCols(), A).outputFrame(A.names(), A.domains()); + }.doAll(AewB_cols, A).outputFrame(null, null); + /* Carry forward labels of A blindly into ABt */ return new Tuple2(AewB, VA); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java index 42c17c42be..6af4991b39 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java +++ 
b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java @@ -19,8 +19,12 @@ import org.apache.mahout.h2obindings.H2OHelper; -import water.*; -import water.fvec.*; +import water.MRTask; +import water.fvec.Frame; +import water.fvec.Vec; +import water.fvec.Chunk; +import water.fvec.NewChunk; + import scala.Tuple2; public class AewScalar { @@ -28,31 +32,38 @@ public class AewScalar { public static Tuple2 AewScalar(final Tuple2 TA, final double s, final String op) { Frame A = TA._1(); Vec VA = TA._2(); + int AewScalar_cols = A.numCols(); - class MRTaskAewScalar extends MRTask { - private double opfn (String op, double a, double b) { - if (a == 0.0 && b == 0.0) + /* AewScalar is written into ncs[] with an MRTask on A, and therefore will + be similarly partitioned as A. + */ + Frame AewScalar = new MRTask() { + private double opfn(String op, double a, double b) { + if (a == 0.0 && b == 0.0) + return 0.0; + if (op.equals("+")) + return a + b; + else if (op.equals("-")) + return a - b; + else if (op.equals("*")) + return a * b; + else if (op.equals("/")) + return a / b; return 0.0; - if (op.equals("+")) - return a + b; - else if (op.equals("-")) - return a - b; - else if (op.equals("*")) - return a * b; - else if (op.equals("/")) - return a / b; - return 0.0; - } - public void map(Chunk chks[], NewChunk ncs[]) { - long start = chks[0].start(); - for (int c = 0; c < chks.length; c++) { - for (int r = 0; r < chks[0].len(); r++) { - ncs[c].addNum(opfn(op, chks[c].at0(r), s)); + } + public void map(Chunk chks[], NewChunk ncs[]) { + int chunk_size = chks[0].len(); + long start = chks[0].start(); + + for (int c = 0; c < chks.length; c++) { + for (int r = 0; r < chunk_size; r++) { + ncs[c].addNum(opfn(op, chks[c].at0(r), s)); + } } } - } - } - Frame AewScalar = new MRTaskAewScalar().doAll(A.numCols(), A).outputFrame(A.names(), A.domains()); + }.doAll(AewScalar_cols, A).outputFrame(null, null); + + /* Carry forward labels of A blindly into ABt */ return new 
Tuple2(AewScalar, VA); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java index cba5c5ef68..c7cc7dd8a1 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java @@ -19,26 +19,41 @@ import org.apache.mahout.h2obindings.H2OHelper; -import water.*; -import water.fvec.*; +import water.MRTask; +import water.fvec.Frame; +import water.fvec.Vec; +import water.fvec.Chunk; +import water.fvec.NewChunk; + import scala.Tuple2; public class At { /* Calculate A' (transpose) */ public static Tuple2 At(Tuple2 T) { final Frame A = T._1(); - Frame At = H2OHelper.empty_frame (A.numCols(), (int)A.numRows(), -1, -1); - class MRTaskAt extends MRTask { + /* First create a new frame of the required dimensions, A.numCols() rows + and A.numRows() columns. + */ + Frame At = H2OHelper.empty_frame(A.numCols(), (int)A.numRows(), -1, -1); + + /* Execute MRTask on the new frame, and fill each cell (initially 0) by + pulling in the appropriate value from A. 
+ */ + new MRTask() { public void map(Chunk chks[]) { + int chunk_size = chks[0].len(); long start = chks[0].start(); + Vec A_vecs[] = A.vecs(); + for (int c = 0; c < chks.length; c++) { - for (int r = 0; r < chks[0].len(); r++) { - chks[c].set0(r, A.vecs()[(int)(start+r)].at(c)); + for (int r = 0; r < chunk_size; r++) { + chks[c].set0(r, A_vecs[(int)(start+r)].at(c)); } } } - } - new MRTaskAt().doAll(At); - return new Tuple2(At,null); + }.doAll(At); + + /* At is NOT similarly partitioned as A, drop labels */ + return new Tuple2(At, null); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java index a099f9eda7..dba03a1139 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java @@ -19,30 +19,46 @@ import org.apache.mahout.h2obindings.H2OHelper; -import water.*; -import water.fvec.*; +import water.MRTask; +import water.fvec.Frame; +import water.fvec.Vec; +import water.fvec.Chunk; +import water.fvec.NewChunk; + import scala.Tuple2; public class AtA { /* Calculate A'A */ public static Tuple2 AtA(Tuple2 TA) { final Frame A = TA._1(); - Frame AtA = H2OHelper.empty_frame (A.numCols(), A.numCols(), -1, -1); - class MRTaskAtA extends MRTask { + /* First create an empty Frame of the required dimensions */ + Frame AtA = H2OHelper.empty_frame(A.numCols(), A.numCols(), -1, -1); + + /* Execute MRTask on the new Frame, and fill each cell (initially 0) by + computing appropriate values from A. 
+ + chks.length == A.numCols() + */ + new MRTask() { public void map(Chunk chks[]) { + int chunk_size = chks[0].len(); long start = chks[0].start(); + Vec A_vecs[] = A.vecs(); + long A_rows = A.numRows(); + for (int c = 0; c < chks.length; c++) { - for (int r = 0; r < chks[0].len(); r++) { + for (int r = 0; r < chunk_size; r++) { double v = 0; - for (int i = 0; i < A.numRows(); i++) { - v += (A.vecs()[(int)(start+r)].at(i) * A.vecs()[c].at(i)); + for (long i = 0; i < A_rows; i++) { + v += (A_vecs[(int)(start+r)].at(i) * A_vecs[c].at(i)); } chks[c].set0(r, v); } } } - } - new MRTaskAtA().doAll(AtA); - return new Tuple2(AtA,null); + }.doAll(AtA); + + /* AtA is NOT similarly partitioned as A, drop labels */ + return new Tuple2(AtA, null); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java index f30f3f78b6..c229e43913 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java @@ -19,8 +19,12 @@ import org.apache.mahout.h2obindings.H2OHelper; -import water.*; -import water.fvec.*; +import water.MRTask; +import water.fvec.Frame; +import water.fvec.Vec; +import water.fvec.Chunk; +import water.fvec.NewChunk; + import scala.Tuple2; public class AtB { @@ -29,24 +33,35 @@ public static Tuple2 AtB(Tuple2 TA, Tuple2 TB) final Frame A = TA._1(); final Frame B = TB._1(); - Frame AtB = H2OHelper.empty_frame (A.numCols(), B.numCols(), -1, -1); + /* First create an empty frame of the required dimensions */ + Frame AtB = H2OHelper.empty_frame(A.numCols(), B.numCols(), -1, -1); + + /* Execute MRTask on the new Frame, and fill each cell (initially 0) by + computing appropriate values from A and B. 
- class MRTaskAtB extends MRTask { + chks.length == B.numCols() + */ + new MRTask() { public void map(Chunk chks[]) { + int chunk_size = chks[0].len(); long start = chks[0].start(); + long A_rows = A.numRows(); + Vec A_vecs[] = A.vecs(); + Vec B_vecs[] = B.vecs(); + for (int c = 0; c < chks.length; c++) { - for (int r = 0; r < chks[0].len(); r++) { + for (int r = 0; r < chunk_size; r++) { double v = 0; - for (int i = 0; i < A.numRows(); i++) { - v += (A.vecs()[(int)(start+r)].at(i) * B.vecs()[c].at(i)); + for (long i = 0; i < A_rows; i++) { + v += (A_vecs[(int)(start+r)].at(i) * B_vecs[c].at(i)); } chks[c].set0(r, v); } } } - } + }.doAll(AtB); - new MRTaskAtB().doAll(AtB); - return new Tuple2(AtB,null); + /* AtB is NOT similarly partitioned as A, drop labels */ + return new Tuple2(AtB, null); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java index 7d6f0a4419..3c6ee49fa6 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java @@ -24,8 +24,12 @@ import org.apache.mahout.h2obindings.H2OHelper; import org.apache.mahout.h2obindings.drm.H2OBCast; -import water.*; -import water.fvec.*; +import water.MRTask; +import water.fvec.Frame; +import water.fvec.Vec; +import water.fvec.Chunk; +import water.fvec.NewChunk; + import scala.Tuple2; public class Atx { @@ -33,13 +37,22 @@ public class Atx { public static Tuple2 Atx(Tuple2 TA, Vector x) { Frame A = TA._1(); final H2OBCast bx = new H2OBCast(x); + + /* A'x is computed into _atx[] with an MRTask on A (with + x available as a Broadcast + + x.size() == A.numRows() + _atx.length == chks.length == A.numCols() + */ class MRTaskAtx extends MRTask { double _atx[]; public void map(Chunk chks[]) { + int chunk_size = chks[0].len(); Vector x = bx.value(); long start = chks[0].start(); + _atx = new double[chks.length]; - for (int r = 0; r < chks[0].len(); r++) { + 
for (int r = 0; r < chunk_size; r++) { double d = x.getQuick((int)start + r); for (int c = 0; c < chks.length; c++) { _atx[c] += (chks[c].at0(r) * d); @@ -51,6 +64,11 @@ public void reduce(MRTaskAtx other) { _atx[i] += other._atx[i]; } } + + /* Take the result in ._atx[], and convert into a Frame + using existing helper functions (creating a Matrix + along the way for the Helper) + */ Vector v = new DenseVector(new MRTaskAtx().doAll(A)._atx); Matrix m = new DenseMatrix(A.numCols(), 1); m.assignColumn(0, v); diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java index 7242f57681..604e5db531 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java @@ -21,8 +21,12 @@ import org.apache.mahout.h2obindings.H2OHelper; import org.apache.mahout.h2obindings.drm.H2OBCast; -import water.*; -import water.fvec.*; +import water.MRTask; +import water.fvec.Frame; +import water.fvec.Vec; +import water.fvec.Chunk; +import water.fvec.NewChunk; + import scala.Tuple2; public class Ax { @@ -31,19 +35,28 @@ public static Tuple2 Ax(Tuple2 TA, Vector x) { Frame A = TA._1(); Vec VA = TA._2(); final H2OBCast bx = new H2OBCast(x); - class MRTaskAx extends MRTask { - public void map(Chunk chks[], NewChunk nc) { - Vector x = bx.value(); - for (int r = 0; r < chks[0].len(); r++) { - double v = 0; - for (int c = 0; c < chks.length; c++) { - v += (chks[c].at0(r) * x.getQuick(c)); + + /* Ax is written into nc (single element, not array) with an MRTask on A, + and therefore will be similarly partitioned as A. 
+ + x.size() == A.numCols() == chks.length + */ + Frame Ax = new MRTask() { + public void map(Chunk chks[], NewChunk nc) { + int chunk_size = chks[0].len(); + Vector x = bx.value(); + + for (int r = 0; r < chunk_size; r++) { + double v = 0; + for (int c = 0; c < chks.length; c++) { + v += (chks[c].at0(r) * x.getQuick(c)); + } + nc.addNum(v); } - nc.addNum(v); } - } - } - Frame Ax = new MRTaskAx().doAll(1, A).outputFrame(A.names(), A.domains()); + }.doAll(1, A).outputFrame(null, null); + + /* Carry forward labels of A blindly into ABt */ return new Tuple2(Ax, VA); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Cbind.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Cbind.java index e26635d678..de1e95245c 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Cbind.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Cbind.java @@ -17,60 +17,78 @@ package org.apache.mahout.h2obindings.ops; -import water.*; -import water.fvec.*; +import water.MRTask; +import water.fvec.Frame; +import water.fvec.Vec; +import water.fvec.Chunk; +import water.fvec.NewChunk; + import scala.Tuple2; import org.apache.mahout.h2obindings.H2OHelper; public class Cbind { + /* R's cbind like operator, on TA and TB */ + public static Tuple2 Cbind(Tuple2 TA, Tuple2 TB) { + Frame fra = TA._1(); + Vec va = TA._2(); + Frame frb = TB._1(); + Vec vb = TB._2(); + + /* If A and B are similarly partitioned, .. */ + if (fra.anyVec().group() == frb.anyVec().group()) + /* .. then, do a light weight zip() */ + return zip(fra, va, frb, vb); + else + /* .. 
else, do a heavy weight join() which involves moving data over the wire */ + return join(fra, va, frb, vb); + } + + /* Light weight zip(), no data movement */ private static Tuple2 zip(final Frame fra, final Vec va, final Frame frb, final Vec vb) { + /* Create a new Vec[] to hold the concatenated list of A and B's column vectors */ Vec vecs[] = new Vec[fra.vecs().length + frb.vecs().length]; int d = 0; + /* fill A's column vectors */ for (Vec vfra : fra.vecs()) vecs[d++] = vfra; + /* and B's */ for (Vec vfrb : frb.vecs()) vecs[d++] = vfrb; + /* and create a new Frame with the combined list of column Vecs */ Frame fr = new Frame(vecs); + /* Finally, inherit A's string labels into the result */ return new Tuple2 (fr, va); } + /* heavy weight join(), involves moving data */ private static Tuple2 join(final Frame fra, final Vec va, final Frame frb, final Vec vb) { + + /* The plan is to re-organize B to be "similarly partitioned as A", and then zip() */ Vec bvecs[] = new Vec[frb.vecs().length]; for (int i = 0; i < bvecs.length; i++) + /* First create column Vecs which are similarly partitioned as A */ bvecs[i] = fra.anyVec().makeZero(); + /* Next run an MRTask on the new vectors, and fill each cell (initially 0) + by pulling in appropriate values from B (frb) + */ new MRTask() { public void map(Chunk chks[]) { + int chunk_size = chks[0].len(); long start = chks[0].start(); - for (int r = 0; r < chks[0].len(); r++) { + Vec vecs[] = frb.vecs(); + + for (int r = 0; r < chunk_size; r++) { for (int c = 0; c < chks.length; c++) { // assert va.atStr(start+r) == vb.atStr(start+r) - chks[c].set0(r, frb.vecs()[c].at(start + r)); + chks[c].set0(r, vecs[c].at(start + r)); } } } }.doAll(bvecs); - Vec vecs[] = new Vec[fra.vecs().length + frb.vecs().length]; - int d = 0; - for (Vec vfra : fra.vecs()) - vecs[d++] = vfra; - for (Vec vfrb : bvecs) - vecs[d++] = vfrb; - Frame fr = new Frame(vecs); - return new Tuple2 (fr, va); - } - - public static Tuple2 Cbind(Tuple2 TA, Tuple2 TB) { - 
Frame fra = TA._1(); - Vec va = TA._2(); - Frame frb = TB._1(); - Vec vb = TB._2(); - - if (fra.anyVec().group() == frb.anyVec().group()) - return zip(fra, va, frb, vb); - else - return join(fra, va, frb, vb); + /* now that bvecs[] is compatible, just zip'em'up */ + return zip(fra, va, new Frame(bvecs), null); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java index 76e062bb9f..5cfc936574 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java @@ -20,8 +20,12 @@ import org.apache.mahout.math.Matrix; import org.apache.mahout.h2obindings.H2OBlockMatrix; -import water.*; -import water.fvec.*; +import water.MRTask; +import water.fvec.Frame; +import water.fvec.Vec; +import water.fvec.Chunk; +import water.fvec.NewChunk; + import java.io.Serializable; import java.util.Arrays; @@ -63,6 +67,20 @@ private void deblockify (Matrix out, NewChunk ncs[]) { } } + /* + Input: + chks.length == A.numCols() + + Output: + ncs.length == (A.numCols() + 1) if String keyed + (A.numCols() + 0) if Int or Long keyed + + First A.numCols() ncs[] elements are fed back the output + of bmf() output's _2 in deblockify() + + If String keyed, then MapBlockHelper.exec() would have + filled in the Strings into ncs[ncol] already + */ public void map(Chunk chks[], NewChunk ncs[]) { long start = chks[0].start(); NewChunk nclabel = is_r_str ? ncs[ncs.length-1] : null; @@ -75,6 +93,10 @@ public void map(Chunk chks[], NewChunk ncs[]) { Frame fmap = new MRTaskBMF(bmf, VA).doAll(ncol_res, A).outputFrame(null, null); Vec vmap = null; if (is_r_str) { + /* If output was String keyed, then the last Vec in fmap is the String vec. 
+ If so, peel it out into a separate Vec (vmap) and set fmap to be the + Frame with just the first ncol Vecs + */ vmap = fmap.vecs()[ncol]; fmap = new Frame(Arrays.copyOfRange(fmap.vecs(), 0, ncol)); } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java index a8eb13a86c..eee2737f89 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java @@ -17,8 +17,11 @@ package org.apache.mahout.h2obindings.ops; -import water.*; -import water.fvec.*; +import water.MRTask; +import water.fvec.Frame; +import water.fvec.Vec; +import water.fvec.Chunk; +import water.fvec.NewChunk; import scala.Tuple2; import org.apache.mahout.h2obindings.H2OHelper; @@ -26,37 +29,53 @@ public class Par { public static Tuple2 exec(Tuple2 TA, int min, int exact) { final Frame frin = TA._1(); final Vec vin = TA._2(); - Frame frout = H2OHelper.empty_frame (frin.numRows(), frin.numCols(), min, exact); + + /* First create a new empty Frame with the required partitioning */ + Frame frout = H2OHelper.empty_frame(frin.numRows(), frin.numCols(), min, exact); Vec vout = null; - class MRParVecTask extends MRTask { - public void map(Chunk chks[], NewChunk nc) { - Vec vins[] = frin.vecs(); - for (int r = 0; r < chks[0].len(); r++) { - for (int c = 0; c < chks.length; c++) { - chks[c].set0(r, vins[c].at(chks[0].start() + r)); + if (vin != null) { + /* If String keyed, then run an MRTask on the new frame, and also + creat yet another 1-column newer frame for the re-orged String keys. + The new String Vec will therefore be similarly partitioned as the + new Frame. + + vout is finally collected by calling anyVec() on outputFrame(), + as it is the only column in the output frame. 
+ */ + vout = new MRTask() { + public void map(Chunk chks[], NewChunk nc) { + int chunk_size = chks[0].len(); + Vec vins[] = frin.vecs(); + long start = chks[0].start(); + + for (int r = 0; r < chunk_size; r++) { + for (int c = 0; c < chks.length; c++) { + chks[c].set0(r, vins[c].at(start + r)); + } + nc.addStr(vin.atStr(start + r)); + } } - nc.addStr(vin.atStr(chks[0].start() + r)); - } - } - } + }.doAll(1, frout).outputFrame(null, null).anyVec(); + } else { + /* If not String keyed, then run and MRTask on the new frame, and + just pull in right elements from frin + */ + new MRTask() { + public void map(Chunk chks[]) { + int chunk_size = chks[0].len(); + Vec vins[] = frin.vecs(); + long start = chks[0].start(); - class MRParTask extends MRTask { - public void map(Chunk chks[]) { - Vec vins[] = frin.vecs(); - for (int r = 0; r < chks[0].len(); r++) { - for (int c = 0; c < chks.length; c++) { - chks[c].set0(r, vins[c].at(chks[0].start() + r)); + for (int r = 0; r < chunk_size; r++) { + for (int c = 0; c < chks.length; c++) { + chks[c].set0(r, vins[c].at(start + r)); + } } } - } + }.doAll(frout); } - if (vout != null) { - vout = new MRParVecTask().doAll(1, frout).outputFrame(null, null).anyVec(); - } else { - new MRParTask().doAll(frout); - } return new Tuple2 (frout, vout); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java index 6e9c09b249..8fbfdaa696 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java @@ -19,57 +19,62 @@ import scala.collection.immutable.Range; -import water.*; -import water.fvec.*; +import water.MRTask; +import water.fvec.Frame; +import water.fvec.Vec; +import water.fvec.Chunk; +import water.fvec.NewChunk; import scala.Tuple2; public class RowRange { /* Filter operation */ - public static Tuple2 RowRange(Tuple2 TA, Range r) { + public static Tuple2 
RowRange(Tuple2 TA, final Range R) { Frame A = TA._1(); Vec VA = TA._2(); - class MRTaskFilter extends MRTask { - Range _r; - MRTaskFilter(Range r) { - _r = r; - } - public void map(Chunk chks[], NewChunk ncs[]) { - if (chks[0].start() > _r.end() || (chks[0].start() + chks[0].len()) < _r.start()) - return; + /* Run a filtering MRTask on A. If row number falls within R.start() and + R.end(), then the row makes it into the output + */ + Frame Arr = new MRTask() { + public void map(Chunk chks[], NewChunk ncs[]) { + int chunk_size = chks[0].len(); + long chunk_start = chks[0].start(); - for (int r = 0; r < chks[0].len(); r++) { - if (!_r.contains (chks[0].start() + r)) - continue; + /* First check if the entire chunk even overlaps with R */ + if (chunk_start > R.end() || (chunk_start + chunk_size) < R.start()) + return; - for (int c = 0; c < chks.length; c++) - ncs[c].addNum(chks[c].at0(r)); - } - } - } - Frame Arr = new MRTaskFilter(r).doAll(A.numCols(), A).outputFrame(A.names(), A.domains()); - Vec Vrr = null; - if (VA != null) { - class MRTaskStrFilter extends MRTask { - Range _r; - MRTaskStrFilter(Range r) { - _r = r; + /* This chunk overlaps, filter out just the overlapping rows */ + for (int r = 0; r < chunk_size; r++) { + if (!R.contains (chunk_start + r)) + continue; + + for (int c = 0; c < chks.length; c++) + ncs[c].addNum(chks[c].at0(r)); + } } + }.doAll(A.numCols(), A).outputFrame(null, null); + + Vec Vrr = (VA == null) ? null : new MRTask() { + /* This is a String keyed DRM. Do the same thing as above, + but this time just one column of Strings. 
+ */ public void map(Chunk chk, NewChunk nc) { - if (chk.start() > _r.end() || (chk.start() + chk.len()) < _r.start()) + int chunk_size = chk.len(); + long chunk_start = chk.start(); + + if (chunk_start > R.end() || (chunk_start + chunk_size) < R.start()) return; - for (int r = 0; r < chk.len(); r++) { - if (!_r.contains (chk.start() + r)) + for (int r = 0; r < chunk_size; r++) { + if (!R.contains (chunk_start + r)) continue; nc.addStr(chk.atStr0(r)); } } - } - Vrr = new MRTaskStrFilter(r).doAll(1, VA).outputFrame(null,null).vecs()[0]; - } + }.doAll(1, VA).outputFrame(null, null).anyVec(); - return new Tuple2(Arr,Vrr); + return new Tuple2(Arr, Vrr); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java index bbd55e4262..02c924c9db 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java @@ -23,37 +23,66 @@ import org.apache.mahout.h2obindings.H2OHelper; import org.apache.mahout.h2obindings.drm.H2OBCast; -import water.*; -import water.fvec.*; +import water.MRTask; +import water.fvec.Frame; +import water.fvec.Vec; +import water.fvec.Chunk; +import water.fvec.NewChunk; import scala.Tuple2; public class TimesRightMatrix { + /* Multiple with in-core Matrix */ + public static Tuple2 TimesRightMatrix(Tuple2 TA, Matrix B) { + Frame A = TA._1(); + Vec VA = TA._2(); + Frame AinCoreB = null; + + if (B instanceof DiagonalMatrix) + AinCoreB = AinCoreB_diagonal(A, B.viewDiagonal()); + else + AinCoreB = AinCoreB_common(A, B); + return new Tuple2(AinCoreB, VA); + } + + /* + Multiply Frame A with in-core diagonal Matrix (whose diagonal Vector is d) + + A.numCols() == d.size() + */ private static Frame AinCoreB_diagonal(final Frame A, Vector d) { final H2OBCast bd = new H2OBCast(d); - class MRTaskAinCoreB extends MRTask { + return new MRTask() { public void 
map(Chunk chks[], NewChunk ncs[]) { Vector D = bd.value(); + int chunk_size = chks[0].len(); + for (int c = 0; c < ncs.length; c++) { - for (int r = 0; r < chks[0].len(); r++) { + for (int r = 0; r < chunk_size; r++) { double v = (chks[c].at0(r) * D.getQuick(c)); ncs[c].addNum(v); } } } - } - return new MRTaskAinCoreB().doAll(d.size(), A).outputFrame(null,null); + }.doAll(d.size(), A).outputFrame(null, null); } + /* + Multiply Frame A with in-core Matrix b + + A.numCols() == b.rowSize() + */ private static Frame AinCoreB_common(final Frame A, Matrix b) { final H2OBCast bb = new H2OBCast(b); - class MRTaskAinCoreB extends MRTask { + return new MRTask() { public void map(Chunk chks[], NewChunk ncs[]) { Matrix B = bb.value(); + int chunk_size = chks[0].len(); + for (int c = 0; c < ncs.length; c++) { - for (int r = 0; r < chks[0].len(); r++) { + for (int r = 0; r < chunk_size; r++) { double v = 0; for (int i = 0; i < chks.length; i++) { v += (chks[i].at0(r) * B.getQuick(i, c)); @@ -62,20 +91,6 @@ public void map(Chunk chks[], NewChunk ncs[]) { } } } - } - return new MRTaskAinCoreB().doAll(b.columnSize(), A).outputFrame(null,null); - } - - /* Multiple with in-core Matrix */ - public static Tuple2 TimesRightMatrix(Tuple2 TA, Matrix B) { - Frame A = TA._1(); - Vec VA = TA._2(); - Frame AinCoreB; - if (B instanceof DiagonalMatrix) - AinCoreB = AinCoreB_diagonal(A, B.viewDiagonal()); - else - AinCoreB = AinCoreB_common(A, B); - - return new Tuple2(AinCoreB, VA); + }.doAll(b.columnSize(), A).outputFrame(null, null); } } diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala index f97121d7a0..1178adc38e 100644 --- a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala @@ -24,8 +24,7 @@ import org.apache.mahout.math.drm.logical._ import org.apache.mahout.h2obindings.ops._ import 
org.apache.mahout.h2obindings.drm._ -import water._ -import water.fvec._ +import water.fvec.{Frame,Vec} object H2OEngine extends DistributedEngine { def colMeans[K:ClassTag](drm: CheckpointedDrm[K]): Vector = diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala index 74ffe787a5..269ddbaec5 100644 --- a/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala @@ -6,8 +6,7 @@ import RLikeOps._ import org.apache.mahout.math.drm._ import org.apache.mahout.h2obindings._ -import water._ -import water.fvec._ +import water.fvec.{Frame,Vec} import scala.reflect._ From 9d97769686ce405a5b8165828fa67f9d6eb47842 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Mon, 14 Jul 2014 16:32:58 -0700 Subject: [PATCH 15/34] MAHOUT-1500: use the new test suites --- .../h2obindings/drm/DrmLikeOpsSuite.scala | 66 +-- .../mahout/h2obindings/drm/DrmLikeSuite.scala | 43 +- .../h2obindings/drm/RLikeDrmOpsSuite.scala | 468 +----------------- .../mahout/h2obindings/ops/ABtSuite.scala | 4 +- .../mahout/h2obindings/ops/AewBSuite.scala | 4 +- .../mahout/h2obindings/ops/AtASuite.scala | 4 +- .../mahout/h2obindings/ops/AtSuite.scala | 4 +- .../test/DistributedH2OSuite.scala | 46 ++ .../DistributedDecompositionsSuite.scala | 34 ++ 9 files changed, 96 insertions(+), 577 deletions(-) create mode 100644 h2o/src/test/scala/org/apache/mahout/h2obindings/test/DistributedH2OSuite.scala create mode 100644 h2o/src/test/scala/org/apache/mahout/math/decompositions/DistributedDecompositionsSuite.scala diff --git a/h2o/src/test/scala/org/apache/mahout/h2obindings/drm/DrmLikeOpsSuite.scala b/h2o/src/test/scala/org/apache/mahout/h2obindings/drm/DrmLikeOpsSuite.scala index 68bf017752..6bfb13fc97 100644 --- a/h2o/src/test/scala/org/apache/mahout/h2obindings/drm/DrmLikeOpsSuite.scala +++ 
b/h2o/src/test/scala/org/apache/mahout/h2obindings/drm/DrmLikeOpsSuite.scala @@ -24,70 +24,8 @@ import RLikeOps._ import RLikeDrmOps._ import org.apache.mahout.h2obindings._ import org.scalatest.FunSuite -import org.apache.mahout.h2obindings.test.MahoutLocalContext +import org.apache.mahout.h2obindings.test.DistributedH2OSuite /** Tests for DrmLikeOps */ -class DrmLikeOpsSuite extends FunSuite with MahoutLocalContext { - - test("mapBlock") { - - val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6)) - val A = drmParallelize(m = inCoreA, numPartitions = 2) - val B = A.mapBlock(/* Inherit width */) { - case (keys, block) => keys -> (block += 1.0) - } - - val inCoreB = B.collect - val inCoreBControl = inCoreA + 1.0 - - println(inCoreB) - - // Assert they are the same - (inCoreB - inCoreBControl).norm should be < 1E-10 - - } - - test("col range") { - val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6)) - val A = drmParallelize(m = inCoreA, numPartitions = 2) - val B = A(::, 1 to 2) - val inCoreB = B.collect - val inCoreBControl = inCoreA(::, 1 to 2) - - println(inCoreB) - - // Assert they are the same - (inCoreB - inCoreBControl).norm should be < 1E-10 - - } - - test("row range") { - - val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6)) - val A = drmParallelize(m = inCoreA, numPartitions = 2) - val B = A(1 to 2, ::) - val inCoreB = B.collect - val inCoreBControl = inCoreA(1 to 2, ::) - - println(inCoreB) - - // Assert they are the same - (inCoreB - inCoreBControl).norm should be < 1E-10 - - } - - test("col, row range") { - - val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6)) - val A = drmParallelize(m = inCoreA, numPartitions = 2) - val B = A(1 to 2, 1 to 2) - val inCoreB = B.collect - val inCoreBControl = inCoreA(1 to 2, 1 to 2) - - println(inCoreB) - - // Assert they are the same - (inCoreB - inCoreBControl).norm should be < 1E-10 - - } +class DrmLikeOpsSuite extends FunSuite with DistributedH2OSuite with 
DrmLikeOpsSuiteBase { } diff --git a/h2o/src/test/scala/org/apache/mahout/h2obindings/drm/DrmLikeSuite.scala b/h2o/src/test/scala/org/apache/mahout/h2obindings/drm/DrmLikeSuite.scala index 98778a8abd..07eb9d7ca3 100644 --- a/h2o/src/test/scala/org/apache/mahout/h2obindings/drm/DrmLikeSuite.scala +++ b/h2o/src/test/scala/org/apache/mahout/h2obindings/drm/DrmLikeSuite.scala @@ -23,46 +23,7 @@ import scalabindings._ import drm._ import RLikeOps._ import RLikeDrmOps._ -import org.apache.mahout.h2obindings.test.MahoutLocalContext +import org.apache.mahout.h2obindings.test.DistributedH2OSuite -/** - * DRMLike tests - */ -class DrmLikeSuite extends FunSuite with MahoutLocalContext { - - - test("DRM DFS i/o (local)") { - - val uploadPath = "UploadedDRM" - - val inCoreA = dense((1, 2, 3), (3, 4, 5)) - val drmA = drmParallelize(inCoreA) - - drmA.writeDRM(path = uploadPath) - - println(inCoreA) - - // Load back from hdfs - val drmB = drmFromHDFS(path = uploadPath) - - // Collect back into in-core - val inCoreB = drmB.collect - - // Print out to see what it is we collected: - println(inCoreB) - } - - test("DRM parallelizeEmpty") { - - val drmEmpty = drmParallelizeEmpty(100, 50) - - // collect back into in-core - val inCoreEmpty = drmEmpty.collect - - //print out to see what it is we collected: - println(inCoreEmpty) - printf("drm nrow:%d, ncol:%d\n", drmEmpty.nrow, drmEmpty.ncol) - printf("in core nrow:%d, ncol:%d\n", inCoreEmpty.nrow, inCoreEmpty.ncol) - } -} +class DrmLikeSuite extends FunSuite with DistributedH2OSuite with DrmLikeSuiteBase diff --git a/h2o/src/test/scala/org/apache/mahout/h2obindings/drm/RLikeDrmOpsSuite.scala b/h2o/src/test/scala/org/apache/mahout/h2obindings/drm/RLikeDrmOpsSuite.scala index dd36f1ecbe..f052247a43 100644 --- a/h2o/src/test/scala/org/apache/mahout/h2obindings/drm/RLikeDrmOpsSuite.scala +++ b/h2o/src/test/scala/org/apache/mahout/h2obindings/drm/RLikeDrmOpsSuite.scala @@ -17,471 +17,11 @@ package org.apache.mahout.h2obindings.drm -import 
org.scalatest.{Matchers, FunSuite} +import org.scalatest.FunSuite import org.apache.mahout.math._ -import decompositions._ -import scalabindings._ import drm._ -import RLikeOps._ -import RLikeDrmOps._ import org.apache.mahout.h2obindings._ -import test.MahoutLocalContext -import scala.collection.mutable.ArrayBuffer -import org.apache.mahout.math.Matrices -import org.apache.mahout.h2obindings.{H2OEngine, ops} -import org.apache.mahout.math.drm.logical.{OpAtx, OpAtB, OpAtA} -import scala.util.Random +import test.DistributedH2OSuite -/** R-like DRM DSL operation tests */ -class RLikeDrmOpsSuite extends FunSuite with Matchers with MahoutLocalContext { - - import RLikeOps._ - - val epsilon = 1E-5 - - test("A.t") { - - val inCoreA = dense((1, 2, 3), (3, 4, 5)) - - val A = drmParallelize(inCoreA) - - val inCoreAt = A.t.collect - - // Assert first norm of difference is less than error margin. - (inCoreAt - inCoreA.t).norm should be < epsilon - - } - - test("C = A %*% B") { - - val inCoreA = dense((1, 2), (3, 4)) - val inCoreB = dense((3, 5), (4, 6)) - - val A = drmParallelize(inCoreA, numPartitions = 2) - val B = drmParallelize(inCoreB, numPartitions = 2) - - // Actual - val inCoreCControl = inCoreA %*% inCoreB - - // Distributed operation - val C = A %*% B - val inCoreC = C.collect - println(inCoreC) - - (inCoreC - inCoreCControl).norm should be < 1E-10 - - // We also should be able to collect via implicit checkpoint - val inCoreC2 = C.collect - println(inCoreC2) - - (inCoreC2 - inCoreCControl).norm should be < 1E-10 - - } - - test("C = A %*% B mapBlock {}") { - - val inCoreA = dense((1, 2), (3, 4)) - val inCoreB = dense((3, 5), (4, 6)) - - val A = drmParallelize(inCoreA, numPartitions = 2).checkpoint() - val B = drmParallelize(inCoreB, numPartitions = 2).checkpoint() - - // Actual - val inCoreCControl = inCoreA %*% inCoreB - - A.colSums() - B.colSums() - - - val x = drmBroadcast(dvec(0, 0)) - val x2 = drmBroadcast(dvec(0, 0)) - // Distributed operation - val C = (B.t %*% 
A.t).t.mapBlock() { - case (keys, block) => - for (row <- 0 until block.nrow) block(row, ::) += x.value + x2 - keys -> block - } - - val inCoreC = C checkpoint CacheHint.NONE collect; - println(inCoreC) - - (inCoreC - inCoreCControl).norm should be < 1E-10 - - // We also should be able to collect via implicit checkpoint - val inCoreC2 = C.collect - println(inCoreC2) - - (inCoreC2 - inCoreCControl).norm should be < 1E-10 - - val inCoreQ = dqrThin(C)._1.collect - - printf("Q=\n%s\n", inCoreQ) - - // Assert unit-orthogonality - ((inCoreQ(::, 0) dot inCoreQ(::, 0)) - 1.0).abs should be < 1e-10 - (inCoreQ(::, 0) dot inCoreQ(::, 1)).abs should be < 1e-10 - - } - - test("C = A %*% B incompatible B keys") { - - val inCoreA = dense((1, 2), (3, 4)) - val inCoreB = dense((3, 5), (4, 6)) - - val A = drmParallelize(inCoreA, numPartitions = 2) - val B = drmParallelize(inCoreB, numPartitions = 2) - // Re-key B into DrmLike[String] instead of [Int] - .mapBlock()({ - case (keys, block) => keys.map(_.toString) -> block - }) - - val C = A %*% B - - intercept[IllegalArgumentException] { - // This plan must not compile - C.checkpoint() - } - } - - test("C = At %*% B , join") { - - val inCoreA = dense((1, 2), (3, 4), (-3, -5)) - val inCoreB = dense((3, 5), (4, 6), (0, 1)) - - val A = drmParallelize(inCoreA, numPartitions = 2) - val B = drmParallelize(inCoreB, numPartitions = 2) - - val C = A.t %*% B - - H2OEngine.optimizerRewrite(C) should equal(OpAtB[Int](A, B)) - - val inCoreC = C.collect - val inCoreControlC = inCoreA.t %*% inCoreB - - (inCoreC - inCoreControlC).norm should be < 1E-10 - - } - - test("C = At %*% B , join, String-keyed") { - - val inCoreA = dense((1, 2), (3, 4), (-3, -5)) - val inCoreB = dense((3, 5), (4, 6), (0, 1)) - - val A = drmParallelize(inCoreA, numPartitions = 2) - .mapBlock()({ - case (keys, block) => keys.map(_.toString) -> block - }) - - val B = drmParallelize(inCoreB, numPartitions = 2) - .mapBlock()({ - case (keys, block) => keys.map(_.toString) -> block - 
}) - - val C = A.t %*% B - - H2OEngine.optimizerRewrite(C) should equal(OpAtB[String](A, B)) - - val inCoreC = C.collect - val inCoreControlC = inCoreA.t %*% inCoreB - - (inCoreC - inCoreControlC).norm should be < 1E-10 - - } - - test("C = At %*% B , zippable, String-keyed") { - - val inCoreA = dense((1, 2), (3, 4), (-3, -5)) - - val A = drmParallelize(inCoreA, numPartitions = 2) - .mapBlock()({ - case (keys, block) => keys.map(_.toString) -> block - }) - - val B = A + 1.0 - - val C = A.t %*% B - - H2OEngine.optimizerRewrite(C) should equal(OpAtB[String](A, B)) - - val inCoreC = C.collect - val inCoreControlC = inCoreA.t %*% (inCoreA + 1.0) - - (inCoreC - inCoreControlC).norm should be < 1E-10 - - } - - test("C = A %*% inCoreB") { - - val inCoreA = dense((1, 2, 3), (3, 4, 5), (4, 5, 6), (5, 6, 7)) - val inCoreB = dense((3, 5, 7, 10), (4, 6, 9, 10), (5, 6, 7, 7)) - - val A = drmParallelize(inCoreA, numPartitions = 2) - val C = A %*% inCoreB - - val inCoreC = C.collect - val inCoreCControl = inCoreA %*% inCoreB - - println(inCoreC) - (inCoreC - inCoreCControl).norm should be < 1E-10 - - } - - test("C = inCoreA %*%: B") { - - val inCoreA = dense((1, 2, 3), (3, 4, 5), (4, 5, 6), (5, 6, 7)) - val inCoreB = dense((3, 5, 7, 10), (4, 6, 9, 10), (5, 6, 7, 7)) - - val B = drmParallelize(inCoreB, numPartitions = 2) - val C = inCoreA %*%: B - - val inCoreC = C.collect - val inCoreCControl = inCoreA %*% inCoreB - - println(inCoreC) - (inCoreC - inCoreCControl).norm should be < 1E-10 - - } - - test("C = A.t %*% A") { - val inCoreA = dense((1, 2, 3), (3, 4, 5), (4, 5, 6), (5, 6, 7)) - val A = drmParallelize(m = inCoreA, numPartitions = 2) - - val AtA = A.t %*% A - - // Assert optimizer detects square - H2OEngine.optimizerRewrite(action = AtA) should equal(OpAtA(A)) - - val inCoreAtA = AtA.collect - val inCoreAtAControl = inCoreA.t %*% inCoreA - - (inCoreAtA - inCoreAtAControl).norm should be < 1E-10 - } - - test("C = A.t %*% A fat non-graph") { - val inCoreA = 
Matrices.uniformView(400, 550, 1234) - val A = drmParallelize(m = inCoreA, numPartitions = 2) - - val AtA = A.t %*% A - - // Assert optimizer detects square - H2OEngine.optimizerRewrite(action = AtA) should equal(OpAtA(A)) - - val inCoreAtA = AtA.collect - val inCoreAtAControl = inCoreA.t %*% inCoreA - - (inCoreAtA - inCoreAtAControl).norm should be < 1E-10 - } - - - test("C = A.t %*% A non-int key") { - val inCoreA = dense((1, 2, 3), (3, 4, 5), (4, 5, 6), (5, 6, 7)) - val AintKeyd = drmParallelize(m = inCoreA, numPartitions = 2) - val A = AintKeyd.mapBlock() { - case (keys, block) => keys.map(_.toString) -> block - } - - val AtA = A.t %*% A - - // Assert optimizer detects square - H2OEngine.optimizerRewrite(action = AtA) should equal(OpAtA(A)) - - val inCoreAtA = AtA.collect - val inCoreAtAControl = inCoreA.t %*% inCoreA - - (inCoreAtA - inCoreAtAControl).norm should be < 1E-10 - } - - test("C = A + B") { - - val inCoreA = dense((1, 2), (3, 4)) - val inCoreB = dense((3, 5), (4, 6)) - - val A = drmParallelize(inCoreA, numPartitions = 2) - val B = drmParallelize(inCoreB, numPartitions = 2) - - val C = A + B - val inCoreC = C.collect - - // Actual - val inCoreCControl = inCoreA + inCoreB - - (inCoreC - inCoreCControl).norm should be < 1E-10 - } - - test("C = A + B, identically partitioned") { - - val inCoreA = dense((1, 2, 3), (3, 4, 5), (5, 6, 7)) - - val A = drmParallelize(inCoreA, numPartitions = 2) - - printf("A.nrow=%d.\n", A.nrow) - - // Create B which would be identically partitioned to A. mapBlock() by default will do the trick. 
- val B = A.mapBlock() { - case (keys, block) => - val bBlock = block.like() := ((r, c, v) => util.Random.nextDouble()) - keys -> bBlock - } - // Prevent repeated computation non-determinism - .checkpoint() - - val inCoreB = B.collect - - printf("A=\n%s\n", inCoreA) - printf("B=\n%s\n", inCoreB) - - val C = A + B - - val inCoreC = C.collect - - printf("C=\n%s\n", inCoreC) - - // Actual - val inCoreCControl = inCoreA + inCoreB - - (inCoreC - inCoreCControl).norm should be < 1E-10 - } - - - test("C = A + B side test 1") { - - val inCoreA = dense((1, 2), (3, 4)) - val inCoreB = dense((3, 5), (4, 6)) - - val A = drmParallelize(inCoreA, numPartitions = 2) - val B = drmParallelize(inCoreB, numPartitions = 2) - - val C = A + B - val inCoreC = C.collect - - val inCoreD = (A + B).collect - - // Actual - val inCoreCControl = inCoreA + inCoreB - - (inCoreC - inCoreCControl).norm should be < 1E-10 - (inCoreD - inCoreCControl).norm should be < 1E-10 - } - - test("C = A + B side test 2") { - - val inCoreA = dense((1, 2), (3, 4)) - val inCoreB = dense((3, 5), (4, 6)) - - val A = drmParallelize(inCoreA, numPartitions = 2).checkpoint() - val B = drmParallelize(inCoreB, numPartitions = 2) - - val C = A + B - val inCoreC = C.collect - - val inCoreD = (A + B).collect - - // Actual - val inCoreCControl = inCoreA + inCoreB - - (inCoreC - inCoreCControl).norm should be < 1E-10 - (inCoreD - inCoreCControl).norm should be < 1E-10 - } - - test("C = A + B side test 3") { - - val inCoreA = dense((1, 2), (3, 4)) - val inCoreB = dense((3, 5), (4, 6)) - - val B = drmParallelize(inCoreB, numPartitions = 2) - // val A = (drmParallelize(inCoreA, numPartitions = 2) + B).checkpoint(CacheHint.MEMORY_ONLY_SER) - val A = (drmParallelize(inCoreA, numPartitions = 2) + B).checkpoint(CacheHint.MEMORY_ONLY) - - val C = A + B - val inCoreC = C.collect - - val inCoreD = (A + B).collect - - // Actual - val inCoreCControl = inCoreA + inCoreB * 2.0 - - (inCoreC - inCoreCControl).norm should be < 1E-10 - (inCoreD 
- inCoreCControl).norm should be < 1E-10 - } - - test("Ax") { - val inCoreA = dense( - (1, 2), - (3, 4), - (20, 30) - ) - val x = dvec(10, 3) - - val drmA = drmParallelize(inCoreA, numPartitions = 2) - - val ax = (drmA %*% x).collect(::, 0) - - ax should equal(inCoreA %*% x) - } - - test("A'x") { - val inCoreA = dense( - (1, 2), - (3, 4), - (20, 30) - ) - val x = dvec(10, 3, 4) - - val drmA = drmParallelize(inCoreA, numPartitions = 2) - - H2OEngine.optimizerRewrite(drmA.t %*% x) should equal(OpAtx(drmA, x)) - - val atx = (drmA.t %*% x).collect(::, 0) - - atx should equal(inCoreA.t %*% x) - } - - test("colSums, colMeans") { - val inCoreA = dense( - (1, 2), - (3, 4), - (20, 30) - ) - val drmA = drmParallelize(inCoreA, numPartitions = 2) - - drmA.colSums() should equal(inCoreA.colSums()) - drmA.colMeans() should equal(inCoreA.colMeans()) - } - - test("numNonZeroElementsPerColumn") { - val inCoreA = dense( - (0, 2), - (3, 0), - (0, -30) - - ) - val drmA = drmParallelize(inCoreA, numPartitions = 2) - - drmA.numNonZeroElementsPerColumn() should equal(inCoreA.numNonZeroElementsPerColumn()) - } - - test("C = A cbind B, cogroup") { - - val inCoreA = dense((1, 2), (3, 4)) - val inCoreB = dense((3, 5), (4, 6)) - val controlC = dense((1, 2, 3, 5), (3, 4, 4, 6)) - - val A = drmParallelize(inCoreA, numPartitions = 2).checkpoint() - val B = drmParallelize(inCoreB, numPartitions = 2).checkpoint() - - (A.cbind(B) -: controlC).norm should be < 1e-10 - - } - - test("C = A cbind B, zip") { - - val inCoreA = dense((1, 2), (3, 4)) - val controlC = dense((1, 2, 2, 3), (3, 4, 4, 5)) - - val A = drmParallelize(inCoreA, numPartitions = 2).checkpoint() - - (A.cbind(A + 1.0) -: controlC).norm should be < 1e-10 - - } - -} +/** ==R-like DRM DSL operation tests -- H2O== */ +class RLikeDrmOpsSuite extends FunSuite with DistributedH2OSuite with RLikeDrmOpsSuiteBase diff --git a/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/ABtSuite.scala 
b/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/ABtSuite.scala index c154985e72..6395233923 100644 --- a/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/ABtSuite.scala +++ b/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/ABtSuite.scala @@ -17,7 +17,7 @@ package org.apache.mahout.h2obindings.ops -import org.apache.mahout.h2obindings.test.MahoutLocalContext +import org.apache.mahout.h2obindings.test.DistributedH2OSuite import org.scalatest.FunSuite import org.apache.mahout.math.scalabindings._ import org.apache.mahout.math.drm._ @@ -28,7 +28,7 @@ import RLikeDrmOps._ import org.apache.mahout.math.drm._ /** Tests for AB' operator algorithms */ -class ABtSuite extends FunSuite with MahoutLocalContext { +class ABtSuite extends FunSuite with DistributedH2OSuite { test("ABt") { val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5)) diff --git a/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AewBSuite.scala b/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AewBSuite.scala index 3a3347e15d..98a3345ad1 100644 --- a/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AewBSuite.scala +++ b/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AewBSuite.scala @@ -18,7 +18,7 @@ package org.apache.mahout.h2obindings.ops import org.scalatest.FunSuite -import org.apache.mahout.h2obindings.test.MahoutLocalContext +import org.apache.mahout.h2obindings.test.DistributedH2OSuite import org.apache.mahout.math.scalabindings._ import RLikeOps._ import org.apache.mahout.math.drm._ @@ -26,7 +26,7 @@ import RLikeDrmOps._ import org.apache.mahout.math.drm.logical._ /** Elementwise matrix operation tests */ -class AewBSuite extends FunSuite with MahoutLocalContext { +class AewBSuite extends FunSuite with DistributedH2OSuite { test("A * B Hadamard") { val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (7, 8, 9)) diff --git a/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AtASuite.scala b/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AtASuite.scala index 
47cf14e438..bdb2fe09c1 100644 --- a/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AtASuite.scala +++ b/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AtASuite.scala @@ -18,7 +18,7 @@ package org.apache.mahout.h2obindings.ops import org.scalatest.FunSuite -import org.apache.mahout.h2obindings.test.MahoutLocalContext +import org.apache.mahout.h2obindings.test.DistributedH2OSuite import org.apache.mahout.math.scalabindings._ import org.apache.mahout.math.drm._ import org.apache.mahout.h2obindings._ @@ -28,7 +28,7 @@ import RLikeDrmOps._ import org.apache.mahout.math.drm._ /** Tests for {@link XtX} */ -class AtASuite extends FunSuite with MahoutLocalContext { +class AtASuite extends FunSuite with DistributedH2OSuite { test("AtA slim") { diff --git a/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AtSuite.scala b/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AtSuite.scala index 98dbe8ea46..f04f32f68b 100644 --- a/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AtSuite.scala +++ b/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AtSuite.scala @@ -18,7 +18,7 @@ package org.apache.mahout.h2obindings.ops import org.scalatest.FunSuite -import org.apache.mahout.h2obindings.test.MahoutLocalContext +import org.apache.mahout.h2obindings.test.DistributedH2OSuite import org.apache.mahout.math.scalabindings._ import org.apache.mahout.math.drm._ import org.apache.mahout.h2obindings._ @@ -28,7 +28,7 @@ import RLikeDrmOps._ import org.apache.mahout.math.drm._ /** Tests for A' algorithms */ -class AtSuite extends FunSuite with MahoutLocalContext { +class AtSuite extends FunSuite with DistributedH2OSuite { test("At") { val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5)) diff --git a/h2o/src/test/scala/org/apache/mahout/h2obindings/test/DistributedH2OSuite.scala b/h2o/src/test/scala/org/apache/mahout/h2obindings/test/DistributedH2OSuite.scala new file mode 100644 index 0000000000..4568fad46c --- /dev/null +++ 
b/h2o/src/test/scala/org/apache/mahout/h2obindings/test/DistributedH2OSuite.scala @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.h2obindings.test + +import org.scalatest.Suite +import org.apache.mahout.h2obindings._ +import org.apache.mahout.test.{DistributedMahoutSuite,MahoutSuite} +import org.apache.mahout.math.drm.DistributedContext + +trait DistributedH2OSuite extends DistributedMahoutSuite with LoggerConfiguration { + this: Suite => + + protected implicit var mahoutCtx: DistributedContext = _ + + override protected def beforeEach() { + super.beforeEach() + + mahoutCtx = mahoutH2OContext("mah2out") + } + + override protected def afterEach() { + if (mahoutCtx != null) { + try { + mahoutCtx.close() + } finally { + mahoutCtx = null + } + } + super.afterEach() + } +} diff --git a/h2o/src/test/scala/org/apache/mahout/math/decompositions/DistributedDecompositionsSuite.scala b/h2o/src/test/scala/org/apache/mahout/math/decompositions/DistributedDecompositionsSuite.scala new file mode 100644 index 0000000000..71f3afd36c --- /dev/null +++ b/h2o/src/test/scala/org/apache/mahout/math/decompositions/DistributedDecompositionsSuite.scala @@ -0,0 +1,34 @@ +/* + * Licensed to the 
Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.math.decompositions + +import org.apache.mahout.math._ +import drm._ +import scalabindings._ +import RLikeOps._ +import RLikeDrmOps._ +import org.apache.mahout.h2obindings._ +import org.apache.mahout.common.RandomUtils +import scala.math._ +import org.scalatest.{Matchers, FunSuite} +import org.apache.mahout.h2obindings.test.DistributedH2OSuite + +class DistributedDecompositionsSuite extends FunSuite with DistributedH2OSuite with DistributedDecompositionsSuiteBase { + + +} From 5eaad83a6c5e1299503718722bddf2f704d5ef1e Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Mon, 21 Jul 2014 16:36:51 -0700 Subject: [PATCH 16/34] MAHOUT-1500: make context public in CheckpointedDrmH2O Signed-off-by: Anand Avati --- .../org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala index 269ddbaec5..24d20b537a 100644 --- a/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala +++ 
b/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala @@ -14,7 +14,7 @@ import scala.reflect._ class CheckpointedDrmH2O[K: ClassTag]( val frame: Frame, val labels: Vec, - protected[mahout] val context: DistributedContext + public val context: DistributedContext ) extends CheckpointedDrm[K] { def this(frame: Frame, context: DistributedContext) = From 4b67b1e4d728a1be8defa3d442c7fbdf96189f16 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Mon, 21 Jul 2014 16:42:16 -0700 Subject: [PATCH 17/34] MAHOUT-1500: Remove MathSuite from h2o/src/test Signed-off-by: Anand Avati --- .../h2obindings/drm/CheckpointedDrmH2O.scala | 2 +- .../math/cf/CooccurrenceAnalysisSuite.scala | 235 ------------------ .../math/decompositions/MathSuite.scala | 212 ---------------- 3 files changed, 1 insertion(+), 448 deletions(-) delete mode 100644 h2o/src/test/scala/org/apache/mahout/math/cf/CooccurrenceAnalysisSuite.scala delete mode 100644 h2o/src/test/scala/org/apache/mahout/math/decompositions/MathSuite.scala diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala index 24d20b537a..b987d1b3ae 100644 --- a/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala @@ -14,7 +14,7 @@ import scala.reflect._ class CheckpointedDrmH2O[K: ClassTag]( val frame: Frame, val labels: Vec, - public val context: DistributedContext + val context: DistributedContext ) extends CheckpointedDrm[K] { def this(frame: Frame, context: DistributedContext) = diff --git a/h2o/src/test/scala/org/apache/mahout/math/cf/CooccurrenceAnalysisSuite.scala b/h2o/src/test/scala/org/apache/mahout/math/cf/CooccurrenceAnalysisSuite.scala deleted file mode 100644 index 3624f5a87b..0000000000 --- a/h2o/src/test/scala/org/apache/mahout/math/cf/CooccurrenceAnalysisSuite.scala +++ /dev/null @@ 
-1,235 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.cf - -import org.apache.mahout.math.drm._ -import org.apache.mahout.math.scalabindings.{MatrixOps, _} -import org.apache.mahout.h2obindings.test.MahoutLocalContext -import org.apache.mahout.test.MahoutSuite -import org.scalatest.FunSuite - -/* values -A = -1 1 0 0 0 -0 0 1 1 0 -0 0 0 0 1 -1 0 0 1 0 - -B = -1 1 1 1 0 -1 1 1 1 0 -0 0 1 0 1 -1 1 0 1 0 - */ - -class CooccurrenceAnalysisSuite extends FunSuite with MahoutSuite with MahoutLocalContext { - - // correct cooccurrence with LLR - final val matrixLLRCoocAtAControl = dense( - (0.0, 1.7260924347106847, 0.0, 0.0, 0.0), - (1.7260924347106847, 0.0, 0.0, 0.0, 0.0), - (0.0, 0.0, 0.0, 1.7260924347106847, 0.0), - (0.0, 0.0, 1.7260924347106847, 0.0, 0.0), - (0.0, 0.0, 0.0, 0.0, 0.0)) - - // correct cross-cooccurrence with LLR - final val matrixLLRCoocBtAControl = dense( - (1.7260924347106847, 0.6795961471815897, 0.6795961471815897, 1.7260924347106847, 0.0), - (1.7260924347106847, 0.6795961471815897, 0.6795961471815897, 1.7260924347106847, 0.0), - (1.7260924347106847, 0.6795961471815897, 0.6795961471815897, 1.7260924347106847, 0.6795961471815897), - (1.7260924347106847, 
0.6795961471815897, 0.6795961471815897, 1.7260924347106847, 0.0), - (0.0, 0.0, 0.0, 0.0, 4.498681156950466)) - - - - test("cooccurrence [A'A], [B'A] boolbean data using LLR") { - val a = dense( - (1, 1, 0, 0, 0), - (0, 0, 1, 1, 0), - (0, 0, 0, 0, 1), - (1, 0, 0, 1, 0)) - - val b = dense( - (1, 1, 1, 1, 0), - (1, 1, 1, 1, 0), - (0, 0, 1, 0, 1), - (1, 1, 0, 1, 0)) - - val drmA = drmParallelize(m = a, numPartitions = 2) - val drmB = drmParallelize(m = b, numPartitions = 2) - - //self similarity - val drmCooc = CooccurrenceAnalysis.cooccurrences(drmARaw = drmA, randomSeed = 1, drmBs = Array(drmB)) - val matrixSelfCooc = drmCooc(0).checkpoint().collect - val diffMatrix = matrixSelfCooc.minus(matrixLLRCoocAtAControl) - var n = (new MatrixOps(m = diffMatrix)).norm - n should be < 1E-10 - - //cross similarity - val matrixCrossCooc = drmCooc(1).checkpoint().collect - val diff2Matrix = matrixCrossCooc.minus(matrixLLRCoocBtAControl) - n = (new MatrixOps(m = diff2Matrix)).norm - n should be < 1E-10 - - } - - test("cooccurrence [A'A], [B'A] double data using LLR") { - val a = dense( - (100000.0D, 1.0D, 0.0D, 0.0D, 0.0D), - ( 0.0D, 0.0D, 10.0D, 1.0D, 0.0D), - ( 0.0D, 0.0D, 0.0D, 0.0D, 1000.0D), - ( 1.0D, 0.0D, 0.0D, 10.0D, 0.0D)) - - val b = dense( - (10000.0D, 100.0D, 1000.0D, 1.0D, 0.0D), - ( 10.0D, 1.0D, 10000000.0D, 10.0D, 0.0D), - ( 0.0D, 0.0D, 1000.0D, 0.0D, 100.0D), - ( 100.0D, 1.0D, 0.0D, 100000.0D, 0.0D)) - - val drmA = drmParallelize(m = a, numPartitions = 2) - val drmB = drmParallelize(m = b, numPartitions = 2) - - //self similarity - val drmCooc = CooccurrenceAnalysis.cooccurrences(drmARaw = drmA, drmBs = Array(drmB)) - val matrixSelfCooc = drmCooc(0).checkpoint().collect - val diffMatrix = matrixSelfCooc.minus(matrixLLRCoocAtAControl) - var n = (new MatrixOps(m = diffMatrix)).norm - n should be < 1E-10 - - //cross similarity - val matrixCrossCooc = drmCooc(1).checkpoint().collect - val diff2Matrix = matrixCrossCooc.minus(matrixLLRCoocBtAControl) - n = (new 
MatrixOps(m = diff2Matrix)).norm - n should be < 1E-10 - } - - test("cooccurrence [A'A], [B'A] integer data using LLR") { - val a = dense( - ( 1000, 10, 0, 0, 0), - ( 0, 0, -10000, 10, 0), - ( 0, 0, 0, 0, 100), - (10000, 0, 0, 1000, 0)) - - val b = dense( - ( 100, 1000, -10000, 10000, 0), - (10000, 1000, 100, 10, 0), - ( 0, 0, 10, 0, -100), - ( 10, 100, 0, 1000, 0)) - - val drmA = drmParallelize(m = a, numPartitions = 2) - val drmB = drmParallelize(m = b, numPartitions = 2) - - //self similarity - val drmCooc = CooccurrenceAnalysis.cooccurrences(drmARaw = drmA, drmBs = Array(drmB)) - //var cp = drmSelfCooc(0).checkpoint() - //cp.writeDRM("/tmp/cooc-spark/")//to get values written - val matrixSelfCooc = drmCooc(0).checkpoint().collect - val diffMatrix = matrixSelfCooc.minus(matrixLLRCoocAtAControl) - var n = (new MatrixOps(m = diffMatrix)).norm - n should be < 1E-10 - - //cross similarity - val matrixCrossCooc = drmCooc(1).checkpoint().collect - val diff2Matrix = matrixCrossCooc.minus(matrixLLRCoocBtAControl) - n = (new MatrixOps(m = diff2Matrix)).norm - n should be < 1E-10 - } - - test("LLR calc") { - val A = dense( - (1, 1, 0, 0, 0), - (0, 0, 1, 1, 0), - (0, 0, 0, 0, 1), - (1, 0, 0, 1, 0)) - - val AtA = A.transpose().times(A) - - /* AtA is: - 0 => {0:2.0,1:1.0,3:1.0} - 1 => {0:1.0,1:1.0} - 2 => {2:1.0,3:1.0} - 3 => {0:1.0,2:1.0,3:2.0} - 4 => {4:1.0} - - val AtAd = dense( - (2, 1, 0, 1, 0), - (1, 1, 0, 0, 0), - (0, 0, 1, 1, 0), - (1, 0, 1, 2, 0), - (0, 0, 0, 0, 1)) - - val AtAdNoSelfCooc = dense( - (0, 1, 0, 1, 0), - (1, 0, 0, 0, 0), - (0, 0, 0, 1, 0), - (1, 0, 1, 0, 0), - (0, 0, 0, 0, 0)) - - for (MatrixSlice row : cooccurrence) { - for (Vector.Element element : row.vector().nonZeroes()) { - long k11 = (long) element.get();// = 1 - long k12 = (long) (rowSums.get(row.index()) - k11);// = 0 - long k21 = (long) (colSums.get(element.index()) - k11);// = 1 - long k22 = (long) (total - k11 - k12 - k21);// = 2 - double score = LogLikelihood.rootLogLikelihoodRatio(k11, 
k12, k21, k22); - element.set(score); - } - } - - for some reason the hadoop version returns the following - return 1.0 - 1.0 / (1.0 + logLikelihood); - so not a pure llr or root llr - - */ - - //item (1,0) - val numInteractionsWithAandB = 1L - val numInteractionsWithA = 1L - val numInteractionsWithB = 2L - val numInteractions = 6l - - val llr = CooccurrenceAnalysis.logLikelihoodRatio(numInteractionsWithA, numInteractionsWithB, numInteractionsWithAandB, numInteractions) - - assert(llr == 2.6341457841558764) // value calculated by hadoop itemsimilairty - } - - test("downsampling by number per row") { - val a = dense( - (1, 1, 1, 1, 0), - (1, 1, 1, 1, 1), - (0, 0, 0, 0, 1), - (1, 1, 0, 1, 0)) - val drmA: DrmLike[Int] = drmParallelize(m = a, numPartitions = 2) - - val downSampledDrm = CooccurrenceAnalysis.sampleDownAndBinarize(drmA, 0xdeadbeef, 4) - //count non-zero values, should be == 7 - var numValues = 0 - val m = downSampledDrm.collect - val it = m.iterator() - while (it.hasNext) { - val v = it.next().vector() - val nonZeroIt = v.nonZeroes().iterator() - while (nonZeroIt.hasNext) { - numValues += 1 - nonZeroIt.next() - } - } - - assert(numValues == 8) //Don't change the random seed or this may fail. - } -} diff --git a/h2o/src/test/scala/org/apache/mahout/math/decompositions/MathSuite.scala b/h2o/src/test/scala/org/apache/mahout/math/decompositions/MathSuite.scala deleted file mode 100644 index 7040fd32bd..0000000000 --- a/h2o/src/test/scala/org/apache/mahout/math/decompositions/MathSuite.scala +++ /dev/null @@ -1,212 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.math.decompositions - -import org.scalatest.{Matchers, FunSuite} -import org.apache.mahout.h2obindings.test.MahoutLocalContext -import org.apache.mahout.math._ -import drm._ -import scalabindings._ -import RLikeOps._ -import RLikeDrmOps._ -import org.apache.mahout.h2obindings._ -import org.apache.mahout.common.RandomUtils -import scala.math._ - -class MathSuite extends FunSuite with Matchers with MahoutLocalContext { - - test("thin distributed qr") { - - val inCoreA = dense( - (1, 2, 3, 4), - (2, 3, 4, 5), - (3, -4, 5, 6), - (4, 5, 6, 7), - (8, 6, 7, 8) - ) - - val A = drmParallelize(inCoreA, numPartitions = 2) - val (drmQ, inCoreR) = dqrThin(A, checkRankDeficiency = false) - - // Assert optimizer still knows Q and A are identically partitioned - drmQ.partitioningTag should equal(A.partitioningTag) - -// drmQ.rdd.partitions.size should be(A.rdd.partitions.size) - - // Should also be zippable -// drmQ.rdd.zip(other = A.rdd) - - val inCoreQ = drmQ.collect - - printf("A=\n%s\n", inCoreA) - printf("Q=\n%s\n", inCoreQ) - printf("R=\n%s\n", inCoreR) - - val (qControl, rControl) = qr(inCoreA) - printf("qControl=\n%s\n", qControl) - printf("rControl=\n%s\n", rControl) - - // Validate with Cholesky - val ch = chol(inCoreA.t %*% inCoreA) - printf("A'A=\n%s\n", inCoreA.t %*% inCoreA) - printf("L:\n%s\n", ch.getL) - - val rControl2 = (ch.getL cloned).t - val qControl2 = ch.solveRight(inCoreA) - printf("qControl2=\n%s\n", qControl2) - printf("rControl2=\n%s\n", rControl2) - - // Housholder approach seems to be a little bit more 
stable - (rControl - inCoreR).norm should be < 1E-5 - (qControl - inCoreQ).norm should be < 1E-5 - - // Assert identicity with in-core Cholesky-based -- this should be tighter. - (rControl2 - inCoreR).norm should be < 1E-10 - (qControl2 - inCoreQ).norm should be < 1E-10 - - // Assert orhtogonality: - // (a) Q[,j] dot Q[,j] == 1.0 for all j - // (b) Q[,i] dot Q[,j] == 0.0 for all i != j - for (col <- 0 until inCoreQ.ncol) - ((inCoreQ(::, col) dot inCoreQ(::, col)) - 1.0).abs should be < 1e-10 - for (col1 <- 0 until inCoreQ.ncol - 1; col2 <- col1 + 1 until inCoreQ.ncol) - (inCoreQ(::, col1) dot inCoreQ(::, col2)).abs should be < 1e-10 - - - } - - test("dssvd - the naive-est - q=0") { - dssvdNaive(q = 0) - } - - test("ddsvd - naive - q=1") { - dssvdNaive(q = 1) - } - - test("ddsvd - naive - q=2") { - dssvdNaive(q = 2) - } - - - def dssvdNaive(q: Int) { - val inCoreA = dense( - (1, 2, 3, 4), - (2, 3, 4, 5), - (3, -4, 5, 6), - (4, 5, 6, 7), - (8, 6, 7, 8) - ) - val drmA = drmParallelize(inCoreA, numPartitions = 2) - - val (drmU, drmV, s) = dssvd(drmA, k = 4, q = q) - val (inCoreU, inCoreV) = (drmU.collect, drmV.collect) - - printf("U:\n%s\n", inCoreU) - printf("V:\n%s\n", inCoreV) - printf("Sigma:\n%s\n", s) - - (inCoreA - (inCoreU %*%: diagv(s)) %*% inCoreV.t).norm should be < 1E-5 - } - - test("dspca") { - - val rnd = RandomUtils.getRandom - - // Number of points - val m = 500 - // Length of actual spectrum - val spectrumLen = 40 - - val spectrum = dvec((0 until spectrumLen).map(x => 300.0 * exp(-x) max 1e-3)) - printf("spectrum:%s\n", spectrum) - - val (u, _) = qr(new SparseRowMatrix(m, spectrumLen) := - ((r, c, v) => if (rnd.nextDouble() < 0.2) 0 else rnd.nextDouble() + 5.0)) - - // PCA Rotation matrix -- should also be orthonormal. 
- val (tr, _) = qr(Matrices.symmetricUniformView(spectrumLen, spectrumLen, rnd.nextInt) - 10.0) - - val input = (u %*%: diagv(spectrum)) %*% tr.t - val drmInput = drmParallelize(m = input, numPartitions = 2) - - // Calculate just first 10 principal factors and reduce dimensionality. - // Since we assert just validity of the s-pca, not stochastic error, we bump p parameter to - // ensure to zero stochastic error and assert only functional correctness of the method's pca- - // specific additions. - val k = 10 - - // Calculate just first 10 principal factors and reduce dimensionality. - var (drmPCA, _, s) = dspca(A = drmInput, k = 10, p = spectrumLen, q = 1) - // Un-normalized pca data: - drmPCA = drmPCA %*% diagv(s) - - val pca = drmPCA.checkpoint(CacheHint.NONE).collect - - // Of course, once we calculated the pca, the spectrum is going to be different since our originally - // generated input was not centered. So here, we'd just brute-solve pca to verify - val xi = input.colMeans() - for (r <- 0 until input.nrow) input(r, ::) -= xi - var (pcaControl, _, sControl) = svd(m = input) - pcaControl = (pcaControl %*%: diagv(sControl))(::, 0 until k) - - printf("pca:\n%s\n", pca(0 until 10, 0 until 10)) - printf("pcaControl:\n%s\n", pcaControl(0 until 10, 0 until 10)) - - (pca(0 until 10, 0 until 10).norm - pcaControl(0 until 10, 0 until 10).norm).abs should be < 1E-5 - - } - - test("als") { - - val rnd = RandomUtils.getRandom - - // Number of points - val m = 500 - val n = 500 - - // Length of actual spectrum - val spectrumLen = 40 - - // Create singluar values with decay - val spectrum = dvec((0 until spectrumLen).map(x => 300.0 * exp(-x) max 1e-3)) - printf("spectrum:%s\n", spectrum) - - // Create A as an ideal input - val inCoreA = (qr(Matrices.symmetricUniformView(m, spectrumLen, 1234))._1 %*%: diagv(spectrum)) %*% - qr(Matrices.symmetricUniformView(n, spectrumLen, 2345))._1.t - val drmA = drmParallelize(inCoreA, numPartitions = 2) - - // Decompose using ALS - val 
(drmU, drmV, rmse) = als(drmInput = drmA, k = 20).toTuple - val inCoreU = drmU.collect - val inCoreV = drmV.collect - - val predict = inCoreU %*% inCoreV.t - - printf("Control block:\n%s\n", inCoreA(0 until 3, 0 until 3)) - printf("ALS factorized approximation block:\n%s\n", predict(0 until 3, 0 until 3)) - - val err = (inCoreA - predict).norm - printf ("norm of residuals %f\n",err) - printf ("train iteration rmses: %s\n", rmse) - - err should be < 1e-2 - - } - -} From f06af4ff869f66f17eecd58984e09186f8f413cb Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Wed, 23 Jul 2014 10:14:14 -0700 Subject: [PATCH 18/34] MAHOUT-1500: ValueString API changes Signed-off-by: Anand Avati --- .../main/java/org/apache/mahout/h2obindings/H2OHdfs.java | 4 +++- .../main/java/org/apache/mahout/h2obindings/H2OHelper.java | 4 +++- .../main/java/org/apache/mahout/h2obindings/ops/Par.java | 5 ++++- .../java/org/apache/mahout/h2obindings/ops/RowRange.java | 7 +++++-- .../org/apache/mahout/h2obindings/ops/MapBlockHelper.scala | 4 +++- pom.xml | 2 +- 6 files changed, 19 insertions(+), 7 deletions(-) diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java index 2783cb2c1d..f7aef2a08b 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java @@ -25,6 +25,7 @@ import water.fvec.Frame; import water.fvec.Vec; import water.Futures; +import water.parser.ValueString; import org.apache.mahout.math.Vector; import org.apache.mahout.math.DenseVector; @@ -122,6 +123,7 @@ public static void drm_to_file (String filename, Frame frame, Vec labels) throws FileSystem fs = FileSystem.get(URI.create(uri), conf); SequenceFile.Writer writer = null; boolean is_sparse = H2OHelper.is_sparse(frame); + ValueString vstr = new ValueString(); if (labels != null) writer = SequenceFile.createWriter(fs, conf, path, Text.class, VectorWritable.class); @@ -139,7 
+141,7 @@ public static void drm_to_file (String filename, Frame frame, Vec labels) throws v.setQuick(c, frame.vecs()[c].at(r)); if (labels != null) - writer.append(new Text(labels.atStr(r)), new VectorWritable(v)); + writer.append(new Text(labels.atStr(vstr, r).toString()), new VectorWritable(v)); else writer.append(new IntWritable((int)r), new VectorWritable(v)); } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java index 399c593c94..adccb3952b 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java @@ -32,6 +32,7 @@ import water.fvec.Chunk; import water.fvec.NewChunk; import water.fvec.C0LChunk; +import water.parser.ValueString; import java.io.File; import java.io.IOException; @@ -96,8 +97,9 @@ public static Matrix matrix_from_frame(Frame frame, Vec labels) { /* If string keyed, set the stings as rowlabels */ if (labels != null) { HashMap map = new HashMap(); + ValueString vstr = new ValueString(); for (long i = 0; i < labels.length(); i++) { - map.put(labels.atStr(i), (int)i); + map.put(labels.atStr(vstr, i).toString(), (int)i); } m.setRowLabelBindings(map); } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java index eee2737f89..257ac7d3d8 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java @@ -22,6 +22,8 @@ import water.fvec.Vec; import water.fvec.Chunk; import water.fvec.NewChunk; +import water.parser.ValueString; + import scala.Tuple2; import org.apache.mahout.h2obindings.H2OHelper; @@ -48,12 +50,13 @@ public void map(Chunk chks[], NewChunk nc) { int chunk_size = chks[0].len(); Vec vins[] = frin.vecs(); long start = chks[0].start(); + ValueString vstr = new ValueString(); for (int r = 0; r < chunk_size; r++) { 
for (int c = 0; c < chks.length; c++) { chks[c].set0(r, vins[c].at(start + r)); } - nc.addStr(vin.atStr(start + r)); + nc.addStr(vin.atStr(vstr, start + r).toString()); } } }.doAll(1, frout).outputFrame(null, null).anyVec(); diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java index 8fbfdaa696..edebc6f5d2 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java @@ -24,6 +24,8 @@ import water.fvec.Vec; import water.fvec.Chunk; import water.fvec.NewChunk; +import water.parser.ValueString; + import scala.Tuple2; public class RowRange { @@ -62,15 +64,16 @@ public void map(Chunk chks[], NewChunk ncs[]) { public void map(Chunk chk, NewChunk nc) { int chunk_size = chk.len(); long chunk_start = chk.start(); + ValueString vstr = new ValueString(); if (chunk_start > R.end() || (chunk_start + chunk_size) < R.start()) return; for (int r = 0; r < chunk_size; r++) { - if (!R.contains (chunk_start + r)) + if (!R.contains(chunk_start + r)) continue; - nc.addStr(chk.atStr0(r)); + nc.addStr(chk.atStr0(vstr, r).toString()); } } }.doAll(1, VA).outputFrame(null, null).anyVec(); diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/ops/MapBlockHelper.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/ops/MapBlockHelper.scala index 3967652582..19bb3f4050 100644 --- a/h2o/src/main/scala/org/apache/mahout/h2obindings/ops/MapBlockHelper.scala +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/ops/MapBlockHelper.scala @@ -22,6 +22,7 @@ import org.apache.mahout.math.drm.BlockMapFunc import scala.reflect.ClassTag import water.fvec.{Vec,NewChunk} +import water.parser.ValueString object MapBlockHelper { def exec[K: ClassTag, R: ClassTag](bmf: Object, in: Matrix, startlong: Long, labels: Vec, nclabel: NewChunk): Matrix = { @@ -35,8 +36,9 @@ object MapBlockHelper { case `l` => startlong until 
(startlong + in.rowSize) toArray case `s` => { val arr = new Array[String](in.rowSize) + val vstr = new ValueString for (i <- 0 to in.rowSize) { - arr(i) = labels.atStr(i+startlong) + arr(i) = labels.atStr(vstr, i+startlong).toString } arr } diff --git a/pom.xml b/pom.xml index ccef99daa8..a81a71a915 100644 --- a/pom.xml +++ b/pom.xml @@ -110,7 +110,7 @@ 2.10 2.10.4 1.0.1 - 0.1.0-SNAPSHOT + 0.1.1-SNAPSHOT Jira From bf5831b316aeb74abfbb6cffe42ec85829dd3dad Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Wed, 23 Jul 2014 11:28:15 -0700 Subject: [PATCH 19/34] MAHOUT-1500: Simplify empty_frame() Signed-off-by: Anand Avati --- .../apache/mahout/h2obindings/H2OHelper.java | 24 ++++--------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java index adccb3952b..9a47b53895 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java @@ -25,13 +25,9 @@ import water.MRTask; import water.Futures; -import water.Key; -import water.DKV; import water.fvec.Frame; import water.fvec.Vec; import water.fvec.Chunk; -import water.fvec.NewChunk; -import water.fvec.C0LChunk; import water.parser.ValueString; import java.io.File; @@ -277,27 +273,17 @@ public static Tuple2 frame_from_matrix(Matrix m, int min_hint, int ex public static Frame empty_frame(long nrow, int ncol, int min_hint, int exact_hint) { int chunk_sz = chunk_size (nrow, ncol, min_hint, exact_hint); int nchunks = (int) ((nrow - 1) / chunk_sz) + 1; /* Final number of Chunks per Vec */ - Futures fs = new Futures(); Vec.VectorGroup vg = new Vec.VectorGroup(); - Key keys[] = vg.addVecs(ncol); long espc[] = new long[nchunks+1]; + final Vec[] vecs = new Vec[ncol]; + for (int i = 0; i < nchunks; i++) espc[i] = i * chunk_sz; espc[nchunks] = nrow; - final Vec[] vecs = new Vec[ncol]; + for (int i = 0; i < 
vecs.length; i++) - vecs[i] = new Vec(keys[i], espc); - new MRTask() { - protected void setupLocal() { - for (Vec v : vecs) { - for (int i = 0; i < v.nChunks(); i++) { - Key k = v.chunkKey(i); - if (k.home()) DKV.put(k, new C0LChunk(0L, v.chunkLen(i)), _fs); - } - } - for(Vec v : vecs) if(v._key.home()) DKV.put(v._key, v, _fs); - } - }.doAllNodes(); + vecs[i] = Vec.makeCon(0, null, vg, espc); + return new Frame(vecs); } } From 7c51c9a06cce48c7acac49da4d8f560adc4ecf08 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Wed, 23 Jul 2014 13:13:55 -0700 Subject: [PATCH 20/34] MAHOUT-1500: Use ArrayUtils.add() wherever possible Signed-off-by: Anand Avati --- .../main/java/org/apache/mahout/h2obindings/H2OHelper.java | 7 +++---- .../main/java/org/apache/mahout/h2obindings/ops/Atx.java | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java index 9a47b53895..c6626d1e27 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java @@ -29,6 +29,7 @@ import water.fvec.Vec; import water.fvec.Chunk; import water.parser.ValueString; +import water.util.ArrayUtils; import java.io.File; import java.io.IOException; @@ -135,8 +136,7 @@ public void map(Chunk chks[]) { } } public void reduce(MRTaskSum other) { - for (int i = 0; i < _sums.length; i++) - _sums[i] += other._sums[i]; + ArrayUtils.add(_sums, other._sums); } } return new DenseVector(new MRTaskSum().doAll(frame)._sums); @@ -185,8 +185,7 @@ public void map(Chunk chks[]) { } } public void reduce(MRTaskNonZero other) { - for (int i = 0; i < _sums.length; i++) - _sums[i] += other._sums[i]; + ArrayUtils.add(_sums, other._sums); } } return new DenseVector(new MRTaskNonZero().doAll(frame)._sums); diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java 
b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java index 3c6ee49fa6..3bb626b064 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java @@ -29,6 +29,7 @@ import water.fvec.Vec; import water.fvec.Chunk; import water.fvec.NewChunk; +import water.util.ArrayUtils; import scala.Tuple2; @@ -60,8 +61,7 @@ public void map(Chunk chks[]) { } } public void reduce(MRTaskAtx other) { - for (int i = 0; i < _atx.length; i++) - _atx[i] += other._atx[i]; + ArrayUtils.add(_atx, other._atx); } } From 8a2ae6f0582c34ded33b179ab32ebe5972e005af Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Wed, 23 Jul 2014 13:23:31 -0700 Subject: [PATCH 21/34] MAHOUT-1500: Style fixes Signed-off-by: Anand Avati --- .../org/apache/mahout/h2obindings/H2OHdfs.java | 6 +++--- .../org/apache/mahout/h2obindings/H2OHelper.java | 14 +++++++------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java index f7aef2a08b..7a04c958ee 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java @@ -45,7 +45,7 @@ public class H2OHdfs { - public static Tuple2 drm_from_file (String filename, int parMin) { + public static Tuple2 drm_from_file(String filename, int parMin) { long rows = 0; int cols = 0; Frame frame = null; @@ -82,7 +82,7 @@ public static Tuple2 drm_from_file (String filename, int parMin) { } reader.seek(start); - frame = H2OHelper.empty_frame (rows, cols, parMin, -1); + frame = H2OHelper.empty_frame(rows, cols, parMin, -1); writers = new Vec.Writer[cols]; for (int i = 0; i < writers.length; i++) writers[i] = frame.vecs()[i].open(); @@ -116,7 +116,7 @@ public static Tuple2 drm_from_file (String filename, int parMin) { return new Tuple2(frame, labels); } - public static void drm_to_file (String 
filename, Frame frame, Vec labels) throws java.io.IOException { + public static void drm_to_file(String filename, Frame frame, Vec labels) throws java.io.IOException { String uri = filename; Configuration conf = new Configuration(); Path path = new Path(uri); diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java index c6626d1e27..e004de144e 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java @@ -75,10 +75,10 @@ public void reduce(MRTaskNZ other) { public static Matrix matrix_from_frame(Frame frame, Vec labels) { Matrix m; - if (is_sparse (frame)) - m = new SparseMatrix ((int)frame.numRows(), frame.numCols()); + if (is_sparse(frame)) + m = new SparseMatrix((int)frame.numRows(), frame.numCols()); else - m = new DenseMatrix ((int)frame.numRows(), frame.numCols()); + m = new DenseMatrix((int)frame.numRows(), frame.numCols()); int c = 0; /* Fill matrix, column at a time */ @@ -213,7 +213,7 @@ private static int chunk_size(long nrow, int ncol, int min, int exact) { /* XXX: calculate based on cloud size and # of cpu */ parts_hint = 4; - chunk_sz = (int) (((nrow - 1) / parts_hint) + 1); + chunk_sz = (int)(((nrow - 1) / parts_hint) + 1); if (exact > 0) return chunk_sz; @@ -234,7 +234,7 @@ private static int chunk_size(long nrow, int ncol, int min, int exact) { */ public static Tuple2 frame_from_matrix(Matrix m, int min_hint, int exact_hint) { /* First create an empty (0-filled) frame of the required dimensions */ - Frame frame = empty_frame (m.rowSize(), m.columnSize(), min_hint, exact_hint); + Frame frame = empty_frame(m.rowSize(), m.columnSize(), min_hint, exact_hint); Vec labels = null; Vec.Writer writers[] = new Vec.Writer[m.columnSize()]; Futures closer = new Futures(); @@ -270,8 +270,8 @@ public static Tuple2 frame_from_matrix(Matrix m, int min_hint, int ex } public static Frame 
empty_frame(long nrow, int ncol, int min_hint, int exact_hint) { - int chunk_sz = chunk_size (nrow, ncol, min_hint, exact_hint); - int nchunks = (int) ((nrow - 1) / chunk_sz) + 1; /* Final number of Chunks per Vec */ + int chunk_sz = chunk_size(nrow, ncol, min_hint, exact_hint); + int nchunks = (int)((nrow - 1) / chunk_sz) + 1; /* Final number of Chunks per Vec */ Vec.VectorGroup vg = new Vec.VectorGroup(); long espc[] = new long[nchunks+1]; final Vec[] vecs = new Vec[ncol]; From 8a437c57ab8942aac7becd06b67f913c22e566f6 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Wed, 23 Jul 2014 13:50:39 -0700 Subject: [PATCH 22/34] MAHOUT-1500: Detect seqfile format If not sequence file (first three bytes == 'S', 'E', 'Q') then fallback to H2O parallel/efficient parser of CSV and other formats Signed-off-by: Anand Avati --- .../apache/mahout/h2obindings/H2OHdfs.java | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java index 7a04c958ee..79ad66bf6e 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java @@ -18,6 +18,7 @@ package org.apache.mahout.h2obindings; import java.io.IOException; +import java.io.File; import java.net.URI; import scala.Tuple2; @@ -26,6 +27,7 @@ import water.fvec.Vec; import water.Futures; import water.parser.ValueString; +import water.util.FrameUtils; import org.apache.mahout.math.Vector; import org.apache.mahout.math.DenseVector; @@ -35,6 +37,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Writable; @@ -45,7 +48,39 @@ public class H2OHdfs { + public static boolean is_seqfile(String filename) { + try { + 
String uri = filename; + Configuration conf = new Configuration(); + Path path = new Path(uri); + FileSystem fs = FileSystem.get(URI.create(uri), conf); + FSDataInputStream fin = fs.open(path); + byte seq[] = new byte[3]; + + fin.read(seq); + fin.close(); + + if (seq[0] == 'S' && seq[1] == 'E' && seq[2] == 'Q') + return true; + else + return false; + } catch (java.io.IOException e) { + return false; + } + } + public static Tuple2 drm_from_file(String filename, int parMin) { + try { + if (is_seqfile(filename)) + return drm_from_seqfile(filename, parMin); + else + return new Tuple2(FrameUtils.parseFrame(null,new File(filename)), null); + } catch (java.io.IOException e) { + return null; + } + } + + public static Tuple2 drm_from_seqfile(String filename, int parMin) { long rows = 0; int cols = 0; Frame frame = null; From 3446ed1545769fcca59c859c4dc47687066a2ae7 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Wed, 23 Jul 2014 14:06:48 -0700 Subject: [PATCH 23/34] MAHOUT-1500: Use int key as row number into matrix in parser Signed-off-by: Anand Avati --- .../apache/mahout/h2obindings/H2OHdfs.java | 30 ++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java index 79ad66bf6e..b998b4180d 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java @@ -42,6 +42,7 @@ import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.util.ReflectionUtils; @@ -94,11 +95,14 @@ public static Tuple2 drm_from_seqfile(String filename, int parMin) { FileSystem fs = FileSystem.get(URI.create(uri), conf); Vec.Writer writers[]; Vec.Writer labelwriter = null; + boolean is_int_key = false, is_long_key = false, 
is_string_key = false; reader = new SequenceFile.Reader(fs, path, conf); if (reader.getValueClass() != VectorWritable.class) { - System.out.println("ValueClass in file " + filename + "must be VectorWritable, but found " + reader.getValueClassName()); + System.out.println("ValueClass in file " + filename + + "must be VectorWritable, but found " + + reader.getValueClassName()); return null; } @@ -108,12 +112,25 @@ public static Tuple2 drm_from_seqfile(String filename, int parMin) { ReflectionUtils.newInstance(reader.getValueClass(), conf); long start = reader.getPosition(); + + if (reader.getKeyClass() == Text.class) + is_string_key = true; + else if (reader.getKeyClass() == LongWritable.class) + is_long_key = true; + else + is_int_key = true; + while (reader.next(key, value)) { if (cols == 0) { Vector v = value.get(); - cols = v.size(); + cols = Math.max(v.size(), cols); } - rows++; + if (is_long_key) + rows = Math.max(((LongWritable)(key)).get()+1, rows); + if (is_int_key) + rows = Math.max(((IntWritable)(key)).get()+1, rows); + if (is_string_key) + rows++; } reader.seek(start); @@ -130,11 +147,16 @@ public static Tuple2 drm_from_seqfile(String filename, int parMin) { long r = 0; while (reader.next(key, value)) { Vector v = value.get(); + if (is_long_key) + r = ((LongWritable)(key)).get(); + if (is_int_key) + r = ((IntWritable)(key)).get(); for (int c = 0; c < v.size(); c++) writers[c].set(r, v.getQuick(c)); if (labels != null) labelwriter.set(r, ((Text)key).toString()); - r++; + if (is_string_key) + r++; } Futures fus = new Futures(); From 6831ac21a5c7c9caa6b94867e72234ce55de10dc Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Wed, 23 Jul 2014 15:37:41 -0700 Subject: [PATCH 24/34] MAHOUT-1500: Do away with scala.Tuple2 usage Signed-off-by: Anand Avati --- .../apache/mahout/h2obindings/H2OHdfs.java | 15 ++++--- .../apache/mahout/h2obindings/H2OHelper.java | 14 ++++-- .../apache/mahout/h2obindings/drm/H2ODrm.java | 36 +++++++++++++++ 
.../apache/mahout/h2obindings/ops/ABt.java | 13 +++--- .../apache/mahout/h2obindings/ops/AewB.java | 13 +++--- .../mahout/h2obindings/ops/AewScalar.java | 11 +++-- .../org/apache/mahout/h2obindings/ops/At.java | 9 ++-- .../apache/mahout/h2obindings/ops/AtA.java | 9 ++-- .../apache/mahout/h2obindings/ops/AtB.java | 11 +++-- .../apache/mahout/h2obindings/ops/Atx.java | 9 ++-- .../org/apache/mahout/h2obindings/ops/Ax.java | 11 +++-- .../apache/mahout/h2obindings/ops/Cbind.java | 26 +++++------ .../mahout/h2obindings/ops/MapBlock.java | 14 +++--- .../apache/mahout/h2obindings/ops/Par.java | 10 ++--- .../mahout/h2obindings/ops/RowRange.java | 14 +++--- .../h2obindings/ops/TimesRightMatrix.java | 10 ++--- .../apache/mahout/h2obindings/H2OEngine.scala | 44 +++++++------------ .../h2obindings/drm/CheckpointedDrmH2O.scala | 18 +++----- 18 files changed, 152 insertions(+), 135 deletions(-) create mode 100644 h2o/src/main/java/org/apache/mahout/h2obindings/drm/H2ODrm.java diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java index b998b4180d..76ad7120c7 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java @@ -21,8 +21,6 @@ import java.io.File; import java.net.URI; -import scala.Tuple2; - import water.fvec.Frame; import water.fvec.Vec; import water.Futures; @@ -33,6 +31,7 @@ import org.apache.mahout.math.DenseVector; import org.apache.mahout.math.SequentialAccessSparseVector; import org.apache.mahout.math.VectorWritable; +import org.apache.mahout.h2obindings.drm.H2ODrm; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -70,18 +69,18 @@ public static boolean is_seqfile(String filename) { } } - public static Tuple2 drm_from_file(String filename, int parMin) { + public static H2ODrm drm_from_file(String filename, int parMin) { try { if (is_seqfile(filename)) return 
drm_from_seqfile(filename, parMin); else - return new Tuple2(FrameUtils.parseFrame(null,new File(filename)), null); + return new H2ODrm(FrameUtils.parseFrame(null,new File(filename))); } catch (java.io.IOException e) { return null; } } - public static Tuple2 drm_from_seqfile(String filename, int parMin) { + public static H2ODrm drm_from_seqfile(String filename, int parMin) { long rows = 0; int cols = 0; Frame frame = null; @@ -170,10 +169,12 @@ else if (reader.getKeyClass() == LongWritable.class) } finally { IOUtils.closeStream(reader); } - return new Tuple2(frame, labels); + return new H2ODrm(frame, labels); } - public static void drm_to_file(String filename, Frame frame, Vec labels) throws java.io.IOException { + public static void drm_to_file(String filename, H2ODrm Drm) throws java.io.IOException { + Frame frame = Drm.frame; + Vec labels = Drm.keys; String uri = filename; Configuration conf = new Configuration(); Path path = new Path(uri); diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java index e004de144e..1fde47ce30 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java @@ -37,7 +37,7 @@ import java.util.HashMap; import java.util.Arrays; -import scala.Tuple2; +import org.apache.mahout.h2obindings.drm.H2ODrm; public class H2OHelper { @@ -72,7 +72,9 @@ public void reduce(MRTaskNZ other) { Dense Matrix depending on number of missing elements in Frame. */ - public static Matrix matrix_from_frame(Frame frame, Vec labels) { + public static Matrix matrix_from_drm(H2ODrm Drm) { + Frame frame = Drm.frame; + Vec labels = Drm.keys; Matrix m; if (is_sparse(frame)) @@ -232,7 +234,7 @@ private static int chunk_size(long nrow, int ncol, int min, int exact) { /* Ingest a Matrix into an H2O Frame. H2O Frame is the "backing" data structure behind CheckpointedDrm. 
Steps: */ - public static Tuple2 frame_from_matrix(Matrix m, int min_hint, int exact_hint) { + public static H2ODrm drm_from_matrix(Matrix m, int min_hint, int exact_hint) { /* First create an empty (0-filled) frame of the required dimensions */ Frame frame = empty_frame(m.rowSize(), m.columnSize(), min_hint, exact_hint); Vec labels = null; @@ -266,7 +268,7 @@ public static Tuple2 frame_from_matrix(Matrix m, int min_hint, int ex closer.blockForPending(); - return new Tuple2(frame,labels); + return new H2ODrm(frame, labels); } public static Frame empty_frame(long nrow, int ncol, int min_hint, int exact_hint) { @@ -285,4 +287,8 @@ public static Frame empty_frame(long nrow, int ncol, int min_hint, int exact_hin return new Frame(vecs); } + + public static H2ODrm empty_drm(long nrow, int ncol, int min_hint, int exact_hint) { + return new H2ODrm(empty_frame(nrow, ncol, min_hint, exact_hint)); + } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/drm/H2ODrm.java b/h2o/src/main/java/org/apache/mahout/h2obindings/drm/H2ODrm.java new file mode 100644 index 0000000000..058ea0a996 --- /dev/null +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/drm/H2ODrm.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.h2obindings.drm; + +import water.fvec.Frame; +import water.fvec.Vec; + +public class H2ODrm { + public Frame frame; + public Vec keys; + + public H2ODrm(Frame m) { + frame = m; + keys = null; + } + + public H2ODrm(Frame m, Vec k) { + frame = m; + keys = k; + } +} diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java index 74fbbc4a86..59227a752b 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java @@ -18,6 +18,7 @@ package org.apache.mahout.h2obindings.ops; import org.apache.mahout.h2obindings.H2OHelper; +import org.apache.mahout.h2obindings.drm.H2ODrm; import water.MRTask; import water.fvec.Frame; @@ -25,14 +26,12 @@ import water.fvec.Chunk; import water.fvec.NewChunk; -import scala.Tuple2; - public class ABt { /* Calculate AB' */ - public static Tuple2 ABt(Tuple2 TA, Tuple2 TB) { - Frame A = TA._1(); - Vec VA = TA._2(); - final Frame B = TB._1(); + public static H2ODrm ABt(H2ODrm DrmA, H2ODrm DrmB) { + Frame A = DrmA.frame; + Vec keys = DrmA.keys; + final Frame B = DrmB.frame; int ABt_cols = (int)B.numRows(); /* ABt is written into ncs[] with an MRTask on A, and therefore will @@ -59,6 +58,6 @@ public void map(Chunk chks[], NewChunk ncs[]) { }.doAll(ABt_cols, A).outputFrame(null, null); /* Carry forward labels of A blindly into ABt */ - return new Tuple2(ABt, VA); + return new H2ODrm(ABt, keys); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java index cba0d7dbc9..a1b0f1e543 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java @@ -18,6 +18,7 @@ package 
org.apache.mahout.h2obindings.ops; import org.apache.mahout.h2obindings.H2OHelper; +import org.apache.mahout.h2obindings.drm.H2ODrm; import water.MRTask; import water.fvec.Frame; @@ -25,14 +26,12 @@ import water.fvec.Chunk; import water.fvec.NewChunk; -import scala.Tuple2; - public class AewB { /* Element-wise DRM-DRM operations */ - public static Tuple2 AewB(Tuple2 AT, Tuple2 BT, final String op) { - final Frame A = AT._1(); - final Frame B = BT._1(); - Vec VA = AT._2(); + public static H2ODrm AewB(H2ODrm DrmA, H2ODrm DrmB, final String op) { + final Frame A = DrmA.frame; + final Frame B = DrmB.frame; + Vec keys = DrmA.keys; int AewB_cols = A.numCols(); /* AewB is written into ncs[] with an MRTask on A, and therefore will @@ -69,6 +68,6 @@ public void map(Chunk chks[], NewChunk ncs[]) { }.doAll(AewB_cols, A).outputFrame(null, null); /* Carry forward labels of A blindly into ABt */ - return new Tuple2(AewB, VA); + return new H2ODrm(AewB, keys); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java index 6af4991b39..f998009bb5 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java @@ -18,6 +18,7 @@ package org.apache.mahout.h2obindings.ops; import org.apache.mahout.h2obindings.H2OHelper; +import org.apache.mahout.h2obindings.drm.H2ODrm; import water.MRTask; import water.fvec.Frame; @@ -25,13 +26,11 @@ import water.fvec.Chunk; import water.fvec.NewChunk; -import scala.Tuple2; - public class AewScalar { /* Element-wise DRM-DRM operations */ - public static Tuple2 AewScalar(final Tuple2 TA, final double s, final String op) { - Frame A = TA._1(); - Vec VA = TA._2(); + public static H2ODrm AewScalar(H2ODrm DrmA, final double s, final String op) { + Frame A = DrmA.frame; + Vec keys = DrmA.keys; int AewScalar_cols = A.numCols(); /* AewScalar is written into ncs[] with an MRTask 
on A, and therefore will @@ -64,6 +63,6 @@ public void map(Chunk chks[], NewChunk ncs[]) { }.doAll(AewScalar_cols, A).outputFrame(null, null); /* Carry forward labels of A blindly into ABt */ - return new Tuple2(AewScalar, VA); + return new H2ODrm(AewScalar, keys); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java index c7cc7dd8a1..3458865c31 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java @@ -18,6 +18,7 @@ package org.apache.mahout.h2obindings.ops; import org.apache.mahout.h2obindings.H2OHelper; +import org.apache.mahout.h2obindings.drm.H2ODrm; import water.MRTask; import water.fvec.Frame; @@ -25,12 +26,10 @@ import water.fvec.Chunk; import water.fvec.NewChunk; -import scala.Tuple2; - public class At { /* Calculate A' (transpose) */ - public static Tuple2 At(Tuple2 T) { - final Frame A = T._1(); + public static H2ODrm At(H2ODrm DrmA) { + final Frame A = DrmA.frame; /* First create a new frame of the required dimensions, A.numCols() rows and A.numRows() columns. 
*/ @@ -54,6 +53,6 @@ public void map(Chunk chks[]) { }.doAll(At); /* At is NOT similarly partitioned as A, drop labels */ - return new Tuple2(At, null); + return new H2ODrm(At); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java index dba03a1139..ca12bf810d 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java @@ -18,6 +18,7 @@ package org.apache.mahout.h2obindings.ops; import org.apache.mahout.h2obindings.H2OHelper; +import org.apache.mahout.h2obindings.drm.H2ODrm; import water.MRTask; import water.fvec.Frame; @@ -25,12 +26,10 @@ import water.fvec.Chunk; import water.fvec.NewChunk; -import scala.Tuple2; - public class AtA { /* Calculate A'A */ - public static Tuple2 AtA(Tuple2 TA) { - final Frame A = TA._1(); + public static H2ODrm AtA(H2ODrm DrmA) { + final Frame A = DrmA.frame; /* First create an empty Frame of the required dimensions */ Frame AtA = H2OHelper.empty_frame(A.numCols(), A.numCols(), -1, -1); @@ -59,6 +58,6 @@ public void map(Chunk chks[]) { }.doAll(AtA); /* AtA is NOT similarly partitioned as A, drop labels */ - return new Tuple2(AtA, null); + return new H2ODrm(AtA); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java index c229e43913..68f6d7ded8 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java @@ -18,6 +18,7 @@ package org.apache.mahout.h2obindings.ops; import org.apache.mahout.h2obindings.H2OHelper; +import org.apache.mahout.h2obindings.drm.H2ODrm; import water.MRTask; import water.fvec.Frame; @@ -25,13 +26,11 @@ import water.fvec.Chunk; import water.fvec.NewChunk; -import scala.Tuple2; - public class AtB { /* Calculate A'B */ - public static Tuple2 AtB(Tuple2 TA, Tuple2 TB) { - final Frame A = 
TA._1(); - final Frame B = TB._1(); + public static H2ODrm AtB(H2ODrm DrmA, H2ODrm DrmB) { + final Frame A = DrmA.frame; + final Frame B = DrmB.frame; /* First create an empty frame of the required dimensions */ Frame AtB = H2OHelper.empty_frame(A.numCols(), B.numCols(), -1, -1); @@ -62,6 +61,6 @@ public void map(Chunk chks[]) { }.doAll(AtB); /* AtB is NOT similarly partitioned as A, drop labels */ - return new Tuple2(AtB, null); + return new H2ODrm(AtB); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java index 3bb626b064..c0eb739fc1 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java @@ -23,6 +23,7 @@ import org.apache.mahout.math.DenseMatrix; import org.apache.mahout.h2obindings.H2OHelper; import org.apache.mahout.h2obindings.drm.H2OBCast; +import org.apache.mahout.h2obindings.drm.H2ODrm; import water.MRTask; import water.fvec.Frame; @@ -31,12 +32,10 @@ import water.fvec.NewChunk; import water.util.ArrayUtils; -import scala.Tuple2; - public class Atx { /* Calculate A'x (where x is an in-core Vector) */ - public static Tuple2 Atx(Tuple2 TA, Vector x) { - Frame A = TA._1(); + public static H2ODrm Atx(H2ODrm DrmA, Vector x) { + Frame A = DrmA.frame; final H2OBCast bx = new H2OBCast(x); /* A'x is computed into _atx[] with an MRTask on A (with @@ -72,6 +71,6 @@ using existing helper functions (creating a Matrix Vector v = new DenseVector(new MRTaskAtx().doAll(A)._atx); Matrix m = new DenseMatrix(A.numCols(), 1); m.assignColumn(0, v); - return H2OHelper.frame_from_matrix(m, -1, -1); + return H2OHelper.drm_from_matrix(m, -1, -1); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java index 604e5db531..e292e9994c 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java +++ 
b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java @@ -20,6 +20,7 @@ import org.apache.mahout.math.Vector; import org.apache.mahout.h2obindings.H2OHelper; import org.apache.mahout.h2obindings.drm.H2OBCast; +import org.apache.mahout.h2obindings.drm.H2ODrm; import water.MRTask; import water.fvec.Frame; @@ -27,13 +28,11 @@ import water.fvec.Chunk; import water.fvec.NewChunk; -import scala.Tuple2; - public class Ax { /* Calculate Ax (where x is an in-core Vector) */ - public static Tuple2 Ax(Tuple2 TA, Vector x) { - Frame A = TA._1(); - Vec VA = TA._2(); + public static H2ODrm Ax(H2ODrm DrmA, Vector x) { + Frame A = DrmA.frame; + Vec keys = DrmA.keys; final H2OBCast bx = new H2OBCast(x); /* Ax is written into nc (single element, not array) with an MRTask on A, @@ -57,6 +56,6 @@ public void map(Chunk chks[], NewChunk nc) { }.doAll(1, A).outputFrame(null, null); /* Carry forward labels of A blindly into ABt */ - return new Tuple2(Ax, VA); + return new H2ODrm(Ax, keys); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Cbind.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Cbind.java index de1e95245c..60314625ec 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Cbind.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Cbind.java @@ -23,28 +23,28 @@ import water.fvec.Chunk; import water.fvec.NewChunk; -import scala.Tuple2; import org.apache.mahout.h2obindings.H2OHelper; +import org.apache.mahout.h2obindings.drm.H2ODrm; public class Cbind { - /* R's cbind like operator, on TA and TB */ - public static Tuple2 Cbind(Tuple2 TA, Tuple2 TB) { - Frame fra = TA._1(); - Vec va = TA._2(); - Frame frb = TB._1(); - Vec vb = TB._2(); + /* R's cbind like operator, on DrmA and DrmB */ + public static H2ODrm Cbind(H2ODrm DrmA, H2ODrm DrmB) { + Frame fra = DrmA.frame; + Vec keysa = DrmA.keys; + Frame frb = DrmB.frame; + Vec keysb = DrmB.keys; /* If A and B are similarly partitioned, .. 
*/ if (fra.anyVec().group() == frb.anyVec().group()) /* .. then, do a light weight zip() */ - return zip(fra, va, frb, vb); + return zip(fra, keysa, frb, keysb); else /* .. else, do a heavy weight join() which involves moving data over the wire */ - return join(fra, va, frb, vb); + return join(fra, keysa, frb, keysb); } /* Light weight zip(), no data movement */ - private static Tuple2 zip(final Frame fra, final Vec va, final Frame frb, final Vec vb) { + private static H2ODrm zip(final Frame fra, final Vec keysa, final Frame frb, final Vec keysb) { /* Create a new Vec[] to hold the concatenated list of A and B's column vectors */ Vec vecs[] = new Vec[fra.vecs().length + frb.vecs().length]; int d = 0; @@ -57,11 +57,11 @@ private static Tuple2 zip(final Frame fra, final Vec va, final Frame /* and create a new Frame with the combined list of column Vecs */ Frame fr = new Frame(vecs); /* Finally, inherit A's string labels into the result */ - return new Tuple2 (fr, va); + return new H2ODrm(fr, keysa); } /* heavy weight join(), involves moving data */ - private static Tuple2 join(final Frame fra, final Vec va, final Frame frb, final Vec vb) { + private static H2ODrm join(final Frame fra, final Vec keysa, final Frame frb, final Vec keysb) { /* The plan is to re-organize B to be "similarly partitioned as A", and then zip() */ Vec bvecs[] = new Vec[frb.vecs().length]; @@ -89,6 +89,6 @@ public void map(Chunk chks[]) { }.doAll(bvecs); /* now that bvecs[] is compatible, just zip'em'up */ - return zip(fra, va, new Frame(bvecs), null); + return zip(fra, keysa, new Frame(bvecs), null); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java index 5cfc936574..5b52c9d7de 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java @@ -19,6 +19,7 @@ import org.apache.mahout.math.Matrix; import 
org.apache.mahout.h2obindings.H2OBlockMatrix; +import org.apache.mahout.h2obindings.drm.H2ODrm; import water.MRTask; import water.fvec.Frame; @@ -30,13 +31,12 @@ import java.util.Arrays; import scala.reflect.ClassTag; -import scala.Tuple2; public class MapBlock { - public static Tuple2 exec(Tuple2 AT, int ncol, Object bmf, final boolean is_r_str, - final ClassTag k, final ClassTag r) { - Frame A = AT._1(); - Vec VA = AT._2(); + public static H2ODrm exec(H2ODrm DrmA, int ncol, Object bmf, final boolean is_r_str, + final ClassTag k, final ClassTag r) { + Frame A = DrmA.frame; + Vec keys = DrmA.keys; class MRTaskBMF extends MRTask { Serializable _bmf; @@ -90,7 +90,7 @@ public void map(Chunk chks[], NewChunk ncs[]) { } int ncol_res = ncol + (is_r_str ? 1 : 0); - Frame fmap = new MRTaskBMF(bmf, VA).doAll(ncol_res, A).outputFrame(null, null); + Frame fmap = new MRTaskBMF(bmf, keys).doAll(ncol_res, A).outputFrame(null, null); Vec vmap = null; if (is_r_str) { /* If output was String keyed, then the last Vec in fmap is the String vec. 
@@ -100,6 +100,6 @@ If so, peel it out into a separate Vec (vmap) and set fmap to be the vmap = fmap.vecs()[ncol]; fmap = new Frame(Arrays.copyOfRange(fmap.vecs(), 0, ncol)); } - return new Tuple2(fmap,vmap); + return new H2ODrm(fmap, vmap); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java index 257ac7d3d8..2c01ccd653 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java @@ -24,13 +24,13 @@ import water.fvec.NewChunk; import water.parser.ValueString; -import scala.Tuple2; import org.apache.mahout.h2obindings.H2OHelper; +import org.apache.mahout.h2obindings.drm.H2ODrm; public class Par { - public static Tuple2 exec(Tuple2 TA, int min, int exact) { - final Frame frin = TA._1(); - final Vec vin = TA._2(); + public static H2ODrm exec(H2ODrm DrmA, int min, int exact) { + final Frame frin = DrmA.frame; + final Vec vin = DrmA.keys; /* First create a new empty Frame with the required partitioning */ Frame frout = H2OHelper.empty_frame(frin.numRows(), frin.numCols(), min, exact); @@ -79,6 +79,6 @@ public void map(Chunk chks[]) { }.doAll(frout); } - return new Tuple2 (frout, vout); + return new H2ODrm(frout, vout); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java index edebc6f5d2..0a22581c7a 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java @@ -26,13 +26,13 @@ import water.fvec.NewChunk; import water.parser.ValueString; -import scala.Tuple2; +import org.apache.mahout.h2obindings.drm.H2ODrm; public class RowRange { /* Filter operation */ - public static Tuple2 RowRange(Tuple2 TA, final Range R) { - Frame A = TA._1(); - Vec VA = TA._2(); + public static H2ODrm RowRange(H2ODrm DrmA, final Range R) { + 
Frame A = DrmA.frame; + Vec keys = DrmA.keys; /* Run a filtering MRTask on A. If row number falls within R.start() and R.end(), then the row makes it into the output @@ -57,7 +57,7 @@ public void map(Chunk chks[], NewChunk ncs[]) { } }.doAll(A.numCols(), A).outputFrame(null, null); - Vec Vrr = (VA == null) ? null : new MRTask() { + Vec Vrr = (keys == null) ? null : new MRTask() { /* This is a String keyed DRM. Do the same thing as above, but this time just one column of Strings. */ @@ -76,8 +76,8 @@ public void map(Chunk chk, NewChunk nc) { nc.addStr(chk.atStr0(vstr, r).toString()); } } - }.doAll(1, VA).outputFrame(null, null).anyVec(); + }.doAll(1, keys).outputFrame(null, null).anyVec(); - return new Tuple2(Arr, Vrr); + return new H2ODrm(Arr, Vrr); } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java index 02c924c9db..9633ecc2be 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java @@ -22,19 +22,19 @@ import org.apache.mahout.math.DiagonalMatrix; import org.apache.mahout.h2obindings.H2OHelper; import org.apache.mahout.h2obindings.drm.H2OBCast; +import org.apache.mahout.h2obindings.drm.H2ODrm; import water.MRTask; import water.fvec.Frame; import water.fvec.Vec; import water.fvec.Chunk; import water.fvec.NewChunk; -import scala.Tuple2; public class TimesRightMatrix { /* Multiple with in-core Matrix */ - public static Tuple2 TimesRightMatrix(Tuple2 TA, Matrix B) { - Frame A = TA._1(); - Vec VA = TA._2(); + public static H2ODrm TimesRightMatrix(H2ODrm DrmA, Matrix B) { + Frame A = DrmA.frame; + Vec keys = DrmA.keys; Frame AinCoreB = null; if (B instanceof DiagonalMatrix) @@ -42,7 +42,7 @@ public static Tuple2 TimesRightMatrix(Tuple2 TA, Matrix B) else AinCoreB = AinCoreB_common(A, B); - return new Tuple2(AinCoreB, VA); + return new 
H2ODrm(AinCoreB, keys); } /* diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala index 1178adc38e..b00332b583 100644 --- a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala @@ -24,20 +24,18 @@ import org.apache.mahout.math.drm.logical._ import org.apache.mahout.h2obindings.ops._ import org.apache.mahout.h2obindings.drm._ -import water.fvec.{Frame,Vec} - object H2OEngine extends DistributedEngine { def colMeans[K:ClassTag](drm: CheckpointedDrm[K]): Vector = - H2OHelper.colMeans (drm.frame) + H2OHelper.colMeans(drm.h2odrm.frame) def colSums[K:ClassTag](drm: CheckpointedDrm[K]): Vector = - H2OHelper.colSums (drm.frame) + H2OHelper.colSums(drm.h2odrm.frame) def norm[K: ClassTag](drm: CheckpointedDrm[K]): Double = - H2OHelper.sumSqr (drm.frame) + H2OHelper.sumSqr(drm.h2odrm.frame) def numNonZeroElementsPerColumn[K: ClassTag](drm: CheckpointedDrm[K]): Vector = - H2OHelper.nonZeroCnt (drm.frame) + H2OHelper.nonZeroCnt(drm.h2odrm.frame) def drmBroadcast(m: Matrix)(implicit dc: DistributedContext): BCast[Matrix] = new H2OBCast(m) @@ -45,37 +43,27 @@ object H2OEngine extends DistributedEngine { def drmBroadcast(v: Vector)(implicit dc: DistributedContext): BCast[Vector] = new H2OBCast(v) - def drmFromHDFS(path: String, parMin: Int = 0)(implicit dc: DistributedContext): CheckpointedDrm[_] = { - val (frame, labels) = H2OHdfs.drm_from_file (path, parMin) - new CheckpointedDrmH2O (frame, labels, dc) - } + def drmFromHDFS(path: String, parMin: Int = 0)(implicit dc: DistributedContext): CheckpointedDrm[_] = + new CheckpointedDrmH2O(H2OHdfs.drm_from_file(path, parMin), dc) def drmParallelizeEmpty(nrow: Int, ncol: Int, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[Int] = - new CheckpointedDrmH2O[Int] (H2OHelper.empty_frame (nrow, ncol, numPartitions, -1), dc) + new 
CheckpointedDrmH2O[Int](H2OHelper.empty_drm(nrow, ncol, numPartitions, -1), dc) def drmParallelizeEmptyLong(nrow: Long, ncol: Int, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[Long] = - new CheckpointedDrmH2O[Long] (H2OHelper.empty_frame (nrow, ncol, numPartitions, -1), dc) + new CheckpointedDrmH2O[Long](H2OHelper.empty_drm(nrow, ncol, numPartitions, -1), dc) - def drmParallelizeWithRowIndices(m: Matrix, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[Int] = { - val (frame, labels) = H2OHelper.frame_from_matrix (m, numPartitions, -1) - // assert labels == null - new CheckpointedDrmH2O[Int] (frame, labels, dc) - } + def drmParallelizeWithRowIndices(m: Matrix, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[Int] = + new CheckpointedDrmH2O[Int](H2OHelper.drm_from_matrix(m, numPartitions, -1), dc) - def drmParallelizeWithRowLabels(m: Matrix, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[String] = { - val (frame, labels) = H2OHelper.frame_from_matrix (m, numPartitions, -1) - // assert labels != null - new CheckpointedDrmH2O[String] (frame, labels, dc) - } + def drmParallelizeWithRowLabels(m: Matrix, numPartitions: Int)(implicit dc: DistributedContext): CheckpointedDrm[String] = + new CheckpointedDrmH2O[String](H2OHelper.drm_from_matrix(m, numPartitions, -1), dc) - def toPhysical[K:ClassTag](plan: DrmLike[K], ch: CacheHint.CacheHint): CheckpointedDrm[K] = { - val (frame, labels) = tr2phys (plan) - new CheckpointedDrmH2O[K] (frame, labels, plan.context) - } + def toPhysical[K:ClassTag](plan: DrmLike[K], ch: CacheHint.CacheHint): CheckpointedDrm[K] = + new CheckpointedDrmH2O[K](tr2phys(plan), plan.context) // H2O specific - private def tr2phys[K: ClassTag](oper: DrmLike[K]): (Frame, Vec) = { + private def tr2phys[K: ClassTag](oper: DrmLike[K]): H2ODrm = { oper match { case OpAtAnyKey(_) => throw new IllegalArgumentException("\"A\" must be Int-keyed in this A.t expression.") @@ 
-95,7 +83,7 @@ object H2OEngine extends DistributedEngine { case blockOp: OpMapBlock[K, _] => MapBlock.exec(tr2phys(blockOp.A)(blockOp.classTagA), blockOp.ncol, blockOp.bmf, (blockOp.classTagK == implicitly[ClassTag[String]]), blockOp.classTagA, blockOp.classTagK) case op@OpPar(a, m, e) => Par.exec(tr2phys(a)(op.classTagA), m, e) - case cp: CheckpointedDrm[K] => (cp.frame, cp.labels) + case cp: CheckpointedDrm[K] => cp.h2odrm case _ => throw new IllegalArgumentException("Internal:Optimizer has no exec policy for operator %s." .format(oper)) } diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala index b987d1b3ae..581eb94787 100644 --- a/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala @@ -6,31 +6,25 @@ import RLikeOps._ import org.apache.mahout.math.drm._ import org.apache.mahout.h2obindings._ -import water.fvec.{Frame,Vec} - import scala.reflect._ /** H2O-specific optimizer-checkpointed DRM. 
*/ class CheckpointedDrmH2O[K: ClassTag]( - val frame: Frame, - val labels: Vec, + val h2odrm: H2ODrm, val context: DistributedContext ) extends CheckpointedDrm[K] { - def this(frame: Frame, context: DistributedContext) = - this(frame, null, context) - - def collect: Matrix = H2OHelper.matrix_from_frame(frame, labels) + def collect: Matrix = H2OHelper.matrix_from_drm(h2odrm) /* XXX: call frame.remove */ def uncache(): Unit = return - def writeDRM(path: String): Unit = H2OHdfs.drm_to_file (path, frame, labels) + def writeDRM(path: String): Unit = H2OHdfs.drm_to_file(path, h2odrm) def checkpoint(cacheHint: CacheHint.CacheHint): CheckpointedDrm[K] = this - def ncol: Int = frame.numCols + def ncol: Int = h2odrm.frame.numCols - def nrow: Long = frame.numRows + def nrow: Long = h2odrm.frame.numRows - protected[mahout] def partitioningTag: Long = frame.anyVec.group.hashCode + protected[mahout] def partitioningTag: Long = h2odrm.frame.anyVec.group.hashCode } From c3b1d698ad9741daf39ec5d45c0d2bc07dc3d9ca Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Fri, 25 Jul 2014 14:23:15 -0700 Subject: [PATCH 25/34] MAHOUT-1500: combine serializeMatrix and serializeVector into single method Signed-off-by: Anand Avati --- .../mahout/h2obindings/drm/H2OBCast.java | 28 ++++--------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/drm/H2OBCast.java b/h2o/src/main/java/org/apache/mahout/h2obindings/drm/H2OBCast.java index 8e9e70cd03..40fa13d286 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/drm/H2OBCast.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/drm/H2OBCast.java @@ -22,6 +22,9 @@ import org.apache.mahout.math.Vector; import org.apache.mahout.math.MatrixWritable; import org.apache.mahout.math.VectorWritable; + +import org.apache.hadoop.io.Writable; + import java.io.Serializable; import java.io.ByteArrayOutputStream; import java.io.ByteArrayInputStream; @@ -30,11 +33,6 @@ /* Handle Matrix and 
Vector separately so that we can live with just importing MatrixWritable and VectorWritable. - - We could collapse the two into a single method using Writable, - but then we would have to import org.apache.hadoop.Writable, - pick a hadoop distribution in pom.xml etc. Instead let - mahout-mrlegacy solve that transitively for us. */ public class H2OBCast implements BCast, Serializable { @@ -46,10 +44,10 @@ public H2OBCast(T o) { obj = o; if (o instanceof Matrix) { - buf = serializeMatrix((Matrix)o); + buf = serialize(new MatrixWritable((Matrix)o)); is_matrix = true; } else if (o instanceof Vector) { - buf = serializeVector((Vector)o); + buf = serialize(new VectorWritable((Vector)o)); } else { throw new IllegalArgumentException("Only Matrix or Vector supported for now"); } @@ -61,21 +59,7 @@ public T value() { return obj; } - private byte[] serializeMatrix(Matrix m) { - MatrixWritable w = new MatrixWritable(m); - ByteArrayOutputStream bos = new ByteArrayOutputStream(); - try { - ObjectOutputStream oos = new ObjectOutputStream(bos); - w.write(oos); - oos.close(); - } catch (java.io.IOException e) { - return null; - } - return bos.toByteArray(); - } - - private byte[] serializeVector(Vector v) { - VectorWritable w = new VectorWritable(v); + private byte[] serialize(Writable w) { ByteArrayOutputStream bos = new ByteArrayOutputStream(); try { ObjectOutputStream oos = new ObjectOutputStream(bos); From 06633e1f96460140dbb80ff5c9281e147d4aebff Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Tue, 29 Jul 2014 13:03:39 -0700 Subject: [PATCH 26/34] MAHOUT-1500: Implement canHaveMissingRows() Always false h2o backend, by design. 
Signed-off-by: Anand Avati --- .../apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala index 581eb94787..8d10cbce3c 100644 --- a/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/drm/CheckpointedDrmH2O.scala @@ -16,7 +16,7 @@ class CheckpointedDrmH2O[K: ClassTag]( def collect: Matrix = H2OHelper.matrix_from_drm(h2odrm) /* XXX: call frame.remove */ - def uncache(): Unit = return + def uncache(): this.type = this def writeDRM(path: String): Unit = H2OHdfs.drm_to_file(path, h2odrm) @@ -26,5 +26,7 @@ class CheckpointedDrmH2O[K: ClassTag]( def nrow: Long = h2odrm.frame.numRows + def canHaveMissingRows: Boolean = false + protected[mahout] def partitioningTag: Long = h2odrm.frame.anyVec.group.hashCode } From efa7b8197a75b81ca29967efc26d5a3048e11e82 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Tue, 29 Jul 2014 15:30:55 -0700 Subject: [PATCH 27/34] MAHOUT-1500: Implement empty_frame() with pre-created VectorGroup Signed-off-by: Anand Avati --- .../main/java/org/apache/mahout/h2obindings/H2OHelper.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java index 1fde47ce30..41fe5c76be 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java @@ -272,9 +272,14 @@ public static H2ODrm drm_from_matrix(Matrix m, int min_hint, int exact_hint) { } public static Frame empty_frame(long nrow, int ncol, int min_hint, int exact_hint) { + Vec.VectorGroup vg = new Vec.VectorGroup(); + + return empty_frame(nrow, ncol, min_hint, exact_hint, vg); + } + 
+ public static Frame empty_frame(long nrow, int ncol, int min_hint, int exact_hint, Vec.VectrorGroup vg) { int chunk_sz = chunk_size(nrow, ncol, min_hint, exact_hint); int nchunks = (int)((nrow - 1) / chunk_sz) + 1; /* Final number of Chunks per Vec */ - Vec.VectorGroup vg = new Vec.VectorGroup(); long espc[] = new long[nchunks+1]; final Vec[] vecs = new Vec[ncol]; From 28aa37d48ad606a0ecdb804626d3adb109b22e85 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Tue, 29 Jul 2014 15:27:12 -0700 Subject: [PATCH 28/34] MAHOUT-1500: Implement Rbind() Signed-off-by: Anand Avati --- .../apache/mahout/h2obindings/H2OHelper.java | 2 +- .../apache/mahout/h2obindings/ops/Rbind.java | 79 +++++++++++++++++++ .../apache/mahout/h2obindings/H2OEngine.scala | 1 + 3 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 h2o/src/main/java/org/apache/mahout/h2obindings/ops/Rbind.java diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java index 41fe5c76be..9ecebed2f3 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java @@ -277,7 +277,7 @@ public static Frame empty_frame(long nrow, int ncol, int min_hint, int exact_hin return empty_frame(nrow, ncol, min_hint, exact_hint, vg); } - public static Frame empty_frame(long nrow, int ncol, int min_hint, int exact_hint, Vec.VectrorGroup vg) { + public static Frame empty_frame(long nrow, int ncol, int min_hint, int exact_hint, Vec.VectorGroup vg) { int chunk_sz = chunk_size(nrow, ncol, min_hint, exact_hint); int nchunks = (int)((nrow - 1) / chunk_sz) + 1; /* Final number of Chunks per Vec */ long espc[] = new long[nchunks+1]; diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Rbind.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Rbind.java new file mode 100644 index 0000000000..a781a9346a --- /dev/null +++ 
b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Rbind.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.h2obindings.ops; + +import water.MRTask; +import water.fvec.Frame; +import water.fvec.Vec; +import water.fvec.Chunk; +import water.fvec.NewChunk; +import water.parser.ValueString; + +import org.apache.mahout.h2obindings.H2OHelper; +import org.apache.mahout.h2obindings.drm.H2ODrm; + +public class Rbind { + /* R's rbind like operator, on DrmA and DrmB */ + public static H2ODrm Rbind(H2ODrm DrmA, H2ODrm DrmB) { + final Frame fra = DrmA.frame; + final Vec keysa = DrmA.keys; + final Frame frb = DrmB.frame; + final Vec keysb = DrmB.keys; + + /* Create new frame and copy A's data at the top, and B's data below. + Create the frame in the same VectorGroup as A, so A's data does not + cross the wire during copy. B's data could potentially cross the wire. 
+ */ + Frame frbind = H2OHelper.empty_frame(fra.numRows() + frb.numRows(), fra.numCols(), + -1, -1, fra.anyVec().group()); + Vec keys = null; + + MRTask task = new MRTask() { + public void map(Chunk chks[], NewChunk nc) { + Vec A_vecs[] = fra.vecs(); + Vec B_vecs[] = frb.vecs(); + long A_rows = fra.numRows(); + long B_rows = frb.numRows(); + long start = chks[0].start(); + int chunk_size = chks[0].len(); + ValueString vstr = new ValueString(); + + for (int r = 0; r < chunk_size; r++) { + for (int c = 0; c < chks.length; c++) { + if (r + start < A_rows) { + chks[c].set0(r, A_vecs[c].at(r + start)); + if (keysa != null) + nc.addStr(keysa.atStr(vstr, r + start).toString()); + } else { + chks[c].set0(r, B_vecs[c].at(r + start - A_rows)); + if (keysb != null) + nc.addStr(keysb.atStr(vstr, r + start - A_rows).toString()); + } + } + } + } + }; + + if (keysa == null) + keys = task.doAll(1, frbind).outputFrame(null, null).anyVec(); + else + task.doAll(frbind); + + return new H2ODrm(frbind, keys); + } +} diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala index b00332b583..860fb84ae0 100644 --- a/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/H2OEngine.scala @@ -76,6 +76,7 @@ object H2OEngine extends DistributedEngine { case op@OpAewB(a, b, opId) => AewB.AewB(tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB), opId) // Non arithmetic case op@OpCbind(a, b) => Cbind.Cbind(tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB)) + case op@OpRbind(a, b) => Rbind.Rbind(tr2phys(a)(op.classTagA), tr2phys(b)(op.classTagB)) case op@OpAewScalar(a, s, opId) => AewScalar.AewScalar(tr2phys(a)(op.classTagA), s, opId) case op@OpRowRange(a, r) => RowRange.RowRange(tr2phys(a)(op.classTagA), r) case op@OpTimesRightMatrix(a, m) => TimesRightMatrix.TimesRightMatrix(tr2phys(a)(op.classTagA), m) From 
9385cc0e4fd977b6b712f5d3a67625ffdd684c02 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Thu, 31 Jul 2014 21:12:41 -0700 Subject: [PATCH 29/34] MAHOUT-1500: update pom.xml to use h2o-core RELEASE Use 0.1.3 release Signed-off-by: Anand Avati --- h2o/pom.xml | 6 +++--- pom.xml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/h2o/pom.xml b/h2o/pom.xml index 13c59338f9..5e1b68ced8 100644 --- a/h2o/pom.xml +++ b/h2o/pom.xml @@ -38,10 +38,10 @@ - oss.sonatype.org-snapshot - http://oss.sonatype.org/content/repositories/snapshots + oss.sonatype.org-releases + http://oss.sonatype.org/content/repositories/releases - false + true true diff --git a/pom.xml b/pom.xml index a81a71a915..9b48f4cd50 100644 --- a/pom.xml +++ b/pom.xml @@ -110,7 +110,7 @@ 2.10 2.10.4 1.0.1 - 0.1.1-SNAPSHOT + 0.1.3 Jira From 0cea0507e870afa8f0c8078636b7566355d806fb Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Sat, 16 Aug 2014 19:41:22 -0700 Subject: [PATCH 30/34] MAHOUT-1500: address review comments from Andrew Palumbo Signed-off-by: Anand Avati --- h2o/pom.xml | 4 +- .../mahout/h2obindings/H2OBlockMatrix.java | 23 ++++--- .../apache/mahout/h2obindings/H2OHdfs.java | 68 ++++++++++++------- .../apache/mahout/h2obindings/H2OHelper.java | 53 ++++++++++----- .../mahout/h2obindings/drm/H2OBCast.java | 3 +- .../apache/mahout/h2obindings/ops/AewB.java | 12 ++-- .../mahout/h2obindings/ops/AewScalar.java | 12 ++-- .../apache/mahout/h2obindings/ops/Cbind.java | 14 ++-- .../apache/mahout/h2obindings/ops/Rbind.java | 11 +-- .../mahout/h2obindings/ops/RowRange.java | 15 ++-- .../h2obindings/ops/TimesRightMatrix.java | 5 +- .../test/LoggerConfiguration.scala | 6 +- 12 files changed, 142 insertions(+), 84 deletions(-) diff --git a/h2o/pom.xml b/h2o/pom.xml index 5e1b68ced8..1f6791aaeb 100644 --- a/h2o/pom.xml +++ b/h2o/pom.xml @@ -206,7 +206,7 @@ org.apache.mahout - mahout-math-scala + mahout-math-scala_2.10 ${project.version} @@ -219,7 +219,7 @@ org.apache.mahout - mahout-math-scala + 
mahout-math-scala_2.10 tests test diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OBlockMatrix.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OBlockMatrix.java index 10dd74a2ce..967e15a71c 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OBlockMatrix.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OBlockMatrix.java @@ -48,13 +48,15 @@ public H2OBlockMatrix(Chunk chks[]) { } private void cow() { - if (cow != null) + if (cow != null) { return; + } - if (_chks[0].isSparse()) + if (_chks[0].isSparse()) { cow = new SparseMatrix(_chks[0].len(), _chks.length); - else + } else { cow = new DenseMatrix(_chks[0].len(), _chks.length); + } for (int c = 0; c < _chks.length; c++) { for (int r = 0; r < _chks[0].len(); r++) { @@ -69,24 +71,27 @@ public void setQuick(int row, int col, double val) { } public Matrix like(int nrow, int ncol) { - if (_chks[0].isSparse()) + if (_chks[0].isSparse()) { return new SparseMatrix(nrow, ncol); - else + } else { return new DenseMatrix(nrow, ncol); + } } public Matrix like() { - if (_chks[0].isSparse()) + if (_chks[0].isSparse()) { return new SparseMatrix(rowSize(), columnSize()); - else + } else { return new DenseMatrix(rowSize(), columnSize()); + } } public double getQuick(int row, int col) { - if (cow != null) + if (cow != null) { return cow.getQuick(row, col); - else + } else { return _chks[col].at0(row); + } } public Matrix assignRow(int row, Vector v) { diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java index 76ad7120c7..d33cb4c2a0 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java @@ -60,10 +60,11 @@ public static boolean is_seqfile(String filename) { fin.read(seq); fin.close(); - if (seq[0] == 'S' && seq[1] == 'E' && seq[2] == 'Q') + if (seq[0] == 'S' && seq[1] == 'E' && seq[2] == 'Q') { return true; - else + } 
else { return false; + } } catch (java.io.IOException e) { return false; } @@ -71,10 +72,11 @@ public static boolean is_seqfile(String filename) { public static H2ODrm drm_from_file(String filename, int parMin) { try { - if (is_seqfile(filename)) + if (is_seqfile(filename)) { return drm_from_seqfile(filename, parMin); - else + } else { return new H2ODrm(FrameUtils.parseFrame(null,new File(filename))); + } } catch (java.io.IOException e) { return null; } @@ -112,31 +114,36 @@ public static H2ODrm drm_from_seqfile(String filename, int parMin) { long start = reader.getPosition(); - if (reader.getKeyClass() == Text.class) + if (reader.getKeyClass() == Text.class) { is_string_key = true; - else if (reader.getKeyClass() == LongWritable.class) + } else if (reader.getKeyClass() == LongWritable.class) { is_long_key = true; - else + } else { is_int_key = true; + } while (reader.next(key, value)) { if (cols == 0) { Vector v = value.get(); cols = Math.max(v.size(), cols); } - if (is_long_key) + if (is_long_key) { rows = Math.max(((LongWritable)(key)).get()+1, rows); - if (is_int_key) + } + if (is_int_key) { rows = Math.max(((IntWritable)(key)).get()+1, rows); - if (is_string_key) + } + if (is_string_key) { rows++; + } } reader.seek(start); frame = H2OHelper.empty_frame(rows, cols, parMin, -1); writers = new Vec.Writer[cols]; - for (int i = 0; i < writers.length; i++) + for (int i = 0; i < writers.length; i++) { writers[i] = frame.vecs()[i].open(); + } if (reader.getKeyClass() == Text.class) { labels = frame.anyVec().makeZero(); @@ -146,23 +153,30 @@ else if (reader.getKeyClass() == LongWritable.class) long r = 0; while (reader.next(key, value)) { Vector v = value.get(); - if (is_long_key) + if (is_long_key) { r = ((LongWritable)(key)).get(); - if (is_int_key) + } + if (is_int_key) { r = ((IntWritable)(key)).get(); - for (int c = 0; c < v.size(); c++) + } + for (int c = 0; c < v.size(); c++) { writers[c].set(r, v.getQuick(c)); - if (labels != null) + } + if (labels != null) { 
labelwriter.set(r, ((Text)key).toString()); - if (is_string_key) + } + if (is_string_key) { r++; + } } Futures fus = new Futures(); - for (Vec.Writer w : writers) + for (Vec.Writer w : writers) { w.close(fus); - if (labelwriter != null) + } + if (labelwriter != null) { labelwriter.close(fus); + } fus.blockForPending(); } catch (java.io.IOException e) { return null; @@ -183,25 +197,29 @@ public static void drm_to_file(String filename, H2ODrm Drm) throws java.io.IOExc boolean is_sparse = H2OHelper.is_sparse(frame); ValueString vstr = new ValueString(); - if (labels != null) + if (labels != null) { writer = SequenceFile.createWriter(fs, conf, path, Text.class, VectorWritable.class); - else + } else { writer = SequenceFile.createWriter(fs, conf, path, IntWritable.class, VectorWritable.class); + } for (long r = 0; r < frame.anyVec().length(); r++) { Vector v = null; - if (is_sparse) + if (is_sparse) { v = new SequentialAccessSparseVector(frame.numCols()); - else + } else { v = new DenseVector(frame.numCols()); + } - for (int c = 0; c < frame.numCols(); c++) + for (int c = 0; c < frame.numCols(); c++) { v.setQuick(c, frame.vecs()[c].at(r)); + } - if (labels != null) + if (labels != null) { writer.append(new Text(labels.atStr(vstr, r).toString()), new VectorWritable(v)); - else + } else { writer.append(new IntWritable((int)r), new VectorWritable(v)); + } } writer.close(); diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java index 9ecebed2f3..cdf97a0960 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java @@ -77,18 +77,20 @@ public static Matrix matrix_from_drm(H2ODrm Drm) { Vec labels = Drm.keys; Matrix m; - if (is_sparse(frame)) + if (is_sparse(frame)) { m = new SparseMatrix((int)frame.numRows(), frame.numCols()); - else + } else { m = new DenseMatrix((int)frame.numRows(), frame.numCols()); + } 
int c = 0; /* Fill matrix, column at a time */ for (Vec v : frame.vecs()) { for (int r = 0; r < frame.numRows(); r++) { double d = 0.0; - if (!v.isNA(r) && ((d = v.at(r)) != 0.0)) + if (!v.isNA(r) && ((d = v.at(r)) != 0.0)) { m.setQuick(r, c, d); + } } c++; } @@ -113,8 +115,9 @@ public static Matrix matrix_from_drm(H2ODrm Drm) { */ public static Vector colMeans(Frame frame) { double means[] = new double[frame.numCols()]; - for (int i = 0; i < frame.numCols(); i++) + for (int i = 0; i < frame.numCols(); i++) { means[i] = frame.vecs()[i].mean(); + } return new DenseVector(means); } @@ -181,8 +184,9 @@ public void map(Chunk chks[]) { for (int c = 0; c < chks.length; c++) { for (int r = 0; r < chks[c].len(); r++) { - if ((long)chks[c].at0(r) != 0) + if ((long)chks[c].at0(r) != 0) { _sums[c] ++; + } } } } @@ -195,8 +199,9 @@ public void reduce(MRTaskNonZero other) { /* Convert String->Integer map to Integer->String map */ private static Map reverse_map(Map map) { - if (map == null) + if (map == null) { return null; + } Map rmap = new HashMap(); @@ -211,22 +216,27 @@ private static int chunk_size(long nrow, int ncol, int min, int exact) { int chunk_sz; int parts_hint = Math.max(min, exact); - if (parts_hint < 1) + if (parts_hint < 1) { /* XXX: calculate based on cloud size and # of cpu */ parts_hint = 4; + } chunk_sz = (int)(((nrow - 1) / parts_hint) + 1); - if (exact > 0) + if (exact > 0) { return chunk_sz; + } - if (chunk_sz > 1e6) + if (chunk_sz > 1e6) { chunk_sz = (int)1e6; + } - if (min > 0) + if (min > 0) { return chunk_sz; + } - if (chunk_sz < 1e3) + if (chunk_sz < 1e3) { chunk_sz = (int)1e3; + } return chunk_sz; } @@ -242,15 +252,19 @@ public static H2ODrm drm_from_matrix(Matrix m, int min_hint, int exact_hint) { Futures closer = new Futures(); /* "open" vectors for writing efficiently in bulk */ - for (int i = 0; i < writers.length; i++) + for (int i = 0; i < writers.length; i++) { writers[i] = frame.vecs()[i].open(); + } - for (int r = 0; r < m.rowSize(); r++) 
- for (int c = 0; c < m.columnSize(); c++) + for (int r = 0; r < m.rowSize(); r++) { + for (int c = 0; c < m.columnSize(); c++) { writers[c].set(r, m.getQuick(r, c)); + } + } - for (int c = 0; c < m.columnSize(); c++) + for (int c = 0; c < m.columnSize(); c++) { writers[c].close(closer); + } /* If string labeled matrix, create aux Vec */ Map map = m.getRowLabelBindings(); @@ -260,8 +274,9 @@ public static H2ODrm drm_from_matrix(Matrix m, int min_hint, int exact_hint) { Vec.Writer writer = labels.open(); Map rmap = reverse_map(map); - for (long r = 0; r < m.rowSize(); r++) + for (long r = 0; r < m.rowSize(); r++) { writer.set(r, rmap.get(r)); + } writer.close(closer); } @@ -283,12 +298,14 @@ public static Frame empty_frame(long nrow, int ncol, int min_hint, int exact_hin long espc[] = new long[nchunks+1]; final Vec[] vecs = new Vec[ncol]; - for (int i = 0; i < nchunks; i++) + for (int i = 0; i < nchunks; i++) { espc[i] = i * chunk_sz; + } espc[nchunks] = nrow; - for (int i = 0; i < vecs.length; i++) + for (int i = 0; i < vecs.length; i++) { vecs[i] = Vec.makeCon(0, null, vg, espc); + } return new Frame(vecs); } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/drm/H2OBCast.java b/h2o/src/main/java/org/apache/mahout/h2obindings/drm/H2OBCast.java index 40fa13d286..7395027bbc 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/drm/H2OBCast.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/drm/H2OBCast.java @@ -54,8 +54,9 @@ public H2OBCast(T o) { } public T value() { - if (obj == null) + if (obj == null) { obj = deserialize(buf); + } return obj; } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java index a1b0f1e543..8d24bb2395 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java @@ -42,16 +42,18 @@ public static H2ODrm AewB(H2ODrm DrmA, H2ODrm DrmB, final String op) 
{ */ Frame AewB = new MRTask() { private double opfn(String op, double a, double b) { - if (a == 0.0 && b == 0.0) + if (a == 0.0 && b == 0.0) { return 0.0; - if (op.equals("+")) + } + if (op.equals("+")) { return a + b; - else if (op.equals("-")) + } else if (op.equals("-")) { return a - b; - else if (op.equals("*")) + } else if (op.equals("*")) { return a * b; - else if (op.equals("/")) + } else if (op.equals("/")) { return a / b; + } return 0.0; } public void map(Chunk chks[], NewChunk ncs[]) { diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java index f998009bb5..d0086fd22a 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java @@ -38,16 +38,18 @@ public static H2ODrm AewScalar(H2ODrm DrmA, final double s, final String op) { */ Frame AewScalar = new MRTask() { private double opfn(String op, double a, double b) { - if (a == 0.0 && b == 0.0) + if (a == 0.0 && b == 0.0) { return 0.0; - if (op.equals("+")) + } + if (op.equals("+")) { return a + b; - else if (op.equals("-")) + } else if (op.equals("-")) { return a - b; - else if (op.equals("*")) + } else if (op.equals("*")) { return a * b; - else if (op.equals("/")) + } else if (op.equals("/")) { return a / b; + } return 0.0; } public void map(Chunk chks[], NewChunk ncs[]) { diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Cbind.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Cbind.java index 60314625ec..52f8701015 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Cbind.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Cbind.java @@ -35,12 +35,13 @@ public static H2ODrm Cbind(H2ODrm DrmA, H2ODrm DrmB) { Vec keysb = DrmB.keys; /* If A and B are similarly partitioned, .. 
*/ - if (fra.anyVec().group() == frb.anyVec().group()) + if (fra.anyVec().group() == frb.anyVec().group()) { /* .. then, do a light weight zip() */ return zip(fra, keysa, frb, keysb); - else + } else { /* .. else, do a heavy weight join() which involves moving data over the wire */ return join(fra, keysa, frb, keysb); + } } /* Light weight zip(), no data movement */ @@ -49,11 +50,13 @@ private static H2ODrm zip(final Frame fra, final Vec keysa, final Frame frb, fin Vec vecs[] = new Vec[fra.vecs().length + frb.vecs().length]; int d = 0; /* fill A's column vectors */ - for (Vec vfra : fra.vecs()) + for (Vec vfra : fra.vecs()) { vecs[d++] = vfra; + } /* and B's */ - for (Vec vfrb : frb.vecs()) + for (Vec vfrb : frb.vecs()) { vecs[d++] = vfrb; + } /* and create a new Frame with the combined list of column Vecs */ Frame fr = new Frame(vecs); /* Finally, inherit A's string labels into the result */ @@ -66,9 +69,10 @@ private static H2ODrm join(final Frame fra, final Vec keysa, final Frame frb, fi /* The plan is to re-organize B to be "similarly partitioned as A", and then zip() */ Vec bvecs[] = new Vec[frb.vecs().length]; - for (int i = 0; i < bvecs.length; i++) + for (int i = 0; i < bvecs.length; i++) { /* First create column Vecs which are similarly partitioned as A */ bvecs[i] = fra.anyVec().makeZero(); + } /* Next run an MRTask on the new vectors, and fill each cell (initially 0) by pulling in appropriate values from B (frb) diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Rbind.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Rbind.java index a781a9346a..b3ac2ec1b7 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Rbind.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Rbind.java @@ -57,22 +57,25 @@ public void map(Chunk chks[], NewChunk nc) { for (int c = 0; c < chks.length; c++) { if (r + start < A_rows) { chks[c].set0(r, A_vecs[c].at(r + start)); - if (keysa != null) + if (keysa != null) { 
nc.addStr(keysa.atStr(vstr, r + start).toString()); + } } else { chks[c].set0(r, B_vecs[c].at(r + start - A_rows)); - if (keysb != null) + if (keysb != null) { nc.addStr(keysb.atStr(vstr, r + start - A_rows).toString()); + } } } } } }; - if (keysa == null) + if (keysa == null) { keys = task.doAll(1, frbind).outputFrame(null, null).anyVec(); - else + } else { task.doAll(frbind); + } return new H2ODrm(frbind, keys); } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java index 0a22581c7a..88bd398e7c 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java @@ -43,16 +43,19 @@ public void map(Chunk chks[], NewChunk ncs[]) { long chunk_start = chks[0].start(); /* First check if the entire chunk even overlaps with R */ - if (chunk_start > R.end() || (chunk_start + chunk_size) < R.start()) + if (chunk_start > R.end() || (chunk_start + chunk_size) < R.start()) { return; + } /* This chunk overlaps, filter out just the overlapping rows */ for (int r = 0; r < chunk_size; r++) { - if (!R.contains (chunk_start + r)) + if (!R.contains (chunk_start + r)) { continue; + } - for (int c = 0; c < chks.length; c++) + for (int c = 0; c < chks.length; c++) { ncs[c].addNum(chks[c].at0(r)); + } } } }.doAll(A.numCols(), A).outputFrame(null, null); @@ -66,12 +69,14 @@ public void map(Chunk chk, NewChunk nc) { long chunk_start = chk.start(); ValueString vstr = new ValueString(); - if (chunk_start > R.end() || (chunk_start + chunk_size) < R.start()) + if (chunk_start > R.end() || (chunk_start + chunk_size) < R.start()) { return; + } for (int r = 0; r < chunk_size; r++) { - if (!R.contains(chunk_start + r)) + if (!R.contains(chunk_start + r)) { continue; + } nc.addStr(chk.atStr0(vstr, r).toString()); } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java 
b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java index 9633ecc2be..68fe6af7d5 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java @@ -37,10 +37,11 @@ public static H2ODrm TimesRightMatrix(H2ODrm DrmA, Matrix B) { Vec keys = DrmA.keys; Frame AinCoreB = null; - if (B instanceof DiagonalMatrix) + if (B instanceof DiagonalMatrix) { AinCoreB = AinCoreB_diagonal(A, B.viewDiagonal()); - else + } else { AinCoreB = AinCoreB_common(A, B); + } return new H2ODrm(AinCoreB, keys); } diff --git a/h2o/src/test/scala/org/apache/mahout/h2obindings/test/LoggerConfiguration.scala b/h2o/src/test/scala/org/apache/mahout/h2obindings/test/LoggerConfiguration.scala index 6444ece61b..b0cfd30bca 100644 --- a/h2o/src/test/scala/org/apache/mahout/h2obindings/test/LoggerConfiguration.scala +++ b/h2o/src/test/scala/org/apache/mahout/h2obindings/test/LoggerConfiguration.scala @@ -1,13 +1,13 @@ package org.apache.mahout.h2obindings.test -import org.scalatest.Suite +import org.scalatest.{Suite, ConfigMap} import org.apache.log4j.{Level, Logger, BasicConfigurator} trait LoggerConfiguration extends org.apache.mahout.test.LoggerConfiguration { this: Suite => - override protected def beforeAll(): Unit = { - super.beforeAll() + override protected def beforeAll(configMap: ConfigMap): Unit = { + super.beforeAll(configMap) Logger.getLogger("org.apache.mahout.h2obindings").setLevel(Level.DEBUG) } } From 8d0dd47771c56fcc85c7b00f8ff5f860b2801df8 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Mon, 25 Aug 2014 15:24:49 -0700 Subject: [PATCH 31/34] MAHOUT-1500: rebase to h2o-core 0.1.5 (Java 6) h2o-core 0.1.5 onwards is Java 6 compatible. 
Signed-off-by: Anand Avati --- .../main/java/org/apache/mahout/h2obindings/ops/Par.java | 2 +- .../main/java/org/apache/mahout/h2obindings/ops/Rbind.java | 4 ++-- .../java/org/apache/mahout/h2obindings/ops/RowRange.java | 2 +- .../org/apache/mahout/h2obindings/ops/MapBlockHelper.scala | 7 +++++-- pom.xml | 2 +- 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java index 2c01ccd653..ac4efc0f44 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java @@ -56,7 +56,7 @@ public void map(Chunk chks[], NewChunk nc) { for (int c = 0; c < chks.length; c++) { chks[c].set0(r, vins[c].at(start + r)); } - nc.addStr(vin.atStr(vstr, start + r).toString()); + nc.addStr(vin.atStr(vstr, start + r)); } } }.doAll(1, frout).outputFrame(null, null).anyVec(); diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Rbind.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Rbind.java index b3ac2ec1b7..9c10b7f475 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Rbind.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Rbind.java @@ -58,12 +58,12 @@ public void map(Chunk chks[], NewChunk nc) { if (r + start < A_rows) { chks[c].set0(r, A_vecs[c].at(r + start)); if (keysa != null) { - nc.addStr(keysa.atStr(vstr, r + start).toString()); + nc.addStr(keysa.atStr(vstr, r + start)); } } else { chks[c].set0(r, B_vecs[c].at(r + start - A_rows)); if (keysb != null) { - nc.addStr(keysb.atStr(vstr, r + start - A_rows).toString()); + nc.addStr(keysb.atStr(vstr, r + start - A_rows)); } } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java index 88bd398e7c..c9e82f97d7 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java +++ 
b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java @@ -78,7 +78,7 @@ public void map(Chunk chk, NewChunk nc) { continue; } - nc.addStr(chk.atStr0(vstr, r).toString()); + nc.addStr(chk.atStr0(vstr, r)); } } }.doAll(1, keys).outputFrame(null, null).anyVec(); diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/ops/MapBlockHelper.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/ops/MapBlockHelper.scala index 19bb3f4050..b46d13c3f1 100644 --- a/h2o/src/main/scala/org/apache/mahout/h2obindings/ops/MapBlockHelper.scala +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/ops/MapBlockHelper.scala @@ -48,8 +48,11 @@ object MapBlockHelper { val out = _bmf((inarray.asInstanceOf[Array[K]], in)) implicitly[ClassTag[R]] match { - case `s` => for (str <- out._1) { - nclabel.addStr(str.asInstanceOf[String]) + case `s` => { + val vstr = new ValueString + for (str <- out._1) { + nclabel.addStr(vstr.setTo(str.asInstanceOf[String])) + } } case _ => Unit } diff --git a/pom.xml b/pom.xml index 9b48f4cd50..89ed1a7f06 100644 --- a/pom.xml +++ b/pom.xml @@ -110,7 +110,7 @@ 2.10 2.10.4 1.0.1 - 0.1.3 + 0.1.5 Jira From bacd8b9059c9aed26254677359d6c53ea54689db Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Tue, 26 Aug 2014 12:10:57 -0700 Subject: [PATCH 32/34] MAHOUT-1500: re-applied review comments .. which were lost in the noise! 
Signed-off-by: Anand Avati --- .../mahout/h2obindings/H2OBlockMatrix.java | 2 +- .../apache/mahout/h2obindings/H2OHelper.java | 38 +++++++++---------- .../org/apache/mahout/h2obindings/ops/At.java | 2 +- .../apache/mahout/h2obindings/ops/AtA.java | 2 +- .../apache/mahout/h2obindings/ops/AtB.java | 2 +- .../apache/mahout/h2obindings/ops/Atx.java | 16 ++++---- .../mahout/h2obindings/ops/MapBlock.java | 18 ++++----- .../mahout/h2obindings/ops/RowRange.java | 2 +- .../h2obindings/ops/MapBlockHelper.scala | 2 +- .../h2obindings/drm/DrmLikeOpsSuite.scala | 3 +- .../mahout/h2obindings/ops/ABtSuite.scala | 3 -- .../mahout/h2obindings/ops/AewBSuite.scala | 6 --- .../mahout/h2obindings/ops/AtASuite.scala | 5 --- .../mahout/h2obindings/ops/AtSuite.scala | 3 -- .../h2obindings/test/MahoutLocalContext.scala | 29 -------------- .../DistributedDecompositionsSuite.scala | 5 +-- 16 files changed, 44 insertions(+), 94 deletions(-) delete mode 100644 h2o/src/test/scala/org/apache/mahout/h2obindings/test/MahoutLocalContext.scala diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OBlockMatrix.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OBlockMatrix.java index 967e15a71c..27c1d586eb 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OBlockMatrix.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OBlockMatrix.java @@ -67,7 +67,7 @@ private void cow() { public void setQuick(int row, int col, double val) { cow(); - cow.setQuick (row, col, val); + cow.setQuick(row, col, val); } public Matrix like(int nrow, int ncol) { diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java index cdf97a0960..ea1420b574 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java @@ -51,18 +51,18 @@ public static boolean is_sparse(Frame frame) { /* MRTask to aggregate precalculated per-chunk 
sparse lengths */ class MRTaskNZ extends MRTask { - long _sparselen; + long sparselen; public void map(Chunk chks[]) { for (Chunk chk : chks) { - _sparselen += chk.sparseLen(); + sparselen += chk.sparseLen(); } } public void reduce(MRTaskNZ other) { - _sparselen += other._sparselen; + sparselen += other.sparselen; } } - long sparselen = new MRTaskNZ().doAll(frame)._sparselen; + long sparselen = new MRTaskNZ().doAll(frame).sparselen; return (((rows * cols) / (sparselen + 1)) > 32); } @@ -130,21 +130,21 @@ public static Vector colMeans(Frame frame) { */ public static Vector colSums(Frame frame) { class MRTaskSum extends MRTask { - public double _sums[]; + public double sums[]; public void map(Chunk chks[]) { - _sums = new double[chks.length]; + sums = new double[chks.length]; for (int c = 0; c < chks.length; c++) { for (int r = 0; r < chks[c].len(); r++) { - _sums[c] += chks[c].at0(r); + sums[c] += chks[c].at0(r); } } } public void reduce(MRTaskSum other) { - ArrayUtils.add(_sums, other._sums); + ArrayUtils.add(sums, other.sums); } } - return new DenseVector(new MRTaskSum().doAll(frame)._sums); + return new DenseVector(new MRTaskSum().doAll(frame).sums); } @@ -154,19 +154,19 @@ public void reduce(MRTaskSum other) { */ public static double sumSqr(Frame frame) { class MRTaskSumSqr extends MRTask { - public double _sumSqr; + public double sumSqr; public void map(Chunk chks[]) { for (int c = 0; c < chks.length; c++) { for (int r = 0; r < chks[c].len(); r++) { - _sumSqr += (chks[c].at0(r) * chks[c].at0(r)); + sumSqr += (chks[c].at0(r) * chks[c].at0(r)); } } } public void reduce(MRTaskSumSqr other) { - _sumSqr += other._sumSqr; + sumSqr += other.sumSqr; } } - return new MRTaskSumSqr().doAll(frame)._sumSqr; + return new MRTaskSumSqr().doAll(frame).sumSqr; } /* Calculate Sum of all elements in a column, and @@ -178,23 +178,23 @@ public void reduce(MRTaskSumSqr other) { */ public static Vector nonZeroCnt(Frame frame) { class MRTaskNonZero extends MRTask { - public double 
_sums[]; + public double sums[]; public void map(Chunk chks[]) { - _sums = new double[chks.length]; + sums = new double[chks.length]; for (int c = 0; c < chks.length; c++) { for (int r = 0; r < chks[c].len(); r++) { if ((long)chks[c].at0(r) != 0) { - _sums[c] ++; + sums[c] ++; } } } } public void reduce(MRTaskNonZero other) { - ArrayUtils.add(_sums, other._sums); + ArrayUtils.add(sums, other.sums); } } - return new DenseVector(new MRTaskNonZero().doAll(frame)._sums); + return new DenseVector(new MRTaskNonZero().doAll(frame).sums); } /* Convert String->Integer map to Integer->String map */ @@ -295,7 +295,7 @@ public static Frame empty_frame(long nrow, int ncol, int min_hint, int exact_hin public static Frame empty_frame(long nrow, int ncol, int min_hint, int exact_hint, Vec.VectorGroup vg) { int chunk_sz = chunk_size(nrow, ncol, min_hint, exact_hint); int nchunks = (int)((nrow - 1) / chunk_sz) + 1; /* Final number of Chunks per Vec */ - long espc[] = new long[nchunks+1]; + long espc[] = new long[nchunks + 1]; final Vec[] vecs = new Vec[ncol]; for (int i = 0; i < nchunks; i++) { diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java index 3458865c31..4ea6fa41ee 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java @@ -46,7 +46,7 @@ public void map(Chunk chks[]) { for (int c = 0; c < chks.length; c++) { for (int r = 0; r < chunk_size; r++) { - chks[c].set0(r, A_vecs[(int)(start+r)].at(c)); + chks[c].set0(r, A_vecs[(int)(start + r)].at(c)); } } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java index ca12bf810d..4984172684 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java @@ -49,7 +49,7 @@ public void map(Chunk chks[]) { for (int r 
= 0; r < chunk_size; r++) { double v = 0; for (long i = 0; i < A_rows; i++) { - v += (A_vecs[(int)(start+r)].at(i) * A_vecs[c].at(i)); + v += (A_vecs[(int)(start + r)].at(i) * A_vecs[c].at(i)); } chks[c].set0(r, v); } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java index 68f6d7ded8..619214355c 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java @@ -52,7 +52,7 @@ public void map(Chunk chks[]) { for (int r = 0; r < chunk_size; r++) { double v = 0; for (long i = 0; i < A_rows; i++) { - v += (A_vecs[(int)(start+r)].at(i) * B_vecs[c].at(i)); + v += (A_vecs[(int)(start + r)].at(i) * B_vecs[c].at(i)); } chks[c].set0(r, v); } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java index c0eb739fc1..466c69d2ef 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java @@ -38,37 +38,37 @@ public static H2ODrm Atx(H2ODrm DrmA, Vector x) { Frame A = DrmA.frame; final H2OBCast bx = new H2OBCast(x); - /* A'x is computed into _atx[] with an MRTask on A (with + /* A'x is computed into atx[] with an MRTask on A (with x available as a Broadcast x.size() == A.numRows() - _atx.length == chks.length == A.numCols() + atx.length == chks.length == A.numCols() */ class MRTaskAtx extends MRTask { - double _atx[]; + double atx[]; public void map(Chunk chks[]) { int chunk_size = chks[0].len(); Vector x = bx.value(); long start = chks[0].start(); - _atx = new double[chks.length]; + atx = new double[chks.length]; for (int r = 0; r < chunk_size; r++) { double d = x.getQuick((int)start + r); for (int c = 0; c < chks.length; c++) { - _atx[c] += (chks[c].at0(r) * d); + atx[c] += (chks[c].at0(r) * d); } } } public void reduce(MRTaskAtx other) { - 
ArrayUtils.add(_atx, other._atx); + ArrayUtils.add(atx, other.atx); } } - /* Take the result in ._atx[], and convert into a Frame + /* Take the result in .atx[], and convert into a Frame using existing helper functions (creating a Matrix along the way for the Helper) */ - Vector v = new DenseVector(new MRTaskAtx().doAll(A)._atx); + Vector v = new DenseVector(new MRTaskAtx().doAll(A).atx); Matrix m = new DenseMatrix(A.numCols(), 1); m.assignColumn(0, v); return H2OHelper.drm_from_matrix(m, -1, -1); diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java index 5b52c9d7de..1df7f16225 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java @@ -39,9 +39,9 @@ public static H2ODrm exec(H2ODrm DrmA, int ncol, Object bmf, final boolean Vec keys = DrmA.keys; class MRTaskBMF extends MRTask { - Serializable _bmf; - Vec _labels; - MRTaskBMF(Object bmf, Vec labels) { + Serializable bmf; + Vec labels; + MRTaskBMF(Object _bmf, Vec _labels) { /* BlockMapFun does not implement Serializable, but Scala closures are _always_ Serializable. @@ -50,15 +50,15 @@ So receive the object as a plain Object (else that Scala always tags the actually generated closure functions with Serializable. */ - _bmf = (Serializable)bmf; - _labels = labels; + bmf = (Serializable)_bmf; + labels = _labels; } - private Matrix blockify (Chunk chks[]) { + private Matrix blockify(Chunk chks[]) { return new H2OBlockMatrix(chks); } - private void deblockify (Matrix out, NewChunk ncs[]) { + private void deblockify(Matrix out, NewChunk ncs[]) { // assert (out.colSize() == ncs.length) for (int c = 0; c < out.columnSize(); c++) { for (int r = 0; r < out.rowSize(); r++) { @@ -83,8 +83,8 @@ of bmf() output's _2 in deblockify() */ public void map(Chunk chks[], NewChunk ncs[]) { long start = chks[0].start(); - NewChunk nclabel = is_r_str ? 
ncs[ncs.length-1] : null; - deblockify(MapBlockHelper.exec(_bmf, blockify(chks), start, _labels, nclabel, k, r), ncs); + NewChunk nclabel = is_r_str ? ncs[ncs.length - 1] : null; + deblockify(MapBlockHelper.exec(bmf, blockify(chks), start, labels, nclabel, k, r), ncs); // assert chks[i]._len == ncs[j]._len } } diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java index c9e82f97d7..7b89162756 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java @@ -49,7 +49,7 @@ public void map(Chunk chks[], NewChunk ncs[]) { /* This chunk overlaps, filter out just the overlapping rows */ for (int r = 0; r < chunk_size; r++) { - if (!R.contains (chunk_start + r)) { + if (!R.contains(chunk_start + r)) { continue; } diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/ops/MapBlockHelper.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/ops/MapBlockHelper.scala index b46d13c3f1..038482607f 100644 --- a/h2o/src/main/scala/org/apache/mahout/h2obindings/ops/MapBlockHelper.scala +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/ops/MapBlockHelper.scala @@ -38,7 +38,7 @@ object MapBlockHelper { val arr = new Array[String](in.rowSize) val vstr = new ValueString for (i <- 0 to in.rowSize) { - arr(i) = labels.atStr(vstr, i+startlong).toString + arr(i) = labels.atStr(vstr, i + startlong).toString } arr } diff --git a/h2o/src/test/scala/org/apache/mahout/h2obindings/drm/DrmLikeOpsSuite.scala b/h2o/src/test/scala/org/apache/mahout/h2obindings/drm/DrmLikeOpsSuite.scala index 6bfb13fc97..c6deb35ae8 100644 --- a/h2o/src/test/scala/org/apache/mahout/h2obindings/drm/DrmLikeOpsSuite.scala +++ b/h2o/src/test/scala/org/apache/mahout/h2obindings/drm/DrmLikeOpsSuite.scala @@ -27,5 +27,4 @@ import org.scalatest.FunSuite import org.apache.mahout.h2obindings.test.DistributedH2OSuite /** Tests for 
DrmLikeOps */ -class DrmLikeOpsSuite extends FunSuite with DistributedH2OSuite with DrmLikeOpsSuiteBase { -} +class DrmLikeOpsSuite extends FunSuite with DistributedH2OSuite with DrmLikeOpsSuiteBase diff --git a/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/ABtSuite.scala b/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/ABtSuite.scala index 6395233923..b0d2ad7713 100644 --- a/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/ABtSuite.scala +++ b/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/ABtSuite.scala @@ -29,7 +29,6 @@ import org.apache.mahout.math.drm._ /** Tests for AB' operator algorithms */ class ABtSuite extends FunSuite with DistributedH2OSuite { - test("ABt") { val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5)) val inCoreB = dense((3, 4, 5), (5, 6, 7)) @@ -44,7 +43,5 @@ class ABtSuite extends FunSuite with DistributedH2OSuite { assert((inCoreM - inCoreMControl).norm < 1E-5) println(inCoreM) - } - } diff --git a/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AewBSuite.scala b/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AewBSuite.scala index 98a3345ad1..2f2133f5c1 100644 --- a/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AewBSuite.scala +++ b/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AewBSuite.scala @@ -27,7 +27,6 @@ import org.apache.mahout.math.drm.logical._ /** Elementwise matrix operation tests */ class AewBSuite extends FunSuite with DistributedH2OSuite { - test("A * B Hadamard") { val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5), (7, 8, 9)) val inCoreB = dense((3, 4, 5), (5, 6, 7), (0, 0, 0), (9, 8, 7)) @@ -40,7 +39,6 @@ class AewBSuite extends FunSuite with DistributedH2OSuite { val inCoreMControl = inCoreA * inCoreB assert((inCoreM - inCoreMControl).norm < 1E-10) - } test("A + B Elementwise") { @@ -55,7 +53,6 @@ class AewBSuite extends FunSuite with DistributedH2OSuite { val inCoreMControl = inCoreA + inCoreB assert((inCoreM - inCoreMControl).norm < 1E-10) - } test("A - B Elementwise") { 
@@ -70,7 +67,6 @@ class AewBSuite extends FunSuite with DistributedH2OSuite { val inCoreMControl = inCoreA - inCoreB assert((inCoreM - inCoreMControl).norm < 1E-10) - } test("A / B Elementwise") { @@ -85,7 +81,5 @@ class AewBSuite extends FunSuite with DistributedH2OSuite { val inCoreMControl = inCoreA / inCoreB assert((inCoreM - inCoreMControl).norm < 1E-10) - } - } diff --git a/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AtASuite.scala b/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AtASuite.scala index bdb2fe09c1..3dc3fcc6a1 100644 --- a/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AtASuite.scala +++ b/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AtASuite.scala @@ -29,9 +29,7 @@ import org.apache.mahout.math.drm._ /** Tests for {@link XtX} */ class AtASuite extends FunSuite with DistributedH2OSuite { - test("AtA slim") { - val inCoreA = dense((1, 2), (2, 3)) val drmA = drmParallelize(inCoreA) @@ -43,8 +41,5 @@ class AtASuite extends FunSuite with DistributedH2OSuite { println(expectedAtA) assert(expectedAtA === inCoreAtA) - } - - } diff --git a/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AtSuite.scala b/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AtSuite.scala index f04f32f68b..c521388568 100644 --- a/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AtSuite.scala +++ b/h2o/src/test/scala/org/apache/mahout/h2obindings/ops/AtSuite.scala @@ -29,7 +29,6 @@ import org.apache.mahout.math.drm._ /** Tests for A' algorithms */ class AtSuite extends FunSuite with DistributedH2OSuite { - test("At") { val inCoreA = dense((1, 2, 3), (2, 3, 4), (3, 4, 5)) val A = drmParallelize(m = inCoreA, numPartitions = 2) @@ -40,7 +39,5 @@ class AtSuite extends FunSuite with DistributedH2OSuite { println(inCoreAt) assert((inCoreAt - inCoreControlAt).norm < 1E-5) - - } } diff --git a/h2o/src/test/scala/org/apache/mahout/h2obindings/test/MahoutLocalContext.scala 
b/h2o/src/test/scala/org/apache/mahout/h2obindings/test/MahoutLocalContext.scala deleted file mode 100644 index 21735ab87e..0000000000 --- a/h2o/src/test/scala/org/apache/mahout/h2obindings/test/MahoutLocalContext.scala +++ /dev/null @@ -1,29 +0,0 @@ -package org.apache.mahout.h2obindings.test - -import org.scalatest.Suite -import org.apache.mahout.h2obindings._ -import org.apache.mahout.test.MahoutSuite -import org.apache.mahout.math.drm.DistributedContext - -trait MahoutLocalContext extends MahoutSuite with LoggerConfiguration { - this: Suite => - - protected implicit var mahoutCtx: DistributedContext = _ - - override protected def beforeEach() { - super.beforeEach() - - mahoutCtx = mahoutH2OContext("mah2out") - } - - override protected def afterEach() { - if (mahoutCtx != null) { - try { - mahoutCtx.close() - } finally { - mahoutCtx = null - } - } - super.afterEach() - } -} diff --git a/h2o/src/test/scala/org/apache/mahout/math/decompositions/DistributedDecompositionsSuite.scala b/h2o/src/test/scala/org/apache/mahout/math/decompositions/DistributedDecompositionsSuite.scala index 71f3afd36c..00cf0ba6ff 100644 --- a/h2o/src/test/scala/org/apache/mahout/math/decompositions/DistributedDecompositionsSuite.scala +++ b/h2o/src/test/scala/org/apache/mahout/math/decompositions/DistributedDecompositionsSuite.scala @@ -28,7 +28,4 @@ import scala.math._ import org.scalatest.{Matchers, FunSuite} import org.apache.mahout.h2obindings.test.DistributedH2OSuite -class DistributedDecompositionsSuite extends FunSuite with DistributedH2OSuite with DistributedDecompositionsSuiteBase { - - -} +class DistributedDecompositionsSuite extends FunSuite with DistributedH2OSuite with DistributedDecompositionsSuiteBase From 3272383cf80b00c091b61d54ce3191dbe5cec9a5 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Tue, 26 Aug 2014 13:44:36 -0700 Subject: [PATCH 33/34] MAHOUT-1500: fix camelCase styling of variable names Signed-off-by: Anand Avati --- 
.../java/org/apache/mahout/h2obindings/H2OHelper.java | 6 +++--- .../java/org/apache/mahout/h2obindings/ops/ABt.java | 8 ++++---- .../java/org/apache/mahout/h2obindings/ops/AewB.java | 8 ++++---- .../org/apache/mahout/h2obindings/ops/AewScalar.java | 6 +++--- .../java/org/apache/mahout/h2obindings/ops/At.java | 4 ++-- .../java/org/apache/mahout/h2obindings/ops/AtA.java | 4 ++-- .../java/org/apache/mahout/h2obindings/ops/AtB.java | 6 +++--- .../java/org/apache/mahout/h2obindings/ops/Atx.java | 4 ++-- .../java/org/apache/mahout/h2obindings/ops/Ax.java | 6 +++--- .../java/org/apache/mahout/h2obindings/ops/Cbind.java | 10 +++++----- .../org/apache/mahout/h2obindings/ops/MapBlock.java | 6 +++--- .../java/org/apache/mahout/h2obindings/ops/Par.java | 6 +++--- .../java/org/apache/mahout/h2obindings/ops/Rbind.java | 10 +++++----- .../org/apache/mahout/h2obindings/ops/RowRange.java | 6 +++--- .../mahout/h2obindings/ops/TimesRightMatrix.java | 6 +++--- 15 files changed, 48 insertions(+), 48 deletions(-) diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java index ea1420b574..1b817fc2ff 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java @@ -72,9 +72,9 @@ public void reduce(MRTaskNZ other) { Dense Matrix depending on number of missing elements in Frame. 
*/ - public static Matrix matrix_from_drm(H2ODrm Drm) { - Frame frame = Drm.frame; - Vec labels = Drm.keys; + public static Matrix matrix_from_drm(H2ODrm drm) { + Frame frame = drm.frame; + Vec labels = drm.keys; Matrix m; if (is_sparse(frame)) { diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java index 59227a752b..d05013f671 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/ABt.java @@ -28,10 +28,10 @@ public class ABt { /* Calculate AB' */ - public static H2ODrm ABt(H2ODrm DrmA, H2ODrm DrmB) { - Frame A = DrmA.frame; - Vec keys = DrmA.keys; - final Frame B = DrmB.frame; + public static H2ODrm ABt(H2ODrm drmA, H2ODrm drmB) { + Frame A = drmA.frame; + Vec keys = drmA.keys; + final Frame B = drmB.frame; int ABt_cols = (int)B.numRows(); /* ABt is written into ncs[] with an MRTask on A, and therefore will diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java index 8d24bb2395..2c590bd6bd 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewB.java @@ -28,10 +28,10 @@ public class AewB { /* Element-wise DRM-DRM operations */ - public static H2ODrm AewB(H2ODrm DrmA, H2ODrm DrmB, final String op) { - final Frame A = DrmA.frame; - final Frame B = DrmB.frame; - Vec keys = DrmA.keys; + public static H2ODrm AewB(H2ODrm drmA, H2ODrm drmB, final String op) { + final Frame A = drmA.frame; + final Frame B = drmB.frame; + Vec keys = drmA.keys; int AewB_cols = A.numCols(); /* AewB is written into ncs[] with an MRTask on A, and therefore will diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java index d0086fd22a..9b6387d8c5 100644 --- 
a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AewScalar.java @@ -28,9 +28,9 @@ public class AewScalar { /* Element-wise DRM-DRM operations */ - public static H2ODrm AewScalar(H2ODrm DrmA, final double s, final String op) { - Frame A = DrmA.frame; - Vec keys = DrmA.keys; + public static H2ODrm AewScalar(H2ODrm drmA, final double s, final String op) { + Frame A = drmA.frame; + Vec keys = drmA.keys; int AewScalar_cols = A.numCols(); /* AewScalar is written into ncs[] with an MRTask on A, and therefore will diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java index 4ea6fa41ee..a6698f0bc8 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/At.java @@ -28,8 +28,8 @@ public class At { /* Calculate A' (transpose) */ - public static H2ODrm At(H2ODrm DrmA) { - final Frame A = DrmA.frame; + public static H2ODrm At(H2ODrm drmA) { + final Frame A = drmA.frame; /* First create a new frame of the required dimensions, A.numCols() rows and A.numRows() columns. 
*/ diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java index 4984172684..58f21bf3df 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtA.java @@ -28,8 +28,8 @@ public class AtA { /* Calculate A'A */ - public static H2ODrm AtA(H2ODrm DrmA) { - final Frame A = DrmA.frame; + public static H2ODrm AtA(H2ODrm drmA) { + final Frame A = drmA.frame; /* First create an empty Frame of the required dimensions */ Frame AtA = H2OHelper.empty_frame(A.numCols(), A.numCols(), -1, -1); diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java index 619214355c..3e8cf08697 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/AtB.java @@ -28,9 +28,9 @@ public class AtB { /* Calculate A'B */ - public static H2ODrm AtB(H2ODrm DrmA, H2ODrm DrmB) { - final Frame A = DrmA.frame; - final Frame B = DrmB.frame; + public static H2ODrm AtB(H2ODrm drmA, H2ODrm drmB) { + final Frame A = drmA.frame; + final Frame B = drmB.frame; /* First create an empty frame of the required dimensions */ Frame AtB = H2OHelper.empty_frame(A.numCols(), B.numCols(), -1, -1); diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java index 466c69d2ef..88465e91e3 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Atx.java @@ -34,8 +34,8 @@ public class Atx { /* Calculate A'x (where x is an in-core Vector) */ - public static H2ODrm Atx(H2ODrm DrmA, Vector x) { - Frame A = DrmA.frame; + public static H2ODrm Atx(H2ODrm drmA, Vector x) { + Frame A = drmA.frame; final H2OBCast bx = new H2OBCast(x); /* A'x is computed into atx[] with an MRTask on A 
(with diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java index e292e9994c..2d9c0d086f 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Ax.java @@ -30,9 +30,9 @@ public class Ax { /* Calculate Ax (where x is an in-core Vector) */ - public static H2ODrm Ax(H2ODrm DrmA, Vector x) { - Frame A = DrmA.frame; - Vec keys = DrmA.keys; + public static H2ODrm Ax(H2ODrm drmA, Vector x) { + Frame A = drmA.frame; + Vec keys = drmA.keys; final H2OBCast bx = new H2OBCast(x); /* Ax is written into nc (single element, not array) with an MRTask on A, diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Cbind.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Cbind.java index 52f8701015..3871cabf74 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Cbind.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Cbind.java @@ -28,11 +28,11 @@ public class Cbind { /* R's cbind like operator, on DrmA and DrmB */ - public static H2ODrm Cbind(H2ODrm DrmA, H2ODrm DrmB) { - Frame fra = DrmA.frame; - Vec keysa = DrmA.keys; - Frame frb = DrmB.frame; - Vec keysb = DrmB.keys; + public static H2ODrm Cbind(H2ODrm drmA, H2ODrm drmB) { + Frame fra = drmA.frame; + Vec keysa = drmA.keys; + Frame frb = drmB.frame; + Vec keysb = drmB.keys; /* If A and B are similarly partitioned, .. 
*/ if (fra.anyVec().group() == frb.anyVec().group()) { diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java index 1df7f16225..0f901e4abe 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/MapBlock.java @@ -33,10 +33,10 @@ import scala.reflect.ClassTag; public class MapBlock { - public static H2ODrm exec(H2ODrm DrmA, int ncol, Object bmf, final boolean is_r_str, + public static H2ODrm exec(H2ODrm drmA, int ncol, Object bmf, final boolean is_r_str, final ClassTag k, final ClassTag r) { - Frame A = DrmA.frame; - Vec keys = DrmA.keys; + Frame A = drmA.frame; + Vec keys = drmA.keys; class MRTaskBMF extends MRTask { Serializable bmf; diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java index ac4efc0f44..27d6733632 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Par.java @@ -28,9 +28,9 @@ import org.apache.mahout.h2obindings.drm.H2ODrm; public class Par { - public static H2ODrm exec(H2ODrm DrmA, int min, int exact) { - final Frame frin = DrmA.frame; - final Vec vin = DrmA.keys; + public static H2ODrm exec(H2ODrm drmA, int min, int exact) { + final Frame frin = drmA.frame; + final Vec vin = drmA.keys; /* First create a new empty Frame with the required partitioning */ Frame frout = H2OHelper.empty_frame(frin.numRows(), frin.numCols(), min, exact); diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Rbind.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Rbind.java index 9c10b7f475..ed4b0fb763 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Rbind.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/Rbind.java @@ -29,11 +29,11 @@ public class Rbind { /* R's rbind like operator, on DrmA 
and DrmB */ - public static H2ODrm Rbind(H2ODrm DrmA, H2ODrm DrmB) { - final Frame fra = DrmA.frame; - final Vec keysa = DrmA.keys; - final Frame frb = DrmB.frame; - final Vec keysb = DrmB.keys; + public static H2ODrm Rbind(H2ODrm drmA, H2ODrm drmB) { + final Frame fra = drmA.frame; + final Vec keysa = drmA.keys; + final Frame frb = drmB.frame; + final Vec keysb = drmB.keys; /* Create new frame and copy A's data at the top, and B's data below. Create the frame in the same VectorGroup as A, so A's data does not diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java index 7b89162756..5ce7732c17 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/RowRange.java @@ -30,9 +30,9 @@ public class RowRange { /* Filter operation */ - public static H2ODrm RowRange(H2ODrm DrmA, final Range R) { - Frame A = DrmA.frame; - Vec keys = DrmA.keys; + public static H2ODrm RowRange(H2ODrm drmA, final Range R) { + Frame A = drmA.frame; + Vec keys = drmA.keys; /* Run a filtering MRTask on A. 
If row number falls within R.start() and R.end(), then the row makes it into the output diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java index 68fe6af7d5..364f0395f1 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/ops/TimesRightMatrix.java @@ -32,9 +32,9 @@ public class TimesRightMatrix { /* Multiple with in-core Matrix */ - public static H2ODrm TimesRightMatrix(H2ODrm DrmA, Matrix B) { - Frame A = DrmA.frame; - Vec keys = DrmA.keys; + public static H2ODrm TimesRightMatrix(H2ODrm drmA, Matrix B) { + Frame A = drmA.frame; + Vec keys = drmA.keys; Frame AinCoreB = null; if (B instanceof DiagonalMatrix) { From 9ff6666c82c154e796c066015ac4097a166f47d9 Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Tue, 26 Aug 2014 13:54:05 -0700 Subject: [PATCH 34/34] MAHOUT-1500: add warning in README about firewall for multi-node Signed-off-by: Anand Avati --- h2o/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/h2o/README.md b/h2o/README.md index 9f647b3f7e..0aa5ebd982 100644 --- a/h2o/README.md +++ b/h2o/README.md @@ -38,6 +38,8 @@ H2O is fundamentally a peer-to-peer system. H2O nodes join together to form a cl The Mahout H2O integration is fit into this model by having N-1 "worker" nodes and one driver node, all belonging to the same cloud name. The default cloud name used for the integration is "mah2out". Clouds have to be spun up per task/job. +**WARNING**: Some Linux systems have default firewall rules which might block traffic required for the following tests. In order to successfully run the tests you might need to temporarily turn off firewall rules with `sh# iptables -F` + First bring up worker nodes: host-1:~/mahout$ ./bin/mahout h2o-node