From 92a488d2c5b8ec236014e8208ced852b0d603f15 Mon Sep 17 00:00:00 2001 From: liyuance Date: Sun, 17 Jul 2016 23:23:51 +0800 Subject: [PATCH 1/5] add Img2Vector tools --- .../dmlc/mxnet/spark/utils/Img2Vector.scala | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/utils/Img2Vector.scala diff --git a/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/utils/Img2Vector.scala b/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/utils/Img2Vector.scala new file mode 100644 index 000000000000..1e8fb12a7082 --- /dev/null +++ b/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/utils/Img2Vector.scala @@ -0,0 +1,65 @@ +package ml.dmlc.mxnet.spark.utils + +import javax.imageio.ImageIO +import java.awt.image.BufferedImage + +import scala.collection.mutable.ArrayBuffer + +import org.apache.spark.SparkContext +import org.apache.spark.rdd.RDD +import org.apache.spark.SparkConf +import org.apache.spark.input._ +import org.apache.spark.mllib.linalg.{Vector, Vectors} + +/** + * Convert image directory into Vectorized RDD + * @author Yuance.Li + */ +object Img2Vector{ + def getImgRGB(PDS: PortableDataStream, fullcolor: Boolean): Array[Double] = { + val img = ImageIO.read(PDS.open()) + val R = ArrayBuffer[Double]() + val G = ArrayBuffer[Double]() + val B = ArrayBuffer[Double]() + val RGB = ArrayBuffer[Double]() + val w = img.getWidth + val h = img.getHeight + if (fullcolor) { + for (x <- 0 until w){ + for (y <- 0 until h) { + val color = img.getRGB(w - x - 1, y) & 0xffffff + R += (color & 0xff0000) / 65536 + G += (color & 0xff00) / 256 + B += (color & 0xff) + } + } + RGB ++= R ++= G ++= B + RGB.toArray + } else { + for (x <- 0 until w) { + for (y <- 0 until h){ + val color = img.getRGB(w - x - 1, y) & 0xffffff + R += (color & 0xff0000) / 65536 * 0.3 + G += (color & 0xff00) / 256 * 0.59 + B += (color & 0xff) * 0.11 + } + } + val grayArr = new Array[Double](w * h) + for (i <- 0 until w * h) { + 
grayArr(i) = R(i) + G(i) + B(i) + } + grayArr + } + } + + def getRGBArray(sc: SparkContext, path: String, fullcolor: Boolean = true): RDD[Array[Double]] = { + val rgbArray = sc.binaryFiles(path).map(_._2).map(getImgRGB(_, fullcolor)) + rgbArray + } + + def getRGBvector(sc: SparkContext, path: String, fullcolor: Boolean = true): RDD[Vector] = { + val rgbArray = sc.binaryFiles(path).map(_._2).map(getImgRGB(_, fullcolor)) + val rgbVector = rgbArray.map(x => Vectors.dense(x)) + rgbVector + } +} From 160f8144f7592237d3f2ecbb641776b1e95b199f Mon Sep 17 00:00:00 2001 From: liyuance Date: Sun, 17 Jul 2016 23:47:31 +0800 Subject: [PATCH 2/5] del RepIterator --- .../dmlc/mxnet/spark/utils/RepIterator.scala | 37 ------------------- 1 file changed, 37 deletions(-) delete mode 100644 scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/utils/RepIterator.scala diff --git a/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/utils/RepIterator.scala b/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/utils/RepIterator.scala deleted file mode 100644 index 4ae7dc9346d8..000000000000 --- a/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/utils/RepIterator.scala +++ /dev/null @@ -1,37 +0,0 @@ -package ml.dmlc.mxnet.spark.utils - -import java.io.IOException -import scala.collection.Iterator - -/** - * Repeatable Iterator useful in mapPartitions - * @author Yuance.Li - */ -class RepIterator[T](iteratorInternal: Iterator[T], repetition: Int = 1) extends Iterator[T] { - assert(repetition > 0) - var counter = repetition - 1 - var (currentIter, backupIter) = iteratorInternal.duplicate - - override def hasNext: Boolean = { - currentIter.hasNext || counter > 0 - } - - override def next(): T = { - assert(hasNext) - if(currentIter.hasNext) { - currentIter.next() - } else if (counter > 0) { - counter = counter - 1 - var iterTuple = backupIter.duplicate - currentIter = iterTuple._1 - backupIter = iterTuple._2 - currentIter.next() - } else { - throw new IOException("No element in 
this collection") - } - } -} - -object RepIterator { - def apply[T](iteratorInternal: Iterator[T], repetition: Int = 1) = new RepIterator(iteratorInternal, repetition) -} From 88e720785fc394d20329cef9519125c2c7f6f31e Mon Sep 17 00:00:00 2001 From: liyuance Date: Wed, 20 Jul 2016 13:27:10 +0800 Subject: [PATCH 3/5] formatted code style --- .../src/main/scala/ml/dmlc/mxnet/spark/utils/Img2Vector.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/utils/Img2Vector.scala b/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/utils/Img2Vector.scala index 1e8fb12a7082..44feb7b53e3d 100644 --- a/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/utils/Img2Vector.scala +++ b/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/utils/Img2Vector.scala @@ -12,7 +12,7 @@ import org.apache.spark.input._ import org.apache.spark.mllib.linalg.{Vector, Vectors} /** - * Convert image directory into Vectorized RDD + * Convert image directory into Vectorized RDD * @author Yuance.Li */ object Img2Vector{ From 49b141bae882d125be48262d96492716ccf87f0f Mon Sep 17 00:00:00 2001 From: liyuance Date: Tue, 26 Jul 2016 14:02:27 +0800 Subject: [PATCH 4/5] revoke import scala.collection.mutable.ArrayBuffer --- .../src/main/scala/ml/dmlc/mxnet/spark/utils/Img2Vector.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/utils/Img2Vector.scala b/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/utils/Img2Vector.scala index 44feb7b53e3d..505cc9e54963 100644 --- a/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/utils/Img2Vector.scala +++ b/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/utils/Img2Vector.scala @@ -1,7 +1,6 @@ package ml.dmlc.mxnet.spark.utils import javax.imageio.ImageIO -import java.awt.image.BufferedImage import scala.collection.mutable.ArrayBuffer From 7159d47c342bde3ba2c56776caf7dea4f429b29b Mon Sep 17 00:00:00 2001 From: 
liyuance Date: Tue, 26 Jul 2016 14:24:46 +0800 Subject: [PATCH 5/5] add javadoc --- .../main/scala/ml/dmlc/mxnet/spark/utils/Img2Vector.scala | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/utils/Img2Vector.scala b/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/utils/Img2Vector.scala index 505cc9e54963..4e48c9ec2b06 100644 --- a/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/utils/Img2Vector.scala +++ b/scala-package/spark/src/main/scala/ml/dmlc/mxnet/spark/utils/Img2Vector.scala @@ -11,7 +11,11 @@ import org.apache.spark.input._ import org.apache.spark.mllib.linalg.{Vector, Vectors} /** - * Convert image directory into Vectorized RDD + * Img2Vector tools could convert image directory into Vectorized RDD, for example: + * Images stored in hdfs://namenode:9000/user/xxx/images/ + * val sc = new SparkContext(conf) + * val imagesArrayRDD = Img2Vector.getRGBArray(sc, "hdfs://namenode:9000/user/xxx/images/") + * val imagesVectorRDD = Img2Vector.getRGBVector(sc, "hdfs://namenode:9000/user/xxx/images/") * @author Yuance.Li */ object Img2Vector{ @@ -56,7 +60,7 @@ object Img2Vector{ rgbArray } - def getRGBvector(sc: SparkContext, path: String, fullcolor: Boolean = true): RDD[Vector] = { + def getRGBVector(sc: SparkContext, path: String, fullcolor: Boolean = true): RDD[Vector] = { val rgbArray = sc.binaryFiles(path).map(_._2).map(getImgRGB(_, fullcolor)) val rgbVector = rgbArray.map(x => Vectors.dense(x)) rgbVector