Skip to content

Commit

Permalink
Remove unnecessary appendBias implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
Lewuathe committed May 9, 2015
1 parent 44295c2 commit a353354
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -71,13 +71,15 @@ private[python] class PythonMLLibAPI extends Serializable {
minPartitions: Int): JavaRDD[LabeledPoint] =
MLUtils.loadLabeledPoints(jsc.sc, path, minPartitions)

def appendBias(data: org.apache.spark.mllib.linalg.Vector)
: org.apache.spark.mllib.linalg.Vector
= MLUtils.appendBias(data)

def loadVectors(jsc: JavaSparkContext, path: String)
: RDD[org.apache.spark.mllib.linalg.Vector]
= MLUtils.loadVectors(jsc.sc, path)
/**
* Loads and serializes vectors saved with `RDD#saveAsTextFile`.
* @param jsc Java SparkContext
* @param path file or directory path in any Hadoop-supported file system URI
* @return serialized vectors in a RDD
*/
def loadVectors(jsc: JavaSparkContext,
path: String): RDD[Vector] =
MLUtils.loadVectors(jsc.sc, path)

private def trainRegressionModel(
learner: GeneralizedLinearAlgorithm[_ <: GeneralizedLinearModel],
Expand Down
11 changes: 7 additions & 4 deletions python/pyspark/mllib/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -821,7 +821,7 @@ def test_model_transform(self):

class MLUtilsTests(MLlibTestCase):
def test_append_bias(self):
data = [1.0, 2.0, 3.0]
data = [2.0, 2.0, 2.0]
ret = MLUtils.appendBias(data)
self.assertEqual(ret[3], 1.0)

Expand All @@ -832,14 +832,17 @@ def test_load_vectors(self):
[1.0, 2.0, 3.0]
]
try:
self.sc.parallelize(data).saveAsTextFile("test_load_vectors")
ret_rdd = MLUtils.loadVectors(self.sc, "test_load_vectors")
temp_dir = tempfile.mkdtemp()
load_vectors_path = os.path.join(temp_dir, "test_load_vectors")
self.sc.parallelize(data).saveAsTextFile(load_vectors_path)
ret_rdd = MLUtils.loadVectors(self.sc, load_vectors_path)
ret = ret_rdd.collect()
ret.sort()
self.assertEqual(len(ret), 2)
self.assertEqual(ret[0], DenseVector([1.0, 2.0, 3.0]))
self.assertEqual(ret[1], DenseVector([1.0, 2.0, 3.0]))
finally:
shutil.rmtree("test_load_vectors")
shutil.rmtree(load_vectors_path)


if __name__ == "__main__":
Expand Down
10 changes: 9 additions & 1 deletion python/pyspark/mllib/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,10 +171,18 @@ def loadLabeledPoints(sc, path, minPartitions=None):

@staticmethod
def appendBias(data):
return callMLlibFunc("appendBias", _convert_to_vector(data))
"""
Returns a new vector with `1.0` (bias) appended to the input vector.
"""
vec = _convert_to_vector(data)
return np.append(vec, 1.0)

@staticmethod
def loadVectors(sc, path):
"""
Loads vectors saved using `RDD[Vector].saveAsTextFile`
with the default number of partitions.
"""
return callMLlibFunc("loadVectors", sc, path)


Expand Down

0 comments on commit a353354

Please sign in to comment.