Skip to content

Commit

Permalink
[SPARK-7401] [MLLIB] [PYSPARK] Vectorize dot product and sq_dist between SparseVector and DenseVector
Browse files Browse the repository at this point in the history

Currently we iterate over the indices in a Python loop; these computations can be vectorized with NumPy instead.

Author: MechCoder <manojkumarsivaraj334@gmail.com>

Closes #5946 from MechCoder/spark-7203 and squashes the following commits:

034d086 [MechCoder] Vectorize dot calculation for numpy arrays for ndim=2
bce2b07 [MechCoder] fix doctest
fcad0a3 [MechCoder] Remove type checks for list, pyarray etc
0ee5dd4 [MechCoder] Add tests and other isinstance changes
e5f1de0 [MechCoder] [SPARK-7401] Vectorize dot product and sq_dist
  • Loading branch information
MechCoder authored and davies committed Jul 3, 2015
1 parent ab535b9 commit f0fac2a
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 23 deletions.
44 changes: 21 additions & 23 deletions python/pyspark/mllib/linalg.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,22 +577,19 @@ def dot(self, other):
...
AssertionError: dimension mismatch
"""
if type(other) == np.ndarray:
if other.ndim == 2:
results = [self.dot(other[:, i]) for i in xrange(other.shape[1])]
return np.array(results)
elif other.ndim > 2:

if isinstance(other, np.ndarray):
if other.ndim not in [2, 1]:
raise ValueError("Cannot call dot with %d-dimensional array" % other.ndim)
assert len(self) == other.shape[0], "dimension mismatch"
return np.dot(self.values, other[self.indices])

assert len(self) == _vector_size(other), "dimension mismatch"

if type(other) in (np.ndarray, array.array, DenseVector):
result = 0.0
for i in xrange(len(self.indices)):
result += self.values[i] * other[self.indices[i]]
return result
if isinstance(other, DenseVector):
return np.dot(other.array[self.indices], self.values)

elif type(other) is SparseVector:
elif isinstance(other, SparseVector):
result = 0.0
i, j = 0, 0
while i < len(self.indices) and j < len(other.indices):
Expand Down Expand Up @@ -635,22 +632,23 @@ def squared_distance(self, other):
AssertionError: dimension mismatch
"""
assert len(self) == _vector_size(other), "dimension mismatch"
if type(other) in (list, array.array, DenseVector, np.array, np.ndarray):
if type(other) is np.array and other.ndim != 1:

if isinstance(other, np.ndarray) or isinstance(other, DenseVector):
if isinstance(other, np.ndarray) and other.ndim != 1:
raise Exception("Cannot call squared_distance with %d-dimensional array" %
other.ndim)
result = 0.0
j = 0 # index into our own array
for i in xrange(len(other)):
if j < len(self.indices) and self.indices[j] == i:
diff = self.values[j] - other[i]
result += diff * diff
j += 1
else:
result += other[i] * other[i]
if isinstance(other, DenseVector):
other = other.array
sparse_ind = np.zeros(other.size, dtype=bool)
sparse_ind[self.indices] = True
dist = other[sparse_ind] - self.values
result = np.dot(dist, dist)

other_ind = other[~sparse_ind]
result += np.dot(other_ind, other_ind)
return result

elif type(other) is SparseVector:
elif isinstance(other, SparseVector):
result = 0.0
i, j = 0, 0
while i < len(self.indices) and j < len(other.indices):
Expand Down
8 changes: 8 additions & 0 deletions python/pyspark/mllib/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,17 +129,22 @@ def test_dot(self):
[1., 2., 3., 4.],
[1., 2., 3., 4.],
[1., 2., 3., 4.]])
arr = pyarray.array('d', [0, 1, 2, 3])
self.assertEquals(10.0, sv.dot(dv))
self.assertTrue(array_equal(array([3., 6., 9., 12.]), sv.dot(mat)))
self.assertEquals(30.0, dv.dot(dv))
self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat)))
self.assertEquals(30.0, lst.dot(dv))
self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat)))
self.assertEquals(7.0, sv.dot(arr))

def test_squared_distance(self):
sv = SparseVector(4, {1: 1, 3: 2})
dv = DenseVector(array([1., 2., 3., 4.]))
lst = DenseVector([4, 3, 2, 1])
lst1 = [4, 3, 2, 1]
arr = pyarray.array('d', [0, 2, 1, 3])
narr = array([0, 2, 1, 3])
self.assertEquals(15.0, _squared_distance(sv, dv))
self.assertEquals(25.0, _squared_distance(sv, lst))
self.assertEquals(20.0, _squared_distance(dv, lst))
Expand All @@ -149,6 +154,9 @@ def test_squared_distance(self):
self.assertEquals(0.0, _squared_distance(sv, sv))
self.assertEquals(0.0, _squared_distance(dv, dv))
self.assertEquals(0.0, _squared_distance(lst, lst))
self.assertEquals(25.0, _squared_distance(sv, lst1))
self.assertEquals(3.0, _squared_distance(sv, arr))
self.assertEquals(3.0, _squared_distance(sv, narr))

def test_conversion(self):
# numpy arrays should be automatically upcast to float64
Expand Down

0 comments on commit f0fac2a

Please sign in to comment.