Skip to content

Commit

Permalink
FIX #213, name creation via hashing for sparse data
Browse files Browse the repository at this point in the history
  • Loading branch information
mfeurer committed May 15, 2017
1 parent 6068f54 commit e87e812
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 15 deletions.
4 changes: 2 additions & 2 deletions autosklearn/automl.py
Expand Up @@ -26,7 +26,7 @@
pipeline
from autosklearn.ensemble_builder import EnsembleBuilder
from autosklearn.smbo import AutoMLSMBO
from autosklearn.util.hash import hash_numpy_array
from autosklearn.util.hash import hash_array_or_matrix


def _model_predict(self, X, batch_size, identifier):
Expand Down Expand Up @@ -158,7 +158,7 @@ def fit(self, X, y,
self._backend.context.create_directories()

if dataset_name is None:
dataset_name = hash_numpy_array(X)
dataset_name = hash_array_or_matrix(X)

self._backend.save_start_time(self._seed)
self._stopwatch = StopWatch()
Expand Down
16 changes: 13 additions & 3 deletions autosklearn/util/hash.py
@@ -1,13 +1,23 @@
import hashlib

import scipy.sparse

def hash_numpy_array(X):

def hash_array_or_matrix(X):
    """Return an MD5 hex digest identifying a dense array or sparse matrix.

    The digest is built from the raw data buffers plus the shape, so two
    inputs with identical contents and shape produce the same name.

    Parameters
    ----------
    X : numpy.ndarray or scipy.sparse matrix
        Data to hash. Dense arrays may have any memory layout. Sparse
        matrices in a format other than CSR/CSC/BSR are converted to CSR
        first, because only those formats expose ``indices``/``indptr``.

    Returns
    -------
    str
        Hexadecimal MD5 digest.

    Notes
    -----
    An F-contiguous array is hashed through its transpose (buffer and
    shape), so it yields the same digest as the C-contiguous array holding
    the transposed values. This matches the pre-existing behaviour.
    """
    m = hashlib.md5()

    if scipy.sparse.issparse(X):
        # COO, LIL, DOK and DIA have no indices/indptr attributes; normalise
        # them to CSR instead of failing with an AttributeError.
        if X.format not in ('csr', 'csc', 'bsr'):
            X = X.tocsr()
        m.update(X.indices)
        m.update(X.indptr)
        m.update(X.data)
        m.update(str(X.shape).encode('utf8'))
    else:
        if X.flags['C_CONTIGUOUS']:
            dense = X
        elif X.flags['F_CONTIGUOUS']:
            # The transpose of an F-contiguous array is a C-contiguous view.
            dense = X.T
        else:
            # Strided views (e.g. X[:, ::2]) expose no contiguous buffer and
            # hashlib would reject them, so hash a C-ordered copy instead.
            dense = X.copy(order='C')
        m.update(dense.data)
        m.update(str(dense.shape).encode('utf8'))

    return m.hexdigest()
31 changes: 21 additions & 10 deletions test/test_util/test_hash.py
@@ -1,24 +1,25 @@
import unittest

import numpy as np
import scipy.sparse

from autosklearn.util.hash import hash_numpy_array
from autosklearn.util.hash import hash_array_or_matrix


class HashTests(unittest.TestCase):

def test_c_contiguous_array(self):
array = np.array([[1, 2], [3, 4]])

hash = hash_numpy_array(array)
hash = hash_array_or_matrix(array)

self.assertIsNotNone(hash)

def test_f_contiguous_array(self):
array = np.array([[1, 2], [3, 4]])
array = np.asfortranarray(array)

hash = hash_numpy_array(array)
hash = hash_array_or_matrix(array)

self.assertIsNotNone(hash)

Expand All @@ -27,25 +28,35 @@ def test_transpose_arrays(self):
f_array = np.array([[1, 3], [2, 4]])
f_array = np.asfortranarray(f_array)

c_hash = hash_numpy_array(c_array)
f_hash = hash_numpy_array(f_array)
c_hash = hash_array_or_matrix(c_array)
f_hash = hash_array_or_matrix(f_array)

self.assertEqual(c_hash, f_hash)

def test_same_data_arrays(self):
first_array = np.array([[1, 2], [3, 4]])
second_array = np.array([[1, 2], [3, 4]])

first_hash = hash_numpy_array(first_array)
second_hash = hash_numpy_array(second_array)
first_hash = hash_array_or_matrix(first_array)
second_hash = hash_array_or_matrix(second_array)

self.assertEqual(first_hash, second_hash)

def test_different_data_arrays(self):
first_array = np.array([[1, 2], [3, 4]])
second_array = np.array([[1, 3], [2, 4]])

first_hash = hash_numpy_array(first_array)
second_hash = hash_numpy_array(second_array)
first_hash = hash_array_or_matrix(first_array)
second_hash = hash_array_or_matrix(second_array)

self.assertNotEqual(first_hash, second_hash)
self.assertNotEqual(first_hash, second_hash)

def test_scipy_csr(self):
row = np.array([0, 0, 1, 2, 2, 2])
col = np.array([0, 2, 2, 0, 1, 2])
data = np.array([1, 2, 3, 4, 5, 6])
matrix = scipy.sparse.csr_matrix((data, (row, col)), shape=(3, 3))

hash = hash_array_or_matrix(matrix)

self.assertIsNotNone(hash)

0 comments on commit e87e812

Please sign in to comment.