Skip to content

Commit

Permalink
Merge pull request #6445 from noahnovsak/dask-pca
Browse files Browse the repository at this point in the history
Dask: PCA
  • Loading branch information
markotoplak committed Oct 29, 2023
2 parents 60d7faa + 7cc9a2c commit 9887af3
Show file tree
Hide file tree
Showing 4 changed files with 125 additions and 63 deletions.
46 changes: 41 additions & 5 deletions Orange/projection/pca.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import warnings
import numbers
import six
import numpy as np
import scipy.sparse as sp
import dask.array as da
from scipy.linalg import lu, qr, svd

from sklearn import decomposition as skl_decomposition
from sklearn.utils import check_array, check_random_state
from sklearn.utils import check_random_state
from sklearn.utils.extmath import svd_flip, safe_sparse_dot
from sklearn.utils.validation import check_is_fitted

Expand All @@ -16,6 +18,7 @@
from Orange.misc.wrapper_meta import WrapperMeta
from Orange.preprocess.score import LearnerScorer
from Orange.projection import SklProjector, DomainProjection
from Orange.util import dummy_callback, wrap_callback

__all__ = ["PCA", "SparsePCA", "IncrementalPCA", "TruncatedSVD"]

Expand Down Expand Up @@ -261,12 +264,45 @@ def __init__(self, n_components=None, copy=True, whiten=False,
self.params = vars()

def fit(self, X, Y=None):
    """Fit the wrapped decomposition on ``X`` (and ``Y``).

    The estimator is obtained from ``_initialize_wrapped`` first. If ``X``
    is a dask array it is then rechunked with ``{0: "auto", 1: -1}`` before
    being handed to the estimator's ``fit``.

    Returns whatever the estimator's ``fit(X, Y)`` call returns.
    """
    estimator = self._initialize_wrapped(X, Y)
    # Non-dask inputs are passed through untouched.
    features = X.rechunk({0: "auto", 1: -1}) if isinstance(X, da.Array) else X
    return estimator.fit(features, Y)

# Build the (unfitted) estimator that fit() later trains: a copy of
# self.params is adjusted and passed either to dask_ml's IncrementalPCA
# (dask input) or to self.__wraps__.
# NOTE(review): X defaults to None, yet X.shape is read whenever
# params["n_components"] is not None - callers presumably always pass X; confirm.
def _initialize_wrapped(self, X=None, Y=None):
params = self.params.copy()
if params["n_components"] is not None:
# NOTE(review): the next four lines look like the pre-change body of fit()
# that this commit removed (the page reports 5 deletions for this file and
# the diff markers were lost in extraction) - confirm before treating them
# as part of this method.
params["n_components"] = min(min(X.shape), params["n_components"])
proj = self.__wraps__(**params)
proj = proj.fit(X, Y)
return PCAModel(proj, self.domain, len(proj.components_))
# Cap the requested number of components at the smallest dimension of X.
params["n_components"] = min(*X.shape, params["n_components"])

# Dask input (either X or Y): try the dask_ml implementation first.
if isinstance(X, da.Array) or isinstance(Y, da.Array):
try:
# Imported lazily so that a missing dask_ml only triggers the fallback below.
import dask_ml.decomposition as dask_decomposition

# An "auto" iterated_power is replaced with 0 before the params are passed on.
if params["iterated_power"] == "auto":
params["iterated_power"] = 0
# "tol" is dropped from the params - presumably because IncrementalPCA does
# not accept it; confirm.
# NOTE(review): indentation was lost, so whether this runs only inside the
# "auto" branch above cannot be told from here.
del params["tol"]

# use IPCA instead of PCA due to memory issues
return dask_decomposition.IncrementalPCA(**params)

# dask_ml unavailable: warn and fall through to the default wrapper.
except ImportError:
warnings.warn("dask_ml is not installed. Using sklearn instead.")

# Default path: instantiate the wrapped class with the adjusted params.
return self.__wraps__(**params)

def __call__(self, data, progress_callback=None):
    """Preprocess ``data``, fit the projection and return a ``PCAModel``.

    ``progress_callback`` is optional; ``dummy_callback`` is used when it
    is ``None``. It is called with ``0`` before preprocessing, ``0.1``
    before fitting and ``1`` once the model is ready. Preprocessing
    reports through a callback wrapped with ``end=0.1``.

    The returned model carries the preprocessed data's domain as
    ``pre_domain`` and this projector's ``name``.
    """
    report = dummy_callback if progress_callback is None else progress_callback

    report(0, "Preprocessing...")
    preprocess_report = wrap_callback(report, end=0.1)
    data = self.preprocess(data, progress_callback=preprocess_report)

    report(0.1, "Fitting...")
    fitted = self.fit(data.X, data.Y)

    model = PCAModel(fitted, data.domain, len(fitted.components_))
    model.pre_domain = data.domain
    model.name = self.name

    report(1)
    return model


class SparsePCA(SklProjector):
Expand Down
72 changes: 41 additions & 31 deletions Orange/tests/test_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,19 @@
from Orange.preprocess import Continuize, Normalize
from Orange.projection import pca, PCA, SparsePCA, IncrementalPCA, TruncatedSVD
from Orange.tests import test_filename
from Orange.tests.test_dasktable import with_dasktable


class TestPCA(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.ionosphere = Table(test_filename('datasets/ionosphere.tab'))
cls.iris = Table('iris')
cls.zoo = Table('zoo')

def test_pca(self):
data = self.ionosphere
self.__pca_test_helper(data, n_com=3, min_xpl_var=0.5)
def setUp(self):
    """Load the ionosphere, iris and zoo tables onto the test instance."""
    ionosphere_path = test_filename('datasets/ionosphere.tab')
    self.ionosphere = Table(ionosphere_path)
    # The remaining datasets are loaded by name.
    for dataset in ('iris', 'zoo'):
        setattr(self, dataset, Table(dataset))

@with_dasktable
def test_pca(self, prepare_table):
    """Run the PCA helper on ionosphere for 3, 10 and 32 components.

    ``prepare_table`` is supplied by ``with_dasktable`` - presumably it
    converts the table to the backend under test; confirm.
    """
    table = prepare_table(self.ionosphere)
    # (number of components, minimal explained variance) pairs.
    cases = ((3, 0.49), (10, 0.7), (32, 1))
    for n_com, min_xpl_var in cases:
        self.__pca_test_helper(table, n_com=n_com, min_xpl_var=min_xpl_var)

Expand All @@ -35,7 +36,7 @@ def __pca_test_helper(self, data, n_com, min_xpl_var):
self.assertEqual(n_com, pca_model.n_components)
self.assertEqual((n_com, data.X.shape[1]), pca_model.components_.shape)
proj = np.dot(data.X - pca_model.mean_, pca_model.components_.T)
np.testing.assert_almost_equal(pca_model(data).X, proj)
self.assertTrue(np.allclose(pca_model(data).X, proj))

def test_sparse_pca(self):
data = self.ionosphere[:100]
Expand All @@ -50,9 +51,10 @@ def __sparse_pca_test_helper(self, data, n_com, max_err):
self.assertEqual((n_com, data.X.shape[1]), pca_model.components_.shape)
self.assertLessEqual(pca_model.error_[-1], max_err)

def test_randomized_pca(self):
data = self.ionosphere
self.__rnd_pca_test_helper(data, n_com=3, min_xpl_var=0.5)
@with_dasktable
def test_randomized_pca(self, prepare_table):
    """Run the randomized-PCA helper on ionosphere for 3, 10 and 32 components.

    ``prepare_table`` is supplied by ``with_dasktable`` - presumably it
    converts the table to the backend under test; confirm.
    """
    table = prepare_table(self.ionosphere)
    # (number of components, minimal explained variance) pairs.
    cases = ((3, 0.47), (10, 0.7), (32, 0.98))
    for n_com, min_xpl_var in cases:
        self.__rnd_pca_test_helper(table, n_com=n_com, min_xpl_var=min_xpl_var)

Expand All @@ -64,7 +66,7 @@ def __rnd_pca_test_helper(self, data, n_com, min_xpl_var):
self.assertEqual(n_com, pca_model.n_components)
self.assertEqual((n_com, data.X.shape[1]), pca_model.components_.shape)
proj = np.dot(data.X - pca_model.mean_, pca_model.components_.T)
np.testing.assert_almost_equal(pca_model(data).X, proj)
self.assertTrue(np.allclose(pca_model(data).X, proj))

def test_improved_randomized_pca_properly_called(self):
# It doesn't matter what we put into the matrix
Expand Down Expand Up @@ -215,17 +217,20 @@ def test_transformed_domain_does_not_pickle_data(self):
pca_iris2 = pickle.loads(pickle.dumps(pca_iris))
self.assertIsNone(pca_iris2.domain[0].compute_value.transformed)

def test_chain(self):
zoo_c = Continuize()(self.zoo)
pca = PCA(n_components=3)(zoo_c)(self.zoo)
@with_dasktable
def test_chain(self, prepare_table):
zoo = prepare_table(self.zoo)
zoo_c = Continuize()(zoo)
pca = PCA(n_components=3)(zoo_c)(zoo)
pca2 = PCA(n_components=3)(zoo_c)(zoo_c)
pp = [Continuize()]
pca3 = PCA(n_components=3, preprocessors=pp)(self.zoo)(self.zoo)
np.testing.assert_almost_equal(pca.X, pca2.X)
np.testing.assert_almost_equal(pca.X, pca3.X)
pca3 = PCA(n_components=3, preprocessors=pp)(zoo)(zoo)
self.assertTrue(np.allclose(pca.X, pca2.X))
self.assertTrue(np.allclose(pca.X, pca3.X))

def test_PCA_scorer(self):
data = self.iris
@with_dasktable
def test_PCA_scorer(self, prepare_table):
data = prepare_table(self.iris)
pca = PCA(preprocessors=[Normalize()])
pca.component = 1
scores = pca.score_data(data)
Expand All @@ -236,23 +241,28 @@ def test_PCA_scorer(self):
self.assertEqual([round(s, 4) for s in scores[0]],
[0.5224, 0.2634, 0.5813, 0.5656])

def test_PCA_scorer_component(self):
@with_dasktable
def test_PCA_scorer_component(self, prepare_table):
pca = PCA()
for i in range(1, len(self.zoo.domain.attributes) + 1):
zoo = prepare_table(self.zoo)
for i in range(1, len(zoo.domain.attributes) + 1):
pca.component = i
scores = pca.score_data(self.zoo)
scores = pca.score_data(zoo)
self.assertEqual(scores.shape,
(pca.component, len(self.zoo.domain.attributes)))
(pca.component, len(zoo.domain.attributes)))

def test_PCA_scorer_all_components(self):
n_attr = len(self.iris.domain.attributes)
@with_dasktable
def test_PCA_scorer_all_components(self, prepare_table):
iris = prepare_table(self.iris)
n_attr = len(iris.domain.attributes)
pca = PCA()
scores = pca.score_data(self.iris)
scores = pca.score_data(iris)
self.assertEqual(scores.shape, (n_attr, n_attr))

def test_max_components(self):
@with_dasktable
def test_max_components(self, prepare_table):
d = np.random.RandomState(0).rand(20, 20)
data = Table.from_numpy(None, d)
data = prepare_table(Table.from_numpy(None, d))
pca = PCA()(data)
self.assertEqual(len(pca.explained_variance_ratio_), 20)
pca = PCA(n_components=10)(data)
Expand Down
1 change: 1 addition & 0 deletions Orange/widgets/unsupervised/owpca.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin
from Orange.widgets.utils.slidergraph import SliderGraph
from Orange.widgets.utils.widgetpreview import WidgetPreview
from Orange.widgets.utils.annotated_data import add_columns
from Orange.widgets.widget import Input, Output

# Maximum number of PCA components that we can set in the widget
Expand Down
69 changes: 42 additions & 27 deletions Orange/widgets/unsupervised/tests/test_owpca.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,24 @@
from Orange.widgets.tests.utils import table_dense_sparse, possible_duplicate_table
from Orange.widgets.unsupervised.owpca import OWPCA
from Orange.tests import test_filename
from Orange.tests.test_dasktable import with_dasktable


class TestOWPCA(WidgetTest):
def setUp(self):
    """Create the OWPCA widget under test and load the iris table."""
    self.widget: OWPCA = self.create_widget(OWPCA)
    self.iris: Table = Table("iris")

def test_set_variance100(self):
self.widget.set_data(self.iris)
@with_dasktable
def test_set_variance100(self, prepare_table):
    """Set the covered variance to 100 and run the variance spin update.

    The test makes no assertions; it only requires that the calls do not
    raise.
    """
    table = prepare_table(self.iris)
    widget = self.widget
    widget.set_data(table)
    widget.variance_covered = 100
    widget._update_selection_variance_spin()

def test_constant_data(self):
data = self.iris[::5].copy()
@with_dasktable
def test_constant_data(self, prepare_table):
data = prepare_table(self.iris[::5].copy())
with data.unlocked():
data.X[:, :] = 1.0
# Ignore the warning: the test checks whether the widget shows
Expand All @@ -37,23 +41,26 @@ def test_constant_data(self):
self.assertIsNone(self.get_output(self.widget.Outputs.transformed_data))
self.assertIsNone(self.get_output(self.widget.Outputs.components))

def test_empty_data(self):
@with_dasktable
def test_empty_data(self, prepare_table):
""" Check widget for dataset with no rows and for dataset with no attributes """
self.send_signal(self.widget.Inputs.data, self.iris[:0])
data = prepare_table(self.iris)
self.send_signal(self.widget.Inputs.data, data[:0])
self.assertTrue(self.widget.Error.no_instances.is_shown())

domain = Domain([], None, self.iris.domain.variables)
new_data = Table.from_table(domain, self.iris)
domain = Domain([], None, data.domain.variables)
new_data = data.transform(domain)
self.send_signal(self.widget.Inputs.data, new_data)
self.assertTrue(self.widget.Error.no_features.is_shown())
self.assertFalse(self.widget.Error.no_instances.is_shown())

self.send_signal(self.widget.Inputs.data, None)
self.assertFalse(self.widget.Error.no_features.is_shown())

def test_limit_components(self):
@with_dasktable
def test_limit_components(self, prepare_table):
X = np.random.RandomState(0).rand(101, 101)
data = Table.from_numpy(None, X)
data = prepare_table(Table.from_numpy(None, X))
self.widget.ncomponents = 100
self.send_signal(self.widget.Inputs.data, data, wait=5000)
tran = self.get_output(self.widget.Outputs.transformed_data)
Expand All @@ -79,8 +86,10 @@ def test_migrate_settings_changes_variance_covered_to_int(self):
OWPCA.migrate_settings(settings, 0)
self.assertEqual(settings["variance_covered"], 100)

def test_variance_shown(self):
self.send_signal(self.widget.Inputs.data, self.iris)
@with_dasktable
def test_variance_shown(self, prepare_table):
data = prepare_table(self.iris)
self.send_signal(self.widget.Inputs.data, data)
self.widget.maxp = 2
self.widget._setup_plot()
self.wait_until_finished()
Expand All @@ -91,15 +100,18 @@ def test_variance_shown(self):
var3 = self.widget.variance_covered
self.assertGreater(var3, var2)

def test_unique_domain_components(self):
table = possible_duplicate_table('components')
@with_dasktable
def test_unique_domain_components(self, prepare_table):
    """The first meta of the components output must be named 'components (1)'.

    The input comes from ``possible_duplicate_table('components')``.
    """
    source = possible_duplicate_table('components')
    self.send_signal(self.widget.Inputs.data, prepare_table(source))
    components = self.get_output(self.widget.Outputs.components)
    first_meta = components.domain.metas[0]
    self.assertEqual(first_meta.name, 'components (1)')

def test_variance_attr(self):
@with_dasktable
def test_variance_attr(self, prepare_table):
data = prepare_table(self.iris)
self.widget.ncomponents = 2
self.send_signal(self.widget.Inputs.data, self.iris, wait=5000)
self.send_signal(self.widget.Inputs.data, data, wait=5000)
self.wait_until_stop_blocking()
self.widget._variance_ratio = np.array([0.5, 0.25, 0.2, 0.05])
self.widget.commit.now()
Expand Down Expand Up @@ -228,12 +240,13 @@ def test_normalized_gives_correct_result(self, prepare_table):

np.testing.assert_almost_equal(widget_result.X, pca_embedding)

def test_do_not_mask_features(self):
@with_dasktable
def test_do_not_mask_features(self, prepare_table):
# the widget used to replace cached variables when creating the
# components output (until 20170726)
data = Table("iris.tab")
data = prepare_table(self.iris)
ndata = data.copy()
self.widget.set_data(data)
ndata = Table("iris.tab")
self.assertEqual(data.domain[0], ndata.domain[0])

def test_on_cut_changed(self):
Expand All @@ -251,27 +264,29 @@ def test_on_cut_changed(self):
invalidate.assert_not_called()
self.assertEqual(widget.ncomponents, 0)

def test_output_data(self):
@with_dasktable
def test_output_data(self, prepare_table):
widget = self.widget
widget.ncomponents = 2
domain = Domain(self.iris.domain.attributes[:3],
self.iris.domain.class_var,
self.iris.domain.attributes[3:])
iris = self.iris.transform(domain)
data = prepare_table(self.iris)
domain = Domain(data.domain.attributes[:3],
data.domain.class_var,
data.domain.attributes[3:])
iris = data.transform(domain)
self.send_signal(widget.Inputs.data, iris)
output = self.get_output(widget.Outputs.data)
outdom = output.domain
self.assertEqual(domain.attributes, outdom.attributes)
self.assertEqual(domain.class_var, outdom.class_var)
self.assertEqual(domain.metas, outdom.metas[:1])
self.assertEqual(len(outdom.metas), 3)
np.testing.assert_equal(iris.X, output.X)
np.testing.assert_equal(iris.Y, output.Y)
self.assertTrue(np.all(iris.X == output.X))
self.assertTrue(np.all(iris.Y == output.Y))
np.testing.assert_equal(iris.metas[:, 0], output.metas[:, 0])

trans = self.get_output(widget.Outputs.transformed_data)
self.assertEqual(trans.domain.attributes, outdom.metas[1:])
np.testing.assert_equal(trans.X, output.metas[:, 1:])
np.testing.assert_equal(trans.X, output.metas[:, 1:]) # dask

self.send_signal(widget.Inputs.data, None)
output = self.get_output(widget.Outputs.data)
Expand Down

0 comments on commit 9887af3

Please sign in to comment.