Commit

Merge branch 'feature/sparse-tests' into develop
Allen Riddell committed Oct 30, 2014
2 parents 4420c5f + d44e73c commit e2e35ee
Showing 5 changed files with 83 additions and 7 deletions.
14 changes: 8 additions & 6 deletions lda/lda.py
@@ -104,7 +104,7 @@ def fit(self, X, y=None):
----------
X: array-like, shape (n_samples, n_features)
Training data, where n_samples is the number of samples
- and n_features is the number of features.
+ and n_features is the number of features. Sparse matrix allowed.
Returns
-------
@@ -121,15 +121,18 @@ def fit_transform(self, X, y=None):
----------
X : array-like, shape (n_samples, n_features)
New data, where n_samples is the number of samples
- and n_features is the number of features.
+ and n_features is the number of features. Sparse matrix allowed.
Returns
-------
doc_topic : array-like, shape (n_samples, n_topics)
Point estimate of the document-topic distributions
"""
- self._fit(np.atleast_2d(X))
+ if isinstance(X, np.ndarray):
+     # if user passes a 1-dim (1-feature) array
+     X = np.atleast_2d(X)
+ self._fit(X)
return self.doc_topic_

def transform(self, X, y=None):
@@ -156,10 +159,9 @@ def _fit(self, X):
----------
X: array-like, shape (n_samples, n_features)
Training vector, where n_samples is the number of samples and
- n_features is the number of features.
+ n_features is the number of features. Sparse matrix allowed.
"""
random_state = lda.utils.check_random_state(self.random_state)
- X = np.atleast_2d(X).astype(np.float64)
self._initialize(X, random_state)
for it in range(self.n_iter):
if it % self.refresh == 0:
@@ -185,7 +187,7 @@ def _print_status(self, iter):

def _initialize(self, X, random_state):
D, W = X.shape
- N = int(np.sum(X))
+ N = int(X.sum())
n_topics = self.n_topics
n_iter = self.n_iter
logger.info("n_documents: {}".format(D))
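With the fit_transform change above, a scipy.sparse document-term matrix is passed through to _fit without being wrapped by np.atleast_2d. A minimal usage sketch, assuming a toy random matrix in place of the reuters.ldac fixture exercised by the new test:

# Sketch only: fitting lda.LDA on a scipy.sparse document-term matrix,
# mirroring what test_lda_sparse.py does with the Reuters fixture.
# The toy matrix below is illustrative, not part of this commit.
import numpy as np
import scipy.sparse

import lda

rng = np.random.RandomState(1)
dense = rng.multinomial(100, np.ones(50) / 50, size=20)  # 20 docs, 50 terms
dtm = scipy.sparse.csr_matrix(dense)

model = lda.LDA(n_topics=5, n_iter=50, random_state=1)
doc_topic = model.fit_transform(dtm)  # sparse input now skips np.atleast_2d

print(doc_topic.shape)        # (20, 5)
print(doc_topic.sum(axis=1))  # each row sums to 1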
50 changes: 50 additions & 0 deletions lda/tests/test_lda_sparse.py
@@ -0,0 +1,50 @@
# coding=utf-8
from __future__ import absolute_import, unicode_literals # noqa
import os

import numpy as np
import scipy.sparse
import oslotest.base

import lda
import lda.utils


class TestLDASparse(oslotest.base.BaseTestCase):

@classmethod
def setUpClass(cls):
test_dir = os.path.dirname(__file__)
reuters_ldac_fn = os.path.join(test_dir, 'reuters.ldac')
cls.dtm = scipy.sparse.csr_matrix(lda.utils.ldac2dtm(open(reuters_ldac_fn), offset=0))
cls.n_iter = n_iter = 1
cls.n_topics = n_topics = 10
cls.random_seed = random_seed = 1
cls.model = lda.LDA(n_topics=n_topics, n_iter=n_iter, random_state=random_seed)

def test_lda_sparse(self):
dtm = self.dtm
model = self.model
doc_topic = model.fit_transform(dtm)
self.assertEqual(len(doc_topic), dtm.shape[0])
N = dtm.sum()
D, V = dtm.shape
_, K = doc_topic.shape
self.assertEqual(model.doc_topic_.shape, doc_topic.shape)
np.testing.assert_array_equal(model.doc_topic_, doc_topic)
self.assertEqual(model.doc_topic_.shape, (D, K))
self.assertEqual(model.ndz_.shape, (D, K))
self.assertEqual(model.topic_word_.shape, (K, V))
self.assertEqual(model.nzw_.shape, (K, V))

# check contents
self.assertAlmostEqual(model.nzw_.sum(), N)
self.assertAlmostEqual(model.ndz_.sum(), N)
self.assertAlmostEqual(model.nz_.sum(), N)
self.assertAlmostEqual(model.doc_topic_.sum(), D)
self.assertAlmostEqual(model.topic_word_.sum(), K)
np.testing.assert_array_equal(model.ndz_.sum(axis=0), model.nz_)

# check distributions sum to one
np.testing.assert_array_almost_equal(model.doc_topic_.sum(axis=1), np.ones(D))
np.testing.assert_array_almost_equal(model.topic_word_.sum(axis=1), np.ones(K))
21 changes: 21 additions & 0 deletions lda/tests/test_utils.py
@@ -4,6 +4,7 @@
import os

import numpy as np
import scipy.sparse
import oslotest.base

import lda.utils as utils
@@ -20,6 +21,7 @@ class TestUtils(oslotest.base.BaseTestCase):
dtm = np.zeros((D, W), dtype=int)
for d in range(D):
dtm[d] = np.random.multinomial(N_WORDS_PER_DOC, np.ones(W) / W)
dtm_sparse = scipy.sparse.csr_matrix(dtm)
N_BY_W = np.sum(dtm, axis=0)
N_BY_D = np.sum(dtm, axis=1)

@@ -67,3 +69,22 @@ def test_ldac_conversion(self):
f = io.StringIO('\n'.join(doclines))
dtm_new = utils.ldac2dtm(f)
self.assertTrue(np.all(dtm == dtm_new))

def test_lists_to_matrix_sparse(self):
dtm = self.dtm_sparse
WS, DS = utils.matrix_to_lists(dtm)
dtm_new = utils.lists_to_matrix(WS, DS)
self.assertTrue(np.all(dtm == dtm_new))

def test_ldac_conversion_sparse(self):
dtm = self.dtm
dtm_sparse = self.dtm_sparse
N, V = dtm.shape
doclines = list(utils.dtm2ldac(dtm_sparse))
nd_unique = np.sum(dtm > 0, axis=1)
for n, docline in zip(nd_unique, doclines):
self.assertEqual(n, int(docline.split(' ')[0]))
self.assertEqual(len(doclines), N)
f = io.StringIO('\n'.join(doclines))
dtm_new = utils.ldac2dtm(f)
self.assertTrue(np.all(dtm == dtm_new))
2 changes: 1 addition & 1 deletion lda/utils.py
@@ -47,7 +47,7 @@ def matrix_to_lists(doc_word):
logger.warning("all zero column in document-term matrix found")
try:
# if doc_word is a scipy sparse matrix
- doc_word = doc_word.copy().tocoo()
+ doc_word = doc_word.copy().tolil()
except AttributeError:
pass

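The utils.py change switches the sparse conversion in matrix_to_lists from tocoo() to tolil(), and the new test_lists_to_matrix_sparse test checks that the word/document index lists rebuild the original counts. A short sketch of that round trip, again using a toy matrix rather than the test fixture:

# Sketch of the round trip exercised by test_lists_to_matrix_sparse,
# using a small made-up matrix rather than the test fixture.
import numpy as np
import scipy.sparse

import lda.utils as utils

rng = np.random.RandomState(1)
dense = rng.multinomial(50, np.ones(30) / 30, size=10)  # 10 docs, 30 terms
dtm = scipy.sparse.csr_matrix(dense)

WS, DS = utils.matrix_to_lists(dtm)      # word and document index lists (one entry per token)
dtm_new = utils.lists_to_matrix(WS, DS)  # rebuild the document-term counts
assert np.all(dense == dtm_new)          # counts survive the round trip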
3 changes: 3 additions & 0 deletions test-requirements.txt
@@ -2,6 +2,9 @@
# of appearance. Changing the order has an impact on the overall integration
# process, which may cause wedges in the gate later.

# scipy needed for tests with sparse matrices
scipy>=0.10.1

hacking>=0.9.2,<0.10

coverage>=3.6
