Commit

Merge branch 'feature/sparse-tests' into develop
Allen Riddell committed Oct 30, 2014
2 parents 4420c5f + d44e73c commit e2e35ee
Showing 5 changed files with 83 additions and 7 deletions.
14 changes: 8 additions & 6 deletions lda/lda.py
@@ -104,7 +104,7 @@ def fit(self, X, y=None):
----------
X: array-like, shape (n_samples, n_features)
Training data, where n_samples is the number of samples
- and n_features is the number of features.
+ and n_features is the number of features. Sparse matrix allowed.
Returns
-------
@@ -121,15 +121,18 @@ def fit_transform(self, X, y=None):
----------
X : array-like, shape (n_samples, n_features)
New data, where n_samples is the number of samples
- and n_features is the number of features.
+ and n_features is the number of features. Sparse matrix allowed.
Returns
-------
doc_topic : array-like, shape (n_samples, n_topics)
Point estimate of the document-topic distributions
"""
- self._fit(np.atleast_2d(X))
+ if isinstance(X, np.ndarray):
+     # if user passes a 1-dim (1-feature) array
+     X = np.atleast_2d(X)
+ self._fit(X)
return self.doc_topic_

def transform(self, X, y=None):
@@ -156,10 +159,9 @@ def _fit(self, X):
----------
X: array-like, shape (n_samples, n_features)
Training vector, where n_samples is the number of samples and
- n_features is the number of features.
+ n_features is the number of features. Sparse matrix allowed.
"""
random_state = lda.utils.check_random_state(self.random_state)
- X = np.atleast_2d(X).astype(np.float64)
self._initialize(X, random_state)
for it in range(self.n_iter):
if it % self.refresh == 0:
@@ -185,7 +187,7 @@ def _print_status(self, iter):

def _initialize(self, X, random_state):
D, W = X.shape
- N = int(np.sum(X))
+ N = int(X.sum())
n_topics = self.n_topics
n_iter = self.n_iter
logger.info("n_documents: {}".format(D))
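With the fit_transform change above, a scipy.sparse document-term matrix is passed through to _fit without being wrapped by np.atleast_2d. A minimal usage sketch, assuming a toy random matrix in place of the reuters.ldac fixture exercised by the new test:

# Sketch only: fitting lda.LDA on a scipy.sparse document-term matrix,
# mirroring what test_lda_sparse.py does with the Reuters fixture.
# The toy matrix below is illustrative, not part of this commit.
import numpy as np
import scipy.sparse

import lda

rng = np.random.RandomState(1)
dense = rng.multinomial(100, np.ones(50) / 50, size=20)  # 20 docs, 50 terms
dtm = scipy.sparse.csr_matrix(dense)

model = lda.LDA(n_topics=5, n_iter=50, random_state=1)
doc_topic = model.fit_transform(dtm)  # sparse input now skips np.atleast_2d

print(doc_topic.shape)        # (20, 5)
print(doc_topic.sum(axis=1))  # each row sums to 1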
50 changes: 50 additions & 0 deletions lda/tests/test_lda_sparse.py
@@ -0,0 +1,50 @@
# coding=utf-8
from __future__ import absolute_import, unicode_literals # noqa
import os

import numpy as np
import scipy.sparse
import oslotest.base

import lda
import lda.utils


class TestLDASparse(oslotest.base.BaseTestCase):

@classmethod
def setUpClass(cls):
test_dir = os.path.dirname(__file__)
reuters_ldac_fn = os.path.join(test_dir, 'reuters.ldac')
cls.dtm = scipy.sparse.csr_matrix(lda.utils.ldac2dtm(open(reuters_ldac_fn), offset=0))
cls.n_iter = n_iter = 1
cls.n_topics = n_topics = 10
cls.random_seed = random_seed = 1
cls.model = lda.LDA(n_topics=n_topics, n_iter=n_iter, random_state=random_seed)

def test_lda_sparse(self):
dtm = self.dtm
model = self.model
doc_topic = model.fit_transform(dtm)
self.assertEqual(len(doc_topic), dtm.shape[0])
N = dtm.sum()
D, V = dtm.shape
_, K = doc_topic.shape
self.assertEqual(model.doc_topic_.shape, doc_topic.shape)
np.testing.assert_array_equal(model.doc_topic_, doc_topic)
self.assertEqual(model.doc_topic_.shape, (D, K))
self.assertEqual(model.ndz_.shape, (D, K))
self.assertEqual(model.topic_word_.shape, (K, V))
self.assertEqual(model.nzw_.shape, (K, V))

# check contents
self.assertAlmostEqual(model.nzw_.sum(), N)
self.assertAlmostEqual(model.ndz_.sum(), N)
self.assertAlmostEqual(model.nz_.sum(), N)
self.assertAlmostEqual(model.doc_topic_.sum(), D)
self.assertAlmostEqual(model.topic_word_.sum(), K)
np.testing.assert_array_equal(model.ndz_.sum(axis=0), model.nz_)

# check distributions sum to one
np.testing.assert_array_almost_equal(model.doc_topic_.sum(axis=1), np.ones(D))
np.testing.assert_array_almost_equal(model.topic_word_.sum(axis=1), np.ones(K))
21 changes: 21 additions & 0 deletions lda/tests/test_utils.py
@@ -4,6 +4,7 @@
import os

import numpy as np
import scipy.sparse
import oslotest.base

import lda.utils as utils
@@ -20,6 +21,7 @@ class TestUtils(oslotest.base.BaseTestCase):
dtm = np.zeros((D, W), dtype=int)
for d in range(D):
dtm[d] = np.random.multinomial(N_WORDS_PER_DOC, np.ones(W) / W)
dtm_sparse = scipy.sparse.csr_matrix(dtm)
N_BY_W = np.sum(dtm, axis=0)
N_BY_D = np.sum(dtm, axis=1)

@@ -67,3 +69,22 @@ def test_ldac_conversion(self):
f = io.StringIO('\n'.join(doclines))
dtm_new = utils.ldac2dtm(f)
self.assertTrue(np.all(dtm == dtm_new))

def test_lists_to_matrix_sparse(self):
dtm = self.dtm_sparse
WS, DS = utils.matrix_to_lists(dtm)
dtm_new = utils.lists_to_matrix(WS, DS)
self.assertTrue(np.all(dtm == dtm_new))

def test_ldac_conversion_sparse(self):
dtm = self.dtm
dtm_sparse = self.dtm_sparse
N, V = dtm.shape
doclines = list(utils.dtm2ldac(dtm_sparse))
nd_unique = np.sum(dtm > 0, axis=1)
for n, docline in zip(nd_unique, doclines):
self.assertEqual(n, int(docline.split(' ')[0]))
self.assertEqual(len(doclines), N)
f = io.StringIO('\n'.join(doclines))
dtm_new = utils.ldac2dtm(f)
self.assertTrue(np.all(dtm == dtm_new))
2 changes: 1 addition & 1 deletion lda/utils.py
@@ -47,7 +47,7 @@ def matrix_to_lists(doc_word):
logger.warning("all zero column in document-term matrix found")
try:
# if doc_word is a scipy sparse matrix
- doc_word = doc_word.copy().tocoo()
+ doc_word = doc_word.copy().tolil()
except AttributeError:
pass

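The utils.py change switches the sparse conversion in matrix_to_lists from tocoo() to tolil(), and the new test_lists_to_matrix_sparse test checks that the word/document index lists rebuild the original counts. A short sketch of that round trip, again using a toy matrix rather than the test fixture:

# Sketch of the round trip exercised by test_lists_to_matrix_sparse,
# using a small made-up matrix rather than the test fixture.
import numpy as np
import scipy.sparse

import lda.utils as utils

rng = np.random.RandomState(1)
dense = rng.multinomial(50, np.ones(30) / 30, size=10)  # 10 docs, 30 terms
dtm = scipy.sparse.csr_matrix(dense)

WS, DS = utils.matrix_to_lists(dtm)      # word and document index lists (one entry per token)
dtm_new = utils.lists_to_matrix(WS, DS)  # rebuild the document-term counts
assert np.all(dense == dtm_new)          # counts survive the round trip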
3 changes: 3 additions & 0 deletions test-requirements.txt
@@ -2,6 +2,9 @@
# of appearance. Changing the order has an impact on the overall integration
# process, which may cause wedges in the gate later.

# scipy needed for tests with sparse matrices
scipy>=0.10.1

hacking>=0.9.2,<0.10

coverage>=3.6
