BUG: Fix MANOVA when not using formulas

Fix mv_test so that it works when not using formulas. Improve the MANOVA docstring to help users. Closes statsmodels#4903; closes statsmodels#5578.
bashtage · May 7, 2019 · c3ab504 · c3ab504
1 parent 860b2bd
commit c3ab504
Show file tree

Hide file tree

Showing 2 changed files with 97 additions and 5 deletions.
diff --git a/statsmodels/multivariate/manova.py b/statsmodels/multivariate/manova.py
@@ -7,16 +7,19 @@
 from __future__ import division
 
 import numpy as np
+
 from statsmodels.base.model import Model
-from .multivariate_ols import _multivariate_ols_test, _hypotheses_doc
-from .multivariate_ols import _multivariate_ols_fit
 from .multivariate_ols import MultivariateTestResults
+from .multivariate_ols import _multivariate_ols_fit
+from .multivariate_ols import _multivariate_ols_test, _hypotheses_doc
+
 __docformat__ = 'restructuredtext en'
 
 
 class MANOVA(Model):
     """
     Multivariate analysis of variance
+
     The implementation of MANOVA is based on multivariate regression and does
     not assume that the explanatory variables are categorical. Any type of
     variables as in regression is allowed.
@@ -41,6 +44,14 @@ class MANOVA(Model):
     exog : array
         See Parameters.
 
+    Notes
+    -----
+    MANOVA is used through the `mv_test` function, and `fit` is not used.
+
+    The ``from_formula`` interface is the recommended method to specify
+    a model and simplifies testing without needing to manually configure
+    the contrast matrices.
+
     References
     ----------
     .. [*] ftp://public.dhe.ibm.com/software/analytics/spss/documentation/statistics/20.0/en/client/Manuals/IBM_SPSS_Statistics_Algorithms.pdf
@@ -53,6 +64,10 @@ def __init__(self, endog, exog, missing='none', hasconst=None, **kwargs):
                                      hasconst=hasconst, **kwargs)
         self._fittedmod = _multivariate_ols_fit(self.endog, self.exog)
 
+    def fit(self):
+        """
+        Not implemented; MANOVA has no separate fitting step.
+
+        The multivariate regression is estimated when the instance is
+        constructed, so tests are run by calling ``mv_test`` directly.
+
+        Raises
+        ------
+        NotImplementedError
+            Always.
+        """
+        # The trailing space after 'Call' keeps the implicitly concatenated
+        # literals from running together as 'Callmv_test'.
+        raise NotImplementedError('fit is not needed to use MANOVA. Call '
+                                  'mv_test directly on a MANOVA instance.')
+
     def mv_test(self, hypotheses=None):
         if hypotheses is None:
             if (hasattr(self, 'data') and self.data is not None and
@@ -67,7 +82,7 @@ def mv_test(self, hypotheses=None):
                 for i in range(self.exog.shape[1]):
                     name = 'x%d' % (i)
                     L = np.zeros([1, self.exog.shape[1]])
-                    L[i] = 1
+                    L[0, i] = 1
                     hypotheses.append([name, L, None])
 
         results = _multivariate_ols_test(hypotheses, self._fittedmod,
@@ -95,4 +110,10 @@ def mv_test(self, hypotheses=None):
 
 where `params` is the regression coefficient matrix for the
 linear model y = x * params
+
+If the model is not specified using the formula interface, then the default
+hypotheses test each included exogenous variable, one at a time. In most
+applications with categorical variables, the ``from_formula`` interface should
+be preferred when specifying a model since it provides knowledge about the
+model when specifying the hypotheses.
 """)
diff --git a/statsmodels/multivariate/tests/test_manova.py b/statsmodels/multivariate/tests/test_manova.py
@@ -2,9 +2,12 @@
 
 import numpy as np
 import pandas as pd
+import pytest
+from numpy.testing import assert_almost_equal, assert_raises
+
 from statsmodels.multivariate.manova import MANOVA
-from numpy.testing import assert_almost_equal
-from numpy.testing import assert_raises
+from statsmodels.multivariate.multivariate_ols import MultivariateTestResults
+from statsmodels.tools import add_constant
 
 # Example data
 # https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/
@@ -73,6 +76,74 @@ def test_manova_sas_example():
                         0.4109, decimal=4)
 
 
+def test_manova_no_formula():
+    # Array-interface counterpart of the preceding test: the design matrix
+    # and the contrast matrices are built by hand instead of via a formula.
+    design = add_constant(pd.get_dummies(X[['Loc']], drop_first=True))
+    responses = X[['Basal', 'Occ', 'Max']]
+    model = MANOVA(responses, design)
+
+    # Contrast selecting the first design column (labelled 'Intercept')
+    intercept = np.zeros((1, 3))
+    intercept[0, 0] = 1
+    # Contrast selecting the second and third columns jointly ('Loc')
+    loc = np.zeros((2, 3))
+    loc[0, 1] = 1
+    loc[1, 2] = 1
+
+    res = model.mv_test([('Intercept', intercept), ('Loc', loc)])
+    stat = res['Loc']['stat']
+
+    wilks = "Wilks' lambda"
+    pillai = "Pillai's trace"
+    hotelling = "Hotelling-Lawley trace"
+    roy = "Roy's greatest root"
+
+    # (statistic, column, expected value, decimal places)
+    expected = [
+        (wilks, 'Value', 0.60143661, 8),
+        (pillai, 'Value', 0.44702843, 8),
+        (hotelling, 'Value', 0.58210348, 8),
+        (roy, 'Value', 0.35530890, 8),
+        (wilks, 'F Value', 0.77, 2),
+        (pillai, 'F Value', 0.86, 2),
+        (hotelling, 'F Value', 0.75, 2),
+        (roy, 'F Value', 1.07, 2),
+        (wilks, 'Num DF', 6, 3),
+        (pillai, 'Num DF', 6, 3),
+        (hotelling, 'Num DF', 6, 3),
+        (roy, 'Num DF', 3, 3),
+        (wilks, 'Den DF', 16, 3),
+        (pillai, 'Den DF', 18, 3),
+        (hotelling, 'Den DF', 9.0909, 4),
+        (roy, 'Den DF', 9, 3),
+        (wilks, 'Pr > F', 0.6032, 4),
+        (pillai, 'Pr > F', 0.5397, 4),
+        (hotelling, 'Pr > F', 0.6272, 4),
+        (roy, 'Pr > F', 0.4109, 4),
+    ]
+    for row, column, value, places in expected:
+        assert_almost_equal(stat.loc[row, column], value, decimal=places)
+
+
+@pytest.mark.smoke
+def test_manova_no_formula_no_hypothesis():
+    # Smoke test: array inputs (no formula) and no explicit hypotheses, so
+    # mv_test has to build its own default hypotheses.
+    design = add_constant(pd.get_dummies(X[['Loc']], drop_first=True))
+    responses = X[['Basal', 'Occ', 'Max']]
+    result = MANOVA(responses, design).mv_test()
+    assert isinstance(result, MultivariateTestResults)
+
+
 def test_manova_test_input_validation():
     mod = MANOVA.from_formula('Basal + Occ + Max ~ Loc', data=X)
     hypothesis = [('test', np.array([[1, 1, 1]]), None)]