BUG/ENH: Improve RLM in the case of perfect fit

Ensure RLM doesn't crash if perfect fit Clean norms so that they don't crash either Avoid calculations the load to nan Improve testing of RLM Replace fabs with abs closes statsmodels#5585 closes statsmodels#1341 closes statsmodels#5878 closes statsmodels#5356 closes statsmodels#3319 xref statsmodels#55
bashtage · Jul 17, 2019 · 021e003 · 021e003
1 parent c367d78
commit 021e003
Show file tree

Hide file tree

Showing 3 changed files with 104 additions and 17 deletions.
diff --git a/statsmodels/robust/norms.py b/statsmodels/robust/norms.py
@@ -353,9 +353,12 @@ def psi_deriv(self, z):
         -----
         Used to estimate the robust covariance matrix.
         """
-
-        return np.exp(-self.a * np.fabs(z)) + z**2*\
-                np.exp(-self.a*np.fabs(z))*-self.a/np.fabs(z)
+        a = self.a
+        x = np.exp(-a * np.abs(z))
+        dx = -a * x * np.sign(z)
+        y = z
+        dy = 1
+        return x * dy + y * dx
 
 
 class AndrewWave(RobustNorm):
@@ -451,7 +454,16 @@ def weights(self, z):
         a = self.a
         z = np.asarray(z)
         test = self._subset(z)
-        return test * np.sin(z / a) / (z / a)
+        ratio = z / a
+        small = np.abs(ratio) < np.finfo(np.double).eps
+        if np.any(small):
+            weights = np.ones_like(ratio)
+            large = ~small
+            ratio = ratio[large]
+            weights[large] = test[large] * np.sin(ratio) / ratio
+        else:
+            weights = test * np.sin(ratio) / ratio
+        return weights
 
     def psi_deriv(self, z):
         """
@@ -699,15 +711,25 @@ def weights(self, z):
         b = self.b
         c = self.c
         t1, t2, t3 = self._subset(z)
-        v = (t1 +
-            t2 * a/np.fabs(z) +
-            t3 * a*(c-np.fabs(z))/(np.fabs(z)*(c-b)))
-        v[np.where(np.isnan(v))]=1. # for some reason 0 returns a nan?
+
+        v = np.zeros_like(z)
+        v[t1] = 1.0
+        abs_z = np.fabs(z)
+        v[t2] = a / abs_z[t2]
+        abs_zt3 = abs_z[t3]
+        v[t3] = a * (c - abs_zt3) / (abs_zt3 * (c - b))
+        v[np.where(np.isnan(v))] = 1.  # for some reason 0 returns a nan?
         return v
 
     def psi_deriv(self, z):
         t1, t2, t3 = self._subset(z)
-        return t1 + t3 * (self.a*np.sign(z)*z)/(np.fabs(z)*(self.c-self.b))
+        a, b, c = self.a, self.b, self.c
+        # default is t1
+        d = np.zeros_like(z)
+        d[t1] = 1.0
+        zt3 = z[t3]
+        d[t3] = (a * np.sign(zt3) * zt3) / (np.fabs(zt3) * (c - b))
+        return d
 
 
 class TukeyBiweight(RobustNorm):

diff --git a/statsmodels/robust/robust_linear_model.py b/statsmodels/robust/robust_linear_model.py
@@ -17,6 +17,7 @@
 import scipy.stats as stats
 
 from statsmodels.tools.decorators import cache_readonly
+from statsmodels.tools.sm_exceptions import ConvergenceWarning
 import statsmodels.regression.linear_model as lm
 import statsmodels.regression._tools as reg_tools
 import statsmodels.robust.norms as norms
@@ -140,18 +141,14 @@ def predict(self, params, exog=None):
 
         Parameters
         ----------
-        params : array_like, optional after fit has been called
+        params : array_like
             Parameters of a linear model
         exog : array_like, optional.
             Design / exogenous data. Model exog is used if None.
 
         Returns
         -------
         An array of fitted values
-
-        Notes
-        -----
-        If the model as not yet been fit, params is not optional.
         """
         # copied from linear_model
         if exog is None:
@@ -240,8 +237,8 @@ def fit(self, maxiter=50, tol=1e-8, scale_est='mad', init=None, cov='H1',
 
         Returns
         -------
-        results : object
-            statsmodels.rlm.RLMresults
+        results : statsmodels.rlm.RLMresults
+            Results instance
         """
         if cov.upper() not in ["H1", "H2", "H3"]:
             raise ValueError("Covariance matrix %s not understood" % cov)
@@ -286,6 +283,12 @@ def fit(self, maxiter=50, tol=1e-8, scale_est='mad', init=None, cov='H1',
         iteration = 1
         converged = 0
         while not converged:
+            if self.scale == 0.0:
+                import warnings
+                warnings.warn('Estimated scale is 0.0 indicating that the most'
+                              ' last iteration produced a perfect fit of the '
+                              'weighted data.', ConvergenceWarning)
+                break
             self.weights = self.M.weights(wls_results.resid / self.scale)
             wls_results = reg_tools._MinimalWLS(self.endog, self.exog,
                                                 weights=self.weights,
@@ -347,7 +350,7 @@ class RLMResults(base.LikelihoodModelResults):
         errors are taken from the robust covariance matrix specified in the
         argument to fit.
     chisq : array
-        An array of the chi-squared values of the paramter estimates.
+        An array of the chi-squared values of the parameter estimates.
     df_model
         See RLM.df_model
     df_resid
@@ -419,6 +422,10 @@ def resid(self):
 
     @cache_readonly
     def sresid(self):
+        if self.scale == 0.0:
+            sresid = self.resid.copy()
+            sresid[:] = 0.0
+            return sresid
         return self.resid / self.scale
 
     @cache_readonly

diff --git a/statsmodels/robust/tests/test_rlm.py b/statsmodels/robust/tests/test_rlm.py
@@ -107,6 +107,7 @@ def setup_class(cls):
         cls.decimal_scale = DECIMAL_3
 
         model = RLM(cls.data.endog, cls.data.exog, M=norms.HuberT())
+        cls.model = model
         results = model.fit()
         h2 = model.fit(cov="H2").bcov_scaled
         h3 = model.fit(cov="H3").bcov_scaled
@@ -122,6 +123,18 @@ def setup(self):
     def test_summary(self):
         self.res1.summary()
 
+    @pytest.mark.smoke
+    def test_summary2(self):
+        self.res1.summary2()
+
+    @pytest.mark.smoke
+    def test_chisq(self):
+        assert isinstance(self.res1.chisq, np.ndarray)
+
+    @pytest.mark.smoke
+    def test_predict(self):
+        assert isinstance(self.model.predict(self.res1.params), np.ndarray)
+
 
 class TestHampel(TestRlm):
     @classmethod
@@ -314,3 +327,48 @@ def test_rlm_start_values_errors():
     start_params = np.array([start_params, start_params]).T
     with pytest.raises(ValueError):
         model.fit(start_params=start_params)
+
+
+@pytest.fixture(scope='module',
+                params=[norms.AndrewWave, norms.LeastSquares, norms.HuberT,
+                        norms.TrimmedMean, norms.TukeyBiweight, norms.Hampel,
+                        norms.RamsayE])
+def norm(request):
+    return request.param()
+
+
+@pytest.fixture(scope='module')
+def perfect_fit_data(request):
+    from statsmodels.tools.tools import Bunch
+    rs = np.random.RandomState(1249328932)
+    exog = rs.standard_normal((1000, 1))
+    endog = exog + exog ** 2
+    exog = sm.add_constant(np.c_[exog, exog ** 2])
+    return Bunch(endog=endog, exog=exog, const=(3.2 * np.ones_like(endog)))
+
+
+def test_perfect_fit(perfect_fit_data, norm):
+    res = RLM(perfect_fit_data.endog, perfect_fit_data.exog, M=norm).fit()
+    assert_allclose(res.params, np.array([0, 1, 1]), atol=1e-8)
+
+
+def test_perfect_const(perfect_fit_data, norm):
+    res = RLM(perfect_fit_data.const, perfect_fit_data.exog, M=norm).fit()
+    assert_allclose(res.params, np.array([3.2, 0, 0]), atol=1e-8)
+
+
+@pytest.mark.parametrize('conv', ('weights', 'coefs', 'sresid'))
+def test_alt_criterion(conv):
+    data = sm.datasets.stackloss.load(as_pandas=True)
+    data.exog = sm.add_constant(data.exog, prepend=False)
+    base = RLM(data.endog, data.exog, M=norms.HuberT()).fit()
+    alt = RLM(data.endog, data.exog, M=norms.HuberT()).fit(conv=conv)
+    assert_allclose(base.params, alt.params)
+
+
+def test_bad_criterion():
+    data = sm.datasets.stackloss.load(as_pandas=True)
+    data.exog = sm.add_constant(data.exog, prepend=False)
+    mod = RLM(data.endog, data.exog, M=norms.HuberT())
+    with pytest.raises(ValueError, match='Convergence argument unknown'):
+        mod.fit(conv='unknown')