Skip to content

Commit

Permalink
BUG/ENH: Improve RLM in the case of perfect fit
Browse files Browse the repository at this point in the history
Ensure RLM doesn't crash if perfect fit
Clean norms so that they don't crash either
Avoid calculations the load to nan
Improve testing of RLM
Replace fabs with abs

closes statsmodels#5585
closes statsmodels#1341
closes statsmodels#5878
closes statsmodels#5356
closes statsmodels#3319
xref statsmodels#55
  • Loading branch information
bashtage committed Jul 17, 2019
1 parent c367d78 commit 021e003
Show file tree
Hide file tree
Showing 3 changed files with 104 additions and 17 deletions.
40 changes: 31 additions & 9 deletions statsmodels/robust/norms.py
Expand Up @@ -353,9 +353,12 @@ def psi_deriv(self, z):
-----
Used to estimate the robust covariance matrix.
"""

return np.exp(-self.a * np.fabs(z)) + z**2*\
np.exp(-self.a*np.fabs(z))*-self.a/np.fabs(z)
a = self.a
x = np.exp(-a * np.abs(z))
dx = -a * x * np.sign(z)
y = z
dy = 1
return x * dy + y * dx


class AndrewWave(RobustNorm):
Expand Down Expand Up @@ -451,7 +454,16 @@ def weights(self, z):
a = self.a
z = np.asarray(z)
test = self._subset(z)
return test * np.sin(z / a) / (z / a)
ratio = z / a
small = np.abs(ratio) < np.finfo(np.double).eps
if np.any(small):
weights = np.ones_like(ratio)
large = ~small
ratio = ratio[large]
weights[large] = test[large] * np.sin(ratio) / ratio
else:
weights = test * np.sin(ratio) / ratio
return weights

def psi_deriv(self, z):
"""
Expand Down Expand Up @@ -699,15 +711,25 @@ def weights(self, z):
b = self.b
c = self.c
t1, t2, t3 = self._subset(z)
v = (t1 +
t2 * a/np.fabs(z) +
t3 * a*(c-np.fabs(z))/(np.fabs(z)*(c-b)))
v[np.where(np.isnan(v))]=1. # for some reason 0 returns a nan?

v = np.zeros_like(z)
v[t1] = 1.0
abs_z = np.fabs(z)
v[t2] = a / abs_z[t2]
abs_zt3 = abs_z[t3]
v[t3] = a * (c - abs_zt3) / (abs_zt3 * (c - b))
v[np.where(np.isnan(v))] = 1. # for some reason 0 returns a nan?
return v

def psi_deriv(self, z):
t1, t2, t3 = self._subset(z)
return t1 + t3 * (self.a*np.sign(z)*z)/(np.fabs(z)*(self.c-self.b))
a, b, c = self.a, self.b, self.c
# default is t1
d = np.zeros_like(z)
d[t1] = 1.0
zt3 = z[t3]
d[t3] = (a * np.sign(zt3) * zt3) / (np.fabs(zt3) * (c - b))
return d


class TukeyBiweight(RobustNorm):
Expand Down
23 changes: 15 additions & 8 deletions statsmodels/robust/robust_linear_model.py
Expand Up @@ -17,6 +17,7 @@
import scipy.stats as stats

from statsmodels.tools.decorators import cache_readonly
from statsmodels.tools.sm_exceptions import ConvergenceWarning
import statsmodels.regression.linear_model as lm
import statsmodels.regression._tools as reg_tools
import statsmodels.robust.norms as norms
Expand Down Expand Up @@ -140,18 +141,14 @@ def predict(self, params, exog=None):
Parameters
----------
params : array_like, optional after fit has been called
params : array_like
Parameters of a linear model
exog : array_like, optional.
Design / exogenous data. Model exog is used if None.
Returns
-------
An array of fitted values
Notes
-----
If the model as not yet been fit, params is not optional.
"""
# copied from linear_model
if exog is None:
Expand Down Expand Up @@ -240,8 +237,8 @@ def fit(self, maxiter=50, tol=1e-8, scale_est='mad', init=None, cov='H1',
Returns
-------
results : object
statsmodels.rlm.RLMresults
results : statsmodels.rlm.RLMresults
Results instance
"""
if cov.upper() not in ["H1", "H2", "H3"]:
raise ValueError("Covariance matrix %s not understood" % cov)
Expand Down Expand Up @@ -286,6 +283,12 @@ def fit(self, maxiter=50, tol=1e-8, scale_est='mad', init=None, cov='H1',
iteration = 1
converged = 0
while not converged:
if self.scale == 0.0:
import warnings
warnings.warn('Estimated scale is 0.0 indicating that the most'
' last iteration produced a perfect fit of the '
'weighted data.', ConvergenceWarning)
break
self.weights = self.M.weights(wls_results.resid / self.scale)
wls_results = reg_tools._MinimalWLS(self.endog, self.exog,
weights=self.weights,
Expand Down Expand Up @@ -347,7 +350,7 @@ class RLMResults(base.LikelihoodModelResults):
errors are taken from the robust covariance matrix specified in the
argument to fit.
chisq : array
An array of the chi-squared values of the paramter estimates.
An array of the chi-squared values of the parameter estimates.
df_model
See RLM.df_model
df_resid
Expand Down Expand Up @@ -419,6 +422,10 @@ def resid(self):

@cache_readonly
def sresid(self):
if self.scale == 0.0:
sresid = self.resid.copy()
sresid[:] = 0.0
return sresid
return self.resid / self.scale

@cache_readonly
Expand Down
58 changes: 58 additions & 0 deletions statsmodels/robust/tests/test_rlm.py
Expand Up @@ -107,6 +107,7 @@ def setup_class(cls):
cls.decimal_scale = DECIMAL_3

model = RLM(cls.data.endog, cls.data.exog, M=norms.HuberT())
cls.model = model
results = model.fit()
h2 = model.fit(cov="H2").bcov_scaled
h3 = model.fit(cov="H3").bcov_scaled
Expand All @@ -122,6 +123,18 @@ def setup(self):
def test_summary(self):
self.res1.summary()

@pytest.mark.smoke
def test_summary2(self):
self.res1.summary2()

@pytest.mark.smoke
def test_chisq(self):
assert isinstance(self.res1.chisq, np.ndarray)

@pytest.mark.smoke
def test_predict(self):
assert isinstance(self.model.predict(self.res1.params), np.ndarray)


class TestHampel(TestRlm):
@classmethod
Expand Down Expand Up @@ -314,3 +327,48 @@ def test_rlm_start_values_errors():
start_params = np.array([start_params, start_params]).T
with pytest.raises(ValueError):
model.fit(start_params=start_params)


@pytest.fixture(scope='module',
params=[norms.AndrewWave, norms.LeastSquares, norms.HuberT,
norms.TrimmedMean, norms.TukeyBiweight, norms.Hampel,
norms.RamsayE])
def norm(request):
return request.param()


@pytest.fixture(scope='module')
def perfect_fit_data(request):
from statsmodels.tools.tools import Bunch
rs = np.random.RandomState(1249328932)
exog = rs.standard_normal((1000, 1))
endog = exog + exog ** 2
exog = sm.add_constant(np.c_[exog, exog ** 2])
return Bunch(endog=endog, exog=exog, const=(3.2 * np.ones_like(endog)))


def test_perfect_fit(perfect_fit_data, norm):
res = RLM(perfect_fit_data.endog, perfect_fit_data.exog, M=norm).fit()
assert_allclose(res.params, np.array([0, 1, 1]), atol=1e-8)


def test_perfect_const(perfect_fit_data, norm):
res = RLM(perfect_fit_data.const, perfect_fit_data.exog, M=norm).fit()
assert_allclose(res.params, np.array([3.2, 0, 0]), atol=1e-8)


@pytest.mark.parametrize('conv', ('weights', 'coefs', 'sresid'))
def test_alt_criterion(conv):
data = sm.datasets.stackloss.load(as_pandas=True)
data.exog = sm.add_constant(data.exog, prepend=False)
base = RLM(data.endog, data.exog, M=norms.HuberT()).fit()
alt = RLM(data.endog, data.exog, M=norms.HuberT()).fit(conv=conv)
assert_allclose(base.params, alt.params)


def test_bad_criterion():
data = sm.datasets.stackloss.load(as_pandas=True)
data.exog = sm.add_constant(data.exog, prepend=False)
mod = RLM(data.endog, data.exog, M=norms.HuberT())
with pytest.raises(ValueError, match='Convergence argument unknown'):
mod.fit(conv='unknown')

0 comments on commit 021e003

Please sign in to comment.