Skip to content

Commit

Permalink
REF: Change of BIC formula
Browse files Browse the repository at this point in the history
  • Loading branch information
JoonsukPark authored and bashtage committed Aug 3, 2020
1 parent f534a39 commit dfeda8b
Show file tree
Hide file tree
Showing 9 changed files with 167 additions and 34 deletions.
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ filterwarnings =
error:unbiased is deprecated in factor of adjusted:FutureWarning
error:categorical is deprecated:FutureWarning
error:After 0.13 initialization:FutureWarning
error:The bic value:FutureWarning
markers =
example: mark a test that runs example code
matplotlib: mark a test that requires matplotlib
Expand Down
7 changes: 6 additions & 1 deletion statsmodels/base/tests/test_generic_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,12 @@ def test_ttest_tvalues(self):
assert_array_equal(summf.columns.values, cols)

def test_ftest_pvalues(self):
smt.check_ftest_pvalues(self.results)
import warnings

with warnings.catch_warnings():
warnings.simplefilter("ignore", FutureWarning)
# FutureWarning for BIC changes
smt.check_ftest_pvalues(self.results)

def test_fitted(self):
smt.check_fitted(self.results)
Expand Down
5 changes: 4 additions & 1 deletion statsmodels/base/tests/test_penalized.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,10 @@ def test_summary(self):

@pytest.mark.smoke
def test_summary2(self):
summ = self.res1.summary2()
with warnings.catch_warnings():
warnings.simplefilter("ignore", FutureWarning)
# FutureWarning for BIC changes
summ = self.res1.summary2()
assert isinstance(summ.as_latex(), str)
assert isinstance(summ.as_html(), str)
assert isinstance(summ.as_text(), str)
Expand Down
17 changes: 14 additions & 3 deletions statsmodels/base/tests/test_shrink_pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,13 @@ def test_remove_data_pickle(self):
pred1 = results.predict(xf, **pred_kwds)
# create some cached attributes
results.summary()
res = results.summary2() # SMOKE test also summary2

import warnings

with warnings.catch_warnings():
warnings.simplefilter("ignore", FutureWarning)
# FutureWarning for BIC changes
results.summary2() # SMOKE test also summary2

# uncomment the following to check whether tests run (7 failures now)
# np.testing.assert_equal(res, 1)
Expand Down Expand Up @@ -237,8 +243,13 @@ def test_cached_data_removed(self):
for name in names:
assert name in res._cache
assert res._cache[name] is not None
import warnings

with warnings.catch_warnings():
warnings.simplefilter("ignore", FutureWarning)
# FutureWarning for BIC changes
res.remove_data()

res.remove_data()
for name in names:
assert res._cache[name] is None

Expand All @@ -249,7 +260,7 @@ def test_cached_values_evaluated(self):
assert res._cache == {}
with pytest.warns(FutureWarning, match="Anscombe residuals"):
res.remove_data()
assert 'bic' in res._cache
assert 'aic' in res._cache


class TestRemoveDataPickleGLMConstrained(RemoveDataPickle):
Expand Down
15 changes: 13 additions & 2 deletions statsmodels/discrete/tests/test_constrained.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,12 @@ def test_glm(self):
# see issue GH#1733
assert_allclose(res1.aic, res2.infocrit[4], rtol=1e-10)

assert_allclose(res1.bic, res2.bic, rtol=1e-10)
import warnings

with warnings.catch_warnings():
warnings.simplefilter("ignore", FutureWarning)
# FutureWarning for BIC changes
assert_allclose(res1.bic, res2.bic, rtol=1e-10)
# bic is deviance based
# FIXME: dont leave commented-out
# assert_allclose(res1.bic, res2.infocrit[5], rtol=1e-10)
Expand Down Expand Up @@ -480,7 +485,13 @@ def test_summary(self):
@pytest.mark.smoke
def test_summary2(self):
# trailing text in summary, assumes it's the first extra string
summ = self.res1m.summary2()
import warnings

with warnings.catch_warnings():
warnings.simplefilter("ignore", FutureWarning)
# FutureWarning for BIC changes
summ = self.res1m.summary2()

assert_('linear equality constraints' in summ.extra_txt[0])

def test_fit_constrained_wrap(self):
Expand Down
7 changes: 6 additions & 1 deletion statsmodels/discrete/tests/test_sandwich_cov.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,12 @@ def test_ttest(self):


def test_waldtest(self):
smt.check_ftest_pvalues(self.res1)
import warnings

with warnings.catch_warnings():
warnings.simplefilter("ignore", FutureWarning)
# FutureWarning for BIC changes
smt.check_ftest_pvalues(self.res1)


class TestPoissonClu(CheckCountRobustMixin):
Expand Down
75 changes: 71 additions & 4 deletions statsmodels/genmod/generalized_linear_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,24 @@ def _check_convergence(criterion, iteration, atol, rtol):
atol=atol, rtol=rtol)


# Remove after 0.13 when bic changes to bic llf
class _ModuleVariable:
_value = None

@property
def use_bic_llf(self):
return self._value

def set_use_bic_llf(self, val):
if val not in (True, False, None):
raise ValueError("Must be True, False or None")
self._value = bool(val) if val is not None else val


_use_bic_helper = _ModuleVariable()
SET_USE_BIC_LLF = _use_bic_helper.set_use_bic_llf


class GLM(base.LikelihoodModel):
__doc__ = """
Generalized Linear Models
Expand Down Expand Up @@ -1678,16 +1696,65 @@ def llf(self):
def aic(self):
"""
Akaike Information Criterion
-2 * `llf` + 2*(`df_model` + 1)
-2 * `llf` + 2 * (`df_model` + 1)
"""
return -2 * self.llf + 2 * (self.df_model + 1)

@cached_value
@property
def bic(self):
"""
Bayes Information Criterion
Old: `deviance` - `df_resid` * log(`nobs`)
New: -2 * `llf` + (df_model+1)*log*(n)
`deviance` - `df_resid` * log(`nobs`)
.. warning::
The current definition is base don the deviance rather than the
log-likelihood. This is not consistent with the AIC definition,
and after 0.13 both will make use of the log-likelihood definition.
Notes
-----
The log-likelihood version is defined
-2 * `llf` + (`df_model` + 1)*log(n)
"""
if _use_bic_helper.use_bic_llf not in (True, False):
warn(
"The bic value is computed using the deviance formula. After "
"0.13 this will change to the log-likelihood based formula. "
"This change has no impact on the relative rank of models "
"compared using BIC. You can directly access the "
"log-likelihood version using the `bic_llf` attribute. You "
"can suppress this message by calling "
"statsmodels.genmod.generalized_linear_model.SET_USE_BIC_LLF "
"with True to get the LLF-based version now or False to retain"
"the deviance version.",
FutureWarning
)
if bool(_use_bic_helper.use_bic_llf):
return self.bic_llf

return self.bic_deviance

@cached_value
def bic_deviance(self):
"""
Bayes Information Criterion
Based on the deviance,
`deviance` - `df_resid` * log(`nobs`)
"""
return (self.deviance -
(self.model.wnobs - self.df_model - 1) *
np.log(self.model.wnobs))

@cached_value
def bic_llf(self):
"""
Bayes Information Criterion
Based on the log-likelihood,
-2 * `llf` + log(n) * (`df_model` + 1)
"""
return -2*self.llf + (self.df_model+1)*np.log(
self.df_model+self.df_resid+1
Expand Down
15 changes: 10 additions & 5 deletions statsmodels/genmod/tests/test_constrained.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,10 +213,13 @@ def test_resid(self):
assert_allclose(res1.resid_response, res2.resid_response, rtol=1e-8)

def test_glm_attr(self):
for attr in ['llf', 'null_deviance', 'aic', 'bic', 'df_resid',
'df_model', 'pearson_chi2', 'scale']:
assert_allclose(getattr(self.res1, attr),
getattr(self.res2, attr), rtol=1e-10)
with warnings.catch_warnings():
warnings.simplefilter("ignore", FutureWarning)
# FutureWarning to silence BIC warning
for attr in ['llf', 'null_deviance', 'aic', 'bic', 'df_resid',
'df_model', 'pearson_chi2', 'scale']:
assert_allclose(getattr(self.res1, attr),
getattr(self.res2, attr), rtol=1e-10)

def test_wald(self):
res1 = self.res1
Expand Down Expand Up @@ -246,8 +249,10 @@ def test_wald(self):

# smoke
with warnings.catch_warnings():
warnings.simplefilter('ignore', ValueWarning)
# RuntimeWarnings because of truedivide and scipy distributions
# Future to silence BIC warning
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter('ignore', ValueWarning)
warnings.simplefilter('ignore', RuntimeWarning)
self.res1.summary()
self.res1.summary2()
Expand Down
59 changes: 42 additions & 17 deletions statsmodels/genmod/tests/test_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from scipy import stats

import statsmodels.api as sm
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.generalized_linear_model import GLM, SET_USE_BIC_LLF
from statsmodels.tools.tools import add_constant
from statsmodels.tools.sm_exceptions import PerfectSeparationError
from statsmodels.discrete import discrete_model as discrete
Expand Down Expand Up @@ -46,6 +46,13 @@ def teardown_module():
pdf.close()


@pytest.fixture(scope="module")
def iris():
cur_dir = os.path.dirname(os.path.abspath(__file__))
return np.genfromtxt(os.path.join(cur_dir, 'results', 'iris.csv'),
delimiter=",", skip_header=1)


class CheckModelResultsMixin(object):
'''
res2 should be either the results from RModelWrap
Expand Down Expand Up @@ -150,8 +157,11 @@ def test_null_deviance(self):

decimal_bic = DECIMAL_4
def test_bic(self):
assert_almost_equal(self.res1.bic, self.res2.bic_Stata,
self.decimal_bic)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
assert_almost_equal(self.res1.bic,
self.res2.bic_Stata,
self.decimal_bic)

def test_degrees(self):
assert_equal(self.res1.model.df_resid,self.res2.df_resid)
Expand Down Expand Up @@ -183,11 +193,15 @@ def test_pearson_chi2(self):

@pytest.mark.smoke
def test_summary(self):
self.res1.summary()
with warnings.catch_warnings():
warnings.simplefilter("ignore")
self.res1.summary()

@pytest.mark.smoke
def test_summary2(self):
self.res1.summary2()
with warnings.catch_warnings():
warnings.simplefilter("ignore")
self.res1.summary2()


class CheckComparisonMixin(object):
Expand Down Expand Up @@ -845,10 +859,7 @@ def test_predict(self):
assert_almost_equal(pred2, pred1+offset)


def test_perfect_pred():
cur_dir = os.path.dirname(os.path.abspath(__file__))
iris = np.genfromtxt(os.path.join(cur_dir, 'results', 'iris.csv'),
delimiter=",", skip_header=1)
def test_perfect_pred(iris):
y = iris[:, -1]
X = iris[:, :-1]
X = X[y != 2]
Expand Down Expand Up @@ -1403,7 +1414,9 @@ def test_null_deviance(self):
decimal_bic = DECIMAL_4

def test_bic(self):
assert_allclose(self.res1.bic, self.res2.bic, atol=1e-6, rtol=1e-6)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
assert_allclose(self.res1.bic, self.res2.bic, atol=1e-6, rtol=1e-6)

decimal_fittedvalues = DECIMAL_4

Expand Down Expand Up @@ -1791,8 +1804,10 @@ def test_fittedvalues(self):
atol=1e-4, rtol=1e-4)

def test_summary(self):
self.res1.summary()
self.res1.summary2()
with warnings.catch_warnings():
warnings.simplefilter("ignore")
self.res1.summary()
self.res1.summary2()


class TestTweediePower15(CheckTweedie):
Expand Down Expand Up @@ -2320,7 +2335,6 @@ def test_non_invertible_hessian_fails_summary():
res.summary()



def test_int_scale():
# GH-6627, make sure it works with int scale
data = longley.load(as_pandas=True)
Expand All @@ -2342,13 +2356,24 @@ def test_int_exog(dtype):
assert isinstance(res.params, np.ndarray)


def test_glm_bic():
cur_dir = os.path.dirname(os.path.abspath(__file__))
iris = np.genfromtxt(os.path.join(cur_dir, 'results', 'iris.csv'),
delimiter=",", skip_header=1)
def test_glm_bic(iris):
X = np.c_[np.ones(100), iris[50:, :4]]
y = np.array(iris)[50:, 4].astype(np.int32)
y -= 1
SET_USE_BIC_LLF(True)
model = GLM(y, X, family=sm.families.Binomial()).fit()
# 34.9244 is what glm() of R yields
assert_almost_equal(model.bic, 34.9244, decimal=3)
assert_almost_equal(model.bic_llf, 34.9244, decimal=3)
SET_USE_BIC_LLF(False)
assert_almost_equal(model.bic, model.bic_deviance, decimal=3)
SET_USE_BIC_LLF(None)


def test_glm_bic_warning(iris):
X = np.c_[np.ones(100), iris[50:, :4]]
y = np.array(iris)[50:, 4].astype(np.int32)
y -= 1
model = GLM(y, X, family=sm.families.Binomial()).fit()
with pytest.warns(FutureWarning, match="The bic"):
assert isinstance(model.bic, float)

0 comments on commit dfeda8b

Please sign in to comment.