Skip to content

Commit

Permalink
REF: Change of BIC formula
Browse files Browse the repository at this point in the history
  • Loading branch information
JoonsukPark authored and bashtage committed Aug 3, 2020
1 parent 48d77d6 commit 0a5b1fb
Show file tree
Hide file tree
Showing 6 changed files with 145 additions and 35 deletions.
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ filterwarnings =
error:unbiased is deprecated in factor of adjusted:FutureWarning
error:categorical is deprecated:FutureWarning
error:After 0.13 initialization:FutureWarning
error:The bic value:FutureWarning
markers =
example: mark a test that runs example code
matplotlib: mark a test that requires matplotlib
Expand Down
11 changes: 8 additions & 3 deletions statsmodels/base/tests/test_shrink_pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def test_remove_data_pickle(self):
pred1 = results.predict(xf, **pred_kwds)
# create some cached attributes
results.summary()
res = results.summary2() # SMOKE test also summary2
results.summary2() # SMOKE test also summary2

# uncomment the following to check whether tests run (7 failures now)
# np.testing.assert_equal(res, 1)
Expand Down Expand Up @@ -237,8 +237,13 @@ def test_cached_data_removed(self):
for name in names:
assert name in res._cache
assert res._cache[name] is not None
import warnings

with warnings.catch_warnings():
warnings.simplefilter("ignore", FutureWarning)
# FutureWarning for BIC changes
res.remove_data()

res.remove_data()
for name in names:
assert res._cache[name] is None

Expand All @@ -249,7 +254,7 @@ def test_cached_values_evaluated(self):
assert res._cache == {}
with pytest.warns(FutureWarning, match="Anscombe residuals"):
res.remove_data()
assert 'bic' in res._cache
assert 'aic' in res._cache


class TestRemoveDataPickleGLMConstrained(RemoveDataPickle):
Expand Down
15 changes: 13 additions & 2 deletions statsmodels/discrete/tests/test_constrained.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,12 @@ def test_glm(self):
# see issue GH#1733
assert_allclose(res1.aic, res2.infocrit[4], rtol=1e-10)

assert_allclose(res1.bic, res2.bic, rtol=1e-10)
import warnings

with warnings.catch_warnings():
warnings.simplefilter("ignore", FutureWarning)
# FutureWarning for BIC changes
assert_allclose(res1.bic, res2.bic, rtol=1e-10)
# bic is deviance based
# FIXME: dont leave commented-out
# assert_allclose(res1.bic, res2.infocrit[5], rtol=1e-10)
Expand Down Expand Up @@ -480,7 +485,13 @@ def test_summary(self):
@pytest.mark.smoke
def test_summary2(self):
# trailing text in summary, assumes it's the first extra string
summ = self.res1m.summary2()
import warnings

with warnings.catch_warnings():
warnings.simplefilter("ignore", FutureWarning)
# FutureWarning for BIC changes
summ = self.res1m.summary2()

assert_('linear equality constraints' in summ.extra_txt[0])

def test_fit_constrained_wrap(self):
Expand Down
98 changes: 83 additions & 15 deletions statsmodels/genmod/generalized_linear_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
import statsmodels.regression.linear_model as lm
import statsmodels.base.wrapper as wrap
import statsmodels.regression._tools as reg_tools
from warnings import warn
import warnings

from statsmodels.graphics._regressionplots_doc import (
_plot_added_variable_doc,
Expand All @@ -55,6 +55,24 @@ def _check_convergence(criterion, iteration, atol, rtol):
atol=atol, rtol=rtol)


# Remove after 0.13 when bic changes to bic llf
class _ModuleVariable:
_value = None

@property
def use_bic_llf(self):
return self._value

def set_use_bic_llf(self, val):
if val not in (True, False, None):
raise ValueError("Must be True, False or None")
self._value = bool(val) if val is not None else val


_use_bic_helper = _ModuleVariable()
SET_USE_BIC_LLF = _use_bic_helper.set_use_bic_llf


class GLM(base.LikelihoodModel):
__doc__ = """
Generalized Linear Models
Expand Down Expand Up @@ -272,10 +290,10 @@ def __init__(self, endog, exog, family=None, offset=None,
if (family is not None) and not isinstance(family.link,
tuple(family.safe_links)):

warn(("The %s link function does not respect the domain "
"of the %s family.") %
(family.link.__class__.__name__,
family.__class__.__name__), DomainWarning)
warnings.warn((f"The {type(family.link).__name__} link function "
"does not respect the domain of the "
f"{type(family).__name__} family."),
DomainWarning)

if exposure is not None:
exposure = np.log(exposure)
Expand Down Expand Up @@ -1108,8 +1126,8 @@ def _fit_gradient(self, start_params=None, method="newton",
try:
cov_p = np.linalg.inv(-self.hessian(rslt.params, observed=oim)) / scale
except LinAlgError:
warn('Inverting hessian failed, no bse or cov_params '
'available', HessianInversionWarning)
warnings.warn('Inverting hessian failed, no bse or cov_params '
'available', HessianInversionWarning)
cov_p = None

results_class = getattr(self, '_results_class', GLMResults)
Expand Down Expand Up @@ -1302,8 +1320,7 @@ def fit_regularized(self, method="elastic_net", alpha=0.,
self.scale = self.estimate_scale(self.mu)

if not result.converged:
msg = "Elastic net fitting did not converge"
warn(msg)
warnings.warn("Elastic net fitting did not converge")

return result

Expand Down Expand Up @@ -1678,16 +1695,65 @@ def llf(self):
def aic(self):
"""
Akaike Information Criterion
-2 * `llf` + 2*(`df_model` + 1)
-2 * `llf` + 2 * (`df_model` + 1)
"""
return -2 * self.llf + 2 * (self.df_model + 1)

@cached_value
@property
def bic(self):
"""
Bayes Information Criterion
Old: `deviance` - `df_resid` * log(`nobs`)
New: -2 * `llf` + (df_model+1)*log*(n)
`deviance` - `df_resid` * log(`nobs`)
.. warning::
The current definition is base don the deviance rather than the
log-likelihood. This is not consistent with the AIC definition,
and after 0.13 both will make use of the log-likelihood definition.
Notes
-----
The log-likelihood version is defined
-2 * `llf` + (`df_model` + 1)*log(n)
"""
if _use_bic_helper.use_bic_llf not in (True, False):
warnings.warn(
"The bic value is computed using the deviance formula. After "
"0.13 this will change to the log-likelihood based formula. "
"This change has no impact on the relative rank of models "
"compared using BIC. You can directly access the "
"log-likelihood version using the `bic_llf` attribute. You "
"can suppress this message by calling "
"statsmodels.genmod.generalized_linear_model.SET_USE_BIC_LLF "
"with True to get the LLF-based version now or False to retain"
"the deviance version.",
FutureWarning
)
if bool(_use_bic_helper.use_bic_llf):
return self.bic_llf

return self.bic_deviance

@cached_value
def bic_deviance(self):
"""
Bayes Information Criterion
Based on the deviance,
`deviance` - `df_resid` * log(`nobs`)
"""
return (self.deviance -
(self.model.wnobs - self.df_model - 1) *
np.log(self.model.wnobs))

@cached_value
def bic_llf(self):
"""
Bayes Information Criterion
Based on the log-likelihood,
-2 * `llf` + log(n) * (`df_model` + 1)
"""
return -2*self.llf + (self.df_model+1)*np.log(
self.df_model+self.df_resid+1
Expand Down Expand Up @@ -1920,8 +1986,10 @@ def summary2(self, yname=None, xname=None, title=None, alpha=.05,
self.method = 'IRLS'
from statsmodels.iolib import summary2
smry = summary2.Summary()
smry.add_base(results=self, alpha=alpha, float_format=float_format,
xname=xname, yname=yname, title=title)
with warnings.catch_warnings():
warnings.simplefilter("ignore", FutureWarning)
smry.add_base(results=self, alpha=alpha, float_format=float_format,
xname=xname, yname=yname, title=title)
if hasattr(self, 'constraints'):
smry.add_text('Model has been estimated subject to linear '
'equality constraints.')
Expand Down
10 changes: 8 additions & 2 deletions statsmodels/genmod/tests/test_constrained.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,10 +213,14 @@ def test_resid(self):
assert_allclose(res1.resid_response, res2.resid_response, rtol=1e-8)

def test_glm_attr(self):
for attr in ['llf', 'null_deviance', 'aic', 'bic', 'df_resid',
for attr in ['llf', 'null_deviance', 'aic', 'df_resid',
'df_model', 'pearson_chi2', 'scale']:
assert_allclose(getattr(self.res1, attr),
getattr(self.res2, attr), rtol=1e-10)
with warnings.catch_warnings():
warnings.simplefilter("ignore", FutureWarning)
# FutureWarning to silence BIC warning
assert_allclose(self.res1.bic, self.res2.bic, rtol=1e-10)

def test_wald(self):
res1 = self.res1
Expand Down Expand Up @@ -246,8 +250,10 @@ def test_wald(self):

# smoke
with warnings.catch_warnings():
warnings.simplefilter('ignore', ValueWarning)
# RuntimeWarnings because of truedivide and scipy distributions
# Future to silence BIC warning
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter('ignore', ValueWarning)
warnings.simplefilter('ignore', RuntimeWarning)
self.res1.summary()
self.res1.summary2()
Expand Down
45 changes: 32 additions & 13 deletions statsmodels/genmod/tests/test_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from scipy import stats

import statsmodels.api as sm
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.generalized_linear_model import GLM, SET_USE_BIC_LLF
from statsmodels.tools.tools import add_constant
from statsmodels.tools.sm_exceptions import PerfectSeparationError
from statsmodels.discrete import discrete_model as discrete
Expand Down Expand Up @@ -46,6 +46,13 @@ def teardown_module():
pdf.close()


@pytest.fixture(scope="module")
def iris():
cur_dir = os.path.dirname(os.path.abspath(__file__))
return np.genfromtxt(os.path.join(cur_dir, 'results', 'iris.csv'),
delimiter=",", skip_header=1)


class CheckModelResultsMixin(object):
'''
res2 should be either the results from RModelWrap
Expand Down Expand Up @@ -150,8 +157,11 @@ def test_null_deviance(self):

decimal_bic = DECIMAL_4
def test_bic(self):
assert_almost_equal(self.res1.bic, self.res2.bic_Stata,
self.decimal_bic)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
assert_almost_equal(self.res1.bic,
self.res2.bic_Stata,
self.decimal_bic)

def test_degrees(self):
assert_equal(self.res1.model.df_resid,self.res2.df_resid)
Expand Down Expand Up @@ -845,10 +855,7 @@ def test_predict(self):
assert_almost_equal(pred2, pred1+offset)


def test_perfect_pred():
cur_dir = os.path.dirname(os.path.abspath(__file__))
iris = np.genfromtxt(os.path.join(cur_dir, 'results', 'iris.csv'),
delimiter=",", skip_header=1)
def test_perfect_pred(iris):
y = iris[:, -1]
X = iris[:, :-1]
X = X[y != 2]
Expand Down Expand Up @@ -1403,7 +1410,9 @@ def test_null_deviance(self):
decimal_bic = DECIMAL_4

def test_bic(self):
assert_allclose(self.res1.bic, self.res2.bic, atol=1e-6, rtol=1e-6)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
assert_allclose(self.res1.bic, self.res2.bic, atol=1e-6, rtol=1e-6)

decimal_fittedvalues = DECIMAL_4

Expand Down Expand Up @@ -2320,7 +2329,6 @@ def test_non_invertible_hessian_fails_summary():
res.summary()



def test_int_scale():
# GH-6627, make sure it works with int scale
data = longley.load(as_pandas=True)
Expand All @@ -2342,13 +2350,24 @@ def test_int_exog(dtype):
assert isinstance(res.params, np.ndarray)


def test_glm_bic():
cur_dir = os.path.dirname(os.path.abspath(__file__))
iris = np.genfromtxt(os.path.join(cur_dir, 'results', 'iris.csv'),
delimiter=",", skip_header=1)
def test_glm_bic(iris):
X = np.c_[np.ones(100), iris[50:, :4]]
y = np.array(iris)[50:, 4].astype(np.int32)
y -= 1
SET_USE_BIC_LLF(True)
model = GLM(y, X, family=sm.families.Binomial()).fit()
# 34.9244 is what glm() of R yields
assert_almost_equal(model.bic, 34.9244, decimal=3)
assert_almost_equal(model.bic_llf, 34.9244, decimal=3)
SET_USE_BIC_LLF(False)
assert_almost_equal(model.bic, model.bic_deviance, decimal=3)
SET_USE_BIC_LLF(None)


def test_glm_bic_warning(iris):
X = np.c_[np.ones(100), iris[50:, :4]]
y = np.array(iris)[50:, 4].astype(np.int32)
y -= 1
model = GLM(y, X, family=sm.families.Binomial()).fit()
with pytest.warns(FutureWarning, match="The bic"):
assert isinstance(model.bic, float)

0 comments on commit 0a5b1fb

Please sign in to comment.