Skip to content

Commit

Permalink
Merge pull request statsmodels#2034 from jseabold/fix-1877
Browse files Browse the repository at this point in the history
ENH: Handle missing for extra data with formulas
  • Loading branch information
jseabold committed Oct 10, 2014
2 parents e57ba9f + f43a92a commit 4f55df8
Show file tree
Hide file tree
Showing 14 changed files with 301 additions and 163 deletions.
4 changes: 1 addition & 3 deletions docs/source/release/version0.6.rst
Original file line number Diff line number Diff line change
Expand Up @@ -199,9 +199,7 @@ for weakly non-normal distributions.
Major Bugs fixed
----------------

* Bullet list of major bugs
* With a link to its github issue.
* Use the syntax ``:ghissue:`###```.
* Missing values (NAs) are now handled correctly when models are built from formulas. :ghissue:`805`, :ghissue:`1877`.

.. currentmodule:: statsmodels.tsa

Expand Down
28 changes: 26 additions & 2 deletions statsmodels/base/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,16 @@ def handle_missing(cls, endog, exog, missing, **kwargs):
"""
none_array_names = []

if exog is not None:
# patsy's already dropped NaNs in y/X
missing_idx = kwargs.pop('missing_idx', None)

if missing_idx is not None:
# y, X already handled by patsy. add back in later.
combined = ()
combined_names = []
if exog is None:
none_array_names += ['exog']
elif exog is not None:
combined = (endog, exog)
combined_names = ['endog', 'exog']
else:
Expand Down Expand Up @@ -185,7 +194,11 @@ def handle_missing(cls, endog, exog, missing, **kwargs):
raise ValueError("Arrays with more than 2 dimensions "
"aren't yet handled")

nan_mask = _nan_rows(*combined)
if missing_idx is not None:
nan_mask = missing_idx | _nan_rows(*combined)
else:
nan_mask = _nan_rows(*combined)

if combined_2d:
nan_mask = _nan_rows(*(nan_mask[:, None],) + combined_2d)

Expand All @@ -197,6 +210,11 @@ def handle_missing(cls, endog, exog, missing, **kwargs):
combined.update(dict(zip(none_array_names,
[None] * len(none_array_names))))

if missing_idx is not None:
combined.update({'endog': endog})
if exog is not None:
combined.update({'exog': exog})

return combined, []

elif missing == 'raise':
Expand All @@ -213,6 +231,12 @@ def handle_missing(cls, endog, exog, missing, **kwargs):
if none_array_names:
combined.update(dict(zip(none_array_names,
[None] * len(none_array_names))))

if missing_idx is not None:
combined.update({'endog': endog})
if exog is not None:
combined.update({'exog': exog})

return combined, np.where(~nan_mask)[0].tolist()
else:
raise ValueError("missing option %s not understood" % missing)
Expand Down
10 changes: 8 additions & 2 deletions statsmodels/base/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,14 @@ def from_formula(cls, formula, data, subset=None, *args, **kwargs):
eval_env = EvalEnvironment({})
else:
eval_env += 1 # we're going down the stack again
endog, exog = handle_formula_data(data, None, formula,
depth=eval_env)
missing = kwargs.get('missing', 'drop')
if missing == 'none': # with patsy it's drop or raise. let's raise.
missing = 'raise'
(endog, exog), missing_idx = handle_formula_data(data, None, formula,
depth=eval_env,
missing=missing)
kwargs.update({'missing_idx': missing_idx,
'missing': missing})
mod = cls(endog, exog, *args, **kwargs)
mod.formula = formula

Expand Down
9 changes: 5 additions & 4 deletions statsmodels/discrete/discrete_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -648,10 +648,11 @@ def _derivative_exog(self, params, exog=None, transform='dydx',
return margeff.reshape(len(exog), -1, order='F')

class CountModel(DiscreteModel):
def __init__(self, endog, exog, offset=None, exposure=None, missing='none'):
def __init__(self, endog, exog, offset=None, exposure=None, missing='none',
**kwargs):
self._check_inputs(offset, exposure, endog) # attaches if needed
super(CountModel, self).__init__(endog, exog, missing=missing,
offset=self.offset, exposure=self.exposure)
offset=self.offset, exposure=self.exposure, **kwargs)
if offset is None:
delattr(self, 'offset')
if exposure is None:
Expand Down Expand Up @@ -1903,10 +1904,10 @@ class NegativeBinomial(CountModel):
""" + base._missing_param_doc}
def __init__(self, endog, exog, loglike_method='nb2', offset=None,
exposure=None, missing='none'):
exposure=None, missing='none', **kwargs):
super(NegativeBinomial, self).__init__(endog, exog, offset=offset,
exposure=exposure,
missing=missing)
missing=missing, **kwargs)
self.loglike_method = loglike_method
self._initialize()
if loglike_method in ['nb2', 'nb1']:
Expand Down
5 changes: 3 additions & 2 deletions statsmodels/duration/hazard_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,15 +247,16 @@ class PHReg(model.LikelihoodModel):

def __init__(self, endog, exog, status=None, entry=None,
strata=None, offset=None, ties='breslow',
missing='drop'):
missing='drop', **kwargs):

# Default is no censoring
if status is None:
status = np.ones(len(endog))

super(PHReg, self).__init__(endog, exog, status=status,
entry=entry, strata=strata,
offset=offset, missing=missing)
offset=offset, missing=missing,
**kwargs)

# endog and exog are automatically converted, but these are
# not
Expand Down
38 changes: 32 additions & 6 deletions statsmodels/formula/formulatools.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,28 @@
from statsmodels.compat.python import iterkeys
import statsmodels.tools.data as data_util
from patsy import dmatrices
from patsy import dmatrices, NAAction
import numpy as np

# if users want to pass in a different formula framework, they can
# add their handler here. how to do it interactively?

# this is a mutable object, so editing it should show up in the below
formula_handler = {}

def handle_formula_data(Y, X, formula, depth=0):

class NAAction(NAAction):
    """
    Subclass of the imported ``NAAction`` that remembers the dropped rows.

    ``_handle_NA_drop`` is overridden so that the combined NA mask is kept
    on the instance as ``missing_mask``. This lets missing values in
    'extra' arrays be handled later.
    """

    def _handle_NA_drop(self, values, is_NAs, origins):
        """
        Drop every row flagged as NA in any of the ``is_NAs`` masks.

        The union of the masks is stored on ``self.missing_mask`` and each
        array in ``values`` is returned with the flagged rows removed.
        ``origins`` is not used here.
        """
        # Start with "nothing missing" and OR in each per-factor mask.
        dropped = np.zeros(is_NAs[0].shape[0], dtype=bool)
        for flags in is_NAs:
            dropped |= flags

        self.missing_mask = dropped
        keep = ~dropped

        kept_values = []
        for arr in values:
            # the trailing Ellipsis covers both 1-dim and 2-dim arrays
            kept_values.append(arr[keep, ...])
        return kept_values


def handle_formula_data(Y, X, formula, depth=0, missing='drop'):
"""
Returns endog, exog, and the model specification from arrays and formula
Expand Down Expand Up @@ -36,16 +50,28 @@ def handle_formula_data(Y, X, formula, depth=0):
if isinstance(formula, tuple(iterkeys(formula_handler))):
return formula_handler[type(formula)]

na_action = NAAction(on_NA=missing)

if X is not None:
if data_util._is_using_pandas(Y, X):
return dmatrices(formula, (Y, X), depth, return_type='dataframe')
result = dmatrices(formula, (Y, X), depth,
return_type='dataframe', NA_action=na_action)
else:
return dmatrices(formula, (Y, X), depth, return_type='dataframe')
result = dmatrices(formula, (Y, X), depth,
return_type='dataframe', NA_action=na_action)
else:
if data_util._is_using_pandas(Y, None):
return dmatrices(formula, Y, depth, return_type='dataframe')
result = dmatrices(formula, Y, depth, return_type='dataframe',
NA_action=na_action)
else:
return dmatrices(formula, Y, depth, return_type='dataframe')
result = dmatrices(formula, Y, depth, return_type='dataframe',
NA_action=na_action)

# if missing == 'raise' there's no missing_mask
missing_mask = getattr(na_action, 'missing_mask', None)
if not np.any(missing_mask):
missing_mask = None
return result, missing_mask


def _remove_intercept_patsy(terms):
Expand Down
5 changes: 3 additions & 2 deletions statsmodels/genmod/generalized_estimating_equations.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@ class GEE(base.Model):
def __init__(self, endog, exog, groups, time=None, family=None,
cov_struct=None, missing='none', offset=None,
exposure=None, dep_data=None, constraint=None,
update_dep=True):
update_dep=True, **kwargs):

self.missing = missing
self.dep_data = dep_data
Expand All @@ -423,7 +423,8 @@ def __init__(self, endog, exog, groups, time=None, family=None,
super(GEE, self).__init__(endog, exog, groups=groups,
time=time, offset=offset,
exposure=exposure,
dep_data=dep_data, missing=missing)
dep_data=dep_data, missing=missing,
**kwargs)

self._init_keys.extend(["update_dep", "constraint", "family",
"cov_struct"])
Expand Down
5 changes: 3 additions & 2 deletions statsmodels/genmod/generalized_linear_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,10 +189,11 @@ class GLM(base.LikelihoodModel):
""" % {'extra_params' : base._missing_param_doc}

def __init__(self, endog, exog, family=None, offset=None, exposure=None,
missing='none'):
missing='none', **kwargs):
self._check_inputs(family, offset, exposure, endog)
super(GLM, self).__init__(endog, exog, missing=missing,
offset=self.offset, exposure=self.exposure)
offset=self.offset, exposure=self.exposure,
**kwargs)
if offset is None:
delattr(self, 'offset')
if exposure is None:
Expand Down

0 comments on commit 4f55df8

Please sign in to comment.