Skip to content

Commit

Permalink
Fix statsmodels#1877 relating to missing data handling in GEE
Browse files: browse the repository at this point in the history
  • Loading branch information
kshedden authored and bert9bert committed Aug 29, 2014
1 parent 3ab7089 commit 26a78dd
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 19 deletions.
4 changes: 2 additions & 2 deletions statsmodels/genmod/generalized_estimating_equations.py
Expand Up @@ -459,10 +459,10 @@ def __init__(self, endog, exog, groups, time=None, family=None,

# Convert the data to the internal representation, which is a
# list of arrays, corresponding to the clusters.
group_labels = sorted(set(groups))
group_labels = sorted(set(self.groups))
group_indices = dict((s, []) for s in group_labels)
for i in range(len(self.endog)):
group_indices[groups[i]].append(i)
group_indices[self.groups[i]].append(i)
for k in iterkeys(group_indices):
group_indices[k] = np.asarray(group_indices[k])
self.group_indices = group_indices
Expand Down
42 changes: 25 additions & 17 deletions statsmodels/genmod/tests/test_gee.py
Expand Up @@ -104,30 +104,38 @@ def test_poisson_epil(self):
assert_almost_equal(rslt1.params, rslt2.params, decimal=6)
assert_almost_equal(rslt1.scale, rslt2.scale, decimal=6)

def test_missing(self):

endog = np.random.normal(size=100)
exog1 = np.random.normal(size=100)
exog2 = np.random.normal(size=100)
exog3 = np.random.normal(size=100)
groups = np.kron(lrange(20), np.ones(5))

# TODO: why does this test fail?
def t_est_missing(self):
endog[0] = np.nan
endog[5:7] = np.nan
exog2[10:12] = np.nan

Y = np.random.normal(size=100)
X1 = np.random.normal(size=100)
X2 = np.random.normal(size=100)
X3 = np.random.normal(size=100)
groups = np.kron(lrange(20), np.ones(5))
data = pd.DataFrame({"endog": endog, "exog1": exog1,
"exog2": exog2, "exog3": exog3,
"groups": groups})

Y[0] = np.nan
Y[5:7] = np.nan
X2[10:12] = np.nan
mod1 = GEE.from_formula("endog ~ exog1 + exog2 + exog3",
"groups", data=data,
missing='drop')
rslt1 = mod1.fit()

D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3,
"groups": groups})
assert_almost_equal(len(mod1.endog), 95)
assert_almost_equal(np.asarray(mod1.exog.shape), np.r_[95, 4])

md = GEE.from_formula("Y ~ X1 + X2 + X3", D["groups"],
missing='drop')
mdf = md.fit()
mod2 = GEE.from_formula("endog ~ exog1 + exog2 + exog3",
"groups", data=data.dropna(),
missing='none')
rslt2 = mod2.fit()

assert_almost_equal(rslt1.params, rslt2.params)
assert_almost_equal(rslt1.bse, rslt2.bse)

assert_almost_equal(len(md.endog), 95)
assert_almost_equal(np.asarray(md.exog.shape), np.r_[95, 4])

def test_default_time(self):
"""
Expand Down

0 comments on commit 26a78dd

Please sign in to comment.