# alexandreyc/statsmodels forked from statsmodels/statsmodels

### Subversion checkout URL

You can clone with HTTPS or Subversion.

Add SUR predict method + test with grunfeld fitted

commit 6fc14833be52945ba4b0c8869c8f4669358ccc56 1 parent 45c9ce6
authored
8 statsmodels/examples/example_sysreg.py
 @@ -4,7 +4,7 @@ import numpy as np import statsmodels.api as sm -from statsmodels.sandbox.sysreg import * +from statsmodels.sysreg.sysreg import * #for Python 3 compatibility from statsmodels.compatnp.py3k import asbytes @@ -39,9 +39,9 @@ # Note that the results in Greene (5th edition) uses a slightly different # version of the Grunfeld data. To reproduce Table 14.1 the following changes # are necessary. -grun_sys[-2][5] = 261.6 -grun_sys[-2][-3] = 645.2 -grun_sys[-1][11,2] = 232.6 +#grun_sys[-2][5] = 261.6 +#grun_sys[-2][-3] = 645.2 +#grun_sys[-1][11,2] = 232.6 grun_mod = SUR(grun_sys) grun_res = grun_mod.fit()
377 statsmodels/sandbox/sysreg.py
 @@ -1,377 +0,0 @@ -from statsmodels.regression.linear_model import GLS -import numpy as np -import statsmodels.tools.tools as tools -from statsmodels.base.model import LikelihoodModelResults -from scipy import sparse - -#http://www.irisa.fr/aladin/wg-statlin/WORKSHOPS/RENNES02/SLIDES/Foschi.pdf - -__all__ = ['SUR', 'Sem2SLS'] - -#probably should have a SystemModel superclass -# TODO: does it make sense of SUR equations to have -# independent endogenous regressors? If so, then -# change docs to LHS = RHS -#TODO: make a dictionary that holds equation specific information -#rather than these cryptic lists? Slower to get a dict value? -#TODO: refine sigma definition -class SUR(object): - """ - Seemingly Unrelated Regression - - Parameters - ---------- - sys : list - [endog1, exog1, endog2, exog2,...] It will be of length 2 x M, - where M is the number of equations endog = exog. - sigma : array-like - M x M array where sigma[i,j] is the covariance between equation i and j - dfk : None, 'dfk1', or 'dfk2' - Default is None. Correction for the degrees of freedom - should be specified for small samples. See the notes for more - information. - - Attributes - ---------- - cholsigmainv : array - The transpose of the Cholesky decomposition of pinv_wexog - df_model : array - Model degrees of freedom of each equation. p_{m} - 1 where p is - the number of regressors for each equation m and one is subtracted - for the constant. - df_resid : array - Residual degrees of freedom of each equation. Number of observations - less the number of parameters. - endog : array - The LHS variables for each equation in the system. - It is a M x nobs array where M is the number of equations. - exog : array - The RHS variable for each equation in the system. - It is a nobs x sum(p_{m}) array. Which is just each - RHS array stacked next to each other in columns. - history : dict - Contains the history of fitting the model. Probably not of interest - if the model is fit with igls = False. - iterations : int - The number of iterations until convergence if the model is fit - iteratively. - nobs : float - The number of observations of the equations. - normalized_cov_params : array - sum(p_{m}) x sum(p_{m}) array - :math:\\left[X^{T}\\left(\\Sigma^{-1}\\otimes\\boldsymbol{I}\\right)X\\right]^{-1} - pinv_wexog : array - The pseudo-inverse of the wexog - sigma : array - M x M covariance matrix of the cross-equation disturbances. See notes. - sp_exog : CSR sparse matrix - Contains a block diagonal sparse matrix of the design so that - exog1 ... exogM are on the diagonal. - wendog : array - M * nobs x 1 array of the endogenous variables whitened by - cholsigmainv and stacked into a single column. - wexog : array - M*nobs x sum(p_{m}) array of the whitened exogenous variables. - - Notes - ----- - All individual equations are assumed to be well-behaved, homoeskedastic - iid errors. This is basically an extension of GLS, using sparse matrices. - - .. math:: \\Sigma=\\left[\\begin{array}{cccc} - \\sigma_{11} & \\sigma_{12} & \\cdots & \\sigma_{1M}\\\\ - \\sigma_{21} & \\sigma_{22} & \\cdots & \\sigma_{2M}\\\\ - \\vdots & \\vdots & \\ddots & \\vdots\\\\ - \\sigma_{M1} & \\sigma_{M2} & \\cdots & \\sigma_{MM}\\end{array}\\right] - - References - ---------- - Zellner (1962), Greene (2003) - """ -#TODO: Does each equation need nobs to be the same? - def __init__(self, sys, sigma=None, dfk=None): - if len(sys) % 2 != 0: - raise ValueError("sys must be a list of pairs of endogenous and \ -exogenous variables. Got length %s" % len(sys)) - if dfk: - if not dfk.lower() in ['dfk1','dfk2']: - raise ValueError("dfk option %s not understood" % (dfk)) - self._dfk = dfk - M = len(sys[1::2]) - self._M = M -# exog = np.zeros((M,M), dtype=object) -# for i,eq in enumerate(sys[1::2]): -# exog[i,i] = np.asarray(eq) # not sure this exog is needed - # used to compute resids for now - exog = np.column_stack(np.asarray(sys[1::2][i]) for i in range(M)) -# exog = np.vstack(np.asarray(sys[1::2][i]) for i in range(M)) - self.exog = exog # 2d ndarray exog is better -# Endog, might just go ahead and reshape this? - endog = np.asarray(sys[::2]) - self.endog = endog - self.nobs = float(self.endog[0].shape[0]) # assumes all the same length - -# Degrees of Freedom - df_resid = [] - df_model = [] - [df_resid.append(self.nobs - tools.rank(_)) \ - for _ in sys[1::2]] - [df_model.append(tools.rank(_) - 1) for _ in sys[1::2]] - self.df_resid = np.asarray(df_resid) - self.df_model = np.asarray(df_model) - -# "Block-diagonal" sparse matrix of exog - sp_exog = sparse.lil_matrix((int(self.nobs*M), - int(np.sum(self.df_model+1)))) # linked lists to build - self._cols = np.cumsum(np.hstack((0, self.df_model+1))) - for i in range(M): - sp_exog[i*self.nobs:(i+1)*self.nobs, - self._cols[i]:self._cols[i+1]] = sys[1::2][i] - self.sp_exog = sp_exog.tocsr() # cast to compressed for efficiency -# Deal with sigma, check shape earlier if given - if np.any(sigma): - sigma = np.asarray(sigma) # check shape - elif sigma == None: - resids = [] - for i in range(M): - resids.append(GLS(endog[i],exog[:, - self._cols[i]:self._cols[i+1]]).fit().resid) - resids = np.asarray(resids).reshape(M,-1) - sigma = self._compute_sigma(resids) - self.sigma = sigma - self.cholsigmainv = np.linalg.cholesky(np.linalg.pinv(\ - self.sigma)).T - self.initialize() - - def initialize(self): - self.wendog = self.whiten(self.endog) - self.wexog = self.whiten(self.sp_exog) - self.pinv_wexog = np.linalg.pinv(self.wexog) - self.normalized_cov_params = np.dot(self.pinv_wexog, - np.transpose(self.pinv_wexog)) - self.history = {'params' : [np.inf]} - self.iterations = 0 - - def _update_history(self, params): - self.history['params'].append(params) - - def _compute_sigma(self, resids): - """ - Computes the sigma matrix and update the cholesky decomposition. - """ - M = self._M - nobs = self.nobs - sig = np.dot(resids, resids.T) # faster way to do this? - if not self._dfk: - div = nobs - elif self._dfk.lower() == 'dfk1': - div = np.zeros(M**2) - for i in range(M): - for j in range(M): - div[i+j] = ((self.df_model[i]+1) *\ - (self.df_model[j]+1))**(1/2) - div.reshape(M,M) - else: # 'dfk2' error checking is done earlier - div = np.zeros(M**2) - for i in range(M): - for j in range(M): - div[i+j] = nobs - np.max(self.df_model[i]+1, - self.df_model[j]+1) - div.reshape(M,M) -# doesn't handle (#,) - self.cholsigmainv = np.linalg.cholesky(np.linalg.pinv(sig/div)).T - return sig/div - - def whiten(self, X): - """ - SUR whiten method. - - Parameters - ----------- - X : list of arrays - Data to be whitened. - - Returns - ------- - If X is the exogenous RHS of the system. - np.dot(np.kron(cholsigmainv,np.eye(M)),np.diag(X)) - - If X is the endogenous LHS of the system. - - """ - nobs = self.nobs - if X is self.endog: # definitely not a robust check - return np.dot(np.kron(self.cholsigmainv,np.eye(nobs)), - X.reshape(-1,1)) - elif X is self.sp_exog: - return (sparse.kron(self.cholsigmainv, - sparse.eye(nobs,nobs))*X).toarray()#*=dot until cast to array - - def fit(self, igls=False, tol=1e-5, maxiter=100): - """ - igls : bool - Iterate until estimates converge if sigma is None instead of - two-step GLS, which is the default is sigma is None. - - tol : float - - maxiter : int - - Notes - ----- - This ia naive implementation that does not exploit the block - diagonal structure. It should work for ill-conditioned sigma - but this is untested. - """ - - if not np.any(self.sigma): - self.sigma = self._compute_sigma(self.endog, self.exog) - M = self._M - beta = np.dot(self.pinv_wexog, self.wendog) - self._update_history(beta) - self.iterations += 1 - if not igls: - sur_fit = SysResults(self, beta, self.normalized_cov_params) - return sur_fit - - conv = self.history['params'] - while igls and (np.any(np.abs(conv[-2] - conv[-1]) > tol)) and \ - (self.iterations < maxiter): - fittedvalues = (self.sp_exog*beta).reshape(M,-1) - resids = self.endog - fittedvalues # don't attach results yet - self.sigma = self._compute_sigma(resids) # need to attach for compute? - self.wendog = self.whiten(self.endog) - self.wexog = self.whiten(self.sp_exog) - self.pinv_wexog = np.linalg.pinv(self.wexog) - self.normalized_cov_params = np.dot(self.pinv_wexog, - np.transpose(self.pinv_wexog)) - beta = np.dot(self.pinv_wexog, self.wendog) - self._update_history(beta) - self.iterations += 1 - sur_fit = SysResults(self, beta, self.normalized_cov_params) - return sur_fit - - def predict(self, design): - pass - -#TODO: Should just have a general 2SLS estimator to subclass -# for IV, FGLS, etc. -# Also should probably have SEM class and estimators as subclasses -class Sem2SLS(object): - """ - Two-Stage Least Squares for Simultaneous equations - - Parameters - ---------- - sys : list - [endog1, exog1, endog2, exog2,...] It will be of length 2 x M, - where M is the number of equations endog = exog. - indep_endog : dict - A dictionary mapping the equation to the column numbers of the - the independent endogenous regressors in each equation. - It is assumed that the system is inputed as broken up into - LHS and RHS. For now, the values of the dict have to be sequences. - Note that the keys for the equations should be zero-indexed. - instruments : array - Array of the exogenous independent variables. - - Notes - ----- - This is unfinished, and the design should be refactored. - Estimation is done by brute force and there is no exploitation of - the structure of the system. - """ - def __init__(self, sys, indep_endog=None, instruments=None): - if len(sys) % 2 != 0: - raise ValueError("sys must be a list of pairs of endogenous and \ -exogenous variables. Got length %s" % len(sys)) - M = len(sys[1::2]) - self._M = M -# The lists are probably a bad idea - self.endog = sys[::2] # these are just list containers - self.exog = sys[1::2] - self._K = [tools.rank(_) for _ in sys[1::2]] -# fullexog = np.column_stack((_ for _ in self.exog)) - - self.instruments = instruments - - # Keep the Y_j's in a container to get IVs - instr_endog = {} - [instr_endog.setdefault(_,[]) for _ in indep_endog.keys()] - - for eq_key in indep_endog: - for varcol in indep_endog[eq_key]: - instr_endog[eq_key].append(self.exog[eq_key][:,varcol]) - # ^ copy needed? -# self._instr_endog = instr_endog - - self._indep_endog = indep_endog - _col_map = np.cumsum(np.hstack((0,self._K))) # starting col no.s -# move this check to whiten since we're not going to build a full exog? - for eq_key in indep_endog: - try: - iter(indep_endog[eq_key]) - except: -# eq_key = [eq_key] - raise TypeError("The values of the indep_exog dict must be\ - iterable. Got type %s for converter %s" % (type(del_col))) -# for del_col in indep_endog[eq_key]: -# fullexog = np.delete(fullexog, _col_map[eq_key]+del_col, 1) -# _col_map[eq_key+1:] -= 1 - -# Josef's example for deleting reoccuring "rows" -# fullexog = np.unique(fullexog.T.view([('',fullexog.dtype)]*\ -# fullexog.shape[0])).view(fullexog.dtype).reshape(\ -# fullexog.shape[0],-1) -# From http://article.gmane.org/gmane.comp.python.numeric.general/32276/ -# Or Jouni' suggetsion of taking a hash: -# http://www.mail-archive.com/numpy-discussion@scipy.org/msg04209.html -# not clear to me how this would work though, only if they are the *same* -# elements? -# self.fullexog = fullexog - self.wexog = self.whiten(instr_endog) - - - def whiten(self, Y): - """ - Runs the first stage of the 2SLS. - - Returns the RHS variables that include the instruments. - """ - wexog = [] - indep_endog = self._indep_endog # this has the col mapping -# fullexog = self.fullexog - instruments = self.instruments - for eq in range(self._M): # need to go through all equations regardless - instr_eq = Y.get(eq, None) # Y has the eq to ind endog array map - newRHS = self.exog[eq].copy() - if instr_eq: - for i,LHS in enumerate(instr_eq): - yhat = GLS(LHS, self.instruments).fit().fittedvalues - newRHS[:,indep_endog[eq][i]] = yhat - # this might fail if there is a one variable column (nobs,) - # in exog - wexog.append(newRHS) - return wexog - - def fit(self): - """ - """ - delta = [] - wexog = self.wexog - endog = self.endog - for j in range(self._M): - delta.append(GLS(endog[j], wexog[j]).fit().params) - return delta - -class SysResults(LikelihoodModelResults): - """ - Not implemented yet. - """ - def __init__(self, model, params, normalized_cov_params=None, scale=1.): - super(SysResults, self).__init__(model, params, - normalized_cov_params, scale) - self._get_results() - - def _get_results(self): - pass
214 statsmodels/sysreg/example_sysreg.py