# The deconfounder: a PCA factor model + a quadratic outcome model

In [1]:
import numpy.random as npr
import statsmodels.api as sm 
import scipy 
import numpy as np

from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.datasets import make_spd_matrix
from scipy import stats

def _chisqprob(chisq, df):
    """Return the chi-square survival function value ``stats.chi2.sf(chisq, df)``."""
    return stats.chi2.sf(chisq, df)


# Attach the helper as `stats.chisqprob`.
# NOTE(review): presumably a compatibility shim for code that still looks up
# `scipy.stats.chisqprob` -- confirm it is still needed with the installed versions.
stats.chisqprob = _chisqprob



  from pandas.core import datetools


In [2]:
import time

# Seed printed by the run whose outputs are saved in this notebook. Pinning it
# makes every re-run draw the same random stream instead of a new one each time.
# Set SEED = None to fall back to a fresh time-based seed; the value actually
# used is printed so it can be pinned afterwards.
SEED = 1537422919
timenowseed = int(time.time()) if SEED is None else SEED
npr.seed(timenowseed)
print(timenowseed)

1537422919


In [3]:
# Simulation size: n samples of d variables, where d counts the causes (2)
# plus the confounders (1).
d = 2 + 1
n = 10 ** 4

# A simulated dataset

## simulate correlated causes

In [4]:
# Equicorrelated Gaussian design: zero means, unit standard deviations and a
# common pairwise correlation between all d variables.
corrcoef = 0.4
stdev = np.ones(d)
mean = np.zeros(d)

# Every off-diagonal entry is corrcoef; the diagonal is (1 - corrcoef) + corrcoef.
corr = corrcoef * np.ones([d, d]) + (1 - corrcoef) * np.eye(d)
print("correlation \n", corr)

# cov[i, j] = stdev[i] * stdev[j] * corr[i, j]
b = np.outer(stdev, stdev)
cov = b * corr
# (Alternative kept from the original notebook: cov = make_spd_matrix(3))
print("covariance \n", cov)

# n draws from N(mean, cov); the result is stored in X.
X = npr.multivariate_normal(mean, cov, n)

correlation 
 [[1.  0.4 0.4]
 [0.4 1.  0.4]
 [0.4 0.4 1. ]]
covariance 
 [[1.  0.4 0.4]
 [0.4 1.  0.4]
 [0.4 0.4 1. ]]


## simulate the outcome

In [5]:
# Outcome-model parameters: one weight per variable (d of them) and an intercept.
coef = np.asarray([0.2, 1.0, 0.9])
assert coef.shape[0] == d
intcpt = 0.4

In [6]:
# Quadratic outcome with no noise term: y_i = intcpt + sum_j coef[j] * X[i, j]**2
y = coef.dot(np.square(X.T)) + intcpt

# noncausal estimation: quadratic regression

In [7]:
# Number of observed causes: d counts 2 causes + 1 confounder, so dropping the
# single confounder leaves d - 1 observed columns.
obs_n = d - 1

In [8]:
# Observed causes: the first obs_n columns of X (the remaining column is left out).
obs_X = X[:,:obs_n]

In [9]:
# Naive fit that ignores the confounder: OLS of y on an intercept plus the
# squared observed causes only.
all_X = np.square(obs_X)
x2 = sm.add_constant(all_X, prepend=True)
models = sm.OLS(endog=y, exog=x2)
result = models.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.676
Model:                            OLS   Adj. R-squared:                  0.676
Method:                 Least Squares   F-statistic:                 1.041e+04
Date:                Thu, 20 Sep 2018   Prob (F-statistic):               0.00
Time:                        01:55:19   Log-Likelihood:                -16265.
No. Observations:               10000   AIC:                         3.254e+04
Df Residuals:                    9997   BIC:                         3.256e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0609      0.017     63.561      0.0

*   The true causal coefficients are (0.2, 1.0).
*   But with the quadratic regression, neither of the confidence intervals includes the truth.

# causal inference: the deconfounder with a PCA factor model

## fit a PCA

In [10]:
# Factor-model settings: the number of principal components to keep, and the
# standard deviation of the Gaussian noise added when Z and A are built
# (0.0 adds no noise).
n_comp, eps = 1, 0.0

In [11]:
# Fit a PCA with n_comp components to the observed causes only (obs_X).
# The fit call is the cell's last expression, so the fitted estimator is displayed.
pca = PCA(n_components=n_comp)
pca.fit(obs_X)

PCA(copy=True, iterated_power='auto', n_components=1, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [12]:
# Display the fitted principal axes (one row per component, one column per observed cause).
pca.components_

array([[0.69988077, 0.71425969]])

In [13]:
# Fraction of the variance of obs_X explained by each retained component.
print(pca.explained_variance_ratio_)  

[0.7047965]


## compute the substitute confounder Z and the reconstructed causes A

In [14]:
# Substitute confounder: project the observed causes onto the principal axes,
# then add Gaussian noise with standard deviation eps.
# NOTE(review): this projects obs_X without subtracting pca.mean_, whereas
# pca.transform (used when building A) centers the data first -- confirm the
# constant offset between the two is intended.
Z = np.dot(obs_X, pca.components_.T) + npr.normal(scale=eps, size=(n, 1))

In [15]:
# Reconstructed causes: the first n_comp PCA scores of obs_X multiplied by the
# first n_comp principal axes, plus Gaussian noise with standard deviation eps.
A = (
    pca.transform(obs_X)[:, :n_comp].dot(pca.components_[:n_comp, :])
    + npr.normal(scale=eps, size=(n, obs_n))
)

In [16]:
# Augmented designs: the observed causes followed column-wise by either the
# reconstructed causes A or the substitute confounder Z.
X_pca_A = np.concatenate([obs_X, A], axis=1)
X_pca_Z = np.concatenate([obs_X, Z], axis=1)

## causal estimation with the reconstructed causes A

In [17]:
# Deconfounder fit using A: OLS of y on an intercept plus the squares of the
# observed causes and of their PCA reconstruction.
all_X = np.square(X_pca_A)
x2 = sm.add_constant(all_X, prepend=True)
models = sm.OLS(endog=y, exog=x2)
result = models.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.677
Model:                            OLS   Adj. R-squared:                  0.677
Method:                 Least Squares   F-statistic:                     6989.
Date:                Thu, 20 Sep 2018   Prob (F-statistic):               0.00
Time:                        01:55:19   Log-Likelihood:                -16241.
No. Observations:               10000   AIC:                         3.249e+04
Df Residuals:                    9996   BIC:                         3.252e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0903      0.017     63.469      0.0

*   The true causal coefficients are (0.2, 1.0).
*   With the deconfounder, by contrast, both of the confidence intervals (for x1, x2) include the truth.

## causal estimation with the substitute confounder Z

In [18]:
# Deconfounder fit using Z: OLS of y on an intercept plus the squares of the
# observed causes and of the substitute confounder.
all_X = np.square(X_pca_Z)
x2 = sm.add_constant(all_X, prepend=True)
models = sm.OLS(endog=y, exog=x2)
result = models.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.677
Model:                            OLS   Adj. R-squared:                  0.677
Method:                 Least Squares   F-statistic:                     6989.
Date:                Thu, 20 Sep 2018   Prob (F-statistic):               0.00
Time:                        01:55:19   Log-Likelihood:                -16241.
No. Observations:               10000   AIC:                         3.249e+04
Df Residuals:                    9996   BIC:                         3.252e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0904      0.017     63.468      0.0

*   The true causal coefficients are (0.2, 1.0).
*   With the deconfounder, by contrast, both of the confidence intervals (for x1, x2) include the truth.

# The oracle case: when the confounder is observed

In [19]:
# Oracle fit: OLS of y on an intercept plus the squares of every column of X,
# i.e. with the confounder column included as a regressor.
all_X = np.square(X)
x2 = sm.add_constant(all_X, prepend=True)
models = sm.OLS(endog=y, exog=x2)
result = models.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 3.081e+32
Date:                Thu, 20 Sep 2018   Prob (F-statistic):               0.00
Time:                        01:55:19   Log-Likelihood:             3.1159e+05
No. Observations:               10000   AIC:                        -6.232e+05
Df Residuals:                    9996   BIC:                        -6.231e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.4000   1.04e-16   3.86e+15      0.0

*   The true causal coefficients are (0.2, 1.0).
*   When the confounder is observed, both of the confidence intervals (for x1, x2) include the truth.
*   As expected, the oracle estimate is more efficient than the deconfounder estimate, but only slightly.