This exercise is about how one can check conditional independence from the structure of the graph. (Note: The goal here is not to discuss how to correctly estimate the causal effect of x3 on y.)

In [1]:
import numpy as np
import statsmodels.formula.api as smf
import pandas as pd

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
# Generate data according to the linear SCM model in the exercise.
#
# Structural equations (all noise terms are independent standard normals):
#   x1, x2, x3 : exogenous root nodes
#   x4 = 2*x1 + 3*x2 + n4     (collider of x1 and x2)
#   x5 = 1*x2 + 2*x3 + n5     (collider of x2 and x3)
#   x6 = 1*x4 + n6            (descendant of x4)
#   y  = 3*x1 + 2*x5 + ny

N = 100000

# Fixed seed so that every regression below is reproducible under
# "Restart Kernel -> Run All"; change SEED to draw a different sample.
SEED = 42
rng = np.random.default_rng(SEED)

# Independent N(0, 1) noise terms, one per node of the graph.
n1 = rng.normal(0, 1, N)
n2 = rng.normal(0, 1, N)
n3 = rng.normal(0, 1, N)
n4 = rng.normal(0, 1, N)
n5 = rng.normal(0, 1, N)
n6 = rng.normal(0, 1, N)
ny = rng.normal(0, 1, N)

# Root nodes are just their own noise.
x1 = n1
x2 = n2
x3 = n3
# Child nodes follow the structural equations listed above.
x4 = 2*x1 + 3*x2 + n4
x5 = 1*x2 + 2*x3 + n5
x6 = 1*x4 + n6
y = 3*x1 + 2*x5 + ny

# Collect all observed variables in one frame for the formula-based regressions.
dat1 = pd.DataFrame({'x1':x1, 'x2':x2, 'x3':x3, 'x4':x4,
                     'x5':x5, 'x6':x6, 'y':y})

In [3]:
# Question: is y conditionally independent of x3 given x5?
#
# Check: regress y on x3 and x5 (no intercept) and see whether the
# estimated x3 coefficient is (statistically) zero.
# Answer: Yes (with very high probability).
ols_given_x5 = smf.ols('y ~ x3 + x5 - 1', data=dat1)
results1 = ols_given_x5.fit()
print(results1.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.700
Model:                            OLS   Adj. R-squared (uncentered):              0.700
Method:                 Least Squares   F-statistic:                          1.167e+05
Date:                Wed, 02 Oct 2024   Prob (F-statistic):                        0.00
Time:                        14:28:18   Log-Likelihood:                     -2.5759e+05
No. Observations:              100000   AIC:                                  5.152e+05
Df Residuals:                   99998   BIC:                                  5.152e+05
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [4]:
# Question: is y conditionally independent of x3 given (x4 and x5)?
#
# Answer: No. x5 and x4 are both colliders, so conditioning on them
# opens the path
#     x3 -> x5 <- x2 -> x4 <- x1 -> y
# Consistently, the estimated x3 coefficient is non-zero.
ols_given_x4_x5 = smf.ols('y ~ x3 + x4 + x5 - 1', data=dat1)
results2 = ols_given_x4_x5.fit()
print(results2.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.815
Model:                            OLS   Adj. R-squared (uncentered):              0.815
Method:                 Least Squares   F-statistic:                          1.471e+05
Date:                Wed, 02 Oct 2024   Prob (F-statistic):                        0.00
Time:                        14:28:21   Log-Likelihood:                     -2.3336e+05
No. Observations:              100000   AIC:                                  4.667e+05
Df Residuals:                   99997   BIC:                                  4.668e+05
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [9]:
# Question: is y conditionally independent of x3 given (x5 and x6)?
#
# x6 is a descendant of the collider x4, so conditioning on x6 opens the
# same path as conditioning on x4 itself (together with x5).
# Consistently, the estimated x3 coefficient is non-zero.
# Answer: No, y and x3 are not conditionally independent given (x5 and x6).
ols_given_x5_x6 = smf.ols('y ~ x3 + x5 + x6 - 1', data=dat1)
results3 = ols_given_x5_x6.fit()
print(results3.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.804
Model:                            OLS   Adj. R-squared (uncentered):              0.804
Method:                 Least Squares   F-statistic:                          1.371e+05
Date:                Wed, 02 Oct 2024   Prob (F-statistic):                        0.00
Time:                        14:40:32   Log-Likelihood:                     -2.3622e+05
No. Observations:              100000   AIC:                                  4.724e+05
Df Residuals:                   99997   BIC:                                  4.725e+05
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [8]:
# Question: is y conditionally independent of x3 given (x2, x5 and x6)?
#
# Additionally conditioning on x2 blocks the path that was opened before,
# and it does not open any new path, so every path from x3 to y is now
# blocked.
# Consistently, the estimated x3 coefficient becomes zero.
ols_given_x2_x5_x6 = smf.ols('y ~ x2 + x3 + x5 + x6 - 1', data=dat1)
results4 = ols_given_x2_x5_x6.fit()
print(results4.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.881
Model:                            OLS   Adj. R-squared (uncentered):              0.881
Method:                 Least Squares   F-statistic:                          1.854e+05
Date:                Wed, 02 Oct 2024   Prob (F-statistic):                        0.00
Time:                        14:39:28   Log-Likelihood:                     -2.1130e+05
No. Observations:              100000   AIC:                                  4.226e+05
Df Residuals:                   99996   BIC:                                  4.226e+05
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [7]:
# Side note: estimating the causal effect of x3 on y is trivial here.
# Since x3 has no parents, the empty adjustment set Z is valid, so a
# simple linear regression between y and x3 is enough.
# The correct coefficient is 4 (2 * 2 along x3 -> x5 -> y), and it is
# recovered by:
ols_unadjusted = smf.ols('y ~ x3 - 1', data=dat1)
results5 = ols_unadjusted.fit()
print(results5.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.466
Model:                            OLS   Adj. R-squared (uncentered):              0.466
Method:                 Least Squares   F-statistic:                          8.717e+04
Date:                Wed, 02 Oct 2024   Prob (F-statistic):                        0.00
Time:                        14:38:29   Log-Likelihood:                     -2.8646e+05
No. Observations:              100000   AIC:                                  5.729e+05
Df Residuals:                   99999   BIC:                                  5.729e+05
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------