In [1]:
import random
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [2]:
# set working directory - note: all code runs from the src folder
wrk_dir = os.getcwd()
# set data path
data_path = wrk_dir + '\\' + 'data' + '\\'

In [3]:
np.random.seed(2022)

n = 5000
gamma1 = 250
gamma2 = 100
beta1 = 7.5

# exog vars
u1 = gamma1*np.random.normal(loc=0.0, scale=1.0, size=n)
u2 = gamma2*np.random.normal(loc=0.0, scale=1.0, size=n)

# endo vars
x1 = u1
x2 = beta1*x1 + u2

In [4]:
# store data for testing RStan 
d = {'X1': x1,
     'U1': u1,
     'X2': x2,
     'U2': u2
    }
data = pd.DataFrame(d)
data.head(5)

Unnamed: 0,X1,U1,X2,U2
0,-0.131975,-0.131975,-54.416768,-53.426957
1,-68.725356,-68.725356,-611.36715,-95.926978
2,-34.82139,-34.82139,-272.755678,-11.59525
3,496.171539,496.171539,3962.140328,240.853782
4,70.527331,70.527331,617.738924,88.783938


In [15]:
model = LinearRegression(fit_intercept=True, normalize=False)

x = np.array(data['X1'].copy()).reshape((-1, 1))
print(x.shape)

y = np.array(data['X2'].copy())
print(y.shape)

model.fit(x, y)

print(model.coef_)
print(model.intercept_)

(5000, 1)
(5000,)
[7.5024816]
-1.3474952696904055


Under **situation testing** we're constrained by the dataset $\mathcal{D}$, meaning that what we see/have is what we can use. When facing confounding, this limits our approaches: the use of a mediator or instrumental variable(s) will depend on the data in question (here, recall we're in a model-agnostic setting). Now, for SCF we need to retrieve $U$ for the abduction step. This is, at least the Perlian view.

However, I'd add that we also need to *properly identify* the weights of the causal edges. Under confounding, these weights can be biased, which will affect all other steps for SCF. I believe we can frame it as an ommited variable porblem where we are unable to split the residual variance accordingly. This shifts some probability mass into the weights...

In [12]:
np.random.seed(2022)

gamma3 = 5000
gamma4 = 1
gamma5 = 3

w = gamma3*np.random.normal(loc=0.0, scale=1.0, size=n)

b_u1 = u1 + gamma4*w
b_u2 = u2 + gamma5*w

# endo vars
b_x1 = b_u1
b_x2 = beta1*b_x1 + b_u2

In [16]:
# store data for testing RStan 
d2 = {'X1': b_x1,
      'U1': b_u1,
      'X2': b_x2,
      'U2': b_u2,
      'W': w
    }
data2 = pd.DataFrame(d2)
data2.head(5)

Unnamed: 0,X1,U1,X2,U2,W
0,-2.77147,-2.77147,-82.13147,-61.345443,-2.639495
1,-1443.232481,-1443.232481,-15043.691957,-4219.448352,-1374.507124
2,-731.249199,-731.249199,-7585.247665,-2100.878675,-696.427808
3,10419.602328,10419.602328,108158.163608,30011.146148,9923.430789
4,1481.07396,1481.07396,15428.478527,4320.423825,1410.546629


In [14]:
model2 = LinearRegression(fit_intercept=True, normalize=False)

x = np.array(data2['X1'].copy()).reshape((-1, 1))
print(x.shape)

y = np.array(data2['X2'].copy())
print(y.shape)

model2.fit(x, y)

print(model2.coef_)

print(model2.intercept_)

(5000, 1)
(5000,)
[10.35726103]
-1.3474952696902847


In [17]:
model3 = LinearRegression(fit_intercept=True, normalize=False)

x = np.array(data2[['X1', 'W']].copy())#.reshape((-1, 1))
print(x.shape)

y = np.array(data2['X2'].copy())
print(y.shape)

model2.fit(x, y)

print(model2.coef_)

print(model2.intercept_)

(5000, 2)
(5000,)
[5.43109645 5.17247281]
-1.3474952696902847


It's tricky because even with knowledge of $W$ we can't retrieve $\beta$... focus on more damage control: is it better to do SCF without getting some proxy for $W$ or doing nothing at all?

In [None]:
# store in data folder
data.to_csv(data_path + '\\' + 'namehere.csv', sep='|', index=False)