In [1]:
# import packages

import numpy as np
from numpy import random
from scipy import stats
import pandas as pd
import copy
import matplotlib.pyplot as plt

# import r packages if needed

import rpy2.robjects as robjects
import rpy2.robjects.packages as rpackages
rpackages.importr("clusterGeneration")
rpackages.importr("mpower")
cluster_generation = robjects.packages.importr("clusterGeneration")
mpower = robjects.packages.importr("mpower")



In [None]:
 # Fully Synthetic Data Sets (Set up from Künzel et al.)

# 1: Simulate the d-dimensional X.
# 2: Create Potential Outcomes Y(1) and Y(0).
# 3: Simulate Treatment Assignments trough W.

In [None]:
########################################################

In [2]:
# 1: Simulate the d-dimensional X


# Setup parameters
d = 25 # TODO: set dimension according to setup
N = 1000 # TODO: set sample size according to setup

# X Correlation matrix and mean
mean = np.zeros(d) # TODO: set mean according to setup
cov = np.array(mpower.cvine(d=d, alpha = 0.5, beta = 0.5)) # TODO: set cov according to setup

# Simulate X
X = random.multivariate_normal(mean=mean, cov=cov, size=N, check_valid='warn')

In [3]:
# Check X
X

array([[-0.23949044,  0.20204604, -0.54972782, ..., -1.12269978,
         1.05984406, -0.2747755 ],
       [ 0.78597252, -0.9352116 ,  0.27192843, ..., -1.24850548,
         1.44651786,  0.14611905],
       [ 1.09727696, -0.96287228,  0.56219315, ..., -1.02297285,
         0.96877531, -0.79127487],
       ...,
       [ 1.43721162,  0.05581602, -0.80873523, ..., -0.65795589,
         0.41761371,  0.72035854],
       [-0.56825225,  0.78821679, -0.62711777, ..., -0.09637254,
        -0.95424552,  0.23412143],
       [-0.79253074,  1.63656252, -1.49251667, ...,  1.05690572,
        -0.80464246,  2.66129696]])

In [4]:
# Check cov
cov

array([[ 1.        , -0.60666625,  0.26815879, -0.76026717,  0.57035503,
        -0.73499248,  0.93971471,  0.66136962,  0.13849344, -0.46259898,
        -0.97770495, -0.77288718, -0.99776269, -0.86978712,  0.11842479,
         0.59778168, -0.98683337,  0.01573303,  0.70449791,  0.91346304,
        -0.81048568, -0.94588293, -0.2614016 ,  0.72887186, -0.23008923],
       [-0.60666625,  1.        , -0.90095064,  0.95105014, -0.66879917,
        -0.05142658, -0.3006486 , -0.04137047, -0.86905463,  0.98541512,
         0.50311263,  0.31104557,  0.58385804,  0.43301207, -0.33433266,
         0.03262114,  0.49707703,  0.65435305, -0.32673463, -0.80630541,
         0.89741793,  0.83077251,  0.27365195, -0.6017433 ,  0.72542454],
       [ 0.26815879, -0.90095064,  1.        , -0.81739971,  0.68148115,
         0.40615506, -0.07348987, -0.39391023,  0.95837561, -0.94641301,
        -0.15930536,  0.04159206, -0.25603663, -0.23502185,  0.1904038 ,
        -0.15346519, -0.12214519, -0.89467708,  0

In [5]:
#######################################################

In [6]:
# 2: Create Potential Outcomes Y(1) and Y(0).

# 2.1 Simulate errors, FIX
e_0 = random.normal(loc=0.0,scale=1.0,size=N)
e_1 = random.normal(loc=0.0,scale=1.0,size=N)

# if needed take student_t distributed errors to see simulation with heavy-tailed errors

# e_o = random.standard_t(df=1,size=N)
# e_1 = random.standard_t(df=1,size=N)

In [None]:
#######################################################

In [None]:
# 2.2 Create Response Functions

In [7]:
# SI1.1 simple cate - indicator  (no confounding)
betas_0 = random.uniform(low=-5, high=5, size=d)
mu_0 = np.matmul(X,betas_0) + 5*np.int8(X[:,0]>0.5)
mu_1 = mu_0 + 8*np.int8(X[:,1]>0.1) # indicator
tau = mu_1 - mu_0
tau

array([8., 0., 0., 8., 8., 0., 8., 8., 0., 8., 0., 0., 0., 0., 0., 8., 0.,
       0., 8., 0., 0., 0., 8., 8., 8., 8., 0., 0., 8., 8., 8., 8., 0., 8.,
       0., 8., 0., 0., 8., 0., 0., 0., 0., 8., 8., 0., 8., 0., 8., 0., 8.,
       0., 8., 0., 8., 0., 8., 8., 8., 8., 8., 8., 0., 0., 8., 8., 8., 8.,
       8., 8., 8., 8., 8., 0., 0., 0., 0., 0., 8., 0., 8., 8., 8., 8., 0.,
       0., 8., 0., 8., 8., 0., 8., 8., 0., 8., 8., 8., 0., 0., 0., 8., 8.,
       8., 0., 8., 0., 8., 8., 0., 0., 0., 8., 8., 0., 0., 0., 0., 8., 0.,
       8., 8., 0., 0., 8., 8., 0., 8., 8., 0., 8., 0., 0., 0., 8., 8., 8.,
       0., 8., 8., 8., 8., 8., 0., 0., 0., 0., 0., 8., 0., 8., 8., 0., 0.,
       8., 0., 8., 0., 0., 0., 8., 0., 0., 8., 0., 0., 8., 8., 8., 0., 0.,
       0., 8., 8., 0., 8., 0., 0., 8., 8., 0., 8., 0., 0., 8., 8., 0., 8.,
       0., 0., 0., 8., 8., 0., 8., 0., 0., 0., 0., 0., 8., 0., 8., 8., 0.,
       8., 0., 8., 8., 0., 8., 8., 0., 0., 8., 8., 8., 0., 8., 8., 8., 0.,
       8., 0., 0., 0., 8.

In [None]:
# SI1.2 simple cate - linear (no confounding)
betas_0 = random.uniform(low=-5, high=5, size=d)
mu_0 = np.matmul(X,betas_0) + 5*X[:,0]
mu_1 = mu_0 + 8*X[:,1] # linear
tau = mu_1 - mu_0
tau

In [None]:
# SI1.3 simple cate - quadratic (no confounding)
betas_0 = random.uniform(low=-5, high=5, size=d)
mu_0 = np.matmul(X,betas_0) + 5*(X[:,0]**2)
mu_1 = mu_0 + 8*(X[:,1]**2) # quadratic
tau = mu_1 - mu_0
tau

In [None]:
#############################################

In [None]:
# SI2 complex linear cate  (in Künzel it is low=1, high=30)
betas_0 = random.uniform(low=-15, high=15, size=d)
betas_1 = random.uniform(low=-15, high=15, size=d)
mu_0 = np.matmul(X,betas_0)
mu_1 = np.matmul(X,betas_1)
tau = mu_1 - mu_0

In [None]:
###############################################

In [None]:
def varsigma_funct(x):
    return 2/(1+np.exp(-12*(x-1/2)))

In [None]:
# SI3 complex non-linear
mu_0 = -1/2*varsigma_funct(x=X[:,0])*varsigma_funct(x=X[:,1])
mu_1 = 1/2*varsigma_funct(x=X[:,0])*varsigma_funct(x=X[:,1])
tau = mu_1 - mu_0

In [None]:
#############################################

In [None]:
# SI4 no treatment effect (global linear response)
betas_noT = random.uniform(low=-15, high=15, size=d)
mu_0 = np.matmul(X,betas_noT)
mu_1 = mu_0
tau = np.zeros(N)

In [None]:
###########################################

In [None]:
# SI5 no treatment effect (piecewise linear response)
betas = np.random.uniform(low=-15, high=15, size=d)

betas_l = copy.copy(betas) # betas_lower
betas_l[5:d] = 0

betas_m = copy.copy(betas) #betas_middle
betas_m[0:5] = 0
betas_m[10:d] = 0

betas_u = copy.copy(betas) #betas_upper
betas_u[0:10] = 0
betas_u[15:d] = 0

def piecewise_linear_new(x):
    condition_l = x[:, 19] < -0.4
    condition_u = x[:, 19] > 0.4

    array = np.zeros(N)
    array[condition_l] = np.matmul(x[condition_l, :], betas_l)
    array[~condition_l & ~condition_u] = np.matmul(x[~condition_l & ~condition_u, :], betas_m)
    array[condition_u] = np.matmul(x[condition_u, :], betas_u)

    return array

mu_0 = piecewise_linear_new(X)

mu_1 = mu_0

tau = np.zeros(N)

# TODO: CHECK IF RIGHT

In [None]:
##########################################

In [None]:
# SI6.1 beta confounded, no treatment effect
X = random.uniform(low=0, high=1, size=(N,d)) # ACHTUNG: nöd wiederhole!
mu_0 = 2*X[:,0] - 1
mu_1 = mu_0
tau = np.zeros(N)


In [None]:
# SI6.2 beta confounded, simple cate (indicator)
X = random.uniform(low=0, high=1, size=(N,d))
mu_0 = 2*X[:,0] - 1
mu_1 = mu_0 + 2*np.int8(X[:,1]>0.4)
tau = mu_1 - mu_0
tau

In [None]:
# SI6.3 beta confounded, simple cate (linear)
X = random.uniform(low=0, high=1, size=(N,d))
mu_0 = 2*X[:,0] - 1
mu_1 = mu_0 + 2*X[:,1]
tau = mu_1 - mu_0


In [None]:
# SI6.4 beta confounded, simple cate (quadratic)
X = random.uniform(low=0, high=1, size=(N,d))
mu_0 = 2*X[:,0] - 1
mu_1 = mu_0 + 2*(X[:,1]**2)
tau = mu_1 - mu_0
tau

In [None]:
##################################

In [None]:
# SI7.1 beta confounded, complex cate (linear)
betas = random.uniform(low=-15, high=15, size=d)

mu_0 = 2*X[:,0] - 1
mu_1 = mu_0 + np.matmul(X,betas)
tau = mu_1 - mu_0

# TODO: CHECK WHETHER THIS SETUP MAKES SENSE

In [None]:
# SI7.2 beta confounded, complex cate (non-linear)
mu_0 = 2*X[:,0] - 1
mu_1 = mu_0 + 1/2*varsigma_funct(x=X[:,0])*varsigma_funct(x=X[:,1])

# TODO: CHECK WHETER THIS SETUP MAKES SENSE

In [None]:
# TODO: check relationship between mu_0 & mu_1 and see whether this makes sense
check_values_0 = 2*X[:,0] - 1
check_values_1 = mu_0 + 1/2*varsigma_funct(x=X[:,0])*varsigma_funct(x=X[:,1])
check_values_2 = mu_0 + np.matmul(X,betas)

plt.plot(check_values_0, check_values_1, 'bo')
plt.show()

In [8]:
###################################
# 2.3 Creat Potential Outcomes, FIX
Y_0 = mu_0 + e_0
Y_1 = mu_1 + e_1

In [9]:
# quick check Y_0
Y_0

array([-4.76950164e+00, -1.39701815e+01,  8.26996610e+00, -3.17263379e+00,
        7.06813909e+00, -1.08206008e+00, -7.38346949e+00,  1.12382021e+01,
       -4.52038703e+00, -7.05159807e-01,  5.34581073e+00, -3.10917967e+00,
        6.22820794e+00, -1.69321595e+01, -1.60032516e+01, -1.85414974e+01,
       -1.24579715e+01, -2.17782240e+01, -5.23333539e+00, -3.12702114e+00,
       -7.94058479e+00,  7.66310554e+00,  1.48395406e+01,  2.01861019e+00,
        1.07876816e+01,  8.65416864e+00,  1.18288449e+01, -2.09259754e+01,
       -9.60344900e+00,  3.45317055e+01,  1.86558508e+01,  1.38994071e+01,
       -1.23679607e+00,  4.51199868e+00,  1.60663919e+01, -1.04109999e+01,
        4.96392773e+00, -5.38009421e+00,  3.51452739e+00,  5.77431263e+00,
        8.57969734e+00,  2.56618670e+01,  1.21087035e+01,  1.52313290e+01,
       -4.21695562e-01,  2.99925473e+01, -2.21735055e+01, -3.51032833e+00,
        2.16423179e+01, -1.91771485e+01,  1.79759378e+01,  5.50433046e+00,
        2.07766525e+01, -

In [10]:
# Quick Check Y_1
Y_1

array([ 4.90773731e+00, -1.53664060e+01,  8.00767578e+00,  5.46895934e+00,
        1.54799324e+01, -7.04498161e-01,  6.66166959e-01,  2.04133556e+01,
       -4.05854847e+00,  8.27200859e+00,  4.20585032e+00, -3.03715717e+00,
        7.66916694e+00, -1.98817799e+01, -1.56873997e+01, -9.40081323e+00,
       -1.23646030e+01, -1.86208320e+01,  2.93815297e+00, -5.11140132e+00,
       -8.64797899e+00,  9.17130130e+00,  2.30906776e+01,  9.20239653e+00,
        2.25614419e+01,  1.91061348e+01,  1.43428620e+01, -2.35225095e+01,
       -3.92957690e+00,  4.31358630e+01,  2.51477612e+01,  2.22358840e+01,
        8.35735220e-01,  1.14210171e+01,  1.54285569e+01, -3.50269988e+00,
        3.33453524e+00, -6.80687742e+00,  7.20295681e+00,  3.40371251e+00,
        7.95792756e+00,  2.32228274e+01,  1.01125356e+01,  2.30574195e+01,
        8.18102323e+00,  3.00869267e+01, -1.17655413e+01, -3.15831068e+00,
        2.71411168e+01, -1.79977131e+01,  2.45523269e+01,  4.88900651e+00,
        2.97188159e+01, -

In [11]:
###################################
# 3.1 Propensity score setups # TODO: change for setup

# i) constant, balanced
e_x = 0.5

# ii) constant, unbalanced
e_x = 0.01

In [None]:
#################################

In [None]:
# iii) beta confounded (balanced)
X = random.uniform(low=0, high=1, size=(N,d))
beta_dist = stats.beta(a=2, b=4) # set beta distribution
beta_values = beta_dist.pdf(X[:,0]) # calculate pdf values for x1
e_x = 1/4*(1+beta_values)

pd.DataFrame(e_x).describe() # summary stats of e_x

In [12]:
# check if balanced
W = random.binomial(size=N, n=1, p=e_x)
pd.DataFrame(W).describe()

Unnamed: 0,0
count,1000.0
mean,0.489
std,0.500129
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [None]:
#####################################

In [None]:
# iv) beta confounded (unbalanced)
X = random.uniform(low=0, high=1, size=(N,d))
beta_dist = stats.beta(a=1, b=10) # set beta distribution
beta_values = beta_dist.pdf(X[:,0]) # calculate pdf values for x1
e_x = 1/100*(1+beta_values)

pd.DataFrame(e_x).describe() # summary stats of e_x

In [None]:
# check if unbalanced and how much
W = random.binomial(size=N, n=1, p=e_x)
pd.DataFrame(W).describe()

In [None]:
##########################################

In [None]:
X

In [None]:
###################################

In [13]:
# 3.2 Simulate Treatment Assignments trough W

# Simulate Treatment Assignment, FIX
W = random.binomial(size=N, n=1, p=e_x)

# Create Observed Outcome, FIX
ones = np.ones(N)
Y = np.multiply(W,Y_1) + np.multiply(ones-W,Y_0)

In [14]:
pd.DataFrame(e_x).describe() # summary stats of e_x

ValueError: DataFrame constructor not properly called!

In [15]:
# check out observed outcomes
Y

array([ 4.90773731e+00, -1.53664060e+01,  8.00767578e+00,  5.46895934e+00,
        1.54799324e+01, -7.04498161e-01, -7.38346949e+00,  2.04133556e+01,
       -4.05854847e+00, -7.05159807e-01,  4.20585032e+00, -3.10917967e+00,
        7.66916694e+00, -1.98817799e+01, -1.56873997e+01, -1.85414974e+01,
       -1.24579715e+01, -1.86208320e+01, -5.23333539e+00, -5.11140132e+00,
       -8.64797899e+00,  9.17130130e+00,  2.30906776e+01,  2.01861019e+00,
        2.25614419e+01,  1.91061348e+01,  1.43428620e+01, -2.09259754e+01,
       -3.92957690e+00,  4.31358630e+01,  1.86558508e+01,  1.38994071e+01,
        8.35735220e-01,  4.51199868e+00,  1.54285569e+01, -1.04109999e+01,
        4.96392773e+00, -5.38009421e+00,  3.51452739e+00,  5.77431263e+00,
        7.95792756e+00,  2.32228274e+01,  1.21087035e+01,  1.52313290e+01,
       -4.21695562e-01,  3.00869267e+01, -1.17655413e+01, -3.51032833e+00,
        2.16423179e+01, -1.91771485e+01,  2.45523269e+01,  4.88900651e+00,
        2.97188159e+01, -

In [16]:
pd.DataFrame(Y).describe() # summary stats of Y

Unnamed: 0,0
count,1000.0
mean,3.452696
std,13.025746
min,-33.252262
25%,-5.733074
50%,3.186321
75%,12.893257
max,43.135863


In [17]:
X

array([[-0.23949044,  0.20204604, -0.54972782, ..., -1.12269978,
         1.05984406, -0.2747755 ],
       [ 0.78597252, -0.9352116 ,  0.27192843, ..., -1.24850548,
         1.44651786,  0.14611905],
       [ 1.09727696, -0.96287228,  0.56219315, ..., -1.02297285,
         0.96877531, -0.79127487],
       ...,
       [ 1.43721162,  0.05581602, -0.80873523, ..., -0.65795589,
         0.41761371,  0.72035854],
       [-0.56825225,  0.78821679, -0.62711777, ..., -0.09637254,
        -0.95424552,  0.23412143],
       [-0.79253074,  1.63656252, -1.49251667, ...,  1.05690572,
        -0.80464246,  2.66129696]])

In [18]:
Y

array([ 4.90773731e+00, -1.53664060e+01,  8.00767578e+00,  5.46895934e+00,
        1.54799324e+01, -7.04498161e-01, -7.38346949e+00,  2.04133556e+01,
       -4.05854847e+00, -7.05159807e-01,  4.20585032e+00, -3.10917967e+00,
        7.66916694e+00, -1.98817799e+01, -1.56873997e+01, -1.85414974e+01,
       -1.24579715e+01, -1.86208320e+01, -5.23333539e+00, -5.11140132e+00,
       -8.64797899e+00,  9.17130130e+00,  2.30906776e+01,  2.01861019e+00,
        2.25614419e+01,  1.91061348e+01,  1.43428620e+01, -2.09259754e+01,
       -3.92957690e+00,  4.31358630e+01,  1.86558508e+01,  1.38994071e+01,
        8.35735220e-01,  4.51199868e+00,  1.54285569e+01, -1.04109999e+01,
        4.96392773e+00, -5.38009421e+00,  3.51452739e+00,  5.77431263e+00,
        7.95792756e+00,  2.32228274e+01,  1.21087035e+01,  1.52313290e+01,
       -4.21695562e-01,  3.00869267e+01, -1.17655413e+01, -3.51032833e+00,
        2.16423179e+01, -1.91771485e+01,  2.45523269e+01,  4.88900651e+00,
        2.97188159e+01, -

In [19]:
W

array([1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,

In [20]:
tau

array([8., 0., 0., 8., 8., 0., 8., 8., 0., 8., 0., 0., 0., 0., 0., 8., 0.,
       0., 8., 0., 0., 0., 8., 8., 8., 8., 0., 0., 8., 8., 8., 8., 0., 8.,
       0., 8., 0., 0., 8., 0., 0., 0., 0., 8., 8., 0., 8., 0., 8., 0., 8.,
       0., 8., 0., 8., 0., 8., 8., 8., 8., 8., 8., 0., 0., 8., 8., 8., 8.,
       8., 8., 8., 8., 8., 0., 0., 0., 0., 0., 8., 0., 8., 8., 8., 8., 0.,
       0., 8., 0., 8., 8., 0., 8., 8., 0., 8., 8., 8., 0., 0., 0., 8., 8.,
       8., 0., 8., 0., 8., 8., 0., 0., 0., 8., 8., 0., 0., 0., 0., 8., 0.,
       8., 8., 0., 0., 8., 8., 0., 8., 8., 0., 8., 0., 0., 0., 8., 8., 8.,
       0., 8., 8., 8., 8., 8., 0., 0., 0., 0., 0., 8., 0., 8., 8., 0., 0.,
       8., 0., 8., 0., 0., 0., 8., 0., 0., 8., 0., 0., 8., 8., 8., 0., 0.,
       0., 8., 8., 0., 8., 0., 0., 8., 8., 0., 8., 0., 0., 8., 8., 0., 8.,
       0., 0., 0., 8., 8., 0., 8., 0., 0., 0., 0., 0., 8., 0., 8., 8., 0.,
       8., 0., 8., 8., 0., 8., 8., 0., 0., 8., 8., 8., 0., 8., 8., 8., 0.,
       8., 0., 0., 0., 8.

In [21]:
dataset = np.concatenate((np.reshape(Y,(N,1)),X,np.reshape(W,(N,1)),np.reshape(tau,(N,1))), axis = 1)
dataset

array([[  4.90773731,  -0.23949044,   0.20204604, ...,  -0.2747755 ,
          1.        ,   8.        ],
       [-15.36640597,   0.78597252,  -0.9352116 , ...,   0.14611905,
          1.        ,   0.        ],
       [  8.00767578,   1.09727696,  -0.96287228, ...,  -0.79127487,
          1.        ,   0.        ],
       ...,
       [ 14.96782871,   1.43721162,   0.05581602, ...,   0.72035854,
          0.        ,   0.        ],
       [  8.21160993,  -0.56825225,   0.78821679, ...,   0.23412143,
          0.        ,   8.        ],
       [ -4.20651388,  -0.79253074,   1.63656252, ...,   2.66129696,
          0.        ,   8.        ]])