In [52]:
# import packages

import numpy as np
from numpy import random
from scipy import stats
import pandas as pd
import copy
import matplotlib.pyplot as plt

# import r packages if needed

import rpy2.robjects as robjects
import rpy2.robjects.packages as rpackages
rpackages.importr("clusterGeneration")
rpackages.importr("mpower")
rpackages.importr("base")
rpackages.importr("utils")
cluster_generation = robjects.packages.importr("clusterGeneration")
mpower = robjects.packages.importr("mpower")
base = robjects.packages.importr("base")
utils = robjects.packages.importr("utils")
from numpy import load


In [5]:
# SIMULATION SETUP: "META-PARAMETERS"

# Candidate sample sizes for the simulated data sets.
sample_sizes = [1_000, 2_000, 5_000, 10_000]

# Candidate test-set sizes. TODO: decide on the final values.
test_sizes = [500, 1_000]

# Candidate numbers of runs, i.e. how many times all setups are generated and
# all meta-learners are run on them. TODO: decide on the final value.
runs = [10, 30]

In [67]:
# SIMULATION SETUP: SETUPS

# i)            Balanced vs. Unbalanced
# ii)           Confounding vs. No confounding

# iii) A)       Treatment effect: --> Simple cate vs. Complex cate
#                                    a) Complex cate --> Linear vs Non-linear cate

# iii) B)       No effect: --> a) Global linear response vs. Piecewise linear response (globally non-linear)



In [68]:
 # Fully Synthetic Data Sets (Set up from Künzel et al.)

# 1: Simulate the d-dimensional X.
# 2: Create Potential Outcomes Y(1) and Y(0).
# 3: Simulate Treatment Assignments through W.

In [69]:
########################################################

In [81]:
# 1: Simulate the d-dimensional X
## TODO: probably has to be done in R, since no set.seed() is possible here!

# Dimensions of the feature matrix
d = 25    # TODO: set dimension according to setup
N = 1000  # TODO: set sample size according to setup

# Mean vector of X, and the matrix returned by mpower's cvine() used as its covariance
mean = np.zeros(shape=d)  # TODO: set mean according to setup
cvine_matrix = mpower.cvine(d=d, alpha=0.5, beta=0.5)  # TODO: set cov according to setup
cov = np.array(cvine_matrix)

# Draw N rows of X from the multivariate normal with the mean/cov above
X = random.multivariate_normal(mean, cov, size=N, check_valid='warn')

In [115]:
## EXAMPLE!

# Generate 10 runs of one setup.
# The whole data generation procedure lives in this loop FOR EACH SETUP!!
d = 25  # TODO: set dimension according to setup
N = 1000  # TODO: set sample size according to setup
mean = np.zeros(d)  # TODO: set mean according to setup
n_example_runs = 10
data = []

for i in range(n_example_runs):

    # 1 GENERATE FEATURES X (a fresh cvine matrix is requested in every run)
    cov = np.array(mpower.cvine(d=d, alpha=0.5, beta=0.5))  # TODO: set cov according to setup
    X = random.multivariate_normal(mean, cov, size=N, check_valid='warn')

    # 2 GENERATE ERRORS E_0 & E_1 (standard normal, one value per unit)
    e_0 = random.normal(0.0, 1.0, N)
    e_1 = random.normal(0.0, 1.0, N)

    # 3 COMPUTE MU_0 & MU_1 (SIMPLE CATE, LINEAR, NO CONFOUNDING)
    betas_0 = random.uniform(-5, 5, d)
    mu_0 = X @ betas_0 + 5 * X[:, 0]
    mu_1 = mu_0 + 8 * X[:, 1]  # linear
    tau = mu_1 - mu_0

    # 4 CREATE POTENTIAL OUTCOMES Y_0 & Y_1
    Y_0 = mu_0 + e_0
    Y_1 = mu_1 + e_1

    # 5 SET PROPENSITY SCORE E_X (CONSTANT, BALANCED)
    e_x = 0.5

    # 6 GENERATE TREATMENT ASSIGNMENT W (one Bernoulli(e_x) draw per unit)
    W = random.binomial(n=1, p=e_x, size=N)

    # 7 CREATE OBSERVED VARIABLES Y: Y_1 where W == 1, Y_0 where W == 0
    Y = W * Y_1 + (np.ones(N) - W) * Y_0

    # 8 CREATE DATASET with columns [Y, X_1..X_d, W, tau]
    dataset = np.column_stack((Y, X, W, tau))

    # 9 APPEND RUN TO LIST
    data.append(dataset)

In [155]:
# Report the size of the generated collection; every entry of `data` is a 2-D array.
n_rows, n_cols = data[0].shape
print(len(data))  # nr of datasets
print(n_rows)  # rows
print(n_cols)  # columns

10
1000
28


In [140]:
# Check X
X

array([[ 1.59608404, -0.29933383,  1.59713617, ...,  1.53098193,
        -0.18772674, -2.3723548 ],
       [ 1.1423447 ,  1.16866093,  1.11157359, ...,  1.16514249,
         1.01378475, -0.8360739 ],
       [-0.3193583 ,  0.62500306, -0.41479481, ..., -0.2886151 ,
         0.9787185 ,  1.07939365],
       ...,
       [-0.36430581,  0.77298921, -0.18094212, ..., -0.28829545,
         0.02839832,  1.14943272],
       [-1.12286371, -1.27647492, -1.33435966, ..., -1.18377495,
        -0.01305518,  0.22649884],
       [-1.37387576, -1.79418329, -0.9688096 , ..., -1.37531261,
        -2.34722033,  0.71357282]])

In [None]:
# Check cov
cov

In [None]:
#######################################################

In [None]:
# 2: Create Potential Outcomes Y(1) and Y(0).

# 2.1 Simulate errors, FIX
# Two consecutive draws of N standard-normal errors: first e_0, then e_1.
e_0, e_1 = (random.normal(loc=0.0, scale=1.0, size=N) for _ in range(2))

# if needed, use student-t distributed errors to run the simulation with heavy-tailed errors

# e_0 = random.standard_t(df=1,size=N)
# e_1 = random.standard_t(df=1,size=N)

In [None]:
#######################################################

In [None]:
# 2.2 Create Response Functions

In [None]:
# SI1.1 simple cate - indicator  (no confounding)
# Baseline: random linear response plus a step of 5 where X_1 > 0.5.
betas_0 = random.uniform(-5, 5, d)
step_x0 = (X[:, 0] > 0.5).astype(np.int8)
step_x1 = (X[:, 1] > 0.1).astype(np.int8)
mu_0 = X @ betas_0 + 5 * step_x0
mu_1 = mu_0 + 8 * step_x1  # indicator
tau = mu_1 - mu_0
tau

In [None]:
# SI1.2 simple cate - linear (no confounding)
# Baseline: random linear response plus 5 * X_1; treatment adds 8 * X_2.
betas_0 = random.uniform(-5, 5, d)
mu_0 = X @ betas_0 + 5 * X[:, 0]
mu_1 = mu_0 + 8 * X[:, 1]  # linear
tau = mu_1 - mu_0
tau

In [None]:
# SI1.3 simple cate - quadratic (no confounding)
# Baseline: random linear response plus 5 * X_1^2; treatment adds 8 * X_2^2.
betas_0 = random.uniform(-5, 5, d)
mu_0 = X @ betas_0 + 5 * (X[:, 0] ** 2)
mu_1 = mu_0 + 8 * (X[:, 1] ** 2)  # quadratic
tau = mu_1 - mu_0
tau

In [None]:
#############################################

In [None]:
# SI2 complex linear cate  (in Künzel it is low=1, high=30)
# Separate random coefficient vectors for control (betas_0) and treatment (betas_1);
# betas_0 is drawn first, then betas_1.
betas_0 = random.uniform(-15, 15, d)
betas_1 = random.uniform(-15, 15, d)
mu_0 = X @ betas_0
mu_1 = X @ betas_1
tau = mu_1 - mu_0

In [None]:
###############################################

In [None]:
def varsigma_funct(x):
    """Scaled logistic function 2 / (1 + exp(-12 * (x - 1/2))).

    Parameters
    ----------
    x : float or array-like supporting arithmetic with scalars
        Point(s) at which to evaluate the function.

    Returns
    -------
    Same shape as ``x``; values lie in [0, 2], with value 1 at x = 1/2.

    Notes
    -----
    The textbook form overflows in ``np.exp`` for strongly negative ``x``
    (RuntimeWarning, or an error under ``np.errstate(over='raise')``).
    Writing it as 2 * exp(-log(1 + exp(-z))) with ``np.logaddexp`` gives the
    same function without overflow.
    """
    z = 12 * (x - 1/2)
    # logaddexp(0, -z) == log(1 + exp(-z)), evaluated without overflow
    return 2 * np.exp(-np.logaddexp(0, -z))

In [None]:
# SI3 complex non-linear
# Both response surfaces are +/- 1/2 times the product of varsigma_funct
# evaluated on the first two features.
sigma_x0 = varsigma_funct(x=X[:, 0])
sigma_x1 = varsigma_funct(x=X[:, 1])
mu_0 = -1/2 * sigma_x0 * sigma_x1
mu_1 = 1/2 * sigma_x0 * sigma_x1
tau = mu_1 - mu_0

In [None]:
#############################################

In [None]:
# SI4 no treatment effect (global linear response)
betas_noT = random.uniform(-15, 15, d)
# Treated and control response are the very same array, so tau is identically zero.
mu_0 = mu_1 = X @ betas_noT
tau = np.zeros(N)

In [None]:
###########################################

In [None]:
# SI5 no treatment effect (piecewise linear response)
betas = random.uniform(low=-15, high=15, size=d)

# Each regime keeps a different block of five coefficients and zeroes the rest.
coef_index = np.arange(d)

# betas_lower: keeps coefficients 0-4
betas_l = np.where(coef_index < 5, betas, 0.0)

# betas_middle: keeps coefficients 5-9
betas_m = np.where((coef_index >= 5) & (coef_index < 10), betas, 0.0)

# betas_upper: keeps coefficients 10-14
betas_u = np.where((coef_index >= 10) & (coef_index < 15), betas, 0.0)

def piecewise_linear_new(x, betas_lower=None, betas_middle=None, betas_upper=None,
                         split_col=19, lower=-0.4, upper=0.4):
    """Piecewise-linear response whose coefficient vector depends on one feature.

    Rows are assigned to a regime by the value in column ``split_col``:
    below ``lower`` -> lower coefficients, above ``upper`` -> upper
    coefficients, everything else -> middle coefficients.

    Parameters
    ----------
    x : np.ndarray, shape (n, p)
        Feature matrix; must have more than ``split_col`` columns.
    betas_lower, betas_middle, betas_upper : np.ndarray of length p, optional
        Coefficient vectors per regime. When omitted, the notebook-level
        ``betas_l`` / ``betas_m`` / ``betas_u`` are used (original behaviour).
    split_col : int, default 19
        Column of ``x`` that selects the regime.
    lower, upper : float, default -0.4 / 0.4
        Regime thresholds; expected to satisfy ``lower <= upper``.

    Returns
    -------
    np.ndarray, shape (n,)
        Response value per row of ``x``.
    """
    b_lower = betas_l if betas_lower is None else betas_lower
    b_middle = betas_m if betas_middle is None else betas_middle
    b_upper = betas_u if betas_upper is None else betas_upper

    is_lower = x[:, split_col] < lower
    is_upper = x[:, split_col] > upper
    is_middle = ~is_lower & ~is_upper

    # Size the output from x itself rather than from the global N, so the
    # function also works on inputs with a different number of rows.
    response = np.zeros(x.shape[0])
    response[is_lower] = np.matmul(x[is_lower, :], b_lower)
    response[is_middle] = np.matmul(x[is_middle, :], b_middle)
    response[is_upper] = np.matmul(x[is_upper, :], b_upper)

    return response

# No treatment effect: treated and control response are the very same array.
mu_0 = mu_1 = piecewise_linear_new(X)

# Hence the true effect is zero for every unit.
tau = np.zeros(N)

# TODO: CHECK IF RIGHT

In [None]:
##########################################

In [None]:
# SI6.1 beta confounded, no treatment effect
# Features are uniform on [0, 1) in this setup.
X = random.uniform(0, 1, (N, d))  # CAUTION: do not repeat!
# Treated and control response are the very same array, so tau is identically zero.
mu_0 = mu_1 = 2 * X[:, 0] - 1
tau = np.zeros(N)


In [None]:
# SI6.2 beta confounded, simple cate (indicator)
X = random.uniform(0, 1, (N, d))
mu_0 = 2 * X[:, 0] - 1
# Treatment adds 2 wherever the second feature exceeds 0.4.
mu_1 = mu_0 + 2 * (X[:, 1] > 0.4).astype(np.int8)
tau = mu_1 - mu_0
tau

In [None]:
# SI6.3 beta confounded, simple cate (linear)
X = random.uniform(0, 1, (N, d))
mu_0 = 2 * X[:, 0] - 1
# Treatment adds twice the second feature.
mu_1 = mu_0 + 2 * X[:, 1]
tau = mu_1 - mu_0


In [None]:
# SI6.4 beta confounded, simple cate (quadratic)
X = random.uniform(0, 1, (N, d))
mu_0 = 2 * X[:, 0] - 1
# Treatment adds twice the squared second feature.
mu_1 = mu_0 + 2 * (X[:, 1] ** 2)
tau = mu_1 - mu_0
tau

In [None]:
##################################

In [None]:
# SI7.1 beta confounded, complex cate (linear)
# NOTE: reuses the X left behind by an earlier cell; `betas` is overwritten here.
betas = random.uniform(-15, 15, d)

mu_0 = 2 * X[:, 0] - 1
mu_1 = mu_0 + X @ betas
tau = mu_1 - mu_0

# TODO: CHECK WHETHER THIS SETUP MAKES SENSE

In [None]:
# SI7.2 beta confounded, complex cate (non-linear)
# NOTE: reuses the X left behind by an earlier cell.
mu_0 = 2*X[:,0] - 1
mu_1 = mu_0 + 1/2*varsigma_funct(x=X[:,0])*varsigma_funct(x=X[:,1])
# tau must be recomputed for this setup; otherwise it keeps the value of
# whichever setup cell ran before and the stored true effect would not
# match mu_1 - mu_0.
tau = mu_1 - mu_0

# TODO: CHECK WHETHER THIS SETUP MAKES SENSE

In [None]:
# TODO: check relationship between mu_0 & mu_1 and see whether this makes sense
# All check values are built from check_values_0 (the SI7 baseline) rather than
# from the global mu_0, so the plot does not depend on which setup cell ran last.
check_values_0 = 2*X[:,0] - 1
check_values_1 = check_values_0 + 1/2*varsigma_funct(x=X[:,0])*varsigma_funct(x=X[:,1])  # SI7.2 treated response
check_values_2 = check_values_0 + np.matmul(X,betas)  # SI7.1 treated response (not plotted)

fig, ax = plt.subplots()
ax.plot(check_values_0, check_values_1, 'bo')
ax.set_xlabel("mu_0 = 2*X[:,0] - 1")
ax.set_ylabel("mu_1 (SI7.2, non-linear cate)")
ax.set_title("mu_1 vs. mu_0")
plt.show()

In [None]:
###################################
# 2.3 Create Potential Outcomes, FIX
# Potential outcome = response surface + error, element-wise per unit.
Y_0 = np.add(mu_0, e_0)
Y_1 = np.add(mu_1, e_1)

In [None]:
# quick check Y_0
Y_0

In [None]:
# Quick Check Y_1
Y_1

In [None]:
###################################
# 3.1 Propensity score setups

# Constant propensity scores:
#   i)  balanced   -> 0.5
#   ii) unbalanced -> 0.01
CONSTANT_PROPENSITIES = {"balanced": 0.5, "unbalanced": 0.01}

# Previously both values were assigned to e_x one after the other, so the
# first assignment was dead and the cell always ended with the unbalanced
# value. The choice is now explicit; the default keeps that result.
propensity_setup = "unbalanced"  # TODO: change for setup
e_x = CONSTANT_PROPENSITIES[propensity_setup]

In [None]:
#################################

In [None]:
# iii) beta confounded (balanced)
# X is deliberately NOT re-drawn here. mu_0 / mu_1 were computed from the
# existing X, so drawing a new X at this point would decouple the outcomes
# from both the propensity score and the features stored in the dataset.
# Expects the uniform-[0, 1) X of the SI6/SI7 setups (the Beta pdf is zero
# outside [0, 1]).
beta_dist = stats.beta(a=2, b=4) # set beta distribution
beta_values = beta_dist.pdf(X[:,0]) # calculate pdf values for x1
e_x = 1/4*(1+beta_values)

pd.DataFrame(e_x).describe() # summary stats of e_x

In [None]:
# check if balanced
# One Bernoulli draw per unit with success probability e_x; the mean of W
# in the summary is the realised treated share.
W = random.binomial(n=1, p=e_x, size=N)
w_summary = pd.DataFrame(W).describe()
w_summary

In [None]:
#####################################

In [None]:
# iv) beta confounded (unbalanced)
# X is deliberately NOT re-drawn here. mu_0 / mu_1 were computed from the
# existing X, so drawing a new X at this point would decouple the outcomes
# from both the propensity score and the features stored in the dataset.
# Expects the uniform-[0, 1) X of the SI6/SI7 setups (the Beta pdf is zero
# outside [0, 1]).
beta_dist = stats.beta(a=1, b=10) # set beta distribution
beta_values = beta_dist.pdf(X[:,0]) # calculate pdf values for x1
e_x = 1/100*(1+beta_values)

pd.DataFrame(e_x).describe() # summary stats of e_x

In [None]:
# check if unbalanced and how much
# One Bernoulli draw per unit with success probability e_x; the mean of W
# in the summary is the realised treated share.
W = random.binomial(n=1, p=e_x, size=N)
w_summary = pd.DataFrame(W).describe()
w_summary

In [None]:
##########################################

In [None]:
X

In [None]:
###################################

In [None]:
# 3.2 Simulate Treatment Assignments through W

# Simulate Treatment Assignment, FIX
# One Bernoulli(e_x) draw per unit.
W = random.binomial(n=1, p=e_x, size=N)

# Create Observed Outcome, FIX
# Y equals Y_1 for treated units (W == 1) and Y_0 for control units (W == 0).
ones = np.ones(N)
Y = W * Y_1 + (ones - W) * Y_0

In [None]:
# summary stats of e_x
# A constant propensity is a bare scalar, which pd.DataFrame() rejects;
# broadcasting to one value per unit handles both the scalar and the
# per-unit array case.
pd.DataFrame(np.broadcast_to(e_x, (N,))).describe()

In [None]:
# check out observed outcomes
Y

In [None]:
pd.DataFrame(Y).describe() # summary stats of Y

In [None]:
X

In [None]:
Y

In [None]:
W

In [None]:
tau

In [None]:
# Assemble one data set with columns [Y, X_1..X_d, W, tau]
# (the 1-D arrays Y, W and tau become single columns).
dataset = np.column_stack((Y, X, W, tau))
dataset

In [None]:
# Add the assembled data set to the list of runs
# (was `data.apppend`, which raises AttributeError).
data.append(dataset)

# IHDP dataset from Fredjo

In [None]:
# 100 realisations of IHDP train set (672 units)
# NOTE: absolute, machine-specific path; the file must exist at this location.
ihdp_train_path = '/Users/arberimbibaj/Downloads/ihdp_npci_1-100.train.npz'
ihdp_train = load(ihdp_train_path)
# Names of the arrays stored in the archive
files_train = ihdp_train.files
files_train

In [None]:
# 100 realisations of IHDP test set (72 units)
# NOTE: absolute, machine-specific path; the file must exist at this location.
ihdp_test_path = '/Users/arberimbibaj/Downloads/ihdp_npci_1-100.test.npz'
ihdp_test = load(ihdp_test_path)
# Names of the arrays stored in the archive
files_test = ihdp_test.files
files_test

In [None]:
# for example
ihdp_train['mu1']

In [None]:
# cate per realisation
ihdp_train['mu1'] - ihdp_train['mu0']