In [1]:
# import packages
import numpy as np
from numpy import random
from scipy import stats
import jsonpickle

In [2]:
# import r packages if needed
import rpy2.robjects as robjects
import rpy2.robjects.packages as rpackages

# Load the R package "mpower" exactly once and keep the handle.
# (The package was previously imported twice, with the first result discarded.)
mpower = rpackages.importr("mpower")

# Handle to the embedded R interpreter, plus R's `set.seed` exposed as a Python
# callable so the R-side random number generator can be seeded from Python.
r = robjects.r
set_seed = r('set.seed')

In [3]:
# Seed both random number generators with the same value for reproducibility.
random.seed(8953)  # NumPy's global RNG (`random` is numpy.random here)
set_seed(8953)  # R's RNG, via the `set.seed` wrapper

In [4]:
# Simulation meta-parameters.
sample_sizes = [500, 1_000, 2_000, 5_000]  # sizes of the generated training sets
test_size = 1_000  # number of rows in every generated test set
n_runs = 10  # repetitions per setup (TODO: decide between 10 and 30)
n_setups = 18  # number of simulation settings (TODO: extend to 28?)
d = 20  # dimension of the covariate vector

In [5]:
# Propensity specification e_x for every setting, in three blocks of six.
exs = (
    [0.5] * 6  # settings 1-6: constant 0.5
    + [0.1] * 6  # settings 7-12: constant 0.1
    + ['beta_balanced'] * 6  # settings 13-18: covariate-dependent propensity
)

In [6]:
# Response-surface setting (mu_0 / mu_1) for every setup: the same six names,
# repeated three times in the same order.
cates = 3 * [
    'linear_response',
    'non_linear_response',
    'indicator_cate',
    'linear_cate',
    'complex_linear_cate',
    'complex_non_linear_cate',
]

In [7]:
def linear_response(x, betas):
    """Linear response surface shared by both arms (no treatment effect).

    Args:
        x: 2-D covariate array, one row per unit.
        betas: coefficient vector multiplied with ``x``.

    Returns:
        Tuple ``(mu_0, mu_1, tau)``. ``mu_1`` is the very same array object as
        ``mu_0`` and ``tau`` is a vector of zeros with one entry per row.
    """
    baseline = x @ betas + 5 * x[:, 0]
    no_effect = np.zeros(x.shape[0])
    return baseline, baseline, no_effect

In [8]:
# TODO: multiply by 5/10! (the factor currently used is 5)
def non_linear_response(x):
    """Non-linear response surface shared by both arms (no treatment effect).

    Args:
        x: 2-D covariate array; only the first two columns are used.

    Returns:
        Tuple ``(mu_0, mu_1, tau)``. ``mu_1`` is the same array object as
        ``mu_0`` and ``tau`` is a vector of zeros with one entry per row.
    """
    atan_first = np.arctan(x[:, 0])
    atan_second = np.arctan(x[:, 1])
    response = 5 * atan_first * atan_second
    return response, response, np.zeros(len(x))

In [9]:
def simple_indicator_cate(x, betas):
    """Linear baseline with indicator-based jumps in both response surfaces.

    Args:
        x: 2-D covariate array; columns 0 and 1 drive the indicators.
        betas: coefficient vector multiplied with ``x``.

    Returns:
        Tuple ``(mu_0, mu_1, tau)`` where ``mu_0`` adds 5 when ``x[:, 0] > 0.5``,
        ``mu_1`` additionally adds 8 when ``x[:, 1] > 0.1``, and
        ``tau = mu_1 - mu_0``.
    """
    first_above = (x[:, 0] > 0.5).astype(np.int8)
    second_above = (x[:, 1] > 0.1).astype(np.int8)

    control = x @ betas + 5 * first_above
    treated = control + 8 * second_above
    return control, treated, treated - control


In [10]:
def simple_linear_cate(x, betas):
    """Linear baseline with a treatment effect that is linear in ``x[:, 1]``.

    Args:
        x: 2-D covariate array; columns 0 and 1 are used explicitly.
        betas: coefficient vector multiplied with ``x``.

    Returns:
        Tuple ``(mu_0, mu_1, tau)`` with ``mu_1 = mu_0 + 4 * x[:, 1] + 2`` and
        ``tau = mu_1 - mu_0``.
    """
    control = x @ betas + 5 * x[:, 0]
    # TODO (open question carried over from the original): what to change here?
    treated = control + 4 * x[:, 1] + 2
    return control, treated, treated - control

In [11]:
def complex_linear_cate(x, betas_0, betas_1):
    """Two linear response surfaces with arm-specific coefficient vectors.

    Args:
        x: 2-D covariate array, one row per unit.
        betas_0: coefficient vector for the control surface.
        betas_1: coefficient vector for the treated surface.

    Returns:
        Tuple ``(mu_0, mu_1, tau)`` with ``tau = mu_1 - mu_0``. Both surfaces
        share the additive term ``5 * x[:, 0]``.
    """
    shared_term = 5 * x[:, 0]
    control = x @ betas_0 + shared_term
    treated = x @ betas_1 + shared_term
    return control, treated, treated - control

In [12]:
# TODO: also multiply to scale outcomes
# Helper: the scaled logistic function "varsigma".
def varsigma_function(x):
    """Return ``2 / (1 + exp(-12 * x))``, evaluated element-wise."""
    # TODO: an earlier version used (x - 0.5) inside the exponent.
    decay = np.exp(-12 * x)
    return 2 / (1 + decay)


def complex_non_linear_cate(x):
    """Non-linear response surfaces that are mirror images of each other.

    Args:
        x: 2-D covariate array; only the first two columns are used.

    Returns:
        Tuple ``(mu_0, mu_1, tau)`` where ``mu_1`` is twice the product of
        ``varsigma_function`` applied to columns 0 and 1, ``mu_0`` is the
        negative of that, and ``tau = mu_1 - mu_0``.
    """
    sig_first = varsigma_function(x[:, 0])
    sig_second = varsigma_function(x[:, 1])

    control = -2.0 * sig_first * sig_second
    treated = 2.0 * sig_first * sig_second
    return control, treated, treated - control


In [13]:
# Helper: covariate-dependent propensity score.
def beta_balanced(x):
    """Propensity score driven by the first covariate.

    The first column of ``x`` is passed through the standard normal CDF, the
    Beta(2, 4) density is evaluated at that value, and the result is mapped
    to ``1/4 * (1 + density)``.

    Args:
        x: 2-D covariate array; only column 0 is used.

    Returns:
        1-D array of propensity values, one per row of ``x``.
    """
    first_cov_cdf = stats.norm.cdf(x[:, 0])
    density = stats.beta(a=2, b=4).pdf(first_cov_cdf)
    return 0.25 * (1 + density)


In [25]:
# Scratch check: build one matrix with mpower's cvine and draw 1000 covariate rows from it.
# NOTE(review): this uses alpha=2, beta=5, whereas the generation loop further down uses
# alpha=0.5, beta=0.5. The draw below also consumes NumPy's global RNG state (and cvine
# presumably consumes R's) before the real data generation — confirm this is intended.
mean_X = np.zeros(d)
cov_X = np.array(mpower.cvine(d=d, alpha=2, beta=5))
X = random.multivariate_normal(mean=mean_X, cov=cov_X, size=1000, check_valid='warn')


In [26]:
# Bare expression: displays the d x d matrix built from mpower.cvine as the cell output.
cov_X

array([[ 1.        , -0.17926742, -0.18057796, -0.25210279, -0.53958952,
        -0.14421383, -0.66921598, -0.54357017, -0.24768811, -0.8346477 ,
        -0.6539114 , -0.37117647, -0.46377518,  0.36780179, -0.61991742,
        -0.26463648,  0.21988815, -0.33245588,  0.01431814, -0.73005548],
       [-0.17926742,  1.        , -0.78102906, -0.70803124, -0.65190981,
        -0.57152946, -0.33849812, -0.2791357 , -0.70146409,  0.07464598,
        -0.18406538, -0.60382024,  0.14452151, -0.45208512, -0.21724958,
        -0.1548258 , -0.18135583, -0.13330537, -0.7198524 ,  0.19315562],
       [-0.18057796, -0.78102906,  1.        ,  0.76478082,  0.70131493,
         0.34554073,  0.38110227,  0.12089723,  0.61575085,  0.05551625,
         0.55532167,  0.44064418, -0.14670786,  0.10383955,  0.3145191 ,
         0.15264743, -0.28783436, -0.03976055,  0.38970126, -0.22639847],
       [-0.25210279, -0.70803124,  0.76478082,  1.        ,  0.68413864,
         0.10593694,  0.32683538,  0.18119373,  

In [None]:
#  TODO: TRIPLE CHECK THIS FUNCTION, IT IS VERY IMPORTANT

def generate_data(mean, cov, ex, cate, sample_size, betas, betas_0, betas_1):
    """Simulate one data set for a given propensity / response-surface setting.

    Random numbers are drawn from NumPy's global RNG in a fixed order:
    covariates, noise for arm 0, noise for arm 1, treatment assignment.

    Args:
        mean: mean vector of the multivariate normal covariates.
        cov: covariance matrix of the covariates.
        ex: propensity specification. A real number (int or float, including
            NumPy scalars) is used as a constant treatment probability; the
            string 'beta_balanced' uses ``beta_balanced(x)``.
        cate: name of the response-surface setting, one of 'linear_response',
            'non_linear_response', 'indicator_cate', 'linear_cate',
            'complex_linear_cate', 'complex_non_linear_cate'.
        sample_size: number of rows to generate.
        betas: coefficient vector for the settings that use a single vector.
        betas_0: control-arm coefficients for 'complex_linear_cate'.
        betas_1: treated-arm coefficients for 'complex_linear_cate'.

    Returns:
        2-D array with ``sample_size`` rows and the columns
        ``[y, x_1, ..., x_d, w, tau]`` (observed outcome, covariates,
        treatment indicator, true treatment effect).

    Raises:
        NotImplementedError: if ``cate`` or ``ex`` is not a supported setting.
    """
    # 1: generate covariates x
    x = random.multivariate_normal(mean=mean, cov=cov, size=sample_size, check_valid='warn')

    # 2: generate standard-normal noise terms e_0 & e_1
    e_0 = random.normal(loc=0.0, scale=1.0, size=sample_size)
    e_1 = random.normal(loc=0.0, scale=1.0, size=sample_size)

    # 3: compute mu_0, mu_1 and tau --> based on setting
    if cate == 'linear_response':  # no treatment effect
        mu_0, mu_1, tau = linear_response(x, betas)
    elif cate == 'non_linear_response':  # no treatment effect
        mu_0, mu_1, tau = non_linear_response(x)
    elif cate == 'indicator_cate':
        mu_0, mu_1, tau = simple_indicator_cate(x, betas)
    elif cate == 'linear_cate':
        mu_0, mu_1, tau = simple_linear_cate(x, betas)
    elif cate == 'complex_linear_cate':
        mu_0, mu_1, tau = complex_linear_cate(x, betas_0, betas_1)
    elif cate == 'complex_non_linear_cate':
        mu_0, mu_1, tau = complex_non_linear_cate(x)
    else:
        raise NotImplementedError('No or incorrect setting specified.')

    # 4: create potential outcomes y_0 & y_1
    y_0 = mu_0 + e_0
    y_1 = mu_1 + e_1

    # 5: set propensity score e_x --> based on setting
    # `isinstance(ex, float or int)` only ever tested for float, because
    # `float or int` evaluates to `float`; a tuple of types checks all of them.
    if isinstance(ex, (int, float, np.integer, np.floating)):
        e_x = ex
    elif isinstance(ex, str) and ex == 'beta_balanced':
        e_x = beta_balanced(x)
    else:
        raise NotImplementedError('Propensity method not or incorrectly specified.')

    # 6: generate treatment assignment w (one Bernoulli draw per row)
    w = random.binomial(size=sample_size, n=1, p=e_x)

    # 7: observed outcome: y_1 for treated rows, y_0 for control rows
    y = w * y_1 + (1 - w) * y_0

    # 8: assemble the data set with columns [y, x, w, tau]
    dataset = np.column_stack((y, x, w, tau))

    # 9: return the single assembled data set
    return dataset


In [None]:
# Pre-allocate the nested container that will hold all data sets:
#   data[setup][run][size_index] == [train_slot, test_slot]
# The innermost level follows len(sample_sizes) instead of a hard-coded 4, and the
# comprehension variables do not leak, so the global `r` (R interface) is not overwritten.
data = [
    [
        [[[], []] for _ in sample_sizes]
        for _ in range(n_runs)
    ]
    for _ in range(n_setups)
]

In [None]:
# generate all datasets NEW

# mean vector of the covariates X
mean_X = np.zeros(d)

for setup in range(n_setups):
    print(f'Generating setup {setup + 1}')
    for run in range(n_runs):
        # One covariance matrix and one set of coefficient vectors per (setup, run);
        # they are reused for every sample size and for both the train and the test draw.
        cov_X = np.array(mpower.cvine(d=d, alpha=0.5, beta=0.5))
        betas_run = random.uniform(low=-1, high=1, size=d)
        betas_0_run = random.uniform(low=-0.5, high=0.5, size=d)
        betas_1_run = random.uniform(low=-0.5, high=0.5, size=d)
        shared_kwargs = dict(mean=mean_X, cov=cov_X, ex=exs[setup], cate=cates[setup],
                             betas=betas_run, betas_0=betas_0_run, betas_1=betas_1_run)
        for s, size in enumerate(sample_sizes):
            # The train set is drawn before the test set, which fixes the RNG draw order.
            train_set = generate_data(sample_size=size, **shared_kwargs)
            test_set = generate_data(sample_size=test_size, **shared_kwargs)
            data[setup][run][s][0] = train_set  # slot 0: train set
            data[setup][run][s][1] = test_set  # slot 1: test set
print('DONE')
# Author's note from an earlier run: it took 43 seconds and produced 4.4 GB of data.

In [None]:
# Peek at one generated array: setup index 1, run index 0, first sample size, slot 0 (the train set).
data[1][0][0][0]

In [None]:
import pandas as pd


In [None]:
# save as json
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
file_name = "/Users/arberimbibaj/Documents/Master Thesis ETH/DataSets /Generated/simulated_data.json"
# Encode before opening the file, so a failure during encoding cannot leave
# behind an empty or truncated output file.
json_obj = jsonpickle.encode(data)
# The context manager closes the file even if the write raises.
with open(file_name, 'w') as f:
    f.write(json_obj)