In [1]:
# import packages

import numpy as np
from numpy import random
from scipy import stats
# import pandas as pd
import copy
import matplotlib.pyplot as plt

# import r packages if needed

import rpy2.robjects as robjects
import rpy2.robjects.packages as rpackages

rpackages.importr("clusterGeneration")
rpackages.importr("mpower")
rpackages.importr("base")
rpackages.importr("utils")
cluster_generation = robjects.packages.importr("clusterGeneration")
mpower = robjects.packages.importr("mpower")
base = robjects.packages.importr("base")
utils = robjects.packages.importr("utils")

# set seed from R
r = robjects.r
set_seed = r('set.seed')

import pickle


In [None]:
 # Fully Synthetic Data Sets (Set up from Künzel et al.)

# 1: Simulate the d-dimensional X.
# 2: Create Potential Outcomes Y(1) and Y(0).
# 3: Simulate Treatment Assignments through W.

In [None]:
# TODO: if needed, use Student-t distributed errors to simulate heavy-tailed noise
# e_0 = random.standard_t(df=1, size=N)
# e_1 = random.standard_t(df=1, size=N)

# Start

In [2]:
# Seed both random number generators with the same value so a fresh
# "Restart & Run All" reproduces the generated data.
SEED = 8953
set_seed(SEED)  # R-side RNG (set.seed via rpy2)
random.seed(SEED)  # numpy global RNG

In [3]:
# SIMULATION SETUP: "META-PARAMETERS"
# Training-set sizes; one dataset is generated per size, setup and run.
sample_sizes = [500, 1_000, 2_000, 5_000]
# Number of rows in every generated test set.
test_size = 1_000
# Repetitions per (setup, sample size) combination.
n_runs = 1  # TODO: change to 10/30
# Number of (propensity, cate) setups to generate.
n_setups = 1  # TODO: change to 24
# Number of covariates (dimension of X).
d = 20

In [4]:
# all settings of e_x
# One propensity setting per setup index; exs[i] is paired with cates[i].
# cates repeats its 6 response settings 4 times (24 setups), so every
# propensity scheme must be repeated exactly 6 times to stay aligned with it.
# (The previous list repeated each scheme 7 times, which shifted the pairing
# from setup 6 onwards and left 'beta_unbalanced' with only 3 of 6 settings.)
_N_CATE_SETTINGS = 6
exs = [
    propensity
    for propensity in (0.5, 0.1, 'beta_balanced', 'beta_unbalanced')
    for _ in range(_N_CATE_SETTINGS)
]

In [5]:
# all settings of mu_0 / mu_1
# The six response-surface settings, cycled four times (24 entries in total).
cates = [
    setting
    for _ in range(4)
    for setting in (
        'linear_response',
        'non_linear_response',
        'indicator_cate',
        'linear_cate',
        'complex_linear_cate',
        'complex_non_linear_cate',
    )
]

### Cate Settings

1: linear response (zero cate)

In [6]:
def linear_response(x_train, x_test):
    """Linear response surface without any treatment effect.

    Draws one coefficient per covariate from Uniform(-5, 5) and uses the same
    coefficients for the train and the test set.

    Returns:
        (mu_0, mu_1, tau, mu_0_test, mu_1_test, tau_test); mu_1 is the same
        array as mu_0 and both tau arrays are all zeros.
    """
    coefficients = random.uniform(low=-5, high=5, size=x_train.shape[1])

    def _surface(x):
        # linear index plus an additional 5 * (first covariate) term
        return np.matmul(x, coefficients) + 5 * x[:, 0]

    control_train = _surface(x_train)
    control_test = _surface(x_test)

    # treated and control surfaces coincide, hence zero effect everywhere
    return (
        control_train,
        control_train,
        np.zeros(len(x_train)),
        control_test,
        control_test,
        np.zeros(len(x_test)),
    )

2: non-linear response (zero cate)

In [7]:
def non_linear_response(x_train, x_test):
    """Non-linear response surface without any treatment effect.

    The response is arctan(x[:, 0]) * arctan(x[:, 1]) for both treatment arms.

    Returns:
        (mu_0, mu_1, tau, mu_0_test, mu_1_test, tau_test); mu_1 is the same
        array as mu_0 and both tau arrays are all zeros.
    """

    def _surface(x):
        first, second = x[:, 0], x[:, 1]
        return np.arctan(first) * np.arctan(second)

    control_train = _surface(x_train)
    control_test = _surface(x_test)

    return (
        control_train,
        control_train,
        np.zeros(len(x_train)),
        control_test,
        control_test,
        np.zeros(len(x_test)),
    )

3: simple indicator cate

In [8]:
def simple_indicator_cate(x_train, x_test):
    """Linear baseline with step-function terms in the first two covariates.

    mu_0 is a random linear index (coefficients ~ Uniform(-5, 5)) plus
    5 * 1{x[:, 0] > 0.5}; mu_1 adds 8 * 1{x[:, 1] > 0.1} on top of mu_0.

    Returns:
        (mu_0, mu_1, tau, mu_0_test, mu_1_test, tau_test) with tau = mu_1 - mu_0.
    """
    coefficients = random.uniform(low=-5, high=5, size=x_train.shape[1])

    def _surfaces(x):
        control = np.matmul(x, coefficients) + 5 * np.int8(x[:, 0] > 0.5)
        treated = control + 8 * np.int8(x[:, 1] > 0.1)
        return control, treated, treated - control

    mu_0, mu_1, tau = _surfaces(x_train)
    mu_0_test, mu_1_test, tau_test = _surfaces(x_test)

    return mu_0, mu_1, tau, mu_0_test, mu_1_test, tau_test


4: simple linear cate

In [9]:
def simple_linear_cate(x_train, x_test):
    """Linear baseline with a treatment effect linear in the second covariate.

    mu_0 is a random linear index (coefficients ~ Uniform(-5, 5)) plus
    5 * x[:, 0]; mu_1 adds 8 * x[:, 1] on top of mu_0.

    Returns:
        (mu_0, mu_1, tau, mu_0_test, mu_1_test, tau_test) with tau = mu_1 - mu_0.
    """
    coefficients = random.uniform(low=-5, high=5, size=x_train.shape[1])

    def _surfaces(x):
        control = np.matmul(x, coefficients) + 5 * x[:, 0]
        treated = control + 8 * x[:, 1]
        return control, treated, treated - control

    mu_0, mu_1, tau = _surfaces(x_train)
    mu_0_test, mu_1_test, tau_test = _surfaces(x_test)

    return mu_0, mu_1, tau, mu_0_test, mu_1_test, tau_test

5: complex linear cate

In [10]:
def complex_linear_cate(x_train, x_test):
    """Two independent linear response surfaces, one per treatment arm.

    Control and treated coefficients are drawn separately from Uniform(-5, 5)
    (control first, then treated); both surfaces add 5 * x[:, 0].

    Returns:
        (mu_0, mu_1, tau, mu_0_test, mu_1_test, tau_test) with tau = mu_1 - mu_0.
    """
    n_features = x_train.shape[1]
    # keep the draw order: control coefficients first, treated second
    coef_control = random.uniform(low=-5, high=5, size=n_features)
    coef_treated = random.uniform(low=-5, high=5, size=n_features)

    def _surfaces(x):
        shift = 5 * x[:, 0]
        control = np.matmul(x, coef_control) + shift
        treated = np.matmul(x, coef_treated) + shift
        return control, treated, treated - control

    mu_0, mu_1, tau = _surfaces(x_train)
    mu_0_test, mu_1_test, tau_test = _surfaces(x_test)

    return mu_0, mu_1, tau, mu_0_test, mu_1_test, tau_test

6: complex non-linear cate

In [11]:
# helper function varsigma
def varsigma_function(x):
    """Return 2 / (1 + exp(-12 * (x - 1/2))), evaluated element-wise via numpy."""
    exponent = -12 * (x - 1 / 2)
    return 2 / (1 + np.exp(exponent))

In [12]:
def complex_non_linear_cate(x_train, x_test):
    """Non-linear response surfaces built from varsigma of the first two covariates.

    mu_0 = -1/2 * varsigma(x[:, 0]) * varsigma(x[:, 1]) and
    mu_1 = +1/2 * varsigma(x[:, 0]) * varsigma(x[:, 1]).

    Returns:
        (mu_0, mu_1, tau, mu_0_test, mu_1_test, tau_test) with tau = mu_1 - mu_0.
    """

    def _surfaces(x):
        first = varsigma_function(x[:, 0])
        second = varsigma_function(x[:, 1])
        control = -1 / 2 * first * second
        treated = 1 / 2 * first * second
        return control, treated, treated - control

    mu_0, mu_1, tau = _surfaces(x_train)
    mu_0_test, mu_1_test, tau_test = _surfaces(x_test)

    return mu_0, mu_1, tau, mu_0_test, mu_1_test, tau_test

### Propensity settings

In [13]:
# helper function
def beta_balanced(x_train, x_test):
    """Propensity scores that depend on the first covariate only.

    The first covariate is mapped through the standard normal CDF, the
    Beta(2, 4) density is evaluated at that value, and the score is
    1/4 * (1 + density).

    Returns:
        (e_x, e_x_test): one score per row of x_train and x_test respectively.
    """
    beta_pdf = stats.beta(a=2, b=4).pdf

    def _propensity(x):
        uniformised = stats.norm.cdf(x[:, 0])
        return 1 / 4 * (1 + beta_pdf(uniformised))

    return _propensity(x_train), _propensity(x_test)


In [14]:
def beta_unbalanced(x_train, x_test):
    """Propensity scores that depend on the first covariate only, scaled by 1/40.

    The first covariate is mapped through the standard normal CDF, the
    Beta(2, 4) density is evaluated at that value, and the score is
    1/40 * (1 + density).

    Returns:
        (e_x, e_x_test): one score per row of x_train and x_test respectively.
    """
    beta_pdf = stats.beta(a=2, b=4).pdf

    def _propensity(x):
        uniformised = stats.norm.cdf(x[:, 0])
        return 1 / 40 * (1 + beta_pdf(uniformised))

    return _propensity(x_train), _propensity(x_test)

### generate train/test set function

In [15]:
# TODO: TRIPLE CHECK THIS FUNCTION, IT IS VERY IMPORTANT
def generate(ex, cate, sample_size, testing_size):
    """Generate one synthetic train set and one synthetic test set.

    Args:
        ex: propensity setting. A number is used as a constant treatment
            probability; 'beta_balanced' / 'beta_unbalanced' select the
            covariate-dependent propensity functions of the same name.
        cate: name of the response-surface setting, one of 'linear_response',
            'non_linear_response', 'indicator_cate', 'linear_cate',
            'complex_linear_cate', 'complex_non_linear_cate'.
        sample_size: number of rows in the train set.
        testing_size: number of rows in the test set.

    Returns:
        [dataset_train, dataset_test]. Each is a 2-D array whose columns are
        the observed outcome y, the d covariates, the treatment indicator w
        and the true effect tau (in that order).

    Raises:
        NotImplementedError: if `cate` or `ex` is not a known setting.

    Note:
        Reads the module-level `d` (number of covariates) and draws from both
        the numpy global RNG and, through mpower.cvine, the R RNG.
    """
    # 1: generate x_train & x_test
    mean = np.zeros(d)
    cov = np.array(mpower.cvine(d=d, alpha=0.5, beta=0.5))  # TODO: set cov according to setup
    x_train = random.multivariate_normal(mean=mean, cov=cov, size=sample_size, check_valid='warn')
    # Size the test covariates by the `testing_size` argument. The global
    # `test_size` was used here before, which broke every later step whenever
    # the two values differed.
    x_test = random.multivariate_normal(mean=mean, cov=cov, size=testing_size, check_valid='warn')

    # 2: generate e_0 & e_1
    # train
    e_0 = random.normal(loc=0.0, scale=1.0, size=sample_size)
    e_1 = random.normal(loc=0.0, scale=1.0, size=sample_size)
    # test
    e_0_test = random.normal(loc=0.0, scale=1.0, size=testing_size)
    e_1_test = random.normal(loc=0.0, scale=1.0, size=testing_size)

    # 3: compute mu_0 & mu_1 --> based on setting
    response_settings = {
        'linear_response': linear_response,  # no treatment effect
        'non_linear_response': non_linear_response,  # no treatment effect
        'indicator_cate': simple_indicator_cate,
        'linear_cate': simple_linear_cate,
        'complex_linear_cate': complex_linear_cate,
        'complex_non_linear_cate': complex_non_linear_cate,
    }
    if cate not in response_settings:
        raise NotImplementedError('No or incorrect setting specified.')
    mu_0, mu_1, tau, mu_0_test, mu_1_test, tau_test = response_settings[cate](x_train, x_test)

    # 4: create potential outcomes y_0 & y_1
    # train
    y_0 = mu_0 + e_0
    y_1 = mu_1 + e_1
    # test
    y_0_test = mu_0_test + e_0_test
    y_1_test = mu_1_test + e_1_test

    # 5: Set propensity score e_x --> based on setting
    # `isinstance(ex, float or int)` only ever tested for float because
    # `float or int` evaluates to `float`; a tuple accepts integers as well.
    if isinstance(ex, (int, float, np.integer, np.floating)):
        e_x = ex
        e_x_test = ex

    elif ex == 'beta_balanced':
        e_x, e_x_test = beta_balanced(x_train, x_test)

    elif ex == 'beta_unbalanced':
        e_x, e_x_test = beta_unbalanced(x_train, x_test)

    else:
        raise NotImplementedError('Propensity method not or incorrectly specified.')

    # 6: Generate treatment assignment W
    w = random.binomial(size=sample_size, n=1, p=e_x)
    w_test = random.binomial(size=testing_size, n=1, p=e_x_test)

    # 7: Create observed variables Y (y_1 where treated, y_0 otherwise)
    y = w * y_1 + (1 - w) * y_0
    y_test = w_test * y_1_test + (1 - w_test) * y_0_test

    # 8: Create train & test sets with columns [y, x_1..x_d, w, tau]
    dataset_train = np.column_stack((y, x_train, w, tau))
    dataset_test = np.column_stack((y_test, x_test, w_test, tau_test))

    # 9: Return both sets
    return [dataset_train, dataset_test]


empty list

In [16]:
# create empty nested list to save all datasets
# Layout: data[setup][sample-size index][run] == [train slot, test slot]
# The second level follows len(sample_sizes) instead of a hard-coded 4, and the
# comprehension keeps loop variables out of the notebook namespace (the old
# loops rebound `r`, which the import cell uses for `robjects.r`).
data = [
    [
        [[[] for _ in range(2)] for _ in range(n_runs)]
        for _ in range(len(sample_sizes))
    ]
    for _ in range(n_setups)
]

### generate all

In [17]:
# generate all datasets
# Fill data[setup][sample-size index][run] with the [train, test] pair
# produced by generate() for every setup, sample size and run.
for setup in range(n_setups):
    print(f'Generating setup {setup + 1}')
    for size_idx, size in enumerate(sample_sizes):
        for run in range(n_runs):
            train_set, test_set = generate(ex=exs[setup], cate=cates[setup], sample_size=size,
                                           testing_size=test_size)
            slot = data[setup][size_idx][run]
            slot[0] = train_set  # add train-set
            slot[1] = test_set  # add test-set
print('DONE')
# it took 43 seconds # 4.4 GB of data!

Generating setup 1
DONE


In [29]:
# Show the train set (last index 0) of setup 0, sample-size index 3, run 0.
data[0][3][0][0]

array([[-2.10127177e+01,  9.86474306e-01, -4.92410122e-01, ...,
         1.07541031e+00,  0.00000000e+00,  0.00000000e+00],
       [-1.25542289e+01,  1.38759213e+00, -5.00854638e-01, ...,
         2.55792858e-01,  0.00000000e+00,  0.00000000e+00],
       [ 3.14445010e+01, -2.07217467e+00,  1.90524147e+00, ...,
        -1.91588089e-01,  0.00000000e+00,  0.00000000e+00],
       ...,
       [-3.20361714e+01,  1.79728689e+00, -2.70657219e+00, ...,
        -9.40858061e-01,  1.00000000e+00,  0.00000000e+00],
       [-2.47035219e+01,  1.81272006e+00, -2.20742297e+00, ...,
         1.31739761e-01,  0.00000000e+00,  0.00000000e+00],
       [-4.88487478e+00, -6.87199711e-01, -9.47270940e-03, ...,
        -1.01182862e+00,  1.00000000e+00,  0.00000000e+00]])

In [None]:
# 0.476460 -> reproducible :)

# Save data with pickle

In [30]:
# save data
file_name = "/Users/arberimbibaj/Documents/Master Thesis ETH/DataSets /Generated/data_synthetic_one_run.pkl"

# The context manager closes the file even if pickling raises part-way through.
with open(file_name, "wb") as open_file:
    pickle.dump(data, open_file)

In [31]:
# Reload the pickled datasets to check that the round trip works.
file_name = "/Users/arberimbibaj/Documents/Master Thesis ETH/DataSets /Generated/data_synthetic_one_run.pkl"
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
# The context manager closes the file even if unpickling raises.
with open(file_name, "rb") as open_file:
    loaded_data = pickle.load(open_file)

loaded_data[0][0][0][0]

array([[ 0.01167139,  1.13035656, -0.24136053, ..., -0.01064836,
         1.        ,  0.        ],
       [ 3.17020969,  0.22015068,  0.04529537, ...,  0.29755877,
         1.        ,  0.        ],
       [ 2.61769972,  0.37887061, -0.14322619, ...,  0.01384532,
         0.        ,  0.        ],
       ...,
       [ 9.13235505, -0.28882811, -0.20802326, ...,  0.01642092,
         1.        ,  0.        ],
       [-9.11141269, -1.15419441,  1.81805704, ...,  1.56209178,
         1.        ,  0.        ],
       [ 0.80415229,  0.12075425, -0.41938106, ..., -0.27756771,
         0.        ,  0.        ]])

In [None]:
# TODO: CHANGE THIS BACK TO WHAT IT WAS, SO NO CRASHES CAN HAPPEN.
# Current values:
# NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
# NotebookApp.rate_limit_window=3.0 (secs)