In [1]:
# import packages

import numpy as np
from numpy import random
from numpy import load

from scipy import stats
import pandas as pd
import copy
import matplotlib.pyplot as plt



# IHDP Dataset (from Fredjo.com)

In [None]:
# IHDP-100 (train) and IHDP-100 (test) are 100 realisations of the IHDP dataset
# as used in Shalit, Johansson, Sontag, ICML 2017.


# Variables in the .npz files, in the order x, t, yf, ycf, mu0, mu1:
# covariates, treatment, factual outcome, counterfactual outcome, and the
# noiseless potential outcomes (mu0, mu1), respectively.

# Number of realisations; used as the loop bound by the processing cells.
ihdp_realisations = 100

In [2]:
# 100 realisations of the IHDP train set (672 units).
# Loading a .npz gives an archive object; the individual arrays are read by key.
ihdp_train = load('/Users/arberimbibaj/Downloads/ihdp_npci_1-100.train.npz')
# names of the arrays stored in the archive
files_train = ihdp_train.files
files_train

['ate', 'mu1', 'mu0', 'yadd', 'yf', 'ycf', 't', 'x', 'ymul']

In [None]:
# 100 realisations of the IHDP test set (75 units)
ihdp_test = load('/Users/arberimbibaj/Downloads/ihdp_npci_1-100.test.npz')
# names of the arrays stored in the archive
files_test = ihdp_test.files
files_test

In [3]:
# for example
ihdp_train['ate'] # 4 is the ATE (Average Treatment Effect)

array(4)

In [4]:
ihdp_train['mu1'] # 672 * 100 matrix,

array([[ 5.85663227,  8.36324396,  7.83226708, ..., 27.10484002,
        14.12632834, 13.28138412],
       [ 6.6232255 ,  8.32414254,  6.23321083, ..., 26.00892602,
        14.00359421, 13.88138412],
       [ 5.89038603,  8.18072925,  7.65255021, ..., 26.5683658 ,
        14.87316392, 13.18138412],
       ...,
       [ 6.15950183,  8.38250611,  7.51245583, ..., 26.30633584,
        13.47191472, 13.28138412],
       [ 6.02161722,  8.05375981,  7.39264458, ..., 26.3935483 ,
        13.64436608, 13.28138412],
       [ 6.51909465,  8.64340469,  6.7129277 , ..., 27.76386043,
        14.46778387, 13.28138412]])

In [None]:
ihdp_train['mu0'] # 672 * 100 matrix

In [None]:
# sanity check: 75 test units + 672 train units = total number of units
75 + 672

In [None]:
ihdp_test['mu0']

In [None]:
ihdp_train['yadd'] # equals 0. TODO(review): presumably an additive offset for the outcomes (0 = no shift) -- confirm against the dataset documentation

In [None]:
ihdp_train['yf'] # 672 * 100, y_factuals

In [None]:
ihdp_train['ycf']

In [None]:
ihdp_train['t'] # 672 * 100, treatment assignment

In [None]:
ihdp_train['ymul'] # equals 1. TODO(review): presumably a multiplicative scale for the outcomes (1 = no rescaling) -- confirm against the dataset documentation

In [16]:
ihdp_train['x'][:,0][:,0] # feature X1 for all units in the first realisation (x is indexed [unit, feature, realisation])

array([ 0.31658816,  0.18689133,  0.53274953,  0.29497202, -0.54805735,
       -1.13169306, -0.31027983, -0.9803801 ,  0.10042678,  1.06450652,
        0.68406249,  0.46790112,  0.42466884, -1.23977375, -2.03957084,
        0.31658816, -0.52860282, -0.16977494, -0.20219914, -0.95876396,
        0.01396223, -1.97472242, -0.28866369,  0.38143657,  0.23012361,
        0.42683046, -0.65613803,  0.70567863, -0.41836052, -0.00765391,
       -2.32058062, -2.16926766, -0.4615928 ,  0.38143657, -0.24543142,
        1.41901117,  1.48385958,  0.74891091, -1.52078354,  1.33254662,
        0.23012361, -0.22381528,  1.44062731, -0.39674438, -0.15896687,
       -1.80179332, -1.43431899,  1.41901117, -0.0379165 , -2.25573221,
       -1.73694491,  1.44062731,  0.48951726, -1.39108671,  0.64083022,
       -1.17492533,  0.46790112,  0.42466884,  1.3757789 ,  0.12204292,
        0.87860773, -0.26704756, -2.08280311,  0.64083022, -0.54805735,
       -1.47755126, -0.54805735,  1.39739503,  0.77052704,  0.18

In [None]:
# CATE per unit (rows) and realisation (columns): difference of the noiseless potential outcomes
ihdp_train['mu1'] - ihdp_train['mu0']

In [None]:
len(ihdp_train['x']) # 672 units (observations) in train set

In [None]:
ihdp_train['x'][671,24] # last unit (index 671), last of the 25 features (index 24): one value per realisation

In [None]:
ihdp_train['x'][0]

Bring the IHDP data into the same format as the synthetic data.

In [None]:
# Placeholder containers with one (initially empty) slot per IHDP realisation:
# the processed [y, x, t, tau] matrices and the per-realisation covariates.
# They are populated by the cells that follow.
ihdp_train_processed = [[] for _ in range(ihdp_realisations)]
ihdp_train_x = [[] for _ in range(ihdp_realisations)]

In [None]:
# tau: the last column of the [y_factual, x, treatment, tau] layout.
# It is the difference between the two noiseless potential outcomes.
mu1_train = ihdp_train['mu1']
mu0_train = ihdp_train['mu0']
ihdp_train_tau = mu1_train - mu0_train
ihdp_train_tau

In [None]:
# Pre-process x --> a list with one entry per realisation, each entry being the
# (units x features) covariate matrix of that realisation.
#
# ihdp_train is a lazily-read .npz archive: every ihdp_train['x'] lookup loads
# the complete array again. Fetch it once instead of once per unit and
# realisation, and take the matrix size from the data rather than hard-coding it.
x_train_all = ihdp_train['x']  # indexed as [unit, feature, realisation]
n_train_units, n_train_features = x_train_all.shape[:2]

ihdp_train_x = []
for i in range(ihdp_realisations):
    realisation = np.zeros(shape=(n_train_units, n_train_features))
    realisation[:, :] = x_train_all[:, :, i]  # all units and features of realisation i
    ihdp_train_x.append(realisation)

In [None]:
# Concatenate [y, x, t, tau] column-wise, giving one matrix per realisation:
# column 0 = factual outcome, then the covariates, then treatment, then tau.
#
# The archive arrays are fetched once up front; each ihdp_train[...] lookup
# reads the array from the .npz again, so doing it inside the loop repeats the
# same read for every realisation.
yf_train = ihdp_train['yf']
t_train = ihdp_train['t']

ihdp_train_processed = []
for i in range(ihdp_realisations):
    # i:i + 1 keeps the column axis, so each piece is 2-D like ihdp_train_x[i]
    temporary_set = np.concatenate(
        (yf_train[:, i:i + 1], ihdp_train_x[i], t_train[:, i:i + 1], ihdp_train_tau[:, i:i + 1]),
        axis=1,
    )
    ihdp_train_processed.append(temporary_set)

# show only the first realisation (as the test-set cell does) instead of dumping all of them
ihdp_train_processed[0]

In [None]:
# Save the processed training data.
# np.save appends ".npy", so this writes .../IHDP/ihdp_train_processed.npy,
# which is the file the load check reads and mirrors the test-set file name.
# Passing only the IHDP directory path would write ".../DataSets /IHDP.npy".
np.save('/Users/arberimbibaj/Documents/Master Thesis ETH/DataSets /IHDP/ihdp_train_processed',ihdp_train_processed)

In [20]:
# load check: read the saved training array back and inspect the first realisation
# (columns: y_factual, covariates, treatment, tau)
train = load('/Users/arberimbibaj/Documents/Master Thesis ETH/DataSets /IHDP/ihdp_train_processed.npy')
train[0]

array([[ 1.93856829,  0.31658816,  0.59658219, ...,  0.        ,
         0.        ,  4.65168933],
       [ 3.89968653,  0.18689133,  0.19681812, ...,  1.        ,
         0.        ,  4.02968109],
       [ 2.40589425,  0.53274953,  0.19681812, ...,  0.        ,
         0.        ,  4.64407754],
       ...,
       [ 0.60122589,  0.63218376,  0.99634625, ...,  0.        ,
         0.        ,  4.52832494],
       [ 3.93920095,  0.10042678, -0.07717767, ...,  1.        ,
         1.        ,  4.6005372 ],
       [ 2.60919184,  0.81375932,  0.59658219, ...,  0.        ,
         0.        ,  4.18203264]])

In [22]:
# Correlation matrix between the columns (y, covariates, t, tau) of the first
# realisation; corrcoef expects variables in rows, hence the transpose.
first_realisation = train[0]
np.corrcoef(first_realisation.T)

array([[ 1.00000000e+00,  4.75543241e-02,  4.48845920e-02,
        -3.30388668e-03,  1.07248968e-01, -8.60247480e-02,
         5.01389770e-01, -4.36939584e-02,  1.02822391e-02,
         2.07081428e-01, -1.77896735e-01, -6.45982069e-02,
         7.52743757e-02,  1.41152272e-02, -5.59856881e-02,
         2.09219920e-01, -8.49561688e-02,  1.79871564e-01,
         5.22755109e-02, -1.63472669e-02, -1.73045879e-02,
         7.52233868e-02, -1.20427334e-01, -1.00430057e-01,
        -1.16183635e-01,  1.31860707e-01,  7.10796959e-01,
        -5.14779327e-01],
       [ 4.75543241e-02,  1.00000000e+00,  8.48132297e-01,
        -7.59501153e-01,  3.54950176e-02, -1.13126834e-02,
        -2.31145504e-03, -4.80372140e-02, -2.91854718e-02,
         1.89736371e-02, -4.47413121e-03, -3.34642013e-02,
         3.89500900e-02, -7.58624856e-02, -6.21266155e-02,
         1.17339565e-02,  1.72397877e-02,  2.34836671e-02,
        -1.44990643e-03,  5.73539702e-02,  2.71945084e-02,
        -4.88256834e-02, -4.41

In [None]:
train[2]

# Same processing for the test set

In [None]:
# Placeholder containers with one (initially empty) slot per IHDP realisation:
# the processed [y, x, t, tau] test matrices and the per-realisation covariates.
# They are populated by the cells that follow.
ihdp_test_processed = [[] for _ in range(ihdp_realisations)]
ihdp_test_x = [[] for _ in range(ihdp_realisations)]

In [None]:
# tau: the last column of the [y_factual, x, treatment, tau] layout.
# It is the difference between the two noiseless potential outcomes.
mu1_test = ihdp_test['mu1']
mu0_test = ihdp_test['mu0']
ihdp_test_tau = mu1_test - mu0_test
ihdp_test_tau

In [None]:
# Pre-process x --> a list with one entry per realisation, each entry being the
# (units x features) covariate matrix of that realisation.
#
# ihdp_test is a lazily-read .npz archive: every ihdp_test['x'] lookup loads
# the complete array again. Fetch it once instead of once per unit and
# realisation, and take the matrix size from the data rather than hard-coding it.
x_test_all = ihdp_test['x']  # indexed as [unit, feature, realisation]
n_test_units, n_test_features = x_test_all.shape[:2]

ihdp_test_x = []
for i in range(ihdp_realisations):
    realisation = np.zeros(shape=(n_test_units, n_test_features))
    realisation[:, :] = x_test_all[:, :, i]  # all units and features of realisation i
    ihdp_test_x.append(realisation)

In [None]:
# Concatenate [y, x, t, tau] column-wise, giving one matrix per realisation:
# column 0 = factual outcome, then the covariates, then treatment, then tau.
#
# The archive arrays are fetched once up front; each ihdp_test[...] lookup
# reads the array from the .npz again, so doing it inside the loop repeats the
# same read for every realisation.
yf_test = ihdp_test['yf']
t_test = ihdp_test['t']

ihdp_test_processed = []
for i in range(ihdp_realisations):
    # i:i + 1 keeps the column axis, so each piece is 2-D like ihdp_test_x[i]
    temporary_set = np.concatenate(
        (yf_test[:, i:i + 1], ihdp_test_x[i], t_test[:, i:i + 1], ihdp_test_tau[:, i:i + 1]),
        axis=1,
    )
    ihdp_test_processed.append(temporary_set)

ihdp_test_processed[0]

In [None]:
# save the processed testing data (np.save appends the ".npy" extension to the given path)
np.save('/Users/arberimbibaj/Documents/Master Thesis ETH/DataSets /IHDP/ihdp_test_processed',ihdp_test_processed)

In [None]:
# load check: read the saved test array back
test = load('/Users/arberimbibaj/Documents/Master Thesis ETH/DataSets /IHDP/ihdp_test_processed.npy')
# column 27 is tau, the last column of [y, x, t, tau]; summary statistics for the first realisation
pd.DataFrame(test[0][:,27]).describe() # the taus are very differently distributed over the realisations

In [None]:
test[0]

# NEWS Dataset (from Fredjo.com)

In [None]:
# Description
# The first row of the .x file contains the number of rows and columns of
# the corresponding dense matrix. In this case n=5000 rows, d=3477 features.
#
# Each following row represents an element in the matrix in the form: i,j,v
# where i is the row index, j the column index and v the value.
#
# The data represents word counts in documents, so 1,16,1 means that word 16
# occurred 1 time in document 1.

# Loaded as raw rows (header row included); not expanded into the dense matrix here.
news_x = np.loadtxt('/Users/arberimbibaj/Documents/Master Thesis ETH/DataSets /News/csv/topic_doc_mean_n5000_k3477_seed_1.csv.x', delimiter=',')
news_x

In [None]:
# outcome file that belongs to the .x covariate file of the same seed
news_y = np.loadtxt('/Users/arberimbibaj/Documents/Master Thesis ETH/DataSets /News/csv/topic_doc_mean_n5000_k3477_seed_1.csv.y', delimiter=',')
news_y
# column order: treatment, y_factual, y_counterfactual, mu0, mu1

In [None]:
# Potential outcomes follow from the factual / counterfactual outcomes as
#   y_1 = W * factual + (1 - W) * counterfactual
#   y_0 = (1 - W) * factual + W * counterfactual
# where W is the treatment indicator.

# Split the columns of the outcome array. It is loaded as `news_y`; a bare `y`
# is never defined in this notebook and would raise a NameError.
treatment = news_y[:,0]
y_factual = news_y[:,1]
y_counterfactual = news_y[:,2]
mu0 = news_y[:,3]
mu1 = news_y[:,4]

In [None]:
# With a 0/1 treatment indicator, the factual outcome is y_1 where treatment
# is 1 and y_0 where it is 0; the counterfactual outcome fills the other one.
control = 1 - treatment
y_1 = treatment * y_factual + control * y_counterfactual
y_0 = control * y_factual + treatment * y_counterfactual

In [None]:
# individual treatment effect: difference of the two potential outcomes y_1 and y_0
ite = y_1 - y_0
ite

In [None]:
# conditional average treatment effect: difference of the mu1 and mu0 columns
cate = mu1 - mu0
cate

# JOBS Dataset (from Fredjo.com)

In [None]:
# These files contain the treated and control units from the male sub-sample from the National Supported Work Demonstration as used by Lalonde in his paper.

#

# The order of the variables from left to right is:
# treatment indicator (1 if treated, 0 if not treated), age, education,
# Black (1 if black, 0 otherwise), Hispanic (1 if Hispanic, 0 otherwise),
# married (1 if married, 0 otherwise), nodegree (1 if no degree, 0 otherwise),
# RE75 (earnings in 1975), and RE78 (earnings in 1978).
# The last variable is the outcome; other variables are pre-treatment.

In [None]:
jobs_test = load('/Users/arberimbibaj/Documents/Master Thesis ETH/DataSets /Jobs/jobs_DW_bin.new.10.test.npz')

In [None]:
jobs_test.files

In [None]:
jobs_test['ate'] # 0.07797

In [None]:
jobs_test['e'] # indicator for sampled ?

In [None]:
jobs_test['I'] # ?

In [None]:
jobs_test['yf']

In [None]:
jobs_test['ymul'] # 1

In [None]:
len(jobs_test['x']) # 642, so for each individual

In [None]:
jobs_test['x']

In [None]:
jobs_train = load('/Users/arberimbibaj/Documents/Master Thesis ETH/DataSets /Jobs/jobs_DW_bin.new.10.train.npz')

In [None]:
jobs_train.files

In [None]:
jobs_train['yf']

In [None]:
# TODO: apply the same pre-processing loop to the Jobs data as for the IHDP set.

# Twin Data

In [None]:
# Twins data; names=True makes genfromtxt take the field names from the header line of the CSV
twin = np.genfromtxt('/Users/arberimbibaj/Documents/Master Thesis ETH/DataSets /Twins/Twin_Data.csv.gz.csv', delimiter = ',', names=True)
twin

# ACIC 2016

- Dimensions: 4802 observations, 58 features (3 of them categorical)

- Setups: 77 different setups, each setup 100 generations.

- This folder contains covariates, simulated treatment, simulated response variables (including counterfactuals), and expected values for the causal inference challenge in the 2016 Atlantic Causal Inference Conference. For each of 77 conditions, treatment and response data were simulated 100 times from real-world data corresponding to 4802 individuals and 58 covariates.

- Files:

  x.csv - matrix of covariates; categorical variables are coded as A/B/C/..., binary variables as 0/1, and real numbers are left alone

  zy.csv - the 77 x 100 sets of treatment and response variables corresponding to various simulation settings;
     - treatment is column "z",
     - the observed response under the control is column "y0",
     - the observed  response under treatment is "y1",
     - the expected response under the control is "mu0",
     - and the expected response under treatment is "mu1".


In [None]:
# covariate matrix x.csv of the ACIC 2016 data
acic_X = pd.read_csv('/Users/arberimbibaj/Documents/Master Thesis ETH/DataSets /ACIC2016/data_cf_all/x.csv')
acic_X

In [None]:
# one simulation file (zymu_13.csv) from folder "1"; describe() summarises its columns
acic_set1 = pd.read_csv('/Users/arberimbibaj/Documents/Master Thesis ETH/DataSets /ACIC2016/data_cf_all/1/zymu_13.csv')
acic_set1.describe()

In [None]:
# a second simulation file from the same folder "1" -- presumably loaded to compare against zymu_13.csv
acic_set1_dif = pd.read_csv('/Users/arberimbibaj/Documents/Master Thesis ETH/DataSets /ACIC2016/data_cf_all/1/zymu_336720355.csv')
acic_set1_dif.describe()