In [17]:
# Libraries
import numpy as np
from matplotlib import pyplot as plt
import scipy.stats as stats
import statsmodels.api as sm
from tqdm import tqdm_notebook
from tqdm.notebook import trange

In [2]:
## Helper functions
# Check if all elements of array are the same
def all_same(a):
    """Return True when the iterable `a` holds exactly one distinct value.

    The elements are collected into a set, so they must be hashable. An empty
    input yields False because it contains zero distinct values.
    """
    distinct_values = set(a)
    return len(distinct_values) == 1

In [3]:
## Functions that generate data
# Main subgraph
def generate_main_subgraph(N, K_infl, K_noninfl, pi_infl, lam, lam_min):
    """Draw the adjacency matrix of one main (diagonal) subgraph.

    The first `K_infl` columns are "influential" diversion units: every outcome
    unit connects to each of them independently with probability `pi_infl`.
    The remaining `K_noninfl` columns are "non-influential": outcome unit i
    connects to a Poisson(`lam`) number of them, chosen without replacement.
    That number is raised, where necessary, so that the unit has at least
    `lam_min` connections in total, and is never larger than `K_noninfl`.

    Parameters
    ----------
    N : int
        Number of outcome units (rows).
    K_infl, K_noninfl : int
        Number of influential / non-influential diversion units (columns).
    pi_infl : float
        Connection probability for influential diversion units.
    lam : float
        Poisson mean of the number of non-influential connections.
    lam_min : int
        Target minimum total number of connections per outcome unit.

    Returns
    -------
    numpy.ndarray
        Array of shape (N, K_infl + K_noninfl) with entries 0.0 / 1.0.
    """
    # Total number of diversion units
    K = K_infl + K_noninfl
    # Adjacency matrix
    A = np.zeros((N, K))
    # Connections to influential units
    A[:, :K_infl] = np.random.binomial(1, pi_infl, size=(N, K_infl))
    for i in range(N):
        # Number of influential diversion units outcome unit i is connected to
        W_infl_num_i = np.sum(A[i, :K_infl])
        # Connections still missing to reach lam_min in total
        lower_bound = int(lam_min - W_infl_num_i)
        # Apply the lower bound first and the K_noninfl cap last: sampling
        # without replacement cannot return more than K_noninfl columns, so the
        # cap must win when lam_min asks for more than are available.
        W_noninfl_num_i = min(max(np.random.poisson(lam), lower_bound), K_noninfl)
        if W_noninfl_num_i > 0:
            # Which ones out of K_infl+1,...,K
            auctions_i = K_infl + np.random.choice(K_noninfl, W_noninfl_num_i, replace=False)
            A[i, auctions_i] = 1.0
    return A

# Off-diagonal (weak) subgraph
def generate_weak_subgraph(N, K, pi_weak):
    """Draw an (N, K) block of independent Bernoulli(`pi_weak`) edge indicators.

    Used for the off-diagonal blocks of the full graph, i.e. the edges between
    outcome units of one block and diversion units of another.
    """
    block_shape = (N, K)
    return np.random.binomial(n=1, p=pi_weak, size=block_shape)

# Full graph
def generate_graph(L, N, K_infl, K_noninfl, pi_infl, lam, lam_min, pi_weak):
    """Assemble the full bipartite graph out of L x L blocks.

    Diagonal blocks (s == t) come from `generate_main_subgraph`, off-diagonal
    blocks from `generate_weak_subgraph`. Rows are outcome units, columns are
    diversion units.

    Parameters
    ----------
    L : int
        Number of blocks along each axis.
    N : int
        Number of outcome units per block.
    K_infl, K_noninfl, pi_infl, lam, lam_min
        Passed through to `generate_main_subgraph`.
    pi_weak : float
        Edge probability passed to `generate_weak_subgraph`.

    Returns
    -------
    A : numpy.ndarray, shape (L * N, L * K)
        Adjacency matrix, where K = K_infl + K_noninfl.
    W : numpy.ndarray, same shape as A
        Row-normalised adjacency matrix. A row without any edge is left as all
        zeros instead of becoming NaN through a division by zero.
    W_num : numpy.ndarray, shape (L * N, 1)
        Raw number of edges per outcome unit (can be 0).
    edges_cut : float
        Share of edges that lie in off-diagonal blocks; NaN when the graph has
        no edges at all.
    """
    # Shapes
    K = K_infl + K_noninfl
    # Initialize
    A = np.zeros((N * L, K * L))
    edges_within = 0.0
    edges_between = 0.0
    # Generate subgraphs
    for s in range(L):
        for t in range(L):
            if s == t:
                A_st = generate_main_subgraph(N, K_infl, K_noninfl, pi_infl, lam, lam_min)
                edges_within += np.sum(A_st)
            else:
                A_st = generate_weak_subgraph(N, K, pi_weak)
                edges_between += np.sum(A_st)
            A[s * N:(s + 1) * N, t * K:(t + 1) * K] = A_st
    # Compute the weights
    W_num = np.sum(A, axis=1).reshape((L * N, 1))
    # Divide by at least 1 so that isolated outcome units get zero weights
    # rather than 0/0 = NaN; rows with edges are unaffected.
    W = A / np.maximum(W_num, 1)
    # Compute the share of edges cut between clusters
    edges_total = edges_between + edges_within
    if edges_total > 0:
        edges_cut = edges_between / edges_total
    else:
        # Undefined share for an edgeless graph; avoids ZeroDivisionError when
        # the counters are still plain Python floats (L == 0).
        edges_cut = float('nan')
    return (A, W, W_num, edges_cut)

# Variables
def generate_vars(A, W, W_num, p, sig_xi, sig_epsilon):
    """Simulate treatment assignments, exposures and outcomes on a fixed graph.

    Parameters
    ----------
    A : array of shape (N, K)
        Adjacency matrix; only its shape is used here.
    W : array of shape (N, K)
        Weight matrix mapping diversion-unit quantities to outcome units.
    W_num : array
        Per-outcome-unit edge counts; handed to the exposure response function,
        which currently ignores them.
    p : float
        Treatment probability of each diversion unit.
    sig_xi, sig_epsilon : float
        Standard deviations of the diversion-level and outcome-level noise.

    Returns
    -------
    (y, beta, Z, e)
        Outcomes (N, 1), unit-level treatment effects (N, 1), treatment
        assignments (K, 1) and exposures (N, 1).
    """
    def mu(e, W_num):
        # Exposure response function: linear in the exposure.
        # (The original also kept np.sqrt(e) around as a commented-out option.)
        return e

    # Sample sizes
    (N, K) = A.shape
    # Treatment effect of every outcome unit: response at full minus no exposure
    # (normalising beta by its mean was left disabled in the original)
    beta = np.array([mu(1, W_num[i]) - mu(0, W_num[i]) for i in range(N)],
                    dtype=float).reshape((N, 1))

    def draw_assignment():
        # Assign diversion units to treatment and derive the implied exposures
        assignment = np.random.binomial(1, p, size=(K, 1))
        exposure = np.dot(W, assignment).reshape((N, 1))
        return (assignment, exposure)

    # Redraw until the exposures are not all identical
    (Z, e) = draw_assignment()
    while all_same(e.flatten()):
        (Z, e) = draw_assignment()
    # Error terms at the level of diversion units, mapped to outcome units
    xi = W.dot(np.random.normal(0, sig_xi, (K, 1))).reshape((N, 1))
    # Idiosyncratic noise at the level of outcome units
    epsilon = np.random.normal(0, sig_epsilon, (N, 1))
    # Outcomes (constant + treatment_effect * exposure + noise)
    y = 0 + beta * mu(e, W_num) + xi + epsilon
    return (y, beta, Z, e)

In [4]:
## Function that estimates the true ATE
def ate_true(beta):
    """Return the true average treatment effect: the mean of all entries of `beta`."""
    average_effect = np.mean(beta)
    return average_effect

In [5]:
## Function that estimates a naive model
def est_naive(y, e):
    """Regress the outcome on a constant and the exposure by OLS.

    Parameters
    ----------
    y : array of shape (N,) or (N, 1)
        Outcomes.
    e : array of shape (N,) or (N, 1)
        Exposures.

    Returns
    -------
    params_hat : numpy.ndarray
        Estimated coefficients as a column; row 0 is the constant and row 1 the
        exposure coefficient.
    eps_hat : numpy.ndarray, shape (N, 1)
        Residuals y - X @ params_hat.
    """
    # Matrix of regressors
    X = sm.add_constant(e)
    # Regress y on exposure and constant
    model = sm.OLS(y, X).fit()
    params_hat = model.params.reshape((-1, 1))
    # Force y into a column before subtracting: a 1-D y of shape (N,) would
    # otherwise broadcast against the (N, 1) fitted values into an N x N matrix.
    y_col = np.reshape(y, (-1, 1))
    eps_hat = y_col - X.dot(params_hat)
    return (params_hat, eps_hat)

def ate_naive(y, e):
    """Return the exposure coefficient (row 1 of the coefficient column) from `est_naive`."""
    (params_hat, _) = est_naive(y, e)
    return params_hat[1]

In [6]:
## Bootstrap CIs
def compute_bootstrap_CI(y, e, num_boot_iter, boot_iter_func, alpha, *args):
    """Compute a percentile bootstrap confidence interval.

    Parameters
    ----------
    y, e
        Data handed unchanged to `boot_iter_func`.
    num_boot_iter : int
        Number of bootstrap replications.
    boot_iter_func : callable
        Called as ``boot_iter_func(y, e, *args)``; must return one estimate,
        either a scalar or an array holding a single element.
    alpha : float
        Nominal miscoverage level (0.05 gives a 95% interval).
    *args
        Extra positional arguments forwarded to `boot_iter_func`.

    Returns
    -------
    (q_lo, q_hi)
        Lower and upper end points taken from the sorted bootstrap estimates.
    """
    # Bootstrap iterations
    beta_hat_boot_dist_unsorted = np.zeros(num_boot_iter)
    for b in range(num_boot_iter):
        beta_hat_b = boot_iter_func(y, e, *args)
        # The estimators return a one-element array; extract the scalar
        # explicitly, because implicitly converting an array with ndim > 0 to
        # a scalar on assignment is deprecated in recent NumPy versions.
        beta_hat_boot_dist_unsorted[b] = np.asarray(beta_hat_b).item()
    beta_hat_boot_dist = np.sort(beta_hat_boot_dist_unsorted)
    # Positions of the alpha/2 and 1 - alpha/2 order statistics
    idx_lo = int(np.floor(num_boot_iter * alpha / 2.0))
    idx_hi = min(int(np.ceil(num_boot_iter * (1.0 - alpha / 2.0))),
                 num_boot_iter - 1)  # check for out-of-bounds
    q_lo = beta_hat_boot_dist[idx_lo]
    q_hi = beta_hat_boot_dist[idx_hi]
    return (q_lo, q_hi)

In [7]:
## Naive bootstrap iteration
def bootstrap_naive_iter(y, e):
    """Run one replication of the naive bootstrap over outcome units.

    Outcome units are resampled with replacement, keeping each unit's outcome
    and exposure together (column 0 of `y` and `e`, as 1-D arrays). Resamples
    whose exposures are all identical are discarded and redrawn. Returns the
    `ate_naive` estimate on the accepted resample.
    """
    # Sample size
    N = y.size

    def resample():
        picks = np.random.choice(N, N, replace=True)
        return (y[picks, 0], e[picks, 0])

    (y_b, e_b) = resample()
    while all_same(e_b):
        (y_b, e_b) = resample()
    return ate_naive(y_b, e_b)

In [8]:
## Two-sided bootstrap iteration
def bootstrap_two_sided_iter(y, e, A, Z):
    """Run one replication of the two-sided bootstrap.

    Outcome units (rows of `A`) and diversion units (columns of `A`) are both
    resampled with replacement. Weights and exposures are recomputed on the
    resampled graph from the resampled assignments `Z`; the passed-in `e` is
    not used. Resamples with identical exposures everywhere are redrawn.
    Returns the `ate_naive` estimate on the accepted resample.
    """
    # Sample sizes
    (N, K) = A.shape

    def resample():
        rows = np.random.choice(N, N, replace=True)
        cols = np.random.choice(K, K, replace=True)
        # Bootstrap graph: pick the rows first, then the columns
        A_rows = A[rows, :]
        A_b = A_rows[:, cols]
        # Recompute the weights; the floor of 1 protects rows without edges
        degree_b = np.maximum(np.sum(A_b, axis=1).reshape((N, 1)), 1)
        W_b = A_b / degree_b
        # Bootstrap assignments and the exposures they imply
        Z_b = Z[cols, :]
        exposure_b = np.dot(W_b, Z_b).reshape((N, 1))
        return (rows, exposure_b)

    (rows, e_b) = resample()
    while all_same(e_b.reshape(N)):
        (rows, e_b) = resample()
    # Bootstrap outcomes for the accepted draw
    y_b = y[rows, :]
    return ate_naive(y_b, e_b)

In [9]:
## Non-parametric bootstrap iteration
def bootstrap_nonparametric_iter(y, e, L):
    """Run one replication of the block (cluster-level) bootstrap.

    `y` and `e` are split row-wise into `L` equal blocks with `np.vsplit`, and
    `L` whole blocks are drawn with replacement and stacked. Resamples whose
    exposures are all identical are redrawn. Returns the `ate_naive` estimate
    on the accepted resample.
    """
    N = y.size
    # Split into blocks
    y_blocks = np.vsplit(y, L)
    e_blocks = np.vsplit(e, L)

    def resample():
        picks = np.random.choice(L, L, replace=True)
        return (picks, np.vstack([e_blocks[j] for j in picks]))

    (picks, e_b) = resample()
    while all_same(e_b.reshape(N)):
        (picks, e_b) = resample()
    # Outcomes of the accepted blocks
    y_b = np.vstack([y_blocks[j] for j in picks])
    return ate_naive(y_b, e_b)

In [21]:
def main(L=10, N=20, K_infl=0, K_noninfl=5, pi_infl=0.9, lam=2, lam_min=1, pi_weak=0.0,
         p=0.5, sig_xi=0.25, sig_epsilon=0.25, 
         num_sim_iter=1000, num_boot_iter=1000, 
         alpha=0.05,
         seed=666):
    """Simulate bootstrap CI coverage and width on one fixed graph and print them.

    A single graph is generated after seeding NumPy's global RNG with `seed`.
    For each of `num_sim_iter` simulated data sets the naive, two-sided and
    nonparametric (block) bootstrap intervals are computed with
    `num_boot_iter` replications at level `alpha`. The parametric bootstrap is
    a placeholder that always uses the interval (0, 0), so the figures printed
    for it are not meaningful.

    Graph arguments (L, N, K_infl, K_noninfl, pi_infl, lam, lam_min, pi_weak)
    go to `generate_graph`; p, sig_xi and sig_epsilon go to `generate_vars`.
    Nothing is returned; three summary lines are printed.
    """
    # Set seed
    np.random.seed(seed)
    # One coverage-indicator array and one CI-width array per method
    method_names = ('naive', 'two_sided', 'nonparam', 'param')
    covered = {name: np.zeros(num_sim_iter) for name in method_names}
    widths = {name: np.zeros(num_sim_iter) for name in method_names}
    # Generate graph
    (A, W, W_num, edges_cut) = generate_graph(L, N, K_infl, K_noninfl, pi_infl, lam, lam_min, pi_weak)
    # Main loop for simulations
    for sim in trange(num_sim_iter, desc='Simulations'):
        (y, beta, Z, e) = generate_vars(A, W, W_num, p, sig_xi, sig_epsilon)
        # True value
        ate_true_val = ate_true(beta)
        # Estimates (only needed by the disabled parametric bootstrap)
        (params_hat, eps_hat) = est_naive(y, e)
        # Intervals, computed in the fixed order naive -> two-sided -> nonparametric
        intervals = {}
        intervals['naive'] = compute_bootstrap_CI(y, e, num_boot_iter, bootstrap_naive_iter, alpha)
        intervals['two_sided'] = compute_bootstrap_CI(y, e, num_boot_iter, bootstrap_two_sided_iter,
                                                      alpha, A, Z)
        intervals['nonparam'] = compute_bootstrap_CI(y, e, num_boot_iter, bootstrap_nonparametric_iter,
                                                     alpha, L)
        # Parametric bootstrap is not implemented: degenerate placeholder interval
        intervals['param'] = (0, 0)
        for name in method_names:
            (q_lo, q_hi) = intervals[name]
            covered[name][sim] = q_lo <= ate_true_val <= q_hi
            widths[name][sim] = q_hi - q_lo

    # Print the results
    coverage_line = ('Naive bootstrap coverage: {}. Two-sided bootstrap coverage: {}. '
                     'Nonparametric bootstrap coverage: {}. Parametric bootstrap coverage: {}')
    mean_width_line = ('Naive mean CI width: {}. Two-sided mean CI width: {}. '
                       'Nonparametric mean CI width: {}. Parametric mean CI width: {}')
    var_width_line = ('Naive variance of CI width: {}. Two-sided variance of CI width: {}. '
                      'Nonparametric variance of CI width: {}. Parametric variance of CI width: {}')
    print(coverage_line.format(*[np.mean(covered[name]) for name in method_names]))
    print(mean_width_line.format(*[np.mean(widths[name]) for name in method_names]))
    print(var_width_line.format(*[np.var(widths[name]) for name in method_names]))

In [22]:
# Get coverage
def get_coverage(L, N, K_infl, K_noninfl, pi_infl, lam, lam_min, pi_weak, p, sig_xi, sig_epsilon, num_sim_iter, num_boot_iter, alpha, seed):
    """Estimate CI coverage of the naive and the block bootstrap on one graph.

    NumPy's global RNG is seeded with `seed`, one graph is generated, and
    `num_sim_iter` data sets are simulated on it. For each data set both
    bootstrap intervals are computed with `num_boot_iter` replications at
    level `alpha`.

    Returns
    -------
    (edges_cut, coverage)
        `edges_cut` is the between-block edge share reported by
        `generate_graph`; `coverage` is a length-2 array with the share of
        simulations whose interval contained the true ATE
        (index 0: naive bootstrap, index 1: nonparametric block bootstrap).
    """
    # Set seed
    np.random.seed(seed)
    # Bootstrap schemes in output-column order: (iteration function, extra args)
    schemes = ((bootstrap_naive_iter, ()),
               (bootstrap_nonparametric_iter, (L,)))
    n_methods = len(schemes)
    coverage = np.zeros((num_sim_iter, n_methods))
    # Generate graph
    (A, W, W_num, edges_cut) = generate_graph(L, N, K_infl, K_noninfl, pi_infl, lam, lam_min, pi_weak)
    # Main loop for simulations
    for sim in trange(num_sim_iter, desc='Simulations'):
        (y, beta, Z, e) = generate_vars(A, W, W_num, p, sig_xi, sig_epsilon)
        # True value
        ate_true_val = ate_true(beta)
        # Estimates
        (params_hat, eps_hat) = est_naive(y, e)
        for (col, (iter_func, extra_args)) in enumerate(schemes):
            (q_lo, q_hi) = compute_bootstrap_CI(y, e, num_boot_iter, iter_func, alpha, *extra_args)
            coverage[sim, col] = q_lo <= ate_true_val <= q_hi

    coverage_rates = np.mean(coverage, axis=0)
    return (edges_cut, coverage_rates)

In [None]:
## Run the simulations (test 1)
# Single run with L=10 blocks and cross-block edge probability pi_weak=0.05.
# main() prints coverage, mean CI width and variance of CI width per bootstrap scheme.
main(L=10, N=20, K_infl=0, K_noninfl=5, pi_infl=0.9, lam=3, lam_min=1, pi_weak=0.05,
     p=0.5, sig_xi=1, sig_epsilon=0.25, 
     num_sim_iter=200, num_boot_iter=1000, 
     alpha=0.05,
     seed=666)

In [18]:
## Function that gets the data to plot
def get_coverage_plot(pi_weak_range,
                      q_lo=0.025, q_hi=0.975,
                      L=10, N=20, K_infl=0, K_noninfl=5, pi_infl=0.9, lam=3, lam_min=1, 
                      p=0.5, sig_xi=1, sig_epsilon=0.25, 
                      num_sim_iter=200, num_boot_iter=1000, 
                      alpha=0.05,
                      seed=666):
    """Collect coverage results for a sweep over cross-block edge probabilities.

    `get_coverage` is called once per entry of `pi_weak_range`, always with the
    same `seed` and the same remaining settings. `q_lo` and `q_hi` are accepted
    but not used by this function.

    Returns
    -------
    (edges_cut, coverages)
        `edges_cut` has shape (len(pi_weak_range), 1) and holds the
        between-block edge share of each graph; `coverages` has shape
        (len(pi_weak_range), 2) with one column per bootstrap scheme, in the
        order returned by `get_coverage`.
    """
    # Get the plot range
    n_plot_range = len(pi_weak_range)
    # Initialize
    n_methods = 2
    coverages = np.zeros((n_plot_range, n_methods))
    edges_cut = np.zeros((n_plot_range, 1))
    # Loop through pi_weak's
    for row in range(n_plot_range):
        pi_weak = pi_weak_range[row]
        (share_cut, coverage_row) = get_coverage(L, N, K_infl, K_noninfl, pi_infl, lam, lam_min,
                                                 pi_weak, p, sig_xi, sig_epsilon,
                                                 num_sim_iter, num_boot_iter, alpha, seed)
        edges_cut[row, 0] = share_cut
        coverages[row, :] = coverage_row
    return (edges_cut, coverages)

In [19]:
## Plot the coverage
def plot_coverage(edges_cut, coverages):
    """Plot coverage against the share of between-block edges and save the figure.

    Column 0 of `coverages` is drawn in blue and labelled 'Simple One-sided',
    column 1 in green and labelled 'Clustered'. The current pyplot figure is
    cleared first, and the result is written to 'coverage_plot.pdf' and
    'coverage_plot.png' in the working directory.
    """
    # Style
    plt.style.use('ggplot')
    # Start from an empty figure
    plt.clf()
    # One line per method: (column of `coverages`, colour, legend label)
    curves = ((0, 'blue', 'Simple One-sided'),
              (1, 'green', 'Clustered'))
    for (col, colour, legend_label) in curves:
        plt.plot(edges_cut, coverages[:, col], '-', color=colour, linewidth=1, label=legend_label)
    # Style: white background, light grid, thin grey frame
    ax = plt.gca()
    ax.set_facecolor('white')
    ax.grid(color='grey', alpha=0.25)
    for side in ('top', 'bottom', 'left', 'right'):
        spine = ax.spines[side]
        spine.set_color('grey')
        spine.set_linewidth(0.5)
    ax.xaxis.label.set_color('black')
    ax.yaxis.label.set_color('black')
    # Labels
    plt.xlabel('Share of \"Between Edges\"', color='black')
    plt.ylabel('Coverage of a 95% CI', color='black')
    # Legend
    plt.legend(loc='upper right', fontsize='large', facecolor='white', edgecolor='white', framealpha=0)
    # Save
    for filename in ('coverage_plot.pdf', 'coverage_plot.png'):
        plt.savefig(filename)

In [None]:
# Cross-block edge probabilities to sweep over; get_coverage_plot runs one full
# simulation (new graph, same seed) per value.
pi_weak_range = [0.0, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5]
# Results stay in notebook globals so the plotting cell can use them.
(edges_cut, coverages) = get_coverage_plot(pi_weak_range,
                                           L=10, N=20, K_infl=0, K_noninfl=5, pi_infl=0.9, lam=3, lam_min=1,
                                           p=0.5, sig_xi=1, sig_epsilon=0.25,
                                           num_sim_iter=500, num_boot_iter=1000,
                                           alpha=0.05,
                                           seed=666)

In [None]:
# Draw both coverage curves and write coverage_plot.pdf / coverage_plot.png
plot_coverage(edges_cut, coverages)