# PPI power analysis



In [43]:
import os, sys

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))
import numpy as np
import pandas as pd
from ppi_py.datasets import load_dataset
from power_ppi import ppi_mean_power
from ppi_py import ppi_mean_ci
from tqdm import tqdm
from scipy.optimize import brentq
from scipy.stats import norm
from utils import *

## Import ballots data

In [44]:
# Folder handed to load_dataset as the location of the "ballots" dataset.
dataset_folder = "./data/"
data = load_dataset(dataset_folder, "ballots")

# Pull out the three arrays the rest of the notebook works with.
# Presumably: Y = labels, Yhat = predictions on the labeled points,
# Yhat_unlabeled = predictions on the unlabeled pool -- TODO confirm against the dataset docs.
Y, Yhat, Yhat_unlabeled = (data[key] for key in ("Y", "Yhat", "Yhat_unlabeled"))

## Perform initial power analysis with a subset of the data

In [45]:
# Seed NumPy's global random state so the subsamples drawn in this cell -- and by the
# later np.random.choice calls when the notebook is run top to bottom -- are identical
# on every Restart & Run All. Without this, the printed pairs change on each run.
np.random.seed(0)

# Sizes of the full labeled and unlabeled pools.
n = Y.shape[0]
N = Yhat_unlabeled.shape[0]

# Number of points taken from each pool for the pilot power analysis.
n_initial = 200
N_initial = 200

# Pilot indices, drawn without replacement from each pool.
labeled_initial = np.random.choice(n, n_initial, replace=False)
unlabeled_initial = np.random.choice(N, N_initial, replace=False)

Y_initial = Y[labeled_initial]
Yhat_initial = Yhat[labeled_initial]
Yhat_unlabeled_initial = Yhat_unlabeled[unlabeled_initial]

# Costs passed to the power analysis.
# Presumably per-label and per-prediction costs in the same units as `budget` -- TODO confirm.
cost_Y = 1
cost_Yhat = 0.01

# Two queries on the same pilot data: one constrained by a budget, one by a target
# standard error (se_tol). Later cells read entry 0 of each result as the labeled
# sample size, entry 1 as the unlabeled sample size and entry 3 as the predicted SE.
most_powerful_pair = ppi_mean_power(
    Y_initial, Yhat_initial, Yhat_unlabeled_initial, cost_Y, cost_Yhat, budget=1000
)
cheapest_pair = ppi_mean_power(
    Y_initial, Yhat_initial, Yhat_unlabeled_initial, cost_Y, cost_Yhat, se_tol=0.01
)
print("Most powerful pair:\n", most_powerful_pair)
print("Cheapest pair:\n", cheapest_pair)

Most powerful pair:
 (array([745.05610196]), array([24749.33370219]), array([1000.]), array([0.00550636]))
Cheapest pair:
 (array([225.90106232]), array([7503.99971278]), array([303.20007007]), array([0.01]))


In [46]:
# Monte Carlo check of the SE predicted for the budget-constrained pair: repeatedly
# subsample the full data at the recommended sizes and back the SE out of each CI.

# The recommended sizes are fractional; astype(int) truncates them to whole counts.
n_sub = most_powerful_pair[0][0].astype(int)
N_sub = most_powerful_pair[1][0].astype(int)

n, N = Y.shape[0], Yhat_unlabeled.shape[0]
reps = 200
ses = np.zeros(reps)

# Two-sided normal critical value for the same 0.05 level passed to ppi_mean_ci below.
z_crit = norm.ppf(1 - 0.05/2)

for i in range(reps):
    # Fresh subsample of each pool, drawn without replacement.
    labeled_sample = np.random.choice(n, n_sub, replace=False)
    unlabeled_sample = np.random.choice(N, N_sub, replace=False)

    Y_sub, Yhat_sub = Y[labeled_sample], Yhat[labeled_sample]
    Yhat_unlabeled_sub = Yhat_unlabeled[unlabeled_sample]

    CI = ppi_mean_ci(Y_sub, Yhat_sub, Yhat_unlabeled_sub, alpha=0.05)
    # CI[0] / CI[1] are read as the lower / upper bounds; half the width divided by the
    # critical value gives the SE (assumes a symmetric normal interval -- TODO confirm).
    ses[i] = (CI[1][0] - CI[0][0]) / z_crit / 2

print("Predicted SE for most powerful pair: ", most_powerful_pair[3][0])
print("Estimated SE for most powerful pair: ", ses.mean())

Predicted SE for most powerful pair:  0.005506360595471345
Estimated SE for most powerful pair:  0.00450513945669836


In [47]:
# Monte Carlo check of the SE predicted for the pair obtained under the SE tolerance:
# repeatedly subsample the full data at the recommended sizes and back the SE out of each CI.

# The recommended sizes are fractional; astype(int) truncates them to whole counts.
n_sub = cheapest_pair[0][0].astype(int)
N_sub = cheapest_pair[1][0].astype(int)

n, N = Y.shape[0], Yhat_unlabeled.shape[0]
reps = 200
ses = np.zeros(reps)

# Two-sided normal critical value for the same 0.05 level passed to ppi_mean_ci below.
z_crit = norm.ppf(1 - 0.05/2)

for i in range(reps):
    # Fresh subsample of each pool, drawn without replacement.
    labeled_sample = np.random.choice(n, n_sub, replace=False)
    unlabeled_sample = np.random.choice(N, N_sub, replace=False)

    Y_sub, Yhat_sub = Y[labeled_sample], Yhat[labeled_sample]
    Yhat_unlabeled_sub = Yhat_unlabeled[unlabeled_sample]

    CI = ppi_mean_ci(Y_sub, Yhat_sub, Yhat_unlabeled_sub, alpha=0.05)
    # CI[0] / CI[1] are read as the lower / upper bounds; half the width divided by the
    # critical value gives the SE (assumes a symmetric normal interval -- TODO confirm).
    ses[i] = (CI[1][0] - CI[0][0]) / z_crit / 2

print("Predicted SE for cheapest pair: ", cheapest_pair[3][0])
print("Estimated SE for cheapest pair: ", ses.mean())

Predicted SE for cheapest pair:  0.009999999999999997
Estimated SE for cheapest pair:  0.008000296031075852
