In [11]:
import os, sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))
import numpy as np
import pandas as pd
from ppi_py.datasets import load_dataset
from ppi_py import ppi_logistic_ci, classical_logistic_ci
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
from utils import *

# Import the census healthcare data set

In [3]:
# Fetch the "census_healthcare" dataset from ./data/ via load_dataset and pull
# the three arrays used below out of the returned mapping in one step:
# "Y" (labels), "Yhat" (predictions) and "X" (covariates).
dataset_folder = "./data/"
data = load_dataset(dataset_folder, "census_healthcare")
Y_total, Yhat_total, X_total = (data[key] for key in ("Y", "Yhat", "X"))

Dataset census_healthcare not found at location ./data/; downloading now...


Downloading...
From: https://drive.google.com/uc?id=1RjWsnq-gMngRFRj22DvezcdCVl2MxAIX
To: /Users/angelopoulos/Code/working/ppi_py/examples/data/census_healthcare.npz
100%|██████████| 8.91M/8.91M [00:00<00:00, 9.57MB/s]


# Problem setup

In [10]:
# Level passed as `alpha` to the interval routines in the next section
# (presumably the error level, i.e. a 1 - alpha = 95% target; confirm in ppi_py).
alpha = 0.05
coordinate = 0  # Index of the logistic coefficient to study; choose between 0, 1
n_total = Y_total.shape[0]  # Total number of labeled examples
# Labeled-sample sizes to test: 10 evenly spaced values from 100 to 2000,
# truncated to integers.
ns = np.linspace(100, 2000, 10).astype(int)
num_trials = 50  # Number of random labeled/unlabeled splits per value of n

# Compute ground truth: the selected coefficient of an unregularized,
# intercept-free logistic regression fit on every labeled example.
# `penalty=None` replaces the string "none", which scikit-learn deprecated in
# 1.2 and removed in 1.4 (so this requires scikit-learn >= 1.2).
true_theta = (
    LogisticRegression(
        penalty=None,
        solver="lbfgs",
        max_iter=10000,
        tol=1e-15,
        fit_intercept=False,
    )
    .fit(X_total, Y_total)
    .coef_.squeeze()[coordinate]
)

# Construct intervals

In [13]:
# Run prediction-powered inference and classical inference for many values of n.
#
# For each labeled-sample size n in `ns`, `num_trials` random splits of the data
# are drawn: the first n shuffled rows act as the labeled set and the remaining
# n_total - n rows act as the unlabeled set (only their X and Yhat are handed
# to PPI). One record per (method, n, trial) is collected and turned into a
# single DataFrame at the end.
#
# NOTE: every trial passes all n_total - n remaining rows to ppi_logistic_ci,
# so this cell can take a long time to run.

# Seeded generator so the random splits (and therefore the intervals and the
# plots) are reproducible across kernel restarts.
rng = np.random.default_rng(0)

results = []  # list of plain dict records; converted to a DataFrame once below
for n in ns:
    for j in tqdm(range(num_trials), desc=f"n={n}"):
        # Random labeled / unlabeled split for this trial
        rand_idx = rng.permutation(n_total)
        labeled_idx, unlabeled_idx = rand_idx[:n], rand_idx[n:]
        _X, _X_unlabeled = X_total[labeled_idx], X_total[unlabeled_idx]
        _Y, _Y_unlabeled = Y_total[labeled_idx], Y_total[unlabeled_idx]
        _Yhat, _Yhat_unlabeled = Yhat_total[labeled_idx], Yhat_total[unlabeled_idx]

        # Prediction-Powered Inference
        ppi_ci = ppi_logistic_ci(
            _X, _Y, _Yhat, _X_unlabeled, _Yhat_unlabeled, alpha=alpha
        )

        # Classical interval (labeled data only)
        classical_ci = classical_logistic_ci(_X, _Y, alpha=alpha)

        # Append results: each CI is indexed as (lower, upper), and each bound
        # is indexed by coefficient.
        for method, ci in (("PPI", ppi_ci), ("Classical", classical_ci)):
            results.append(
                {
                    "method": method,
                    "n": n,
                    "lower": ci[0][coordinate],
                    "upper": ci[1][coordinate],
                    "trial": j,
                }
            )

# Imputed CI: the classical interval computed on all rows with the predictions
# Yhat_total used in place of the labels. It does not depend on n, hence NaN.
imputed_ci = classical_logistic_ci(X_total, Yhat_total, alpha=alpha)
results.append(
    {
        "method": "Imputation",
        "n": np.nan,
        "lower": imputed_ci[0][coordinate],
        "upper": imputed_ci[1][coordinate],
        "trial": 0,
    }
)

# Build the table in one shot instead of concatenating one-row DataFrames.
df = pd.DataFrame(results)
df["width"] = df["upper"] - df["lower"]

  0%|                                                                                          | 0/50 [13:40<?, ?it/s]


KeyboardInterrupt: 

# Plot results

In [None]:
# Pass the interval table and the reference coefficient to make_plots (pulled
# in by `from utils import *`); the second argument is the PDF path it is
# given, presumably where the figure is saved.
make_plots(
    df, "./plots/census_healthcare.pdf",
    true_theta=true_theta, intervals_xlabel="Logistic coeff",
)