In [20]:
import os, time
import numpy as np
import matplotlib.pyplot as plt
import cv2 as cv
from PIL import Image
from tqdm import tqdm
import pandas as pd
from scipy.optimize import brentq
from scipy.stats import binom
from joblib import delayed, Parallel

# TODO: put this in another file.
def wsr_ci(x,N,delta,grid,num_cpus=10): # x is a [0,1] bounded sequence
    n = x.shape[0]
    def mu(m,i): return (N*m - np.concatenate([np.array([0,]), np.cumsum(x[:i-1])]))/(N - (np.arange(i)+1) + 1 )
    muhats = (1/2 + np.cumsum(x))/(np.arange(n)+1)
    sigmahat2s = (1/4 + np.cumsum((x-muhats)**2))/(np.arange(n)+1)
    lambdas = np.concatenate([np.array([1,]), np.sqrt(2*np.log(2/delta)/(n*sigmahat2s))[:-1]]) # can't use last entry
    def M(m,i): return 1/2*np.maximum(
        np.prod(1+np.minimum(lambdas[:i], 1/mu(m,i))*(x[:i]-mu(m,i))),
        np.prod(1-np.minimum(lambdas[:i], 1/(1-mu(m,i)))*(x[:i]-mu(m,i)))
    )
    M = np.vectorize(M)
    M_list = Parallel(n_jobs=num_cpus)(delayed(M)(grid,i) for i in range(1,n+1))
    ci_indicators = np.prod(np.stack(M_list, axis=1) < 1/delta , axis=1)
    return grid[np.where(ci_indicators)[0]]

In [8]:
# The classification algorithm
def find_ellipse(filename, plot):
    img_full = cv.imread(filename, cv.IMREAD_COLOR)

    cropRows = [-img_full.shape[0]//3, -img_full.shape[0]//6]
    cropCols = [-img_full.shape[1]//5, -img_full.shape[1]//15]

    img = img_full[cropRows[0]:cropRows[1],cropCols[0]:cropCols[1],:]


    # Check if image is loaded fine
    if img is None:
        print ('Error opening image!')
        print ('Usage: hough_circle.py [image_name -- default ' + default_file + '] \n')


    gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)

    gray = 255-gray

    rows = gray.shape[0]

    contours,hierarchy = cv.findContours(gray,2,1)
    contours_passed = []
    fit_ellipses = []
    areas = []
    area_constraints = [250,1500]
    for i in contours:
        area = cv.contourArea(i) 
        if(area >= area_constraints[0] and area <= area_constraints[1]):
            contours_passed += [i]
            curr_ell = cv.fitEllipse(i)
            fit_ellipses += [curr_ell]
            areas += [area]
    contours_passed, areas = np.array(contours_passed), np.array(areas)
    analytic_areas = np.array([ell[1][0]*ell[1][1]*np.pi/4.0 for ell in fit_ellipses])
    if areas.shape[0] == 0:
        vote = -1
    else:
        idx_best = np.argmin( np.abs(analytic_areas - areas)/areas )
        ell_best = fit_ellipses[idx_best]
        decision_boundary = int(img.shape[0]/1.5)
        vote = fit_ellipses[idx_best][0][1] >= decision_boundary
        if plot:
            cv.line(img, (0,decision_boundary), (img.shape[1]-1,decision_boundary), (255,0,0), 2)
            cv.ellipse(img, fit_ellipses[idx_best], (0,255,0), -1)

    if plot:
        img_full[cropRows[0]:cropRows[1],cropCols[0]:cropCols[1],:] = img

        plt.figure(figsize=(15,15))
        plt.imshow(img_full)
        plt.axis('off');
    
    return vote

In [9]:
# Process .tif images into .png
base_path = "/Users/angelopoulos/Code/working/prediction-powered-inference/ballots/raw/A22_BallotImages"
new_path = "/Users/angelopoulos/Code/working/prediction-powered-inference/ballots/proc/"
ballot_dirnames = []
ballot_filenames = []
os.makedirs(new_path, exist_ok = True)
counter = 1
for dirpath, dirnames, filenames in os.walk(base_path):
    for filename in filenames:
        if '.tif' in filename:
            ballot_dirnames += [dirpath,]
            ballot_filenames += [filename,]

for i in tqdm(range(len(ballot_filenames))):
    if not os.path.exists(new_path + str(counter) + ".png"):
        img_full = cv.imread(ballot_dirnames[i] + "/" + ballot_filenames[i], cv.IMREAD_COLOR)
        cv.imwrite(new_path + str(counter) + ".png", img_full)
    counter += 1

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 78771/78771 [00:00<00:00, 549857.41it/s]


In [10]:
# Read in and clean labels
cal_label_csv = pd.read_csv('labels.csv')
cal_labeled_image_filenames = [new_path + cal_label_csv['image'][i].split("/")[3] for i in range(len(cal_label_csv))]
cal_labels = -np.ones((len(cal_label_csv,)))
cal_labels[cal_label_csv['choice'] == "Matt Haney"] = 1
cal_labels[cal_label_csv['choice'] == "David Campos"] = 0

cal_preds = np.array([find_ellipse(fname, plot=False) for fname in cal_labeled_image_filenames])
clean_cal_preds = cal_preds[(cal_labels >= 0) & (cal_preds >= 0)]
clean_cal_labels = cal_labels[(cal_labels >= 0) & (cal_preds >= 0)]

  contours_passed, areas = np.array(contours_passed), np.array(areas)


In [7]:
# Do the counting
base_path = "/Users/angelopoulos/Code/working/prediction-powered-inference/ballots/proc/"
plot = False
ballot_filenames = []
print_rate = 100
for dirpath, dirnames, filenames in os.walk(base_path):
    for filename in filenames:
        if '.png' in filename:
            ballot_filenames += [dirpath + "/" + filename, ]
# Count the prediction-powered votes
vote_counts = np.array([0,0,0])
for i in range(len(ballot_filenames)):
    ballot_filename = ballot_filenames[i]
    if ballot_filename in cal_labeled_image_filenames: # Don't count the ones we already labeled
        continue
    else:
        vote = find_ellipse(ballot_filename, plot=plot)
        if vote >= 0:
            vote_counts[int(vote)] += 1
        else:
            vote_counts[2] += 1
        if i % print_rate == 0:
            print(f"Vote Count ({float(i)/float(len(ballot_filenames))*100:.2f}% counted): Haney {vote_counts[1]} ({vote_counts[1]/(vote_counts[0]+vote_counts[1])*100:.2f}%), Campos {vote_counts[0]} ({100-vote_counts[1]/(vote_counts[0]+vote_counts[1])*100:.2f}%), Thrown Out {vote_counts[2]} ({vote_counts[2]/vote_counts.sum()*100:.2f}%)", end="\r")

print("\n")
print(f"Final Count ({float(i)/float(len(ballot_filenames))*100:.2f}% counted): Haney {vote_counts[1]} ({vote_counts[1]/(vote_counts[0]+vote_counts[1])*100:.2f}%), Campos {vote_counts[0]} ({100-vote_counts[1]/(vote_counts[0]+vote_counts[1])*100:.2f}%), Thrown Out {vote_counts[2]} ({vote_counts[2]/vote_counts.sum()*100:.2f}%)", end="\r")

Vote Count (0.00% counted): Haney 0 (0.00%), Campos 1 (100.00%), Thrown Out 0 (0.00%)

  contours_passed, areas = np.array(contours_passed), np.array(areas)


Vote Count (87.47% counted): Haney 41885 (61.28%), Campos 26469 (38.72%), Thrown Out 547 (0.79%)

KeyboardInterrupt: 

In [38]:
# Run prediction-powered inference
delta = 0.05
N = 30292 + 47858 # Total number of ballots
n = 1000 # Total number of labeled ballots
grid = np.linspace(0.3,0.7,1000)
vote_counts = np.array([30292, 47858, 621])
# Get bounds on confusion matrix for prediction-powered inference
rectifier = -(clean_cal_preds.astype(float) - clean_cal_labels.astype(float)).mean() # negative bias
pp_est = vote_counts[1]/N + rectifier
    
ci = 2*wsr_ci((clean_cal_labels - clean_cal_preds + 1)/2, N, delta, grid)-1
print(f"The prediction-powered estimate is {pp_est*100:.2f}%")
print(f"The prediction-powered interval is [{(pp_est + ci.min())*100:.2f}%, {(pp_est + ci.max())*100:.2f}%]")

The prediction-powered estimate is 62.11%
The prediction-powered interval is [62.24%, 64.00%]


In [40]:
# Run classical
ci_classical = wsr_ci(clean_cal_labels, N, delta, grid)
print(f"The classical interval is [{ci_classical.min()*100:.2f}%, {ci_classical.max()*100:.2f}%], ({(ci_classical.max() - ci_classical.min())/(ci.max()-ci.min()) * 100 : .2f}% larger)")

The classical interval is [59.11%, 66.44%], ( 415.91% larger)


In [39]:
# Imputed-only estimate
print(f"The imputed-only estimate is {vote_counts[1]/N*100:.2f}%")

The imputed-only estimate is 61.24%


In [4]:
# Visualize data
base_path = "/Users/angelopoulos/Code/working/prediction-powered-inference/ballots/proc/"
ballot_filenames = []
for dirpath, dirnames, filenames in os.walk(base_path):
    for filename in filenames:
        if '.png' in filename:
            ballot_filenames += [dirpath + "/" + filename, ]
ballot_filenames = np.array(ballot_filenames)
ballot_filenames = np.random.choice(ballot_filenames, 10)
[find_ellipse(ballot_filename, plot=True) for ballot_filename in ballot_filenames];

ValueError: 'a' cannot be empty unless no samples are taken