In [1]:
import numpy as np
import pandas as pd
import random
import copy
import matplotlib.pyplot as plt

from queries import *

In [2]:
def sample_race(q):
    """Draw one race index at random, weighted by the per-race counts in ``q``.

    The weight of race ``r`` is the sum of row ``r`` of ``q.race_hisp``
    divided by the sum of the whole table.

    Args:
        q: query object exposing ``race_hisp`` (a count table indexed
            ``[race, hisp]``) and ``num_race``.

    Returns:
        The sampled race index, drawn from ``range(q.num_race)``.
    """
    per_race_totals = q.race_hisp.sum(axis=1)
    weights = per_race_totals / per_race_totals.sum()
    candidates = range(q.num_race)
    return np.random.choice(candidates, p=weights)

def sample_hisp(q, rrace):
    """Draw one Hispanic-origin index at random, conditioned on a race.

    The weights are the entries of row ``rrace`` of ``q.race_hisp``,
    normalised by that row's sum.

    Args:
        q: query object exposing ``race_hisp`` and ``num_hisp``.
        rrace: race index selecting the row of ``q.race_hisp`` to sample from.

    Returns:
        The sampled index, drawn from ``range(q.num_hisp)``.
    """
    counts_for_race = q.race_hisp[rrace]
    weights = counts_for_race / counts_for_race.sum()
    return np.random.choice(range(q.num_hisp), p=weights)

def sample_age(q, rrace, rhisp):
    """Draw one age index at random, conditioned on race and Hispanic origin.

    The race is first coarsened with ``q.race_coarsened``. Coarse race 0 is
    sampled from ``q.white_hisp_age_sex[rhisp]``; any other coarse race is
    sampled from ``q.race_age_sex[race_coarse]`` (``rhisp`` is not used in
    that case). In both cases the counts are summed over the last axis and
    normalised to obtain the age weights.

    Args:
        q: query object exposing ``race_coarsened``, ``white_hisp_age_sex``,
            ``race_age_sex`` and ``num_age``.
        rrace: fine-grained race index.
        rhisp: Hispanic-origin index.

    Returns:
        The sampled age index, drawn from ``range(q.num_age)``.
    """
    coarse = q.race_coarsened(rrace)

    # Pick the age-by-sex slice that matches this (coarse) race.
    age_sex_counts = q.white_hisp_age_sex[rhisp] if coarse == 0 else q.race_age_sex[coarse]

    per_age = age_sex_counts.sum(axis=1)
    weights = per_age / per_age.sum()
    return np.random.choice(range(q.num_age), p=weights)

def sample_sex(q, rrace, rhisp, rage):
    """Draw one sex index at random, conditioned on race, Hispanic origin and age.

    The race is first coarsened with ``q.race_coarsened``. Coarse race 0 is
    sampled from ``q.white_hisp_age_sex[rhisp, rage]``; any other coarse race
    is sampled from ``q.race_age_sex[race_coarse, rage]``. The selected
    counts are normalised to obtain the sex weights.

    Args:
        q: query object exposing ``race_coarsened``, ``white_hisp_age_sex``,
            ``race_age_sex`` and ``num_sex``.
        rrace: fine-grained race index.
        rhisp: Hispanic-origin index.
        rage: age index.

    Returns:
        The sampled sex index, drawn from ``range(q.num_sex)``.
    """
    coarse = q.race_coarsened(rrace)

    # Pick the per-sex counts that match this (coarse) race and age.
    sex_counts = (
        q.white_hisp_age_sex[rhisp, rage] if coarse == 0 else q.race_age_sex[coarse, rage]
    )

    weights = sex_counts / sex_counts.sum()
    return np.random.choice(range(q.num_sex), p=weights)

def construct_row(q, verbose=False):
    """Sample one full record from the current state of the queries.

    Attributes are drawn sequentially, each conditioned on the ones drawn
    before it: race, then Hispanic origin, then age, then sex.

    Args:
        q: query object passed through to the ``sample_*`` helpers.
        verbose: when true, print a header and call ``q.print_state()``
            before sampling.

    Returns:
        The tuple ``(race, hisp, age, sex)`` of sampled indices.
    """
    if verbose:
        print("Printing state:")
        q.print_state()

    race = sample_race(q)
    hisp = sample_hisp(q, race)
    age = sample_age(q, race, hisp)
    sex = sample_sex(q, race, hisp, age)
    return (race, hisp, age, sex)


def add_row_to_reconstruction(recon_df, row, blk_id):
    """Append one sampled record to ``recon_df`` in place.

    The new row is written at index label ``len(recon_df)`` as
    ``[blk_id, race, hisp, age, sex]``.

    Args:
        recon_df: DataFrame being filled; it is mutated, nothing is returned.
        row: 4-tuple ``(race, hisp, age, sex)``.
        blk_id: block identifier stored in the first column.
    """
    race, hisp, age, sex = row
    next_label = len(recon_df)
    recon_df.loc[next_label] = [blk_id, race, hisp, age, sex]
    
def reconstruct_block(blk_id, queries, verbose=False):
    """Reconstruct the records of one block by repeated sampling.

    A deep copy of ``queries`` is consumed: for each of its ``total``
    records, a row is sampled with ``construct_row`` and then subtracted
    from the copied tables with ``remove_row_from_queries``. The caller's
    ``queries`` object is therefore left untouched.

    Args:
        blk_id: block identifier written to the ``TABBLK`` column.
        queries: query object for the block (exposes ``total``,
            ``race_hisp``, ``white_hisp_age_sex`` and ``race_age_sex``).
        verbose: forwarded to ``construct_row``.

    Returns:
        A DataFrame with columns ``TABBLK, CENRACE, CENHISP, QAGE, QSEX``
        holding one row per reconstructed record, indexed 0..n-1.

    Raises:
        AssertionError: if any count table is not fully consumed once all
            rows have been drawn.
    """
    columns = ["TABBLK", "CENRACE", "CENHISP", "QAGE", "QSEX"]
    q = copy.deepcopy(queries)

    # Collect plain records and build the frame once; enlarging a DataFrame
    # one row at a time re-allocates it on every iteration.
    records = []
    for _ in range(q.total):
        row = construct_row(q, verbose=verbose)
        rrace, rhisp, rage, rsex = row
        records.append([blk_id, rrace, rhisp, rage, rsex])
        remove_row_from_queries(row, q)

    # Every count table must be exhausted by the sampled rows.
    assert(np.sum(q.race_hisp) == 0)
    assert(np.sum(q.white_hisp_age_sex) == 0)
    assert(np.sum(q.race_age_sex) == 0)

    return pd.DataFrame(records, columns=columns)

def remove_row_from_queries(row, q):
    """Subtract one sampled record from the count tables of ``q`` in place.

    ``q.total`` and ``q.race_hisp[race, hisp]`` are always decremented.
    The age/sex count is taken from ``q.white_hisp_age_sex`` when the
    coarsened race is 0 and from ``q.race_age_sex`` otherwise. This is the
    same rule ``sample_age`` and ``sample_sex`` use to choose the table they
    sample from, so a row is always removed from the table it was drawn from.

    Args:
        row: 4-tuple ``(race, hisp, age, sex)`` of indices.
        q: query object to mutate; nothing is returned.
    """
    rrace, rhisp, rage, rsex = row

    q.total -= 1
    q.race_hisp[rrace, rhisp] -= 1

    race_coarse = q.race_coarsened(rrace)
    if race_coarse == 0:
        q.white_hisp_age_sex[rhisp, rage, rsex] -= 1
    else:
        q.race_age_sex[race_coarse, rage, rsex] -= 1
    
def reconstruct_tract(queries):
    """Reconstruct every block of a tract and stack the results.

    Args:
        queries: mapping from block id to that block's query object, as
            returned by ``get_block_queries``.

    Returns:
        A DataFrame with columns ``TABBLK, CENRACE, CENHISP, QAGE, QSEX``
        containing the rows of every block's reconstruction, in the
        iteration order of ``queries``. Each block keeps its own row labels,
        so the index is not unique across blocks. An empty frame with the
        same columns is returned when ``queries`` is empty.
    """
    block_frames = [
        reconstruct_block(block_id, block_queries)
        for (block_id, block_queries) in queries.items()
    ]

    if not block_frames:
        return pd.DataFrame(columns=["TABBLK", "CENRACE", "CENHISP", "QAGE", "QSEX"])

    # One concat over all blocks instead of re-copying the accumulated frame
    # on every iteration. No empty seed frame takes part, which keeps the
    # result dtypes determined by the block frames alone.
    return pd.concat(block_frames)

def tests_passed(real_queries, recon_df):
    """Assert that a reconstruction reproduces the real query answers.

    Query objects are recomputed from ``recon_df`` with
    ``get_block_queries`` and compared, block by block, against
    ``real_queries``: the totals must be equal and every cell of
    ``race_hisp``, ``white_hisp_age_sex`` and ``race_age_sex`` must match.

    Args:
        real_queries: mapping from block id to the real query object.
        recon_df: reconstructed records for the same blocks.

    Returns:
        ``True`` (after printing "Tests passed!") when every check holds.

    Raises:
        AssertionError: on the first mismatching total or table.
    """
    recon_queries = get_block_queries(recon_df)

    for block_id, real_q in real_queries.items():
        recon_q = recon_queries[block_id]
        assert real_q.total == recon_q.total

        # Number of cells each table holds; a table matches when that many
        # element-wise comparisons come out equal.
        expected_cells = {
            "race_hisp": real_q.num_race * real_q.num_hisp,
            "white_hisp_age_sex": real_q.num_hisp * real_q.num_age * real_q.num_sex,
            "race_age_sex": real_q.num_race_coarse * real_q.num_age * real_q.num_sex,
        }
        for table_name, cell_count in expected_cells.items():
            agreeing = np.sum(getattr(real_q, table_name) == getattr(recon_q, table_name))
            assert agreeing == cell_count

    print("Tests passed!")
    return True
    
def get_block_queries(df):
    """Build one query object per block found in ``df``.

    Args:
        df: DataFrame with a ``TABBLK`` column identifying each row's block.

    Returns:
        A dict mapping each distinct ``TABBLK`` value (inserted in sorted
        order) to ``query_manager`` applied to that block's rows.
    """
    return {
        block_id: query_manager(df[df["TABBLK"] == block_id])
        for block_id in sorted(df["TABBLK"].unique())
    }
        
def get_num_reconstructed(recon, real):
    """Count how many reconstructed records match real records.

    `recon' and `real' are both pandas DataFrames containing the columns
    ``TABBLK, CENRACE, CENHISP, QAGE, QSEX``.

    Records are compared as multisets: for every distinct combination of
    the five attributes present in both frames, the smaller of the two
    occurrence counts is added to the result.

    Args:
        recon: reconstructed records.
        real: ground-truth records.

    Returns:
        The number of matched records, as an ``int``.
    """
    keys = ["TABBLK", "CENRACE", "CENHISP", "QAGE", "QSEX"]

    real_counts = real.groupby(keys, as_index=False).size()
    recon_counts = recon.groupby(keys, as_index=False).size()

    # A single inner join pairs every reconstructed combination with its
    # real counterpart; combinations absent from either side drop out.
    matched = recon_counts.merge(
        real_counts, on=keys, how="inner", suffixes=("_recon", "_real")
    )

    return int(matched[["size_recon", "size_real"]].min(axis=1).sum())

# Half-open [start, stop) age ranges; a range's position is its bucket index.
_AGE_BUCKETS = (
    (0, 5),
    (5, 10),
    (10, 15),
    (15, 18),
    (18, 20),
    (20, 21),
    (21, 22),
    (22, 25),
    (25, 30),
    (30, 35),
    (35, 40),
    (40, 45),
    (45, 50),
    (50, 55),
    (55, 60),
    (60, 62),
    (62, 65),
    (65, 67),
    (67, 70),
    (70, 75),
    (75, 80),
    (80, 85),
    (85, 126),
)


def age_buckets(age):
    """Given an age, return the index of the bucket it falls in.

    Buckets are the half-open ranges listed in ``_AGE_BUCKETS``; together
    they cover ages from 0 (inclusive) to 126 (exclusive).

    Args:
        age: age as a number. Fractional ages are placed in the bucket whose
            range contains them.

    Returns:
        The zero-based bucket index (0 through 22).

    Raises:
        ValueError: if ``age`` lies outside every bucket (negative, 126 or
            more, or NaN).
    """
    for idx, (start, stop) in enumerate(_AGE_BUCKETS):
        # A direct comparison is O(1) for any numeric type, unlike
        # `age in range(...)`, which scans the range for non-int inputs.
        if start <= age < stop:
            return idx

    raise ValueError(f"Age out of bounds: {age}")

In [3]:
# Load the tract CSV, keep only the five attributes used by the
# reconstruction, and replace each raw age with its age-bucket index.
df = (
    pd.read_csv("./dp-query-release/datasets/ppmf/ppmf_01097007000.csv")
    .loc[:, ['TABBLK', 'CENRACE', 'CENHISP', 'QAGE', 'QSEX']]
    .assign(QAGE=lambda frame: frame["QAGE"].apply(age_buckets))
)

# all_block_queries = get_block_queries(df)
# recon = reconstruct_tract(all_block_queries)
# recon

In [4]:
# tests_passed(all_block_queries, recon)
# get_num_reconstructed(recon, df)

# all_recons = pd.DataFrame(columns=df.columns)
# counter = 0
# while counter <100:
#     try:
#         recon = reconstruct_tract(all_block_queries)
#         print(counter)
#         counter += 1
#         all_recons = pd.concat([all_recons, recon])
#     except:
#         continue
        
# all_recons_unique = all_recons.groupby(all_recons.columns.tolist(), as_index=False).size()
# all_recons_unique

# all_recons_unique = all_recons_unique.sort_values(by=['size'], ascending=False)
# all_recons_unique

# def match_rate(recon, real, k):
#     """
#     """
#     num_reconstructed = 0
#     for i in range(k):
#         (blk, race, hisp, age, sex, size) = recon.iloc[i]
#         real_row = real.query(f'TABBLK == {blk} and ' +
#                               f'CENRACE == {race} and ' +
#                               f'CENHISP == {hisp} and ' +
#                               f'QAGE == {age} and ' +
#                               f'QSEX == {sex}')

#         if len(real_row) > 0:
#             num_reconstructed += 1
    
#     return num_reconstructed / k

# match_rate(all_recons_unique, df, 570)

# u = len(all_recons_unique)

# xs = []
# ys = []

# for k in range(1, u):
#     mr = match_rate(all_recons_unique, df, k)
    
#     xs.append(k/u)
#     ys.append(mr)
    
# plt.plot(xs, ys)
# plt.xlim([-0.1, 1.1])
# plt.ylim([-0.1, 1.1])
# plt.ylabel("Match Rate")
# plt.xlabel("k/u")
# plt.savefig("coarse_matchrate.png")

In [5]:
# Restrict the data to the rows of block 17 and display them.
in_block_17 = df["TABBLK"] == 17
block = df.loc[in_block_17]
block

Unnamed: 0,TABBLK,CENRACE,CENHISP,QAGE,QSEX
584,17,0,1,0,1
585,17,0,1,0,1
586,17,0,1,0,1
587,17,0,1,0,1
588,17,0,1,0,1
...,...,...,...,...,...
657,17,0,1,9,0
658,17,0,1,13,0
659,17,0,1,13,0
660,17,0,1,13,0


In [6]:
# Build the per-block query objects for `block` and run one randomized
# reconstruction from them.
all_block_queries = get_block_queries(block)
recon = reconstruct_tract(all_block_queries)

removing white
before: 10.0
after: 9.0

removing white
before: 7.0
after: 6.0

removing white
before: 4.0
after: 3.0

removing white
before: 10.0
after: 9.0

removing white
before: 9.0
after: 8.0

removing white
before: 13.0
after: 12.0

removing white
before: 8.0
after: 7.0

removing white
before: 6.0
after: 5.0

removing white
before: 9.0
after: 8.0

removing white
before: 10.0
after: 9.0

removing white
before: 5.0
after: 4.0

removing white
before: 4.0
after: 3.0

removing white
before: 7.0
after: 6.0

removing white
before: 5.0
after: 4.0

removing white
before: 9.0
after: 8.0

removing white
before: 6.0
after: 5.0

removing white
before: 5.0
after: 4.0

removing non-white
Race fine:  5
Race coarse:  5
before:  2.0
after:  1.0

removing white
before: 8.0
after: 7.0

removing white
before: 4.0
after: 3.0

removing white
before: 12.0
after: 11.0

removing white
before: 11.0
after: 10.0

removing white
before: 3.0
after: 2.0

removing white
before: 4.0
after: 3.0

removing non-white


In [None]:
# Rebuild the query objects for `block` and display the resulting dict
# (block id -> query object).
all_block_queries = get_block_queries(block)
all_block_queries

In [None]:
# Inspect the race_age_sex count table of block 17.
all_block_queries[17].race_age_sex

In [None]:
# Show the rows of `block` whose CENRACE equals 21.
block.query("CENRACE == 21")

In [None]:
# Per-QSEX count of non-null values in each remaining column of the reconstruction.
recon.groupby(["QSEX"]).count()

In [None]:
# block.groupby(["QSEX"]).count()
# Recompute the per-block query objects from the reconstructed rows.
reconq = get_block_queries(recon)

In [None]:
# Assert that the reconstruction reproduces the real query tables, then
# count how many reconstructed rows match rows of `df`.
tests_passed(all_block_queries, recon)
get_num_reconstructed(recon, df)

In [None]:
# Inspect the white_hisp_age_sex count table of block 17.
all_block_queries[17].white_hisp_age_sex

In [None]:
# Element-wise comparison of the reconstructed and real white_hisp_age_sex
# tables for block 17.
reconq[17].white_hisp_age_sex == all_block_queries[17].white_hisp_age_sex

In [None]:
# Sum of all counts in block 17's race_age_sex table.
np.sum(all_block_queries[17].race_age_sex)