In [None]:
import pandas as pd
import numpy as np
import random
import itertools

In [None]:
def get_target_marginals(d):
    """Split a nested ``{factor: {level: target}}`` dict into names and targets.

    Parameters
    ----------
    d : dict
        Maps each factor name to a dict of ``level -> target value``.

    Returns
    -------
    factors : list
        Factor names in the insertion order of ``d``.
    targets : numpy.ndarray
        One row per factor, holding that factor's target values ordered by
        sorted level key. When every factor has the same number of levels this
        is a regular 2-D array; otherwise it is a 1-D object array whose
        elements are 1-D arrays (``targets[i][k]`` works in both cases).
    """
    factors = list(d.keys())
    # Order each factor's targets by level key so positions are deterministic.
    rows = [[levels[name] for name in sorted(levels)] for levels in d.values()]

    # Equal-length rows (or no rows at all) form a regular array.
    if len({len(row) for row in rows}) <= 1:
        return factors, np.array(rows)

    # Ragged rows: np.array(rows) raises ValueError on current NumPy, so build
    # an explicit object array holding one 1-D array per factor.
    targets = np.empty(len(rows), dtype=object)
    for pos, row in enumerate(rows):
        targets[pos] = np.array(row)
    return factors, targets

def get_table(df, targets):
    """Cross-tabulate ``df`` over the factors named in ``targets``.

    The first factor forms the crosstab rows and the remaining factors form
    the columns; the 2-D counts are then reshaped into an array with one axis
    per factor. The computed shape is printed as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain one column per key of ``targets``.
    targets : dict
        Nested ``{factor: {level: target}}`` dict passed to
        ``get_target_marginals``.

    Returns
    -------
    tuple
        ``(factors, target_marginals, table)``.
    """
    factors, target_marginals = get_target_marginals(targets)

    row_series = df[factors[0]]
    column_series = [df[name] for name in factors[1:]]
    cross_tab = pd.crosstab(row_series, column_series, dropna=False)

    # One axis per factor, sized by the number of distinct values in its column.
    # NOTE(review): the reshape below assumes the crosstab holds exactly one
    # cell per combination of distinct values -- confirm for data with NaNs.
    level_counts = [df[name].unique().shape[0] for name in factors]
    shape = tuple(level_counts)
    print(shape)

    table = cross_tab.values.reshape(shape)

    return factors, target_marginals, table


In [None]:
def get_coordinates(M):
    """Return every index tuple of ``M`` as a list, last axis varying fastest."""
    axis_ranges = (range(size) for size in M.shape)
    return list(itertools.product(*axis_ranges))

# returns a dict with the marginal total for each index along dimension i of M
def get_marginals(M, i):
    """Sum the cells of ``M`` for each index along axis ``i``.

    Returns a dict mapping every index along axis ``i`` to the sum of all
    cells of ``M`` whose ``i``-th coordinate equals that index. Cells are
    added in the order produced by ``get_coordinates``.
    """
    totals = {}
    for coord in get_coordinates(M):
        level = coord[i]
        # Start from 0 so the running total matches a plain left-to-right sum.
        totals[level] = totals.get(level, 0) + M[coord]

    return totals

def get_all_marginals(M):
    """Collect the marginal totals of ``M`` along every axis.

    Returns
    -------
    numpy.ndarray
        Row ``i`` holds the totals from ``get_marginals(M, i)``. When all axes
        of ``M`` have the same length this is a regular 2-D array; otherwise
        it is a 1-D object array of 1-D arrays, one per axis.
    """
    rows = [list(get_marginals(M, axis).values()) for axis in range(len(M.shape))]

    # Equal-length rows form a regular array.
    if len({len(row) for row in rows}) <= 1:
        return np.array(rows)

    # Axes of different lengths give ragged rows; np.array(rows) raises
    # ValueError on current NumPy, so store one array per axis instead.
    marginals = np.empty(len(rows), dtype=object)
    for pos, row in enumerate(rows):
        marginals[pos] = np.array(row)
    return marginals

# returns a dict with the (value, coordinate) pairs for each index along dimension i of M
def get_counts(M, i):
    """Group the cells of ``M`` by their index along axis ``i``.

    Returns a dict mapping each index along axis ``i`` to a list of
    ``(cell_value, coordinate)`` pairs, in the order produced by
    ``get_coordinates``.
    """
    grouped = {}
    for coord in get_coordinates(M):
        grouped.setdefault(coord[i], []).append((M[coord], coord))

    return grouped

# IPU step: compute the updated values along dimension i
def update_values(M, i, u):
    """Compute rescaled cell values so the marginals along axis ``i`` match ``u``.

    Each cell whose ``i``-th coordinate is ``k`` is multiplied by
    ``u[k] / marginal[k]``, where ``marginal[k]`` is the current total for
    index ``k`` along axis ``i``. ``M`` itself is not modified.

    Parameters
    ----------
    M : numpy.ndarray
        Current table.
    i : int
        Axis being adjusted.
    u : sequence
        Target totals for axis ``i``, indexable by position along that axis.

    Returns
    -------
    list
        ``(coordinate, new_value)`` pairs covering every cell of ``M``.
    """
    marg = get_marginals(M, i)
    vals = get_counts(M, i)

    updates = []
    for level, cells in vals.items():
        total = marg[level]
        if total == 0:
            # A zero marginal cannot be rescaled: dividing would produce NaN
            # (0 * u / 0) and contaminate every later sweep. Keep the cells
            # as they are instead.
            updates.extend((coord, value) for value, coord in cells)
            continue
        target = u[level]
        updates.extend((coord, value * target / total) for value, coord in cells)

    return updates

def ipf_update(M, u):
    """Run one fitting sweep over every axis of ``M``.

    For each axis in turn, the values returned by ``update_values`` are
    written back into ``M`` in place. After the sweep, the marginals of ``M``
    are compared with ``u`` via ``get_deltas``.

    Returns
    -------
    tuple
        ``(M, d)``: the same (mutated) array and the per-axis deltas.
    """
    for axis, _ in enumerate(M.shape):
        for cell, new_value in update_values(M, axis, u[axis]):
            M[cell] = new_value

    observed = get_all_marginals(M)
    deltas = get_deltas(observed, u)

    return M, deltas

# per-row L2 norm of (o - t), e.g. observed marginals minus target marginals
def get_deltas(o, t):
    """Return, for each row ``r`` of ``o``, the 2-norm of ``o[r] - t[r]``.

    ``o`` must expose ``.shape``; its first dimension sets the number of rows
    compared. The result is a 1-D array with one norm per row.
    """
    norms = []
    for row in range(o.shape[0]):
        difference = np.array(o[row]) - np.array(t[row])
        norms.append(np.linalg.norm(difference, 2))
    return np.array(norms)


def get_weights(X, max_iters=50, zero_threshold=0.0001, convergence_threshold=3, debug=False, targets=None):
    """Iteratively fit the table ``X`` to target marginals.

    Parameters
    ----------
    X : numpy.ndarray
        Seed table; it is not modified.
    max_iters : int
        Maximum number of ``ipf_update`` sweeps.
    zero_threshold : float
        A sweep counts as "stalled" when the 2-norm of the change in the
        per-axis deltas since the previous sweep is below this value.
    convergence_threshold : int
        Number of consecutive stalled sweeps after which fitting stops.
    debug : bool
        When true, print the per-axis deltas and their change on every sweep.
    targets : sequence, optional
        Target marginals, indexed as ``targets[axis][index]``. When omitted,
        the module-level variable ``u`` is used, which preserves the behaviour
        of existing ``get_weights(X)`` calls.

    Returns
    -------
    tuple
        ``(M, w)``: the fitted table and the same table divided by its sum.

    Raises
    ------
    NameError
        If ``targets`` is omitted and no module-level ``u`` exists.
    """
    if targets is None:
        # Backward compatibility: earlier versions read the global `u` directly.
        targets = u  # noqa: F821

    # Always work on a float copy: ipf_update writes fractional values in
    # place, which an integer array would silently truncate.
    M = X.astype('float64')

    d_prev = np.zeros(len(M.shape))
    count_zero = 0

    for _ in range(max_iters):
        M, d_next = ipf_update(M, targets)
        d = np.linalg.norm(d_prev - d_next, 2)

        # Count only *consecutive* stalled sweeps; any real change resets the run.
        if d < zero_threshold:
            count_zero += 1
        else:
            count_zero = 0

        if debug:
            print(','.join([f'{v:.5f}' for v in d_next]), d)
        d_prev = d_next

        # Stop once the change stayed below the threshold for enough sweeps in a row.
        if count_zero >= convergence_threshold:
            break

    w = M / M.sum()
    return M, w

In [None]:
# Marginal dictionaries, one per factor.
# The lookups below read the frames `ddg`, `edg`, `hdg` and `edudg`, which are
# not defined in this part of the notebook.


def _acs_value(frame, code):
    """Return the first 'value' entry among rows of ``frame`` whose 'id' equals ``code``."""
    return frame.loc[frame['id'] == code, 'value'].values[0]


# NOTE(review): unlike the dictionaries below, these two map each level to a
# variable code string rather than to a looked-up value -- confirm intended.
temp_race = {
    'white': 'DP05_0064E',
    'black': 'DP05_0065E',
    'native': 'DP05_0066E',
    'asian': 'DP05_0067E',
    'pacific-islander': 'DP05_0068E',
    'other': 'DP05_0069E',
}
temp_sex = {'male': 'DP05_0002E', 'female': 'DP05_0003E'}

# NOTE(review): these values are combined with `+` without an int() cast,
# unlike the income/vehicle/education lookups; if 'value' holds strings this
# concatenates instead of adding -- TODO confirm the column dtype.
# The key 'tenager' is kept as spelled because it is a runtime dictionary key.
temp_age = {
    'child': _acs_value(ddg, 'DP05_0005E'),
    'tenager': (
        _acs_value(ddg, 'DP05_0006E')
        + _acs_value(ddg, 'DP05_0007E')
        + _acs_value(ddg, 'DP05_0008E')
    ),
    'young-adult': _acs_value(ddg, 'DP05_0009E'),
    'adult': (
        _acs_value(ddg, 'DP05_0010E')
        + _acs_value(ddg, 'DP05_0011E')
        + _acs_value(ddg, 'DP05_0012E')
        + _acs_value(ddg, 'DP05_0013E')
    ),
    'senior': (
        _acs_value(ddg, 'DP05_0014E')
        + _acs_value(ddg, 'DP05_0015E')
        + _acs_value(ddg, 'DP05_0016E')
        + _acs_value(ddg, 'DP05_0017E')
    ),
}

# NOTE(review): bracket '0' reads the same code (DP03_0076E) as bracket '1' --
# confirm this duplication is intended.
temp_inc = {
    bracket: int(_acs_value(edg, code))
    for bracket, code in [
        ('1', 'DP03_0076E'),
        ('2', 'DP03_0077E'),
        ('3', 'DP03_0078E'),
        ('4', 'DP03_0079E'),
        ('5', 'DP03_0080E'),
        ('6', 'DP03_0081E'),
        ('7', 'DP03_0082E'),
        ('8', 'DP03_0083E'),
        ('9', 'DP03_0084E'),
        ('10', 'DP03_0085E'),
        ('0', 'DP03_0076E'),
    ]
}

temp_veh = {
    vehicles: int(_acs_value(hdg, code))
    for vehicles, code in [
        ('1', 'DP04_0059E'),
        ('2', 'DP04_0060E'),
        ('3', 'DP04_0061E'),
        ('0', 'DP04_0058E'),
    ]
}

# counted from PUMS
temp_np = {
    '2': 13904,
    '1': 7603,
    '4': 7276,
    '3': 5979,
    '5': 5170,
    '6': 2328,
    '7': 1036,
    '8': 424,
    '9': 288,
    '11': 143,
    '10': 120,
    '15': 45,
    '20': 40,
    '17': 34,
    '16': 32,
    '12': 24,
    '14': 14,
}

# Each education level is the integer sum of the listed S1501 variables.
temp_educ = {
    level: sum(int(_acs_value(edudg, code)) for code in codes)
    for level, codes in [
        ('below-high', ('S1501_C01_002E', 'S1501_C01_007E', 'S1501_C01_008E')),
        ('High-school', ('S1501_C01_003E', 'S1501_C01_009E', 'S1501_C01_010E')),
        ('college', ('S1501_C01_004E', 'S1501_C01_011E')),
        ('Bachelor', ('S1501_C01_005E', 'S1501_C01_012E')),
        ('Graduate', ('S1501_C01_013E',)),
    ]
}

In [None]:
# Build the seed table and target marginals; `df` and `margin_dict` are not
# defined in this part of the notebook.
# `u` must stay a module-level name: get_weights(X) reads it when called
# without explicit targets.
f, u, X = get_table(df, margin_dict)

# Replace NaN cells with 0 and make sure the table is float64 before fitting.
X = np.nan_to_num(X, nan=0.0)
X = X.astype('float64')

M, w = get_weights(X)