# Imports

In [None]:
import cvxpy as cvx
import numpy as np
import pandas as pd

# CVXPY List balancer

In [None]:
def balance_cvx(data_table, marginals, sample_weights, control_weights=None):
    """Balance sample weights against control marginals by entropy maximisation.

    Solves for one non-negative weight per row of ``data_table``. The
    objective maximised is ``sum(entr(x) + log(sample_weights) * x)``, i.e.
    it minimises ``sum(x * log(x / sample_weights))``.

    Args:
        data_table: 2-D table with one row per sample and one column per
            control.
        marginals: Control totals; ``marginals.shape[1]`` must equal the
            number of columns of ``data_table``.
        sample_weights: Initial weights; ``sample_weights.shape[0]`` must
            equal the number of rows of ``data_table``.
        control_weights: Optional weights on the controls. When given, a
            relaxation factor ``z`` is solved for each control and the
            constraint becomes ``x.T * data_table == marginals * z``.

    Returns:
        ``x.value`` when ``control_weights`` is None, otherwise the tuple
        ``(x.value, z.value)``.

    Raises:
        AssertionError: If the shapes of the inputs do not line up.
    """
    n_samples, n_controls = data_table.shape

    # Refuse to build a problem whose pieces cannot be multiplied together.
    shapes_agree = (
        marginals.shape[1] == n_controls and
        sample_weights.shape[0] == n_samples
    )
    assert shapes_agree, 'Dimensions mismatch'

    x = cvx.Variable(n_samples)

    if control_weights is None:
        # Hard constraints: the weighted table must hit the marginals exactly.
        sample_entropy = cvx.sum_entries(
            cvx.entr(x) + cvx.mul_elemwise(cvx.log(sample_weights), x)
        )
        exact_problem = cvx.Problem(
            cvx.Maximize(sample_entropy),
            [
                x >= 0,
                x.T * data_table == marginals,
            ],
        )
        exact_problem.solve(solver=cvx.CVXOPT)

        return x.value

    # Relaxed formulation: each marginal is scaled by its own factor z, and
    # the entropy of z (weighted by control_weights) joins the objective.
    z = cvx.Variable(n_controls)

    sample_entropy = cvx.sum_entries(
        cvx.entr(x) + cvx.mul_elemwise(cvx.log(sample_weights), x)
    )
    control_entropy = cvx.sum_entries(control_weights * (cvx.entr(z)))

    relaxed_problem = cvx.Problem(
        cvx.Maximize(sample_entropy + control_entropy),
        [
            x >= 0,
            z >= 0,
            x.T * data_table == cvx.mul_elemwise(marginals, z.T),
        ],
    )
    # This branch is solved with SCS, unlike the CVXOPT call used above.
    relaxed_problem.solve(solver=cvx.SCS)

    return x.value, z.value


# Discretize

In [None]:
def discretize_weights(data_table, marginals, sample_weights):
    """Choose which samples get their fractional weight rounded up.

    Each weight is split into an integer part (``astype(int)``) and a
    fractional residual. A boolean variable per sample is then solved for
    with GLPK_MI so that the selected samples approximately fill the part of
    each marginal left over by the integer parts, favouring samples with a
    large residual (the objective coefficient is the log of the residual).

    Args:
        data_table: 2-D table with one row per sample and one column per
            control.
        marginals: Control totals, one per column of ``data_table``.
        sample_weights: Fractional weights as a NumPy array (``astype`` is
            called on it), one per row of ``data_table``.

    Returns:
        ``y.value``, the solved boolean selection variable.
    """
    num_samples, num_controls = data_table.shape

    # Integerize the weights and keep what was cut off.
    whole_part = sample_weights.astype(int)
    fractional_part = sample_weights - whole_part

    # Share of each marginal that the integer parts do not yet account for.
    unmet_marginals = marginals - np.dot(whole_part.T, data_table)

    # Replace any NaNs before taking the log.
    # NOTE(review): nan_to_num maps NaN to 0 and log(0) is -inf, so a NaN or
    # an exactly-integer weight yields a -inf objective coefficient -- confirm
    # the solver copes with that.
    cleaned_fraction = np.nan_to_num(fractional_part)
    log_fraction = np.log(cleaned_fraction)

    y = cvx.Bool(num_samples)

    # Relaxation factors: U loosens the upper bound, V the lower bound.
    U = cvx.Variable(num_controls)
    V = cvx.Variable(num_controls)

    # Reward selecting high-residual samples; charge heavily for any slack.
    selection_reward = cvx.sum_entries(cvx.mul_elemwise(log_fraction, y))
    upper_slack_cost = 999 * cvx.sum_entries(U)
    lower_slack_cost = 999 * cvx.sum_entries(V)
    objective = cvx.Maximize(
        selection_reward - upper_slack_cost - lower_slack_cost
    )

    selected_totals_upper = y.T * data_table <= unmet_marginals + U.T
    selected_totals_lower = y.T * data_table >= unmet_marginals - V.T
    constraints = [
        selected_totals_upper,
        selected_totals_lower,
        U >= 0,
        V >= 0,
    ]

    problem = cvx.Problem(objective, constraints)
    problem.solve(solver=cvx.GLPK_MI, verbose=True)

    return y.value


# Example problem

In [None]:
# Here's the data table of households: one row per sample household and one
# column per control, in the same order as the entries of `marginals` below.
# np.asmatrix is used instead of the np.mat alias, which NumPy 2.0 removed;
# both build the same np.matrix object.
data_table = np.asmatrix([
    [1, 1, 0, 1, 1],
    [1, 1, 0, 1, 1],
    [1, 0, 1, 1, 1],
    [1, 0, 1, 0, 1],
])

# Here are the controls / marginals that come from the census summary.
# (These are just made up)
marginals = np.asmatrix([
    400,  # Total households
    200,  # 1 person hhs
    200,  # 2 person hhs
    300,  # People 0 -18
    400,  # People 19-36
])

# Initial HH weights, one per row of data_table
sample_weights = np.array([
    10,
    10,
    10,
    10,
])

# Initial control weights that come from PUMS, one per control column
control_weights = np.asmatrix([
    1,
    1,
    1,
    1,
    1,
])

Balancing the weights without the control weights (i.e. without relaxation factors) is solvable, and leads to the expected result:

In [None]:
# Unrelaxed solve: no control weights are passed, so only the household
# weights come back.
hh_weights = balance_cvx(
    data_table,
    marginals,
    sample_weights,
)
print('Household Weights: \n{}'.format(hh_weights))

When the subjective weights are added, we're solving for the relaxation factors as well. So we want:

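$\min_{x_n, z_i} \sum_n x_n \ln \frac{x_n}{w_n} + \sum_i \mu_i z_i \ln z_i$,

where $w_n$ is the initial weight of household $n$ and $\mu_i$ is the weight placed on control $i$.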

Subject to:

$\sum_n a_{ni} \times x_n = A_i \times z_i$,

$x_n \geq 0, z_i \geq 0$

In [None]:
# Relaxed solve: passing control weights makes balance_cvx return a
# relaxation factor per control alongside the household weights.
hh_weights, relaxation_factors = balance_cvx(
    data_table,
    marginals,
    sample_weights,
    control_weights=control_weights,
)
print('Household Weights: \n{}\n'.format(hh_weights))
print('Relaxation factors: \n{}\n'.format(relaxation_factors))
# Divide by the first relaxation factor, i.e. the one for the first control
# column (labelled "Total households" in the marginals).
print('Weights/Relaxation: \n{}\n'.format(hh_weights / relaxation_factors[0, 0]))

After running the solver we get a list of household weights $x_n$ and a relaxation factor $z_i$ for each control, such that $\sum_n a_{ni} \times x_n = A_i z_i$. At the very least, the household weights have to be rescaled to sum to the total, correct?

Then to discretize the fractional component: