In [1]:
import math
import numpy as np
import heapq as hq
import pandas as pd


def huntington_hill(populations_array,num_seats):
    ### populations is a numpy array of populations
    ### num_seats is the desired number of seats to assign
    num_states = len(populations_array)
    if (num_states > num_seats):
        print("More states than seats!")
        return None
    representatives = np.ones(num_states)
    priorities = populations_array / math.sqrt(2)
    last_state = None
    last_priority = None
    second_last_state = None
    second_last_state_priority = None
    for j in range(num_states,num_seats):
        highest = np.argmax(priorities) # index of state with highest priority. It will receive this seat
        if(highest != last_state):
            # If the state getting this seat is not the same as last one, 
            # update the index and priority of second-last state to receive a seat
            second_last_state = last_state
            second_last_state_priority = last_priority
        # Now record the information about this seat
        last_state = highest
        last_priority = priorities[highest] 
        representatives[highest] +=  1    
        # Update this state's priority
        priorities[highest] = populations_array[highest]/math.sqrt(representatives[highest] * representatives[highest]+1)
        
    return (representatives, # a 1D array of apportioned representatives
            priorities, # 1D array of  priorities of states for next 
            last_state, # the index of the state last assigned a seat
            last_priority, # the priority for the most recently assigned seat.
            second_last_state, # the index of the second-last state assigned a seat
            second_last_state_priority) # the priority of the seat last assigned to second-last state
  
  

## Experiments with Indian example Data

In [2]:
example_populations_table = pd.DataFrame(
    [["JAMMU & KASHMIR",10143700],["HIMACHAL PRADESH",6077900],["PUNJAB",24358999],
    ["CHANDIGARH",900635],["UTTARANCHAL",8489349],["HARYANA",21144564],["DELHI",13850507],
    ["RAJASTHAN",56507188],["UTTAR PRADESH",166197921],["BIHAR",82998509],
    ["SIKKIM",540851],["ARUNACHAL PRADESH",1097968],["NAGALAND",1990036],
    ["MANIPUR",2166788],["MIZORAM",888573],["TRIPURA",3199203],
    ["MEGHALAYA",2318822],["ASSAM",26655528],["WEST BENGAL",80176197],
    ["JHARKHAND",26945829],["ORISSA",36804660],["CHHATTISGARH",20833803],
    ["MADHYA PRADESH",60348023],["GUJARAT",50671017],["DAMAN & DIU",158204],
    ["DADRA & NAGAR HAVELI",220490],["MAHARASHTRA",96878627],["ANDHRA PRADESH",76210007],
    ["KARNATAKA",52850562],["GOA",1347668],["LAKSHADWEEP",60650],
    ["KERALA",31841374],["TAMIL NADU",62405679],["PONDICHERRY",974345],
    ["ANDAMAN & NICOBAR ISLANDS",356152]
    ] )

example_populations_array = example_populations_table[1].values

number_of_seats = 545

representatives, priorities, last_state, last_priority, second_last_state,  second_last_state_priority \
= huntington_hill(example_populations_array, number_of_seats)

In [3]:
absolute_gaps = last_priority - priorities
print(min(absolute_gaps))

79.08356696041301


The __absolute gap__ for a given state is simply the difference between the priority of the last assigned seat and the state's priority.It reflects how much a state's priority would have to _increase_ for the apportionment results to change. It 

The __normalized gap__ for a given state is the number of _people_ who would have to be added to that state for it to gain a seat, assuming all other states' populations remained the same. The code I have does not correctly compute the gap for the last state to receive a seat. The calculation is correct for all other states. (To get the gap right for the last state, I need to use the info about the second-to-last state.)

In [4]:
normalization_factors = np.array(list(map(math.sqrt, representatives*(representatives + 1))))
normalized_gaps = (last_priority - priorities) * normalization_factors
# Need to fix computation of normalized gap for last state.
min_normalized_gap = min(normalized_gaps)
normalized_runner_up = np.argmin(normalized_gaps)
print("The normalized runner up:", example_populations_table[0][normalized_runner_up], min_normalized_gap)

The normalized runner up: RAJASTHAN 2332.6301014023743


To do: __write an algorithm to compute the distance to instability. __

## How likely is noise addition to changed apportioned seats? 

In [6]:
def huntington_hill_noise(populations_array, num_seats, epsilon, num_reps):
    ### populations_array is a numpy array  containing the states' populations
    ### num_seats is the desired number of seats to assign
    ### epsilon is the parameter used for DP noise addition (Laplace noise)
    ### num_reps is the number of attempts that are made
    ### Returns a numpy array
    ref_reps, _, _, _, _, _ = huntington_hill(populations_array, num_seats)
    changed_outputs = 0
    total_changes = 0
    for i in range(num_reps):
        noisy_pops = Laplace_Histogram(populations_array, epsilon)
        reps, _, _, _, _, _ = huntington_hill(noisy_pops, num_seats)
        if not np.array_equal(reps, ref_reps):
            changed_outputs = changed_outputs + 1
        total_changes = total_changes + np.linalg.norm(reps - ref_reps, ord = 1)
    if (changed_outputs == 0):
        avg_change = 0.0
    else:
        avg_change = total_changes / changed_outputs
    return changed_outputs, avg_change

def Laplace_Histogram(populations_array, epsilon):
    num_states = len(populations_array)
    return populations_array + np.random.laplace(scale = 1/epsilon, size = (num_states))

Trying this with the Indian population data:

In [22]:
(n, e) = huntington_hill_noise(example_populations_array, 
                      number_of_seats,
                      epsilon = 0.00003, 
                      num_reps = 100)
print("Out of 100 trials, there were {} trials where the \
      apportionment totals differed from the exact totals. \
      Of those, the average number of seats that changed \
      was {}.".format(n, e))

Out of 100 trials, there were 17 trials where the apportionment totals differed from the exact totals. Of those, the average number of seats that changed was 2.0.


# Experiments with US historical data

First, we load the population data into memory. 

This data comes from two sources: 
* 1790 through 1990: https://www.census.gov/population/www/censusdata/pop1790-1990.html
* 2000, 2010, and 2017 (estimated): https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_population

The data on the number of seats historically apportioned comes from 
https://en.wikipedia.org/wiki/United_States_congressional_apportionment

In [9]:
us_pops = pd.read_csv('historical_populations.csv', header = 0)

us_seats = pd.read_csv('historical_seats_apportioned.csv', header = 0)


number_of_seats = 435 ##Need to update this to an array of historical seat totals

Now let's delete the total US population as well as the District of Columbia


In [10]:
us_pops = us_pops.drop([0,9]) 

print(us_pops.columns)
us_pops.head()

Index(['Name', '2017', '2010', '2000', '1990', '1980', '1970', '1960', '1950',
       '1940', '1930', '1920', '1910', '1900', '1890', '1880', '1870', '1860',
       '1850', '1840', '1830', '1820', '1810', '1800', '1790',
       'Year of first census', 'No significant change since', 'FIPS Code'],
      dtype='object')


Unnamed: 0,Name,2017,2010,2000,1990,1980,1970,1960,1950,1940,...,1850,1840,1830,1820,1810,1800,1790,Year of first census,No significant change since,FIPS Code
1,Alabama,4833722,4779736,4447100,4040587,3893888,3444165,3266740,3061743,2832961,...,771623.0,590756.0,309527.0,127901.0,9046.0,1250.0,,1800,1820,1.0
2,Alaska,739795,710231,626932,550043,401851,300382,226167,128643,72524,...,,,,,,,,1880,1880,2.0
3,Arizona,7016270,6392017,5130632,3665228,2718215,1770900,1302161,749587,499261,...,,,,,,,,1860,1870,4.0
4,Arkansas,2959373,2915918,2673400,2350725,2286435,1923295,1786272,1909511,1949387,...,209897.0,97574.0,30388.0,14273.0,1062.0,,,1810,1830,5.0
5,California,39536653,37253956,33871648,29760021,23667902,19953134,15717204,10586223,6907387,...,92597.0,,,,,,,1850,1860,6.0


We also need to process the number of seats apportioned in each Census year.

* According to Wikipeda, "Congress failed to pass any reapportionment to implement the 1920 United States Census so despite population shift, distribution of seats from 1913 remained in effect until 1933." For experiments, we will use the total from 1910. 

* For experiments with the 2017 estimates, we use the total from 2010.

Both of these fictitious numbers happen to be 435. 

In [12]:
us_seats.head()

Unnamed: 0,Year,1790,1800,1810,1820,1830,1840,1850,1860,1870,...,1910,1930,1940,1950,1960,1970,1980,1990,2000,2010
0,Number of seats apportioned,105,142,182,213,240,223,234,241,292,...,435,435,435,435,435,435,435,435,435,435


In [13]:
us_seats['1920'] = us_seats['1910']
us_seats['2017'] = us_seats['2010']

In [15]:
representatives2010, _, _, _, _, _ \
= huntington_hill(us_pops['2010'].values, num_seats = 435)
print(representatives2010)

[ 7.  1.  9.  4. 51.  7.  5.  1. 26. 14.  2.  2. 18.  9.  5.  4.  6.  7.
  2.  8.  9. 14.  8.  4.  9.  1.  3.  4.  2. 12.  3. 27. 13.  1. 16.  5.
  6. 18.  2.  7.  1.  9. 35.  4.  1. 11. 10.  3.  8.  1.]


In [16]:
list_of_years = us_pops.columns.drop(['Name', 'Year of first census',
       'No significant change since', 'FIPS Code'])

In [37]:
for year in list_of_years[0:15]: 
    (n,e) = huntington_hill_noise(us_pops[year].values, 
                      num_seats = us_seats[year][0],
                      epsilon = 0.0001, 
                      num_reps = 100)
    print('{} {} {:3d} {:.2f}'.format(year, us_seats[year][0], n, e))

2017 435  50 2.24
2010 435  26 2.08
2000 435  29 2.07
1990 435  24 2.08
1980 435  13 2.00
1970 435  55 2.36
1960 435  95 3.37
1950 435  48 2.12
1940 435  89 2.83
1930 435  77 2.42
1920 435  96 3.12
1910 435  86 2.67
1900 386  91 2.81
1890 356   0 0.00
1880 325   0 0.00


In [21]:
## Need to figure out how to handle years with missing populations. 