<a href="https://colab.research.google.com/github/bencweems/gerrymandering/blob/master/gerrymandering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import csv
import pandas as pd

In [None]:
# Load the raw House election returns. Later cells read the 'year', 'state', 'district',
# 'party', 'candidatevotes' and 'totalvotes' columns of this frame.
# The path is relative to the kernel's working directory.
all_data = pd.read_csv('1976-2018-house2.csv')

In [None]:
# Keep only the rows recorded for the 2018 election.
results_2018 = all_data.loc[all_data['year'] == 2018]

In [None]:
# Restrict to the two major parties; rows for any other party are dropped.
results_2018_dr = results_2018[
    (results_2018['party'] == 'democrat') | (results_2018['party'] == 'republican')
]

In [None]:
# Project down to the columns the percentage computation needs; .copy() detaches the
# result from the filtered view so it can be modified safely.
results_2018_dr = results_2018_dr.loc[
    :, ['year', 'state', 'district', 'party', 'candidatevotes', 'totalvotes']
].copy()

In [None]:
def get_pct(row, party):
    """Return the row's vote share in percent if the row belongs to ``party``.

    Args:
        row: Mapping-like record with 'party', 'candidatevotes' and 'totalvotes' entries.
        party: Party label to credit.

    Returns:
        ``100 * candidatevotes / totalvotes`` when the row's party equals ``party`` and
        ``totalvotes`` is truthy; otherwise 0.
    """
    if row['party'] != party:
        return 0
    if not row['totalvotes']:
        # A zero total would divide by zero; report no share instead.
        return 0
    return 100 * row['candidatevotes'] / row['totalvotes']

In [None]:
# Per-row vote share credited to each major party (0 for the other party's rows).
# The 'dem_pct' / 'rep_pct' columns are computed in this cell because no earlier cell adds
# them to results_2018_dr; selecting them directly raises KeyError on a fresh kernel.
# results_2018_dr itself is left unmodified so the cell can be re-run safely.
results_2018_pct = results_2018_dr[['year', 'state', 'district']].copy()
results_2018_pct['dem_pct'] = results_2018_dr.apply(lambda row: get_pct(row, 'democrat'), axis=1)
results_2018_pct['rep_pct'] = results_2018_dr.apply(lambda row: get_pct(row, 'republican'), axis=1)

In [None]:
# Collapse the per-candidate rows into one row per (year, state, district) by summing
# the percentage columns.
results_2018_pct = results_2018_pct.groupby(['year', 'state', 'district']).sum()

In [None]:
# Display the per-district Democratic / Republican percentages for 2018.
results_2018_pct

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,dem_pct,rep_pct
year,state,district,Unnamed: 3_level_1,Unnamed: 4_level_1
2018,Alabama,1,36.776483,63.156333
2018,Alabama,2,38.425938,61.388410
2018,Alabama,3,36.218442,63.717310
2018,Alabama,4,20.129108,79.774775
2018,Alabama,5,38.894707,61.020129
2018,...,...,...,...
2018,Wisconsin,5,37.987801,61.934239
2018,Wisconsin,6,44.463723,55.469214
2018,Wisconsin,7,38.504213,60.110581
2018,Wisconsin,8,36.275679,63.694209


In [None]:
def get_pct_results(data, year):
    """Compute per-district major-party vote shares and the Republican margin for one year.

    Args:
        data: DataFrame with 'year', 'state', 'district', 'party', 'candidatevotes'
            and 'totalvotes' columns.
        year: Election year to select.

    Returns:
        A 1-tuple ``(table,)``. ``table`` is indexed by (year, state, district) and has the
        columns 'dem_pct', 'rep_pct' and 'rep_margin' (``rep_pct - dem_pct``), all in
        percentage points. The tuple wrapper is kept because existing callers index the
        result with ``[0]``.
    """
    year_rows = data[data['year'] == year]
    # Only the two major parties contribute; other parties' rows are dropped.
    major = year_rows[(year_rows['party'] == 'democrat') | (year_rows['party'] == 'republican')]

    # Share of the district's total vote won by each row's candidate, in percent.
    share = 100 * major['candidatevotes'] / major['totalvotes']
    # Rows with a zero total are reported as 0 rather than as a division artefact.
    has_votes = major['totalvotes'] != 0

    pct = major[['year', 'state', 'district']].copy()
    # Credit each row's share to its own party's column and 0 to the other party's.
    pct['dem_pct'] = share.where((major['party'] == 'democrat') & has_votes, 0)
    pct['rep_pct'] = share.where((major['party'] == 'republican') & has_votes, 0)

    # Several rows can exist per district; summing collapses them to one row each.
    by_district = pct.groupby(by=['year', 'state', 'district']).sum()
    by_district['rep_margin'] = by_district['rep_pct'] - by_district['dem_pct']
    return (by_district,)

In [None]:
from collections import defaultdict

# Per-year container, one entry for every House election year from 1976 to 2018
# (every two years). The 'results' value is the 1-tuple returned by get_pct_results,
# so the per-district table sits at index 0.
results = defaultdict(dict)
for year in range(1976, 2020, 2):
    results[year].update(results=get_pct_results(all_data, year))

In [None]:
def get_total_votes_by_party(data, party, year):
    """Sum 'candidatevotes' over the rows of ``data`` for one party in one election year.

    Args:
        data: DataFrame with 'year', 'party' and 'candidatevotes' columns.
        party: Party label to match exactly (e.g. 'democrat').
        year: Election year to match.

    Returns:
        The sum of 'candidatevotes' over the matching rows (0 if none match).
    """
    # Both masks must be built from `data` itself. Using a different frame for the party
    # mask makes pandas reindex the boolean key whenever `data` is a subset, which
    # produces the "Boolean Series key will be reindexed" warning.
    rows = data[(data['year'] == year) & (data['party'] == party)]
    return rows['candidatevotes'].sum()

def get_total_votes(data, year):
    """Return the sum of 'candidatevotes' over every row of ``data`` for ``year``.

    All parties are included; only the year is filtered.
    """
    return data.loc[data['year'] == year, 'candidatevotes'].sum()

In [None]:
# Sanity check: total candidate votes recorded for 2018 across all parties.
get_total_votes(all_data, 2018)

113688301

In [None]:
# Attach nationwide vote totals (Democratic, Republican, and all candidates) to each
# election year's entry in `results`.
for year in range(1976, 2020, 2):
    results[year].update(
        total_dem=get_total_votes_by_party(all_data, 'democrat', year),
        total_rep=get_total_votes_by_party(all_data, 'republican', year),
        total=get_total_votes(all_data, year),
    )

In [None]:
import sys
# Install plotly with the interpreter that is running this kernel (sys.executable), so the
# package lands in the kernel's own environment. The version is not pinned.
!{sys.executable} -m pip install plotly

from plotly.offline import init_notebook_mode, iplot
# Star import: the plotting cells below rely on it for the `Scatter` and `Data` names.
from plotly.graph_objs import *

# Set up plotly's notebook rendering before any iplot() call.
init_notebook_mode(connected=True)

In [None]:
from collections import Counter

# National Republican margin for 2018 in percentage points. It is computed in this cell
# because the cell that otherwise defines it appears later in the notebook, so this loop
# would raise NameError on a fresh Restart & Run All.
national_rep_margin_2018 = 100 * (results[2018]['total_rep'] - results[2018]['total_dem']) / results[2018]['total']

# Histogram: number of districts per whole-point bin of (district margin - national margin).
# NOTE(review): int() truncates toward zero, so bin 0 covers the open interval (-1, 1)
# and is twice as wide as every other bin - confirm this is intended.
by_pct = Counter()
for row in results[2018]['results'][0].iterrows():
    by_pct[int(row[1]['rep_margin'] - national_rep_margin_2018)] += 1

In [None]:
from collections import OrderedDict

def smooth(dist, width = 4):
    """Smooth an integer-keyed histogram with an unweighted moving window.

    For every key present in ``dist`` the result is the sum of the values at
    ``key - (width - 1)`` through ``key + (width - 1)`` (missing keys count as 0),
    divided by ``2 * width + 1``.

    Note: the output is NOT normalized - its values do not sum to the input total -
    so callers that need a particular total must rescale afterwards.

    Args:
        dist: Mapping from integer bin to count; must support ``dist[key]``,
            ``dist.get(key, default)`` and ``dist.keys()``.
        width: Window parameter; neighbours at offsets 1 .. width - 1 on each side
            are included.

    Returns:
        A dict with the same keys as ``dist`` and the smoothed values.
    """
    def get_pos(key):
        total = dist[key]
        # Every neighbour carries weight 1; the previous division by ``1 ** i`` was a
        # no-op and has been dropped without changing the result.
        for i in range(1, width):
            total += dist.get(key - i, 0)
            total += dist.get(key + i, 0)
        return total / (2 * width + 1)
    return {key: get_pos(key) for key in dist.keys()}


def normalize_weights(weights, normalized_sum = 1):
    """Rescale the values of ``weights`` so that they add up to ``normalized_sum``.

    Keys and their order are preserved; each value becomes
    ``normalized_sum * value / sum(all values)``.
    """
    total = sum(weights.values())
    scaled = {}
    for key, val in weights.items():
        scaled[key] = normalized_sum * val / total
    return scaled

In [None]:
# Smooth the histogram with window parameter 5, then rescale so the values sum to 435
# (presumably the number of U.S. House seats - confirm).
smoothed_by_pct = normalize_weights(smooth(by_pct, 5), 435)
# Order the bins by margin so they can be plotted and accumulated left to right.
sorted_by_pct = OrderedDict(sorted(smoothed_by_pct.items()))

5


In [None]:
# Plot the smoothed distribution: x is the margin bin, y is the smoothed district count.
trace0 = Scatter(x = list(sorted_by_pct.keys()), y = list(sorted_by_pct.values()))
# plotly.graph_objs.Data is deprecated; a plain list of traces is the documented
# replacement and is accepted by iplot().
data = [trace0]
iplot(data)

In [None]:
# Running total of the smoothed counts, accumulated in the iteration order of
# sorted_by_pct: cdf[key] is the sum of the values for all bins up to and including key.
cdf = {}
cur_total = 0
for key, val in sorted_by_pct.items():
    cur_total += val
    cdf[key] = cur_total

In [None]:
# Plot the cumulative curve: x is the margin bin, y is the running total up to that bin.
trace0 = Scatter(x = list(cdf.keys()), y = list(cdf.values()))
# plotly.graph_objs.Data is deprecated; a plain list of traces is the documented
# replacement and is accepted by iplot().
data = [trace0]
iplot(data)

In [None]:
# National Republican margin for 2018 in percentage points:
# 100 * (Republican votes - Democratic votes) / all candidate votes.
# NOTE(review): an earlier cell (the first by_pct histogram) already reads this name,
# so that cell only works if this one has been run first.
national_rep_margin_2018 = 100 * (results[2018]['total_rep'] - results[2018]['total_dem']) / results[2018]['total']

In [None]:
# Display the national margin; a negative value means Democrats received more votes.
national_rep_margin_2018

-7.183550926669227

In [None]:
# States placed in the "rep" group for the split analysis below; every state not listed
# here ends up in the complementary "dem" group.
# NOTE(review): the selection criterion is not stated in the notebook - presumably states
# where Republicans controlled redistricting; confirm and cite the source.
rep_states = ["Idaho",
"Nevada",
"Utah",
"Arizona",
"Alaska",
"Wyoming",
"New Mexico",
"Texas",
"Oklahoma",
"Nebraska",
"Kansas",
"South Dakota",
"Iowa",
"Wisconsin",
"Michigan",
"Ohio",
"Tennessee",
"Alabama",
"Georgia",
"South Carolina",
"Florida",
"Pennsylvania",
"Maine",
"New Jersey",
"Virginia",
"North Dakota",
"Indiana",
"Mississippi",
"Louisiana",
"North Carolina"
]

# Rows of the full data set (all years) whose state is in rep_states.
rep_data = all_data[all_data['state'].isin(rep_states)]

In [None]:
from collections import defaultdict
from collections import Counter

# Same per-year summary as `results`, restricted to the rows in rep_data:
# the per-district table (wrapped in a 1-tuple) plus the three vote totals.
rep_state_results = defaultdict(dict)
for year in range(1976, 2020, 2):
    rep_state_results[year].update(
        results=get_pct_results(rep_data, year),
        total_dem=get_total_votes_by_party(rep_data, 'democrat', year),
        total_rep=get_total_votes_by_party(rep_data, 'republican', year),
        total=get_total_votes(rep_data, year),
    )


Boolean Series key will be reindexed to match DataFrame index.



In [None]:
import math
# Histogram of district Republican margins (whole-point bins) for the rep_states group.
by_pct = Counter()
years = [2018]
national_rep_margin_2018 = 100 * (results[2018]['total_rep'] - results[2018]['total_dem']) / results[2018]['total']

for year in years:
    # The national margin depends only on the year, so it is computed once per year
    # instead of once per district row.
    # NOTE(review): it is not applied to the bins below, so this histogram uses raw
    # district margins - confirm that is intended.
    national_rep_margin = 100 * (results[year]['total_rep'] - results[year]['total_dem']) / results[year]['total']
    for row in rep_state_results[year]['results'][0].iterrows():
        by_pct[int(row[1]['rep_margin'])] += 1

# Smooth, then rescale so the smoothed values sum to the number of districts counted.
smoothed_by_pct = normalize_weights(smooth(by_pct, 3), sum(by_pct.values()))
sorted_by_pct = OrderedDict(sorted(smoothed_by_pct.items()))
# Smoothed mass beyond +/-60 points is reported separately and left out of the plot.
dem_blowout = sum([val for key, val in sorted_by_pct.items() if key < -60])
rep_blowout = sum([val for key, val in sorted_by_pct.items() if key > 60])
print("Dem Blowout: {}\tRep Blowout: {}".format(dem_blowout, rep_blowout))
sorted_by_pct = {key: val for key, val in sorted_by_pct.items() if abs(key) <= 60}

trace0 = Scatter(x = list(sorted_by_pct.keys()), y = list(sorted_by_pct.values()))
# plotly.graph_objs.Data is deprecated; a plain list of traces is the documented
# replacement and is accepted by iplot().
data = [trace0]
iplot(data)

3
Dem Blowout: 10.75445173383317	Rep Blowout: 1.6729147141518266


In [None]:
# Running total of the smoothed counts, accumulated in the iteration order of
# sorted_by_pct: cdf[key] is the sum of the values for all bins up to and including key.
cdf = {}
cur_total = 0
for key, val in sorted_by_pct.items():
    cur_total += val
    cdf[key] = cur_total

trace0 = Scatter(x = list(cdf.keys()), y = list(cdf.values()))
# plotly.graph_objs.Data is deprecated; a plain list of traces is the documented
# replacement and is accepted by iplot().
data = [trace0]
iplot(data)

In [None]:
# Complement of rep_data: rows whose state is NOT in rep_states.
dem_data = all_data[~all_data['state'].isin(rep_states)]
from collections import defaultdict
from collections import Counter

# Same per-year summary as `results`, restricted to the rows in dem_data:
# the per-district table (wrapped in a 1-tuple) plus the three vote totals.
dem_state_results = defaultdict(dict)
for year in range(1976, 2020, 2):
    dem_state_results[year]['results'] = get_pct_results(dem_data, year)
    dem_state_results[year]['total_dem'] = get_total_votes_by_party(dem_data, 'democrat', year)
    dem_state_results[year]['total_rep'] = get_total_votes_by_party(dem_data, 'republican', year)
    dem_state_results[year]['total'] = get_total_votes(dem_data, year)


Boolean Series key will be reindexed to match DataFrame index.



In [None]:
# Histogram of district Republican margins (whole-point bins) for the non-rep_states
# group, with each margin shifted by how much more Democratic that group voted than
# the nation as a whole.
by_pct = Counter()
year = 2014
# Both margins depend only on the year, so they are computed once instead of once per
# district row.
national_dem_margin = 100 * (results[year]['total_dem'] - results[year]['total_rep']) / results[year]['total']
dem_state_dem_margin = 100 * (dem_state_results[year]['total_dem'] - dem_state_results[year]['total_rep']) / dem_state_results[year]['total']
for row in dem_state_results[year]['results'][0].iterrows():
    by_pct[int(row[1]['rep_margin'] + (dem_state_dem_margin - national_dem_margin))] += 1

# Smooth, then rescale so the smoothed values sum to the number of districts counted.
smoothed_by_pct = normalize_weights(smooth(by_pct, 10), sum(by_pct.values()))
sorted_by_pct = OrderedDict(sorted(smoothed_by_pct.items()))

trace0 = Scatter(x = list(sorted_by_pct.keys()), y = list(sorted_by_pct.values()))
# plotly.graph_objs.Data is deprecated; a plain list of traces is the documented
# replacement and is accepted by iplot().
data = [trace0]
iplot(data)

10


In [None]:
# Running total of the smoothed counts, accumulated in the iteration order of
# sorted_by_pct: cdf[key] is the sum of the values for all bins up to and including key.
cdf = {}
cur_total = 0
for key, val in sorted_by_pct.items():
    cur_total += val
    cdf[key] = cur_total

trace1 = Scatter(x = list(cdf.keys()), y = list(cdf.values()))
# plotly.graph_objs.Data is deprecated; a plain list of traces is the documented
# replacement and is accepted by iplot().
data = [trace1]
iplot(data)

In [None]:
# Show every row of the tables displayed from here on. The full option name is used
# because the abbreviated "max_rows" pattern matches more than one option key in recent
# pandas releases and raises OptionError. The earlier limit of 100 was immediately
# overridden by None, so only the effective setting is kept.
pd.set_option('display.max_rows', None)
# rep_states districts ordered from most Democratic to most Republican margin.
rep_state_results[2018]['results'][0].sort_values(by='rep_margin')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,dem_pct,rep_pct,rep_margin
year,state,district,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018,Florida,24,100.0,0.0,-100.0
2018,Florida,14,100.0,0.0,-100.0
2018,Florida,10,100.0,0.0,-100.0
2018,Georgia,5,100.0,0.0,-100.0
2018,Florida,21,100.0,0.0,-100.0
2018,Pennsylvania,18,100.0,0.0,-100.0
2018,Florida,20,99.918649,0.0,-99.918649
2018,Alabama,7,97.804539,0.0,-97.804539
2018,Wisconsin,2,97.422273,0.0,-97.422273
2018,Virginia,3,91.22413,0.0,-91.22413


In [None]:
# Districts outside rep_states, ordered from most Democratic to most Republican margin.
dem_state_results[2018]['results'][0].sort_values(by='rep_margin')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,dem_pct,rep_pct,rep_margin
year,state,district,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018,Washington,9,100.0,0.0,-100.0
2018,California,44,100.0,0.0,-100.0
2018,California,6,100.0,0.0,-100.0
2018,California,27,100.0,0.0,-100.0
2018,New York,5,99.398654,0.0,-99.398654
2018,Massachusetts,8,98.424653,0.0,-98.424653
2018,Massachusetts,7,98.252356,0.0,-98.252356
2018,Massachusetts,4,97.718472,0.0,-97.718472
2018,Massachusetts,1,97.644076,0.0,-97.644076
2018,New York,16,94.251074,0.0,-94.251074
