# Redistricting Data

### CS109a Final Project - Group 65

In [1]:
import numpy as np
import pandas as pd
import requests
from IPython.core.display import HTML
from IPython.display import display

In [2]:
# Directory prefix used for every CSV this notebook reads or writes.
redist_data_path = "./"

# FIPS.csv supplies the `name`, `abbr` and `fips_code` columns read below.
states_df = pd.read_csv(redist_data_path + "FIPS.csv")

# Parallel lookup lists: index i in each list comes from the same FIPS.csv row.
# Names are lower-cased so that lookups by state name are case-insensitive.
state_names = [name.lower() for name in states_df["name"].values]
state_abbrs = list(states_df.abbr.values)
state_fips = list(states_df.fips_code.values)

In [3]:
def abbr_for_state(state_name, state_names, state_abbrs):
    """Return the abbreviation paired with ``state_name``.

    Parameters:
        state_name: state name to look up; it is lower-cased before matching.
        state_names: list of (already lower-cased) state names.
        state_abbrs: list of abbreviations, parallel to ``state_names``.

    Returns:
        The entry of ``state_abbrs`` at the position where the name was found,
        or ``None`` when the name is not present in ``state_names``.
    """
    wanted = state_name.lower()
    if wanted not in state_names:
        return None
    return state_abbrs[state_names.index(wanted)]

def dist_id_from_lewis(state_name, lewis_no, at_large="1"):
    """Build a ``"<abbr>_<number>"`` district id from a state name and district number.

    Parameters:
        state_name: state name, resolved to an abbreviation through the
            module-level ``state_names`` / ``state_abbrs`` lists.
        lewis_no: district number; a value equal to 0 is replaced by ``at_large``.
        at_large: text used as the district number when ``lewis_no`` is 0.

    Returns:
        The formatted id string. An unknown state name produces the literal
        prefix ``"None"``, because the abbreviation lookup returns ``None``.
    """
    abbr = abbr_for_state(state_name, state_names, state_abbrs)
    number = at_large if lewis_no == 0 else "{:d}".format(int(lewis_no))
    return "{}_{}".format(abbr, number)

def map_id_from_lewis(state_names, lewis_nos, at_large="1"):
    """Vectorised form of ``dist_id_from_lewis`` over paired sequences.

    Parameters:
        state_names: iterable of state names (this parameter shadows the
            module-level list of the same name inside the function).
        lewis_nos: iterable of district numbers, paired element-wise with
            ``state_names``; pairing stops at the shorter of the two.
        at_large: replacement district number for entries whose number is 0.
            It is forwarded to ``dist_id_from_lewis`` (previously a hard-coded
            ``"1"`` was passed, so this argument had no effect).

    Returns:
        A numpy array of district id strings.
    """
    return np.array([
        dist_id_from_lewis(name, number, at_large)
        for name, number in zip(state_names, lewis_nos)
    ])

No state has been redistricted into a single at-large district since the 2000 census, so we ignore at-large districts in order to further minimize errors in producing the data.

In [4]:
# inclusive bounds of the years kept for the analysis
years_min = 2000
years_max = 2017

# load the district history and put it in a stable, readable order
district_hist_df = pd.read_csv(redist_data_path + "e6311_post1948-shapeless.csv")
district_hist_df = district_hist_df.sort_values(
    by=['congress', 'year', 'state_name', 'lewis_dist']
)

# filter for date range (both ends included)
rows_in_years_range = district_hist_df.year.between(years_min, years_max)
district_hist_df = district_hist_df.loc[rows_in_years_range]

In [5]:
# ignore at-large districts (rows whose lewis_dist is 0)
district_hist_df = district_hist_df.loc[district_hist_df.lewis_dist.ne(0)]

# show the remaining (rows, columns)
district_hist_df.shape

(3424, 13)

In [6]:
# Decimal masks used to slice fields out of an integer geom_uid.
# geom_uid % LEWIS_MOD keeps the last three digits, which
# dist_id_from_geomuid uses as the district number.
LEWIS_MOD = 1000
# geom_uid / FIPS_MOD strips the last nine digits; the remaining leading
# digits are looked up as a state FIPS code.
FIPS_MOD = 1000000000 

def abbr_from_fips(fips):
    """Return the state abbreviation for a FIPS code.

    The lookup uses the module-level parallel lists ``state_fips`` and
    ``state_abbrs``. Returns ``None`` when ``fips`` is not in ``state_fips``.
    """
    if fips not in state_fips:
        return None
    return state_abbrs[state_fips.index(fips)]

def statename_from_fips(fips):
    """Return the (lower-cased) state name for a FIPS code.

    The lookup uses the module-level parallel lists ``state_fips`` and
    ``state_names``. Returns ``None`` when ``fips`` is not in ``state_fips``.
    """
    if fips not in state_fips:
        return None
    return state_names[state_fips.index(fips)]

def dist_id_from_geomuid(geomuid, at_large="1"):
    """Build a ``"<abbr>_<number>"`` district id from a numeric geom_uid.

    Parameters:
        geomuid: numeric id; its last three decimal digits are the district
            number and the digits above the ninth are the state FIPS code.
        at_large: value substituted for a district number of 0. Passing the
            string ``"0"`` disables the substitution and keeps 0.

    Returns:
        The formatted id string. The prefix is the literal ``"None"`` when the
        FIPS code is not known to ``abbr_from_fips``.
    """
    district_no = geomuid % LEWIS_MOD
    substitute_at_large = district_no == 0 and at_large != "0"
    if substitute_at_large:
        district_no = at_large
    state_abbr = abbr_from_fips(int(geomuid / FIPS_MOD))
    return "{}_{:d}".format(state_abbr, int(district_no))

def dists_from_geomuids(geomuids, at_large="1"):
    """Convert an iterable of geom_uids to a numpy array of district id strings.

    Each element is passed to ``dist_id_from_geomuid`` together with ``at_large``.
    """
    return np.array([dist_id_from_geomuid(uid, at_large) for uid in geomuids])

def get_all_year_dist(df):
    """Return a two-column table of every (year, district id) row in ``df``.

    Parameters:
        df: frame with ``year`` and ``geom_uid`` columns.

    Returns:
        A new DataFrame with columns ``year`` (copied from ``df``) and
        ``dist_id`` (derived from ``geom_uid``), on a fresh default index.
    """
    return pd.DataFrame({
        'year': df.year.values,
        'dist_id': dists_from_geomuids(df.geom_uid),
    })

# district_hist_df is the result of boolean filtering, so adding a column with
# item assignment can raise SettingWithCopyWarning; .assign returns a new frame
# with the extra column instead of writing into the filtered one.
district_hist_df = district_hist_df.assign(
    dist_id=dists_from_geomuids(district_hist_df.geom_uid)
)

# every (year, dist_id) pair present in the filtered history
all_possible = get_all_year_dist(district_hist_df)

In [7]:
# Table mapping each congress number to its start/end years and election year.
# Loaded through redist_data_path like the other input files, so relocating the
# data directory only requires changing that one variable.
congress_info = pd.read_csv(redist_data_path + "congress_numbers.csv")
congress_info.head()

Unnamed: 0,congress,start,end,congressional_election
0,102,1991,1993,1990
1,103,1993,1995,1992
2,104,1995,1997,1994
3,105,1997,1999,1996
4,106,1999,2001,1998


In [8]:
# Decimal masks for the congress-number fields of a geom_uid.
# start_congress reads the three digits at the 10^6..10^8 places
# (geom_uid % FIPS_MOD, with everything below CON_START_MOD removed).
CON_START_MOD = 1000000
# end_congress reads the three digits at the 10^3..10^5 places
# (geom_uid % CON_START_MOD, with everything below CON_END_MOD removed).
CON_END_MOD = 1000

def election_year(congress_no):
    """Return the ``congressional_election`` year recorded for a congress number.

    Looks the number up in the module-level ``congress_info`` table and returns
    the first match. Raises ``IndexError`` when no row matches.
    """
    matches = congress_info.loc[
        congress_info.congress == congress_no, 'congressional_election'
    ]
    return matches.values[0]

def congress_no(year):
    """Return the number of the congress in session during ``year``.

    A congress matches when ``start <= year < end`` in the module-level
    ``congress_info`` table; the first match is returned. Raises ``IndexError``
    when no row matches.
    """
    has_started = congress_info.start <= year
    not_yet_ended = congress_info.end > year
    return congress_info.loc[has_started & not_yet_ended, 'congress'].values[0]

def start_congress(geomuid):
    """Extract the starting congress number encoded in a geom_uid.

    The value is the three decimal digits just below the state FIPS prefix:
    the part of ``geomuid`` under ``FIPS_MOD``, floor-divided by
    ``CON_START_MOD``.

    Floor division keeps the result an integer for integer input. The earlier
    true division returned a float such as ``109.0``, which leaked into
    formatted output. The numeric value is unchanged.
    """
    return (geomuid % FIPS_MOD) // CON_START_MOD

def end_congress(geomuid):
    """Extract the ending congress number encoded in a geom_uid.

    The value is the three decimal digits just above the district number:
    the part of ``geomuid`` under ``CON_START_MOD``, floor-divided by
    ``CON_END_MOD``.

    Floor division keeps the result an integer for integer input. The earlier
    true division returned a float such as ``109.0``. The numeric value is
    unchanged.
    """
    return (geomuid % CON_START_MOD) // CON_END_MOD

def year_from_geomuid(geomuid):
    """Return the election year of the starting congress encoded in a geom_uid."""
    first_congress = start_congress(geomuid)
    return election_year(first_congress)

# sample geom_uid, labelled below as the Texas 1st District for 2004
texas_1_2004_ = 48109109001

# spot-check the congress/year helpers; the tuple is the cell's displayed value
(
    "congress {}: {}".format(114, election_year(114)),
    "{}: congress # {}".format(2008, congress_no(2008)),
    "{}: congress # {}".format(2007, congress_no(2007)),
    "{}[{:011d}]: {}-{} congress, election year {}".format(
        "Texas 1st District, 2004",
        texas_1_2004_,
        start_congress(texas_1_2004_),
        end_congress(texas_1_2004_),
        year_from_geomuid(texas_1_2004_),
    ),
)

('congress 114: 2014',
 '2008: congress # 110',
 '2007: congress # 110',
 'Texas 1st District, 2004[48109109001]: 109.0-109.0 congress, election year 2004')

In [9]:
def get_redistricting(df, geom_uid_col, ignore_years=()):
    """Build one row per unique geom_uid, each flagged as a redistricting event.

    Parameters:
        df: frame containing the geom_uid column.
        geom_uid_col: name of that column.
        ignore_years: iterable of election years to drop from the result.
            ``None`` or an empty iterable keeps every year. The default is an
            immutable tuple rather than a shared mutable list.

    Returns:
        A DataFrame with columns ``year`` (election year of the starting
        congress encoded in the geom_uid), ``dist_id`` and ``redistricted``
        (always ``True``). Rows follow ascending geom_uid order; filtering by
        ``ignore_years`` keeps the original positional index labels.
    """
    unique_districts = df[geom_uid_col].drop_duplicates().sort_values().values
    dist_id = dists_from_geomuids(unique_districts)
    # A comprehension is used instead of np.vectorize because np.vectorize
    # cannot infer its output type from zero-length input and raises there.
    years = np.array([year_from_geomuid(uid) for uid in unique_districts])
    redist_df = pd.DataFrame({
        'year': years,
        'dist_id': dist_id,
        'redistricted': np.ones(len(dist_id), dtype=bool),
    })
    years_to_skip = [] if ignore_years is None else list(ignore_years)
    if years_to_skip:
        redist_df = redist_df.loc[~redist_df.year.isin(years_to_skip)]
    return redist_df

# build the redistricting table, skipping the even election years 1990-1998
redist_ = get_redistricting(
    district_hist_df,
    'geom_uid',
    ignore_years=[1990, 1992, 1994, 1996, 1998],
)
print(redist_.shape)

(998, 3)


In [10]:
# spot-check the redistricting years recorded for three sample districts
display(
    redist_[redist_.dist_id == "TX_1"],
    redist_[redist_.dist_id == "NY_10"],
    redist_[redist_.dist_id == "NJ_12"],
)

Unnamed: 0,year,dist_id,redistricted
1117,2000,TX_1,True
1147,2002,TX_1,True
1179,2004,TX_1,True
1211,2006,TX_1,True
1243,2012,TX_1,True


Unnamed: 0,year,dist_id,redistricted
825,2002,NY_10,True
854,2012,NY_10,True


Unnamed: 0,year,dist_id,redistricted
762,2002,NJ_12,True
775,2012,NJ_12,True


In [11]:
# validate by getting redistricting counts by year
def states_redistricting(df):
    """Return the distinct (year, state) pairs present in a redistricting table.

    Parameters:
        df: frame with ``year`` and ``dist_id`` columns; the state is taken
            as the first two characters of ``dist_id``.

    Returns:
        A DataFrame with columns ``year`` and ``state`` and duplicate pairs
        removed. Index labels are row positions in ``df`` (first occurrence of
        each pair), not ``df``'s own index.

    The state prefix is taken with the pandas ``.str`` accessor rather than
    ``np.vectorize``, which raises on an empty frame because it cannot infer
    an output type from zero elements.
    """
    data = {
        'year': df.year.values,
        'state': df.dist_id.str[:2].values,
    }
    return pd.DataFrame(data).drop_duplicates()

df_states = states_redistricting(redist_)

# number of states per year, then the states listed for 1998/2000,
# then a preview of the district-level table
display(
    df_states.groupby(['year']).count(),
    df_states.loc[df_states.year.isin([1998, 2000])].sort_values(['year', 'state']),
    redist_.head(),
)

Unnamed: 0_level_0,state
year,Unnamed: 1_level_1
2000,2
2002,42
2004,4
2006,2
2012,43


Unnamed: 0,year,state
589,2000,NC
766,2000,TX


Unnamed: 0,year,dist_id,redistricted
7,2002,AL_1,True
8,2002,AL_2,True
9,2002,AL_3,True
10,2002,AL_4,True
11,2002,AL_5,True


In [12]:
def dists_for_state(state_abbr,dist_count):
    """List the district ids ``"<abbr>_1"`` through ``"<abbr>_<dist_count>"``.

    Returns an empty list when ``dist_count`` is less than 1.
    """
    ids = []
    for number in range(1, dist_count + 1):
        ids.append("{}_{}".format(state_abbr, number))
    return ids

In [13]:
# drop the year-2000 rows for Texas districts: keep a row unless it is
# both a Texas district and dated 2000
texas_dists = dists_for_state("TX", 36)
redist_ = redist_.loc[
    ~redist_['dist_id'].isin(texas_dists) | (redist_.year != 2000)
]
print(redist_.shape)

# df_states has not been recomputed yet, so this still shows the pre-filter view
display(df_states.loc[df_states.year.isin([2000])].sort_values(['year', 'state']))

# recompute the per-year state counts from the filtered table
df_states = states_redistricting(redist_)
display(df_states.groupby(['year']).count())

(968, 3)


Unnamed: 0,year,state
589,2000,NC
766,2000,TX


Unnamed: 0_level_0,state
year,Unnamed: 1_level_1
2000,1
2002,42
2004,4
2006,2
2012,43


In [14]:
def latest_district_count(df, state_abbr):
    """Count a state's rows in its most recent year within ``df``.

    Parameters:
        df: frame with ``year`` and ``dist_id`` columns; the state is taken
            as the first two characters of ``dist_id``.
        state_abbr: two-letter state prefix to match.

    Returns:
        The number of rows for ``state_abbr`` whose year equals the largest
        year recorded for that state, or 0 when the state has no rows.

    The state prefix is computed here. The previous version called
    ``get_states_from_id``, which exists only as a local variable inside
    ``states_redistricting``, so every call raised ``NameError``.
    """
    states = df.dist_id.str[:2]
    state_rows = df[states == state_abbr]
    if state_rows.empty:
        # max() of an empty selection is undefined; report no districts instead
        return 0
    latest_year = state_rows.year.max()
    return int((state_rows.year == latest_year).sum())

def generate_redist_rows(state, year, dist_count, is_redist=True):
    """Create one redistricting row per district of a state for a given year.

    Parameters:
        state: state abbreviation used as the district id prefix.
        year: year written to every row.
        dist_count: number of districts; ids run from 1 to ``dist_count``.
        is_redist: flag written to the ``redistricted`` column of every row.

    Returns:
        A DataFrame with columns ``year``, ``dist_id`` and ``redistricted``.
    """
    dists = ["{}_{}".format(state, number) for number in range(1, dist_count + 1)]
    return pd.DataFrame({
        'year': np.full(dist_count, year, dtype=int),
        'dist_id': dists,
        'redistricted': np.full(dist_count, is_redist, dtype=bool),
    })

def generate_redist_multi(state_tuples):
    """Stack the rows generated for several ``(state, year, dist_count)`` tuples.

    Parameters:
        state_tuples: iterable of indexable items whose first three elements
            are passed, in order, to ``generate_redist_rows``.

    Returns:
        A single DataFrame with the per-state frames stacked in input order
        (each keeps its own 0-based index labels), or ``None`` when
        ``state_tuples`` is empty.

    Uses ``pd.concat`` because ``DataFrame.append`` was removed in pandas 2.0.
    Concatenating once also avoids re-copying the accumulated rows on every
    iteration.
    """
    frames = [generate_redist_rows(st[0], st[1], st[2]) for st in state_tuples]
    if not frames:
        return None
    return pd.concat(frames)

# Hand-entered redistricting events as (state, year, number of districts);
# they are appended to the rows derived from the source data.
additional_redistricting = [("NC", 2016, 13),
                            ("PA", 2018, 18),
                            ("VA", 2016, 11)]

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
redist_df = pd.concat(
    [redist_, generate_redist_multi(additional_redistricting)],
    sort=False,
)
redist_df.sort_values(['year', 'dist_id']).to_csv(redist_data_path + "redist_2000-2018.csv", index=False)

## Merging the redistricting variable with the rest of the data

Use the following function to merge (left-join) the `redistricted` variable onto any data frame that has `year` and `dist_id` fields; rows without a matching redistricting record get `redistricted = False`. This function doesn't depend on anything except `pandas`.

In [15]:
def merge_redist(df, redist_df, year_col='year', dist_id_col='dist_id'):
    """Left-join the ``redistricted`` flag onto ``df``.

    Parameters:
        df: frame to annotate; every one of its rows is kept.
        redist_df: redistricting table providing a ``redistricted`` column.
        year_col: name of the year key column, present in both frames.
        dist_id_col: name of the district id key column, present in both frames.

    Returns:
        A new DataFrame with the columns of both inputs. ``redistricted`` is a
        boolean column that is ``False`` wherever ``redist_df`` had no match.
    """
    merged_df = df.merge(redist_df, on=[year_col, dist_id_col], how='left')
    # Assign the filled column back explicitly. An in-place fillna through
    # attribute access acts on a temporary object under pandas copy-on-write
    # and would leave the NaNs in merged_df.
    merged_df['redistricted'] = merged_df['redistricted'].fillna(False).astype(bool)
    return merged_df

# this example only goes to 2014
example_ = merge_redist(df=all_possible, redist_df=redist_df)

# show the last few merged rows
example_.tail()

Unnamed: 0,year,dist_id,redistricted
3419,2014.0,WI_4,False
3420,2014.0,WI_5,False
3421,2014.0,WI_6,False
3422,2014.0,WI_7,False
3423,2014.0,WI_8,False


In [16]:
import datetime
# timestamp of last run in UTC
# (a bare datetime.now() returns naive local time, so the timezone is passed explicitly)
datetime.datetime.now(datetime.timezone.utc)

datetime.datetime(2018, 12, 10, 1, 19, 47, 153410)