In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import sys,os,time,math,csv
import itertools
import collections

import numpy as np
import pandas as pd
import networkx as nx

import matplotlib
import matplotlib.pyplot as plt
plt.style.use("ggplot")

### State FIPS code lookup tables

In [2]:
# Two-way lookup between zero-padded two-digit state codes and state names,
# built from the first and third comma-separated fields of each non-blank line.
state_fips_to_name = {}
state_name_to_fips = {}
with open("data/state_fips.csv", "r") as fips_file:
    for raw_line in fips_file:
        record = raw_line.strip()
        if not record:
            continue  # ignore blank lines

        fields = record.split(",")
        name = fields[0]
        padded_code = "{:02d}".format(int(fields[2]))

        state_fips_to_name[padded_code] = name
        state_name_to_fips[name] = padded_code

### 2010 Population and Housing Units

In [3]:
# Nested tables: state name -> county name -> length-8 integer array with one
# value per census, 1940 through 2010 in ascending order. Only the last slot
# (2010) is filled in this cell; the other seven start at zero.
historical_population = collections.OrderedDict()
historical_housing_units = collections.OrderedDict()
# state name -> county name -> Geo_COUNTY code as read from the 2010 file.
state_county_to_geo_county = dict()

num_2010_counties = 0

with open("data/R11632411_SL050.txt", "r", encoding="latin-1") as f:
    reader = csv.DictReader(f, delimiter="\t")

    for row in reader:
        state_code = row["Geo_STATE"]

        # Rows whose state code is not in the FIPS lookup are skipped.
        if state_code not in state_fips_to_name:
            continue

        state_name = state_fips_to_name[state_code]
        county_name = row["Geo_NAME"]
        county_code = row["Geo_COUNTY"]

        population_in_housing_units = int(row["SF1_H0100001"])
        number_of_housing_units = int(row["SF1_H0010001"])

        # First county seen for this state: create its per-state containers.
        if state_name not in historical_population:
            historical_population[state_name] = collections.OrderedDict()
            historical_housing_units[state_name] = collections.OrderedDict()
            state_county_to_geo_county[state_name] = dict()

        # Seven zero placeholders (1940-2000) followed by the 2010 value.
        historical_population[state_name][county_name] = np.array(([0] * 7) + [population_in_housing_units])
        historical_housing_units[state_name][county_name] = np.array(([0] * 7) + [number_of_housing_units])
        state_county_to_geo_county[state_name][county_name] = county_code

        num_2010_counties += 1

print("Loaded data for %d counties from 2010 census" % (num_2010_counties))

Loaded data for 3109 counties from 2010 census


### 2000 Population and Housing Units

In [4]:
# County names that are spelled differently in the 2000 file than in the
# 2010 file (2000 spelling -> 2010 spelling).
mapping_2000_2010 = {
    "La Salle County" : "LaSalle County",
    "Dona Ana County" : "Doña Ana County"
}

num_2000_counties = 0

with open("data/R11631788_SL050.txt", "r", encoding="latin-1") as f:
    reader = csv.DictReader(f, delimiter="\t")

    for row in reader:
        state_code = row["Geo_STATE"]

        # Rows whose state code is not in the FIPS lookup are skipped.
        if state_code not in state_fips_to_name:
            continue

        state_name = state_fips_to_name[state_code]
        county_name = row["Geo_NAME"]
        county_code = row["Geo_COUNTY"]

        population_in_housing_units = int(row["SF1_H010001"])
        number_of_housing_units = int(row["SF1_H001001"])

        # Every state in the 2000 file must already have been created while
        # loading the 2010 file.
        assert state_name in historical_population

        if county_name not in historical_housing_units[state_name]:
            if county_name in mapping_2000_2010:
                # Same county under its 2010 spelling.
                county_name = mapping_2000_2010[county_name]
            else:
                print(state_name, county_name, "not found in the 2010 data, adding.")

                # dtype=int keeps these arrays consistent with the integer
                # arrays created for the 2010 counties; the default float
                # dtype would be written out as "0.0" in the exported CSVs.
                historical_population[state_name][county_name] = np.zeros(8, dtype=int)
                historical_housing_units[state_name][county_name] = np.zeros(8, dtype=int)
                state_county_to_geo_county[state_name][county_name] = county_code

        # The county code must agree with the one recorded for this county
        # (trivially true for a county that was just added above).
        assert state_county_to_geo_county[state_name][county_name] == county_code

        # Slot -2 is the 2000 census; slot -1 holds the 2010 value.
        historical_population[state_name][county_name][-2] = population_in_housing_units
        historical_housing_units[state_name][county_name][-2] = number_of_housing_units

        num_2000_counties += 1

print("Loaded data for %d counties from 2000 census" % (num_2000_counties))

Virginia Clifton Forge city not found in the 2010 data, adding.
Loaded data for 3109 counties from 2000 census


### 1940-1990 Population and Housing Units

In [5]:
# Read the 1940-1990 table into `data`: one list of comma-separated fields per
# row. The first two lines of the file are skipped, as are blank lines and
# rows in which every field is empty.
# NOTE(review): fields are split on every comma, so a quoted field containing
# a comma would be broken apart - confirm the file has none.
data = []
with open("data/Population and Housing Units: 1940 to 1990.csv", "r") as f:
    f.readline()
    f.readline()
    for line in f:
        parts = line.strip().split(",")
        # A blank line splits to [""], so this test also drops blank lines.
        if any(part != "" for part in parts):
            data.append(parts)

In [6]:
# County names spelled differently in the 1940-1990 table than in the tables
# built from the 2010/2000 files (historical spelling -> current spelling).
historical_county_name_mapping = {
    "Dade County" : "Miami-Dade County",
    "De Kalb County" : "DeKalb County",
    "Lagrange County" : "LaGrange County",
    "La Salle County" : "LaSalle County",
    "Dona Ana County" : "Doña Ana County",
    "La Porte County" : "LaPorte County",
    "O’Brien County" : "O'Brien County",
    "Prince George’s County" : "Prince George's County",
    "Queen Anne’s County" : "Queen Anne's County",
    "St. Mary’s County" : "St. Mary's County",
    "DeBaca County" : "De Baca County",
    "Mc Kean County" : "McKean County",
}

# Areas listed in the 1940-1990 table that have no entry of their own in the
# newer data; their counts are added to the county named here.
historical_county_merges = {
    "Yellowstone National Park" : "Park County",  # not sure what the deal is, adding it to Park County
    "South Boston city" : "Halifax County",       # rejoined Halifax County in 1995
}

# The loop below accumulates with `+=` so that merged areas add to their host
# county. Clear the six 1940-1990 slots first so that re-running this cell does
# not double-count; on a fresh run these slots are already zero.
for table in (historical_population, historical_housing_units):
    for county_dict in table.values():
        for series in county_dict.values():
            series[0:6] = 0

num_historical_counties = 0

current_state = ""
for parts in data:
    # A row whose fields after the first are all empty is a state heading.
    if all(value == "" for value in parts[1:]):
        current_state = parts[0]
        assert current_state in historical_population
        assert current_state in historical_housing_units
        continue

    county_name = parts[0]

    # Six population values followed by six housing-unit values.
    population = np.array(list(map(int, parts[1:7])))
    housing_units = np.array(list(map(int, parts[7:13])))

    # Handle names that are in the 1940-1990 historical data but not in the
    # tables built from the newer census files.
    if county_name not in historical_population[current_state]:
        if county_name in historical_county_name_mapping:
            county_name = historical_county_name_mapping[county_name]
        elif county_name in historical_county_merges:
            county_name = historical_county_merges[county_name]
        else:
            print(county_name)
            raise ValueError("Unrecognized county")

    # Values are reversed before being added to slots 0-5 (1940..1990), so the
    # source columns are presumably ordered newest-first - TODO confirm.
    historical_population[current_state][county_name][0:6] += population[::-1]
    historical_housing_units[current_state][county_name][0:6] += housing_units[::-1]

    num_historical_counties += 1

print("Loaded data for %d historical counties from 1940-1990 censuses" % (num_historical_counties))

Loaded data for 3111 historical counties from 1940-1990 censuses


### Save historical county data

In [7]:
# Write one CSV per table. Each row holds the state/county identifiers followed
# by the county's eight per-census values, in the order given by census_years.
# NOTE(review): fields are joined with plain commas and not quoted, so a
# county name containing a comma would produce a malformed row.
census_years = [1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010]

# (output path, column-name prefix, state -> county -> values table)
output_tables = [
    ("data/historical_housing_units.csv", "hu", historical_housing_units),
    ("data/historical_population.csv", "pop", historical_population),
]

for output_path, column_prefix, table in output_tables:
    value_columns = ",".join("%s_%d" % (column_prefix, year) for year in census_years)

    # `with` guarantees the file is flushed and closed even if a lookup fails.
    with open(output_path, "w") as f:
        f.write("State_name,County_name,Geo_STATE,Geo_COUNTY,Geo_FIPS,%s\n" % (value_columns))

        for state_name, county_dict in table.items():
            state_code = state_name_to_fips[state_name]

            for county_name, data_list in county_dict.items():
                county_code = state_county_to_geo_county[state_name][county_name]
                # Full county FIPS is the state code followed by the county code.
                fips_code = state_code + county_code
                f.write("%s,%s,%s,%s,%s," % (state_name, county_name, state_code, county_code, fips_code))
                f.write("%s\n" % (",".join(map(str, data_list))))