In [2]:
import ipums_etl

In [3]:
# Enable IPython's autoreload extension in mode 2 so imported modules
# (here, ipums_etl) are re-imported before each cell executes; edits to the
# module are then picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# 1. ETL Data Pipeline
Here we extract the data from various CSVs, transform it using DuckDB and load the result into a postgres database. We do this in several steps:
- 1.1 Extract ipums demographic data and the census place project geographic data into a duckdb database.
- 1.2 Transform the data by merging the demographic and geographic data into a single table called census, then aggregating it to the census place industry level (census_place_industry_count table).
- 1.3 Load the aggregated data into a postgres database, together with auxiliary data (industry codes, census places)
- 1.4 Build the relations between tables in the database

In [4]:
# Step 1.1: extract the demographic (dem_<year>) and geographic (geo_<year>)
# CSV data into the working database. Per the SQL logged below, each run drops
# and recreates these tables and removes rows with a duplicated histid, so the
# cell can be re-run to rebuild them from scratch.
ipums_etl.extract_data_to_duckdb()

2024-05-15 09:48:12,688 INFO sqlalchemy.engine.Engine select current_schema()
2024-05-15 09:48:12,689 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-05-15 09:48:12,690 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-05-15 09:48:12,694 INFO sqlalchemy.engine.Engine -- Drop tables if they exist for idempotency
DROP TABLE IF EXISTS "dem_1850";
DROP TABLE IF EXISTS "geo_1850";

-- Create demographic data table
CREATE TABLE "dem_1850"
    (year INTEGER,
     occ1950 INTEGER,
     ind1950 INTEGER,
     histid VARCHAR(36),
     hik VARCHAR(21));

COPY "dem_1850" FROM '/Users/andrea/Desktop/PhD/Data/API/clusterdb/data/ipums/census/usa_1850.csv';

CREATE INDEX "index_dem_1850_histid" ON "dem_1850" (histid);

WITH duplicates AS (
    SELECT histid, ROW_NUMBER() OVER(PARTITION BY histid) AS rownum
    FROM "dem_1850"
    )
DELETE FROM "dem_1850" USING duplicates
WHERE "dem_1850".histid = duplicates.histid AND duplicates.rownum > 1;

-- Create geographic data table
CREATE TABLE "geo_1850"
   

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

2024-05-15 09:48:29,105 INFO sqlalchemy.engine.Engine COMMIT


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

2024-05-15 09:49:00,380 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-05-15 09:49:00,382 INFO sqlalchemy.engine.Engine -- Drop tables if they exist for idempotency
DROP TABLE IF EXISTS "dem_1860";
DROP TABLE IF EXISTS "geo_1860";

-- Create demographic data table
CREATE TABLE "dem_1860"
    (year INTEGER,
     occ1950 INTEGER,
     ind1950 INTEGER,
     histid VARCHAR(36),
     hik VARCHAR(21));

COPY "dem_1860" FROM '/Users/andrea/Desktop/PhD/Data/API/clusterdb/data/ipums/census/usa_1860.csv';

CREATE INDEX "index_dem_1860_histid" ON "dem_1860" (histid);

WITH duplicates AS (
    SELECT histid, ROW_NUMBER() OVER(PARTITION BY histid) AS rownum
    FROM "dem_1860"
    )
DELETE FROM "dem_1860" USING duplicates
WHERE "dem_1860".histid = duplicates.histid AND duplicates.rownum > 1;

-- Create geographic data table
CREATE TABLE "geo_1860"
    (potential_match VARCHAR(50),
     match_type VARCHAR(50),
     lat FLOAT,
     lon FLOAT,
     state_fips_geomatch VARCHAR(2),
     county_fips

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

2024-05-15 09:49:21,211 INFO sqlalchemy.engine.Engine COMMIT


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [6]:
# Step 1.2: merge dem_<year> with geo_<year> on histid into census_<year>, then
# count workers per (census_place_id, ind1950) into
# census_place_industry_count_<year>. The logged SQL reads from the dem_/geo_
# tables, so the extract step above must have been run first.
# NOTE(review): only the 1850 statements are visible in the saved output;
# presumably the same runs for every census year - confirm.
ipums_etl.transform_data()

2024-05-15 10:02:04,830 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-05-15 10:02:04,832 INFO sqlalchemy.engine.Engine -- Drop tables if they exist for idempotency
DROP TABLE IF EXISTS "census_1850";
DROP TABLE IF EXISTS "census_place_industry_count_1850";

-- Create census table
CREATE TABLE "census_1850"
    (histid VARCHAR(36),
     hik VARCHAR(21),
     ind1950 INTEGER,
     occ1950 INTEGER,
     census_place_id INTEGER);

-- Merge the data from the demographic and geographic tables
INSERT INTO "census_1850"
SELECT "dem_1850".histid, NULLIF(hik, '                     '), ind1950, occ1950, CASE WHEN census_place_id > 69491 THEN NULL ELSE census_place_id END AS census_place_id
FROM "dem_1850" LEFT JOIN "geo_1850"
ON "dem_1850".histid = "geo_1850".histid;

-- Create census place industry count table
CREATE TABLE "census_place_industry_count_1850" AS
SELECT census_place_id, ind1950, COUNT(*) AS worker_count
FROM "census_1850"
WHERE census_place_id IS NOT NULL
GROUP BY census_plac

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

2024-05-15 10:02:09,205 INFO sqlalchemy.engine.Engine COMMIT


In [5]:
# Step 1.3: load the results into Postgres. Per the log below, this drops and
# recreates the auxiliary tables (census_place, industry1950_code) and inserts
# rows into census_place_industry_count_1850 / _1860.
ipums_etl.load_data_to_postgres()

2024-05-15 10:05:40,526 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-05-15 10:05:40,529 INFO sqlalchemy.engine.Engine DROP TABLE IF EXISTS "census_place";
DROP TABLE IF EXISTS "industry1950_code";

-- Create table for census place data
CREATE TABLE "census_place" (
    lat FLOAT,
    lon FLOAT,
    consistent_place_3 INTEGER,
    consistent_place_name_3 VARCHAR(50),
    consistent_place_5 INTEGER,
    consistent_place_name_5 VARCHAR(50),
    consistent_place_10 INTEGER,
    consistent_place_name_10 VARCHAR(50),
    consistent_place_50 INTEGER,
    consistent_place_name_50 VARCHAR(50),
    consistent_place_100 INTEGER,
    consistent_place_name_100 VARCHAR(50),
    consistent_place_200 INTEGER,
    consistent_place_name_200 VARCHAR(50),
    consistent_place_300 INTEGER,
    consistent_place_name_300 VARCHAR(50),
    consistent_place_500 INTEGER,
    consistent_place_name_500 VARCHAR(50),
    fracpop FLOAT,
    potential_match VARCHAR(50),
    id INTEGER
);

COPY "census_place" FR

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

2024-05-15 10:05:42,999 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-05-15 10:05:43,000 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s, %(param_2)s, %(param_3)s, %(param_4)s, %(param_5)s]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname_1)s
2024-05-15 10:05:43,000 INFO sqlalchemy.engine.Engine [cached since 36.15s ago] {'table_name': 'census_place_industry_count_1850', 'param_1': 'r', 'param_2': 'p', 'param_3': 'f', 'param_4': 'v', 'param_5': 'm', 'nspname_1': 'pg_catalog'}
2024-05-15 10:05:43,042 INFO sqlalchemy.engine.Engine INSERT INTO census_place_industry_count_1850 (census_place_id, ind1950, worker_count) VALUES (%(census_place_id__0)s, %(ind1950__0)s, %(worker_count__0

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

2024-05-15 10:05:45,810 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-05-15 10:05:45,810 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s, %(param_2)s, %(param_3)s, %(param_4)s, %(param_5)s]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname_1)s
2024-05-15 10:05:45,810 INFO sqlalchemy.engine.Engine [cached since 38.96s ago] {'table_name': 'census_place_industry_count_1860', 'param_1': 'r', 'param_2': 'p', 'param_3': 'f', 'param_4': 'v', 'param_5': 'm', 'nspname_1': 'pg_catalog'}
2024-05-15 10:05:45,824 INFO sqlalchemy.engine.Engine INSERT INTO census_place_industry_count_1860 (census_place_id, ind1950, worker_count) VALUES (%(census_place_id__0)s, %(ind1950__0)s, %(worker_count__0

## 1.3 Load the aggregated data into Postgres

In [None]:
# NOTE(review): this cell has no execution count or saved output, so it was not
# run in this version of the notebook. Presumably it copies the worker-count
# tables to Postgres; confirm it is still needed, since load_data_to_postgres()
# already logs inserts into census_place_industry_count_<year>.
ipums_etl.copy_worker_count_to_postgres()

# 2. Create clusters
Here we build clusters, the central unit of analysis of this project. Clusters are built as follows:
- 2.1 Transform the census data into a population raster covering the entire USA
- 2.2 Smooth the data with a convolutional kernel
- 2.3 Cluster nearby high population tiles together using DBSCAN
- 2.4 Create a cluster-industry table, with the number of people in each cluster working in each industry

# 3. Create time consistent clusters
Here we build time consistent clusters, i.e., clusters having a consistent id across time. We do this by:
- 3.1 Create a table with all clusters in all years
- 3.2 Intersection matching between clusters in different years
- 3.3 Create a crosswalk table between cluster ids and time consistent cluster ids
- 3.4 Create a table with time consistent cluster ids
- 3.5 Create a table with time consistent cluster ids and industry data