In [1]:
import numpy as np, matplotlib.pyplot as plt, pandas as pd
pd.set_option('display.max_rows', 10)
!date

%load_ext autoreload
%autoreload 2

Mon Jun 10 12:16:48 PDT 2019


# Filter 1940 DAS microdata down to just WA state, save as a csv

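Now with correct headings, for a given epsilon/replicate pair, and with output going to a central place: `/snfs1/Project/Models/us_census/{state_fips}_{epsilon}-RUN{replicate}.csv` (WA is selected with state FIPS code 53, so the files are named `53_{epsilon}-RUN{replicate}.csv`).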

In [2]:
# The column names are hard to find; they are taken from the DAS 1940 writer:
# https://github.com/uscensusbureau/census2020-das-e2e/blob/3f2c9cf9cb3c33a4e2067bd784ff381792f7ffc0/programs/writer/e2e_1940_writer.py#L82-L84

# One name per field, in file order (23 fields in total).
col_names = [
    'SCHEMA_TYPE_CODE', 'SCHEMA_BUILD_ID',
    'STATE', 'COUNTY', 'ENUMDIST',
    'EUID', 'EPNUM', 'RTYPE',
    'QREL', 'QSEX', 'QAGE',
    'CENHISP', 'CENRACE', 'QSPANX',
    'QRACE1', 'QRACE2', 'QRACE3', 'QRACE4',
    'QRACE5', 'QRACE6', 'QRACE7', 'QRACE8',
    'CIT',
]

In [3]:
def load_one_state(epsilon='0.25', replicate=1, state_fips=53):
    """Load the person-level DAS rows for a single state.

    Streams ``MDF_PER.txt`` for the requested epsilon/replicate run in
    chunks of 1,000,000 rows and keeps only the rows whose ``STATE``
    value equals ``state_fips``.

    Parameters
    ----------
    epsilon : str
        Epsilon label exactly as it appears in the run directory name
        (e.g. ``'0.25'``).
    replicate : int or str
        Run number exactly as it appears in the run directory name.
    state_fips : int
        Value matched against the ``STATE`` column (default 53).

    Returns
    -------
    pandas.DataFrame
        The matching rows, with the ``col_names`` columns and the row
        index assigned by ``pd.read_csv``. Empty (but with the
        ``col_names`` columns) if no rows were read.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when the run's file is missing.
    """
    fname = f'/snfs1/Project/Models/us_census/{epsilon}-RUN{replicate}/MDF_PER.txt'

    # Named `chunks` (not `iter`) so the builtin is not shadowed.
    chunks = pd.read_csv(fname, sep='|', names=col_names, header=None,
                         comment='#', chunksize=1_000_000)

    # Collect the per-chunk matches and concatenate once at the end.
    # Appending to a DataFrame inside the loop re-copies everything collected
    # so far on every iteration, and DataFrame.append no longer exists in
    # pandas >= 2.0.
    state_rows = []
    for i, df_i in enumerate(chunks):
        print(f'Processing chunk {i}, a dataframe with shape {df_i.shape}')
        state_rows.append(df_i[df_i.STATE == state_fips])

    if state_rows:
        df = pd.concat(state_rows)
    else:
        # pd.concat([]) raises ValueError; return a well-formed empty frame.
        df = pd.DataFrame(columns=col_names)
    print(f'Loaded data for one state, a dataframe with shape {df.shape}')

    return df

In [4]:
def transform_and_save(df, epsilon, replicate, state_fips):
    """Reduce one state's person records to five columns and write them to CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Person records with at least the COUNTY, ENUMDIST, RTYPE, QAGE and
        CENRACE columns.
    epsilon, replicate, state_fips
        Only used to build the output file name
        ``{state_fips}_{epsilon}-RUN{replicate}.csv``.

    Returns
    -------
    pandas.DataFrame
        The reduced frame that was written (columns: county, enum_district,
        group_quarters, over_18, race_eth), on the same index as ``df``.

    Raises
    ------
    AssertionError
        If any COUNTY value is not a multiple of 10.
    """
    # COUNTY must be an exact multiple of ten, so the division below is lossless.
    assert df['COUNTY'].mod(10).eq(0).all()

    out_path = f'/snfs1/Project/Models/us_census/{state_fips}_{epsilon}-RUN{replicate}.csv'

    # EUID (unit id), EPNUM (person number) and CENHISP (hispanic flag) are
    # deliberately not exported at present.
    reduced = pd.DataFrame({
        'county': df['COUNTY'].div(10).astype(int),
        'enum_district': df['ENUMDIST'].astype(int),
        # RTYPE == 3 is treated as "lives in group quarters".
        'group_quarters': df['RTYPE'].eq(3).astype(int),
        # NOTE(review): the flag is QAGE == 18, not QAGE >= 18 -- presumably
        # QAGE is a voting-age indicator rather than age in years; confirm
        # against the DAS writer.
        'over_18': df['QAGE'].eq(18).astype(int),
        # Raw CENRACE code; IPUMS coding still to come.
        'race_eth': df['CENRACE'].astype(int),
    })

    reduced.to_csv(out_path, index=False)

    return reduced

In [5]:
# df = load_one_state('0.25', 1, 53)

In [6]:
# transform_and_save(df, '0.25', 1, 53)

In [17]:
%%time
state_fips = 53

# (epsilon, run) pairs handled by this pass: epsilon 1.0, runs 3 and 4.
# Full epsilon list for reference: '0.25 0.50 0.75 1.0 2.0 4.0 6.0 8.0'
for epsilon, run in [(eps, rep) for eps in ['1.0'] for rep in ['3', '4']]:
    print(f'{epsilon}-RUN{run}')
    try:
        df = load_one_state(epsilon, run, state_fips)
        t = transform_and_save(df, epsilon, run, state_fips)
    except FileNotFoundError as e:
        # A missing input (or output location) is reported and the next pair is tried.
        print(e)

1.0-RUN3
Processing chunk 0, a dataframe with shape (1000000, 23)
Processing chunk 1, a dataframe with shape (1000000, 23)
Processing chunk 2, a dataframe with shape (1000000, 23)
Processing chunk 3, a dataframe with shape (1000000, 23)
Processing chunk 4, a dataframe with shape (1000000, 23)
Processing chunk 5, a dataframe with shape (1000000, 23)
Processing chunk 6, a dataframe with shape (1000000, 23)
Processing chunk 7, a dataframe with shape (1000000, 23)
Processing chunk 8, a dataframe with shape (1000000, 23)
Processing chunk 9, a dataframe with shape (1000000, 23)
Processing chunk 10, a dataframe with shape (1000000, 23)
Processing chunk 11, a dataframe with shape (1000000, 23)
Processing chunk 12, a dataframe with shape (1000000, 23)
Processing chunk 13, a dataframe with shape (1000000, 23)
Processing chunk 14, a dataframe with shape (1000000, 23)
Processing chunk 15, a dataframe with shape (1000000, 23)
Processing chunk 16, a dataframe with shape (1000000, 23)
Processing chun

Processing chunk 7, a dataframe with shape (1000000, 23)
Processing chunk 8, a dataframe with shape (1000000, 23)
Processing chunk 9, a dataframe with shape (1000000, 23)
Processing chunk 10, a dataframe with shape (1000000, 23)
Processing chunk 11, a dataframe with shape (1000000, 23)
Processing chunk 12, a dataframe with shape (1000000, 23)
Processing chunk 13, a dataframe with shape (1000000, 23)
Processing chunk 14, a dataframe with shape (1000000, 23)
Processing chunk 15, a dataframe with shape (1000000, 23)
Processing chunk 16, a dataframe with shape (1000000, 23)
Processing chunk 17, a dataframe with shape (1000000, 23)
Processing chunk 18, a dataframe with shape (1000000, 23)
Processing chunk 19, a dataframe with shape (1000000, 23)
Processing chunk 20, a dataframe with shape (1000000, 23)
Processing chunk 21, a dataframe with shape (1000000, 23)
Processing chunk 22, a dataframe with shape (1000000, 23)
Processing chunk 23, a dataframe with shape (1000000, 23)
Processing chunk 

In [18]:
%%time
state_fips = 53

# (epsilon, run) pairs handled by this pass: epsilons 2.0-8.0, runs 1-4.
# Full epsilon list for reference: '0.25 0.50 0.75 1.0 2.0 4.0 6.0 8.0'
for epsilon, run in [(eps, rep)
                     for eps in ['2.0', '4.0', '6.0', '8.0']
                     for rep in ['1', '2', '3', '4']]:
    print(f'{epsilon}-RUN{run}')
    try:
        df = load_one_state(epsilon, run, state_fips)
        t = transform_and_save(df, epsilon, run, state_fips)
    except FileNotFoundError as e:
        # A missing input (or output location) is reported and the next pair is tried.
        print(e)

2.0-RUN1
Processing chunk 0, a dataframe with shape (1000000, 23)
Processing chunk 1, a dataframe with shape (1000000, 23)
Processing chunk 2, a dataframe with shape (1000000, 23)
Processing chunk 3, a dataframe with shape (1000000, 23)
Processing chunk 4, a dataframe with shape (1000000, 23)
Processing chunk 5, a dataframe with shape (1000000, 23)
Processing chunk 6, a dataframe with shape (1000000, 23)
Processing chunk 7, a dataframe with shape (1000000, 23)
Processing chunk 8, a dataframe with shape (1000000, 23)
Processing chunk 9, a dataframe with shape (1000000, 23)
Processing chunk 10, a dataframe with shape (1000000, 23)
Processing chunk 11, a dataframe with shape (1000000, 23)
Processing chunk 12, a dataframe with shape (1000000, 23)
Processing chunk 13, a dataframe with shape (1000000, 23)
Processing chunk 14, a dataframe with shape (1000000, 23)
Processing chunk 15, a dataframe with shape (1000000, 23)
Processing chunk 16, a dataframe with shape (1000000, 23)
Processing chun

Processing chunk 7, a dataframe with shape (1000000, 23)
Processing chunk 8, a dataframe with shape (1000000, 23)
Processing chunk 9, a dataframe with shape (1000000, 23)
Processing chunk 10, a dataframe with shape (1000000, 23)
Processing chunk 11, a dataframe with shape (1000000, 23)
Processing chunk 12, a dataframe with shape (1000000, 23)
Processing chunk 13, a dataframe with shape (1000000, 23)
Processing chunk 14, a dataframe with shape (1000000, 23)
Processing chunk 15, a dataframe with shape (1000000, 23)
Processing chunk 16, a dataframe with shape (1000000, 23)
Processing chunk 17, a dataframe with shape (1000000, 23)
Processing chunk 18, a dataframe with shape (1000000, 23)
Processing chunk 19, a dataframe with shape (1000000, 23)
Processing chunk 20, a dataframe with shape (1000000, 23)
Processing chunk 21, a dataframe with shape (1000000, 23)
Processing chunk 22, a dataframe with shape (1000000, 23)
Processing chunk 23, a dataframe with shape (1000000, 23)
Processing chunk 

Processing chunk 14, a dataframe with shape (1000000, 23)
Processing chunk 15, a dataframe with shape (1000000, 23)
Processing chunk 16, a dataframe with shape (1000000, 23)
Processing chunk 17, a dataframe with shape (1000000, 23)
Processing chunk 18, a dataframe with shape (1000000, 23)
Processing chunk 19, a dataframe with shape (1000000, 23)
Processing chunk 20, a dataframe with shape (1000000, 23)
Processing chunk 21, a dataframe with shape (1000000, 23)
Processing chunk 22, a dataframe with shape (1000000, 23)
Processing chunk 23, a dataframe with shape (1000000, 23)
Processing chunk 24, a dataframe with shape (1000000, 23)
Processing chunk 25, a dataframe with shape (1000000, 23)
Processing chunk 26, a dataframe with shape (1000000, 23)
Processing chunk 27, a dataframe with shape (1000000, 23)
Processing chunk 28, a dataframe with shape (1000000, 23)
Processing chunk 29, a dataframe with shape (1000000, 23)
Processing chunk 30, a dataframe with shape (1000000, 23)
Processing chu

Processing chunk 21, a dataframe with shape (1000000, 23)
Processing chunk 22, a dataframe with shape (1000000, 23)
Processing chunk 23, a dataframe with shape (1000000, 23)
Processing chunk 24, a dataframe with shape (1000000, 23)
Processing chunk 25, a dataframe with shape (1000000, 23)
Processing chunk 26, a dataframe with shape (1000000, 23)
Processing chunk 27, a dataframe with shape (1000000, 23)
Processing chunk 28, a dataframe with shape (1000000, 23)
Processing chunk 29, a dataframe with shape (1000000, 23)
Processing chunk 30, a dataframe with shape (1000000, 23)
Processing chunk 31, a dataframe with shape (1000000, 23)
Processing chunk 32, a dataframe with shape (1000000, 23)
Processing chunk 33, a dataframe with shape (1000000, 23)
Processing chunk 34, a dataframe with shape (1000000, 23)
Processing chunk 35, a dataframe with shape (1000000, 23)
Processing chunk 36, a dataframe with shape (1000000, 23)
Processing chunk 37, a dataframe with shape (1000000, 23)
Processing chu

Processing chunk 28, a dataframe with shape (1000000, 23)
Processing chunk 29, a dataframe with shape (1000000, 23)
Processing chunk 30, a dataframe with shape (1000000, 23)
Processing chunk 31, a dataframe with shape (1000000, 23)
Processing chunk 32, a dataframe with shape (1000000, 23)
Processing chunk 33, a dataframe with shape (1000000, 23)
Processing chunk 34, a dataframe with shape (1000000, 23)
Processing chunk 35, a dataframe with shape (1000000, 23)
Processing chunk 36, a dataframe with shape (1000000, 23)
Processing chunk 37, a dataframe with shape (1000000, 23)
Processing chunk 38, a dataframe with shape (1000000, 23)
Processing chunk 39, a dataframe with shape (1000000, 23)
Processing chunk 40, a dataframe with shape (1000000, 23)
Processing chunk 41, a dataframe with shape (1000000, 23)
Processing chunk 42, a dataframe with shape (1000000, 23)
Processing chunk 43, a dataframe with shape (1000000, 23)
Processing chunk 44, a dataframe with shape (1000000, 23)
Processing chu

Processing chunk 35, a dataframe with shape (1000000, 23)
Processing chunk 36, a dataframe with shape (1000000, 23)
Processing chunk 37, a dataframe with shape (1000000, 23)
Processing chunk 38, a dataframe with shape (1000000, 23)
Processing chunk 39, a dataframe with shape (1000000, 23)
Processing chunk 40, a dataframe with shape (1000000, 23)
Processing chunk 41, a dataframe with shape (1000000, 23)
Processing chunk 42, a dataframe with shape (1000000, 23)
Processing chunk 43, a dataframe with shape (1000000, 23)
Processing chunk 44, a dataframe with shape (1000000, 23)
Processing chunk 45, a dataframe with shape (1000000, 23)
Processing chunk 46, a dataframe with shape (1000000, 23)
Processing chunk 47, a dataframe with shape (1000000, 23)
Processing chunk 48, a dataframe with shape (1000000, 23)
Processing chunk 49, a dataframe with shape (1000000, 23)
Processing chunk 50, a dataframe with shape (1000000, 23)
Processing chunk 51, a dataframe with shape (1000000, 23)
Processing chu

Processing chunk 42, a dataframe with shape (1000000, 23)
Processing chunk 43, a dataframe with shape (1000000, 23)
Processing chunk 44, a dataframe with shape (1000000, 23)
Processing chunk 45, a dataframe with shape (1000000, 23)
Processing chunk 46, a dataframe with shape (1000000, 23)
Processing chunk 47, a dataframe with shape (1000000, 23)
Processing chunk 48, a dataframe with shape (1000000, 23)
Processing chunk 49, a dataframe with shape (1000000, 23)
Processing chunk 50, a dataframe with shape (1000000, 23)
Processing chunk 51, a dataframe with shape (1000000, 23)
Processing chunk 52, a dataframe with shape (1000000, 23)
Processing chunk 53, a dataframe with shape (1000000, 23)
Processing chunk 54, a dataframe with shape (1000000, 23)
Processing chunk 55, a dataframe with shape (1000000, 23)
Processing chunk 56, a dataframe with shape (1000000, 23)
Processing chunk 57, a dataframe with shape (1000000, 23)
Processing chunk 58, a dataframe with shape (1000000, 23)
Processing chu

Processing chunk 49, a dataframe with shape (1000000, 23)
Processing chunk 50, a dataframe with shape (1000000, 23)
Processing chunk 51, a dataframe with shape (1000000, 23)
Processing chunk 52, a dataframe with shape (1000000, 23)
Processing chunk 53, a dataframe with shape (1000000, 23)
Processing chunk 54, a dataframe with shape (1000000, 23)
Processing chunk 55, a dataframe with shape (1000000, 23)
Processing chunk 56, a dataframe with shape (1000000, 23)
Processing chunk 57, a dataframe with shape (1000000, 23)
Processing chunk 58, a dataframe with shape (1000000, 23)
Processing chunk 59, a dataframe with shape (1000000, 23)
Processing chunk 60, a dataframe with shape (1000000, 23)
Processing chunk 61, a dataframe with shape (1000000, 23)
Processing chunk 62, a dataframe with shape (1000000, 23)
Processing chunk 63, a dataframe with shape (1000000, 23)
Processing chunk 64, a dataframe with shape (1000000, 23)
Processing chunk 65, a dataframe with shape (1000000, 23)
Processing chu

Processing chunk 56, a dataframe with shape (1000000, 23)
Processing chunk 57, a dataframe with shape (1000000, 23)
Processing chunk 58, a dataframe with shape (1000000, 23)
Processing chunk 59, a dataframe with shape (1000000, 23)
Processing chunk 60, a dataframe with shape (1000000, 23)
Processing chunk 61, a dataframe with shape (1000000, 23)
Processing chunk 62, a dataframe with shape (1000000, 23)
Processing chunk 63, a dataframe with shape (1000000, 23)
Processing chunk 64, a dataframe with shape (1000000, 23)
Processing chunk 65, a dataframe with shape (1000000, 23)
Processing chunk 66, a dataframe with shape (1000000, 23)
Processing chunk 67, a dataframe with shape (1000000, 23)
Processing chunk 68, a dataframe with shape (1000000, 23)
Processing chunk 69, a dataframe with shape (1000000, 23)
Processing chunk 70, a dataframe with shape (1000000, 23)
Processing chunk 71, a dataframe with shape (1000000, 23)
Processing chunk 72, a dataframe with shape (1000000, 23)
Processing chu

Processing chunk 63, a dataframe with shape (1000000, 23)
Processing chunk 64, a dataframe with shape (1000000, 23)
Processing chunk 65, a dataframe with shape (1000000, 23)
Processing chunk 66, a dataframe with shape (1000000, 23)
Processing chunk 67, a dataframe with shape (1000000, 23)
Processing chunk 68, a dataframe with shape (1000000, 23)
Processing chunk 69, a dataframe with shape (1000000, 23)
Processing chunk 70, a dataframe with shape (1000000, 23)
Processing chunk 71, a dataframe with shape (1000000, 23)
Processing chunk 72, a dataframe with shape (1000000, 23)
Processing chunk 73, a dataframe with shape (1000000, 23)
Processing chunk 74, a dataframe with shape (1000000, 23)
Processing chunk 75, a dataframe with shape (1000000, 23)
Processing chunk 76, a dataframe with shape (1000000, 23)
Processing chunk 77, a dataframe with shape (1000000, 23)
Processing chunk 78, a dataframe with shape (1000000, 23)
Processing chunk 79, a dataframe with shape (1000000, 23)
Processing chu

Processing chunk 70, a dataframe with shape (1000000, 23)
Processing chunk 71, a dataframe with shape (1000000, 23)
Processing chunk 72, a dataframe with shape (1000000, 23)
Processing chunk 73, a dataframe with shape (1000000, 23)
Processing chunk 74, a dataframe with shape (1000000, 23)
Processing chunk 75, a dataframe with shape (1000000, 23)
Processing chunk 76, a dataframe with shape (1000000, 23)
Processing chunk 77, a dataframe with shape (1000000, 23)
Processing chunk 78, a dataframe with shape (1000000, 23)
Processing chunk 79, a dataframe with shape (1000000, 23)
Processing chunk 80, a dataframe with shape (1000000, 23)
Processing chunk 81, a dataframe with shape (1000000, 23)
Processing chunk 82, a dataframe with shape (1000000, 23)
Processing chunk 83, a dataframe with shape (1000000, 23)
Processing chunk 84, a dataframe with shape (1000000, 23)
Processing chunk 85, a dataframe with shape (1000000, 23)
Processing chunk 86, a dataframe with shape (1000000, 23)
Processing chu

Processing chunk 77, a dataframe with shape (1000000, 23)
Processing chunk 78, a dataframe with shape (1000000, 23)
Processing chunk 79, a dataframe with shape (1000000, 23)
Processing chunk 80, a dataframe with shape (1000000, 23)
Processing chunk 81, a dataframe with shape (1000000, 23)
Processing chunk 82, a dataframe with shape (1000000, 23)
Processing chunk 83, a dataframe with shape (1000000, 23)
Processing chunk 84, a dataframe with shape (1000000, 23)
Processing chunk 85, a dataframe with shape (1000000, 23)
Processing chunk 86, a dataframe with shape (1000000, 23)
Processing chunk 87, a dataframe with shape (1000000, 23)
Processing chunk 88, a dataframe with shape (1000000, 23)
Processing chunk 89, a dataframe with shape (1000000, 23)
Processing chunk 90, a dataframe with shape (1000000, 23)
Processing chunk 91, a dataframe with shape (1000000, 23)
Processing chunk 92, a dataframe with shape (1000000, 23)
Processing chunk 93, a dataframe with shape (1000000, 23)
Processing chu

Processing chunk 84, a dataframe with shape (1000000, 23)
Processing chunk 85, a dataframe with shape (1000000, 23)
Processing chunk 86, a dataframe with shape (1000000, 23)
Processing chunk 87, a dataframe with shape (1000000, 23)
Processing chunk 88, a dataframe with shape (1000000, 23)
Processing chunk 89, a dataframe with shape (1000000, 23)
Processing chunk 90, a dataframe with shape (1000000, 23)
Processing chunk 91, a dataframe with shape (1000000, 23)
Processing chunk 92, a dataframe with shape (1000000, 23)
Processing chunk 93, a dataframe with shape (1000000, 23)
Processing chunk 94, a dataframe with shape (1000000, 23)
Processing chunk 95, a dataframe with shape (1000000, 23)
Processing chunk 96, a dataframe with shape (1000000, 23)
Processing chunk 97, a dataframe with shape (1000000, 23)
Processing chunk 98, a dataframe with shape (1000000, 23)
Processing chunk 99, a dataframe with shape (1000000, 23)
Processing chunk 100, a dataframe with shape (1000000, 23)
Processing ch

Processing chunk 91, a dataframe with shape (1000000, 23)
Processing chunk 92, a dataframe with shape (1000000, 23)
Processing chunk 93, a dataframe with shape (1000000, 23)
Processing chunk 94, a dataframe with shape (1000000, 23)
Processing chunk 95, a dataframe with shape (1000000, 23)
Processing chunk 96, a dataframe with shape (1000000, 23)
Processing chunk 97, a dataframe with shape (1000000, 23)
Processing chunk 98, a dataframe with shape (1000000, 23)
Processing chunk 99, a dataframe with shape (1000000, 23)
Processing chunk 100, a dataframe with shape (1000000, 23)
Processing chunk 101, a dataframe with shape (1000000, 23)
Processing chunk 102, a dataframe with shape (1000000, 23)
Processing chunk 103, a dataframe with shape (1000000, 23)
Processing chunk 104, a dataframe with shape (1000000, 23)
Processing chunk 105, a dataframe with shape (1000000, 23)
Processing chunk 106, a dataframe with shape (1000000, 23)
Processing chunk 107, a dataframe with shape (1000000, 23)
Proces

Processing chunk 98, a dataframe with shape (1000000, 23)
Processing chunk 99, a dataframe with shape (1000000, 23)
Processing chunk 100, a dataframe with shape (1000000, 23)
Processing chunk 101, a dataframe with shape (1000000, 23)
Processing chunk 102, a dataframe with shape (1000000, 23)
Processing chunk 103, a dataframe with shape (1000000, 23)
Processing chunk 104, a dataframe with shape (1000000, 23)
Processing chunk 105, a dataframe with shape (1000000, 23)
Processing chunk 106, a dataframe with shape (1000000, 23)
Processing chunk 107, a dataframe with shape (1000000, 23)
Processing chunk 108, a dataframe with shape (1000000, 23)
Processing chunk 109, a dataframe with shape (1000000, 23)
Processing chunk 110, a dataframe with shape (1000000, 23)
Processing chunk 111, a dataframe with shape (1000000, 23)
Processing chunk 112, a dataframe with shape (1000000, 23)
Processing chunk 113, a dataframe with shape (1000000, 23)
Processing chunk 114, a dataframe with shape (1000000, 23)

Processing chunk 105, a dataframe with shape (1000000, 23)
Processing chunk 106, a dataframe with shape (1000000, 23)
Processing chunk 107, a dataframe with shape (1000000, 23)
Processing chunk 108, a dataframe with shape (1000000, 23)
Processing chunk 109, a dataframe with shape (1000000, 23)
Processing chunk 110, a dataframe with shape (1000000, 23)
Processing chunk 111, a dataframe with shape (1000000, 23)
Processing chunk 112, a dataframe with shape (1000000, 23)
Processing chunk 113, a dataframe with shape (1000000, 23)
Processing chunk 114, a dataframe with shape (1000000, 23)
Processing chunk 115, a dataframe with shape (1000000, 23)
Processing chunk 116, a dataframe with shape (1000000, 23)
Processing chunk 117, a dataframe with shape (1000000, 23)
Processing chunk 118, a dataframe with shape (1000000, 23)
Processing chunk 119, a dataframe with shape (1000000, 23)
Processing chunk 120, a dataframe with shape (1000000, 23)
Processing chunk 121, a dataframe with shape (1000000, 2

In [19]:
!ls -halt /snfs1/Project/Models/us_census/*.csv

-rw-r--r-- 1 abie Domain Users  22M Jun 10 14:17 /snfs1/Project/Models/us_census/53_8.0-RUN4.csv
-rw-r--r-- 1 abie Domain Users  22M Jun 10 14:13 /snfs1/Project/Models/us_census/53_8.0-RUN3.csv
-rw-r--r-- 1 abie Domain Users  22M Jun 10 14:10 /snfs1/Project/Models/us_census/53_8.0-RUN2.csv
-rw-r--r-- 1 abie Domain Users  22M Jun 10 14:06 /snfs1/Project/Models/us_census/53_8.0-RUN1.csv
-rw-r--r-- 1 abie Domain Users  22M Jun 10 14:03 /snfs1/Project/Models/us_census/53_6.0-RUN4.csv
-rw-r--r-- 1 abie Domain Users  22M Jun 10 14:00 /snfs1/Project/Models/us_census/53_6.0-RUN3.csv
-rw-r--r-- 1 abie Domain Users  22M Jun 10 13:56 /snfs1/Project/Models/us_census/53_6.0-RUN2.csv
-rw-r--r-- 1 abie Domain Users  22M Jun 10 13:53 /snfs1/Project/Models/us_census/53_6.0-RUN1.csv
-rw-r--r-- 1 abie Domain Users  22M Jun 10 13:50 /snfs1/Project/Models/us_census/53_4.0-RUN4.csv
-rw-r--r-- 1 abie Domain Users  22M Jun 10 13:46 /snfs1/Project/Models/us_census/53_4.0-RUN3.csv
-rw-r--r-- 1 abie Do

In [20]:
!head /snfs1/Project/Models/us_census/53_8.0-RUN4.csv

county,enum_district,group_quarters,over_18,race_eth
35,10,0,0,1
35,10,0,1,1
35,10,0,1,1
35,10,0,1,1
35,110,0,0,1
35,110,0,1,1
35,110,0,1,1
35,110,0,0,1
35,110,0,1,1


In [21]:
%%time
# Reload one saved extract (state 53, epsilon 8.0, run 4) from CSV and time the read.
df2 = pd.read_csv('/snfs1/Project/Models/us_census/53_8.0-RUN4.csv')

CPU times: user 536 ms, sys: 31.9 ms, total: 568 ms
Wall time: 567 ms


In [22]:
# Show the reloaded extract; the display is truncated by the display.max_rows option.
df2

Unnamed: 0,county,enum_district,group_quarters,over_18,race_eth
0,35,10,0,0,1
1,35,10,0,1,1
2,35,10,0,1,1
3,35,10,0,1,1
4,35,110,0,0,1
...,...,...,...,...,...
1741611,73,720,0,1,1
1741612,73,720,0,1,1
1741613,1,180,0,0,1
1741614,1,180,0,1,1


In [12]:
# Number of rows per race code. The column is selected by name rather than by
# position (it was column 4) so the cell keeps working if the CSV layout changes.
df2['race_eth'].value_counts()

1    1703830
5      14479
3      11048
2       7537
4       2365
6       2357
Name: race_eth, dtype: int64

https://usa.ipums.org/usa/resources/1940CensusDASTestData/EXT1940USCB.cbk

    RACE		Race [general version]
    1		White
    2		Black/African American/Negro
    3		American Indian or Alaska Native
    4		Chinese
    5		Japanese
    6		Other Asian or Pacific Islander
    7		Other race, nec
    8		Two major races
    9		Three or more major races

In [13]:
# Percentage of each county's rows falling in each race code
# (one row per county, one column per code, 0 where a code is absent).
tt = (
    df2.groupby('county')['race_eth']
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)
    .mul(100)
)
# Rounded to one decimal and ordered by the share of code 3
# (American Indian or Alaska Native in the codebook).
tt.round(1).sort_values(3)

race_eth,1,2,3,4,5,6
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
21,99.0,0.3,0.0,0.0,0.4,0.3
3,99.8,0.0,0.0,0.0,0.1,0.2
5,100.0,0.0,0.0,0.0,0.0,0.0
7,99.4,0.4,0.0,0.0,0.1,0.0
11,99.6,0.1,0.0,0.0,0.4,0.0
...,...,...,...,...,...,...
65,95.5,0.0,3.4,0.0,1.0,0.1
47,95.7,0.1,3.5,0.0,0.7,0.0
39,95.0,0.2,4.1,0.0,0.6,0.1
55,95.1,0.0,4.9,0.0,0.0,0.0


In [14]:
# Same breakdown one level finer: percentage of rows in each race code
# within every (county, enumeration district) pair.
tt = (
    df2.groupby(['county', 'enum_district'])['race_eth']
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)
    .mul(100)
)
# Rounded to one decimal and ordered by the share of code 3
# (American Indian or Alaska Native in the codebook).
tt.round(1).sort_values(3)

Unnamed: 0_level_0,race_eth,1,2,3,4,5,6
county,enum_district,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,10,100.0,0.0,0.0,0.0,0.0,0.0
57,370,100.0,0.0,0.0,0.0,0.0,0.0
57,380,100.0,0.0,0.0,0.0,0.0,0.0
57,400,100.0,0.0,0.0,0.0,0.0,0.0
57,410,100.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
35,160,0.0,0.0,100.0,0.0,0.0,0.0
65,380,0.0,0.0,100.0,0.0,0.0,0.0
27,1070,0.0,0.0,100.0,0.0,0.0,0.0
65,520,0.0,0.0,100.0,0.0,0.0,0.0
