# Combine population & demographic data
---

In [57]:
import pandas as pd, numpy as np
import crime as cr

In [58]:
def cols(df) -> None:
    """ Prints every column name on its own line, skipping the 'county' and 'year' key columns """
    visible = [str(label) for label in df.columns if label not in ('county', 'year')]
    # Joining with newlines keeps the one-name-per-line layout (a lone blank line when nothing is left)
    print('\n'.join(visible))

def merge(df1, df2) -> pd.DataFrame:
    """ Inner-joins two frames on the shared ('year', 'county') key pair, which is the only join we ever need """
    join_keys = ['year', 'county']
    return pd.merge(df1, df2, how='inner', on=join_keys)

def rename(df, text, replacement) -> pd.DataFrame:
    """
    Bulk replace a substring in the name of all columns.

    Every column label is passed through str.replace(text, replacement)
    exactly once, and a new DataFrame is returned; the input frame is left
    untouched. Doing it in a single pass matters: renaming one column at a
    time lets an already-renamed column be picked up again when its new name
    equals a later original name (e.g. 'a' -> 'ab' followed by 'ab' -> 'abb').
    """
    return df.rename(columns=lambda label: label.replace(text, replacement))


def add_ordinal(df, col:str, order:list, replace=False) -> pd.DataFrame:
    """
    Derive an ordinal (1-based rank) column from an existing categorical col.
    `order` lists the categories in ascending order, so ['A', 'B', 'C'] is
    encoded with the map {'A': 1, 'B': 2, 'C': 3}. The new column is named
    '<col>_ord' and is placed right before `col`. With replace=True the
    original categorical column is dropped afterwards.
    """
    ranks = {category: rank for rank, category in enumerate(order, start=1)}
    df = insert(df, f'{col}_ord', col, df[col].map(ranks))
    return df.drop(columns=col) if replace else df


def insert(df, name, target, col:pd.Series) -> pd.DataFrame:
    """
    Adds `col` to df under the label `name`, positioned immediately to the
    left of the existing column called `target`. The frame is modified in
    place and also returned, so calls can be written `df = insert(df, ...)`.
    """
    position = df.columns.tolist().index(target)
    df.insert(loc=position, column=name, value=col)
    return df


def add_binmax(df, name, cols, replace=False) -> pd.DataFrame:
    """
    For every row, records WHICH of the `cols` columns holds the largest value
    (row-wise idxmax, so the values are column names). The result is stored
    as a new column `name`, placed right before cols[0]. With replace=True
    the `cols` columns themselves are dropped afterwards.
    """
    winning_bin = df[cols].idxmax(axis='columns')
    df = insert(df, name, cols[0], winning_bin)
    if not replace:
        return df
    return df.drop(columns=cols)


def move_col(df, name, target) -> pd.DataFrame:
    """
    Moves existing column matching given name right before
    existing column matching target name.

    The frame is modified in place and also returned. Both column names are
    validated BEFORE anything is removed, so a bad call can no longer leave
    the frame with the `name` column popped and lost.

    Raises KeyError if `name` is not a column, ValueError if `target` is not.
    Asking to move a column before itself is a no-op.
    """
    labels = list(df.columns)
    if name not in labels:
        raise KeyError(name)
    if target not in labels:
        raise ValueError(f"target column {target!r} is not in the DataFrame")
    if name == target:
        return df

    col = df.pop(name)
    # Look the position up after the pop: removing `name` may have shifted `target` left
    idx = list(df.columns).index(target)
    df.insert(idx, name, col)
    return df


def separate_by(df, to_match, keys=('year', 'county'), keep=(), start=False, end=False, mode="") -> "pd.DataFrame | tuple[pd.DataFrame, pd.DataFrame]":
    """
    Split the columns of df by whether their name matches a substring.

    to_match: a substring, or a list of substrings; a column is selected when
              it matches ANY of them.
    keys:     identifier columns that are kept in both results.
    keep:     extra columns carried along with the matching columns only.
    start / end: match only at the start / end of the column name instead of
              anywhere in it (start takes precedence if both are set).
    mode:     'include' -> return only the first frame below,
              'exclude' -> return only the second frame,
              anything else -> return the tuple (include, exclude).

    The two frames are:
    1. include: keys + keep + all columns whose name DOES match
    2. exclude: every column of df except `keep` and the matching columns

    Matching columns are ordered by substring (in to_match order), then by
    position in df. Each column is selected at most once, even when it matches
    several substrings, and key/keep columns are never counted as matches.
    """
    if not isinstance(to_match, list):
        to_match = [to_match]
    keys, keep = list(keys), list(keep)

    def is_match(column, txt):
        if start:
            return column.startswith(txt)
        if end:
            return column.endswith(txt)
        return txt in column

    reserved = set(keys) | set(keep)
    # dict.fromkeys de-duplicates while preserving first-seen order
    names = list(dict.fromkeys(
        c for txt in to_match for c in df.columns
        if c not in reserved and is_match(c, txt)
    ))

    if mode == 'include':
        return df[keys + keep + names].copy()
    if mode == 'exclude':
        return df.drop(columns=keep + names)
    return df[keys + keep + names].copy(), df.drop(columns=keep + names)


def combine(df, name=None, cols:list=None, items:dict=None, replace=True) -> pd.DataFrame:
    """
    Adds a column holding the element-wise sum of several existing columns.
    The new column is placed right before the first of the summed columns, so
    with replace=True (which drops the summed columns) it effectively takes
    over their original position.
    Single sum: pass `name` and `cols`. Several sums in one call: pass
    `items` as a dict of {new name: list of columns to sum}; when `items` is
    given, `name` and `cols` are ignored.
    """
    jobs = items if items else {name: cols}
    for total_name, parts in jobs.items():
        total = sum(df[part] for part in parts)
        df = insert(df, total_name, parts[0], total)
        if replace:
            df = df.drop(columns=parts)
    return df


---
---
---

## Load data

In [59]:
pd.set_option('display.max_colwidth', 80)

pop_raw = pd.read_csv("../county_stats/output/county_population.csv", index_col=0)


def _read_census_year(year) -> pd.DataFrame:
    """ Loads one year's raw census CSV, drops the unused columns and tags every row with the year """
    raw = pd.read_csv(f'../county_stats/raw/census_counties_{year}.csv')
    trimmed = raw.drop(columns=['pop', 'geonum', 'the_geom'])
    return trimmed.assign(year=year).rename(columns={'civ_ni_':'civ_ni_p'})


# Stack the yearly census files (2012 through 2019). The df gets fragmented
# during the process, so copy it to fix this
dem_raw = pd.concat([_read_census_year(y) for y in range(2012, 2020)]).copy()

# Move 'year' from the end to the second column position
dem_raw.insert(1, 'year', dem_raw.pop('year'))

In [60]:
pop_raw

Unnamed: 0,year,county,total,male,female,over18,under19,under19_male,under19_female,over18_male,over18_female
0,1990.0,ADAMS,265709.0,131902.0,133807.0,184665.0,81044.0,41519.0,39525.0,90383.0,94282.0
1,1990.0,ALAMOSA,13617.0,6677.0,6940.0,9311.0,4306.0,2189.0,2117.0,4488.0,4823.0
2,1990.0,ARAPAHOE,393289.0,191722.0,201567.0,281301.0,111988.0,57241.0,54747.0,134481.0,146820.0
3,1990.0,ARCHULETA,5352.0,2725.0,2627.0,3709.0,1643.0,890.0,753.0,1835.0,1874.0
4,1990.0,BACA,4556.0,2244.0,2312.0,3360.0,1196.0,597.0,599.0,1647.0,1713.0
...,...,...,...,...,...,...,...,...,...,...,...
3899,2050.0,SUMMIT,37255.0,19212.0,18048.0,32286.0,4969.0,2541.0,2428.0,16671.0,15620.0
3900,2050.0,TELLER,29192.0,14505.0,14698.0,24411.0,4781.0,2433.0,2349.0,12072.0,12349.0
3901,2050.0,WASHINGTON,4277.0,2201.0,2076.0,3591.0,686.0,353.0,334.0,1848.0,1742.0
3902,2050.0,WELD,619624.0,311234.0,308389.0,474189.0,145435.0,74010.0,71425.0,237224.0,236964.0


## Combine population data and census data
- Looking in the original demographic data, most population groups are present: gender, age, etc. So why not use those?
- A couple reasons.
  - The population dataset is likely more accurate, claiming to provide "actual" numbers, whereas the census data provides "estimates"
  - The population dataset is more precise: it has a separate count for each individual age, which lets us build our own aggregated bins (adult, minor). The census data comes with predefined age groups in increments of 5, so the relevant group is "15 to 19", but we need 18 and under!
  - The population dataset offers sub-aggregations: we have `minor_female` and `minor_male`, for instance, whereas the census data only offers age populations and gender populations separately
- So instead, we will use population dataset first, and add in additional groups from census data
---

#### Select desired columns from census data

In [61]:
# Normalise the county identifier (name it 'county', upper-case it) on a fresh
# frame derived from dem_raw, leaving dem_raw itself untouched
df = (
    dem_raw
    .rename(columns={'geoname': 'county'})
    .assign(county=lambda frame: frame['county'].str.upper())
)

# Keep only the census columns used further down, one thematic group per line
df = df[[
    'year', 'county', 'med_age',
    'households', 'avghhsize',
    'civ_lf', 'emp', 'unemp',
    'hispanic', 'white_nh', 'black_nh', 'asian_nh', 'ntvam_nh', 'hawpi_nh', 'other_nh', 'twoplus_nh',
    'pop25plus', 'hsgrad_sc',
    'med_hh_inc', 'per_cap_in',
    'citz_birth', 'citz_nat', 'born_in_co',
    'pop_3pl', 'enrolled', 'undergrad',
    'gr_1_4', 'gr_5_8', 'gr_9_12',
    'med_hm_val', 'med_yr_blt',
    'housing_un', 'occ_hu',
    'own_occ_hu',
    'v_l_50k', 'v50k_100k', 'v100k_150k', 'v150k_200k', 'v200k_250k', 'v250k_300k',
    'v300k_400k', 'v400k_500k', 'v500k_750k', 'v750k_1m', 'v_1m_plus',
    'b2000_2009', 'b1990_1999', 'b1980_1989', 'b1970_1979',
    'b1960_1969', 'b1950_1959', 'b1940_1949', 'b1939_e',
    'ps_uni', 'ps_below',
    'tot_l18', 'pov_l18',
]]

#### Group bins together

In [62]:
# Total citizens = citizens by birth + naturalized citizens. The new 'citz'
# column goes right before citz_birth; citz_nat is not needed afterwards
df = insert(df, 'citz', 'citz_birth', df['citz_birth'] + df['citz_nat'])
df = df.drop(columns=['citz_nat'])

# Note: combine() replaces each list of columns with their sum, IN the
# original position of the first listed column. Entries are processed in order.
df = combine(df, items={
    # smaller race groups -> one 'other' bucket
    'race_other': ['ntvam_nh', 'hawpi_nh', 'other_nh', 'twoplus_nh'],
    # everything built before 1950
    'b1949_e': ['b1939_e', 'b1940_1949'],
    # coarser home-value bins
    'v50k_150k': ['v50k_100k', 'v100k_150k'],
    'v150k_250k': ['v150k_200k', 'v200k_250k'],
    'v250k_400k': ['v250k_300k', 'v300k_400k'],
    'v400k_750k': ['v400k_500k', 'v500k_750k'],
    'v750k_plus': ['v750k_1m', 'v_1m_plus'],
})

#### Create ordinal variables for housing price and housing age
- First, create a categorical variable whose values are the COLUMN NAME of the bin with the max value. For instance, if a given county has more houses in the `v50k_100k` range than any other range, the value at that row in the new column will be "v50k_100k"
- Next, create an ordinal column from that categorical column, ordered so that a lower number means less desirable. So for prices, "v_l_50k" -> 1, and for year built, "b1949_e" -> 1

Housing year built

In [63]:
# Year-built bins, oldest first. This order defines the ordinal encoding below.
blt_ascending = [
    'b1949_e',
    'b1950_1959',
    'b1960_1969',
    'b1970_1979',
    'b1980_1989',
    'b1990_1999',
    'b2000_2009',
]
# Categorical: name of the year-built bin with the max value in each row
df = add_binmax(df, name='blt_freq_yr', cols=blt_ascending)
# Ordinal: position of that bin in blt_ascending (1 = oldest)
df = add_ordinal(df, col='blt_freq_yr', order=blt_ascending)

Housing Prices

In [64]:
# Home-value bins, cheapest first. This order defines the ordinal encoding below.
prices_ascending = [
    'v_l_50k',
    'v50k_150k',
    'v150k_250k',
    'v250k_400k',
    'v400k_750k',
    'v750k_plus',
]
# Categorical: name of the home-value bin with the max value in each row
df = add_binmax(df, name='hu_freq_val', cols=prices_ascending)
# Ordinal: position of that bin in prices_ascending (1 = cheapest)
df = add_ordinal(df, col='hu_freq_val', order=prices_ascending)

---

#### Rename everything, with a naming system that lets us easily select sub-groups of columns with a simple string match

In [65]:
# Every new name starts with the prefix of the group it belongs to
# (gend_, age_, race_, hu_, ...), so a group can be selected by string match.

# Population dataset: old name -> new name
pop_renames = {
    'total': 'pop',
    # gender
    'male': 'gend_m',
    'female': 'gend_f',
    # age
    'over18': 'age_over18',
    'under19': 'age_undr19',
    # gender x age
    'over18_male': 'gend_m_age_over18',
    'over18_female': 'gend_f_age_over18',
    'under19_male': 'gend_m_age_undr19',
    'under19_female': 'gend_f_age_undr19',
}
pop = pop_raw.rename(columns=pop_renames)

# Census dataset: old name -> new name
census_renames = {
    # age
    'med_age': 'age_median',
    # income
    'per_cap_in': 'inc_per_cap',
    'med_hh_inc': 'inc_hh_median',
    # households
    'households': 'hh',
    'avghhsize': 'hh_size_avg',
    # high-school graduates among the 25+ population
    'pop25plus': 'hsgrad_pool',
    'hsgrad_sc': 'hsgrad_graduated',
    # citizenship (citz_birth keeps its name; listed for completeness)
    'born_in_co': 'citz_co',
    'citz_birth': 'citz_birth',
    # civilian labor force
    'emp': 'civ_lf_employed',
    # race
    'hispanic': 'race_hispanic',
    'white_nh': 'race_white',
    'black_nh': 'race_black',
    'asian_nh': 'race_asian',
    # poverty status (ps_below keeps its name; listed for completeness)
    'ps_uni': 'ps_known',
    'ps_below': 'ps_below',
    'tot_l18': 'ps_undr18_known',
    'pov_l18': 'ps_undr18_below',
    # students
    'pop_3pl': 'stud_enroll_pool',
    'enrolled': 'stud_enrolled',
    'undergrad': 'stud_undergrad',
    'gr_1_4': 'stud_1_4',
    'gr_5_8': 'stud_5_8',
    'gr_9_12': 'stud_9_12',
    # housing units: total and occupied
    'housing_un': 'hu',
    'occ_hu': 'hu_occ',
    # housing units: year built
    'blt_freq_yr': 'hu_blt_freq_yr',
    'blt_freq_yr_ord': 'hu_blt_freq_yr_ord',
    'b1949_e': 'hu_blt_lt_1950',
    'b1950_1959': 'hu_blt_1950_1959',
    'b1960_1969': 'hu_blt_1960_1969',
    'b1970_1979': 'hu_blt_1970_1979',
    'b1980_1989': 'hu_blt_1980_1989',
    'b1990_1999': 'hu_blt_1990_1999',
    'b2000_2009': 'hu_blt_2000_plus',
    # housing units: owner occupied, by home value
    'own_occ_hu': 'hu_oo',
    'hu_freq_val': 'hu_oo_freq_val',
    'hu_freq_val_ord': 'hu_oo_freq_val_ord',
    'v_l_50k': 'hu_oo_lt_50',
    'v50k_150k': 'hu_oo_50_150',
    'v150k_250k': 'hu_oo_150_250',
    'v250k_400k': 'hu_oo_250_400',
    'v400k_750k': 'hu_oo_400_750',
    'v750k_plus': 'hu_oo_750_plus',
}
df = df.rename(columns=census_renames)

### Merge population and census data

In [66]:
# Inner-join population and census data on (year, county), then move the
# census median age right before the population age columns
main = move_col(merge(pop, df), 'age_median', 'age_over18')

## Calculations for groups
---

In [67]:
from grouped_df import GroupedDF
# Column-name prefix -> human-readable title of the group
GroupedDF.groups = {
    'gend': 'Gender',
    'age': 'Age',
    'race': 'Race',
    'inc': 'Income',
    'hh': 'Households',
    'citz': 'Citizenship',
    'hsgrad': 'Population over 25 (highschool graduate counts)',
    'civ_lf': 'Civilian laborforce (employment)',
    'ps': 'Poverty status',
    'stud': 'Students (counts by program type)',
    'hu': 'Housing units: Total and occupied.',
    'hu_blt': 'Housing units: Year built',
    'hu_oo': 'Housing units: Owner occupied (home values. Difference between occupied and owner occupied is renter occupied)',
}

#### Group names

In [68]:
# NOTE(review): this first assignment is overwritten by the very next line;
# unless the GroupedDF constructor has side effects it looks redundant — confirm before removing.
gd = GroupedDF(main)
# Build the grouped view again, this time passing an explicit column list for the 'hu' group
# (presumably so that 'hu' does not also pick up the hu_blt_* / hu_oo_* columns — verify in grouped_df).
gd = GroupedDF(main, custom={'hu': ['year', 'county', 'hu', 'hu_occ']})
# Show each group; presumably the argument limits the number of rows shown per group — TODO confirm
gd.display(5)

gend: Gender


Unnamed: 0,year,county,gend_m,gend_f,gend_m_age_undr19,gend_f_age_undr19,gend_m_age_over18,gend_f_age_over18
0,2012.0,ADAMS,231571.0,228902.0,69462.0,66249.0,162109.0,162653.0
1,2012.0,ALAMOSA,7823.0,7792.0,2201.0,2044.0,5622.0,5748.0
2,2012.0,ARAPAHOE,292548.0,303673.0,80341.0,76419.0,212207.0,227254.0
3,2012.0,ARCHULETA,6033.0,5960.0,1297.0,1150.0,4736.0,4810.0
4,2012.0,BACA,1836.0,1869.0,422.0,409.0,1414.0,1460.0



age: Age


Unnamed: 0,year,county,age_median,age_over18,age_undr19
0,2012.0,ADAMS,32.4,324757.0,135711.0
1,2012.0,ALAMOSA,32.2,11367.0,4239.0
2,2012.0,ARAPAHOE,35.7,439465.0,156761.0
3,2012.0,ARCHULETA,47.5,9539.0,2450.0
4,2012.0,BACA,47.8,2872.0,830.0



race: Race


Unnamed: 0,year,county,race_hispanic,race_white,race_black,race_asian,race_other
0,2012.0,ADAMS,167556,235991,12970,15304,11175
1,2012.0,ALAMOSA,7185,7767,110,59,629
2,2012.0,ARAPAHOE,105174,364766,55629,28067,20721
3,2012.0,ARCHULETA,2157,9493,9,117,333
4,2012.0,BACA,347,3311,16,30,79



inc: Income


Unnamed: 0,year,county,inc_hh_median,inc_per_cap
0,2012.0,ADAMS,56633,24357
1,2012.0,ALAMOSA,38045,19657
2,2012.0,ARAPAHOE,60400,32845
3,2012.0,ARCHULETA,54007,29771
4,2012.0,BACA,39497,22436



hh: Households


Unnamed: 0,year,county,hh,hh_size_avg
0,2012.0,ADAMS,151034,2.91
1,2012.0,ALAMOSA,5853,2.49
2,2012.0,ARAPAHOE,223747,2.55
3,2012.0,ARCHULETA,4536,2.64
4,2012.0,BACA,1675,2.18



citz: Citizenship


Unnamed: 0,year,county,citz,citz_birth,citz_co
0,2012.0,ADAMS,396172,376454,223907
1,2012.0,ALAMOSA,15122,14868,9542
2,2012.0,ARAPAHOE,519940,487576,223433
3,2012.0,ARCHULETA,11924,11729,3411
4,2012.0,BACA,3717,3654,1996



hsgrad: Population over 25 (highschool graduate counts)


Unnamed: 0,year,county,hsgrad_pool,hsgrad_graduated
0,2012.0,ADAMS,275628,166731
1,2012.0,ALAMOSA,9424,5946
2,2012.0,ARAPAHOE,378792,199197
3,2012.0,ARCHULETA,8659,4882
4,2012.0,BACA,2769,1909



civ_lf: Civilian laborforce (employment)


Unnamed: 0,year,county,civ_lf,civ_lf_employed
0,2012.0,ADAMS,236110,213794
1,2012.0,ALAMOSA,7171,6449
2,2012.0,ARAPAHOE,318041,292089
3,2012.0,ARCHULETA,6124,5444
4,2012.0,BACA,1876,1827



ps: Poverty status


Unnamed: 0,year,county,ps_known,ps_below,ps_undr18_known,ps_undr18_below
0,2012.0,ADAMS,438171,62008,124375,25278
1,2012.0,ALAMOSA,14622,3191,3817,758
2,2012.0,ARAPAHOE,568999,66945,144576,23054
3,2012.0,ARCHULETA,11989,1051,2386,359
4,2012.0,BACA,3649,530,824,139



stud: Students (counts by program type)


Unnamed: 0,year,county,stud_enroll_pool,stud_enrolled,stud_undergrad,stud_1_4,stud_5_8,stud_9_12
0,2012.0,ADAMS,420756,117499,19299,28761,26645,24342
1,2012.0,ALAMOSA,14903,5362,2285,736,801,890
2,2012.0,ARAPAHOE,549701,153854,29388,33703,30902,33425
3,2012.0,ARCHULETA,11866,2588,228,494,748,789
4,2012.0,BACA,3663,749,52,126,221,215



hu: Housing units: Total and occupied.


Unnamed: 0,year,county,hu,hu_occ
0,2012.0,ADAMS,163245,151034
1,2012.0,ALAMOSA,6572,5853
2,2012.0,ARAPAHOE,238160,223747
3,2012.0,ARCHULETA,8742,4536
4,2012.0,BACA,2253,1675



hu_blt: Housing units: Year built


Unnamed: 0,year,county,hu_blt_2000_plus,hu_blt_1990_1999,hu_blt_1980_1989,hu_blt_1970_1979,hu_blt_1960_1969,hu_blt_1950_1959,hu_blt_freq_yr_ord,hu_blt_freq_yr,hu_blt_lt_1950
0,2012.0,ADAMS,38682,27598,20368,30185,19615,20369,7,b2000_2009,6158
1,2012.0,ALAMOSA,650,866,862,1405,654,591,1,b1949_e,1536
2,2012.0,ARAPAHOE,39415,33989,56011,62253,22258,16519,4,b1970_1979,7165
3,2012.0,ARCHULETA,2204,2186,2054,1384,326,124,7,b2000_2009,415
4,2012.0,BACA,46,172,172,470,284,306,1,b1949_e,803



hu_oo: Housing units: Owner occupied (home values. Difference between occupied and owner occupied is renter occupied)


Unnamed: 0,year,county,hu_oo,hu_oo_freq_val_ord,hu_oo_freq_val,hu_oo_lt_50,hu_oo_50_150,hu_oo_150_250,hu_oo_250_400,hu_oo_400_750,hu_oo_750_plus
0,2012.0,ADAMS,100108,3,v150k_250k,8578,19838,47583,17779,5427,903
1,2012.0,ALAMOSA,3702,2,v50k_150k,435,1599,1077,397,177,17
2,2012.0,ARAPAHOE,143158,3,v150k_250k,4207,22174,55935,38213,16339,6290
3,2012.0,ARCHULETA,3532,4,v250k_400k,152,513,781,1153,612,321
4,2012.0,BACA,1236,2,v50k_150k,399,601,144,49,23,20





## Calculations
---

- **age, and gend**
  - `age_median`: (Existing)
  - `age_undr19_prop`: What percent of the population is under 19?
  - `gend_m_prop`: What percent of the population is male?
  - `age_undr19_gend_m_prop`: What percent of under-19 year old are male? (divide m_undr19 by undr19)
- **inc**
  - `inc_hh_median`: (Existing) Median household income
  - `inc_per_cap`: (Existing) Per capita income
- **hh**
  - `hh_size_avg`: (Existing) Average household size
- **race**
  - `race_{x}_prop`: What percent of the population is race x?
  - `race_prop_stdev`: What is the standard deviation of the race proportions? We need to calculate the proportions first, to normalize for the population size, that way, we can compare the standard deviations across groups
- **hsgrad**
  - `hsgrad_graduated_prop`: What percent of adults (age 25+) have a high school diploma or equivalent?
- **civ_lf**
  - `civ_lf_prop`: What percent of the population is in the civilian labor force?
  - `civ_lf_employed_prop`: What percent of the civilian labor force is employed?
- **ps**
  - `ps_total_prop`: What percent of people whose poverty status is known are below the poverty line?
  - `ps_undr18_total_prop`: What percent of under-18 people whose poverty status is known are below the poverty line?
  - `ps_undr18_prop`: What percent of people below the poverty line are under 18?
- **stud**
  - `stud_enrolled_prop`: Percent of people who could be enrolled in school that actually are enrolled
  - `stud_hs_prop`: What percent of gradeschool students (1-12) are high schoolers? (lower number indicates dropouts, which may associate with crime)
  - `stud_undergrad_prop`: What percent of enrolled students are undergraduates?
- **citz**
  - `citz_per_cap`: What percent of the population is a US citizen?
  - `citz_birth_prop`: What percent of US citizens were born in the US?
  - `citz_co_prop`: What percent of citizens were born in Colorado?
- **hu**
  - `hu_occ_prop`: Percent of homes which are occupied
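  - `hu_blt_2000_plus_prop`: Percent of homes which were built in 2000 or later (originally planned as `hu_blt_after1989`, homes built in the past 20 years; the code below computes the 2000-or-later share, plus one `hu_blt_{range}_prop` column per year-built range)
  - `hu_blt_freq_yr_ord`: Ordinal version of `hu_blt_freq_yr` (the most common year-built range), where the highest number corresponds to the highest year range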
- **hu_oo**
  - `hu_oo_prop`: Percent of occupied properties occupied by owner. The remaining percent is renter occupied
  - `hu_oo_lt_50_prop`: Percent of owner occupied properties worth less than $50,000
  - `hu_oo_750_plus_prop`: Percent of owner occupied properties worth $750,000 or more


In [69]:
# Work on a copy so that `main` keeps only the raw counts
df = main.copy()

# Age and gender as shares of the total population.
# ('pop' needs bracket access: df.pop is the DataFrame.pop method)
df['age_over18_prop'] = df.age_over18 / df['pop']
df['age_undr19_prop'] = df.age_undr19 / df['pop']
df['gend_m_prop'] = df.gend_m / df['pop']
df['gend_f_prop'] = df.gend_f / df['pop']
# Gender split WITHIN each age group (denominator = age group size)
df['age_undr19_gend_m_prop'] = df.gend_m_age_undr19 / df.age_undr19
df['age_undr19_gend_f_prop'] = df.gend_f_age_undr19 / df.age_undr19
df['age_over18_gend_m_prop'] = df.gend_m_age_over18 / df.age_over18
df['age_over18_gend_f_prop'] = df.gend_f_age_over18 / df.age_over18

# Age split WITHIN each gender (denominator = gender size)
df['gend_m_age_undr19_prop'] = df.gend_m_age_undr19 / df.gend_m
df['gend_m_age_over18_prop'] = df.gend_m_age_over18 / df.gend_m
df['gend_f_age_undr19_prop'] = df.gend_f_age_undr19 / df.gend_f
df['gend_f_age_over18_prop'] = df.gend_f_age_over18 / df.gend_f

# Race: one '<race column>_prop' share of the total population per race column
# NOTE(review): the division aligns on the row index, so this assumes
# GroupedDF(df).race keeps df's index — TODO confirm
race_base = GroupedDF(df).race
race = df.copy()[['year', 'county']]
for c in [c for c in race_base.columns if c not in ['year', 'county']]:
    race[f'{c}_prop'] = race_base[c] / df['pop']

# Row-wise standard deviation across the race proportions just added
# (after dropping the keys, only the '_prop' columns are left in `race`)
race['race_prop_stdev'] = np.std(race.drop(columns=['year', 'county']), axis=1)
# Attach the race columns to df via an inner join on (year, county)
df = merge(df, race)

# Education: graduates as a share of the 25+ pool
df['hsgrad_graduated_prop'] = df.hsgrad_graduated / df.hsgrad_pool

# Labor force: share of the population, and employed share of the labor force
df['civ_lf_prop'] = df.civ_lf / df['pop']
df['civ_lf_employed_prop'] = df.civ_lf_employed / df.civ_lf

# Poverty status: overall rate, under-18 rate, and under-18 share of everyone below the line
df['ps_total_prop'] = df.ps_below / df.ps_known
df['ps_undr18_total_prop'] = df.ps_undr18_below / df.ps_undr18_known
df['ps_undr18_prop'] = df.ps_undr18_below / df.ps_below

# Students: enrollment rate, grade 9-12 share of grades 1-12, undergrad share of enrolled
df['stud_enrolled_prop'] = df.stud_enrolled / df.stud_enroll_pool
df['stud_hs_prop'] = df.stud_9_12 / (df.stud_1_4 + df.stud_5_8 + df.stud_9_12)
df['stud_undergrad_prop'] = df.stud_undergrad / df.stud_enrolled

# Citizenship: citizens per capita, then shares of citizens
df['citz_per_cap'] = df.citz / df['pop']
df['citz_birth_prop'] = df.citz_birth / df.citz
df['citz_co_prop'] = df.citz_co / df.citz

# Housing units: per capita, and occupied share
df['hu_per_cap'] = df.hu / df['pop']
df['hu_occ_prop'] = df.hu_occ / df.hu
# This column is assigned again (same formula) by the year-built loop below;
# creating it here fixes its position ahead of the hu_oo_* proportions
df['hu_blt_2000_plus_prop'] = df.hu_blt_2000_plus / df.hu

# Owner-occupied share of occupied units
df['hu_oo_prop'] = df.hu_oo / df.hu_occ

# Each home-value bin as a share of owner-occupied units
for hval in ['hu_oo_lt_50', 'hu_oo_50_150', 'hu_oo_150_250', 'hu_oo_250_400', 'hu_oo_400_750', 'hu_oo_750_plus']:
    df[f'{hval}_prop'] = df[hval] / df.hu_oo

# Each year-built bin as a share of all housing units
for hyear in [
        'hu_blt_lt_1950', 'hu_blt_1950_1959', 'hu_blt_1960_1969',
        'hu_blt_1970_1979', 'hu_blt_1980_1989', 'hu_blt_1990_1999', 'hu_blt_2000_plus'
    ]:
    df[f'{hyear}_prop'] = df[hyear] / df.hu

# Keep year, county and pop plus every column whose name contains one of the
# listed substrings, i.e. drop the raw count columns
prop = separate_by(df, ['prop', 'per_cap', 'median', 'avg', 'freq'], keep=['pop'], mode='include')
# Save the normalized table
prop.to_csv('output/county_stats_normalized.csv')
# Grouped view of the normalized table, again with an explicit column list for the 'hu' group
gprop = GroupedDF(prop, custom={'hu': ['year', 'county', 'hu_per_cap', 'hu_occ_prop']})
gprop.display()

gend: Gender


Unnamed: 0,year,county,gend_m_prop,gend_f_prop,gend_m_age_undr19_prop,gend_m_age_over18_prop,gend_f_age_undr19_prop,gend_f_age_over18_prop
0,2012.0,ADAMS,0.502904,0.497107,0.29996,0.70004,0.289421,0.710579
1,2012.0,ALAMOSA,0.501282,0.499295,0.28135,0.71865,0.26232,0.73768
2,2012.0,ARAPAHOE,0.490666,0.509325,0.274625,0.725375,0.251649,0.748351



age: Age


Unnamed: 0,year,county,age_over18_prop,age_undr19_prop,age_undr19_gend_m_prop,age_undr19_gend_f_prop,age_over18_gend_m_prop,age_over18_gend_f_prop,age_median
0,2012.0,ADAMS,0.705276,0.294724,0.511838,0.488162,0.49917,0.500845,32.4
1,2012.0,ALAMOSA,0.728374,0.271626,0.519226,0.482189,0.49459,0.505674,32.2
2,2012.0,ARAPAHOE,0.737078,0.262922,0.512506,0.487487,0.482876,0.517115,35.7



race: Race


Unnamed: 0,year,county,race_hispanic_prop,race_white_prop,race_black_prop,race_asian_prop,race_other_prop,race_prop_stdev
0,2012.0,ADAMS,0.363882,0.512502,0.028167,0.033236,0.024269,0.206129
1,2012.0,ALAMOSA,0.4604,0.497693,0.007049,0.003781,0.040305,0.227001
2,2012.0,ARAPAHOE,0.1764,0.611792,0.093302,0.047074,0.034754,0.215377



inc: Income


Unnamed: 0,year,county,inc_per_cap,inc_hh_median
0,2012.0,ADAMS,24357,56633
1,2012.0,ALAMOSA,19657,38045
2,2012.0,ARAPAHOE,32845,60400



hh: Households


Unnamed: 0,year,county,hh_size_avg
0,2012.0,ADAMS,2.91
1,2012.0,ALAMOSA,2.49
2,2012.0,ARAPAHOE,2.55



citz: Citizenship


Unnamed: 0,year,county,citz_birth_prop,citz_co_prop,citz_per_cap
0,2012.0,ADAMS,0.950229,0.565176,0.860368
1,2012.0,ALAMOSA,0.983203,0.631001,0.968986
2,2012.0,ARAPAHOE,0.937754,0.429728,0.872052



hsgrad: Population over 25 (highschool graduate counts)


Unnamed: 0,year,county,hsgrad_graduated_prop
0,2012.0,ADAMS,0.604913
1,2012.0,ALAMOSA,0.630942
2,2012.0,ARAPAHOE,0.525874



civ_lf: Civilian laborforce (employment)


Unnamed: 0,year,county,civ_lf_prop,civ_lf_employed_prop
0,2012.0,ADAMS,0.512761,0.905485
1,2012.0,ALAMOSA,0.459503,0.899317
2,2012.0,ARAPAHOE,0.533424,0.9184



ps: Poverty status


Unnamed: 0,year,county,ps_total_prop,ps_undr18_total_prop,ps_undr18_prop
0,2012.0,ADAMS,0.141516,0.20324,0.407657
1,2012.0,ALAMOSA,0.218233,0.198585,0.237543
2,2012.0,ARAPAHOE,0.117654,0.159459,0.344372



stud: Students (counts by program type)


Unnamed: 0,year,county,stud_enrolled_prop,stud_hs_prop,stud_undergrad_prop
0,2012.0,ADAMS,0.279257,0.305236,0.164248
1,2012.0,ALAMOSA,0.359793,0.366708,0.426147
2,2012.0,ARAPAHOE,0.279887,0.340967,0.191012



hu: Housing units: Total and occupied.


Unnamed: 0,year,county,hu_per_cap,hu_occ_prop
0,2012.0,ADAMS,0.35452,0.925198
1,2012.0,ALAMOSA,0.42112,0.890596
2,2012.0,ARAPAHOE,0.399446,0.939482



hu_blt: Housing units: Year built


Unnamed: 0,year,county,hu_blt_2000_plus_prop,hu_blt_lt_1950_prop,hu_blt_1950_1959_prop,hu_blt_1960_1969_prop,hu_blt_1970_1979_prop,hu_blt_1980_1989_prop,hu_blt_1990_1999_prop,hu_blt_freq_yr_ord,hu_blt_freq_yr
0,2012.0,ADAMS,0.236957,0.037722,0.124776,0.120157,0.184906,0.12477,0.169059,7,b2000_2009
1,2012.0,ALAMOSA,0.098904,0.233719,0.089927,0.099513,0.213786,0.131163,0.131771,1,b1949_e
2,2012.0,ARAPAHOE,0.165498,0.030085,0.069361,0.093458,0.261392,0.235182,0.142715,4,b1970_1979



hu_oo: Housing units: Owner occupied (home values. Difference between occupied and owner occupied is renter occupied)


Unnamed: 0,year,county,hu_oo_prop,hu_oo_lt_50_prop,hu_oo_50_150_prop,hu_oo_150_250_prop,hu_oo_250_400_prop,hu_oo_400_750_prop,hu_oo_750_plus_prop,hu_oo_freq_val_ord,hu_oo_freq_val
0,2012.0,ADAMS,0.662818,0.085687,0.198166,0.475317,0.177598,0.054211,0.00902,3,v150k_250k
1,2012.0,ALAMOSA,0.632496,0.117504,0.431929,0.290924,0.107239,0.047812,0.004592,2,v50k_150k
2,2012.0,ARAPAHOE,0.639821,0.029387,0.154892,0.390722,0.266929,0.114133,0.043937,3,v150k_250k





In [70]:
# List the column names held by gprop.df, one per line (cols() skips 'county' and 'year')
cols(gprop.df)

pop
age_over18_prop
age_undr19_prop
gend_m_prop
gend_f_prop
age_undr19_gend_m_prop
age_undr19_gend_f_prop
age_over18_gend_m_prop
age_over18_gend_f_prop
gend_m_age_undr19_prop
gend_m_age_over18_prop
gend_f_age_undr19_prop
gend_f_age_over18_prop
race_hispanic_prop
race_white_prop
race_black_prop
race_asian_prop
race_other_prop
race_prop_stdev
hsgrad_graduated_prop
civ_lf_prop
civ_lf_employed_prop
ps_total_prop
ps_undr18_total_prop
ps_undr18_prop
stud_enrolled_prop
stud_hs_prop
stud_undergrad_prop
citz_birth_prop
citz_co_prop
hu_occ_prop
hu_blt_2000_plus_prop
hu_oo_prop
hu_oo_lt_50_prop
hu_oo_50_150_prop
hu_oo_150_250_prop
hu_oo_250_400_prop
hu_oo_400_750_prop
hu_oo_750_plus_prop
hu_blt_lt_1950_prop
hu_blt_1950_1959_prop
hu_blt_1960_1969_prop
hu_blt_1970_1979_prop
hu_blt_1980_1989_prop
hu_blt_1990_1999_prop
inc_per_cap
citz_per_cap
hu_per_cap
age_median
inc_hh_median
hh_size_avg
hu_oo_freq_val_ord
hu_oo_freq_val
hu_blt_freq_yr_ord
hu_blt_freq_yr
