In [12]:
import pandas as pd
import os
INDEX = ['year', 'county', 'geo']
PATH = 'data_groups'
if not os.path.exists(PATH):
    os.makedirs(PATH)

In [13]:
def head(df, n=3, name=None):
    if name:
        print(name.upper())
    print("COLUMNS: ", df.shape[1], '\n', 'ROWS:    ', df.shape[0], sep="")
    display(df.head(n))


def extract_group(df, prefix, keys=INDEX, keep=[], keep_first_word=False) -> pd.DataFrame:
    """ Returns a df containing index plus columns that start with given prefix """

    if type(prefix) != list:
        prefix = [prefix]
    names = [item for sublist in [
                [c for c in df.columns if c.startswith(txt)] for txt in prefix
            ] for item in sublist ]

    result = df.copy()[keys + keep + names]

    for p in prefix:
        remove = f'{p}_'
        if keep_first_word:
            remove = f'{"_".join(p.split("_")[1:])}'
        for c in result.columns:
            if c.startswith(p):
                result = result.rename(columns={c: c.replace(remove, '')})
    return result

---
---
---
### **Everything**

In [14]:
df_all = pd.read_csv('output/everything_grouped.csv')
df_all.to_csv(f'{PATH}/ALL.csv', index=False)
head(df_all)

COLUMNS: 378
ROWS:    428


Unnamed: 0,year,county,geo,CRIME_BASE_COUNT_cr_count,CRIME_BASE_COUNT_against_person,CRIME_BASE_COUNT_against_property,CRIME_BASE_COUNT_against_society,CRIME_BASE_COUNT_not_a_crime,CRIME_BASE_NORM_cr_rate,CRIME_BASE_NORM_age_avg,...,EDU_NORM_white_mobile_instances_rate,EDU_NORM_black_stable_rate,EDU_NORM_black_mobile_rate,EDU_NORM_black_mobile_instances_rate,EDU_NORM_hispanic_stable_rate,EDU_NORM_hispanic_mobile_rate,EDU_NORM_hispanic_mobile_instances_rate,EDU_NORM_asian_stable_rate,EDU_NORM_asian_mobile_rate,EDU_NORM_asian_mobile_instances_rate
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,35891,5041.0,23469.0,7381.0,0.0,0.077945,27.892097,...,32.45,54.66,45.08,47.91,67.49,32.71,36.4,78.55,21.37,23.54
1,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,34494,4975.0,24832.0,4686.0,1.0,0.071877,28.951378,...,32.45,54.66,45.08,47.91,67.49,32.71,36.4,78.55,21.37,23.54
2,2015,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,34716,4867.0,24902.0,4947.0,0.0,0.070798,29.617563,...,32.45,54.66,45.08,47.91,67.49,32.71,36.4,78.55,21.37,23.54


---
---
---
### **Everything: Counts**

In [15]:
df = extract_group(df_all, ['CRIME_BASE_COUNT', 'CRIME_CATEGORY_COUNT', 'CENSUS_COUNT', 'EDU_COUNT'], keep_first_word=True)
df.to_csv(f'{PATH}/ALL_counts.csv', index=False)
head(df)

COLUMNS: 154
ROWS:    428


Unnamed: 0,year,county,geo,CRIME__cr_count,CRIME__against_person,CRIME__against_property,CRIME__against_society,CRIME__not_a_crime,CRIME__arson,CRIME__assault,...,EDU__migrant_completed,EDU__title_1_grad_base_total,EDU__title_1_graduated,EDU__title_1_completed,EDU__homeless_grad_base_total,EDU__homeless_graduated,EDU__homeless_completed,EDU__gifted_grad_base_total,EDU__gifted_graduated,EDU__gifted_completed
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,35891,5041.0,23469.0,7381.0,0.0,105.0,4217.0,...,33.0,935.0,529.0,559.0,360.0,190.0,204.0,402.0,337.0,345.0
1,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,34494,4975.0,24832.0,4686.0,1.0,123.0,4166.0,...,33.0,935.0,529.0,559.0,360.0,190.0,204.0,402.0,337.0,345.0
2,2015,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,34716,4867.0,24902.0,4947.0,0.0,82.0,3998.0,...,33.0,935.0,529.0,559.0,360.0,190.0,204.0,402.0,337.0,345.0


---
---
---
### **Everything: Normalized**

In [16]:
df = extract_group(df_all, ['CRIME_BASE_NORM', 'CRIME_CATEGORY_NORM', 'CENSUS_NORM', 'EDU_NORM'], keep_first_word=True)
df.to_csv(f'{PATH}/ALL_normalized.csv', index=False)
head(df)

COLUMNS: 143
ROWS:    428


Unnamed: 0,year,county,geo,CRIME__cr_rate,CRIME__age_avg,CRIME__quarter_mode,CRIME__month_mode,CRIME__day_of_week_mode,CRIME__hour_mode,CRIME__against_person_rate,...,EDU__white_mobile_instances_rate,EDU__black_stable_rate,EDU__black_mobile_rate,EDU__black_mobile_instances_rate,EDU__hispanic_stable_rate,EDU__hispanic_mobile_rate,EDU__hispanic_mobile_instances_rate,EDU__asian_stable_rate,EDU__asian_mobile_rate,EDU__asian_mobile_instances_rate
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.077945,27.892097,2.0,7.0,4.0,17.0,0.010948,...,32.45,54.66,45.08,47.91,67.49,32.71,36.4,78.55,21.37,23.54
1,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.071877,28.951378,3.0,7.0,2.0,18.0,0.010367,...,32.45,54.66,45.08,47.91,67.49,32.71,36.4,78.55,21.37,23.54
2,2015,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.070798,29.617563,2.0,5.0,4.0,17.0,0.009925,...,32.45,54.66,45.08,47.91,67.49,32.71,36.4,78.55,21.37,23.54


---
---
---

### **Crime: Counts**
- Crime counts
- Useful for visualizations, but NOT for models

In [17]:
df = extract_group(df_all, ['CRIME_BASE_COUNT', 'CRIME_CATEGORY_COUNT'])
df.to_csv(f'{PATH}/crime_counts_category.csv', index=False)
head(df)

COLUMNS: 28
ROWS:    428


Unnamed: 0,year,county,geo,cr_count,against_person,against_property,against_society,not_a_crime,arson,assault,...,kidnapping,larceny_theft,porn,property_damage,prostitution,robbery,sex_offense,stolen_property,vehicle_theft,weapon_law
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,35891,5041.0,23469.0,7381.0,0.0,105.0,4217.0,...,199.0,10549.0,3.0,5795.0,38.0,306.0,600.0,349.0,1539.0,607.0
1,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,34494,4975.0,24832.0,4686.0,1.0,123.0,4166.0,...,222.0,11786.0,12.0,6014.0,20.0,365.0,565.0,194.0,1773.0,563.0
2,2015,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,34716,4867.0,24902.0,4947.0,0.0,82.0,3998.0,...,272.0,11222.0,13.0,5152.0,33.0,423.0,577.0,251.0,2812.0,663.0


---
---
---

### **Crime: Normalized**
- Aggregate crime stats for counties, normalized
- Useful for models
- Useful for SOME visualizations, showing proportions or comparisons across groups

In [18]:
df = extract_group(df_all, ['CRIME_BASE_RATE', 'CRIME_CATEGORY_RATE'])
df.to_csv(f'{PATH}/crime_normalized_category.csv', index=False)
head(df)

COLUMNS: 3
ROWS:    428


Unnamed: 0,year,county,geo
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...
1,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...
2,2015,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...


---
---
---

### **Census: Counts**
- Population counts for different demographics
- Useful for visualizations, but NOT for models

In [19]:
df = extract_group(df_all, 'CENSUS_COUNT')
df.to_csv(f'{PATH}/demographics_counts.csv', index=False)
head(df)

COLUMNS: 52
ROWS:    428


Unnamed: 0,year,county,geo,pop,gend_m,gend_f,age_over18,age_undr19,gend_m_age_undr19,gend_f_age_undr19,...,hu_blt_1990_1999,hu_blt_1980_1989,hu_blt_1970_1979,hu_blt_1960_1969,hu_blt_1950_1959,hu_blt_lt_1950,ps_known,ps_below,ps_undr18_known,ps_undr18_below
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,460468.0,231571.0,228902.0,324757.0,135711.0,69462.0,66249.0,...,27598,20368,30185,19615,20369,6158,438171,62008,124375,25278
1,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,479904.0,241181.0,238716.0,341149.0,138755.0,71006.0,67749.0,...,26953,20664,29550,19704,19775,6818,456829,64599,128235,25000
2,2015,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,490355.0,246361.0,243987.0,349808.0,140547.0,71903.0,68640.0,...,27228,21165,29020,19119,19447,6690,466690,64241,130178,24906


---
---
---
### **Census: Normalized**
- Aggregate stats for counties, correcting for population
- Useful for models
- Useful for SOME visualizations, showing proportions or comparisons across groups

In [20]:
df = extract_group(df_all, 'CENSUS_NORM')
df.to_csv(f'{PATH}/demographics_normalized.csv', index=False)
head(df)

COLUMNS: 58
ROWS:    428


Unnamed: 0,year,county,geo,age_over18_prop,age_undr19_prop,gend_m_prop,gend_f_prop,age_undr19_gend_m_prop,age_undr19_gend_f_prop,age_over18_gend_m_prop,...,citz_per_cap,hu_per_cap,age_median,inc_hh_median,hh_size_avg,hu_oo_freq_val_ord,hu_oo_freq_val,hu_blt_freq_yr_ord,hu_blt_freq_yr,med_yr_blt
0,2012,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.705276,0.294724,0.502904,0.497107,0.511838,0.488162,0.49917,...,0.860368,0.35452,32.4,56633,2.91,3,v150k_250k,7,b2000_2009,1983
1,2014,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.710869,0.289131,0.502561,0.497424,0.511737,0.488263,0.498829,...,0.860066,0.342535,32.8,57421,2.95,3,v150k_250k,7,b2000_2009,1983
2,2015,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,0.713377,0.286623,0.502414,0.497572,0.511594,0.488378,0.498725,...,0.860401,0.336585,33.0,58946,2.98,3,v150k_250k,7,b2000_2009,1984


---
---
---
### **Education: Counts** (by county only)
- Population counts for different demographics
- Useful for visualizations, but NOT for models

In [21]:
df = extract_group(df_all, 'EDU_COUNT')
df = df.groupby(['county', 'geo']).agg('first').reset_index()
df.to_csv(f'{PATH}/education_counts.csv', index=False)
head(df)

COLUMNS: 80
ROWS:    63


Unnamed: 0,county,geo,year,pupil_total,stable,mobile,mobile_instances,disabled_pupil_total,disabled_stable,disabled_mobile,...,migrant_completed,title_1_grad_base_total,title_1_graduated,title_1_completed,homeless_grad_base_total,homeless_graduated,homeless_completed,gifted_grad_base_total,gifted_graduated,gifted_completed
0,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,2012,98546.0,67272.0,31222.0,33925.0,8848.0,6263.0,2588.0,...,33.0,935.0,529.0,559.0,360.0,190.0,204.0,402.0,337.0,345.0
1,ALAMOSA,MULTIPOLYGON (((-105.59917426201822 37.7521648...,2012,2775.0,1882.0,885.0,950.0,223.0,159.0,63.0,...,4.0,28.0,22.0,23.0,6.0,6.0,6.0,0.0,0.0,0.0
2,ARAPAHOE,MULTIPOLYGON (((-103.70653410023402 39.7398580...,2012,124639.0,94109.0,30134.0,32269.0,11842.0,9461.0,2354.0,...,9.0,488.0,202.0,213.0,243.0,96.0,102.0,909.0,820.0,828.0


---
---
---
### **Education: Normalized** (by county only)
- Aggregate stats for counties, correcting for population
- Useful for models
- Useful for SOME visualizations, showing proportions or comparisons across groups

In [22]:
df = extract_group(df_all, 'EDU_NORM')
df = df.groupby(['county', 'geo']).agg('first').reset_index().drop(columns='year')
df.to_csv(f'{PATH}/education_normalized.csv', index=False)
head(df)

COLUMNS: 58
ROWS:    63


Unnamed: 0,county,geo,stable_rate,mobile_rate,mobile_instances_rate,disabled_stable_rate,disabled_mobile_rate,disabled_mobile_instances_rate,disabled_graduated_rate,disabled_completed_rate,...,white_mobile_instances_rate,black_stable_rate,black_mobile_rate,black_mobile_instances_rate,hispanic_stable_rate,hispanic_mobile_rate,hispanic_mobile_instances_rate,asian_stable_rate,asian_mobile_rate,asian_mobile_instances_rate
0,ADAMS,MULTIPOLYGON (((-103.70574149517748 39.9999110...,68.26,31.68,34.43,70.78,29.25,32.73,47.54,50.1,...,32.45,54.66,45.08,47.91,67.49,32.71,36.4,78.55,21.37,23.54
1,ALAMOSA,MULTIPOLYGON (((-105.59917426201822 37.7521648...,67.82,31.89,34.23,71.3,28.25,29.6,86.67,93.33,...,35.54,57.14,42.86,42.86,70.15,29.47,32.69,64.0,36.0,36.0
2,ARAPAHOE,MULTIPOLYGON (((-103.70653410023402 39.7398580...,75.51,24.18,25.89,79.89,19.88,21.69,51.26,52.06,...,21.16,67.47,32.03,34.57,72.76,26.75,29.2,78.59,21.3,22.56
