In [73]:
import pandas as pd
import os

ROOT_DIR = os.path.abspath("")
DATA_DIR = os.path.join(ROOT_DIR, "data")

zip_tract_mapping_df = pd.read_excel(
    os.path.join(DATA_DIR, "ZIP_TRACT_122020 - Denver Only.xlsx"),
    sheet_name= "Denver ZIP and Full Code Tracts"
)

unique_tracts = zip_tract_mapping_df["TRACT"].unique()
unique_zips = zip_tract_mapping_df["ZIP"].unique()

income_df = pd.read_csv(os.path.join(DATA_DIR, "Income - All Counties.csv")).transpose()
home_own_rent_df = pd.read_csv(os.path.join(DATA_DIR, "Own v Rent - All Counties.csv")).transpose()
race_df = pd.read_csv(os.path.join(DATA_DIR, "Race - All Counties.csv")).transpose()

CENSUS_TRACT_COLNAME = 'census_tract'
CENSUS_TRACT_NAME_COLNAME = 'Label (Grouping)'

In [74]:
COUNTY_CENSUS_MAPPING = {
    'Adams County' : '001',
    'Arapahoe County' : '005',
    'Denver County' : '031',
    'Jefferson County' : '059'
}

def find_census_tract_number(tract_label):
    
    census_name_elems = [substr.strip() for substr in tract_label.split(",")]
    try:
        county = census_name_elems[1]
        county_tract = COUNTY_CENSUS_MAPPING[county]
        census_tract_name = census_name_elems[0]
        census_tract_number = census_tract_name.lstrip("Census Tract ").replace(".", "")
        census_tract_number = census_tract_number.ljust(4, "0").zfill(6)
    
        full_tract = f"08{county_tract}{census_tract_number}"
    except IndexError:
        return None
    
    return full_tract

In [75]:
def assign_columns(df):
    
    df.columns = df.iloc[0]
    df.drop(df.index[0], inplace=True)

    return df

def remove_unicode_from_column_names(colname):

    return str(colname).replace(u'\xa0', u'')

home_own_rent_df = assign_columns(home_own_rent_df)
race_df = assign_columns(race_df)
income_df = assign_columns(income_df)

home_own_rent_df.columns = [remove_unicode_from_column_names(colname) for colname in home_own_rent_df.columns]
race_df.columns = [remove_unicode_from_column_names(colname) for colname in race_df.columns]
income_df.columns = [remove_unicode_from_column_names(colname) for colname in income_df.columns]


In [76]:
home_own_rent_df[CENSUS_TRACT_COLNAME] = home_own_rent_df.apply(lambda x: find_census_tract_number(x.name), axis = 1)
race_df[CENSUS_TRACT_COLNAME] = race_df.apply(lambda x: find_census_tract_number(x.name), axis = 1)
income_df[CENSUS_TRACT_COLNAME] = income_df.apply(lambda x: find_census_tract_number(x.name), axis = 1)

In [77]:
def transform_counts_into_percentages(df, total_column, list_of_columns_to_transform):

    rows_w_percentages = []

    for idx, row in df.iterrows():

        denominator = int(str(row[total_column]).replace(",", ""))
        
        try:
            for col in list_of_columns_to_transform:
                reformatted_col_name = col.lower().replace(" ","_").replace("$", "").replace(",", "").replace(":", "").replace(",000", "k")
                row[f"{reformatted_col_name}_percentage_total"] = int(str(row[col]).replace(",", "")) / denominator

            rows_w_percentages.append(row)

        except ZeroDivisionError:
            continue

    return pd.DataFrame(rows_w_percentages)

In [78]:
income_df_w_percentages = transform_counts_into_percentages(
    income_df,
    'Total:',
    [col for col in income_df.columns if col not in ['Total:', CENSUS_TRACT_COLNAME]]
)

race_df_w_percentages = transform_counts_into_percentages(
    race_df,
    'Population of one race:',
    [col for col in race_df.columns if 'alone' in col]
)

home_own_rent_df_w_percentages = transform_counts_into_percentages(
    home_own_rent_df[['Total:', CENSUS_TRACT_COLNAME, 'Owner occupied:', 'Renter occupied:']],
    'Total:',
    ['Owner occupied:', 'Renter occupied:']
)

In [100]:
home_own_rent_columns = ['Total:', CENSUS_TRACT_COLNAME] + [col for col in home_own_rent_df_w_percentages.columns if 'percentage_total' in col]
home_own_rent_columns

['Total:',
 'census_tract',
 'owner_occupied_percentage_total',
 'renter_occupied_percentage_total']

In [119]:
income_df_w_percentages.rename({'Total:':'income_total_count'}, axis=1, inplace=True)
race_df_w_percentages.rename({'Population of one race:':'race_total_count'}, axis=1, inplace=True)
home_own_rent_df_w_percentages.rename({'Total:':'home_own_rent_total'}, axis=1, inplace=True)

home_own_rent_df = home_own_rent_df_w_percentages[['home_own_rent_total', CENSUS_TRACT_COLNAME] 
                               + [col for col in home_own_rent_df_w_percentages.columns if 'percentage_total' in col]] #.to_csv(os.path.join(DATA_DIR, "interim", "home_own_rent.csv"))
income_df = income_df_w_percentages[['income_total_count', CENSUS_TRACT_COLNAME] + 
                        [col for col in income_df_w_percentages.columns if 'percentage_total' in col]] #.to_csv(os.path.join(DATA_DIR, "interim", "income.csv"))
race_df = race_df_w_percentages[['race_total_count', CENSUS_TRACT_COLNAME] + 
                      [col for col in race_df_w_percentages.columns if 'percentage_total' in col]] #.to_csv(os.path.join(DATA_DIR, "interim", "race.csv"))

In [121]:
# income_df = pd.read_csv(os.path.join(DATA_DIR, "interim", "income.csv"))
# race_df = pd.read_csv(os.path.join(DATA_DIR, "interim", "race.csv"))
# home_own_rent_df = pd.read_csv(os.path.join(DATA_DIR, "interim", "home_own_rent.csv"))

combined_df = pd.merge(income_df, race_df, how='outer', on=CENSUS_TRACT_COLNAME)
combined_df = pd.merge(combined_df, home_own_rent_df, how='outer', on=CENSUS_TRACT_COLNAME)

In [133]:
MAX_MIN_DICT = {}
for col in [column for column in combined_df.columns if 'percentage_total' in column]:
    MAX_MIN_DICT[col] = {
        'max' : combined_df[col].max(),
        'min' : combined_df[col].min()
    }

rows_w_standardized_values = []
for idx, row in combined_df.iterrows():
    if row[CENSUS_TRACT_COLNAME] is None:
        continue

    for col in [column for column in combined_df.columns if 'percentage_total' in column]:

        denominator = MAX_MIN_DICT[col]['max'] - MAX_MIN_DICT[col]['min']
        standardized_val = (row[col] - MAX_MIN_DICT[col]['min']) / denominator
        standardized_colname = col.replace("percentage_total", "standardized")
        row[standardized_colname] = standardized_val

    
    rows_w_standardized_values.append(row)



In [135]:
standardized_df = pd.DataFrame(rows_w_standardized_values)
standardized_df.to_csv(os.path.join(DATA_DIR, "processed", "standardized.csv"))

In [137]:
for column in [col for col in standardized_df.columns if "standardized" in col]:
    print(column)
    print(standardized_df[column].max())
    print(standardized_df[column].min())

less_than_10000_standardized
1.0
0.0
10000_to_14999_standardized
1.0
0.0
15000_to_19999_standardized
1.0
0.0
20000_to_24999_standardized
1.0
0.0
25000_to_29999_standardized
1.0
0.0
30000_to_34999_standardized
1.0
0.0
35000_to_39999_standardized
1.0
0.0
40000_to_44999_standardized
1.0
0.0
45000_to_49999_standardized
1.0
0.0
50000_to_59999_standardized
1.0
0.0
60000_to_74999_standardized
1.0
0.0
75000_to_99999_standardized
1.0
0.0
100000_to_124999_standardized
1.0
0.0
125000_to_149999_standardized
1.0
0.0
150000_to_199999_standardized
1.0
0.0
200000_or_more_standardized
1.0
0.0
white_alone_standardized
1.0
0.0
black_or_african_american_alone_standardized
1.0
0.0
american_indian_and_alaska_native_alone_standardized
1.0
0.0
asian_alone_standardized
1.0
0.0
native_hawaiian_and_other_pacific_islander_alone_standardized
1.0
0.0
some_other_race_alone_standardized
1.0
0.0
owner_occupied_standardized
1.0
0.0
renter_occupied_standardized
1.0
0.0


In [80]:
rows_w_percentages = []

for idx, row in home_own_rent_df.iterrows():

    denominator = int(str(row[total_column]).replace(",", ""))
    
    try:
        for col in ['Owner occupied:', 'Renter occupied:']:
            reformatted_col_name = col.lower().replace(" ","_").replace("$", "").replace(",", "").replace(":", "").replace(",000", "k")
            row[f"{reformatted_col_name}_percentage_total"] = int(str(row[col]).replace(",", "")) / denominator

        rows_w_percentages.append(row)

    except ZeroDivisionError:
        continue

In [95]:
rows_w_percentages

[Total:                                                                       1,191
 Owner occupied:                                                                135
 Householder who is White alone                                                  87
 Householder who is Black or African American alone                              13
 Householder who is American Indian and Alaska Native alone                       1
 Householder who is Asian alone                                                   3
 Householder who is Native Hawaiian and Other Pacific Islander alone              0
 Householder who is Some Other Race alone                                        20
 Householder who is Two or More Races                                            11
 Renter occupied:                                                             1,056
 Householder who is White alone                                                 400
 Householder who is Black or African American alone                         

In [94]:
pd.Series([row.name for row in rows_w_percentages]).duplicated().value_counts()

False    522
dtype: int64

In [54]:
total_column = "Total:"
standardized_columns_income = [col for col in income_df.columns if col not in [total_column, CENSUS_TRACT_COLNAME]]

rows_w_percentage = []
for idx, row in income_df.iterrows():
    denominator = int(str(row[total_column]).replace(",", ""))

    try:
        for col in standardized_columns_income:
            reformatted_col_name = col.lower().replace(" ","_").replace("$","").replace(",000", "k").replace(",", "")
            row[f"{reformatted_col_name}_percentage_total"] = int(str(row[col]).replace(",", "")) / denominator

        rows_w_percentage.append(row)
    
    except ZeroDivisionError: 
        continue

income_df_w_percentage = pd.DataFrame(rows_w_percentage)


In [55]:
income_df_w_percentage.to_csv(os.path.join(DATA_DIR, "interim", "income_df_w_percentages.csv"))

In [56]:
standardizable_columns = [col for col in income_df.columns if "percentage_total" in col]

In [20]:
total_description = 

Unnamed: 0,Total:,"Less than $10,000","$10,000 to $14,999","$15,000 to $19,999","$20,000 to $24,999","$25,000 to $29,999","$30,000 to $34,999","$35,000 to $39,999","$40,000 to $44,999","$45,000 to $49,999","$50,000 to $59,999","$60,000 to $74,999","$75,000 to $99,999","$100,000 to $124,999","$125,000 to $149,999","$150,000 to $199,999","$200,000 or more",census_tract
Colorado!!Estimate,2137402,96924,65782,67594,73827,72607,82052,79193,80495,75425,156526,214966,290253,229671,158981,186878,206228,
"Census Tract 78.01, Adams County, Colorado!!Estimate",1222,149,87,120,84,103,61,103,103,64,71,111,107,11,9,25,14,08001007801
"Census Tract 78.02, Adams County, Colorado!!Estimate",1377,137,150,128,91,62,34,77,75,56,183,149,134,64,0,21,16,08001007802
"Census Tract 79, Adams County, Colorado!!Estimate",2005,205,34,89,168,139,172,42,127,56,112,274,345,141,33,30,38,08001007900
"Census Tract 80, Adams County, Colorado!!Estimate",1536,63,96,28,111,90,143,13,67,48,285,114,210,135,57,52,24,08001008000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Census Tract 605.01, Jefferson County, Colorado!!Estimate",2934,72,17,14,10,42,92,52,47,70,108,158,242,360,394,632,624,08059060501
"Census Tract 9800, Jefferson County, Colorado!!Estimate",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,08059009800
"Census Tract 9804, Jefferson County, Colorado!!Estimate",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,08059009804
"Census Tract 9807, Jefferson County, Colorado!!Estimate",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,08059009807


In [60]:
income_df

Unnamed: 0,Total:,"Less than $10,000","$10,000 to $14,999","$15,000 to $19,999","$20,000 to $24,999","$25,000 to $29,999","$30,000 to $34,999","$35,000 to $39,999","$40,000 to $44,999","$45,000 to $49,999","$50,000 to $59,999","$60,000 to $74,999","$75,000 to $99,999","$100,000 to $124,999","$125,000 to $149,999","$150,000 to $199,999","$200,000 or more",census_tract
Colorado!!Estimate,2137402,96924,65782,67594,73827,72607,82052,79193,80495,75425,156526,214966,290253,229671,158981,186878,206228,
"Census Tract 78.01, Adams County, Colorado!!Estimate",1222,149,87,120,84,103,61,103,103,64,71,111,107,11,9,25,14,08001007801
"Census Tract 78.02, Adams County, Colorado!!Estimate",1377,137,150,128,91,62,34,77,75,56,183,149,134,64,0,21,16,08001007802
"Census Tract 79, Adams County, Colorado!!Estimate",2005,205,34,89,168,139,172,42,127,56,112,274,345,141,33,30,38,08001007900
"Census Tract 80, Adams County, Colorado!!Estimate",1536,63,96,28,111,90,143,13,67,48,285,114,210,135,57,52,24,08001008000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Census Tract 605.01, Jefferson County, Colorado!!Estimate",2934,72,17,14,10,42,92,52,47,70,108,158,242,360,394,632,624,08059060501
"Census Tract 9800, Jefferson County, Colorado!!Estimate",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,08059009800
"Census Tract 9804, Jefferson County, Colorado!!Estimate",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,08059009804
"Census Tract 9807, Jefferson County, Colorado!!Estimate",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,08059009807


In [42]:
income_df.head()

Label (Grouping),Total:,"Less than $10,000","$10,000 to $14,999","$15,000 to $19,999","$20,000 to $24,999","$25,000 to $29,999","$30,000 to $34,999","$35,000 to $39,999","$40,000 to $44,999","$45,000 to $49,999","$50,000 to $59,999","$60,000 to $74,999","$75,000 to $99,999","$100,000 to $124,999","$125,000 to $149,999","$150,000 to $199,999","$200,000 or more",census_tract
Colorado!!Estimate,2137402,96924,65782,67594,73827,72607,82052,79193,80495,75425,156526,214966,290253,229671,158981,186878,206228,
"Census Tract 78.01, Adams County, Colorado!!Estimate",1222,149,87,120,84,103,61,103,103,64,71,111,107,11,9,25,14,8001007801.0
"Census Tract 78.02, Adams County, Colorado!!Estimate",1377,137,150,128,91,62,34,77,75,56,183,149,134,64,0,21,16,8001007802.0
"Census Tract 79, Adams County, Colorado!!Estimate",2005,205,34,89,168,139,172,42,127,56,112,274,345,141,33,30,38,8001007900.0
"Census Tract 80, Adams County, Colorado!!Estimate",1536,63,96,28,111,90,143,13,67,48,285,114,210,135,57,52,24,8001008000.0


In [62]:
RACE_DF_COLS_TO_KEEP = [
    "Population of one race:",
    "White alone",
    "Black or African American alone",
    "American Indian and Alaska Native alone",
    "Asian alone",
    "Native Hawaiian and Other Pacific Islander alone",
    "Some Other Race alone"
]
race_df[RACE_DF_COLS_TO_KEEP]

Unnamed: 0,Population of one race:,White alone,Black or African American alone,American Indian and Alaska Native alone,Asian alone,Native Hawaiian and Other Pacific Islander alone,Some Other Race alone
"Census Tract 78.01, Adams County, Colorado",3464,899,668,77,901,1,918
"Census Tract 78.02, Adams County, Colorado",3165,748,738,145,316,31,1187
"Census Tract 79, Adams County, Colorado",4416,1687,849,148,135,25,1572
"Census Tract 80, Adams County, Colorado",5449,2470,897,164,286,23,1609
"Census Tract 81, Adams County, Colorado",1278,858,149,18,203,11,39
...,...,...,...,...,...,...,...
"Census Tract 605.01, Jefferson County, Colorado",9337,8554,71,57,478,4,173
"Census Tract 9800, Jefferson County, Colorado",0,0,0,0,0,0,0
"Census Tract 9804, Jefferson County, Colorado",1103,881,103,101,16,0,2
"Census Tract 9807, Jefferson County, Colorado",0,0,0,0,0,0,0


In [48]:
race_df.columns[2]

'\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0White alone'

'Population of one race:'

In [41]:
home_own_rent_df

Label (Grouping),Total:,Owner occupied:,Householder who is White alone,Householder who is Black or African American alone,Householder who is American Indian and Alaska Native alone,Householder who is Asian alone,Householder who is Native Hawaiian and Other Pacific Islander alone,Householder who is Some Other Race alone,Householder who is Two or More Races,Renter occupied:,Householder who is White alone.1,Householder who is Black or African American alone.1,Householder who is American Indian and Alaska Native alone.1,Householder who is Asian alone.1,Householder who is Native Hawaiian and Other Pacific Islander alone.1,Householder who is Some Other Race alone.1,Householder who is Two or More Races.1,census_tract
"Census Tract 78.01, Adams County, Colorado",1191,135,87,13,1,3,0,20,11,1056,400,223,13,66,3,292,59,08001007801
"Census Tract 78.02, Adams County, Colorado",1240,157,108,8,0,3,0,29,9,1083,388,248,19,46,4,336,42,08001007802
"Census Tract 79, Adams County, Colorado",1780,535,359,55,11,13,0,81,16,1245,472,296,30,27,4,360,56,08001007900
"Census Tract 80, Adams County, Colorado",1726,993,615,161,11,32,0,144,30,733,342,183,15,18,3,136,36,08001008000
"Census Tract 81, Adams County, Colorado",314,36,24,2,0,3,0,6,1,278,175,16,4,49,0,16,18,08001008100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Census Tract 603, Jefferson County, Colorado",1584,1424,1239,8,4,127,0,26,20,160,136,2,1,15,1,4,1,08059006030
"Census Tract 604, Jefferson County, Colorado",1184,1043,923,8,12,55,1,26,18,141,118,3,3,3,1,5,8,08059006040
"Census Tract 605, Jefferson County, Colorado",1612,1021,974,3,7,22,0,7,8,591,497,15,6,33,1,18,21,08059006050
"Census Tract 9800, Jefferson County, Colorado",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,08059009800
