In [23]:
import pandas as pd
import os

# All data paths are resolved against the directory the kernel was started in.
ROOT_DIR = os.path.abspath("")
DATA_DIR = os.path.join(ROOT_DIR, "data")

# ZIP-to-tract crosswalk, read from a single sheet of the workbook.
zip_tract_mapping_df = pd.read_excel(
    os.path.join(DATA_DIR, "ZIP_TRACT_122020 - Denver Only.xlsx"),
    sheet_name="Denver ZIP and Full Code Tracts",
)

unique_tracts = zip_tract_mapping_df["TRACT"].unique()
unique_zips = zip_tract_mapping_df["ZIP"].unique()

# These three tables are transposed immediately after loading, so the CSV's
# original column headers become the row index.
income_df = pd.read_csv(
    os.path.join(DATA_DIR, "raw", "Income - All Counties.csv")
).T
home_own_rent_df = pd.read_csv(
    os.path.join(DATA_DIR, "raw", "Own v Rent - All Counties.csv")
).T
race_df = pd.read_csv(
    os.path.join(DATA_DIR, "raw", "Race - All Counties.csv")
).T

# The ACS extracts are kept in their original orientation; each has one
# 'Unnamed: N' column removed by exact name (presumably an artifact of a
# trailing delimiter in the export -- TODO confirm).
educational_attainment_df = pd.read_csv(
    os.path.join(DATA_DIR, "raw", "acs_educational_attainment.csv")
).drop(columns='Unnamed: 1538')
age_df = pd.read_csv(
    os.path.join(DATA_DIR, "raw", "acs_age.csv")
).drop(columns='Unnamed: 914')
units_in_structure_df = pd.read_csv(
    os.path.join(DATA_DIR, "raw", "acs_units_in_structure.csv")
).drop(columns='Unnamed: 46')
disability_df = pd.read_csv(
    os.path.join(DATA_DIR, "raw", "acs_disability.csv")
).drop(columns='Unnamed: 830')

# Column-name constants shared by the cells below.
CENSUS_TRACT_COLNAME = 'census_tract'
CENSUS_TRACT_NAME_COLNAME_DECENNIAL = 'Label (Grouping)'
CENSUS_TRACT_NAME_COLNAME_ACS = 'Geographic Area Name'

  educational_attainment_df = pd.read_csv(os.path.join(DATA_DIR, "raw", "acs_educational_attainment.csv")).drop('Unnamed: 1538', axis='columns')


In [24]:
# County name as it appears in a tract label -> 3-digit county code used when
# building the full tract identifier.
COUNTY_CENSUS_MAPPING = {
    'Adams County' : '001',
    'Arapahoe County' : '005',
    'Denver County' : '031',
    'Jefferson County' : '059'
}

def find_census_tract_number(tract_label):
    """Convert a tract label into an 11-character tract identifier.

    The label is expected to look like
    ``"Census Tract 41.06, Denver County, Colorado"``. The result is
    ``"08"`` + the county code from ``COUNTY_CENSUS_MAPPING`` + a 6-digit
    tract code made of the tract's base number zero-padded on the left to
    4 digits and its decimal suffix zero-padded on the right to 2 digits
    (``1.01`` -> ``000101``, ``41.06`` -> ``004106``, ``9800`` -> ``980000``).

    Parameters
    ----------
    tract_label : str
        Comma-separated label: tract name, county name, (anything else).

    Returns
    -------
    str or None
        The identifier, or ``None`` when the label is not a string, has no
        county part, or its tract name is not a plain number.

    Raises
    ------
    KeyError
        If the county part is not a key of ``COUNTY_CENSUS_MAPPING``.
    """
    tract_prefix = "Census Tract"

    # Missing labels (e.g. NaN) cannot be parsed.
    if not isinstance(tract_label, str):
        return None

    census_name_elems = [substr.strip() for substr in tract_label.split(",")]
    if len(census_name_elems) < 2:
        return None

    county_tract = COUNTY_CENSUS_MAPPING[census_name_elems[1]]

    # Remove the literal prefix; str.lstrip would strip a *set of characters*
    # rather than this exact prefix.
    census_tract_name = census_name_elems[0]
    if census_tract_name.startswith(tract_prefix):
        census_tract_name = census_tract_name[len(tract_prefix):]

    base, _, suffix = census_tract_name.strip().partition(".")
    if not base.isdigit() or (suffix and not suffix.isdigit()):
        return None

    # Pad the two halves separately. Padding the concatenated digits instead
    # misplaces them whenever the base is not exactly two digits long.
    census_tract_number = base.zfill(4) + suffix.ljust(2, "0")

    return f"08{county_tract}{census_tract_number}"

In [25]:
def assign_columns(df, col_idx = 0):
    """Use one row of ``df`` as its column labels and remove that row.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose row at position ``col_idx`` holds the column labels.
    col_idx : int, default 0
        Position of the header row.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after modification.
    """
    header_label = df.index[col_idx]
    df.columns = df.iloc[col_idx]
    # The row is removed by index label, so every row that shares this label
    # is dropped as well.
    df.drop(index=header_label, inplace=True)
    return df

def remove_unicode_from_column_names(colname):
    """Return ``colname`` as a string with all non-breaking spaces removed.

    Only the U+00A0 character is removed; every other character, including
    other non-ASCII ones, is kept. Non-string input is converted with
    ``str`` first.
    """
    text = str(colname)
    return "".join(text.split("\xa0"))

# Promote the first row of every table to its header.
(
    home_own_rent_df,
    race_df,
    income_df,
    educational_attainment_df,
    age_df,
    units_in_structure_df,
    disability_df,
) = [
    assign_columns(frame)
    for frame in (
        home_own_rent_df,
        race_df,
        income_df,
        educational_attainment_df,
        age_df,
        units_in_structure_df,
        disability_df,
    )
]

# Only the three transposed tables get their column names stripped of
# non-breaking spaces.
home_own_rent_df.columns = list(map(remove_unicode_from_column_names, home_own_rent_df.columns))
race_df.columns = list(map(remove_unicode_from_column_names, race_df.columns))
income_df.columns = list(map(remove_unicode_from_column_names, income_df.columns))


In [26]:
# Transposed tables: the tract label is the row's index label.
home_own_rent_df[CENSUS_TRACT_COLNAME] = home_own_rent_df.apply(
    lambda row: find_census_tract_number(row.name), axis=1
)
race_df[CENSUS_TRACT_COLNAME] = race_df.apply(
    lambda row: find_census_tract_number(row.name), axis=1
)
income_df[CENSUS_TRACT_COLNAME] = income_df.apply(
    lambda row: find_census_tract_number(row.name), axis=1
)

# ACS tables: the tract label lives in the 'Geographic Area Name' column.
educational_attainment_df[CENSUS_TRACT_COLNAME] = (
    educational_attainment_df[CENSUS_TRACT_NAME_COLNAME_ACS].apply(find_census_tract_number)
)
age_df[CENSUS_TRACT_COLNAME] = (
    age_df[CENSUS_TRACT_NAME_COLNAME_ACS].apply(find_census_tract_number)
)
units_in_structure_df[CENSUS_TRACT_COLNAME] = (
    units_in_structure_df[CENSUS_TRACT_NAME_COLNAME_ACS].apply(find_census_tract_number)
)
disability_df[CENSUS_TRACT_COLNAME] = (
    disability_df[CENSUS_TRACT_NAME_COLNAME_ACS].apply(find_census_tract_number)
)

In [28]:
# For each ACS table: pick the columns to keep, record the ones left out, then
# narrow the frame to the kept columns.

# Educational attainment: the population-25-and-over total and its breakdowns.
educational_attainment_columns = [CENSUS_TRACT_COLNAME] + [
    col
    for col in educational_attainment_df.columns
    if col.startswith('Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT!!Population 25 years and over')
]
removed_educational_attainment_columns = [
    col for col in educational_attainment_df.columns
    if col not in educational_attainment_columns
]
educational_attainment_df = educational_attainment_df[educational_attainment_columns]

# Age: the total population plus the 'SELECTED ...' breakdown columns.
age_columns = [CENSUS_TRACT_COLNAME] + [
    col
    for col in age_df.columns
    if col == 'Estimate!!Total!!Total population'
    or col.startswith('Estimate!!Total!!Total population!!SELECTED')
]
removed_age_columns = [
    col for col in age_df.columns
    if col not in age_columns
]
age_df = age_df[age_columns]

# Units in structure: every 'Estimate!!Total:' column except the two below.
units_in_structure_columns = [CENSUS_TRACT_COLNAME] + [
    col
    for col in units_in_structure_df.columns
    if col.startswith('Estimate!!Total:')
    and col not in ('Estimate!!Total:!!Mobile home', 'Estimate!!Total:!!Boat, RV, van, etc.')
]
removed_units_in_structure_columns = [
    col for col in units_in_structure_df.columns
    if col not in units_in_structure_columns
]
units_in_structure_df = units_in_structure_df[units_in_structure_columns]

# Disability: the population total plus one column per difficulty type.
disability_columns = [
    CENSUS_TRACT_COLNAME,
    'Estimate!!Total!!Total civilian noninstitutionalized population',
    'Estimate!!With a disability!!Total civilian noninstitutionalized population!!DISABILITY TYPE BY DETAILED AGE!!With a hearing difficulty',
    'Estimate!!With a disability!!Total civilian noninstitutionalized population!!DISABILITY TYPE BY DETAILED AGE!!With a vision difficulty',
    'Estimate!!With a disability!!Total civilian noninstitutionalized population!!DISABILITY TYPE BY DETAILED AGE!!With a cognitive difficulty',
    'Estimate!!With a disability!!Total civilian noninstitutionalized population!!DISABILITY TYPE BY DETAILED AGE!!With an ambulatory difficulty',
    'Estimate!!With a disability!!Total civilian noninstitutionalized population!!DISABILITY TYPE BY DETAILED AGE!!With a self-care difficulty',
    'Estimate!!With a disability!!Total civilian noninstitutionalized population!!DISABILITY TYPE BY DETAILED AGE!!With an independent living difficulty',
]
removed_disability_columns = [
    col for col in disability_df.columns
    if col not in disability_columns
]
disability_df = disability_df[disability_columns]

In [12]:
def transform_counts_into_percentages(df, total_column, list_of_columns_to_transform):
    """Add a share-of-total column for each requested count column.

    For every row, each column in ``list_of_columns_to_transform`` is divided
    by the row's ``total_column`` value and stored under
    ``"<reformatted name>_percentage_total"``. The stored value is a fraction
    (count / total), not multiplied by 100. Rows whose total is zero are left
    out of the result.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per geography. Counts may be numbers or strings with
        thousands separators (``"1,234"``).
    total_column : str
        Column holding the denominator.
    list_of_columns_to_transform : list of str
        Count columns to convert.

    Returns
    -------
    pandas.DataFrame
        The kept rows with their original columns plus the new share columns.
        When no row is kept, an empty frame that still carries those column
        names is returned so later column selections keep working.

    Raises
    ------
    ValueError
        If a total or count cannot be parsed as an integer.
    """

    def _to_count(value):
        # Accepts ints as well as strings such as "1,234".
        return int(str(value).replace(",", ""))

    # Output names depend only on the input column names, so build them once
    # instead of once per row. Commas are removed outright, so thousands stay
    # spelled out ("10000"). The earlier ",000" -> "k" step ran after the
    # commas were already gone and could never match; it is omitted here so
    # the generated names stay exactly as before.
    percentage_names = {
        col: (
            col.lower()
            .replace(" ", "_")
            .replace("$", "")
            .replace(",", "")
            .replace(":", "")
            .replace("!!", "_")
        ) + "_percentage_total"
        for col in list_of_columns_to_transform
    }

    rows_w_percentages = []

    for _, row in df.iterrows():

        denominator = _to_count(row[total_column])

        try:
            for col, percentage_name in percentage_names.items():
                row[percentage_name] = _to_count(row[col]) / denominator
        except ZeroDivisionError:
            # Nothing to compute a share of; skip this row.
            continue

        rows_w_percentages.append(row)

    if not rows_w_percentages:
        new_columns = [name for name in percentage_names.values() if name not in df.columns]
        return pd.DataFrame(columns=list(df.columns) + new_columns)

    return pd.DataFrame(rows_w_percentages)

In [33]:
# Income: every column except the total and the tract id is a bracket count.
income_df_w_percentages = transform_counts_into_percentages(
    df=income_df,
    total_column='Total:',
    list_of_columns_to_transform=[
        col for col in income_df.columns
        if col not in ('Total:', CENSUS_TRACT_COLNAME)
    ],
)

# Race: only the '... alone' columns, as a share of the one-race population.
race_df_w_percentages = transform_counts_into_percentages(
    df=race_df,
    total_column='Population of one race:',
    list_of_columns_to_transform=[col for col in race_df.columns if 'alone' in col],
)
removed_race_columns = [col for col in race_df.columns if 'alone' not in col]

# Tenure: owner- vs renter-occupied as a share of the total.
home_own_rent_df_w_percentages = transform_counts_into_percentages(
    df=home_own_rent_df[['Total:', CENSUS_TRACT_COLNAME, 'Owner occupied:', 'Renter occupied:']],
    total_column='Total:',
    list_of_columns_to_transform=['Owner occupied:', 'Renter occupied:'],
)
removed_home_own_columns = [
    col for col in home_own_rent_df.columns
    if col not in ('Total:', 'Owner occupied:', 'Renter occupied:')
]

# Educational attainment, relative to the population aged 25 and over.
educational_attainment_df_w_percentages = transform_counts_into_percentages(
    df=educational_attainment_df,
    total_column='Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT!!Population 25 years and over',
    list_of_columns_to_transform=[
        col for col in educational_attainment_df.columns
        if col not in (
            CENSUS_TRACT_COLNAME,
            'Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT!!Population 25 years and over',
        )
    ],
)

# Age categories, relative to the total population.
age_df_w_percentages = transform_counts_into_percentages(
    df=age_df,
    total_column='Estimate!!Total!!Total population',
    list_of_columns_to_transform=[col for col in age_df.columns if 'SELECTED AGE CATEGORIES' in col],
)

# Units in structure, relative to the table total.
units_in_structure_df_w_percentages = transform_counts_into_percentages(
    df=units_in_structure_df,
    total_column='Estimate!!Total:',
    list_of_columns_to_transform=[
        col for col in units_in_structure_df.columns
        if col.startswith('Estimate!!Total:!!')
    ],
)

# Disability types, relative to the civilian noninstitutionalized population.
disability_df_w_percentages = transform_counts_into_percentages(
    df=disability_df,
    total_column='Estimate!!Total!!Total civilian noninstitutionalized population',
    list_of_columns_to_transform=[
        col for col in disability_df.columns
        if col.startswith('Estimate!!With a disability!!')
    ],
)

In [35]:
# Display the race columns that contain no 'alone' label and were therefore
# not converted into percentages.
removed_race_columns

['Total:',
 'Population of one race:',
 'Population of two or more races:',
 'Population of two races:',
 'White; Black or African American',
 'White; American Indian and Alaska Native',
 'White; Asian',
 'White; Native Hawaiian and Other Pacific Islander',
 'White; Some Other Race',
 'Black or African American; American Indian and Alaska Native',
 'Black or African American; Asian',
 'Black or African American; Native Hawaiian and Other Pacific Islander',
 'Black or African American; Some Other Race',
 'American Indian and Alaska Native; Asian',
 'American Indian and Alaska Native; Native Hawaiian and Other Pacific Islander',
 'American Indian and Alaska Native; Some Other Race',
 'Asian; Native Hawaiian and Other Pacific Islander',
 'Asian; Some Other Race',
 'Native Hawaiian and Other Pacific Islander; Some Other Race',
 'Population of three races:',
 'White; Black or African American; American Indian and Alaska Native',
 'White; Black or African American; Asian',
 'White; Black or 

In [37]:
# Write one single-column CSV per topic listing the columns that were dropped.
# Each entry is (dropped columns, CSV column header, output file name).
for removed_columns, attribute_header, output_name in [
    (removed_age_columns, "non_informative_age_attributes", "non_informative_attributes_age.csv"),
    (removed_disability_columns, "non_informative_comorbidities_attributes", "non_informative_attributes_comorbidities.csv"),
    (removed_educational_attainment_columns, "non_informative_education_attainment_attributes", "non_informative_attributes_educational_attainment.csv"),
    (removed_home_own_columns, "non_informative_home_own_attributes", "non_informative_attributes_home_ownership.csv"),
    (removed_race_columns, "non_informative_race_attributes", "non_informative_attributes_race.csv"),
    (removed_units_in_structure_columns, "non_informative_units_in_structure_attributes", "non_informative_attributes_units_in_structure.csv"),
]:
    # NOTE(review): to_csv returns None when given a path, so this name ends up
    # bound to None rather than to a DataFrame.
    non_informative_attributes_df = pd.DataFrame(
        removed_columns, columns=[attribute_header]
    ).to_csv(os.path.join(DATA_DIR, "processed", output_name))

# non_informative_attributes_df["non_informative_comorbidities_attributes"] = removed_disability_columns
# non_informative_attributes_df["non_informative_educational_attainment_attributes"] = removed_educational_attainment_columns
# non_informative_attributes_df["non_informative_home_ownership_attributes"] = removed_home_own_columns
# non_informative_attributes_df["non_informative_race_attributes"] = removed_race_columns
# non_informative_attributes_df["non_informative_units_in_structure_attributes"] = removed_units_in_structure_columns
# non_informative_attributes_df.to_csv(os.path.join(DATA_DIR, "processed", "non_informative_attributes.csv"))

In [199]:
# Give each table's denominator column a topic-specific name (in place).
income_df_w_percentages.rename(columns={'Total:': 'income_total_count'}, inplace=True)
race_df_w_percentages.rename(columns={'Population of one race:': 'race_total_count'}, inplace=True)
home_own_rent_df_w_percentages.rename(columns={'Total:': 'home_own_rent_total'}, inplace=True)
educational_attainment_df_w_percentages.rename(
    columns={'Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT!!Population 25 years and over': 'population_25_and_over'},
    inplace=True,
)
age_df_w_percentages.rename(columns={'Estimate!!Total!!Total population': 'age_total_population'}, inplace=True)
units_in_structure_df_w_percentages.rename(columns={'Estimate!!Total:': 'total_households'}, inplace=True)
disability_df_w_percentages.rename(
    columns={'Estimate!!Total!!Total civilian noninstitutionalized population': 'total_noninstitutionalized_population'},
    inplace=True,
)

# Rebind the short names to reduced frames: total, tract id and share columns.
home_own_rent_df = home_own_rent_df_w_percentages[
    ['home_own_rent_total', CENSUS_TRACT_COLNAME]
    + [col for col in home_own_rent_df_w_percentages.columns if 'percentage_total' in col]
]
income_df = income_df_w_percentages[
    ['income_total_count', CENSUS_TRACT_COLNAME]
    + [col for col in income_df_w_percentages.columns if 'percentage_total' in col]
]
race_df = race_df_w_percentages[
    ['race_total_count', CENSUS_TRACT_COLNAME]
    + [col for col in race_df_w_percentages.columns if 'percentage_total' in col]
]
educational_attainment_df = educational_attainment_df_w_percentages[
    ['population_25_and_over', CENSUS_TRACT_COLNAME]
    + [col for col in educational_attainment_df_w_percentages.columns if 'percentage_total' in col]
]
age_df = age_df_w_percentages[
    ['age_total_population', CENSUS_TRACT_COLNAME]
    + [col for col in age_df_w_percentages.columns if 'percentage_total' in col]
]

units_in_structure_df = units_in_structure_df_w_percentages[
    ['total_households', CENSUS_TRACT_COLNAME]
    + [col for col in units_in_structure_df_w_percentages.columns if 'percentage_total' in col]
]

disability_df = disability_df_w_percentages[
    ['total_noninstitutionalized_population', CENSUS_TRACT_COLNAME]
    + [col for col in disability_df_w_percentages.columns if 'percentage_total' in col]
]


In [41]:
# Build the "full" dataset by outer-joining every topic on the tract id.
# The merge starts from income_df_w_percentages, like every other input here.
# Starting from income_df (rebound above to total + share columns only) left
# the raw income counts out while keeping raw counts for all other topics.
# NOTE(review): rows whose tract id is missing share a null key; pandas matches
# null keys against each other, which can multiply such rows -- confirm none
# are present or drop them first.
full_input_dataset = pd.merge(income_df_w_percentages, race_df_w_percentages, how='outer', on=CENSUS_TRACT_COLNAME)
full_input_dataset = pd.merge(full_input_dataset, home_own_rent_df_w_percentages, how='outer', on=CENSUS_TRACT_COLNAME)
full_input_dataset = pd.merge(full_input_dataset, educational_attainment_df_w_percentages, how='outer', on=CENSUS_TRACT_COLNAME)
full_input_dataset = pd.merge(full_input_dataset, age_df_w_percentages, how='outer', on=CENSUS_TRACT_COLNAME)
full_input_dataset = pd.merge(full_input_dataset, units_in_structure_df_w_percentages, how='outer', on=CENSUS_TRACT_COLNAME)
full_input_dataset = pd.merge(full_input_dataset, disability_df_w_percentages, how='outer', on=CENSUS_TRACT_COLNAME)

full_input_dataset.to_csv(os.path.join(DATA_DIR, "processed", "full_input_dataset.csv"))

In [45]:
# income_df = pd.read_csv(os.path.join(DATA_DIR, "interim", "income.csv"))
# race_df = pd.read_csv(os.path.join(DATA_DIR, "interim", "race.csv"))
# home_own_rent_df = pd.read_csv(os.path.join(DATA_DIR, "interim", "home_own_rent.csv"))

# Outer-join the reduced per-topic frames on the tract id, one after another.
combined_df = income_df
for frame in (
    race_df,
    home_own_rent_df,
    educational_attainment_df,
    age_df,
    units_in_structure_df,
    disability_df,
):
    combined_df = pd.merge(combined_df, frame, how='outer', on=CENSUS_TRACT_COLNAME)

# Fill gaps in each share column with that column's median.
for col in combined_df.columns:
    if 'percentage_total' not in col:
        continue
    combined_df[col] = combined_df[col].fillna(combined_df[col].median())

In [46]:
# Min-max scale every share column: (value - min) / (max - min).
percentage_columns = [column for column in combined_df.columns if 'percentage_total' in column]

# Per-column extremes, computed once over the whole frame.
MAX_MIN_DICT = {
    col: {
        'max' : combined_df[col].max(),
        'min' : combined_df[col].min()
    }
    for col in percentage_columns
}

rows_w_standardized_values = []
for idx, row in combined_df.iterrows():
    # Skip rows without a tract id. pd.isna is used because a missing key can
    # be NaN as well as None, and `is None` only catches the latter.
    if pd.isna(row[CENSUS_TRACT_COLNAME]):
        continue

    for col in percentage_columns:

        col_min = MAX_MIN_DICT[col]['min']
        denominator = MAX_MIN_DICT[col]['max'] - col_min
        standardized_colname = col.replace("percentage_total", "standardized")
        if denominator != 0:
            standardized_val = (row[col] - col_min) / denominator
        else:
            # A constant column has zero range; use 0.0 instead of dividing by
            # zero, which would give NaN or raise.
            standardized_val = 0.0
        row[standardized_colname] = standardized_val

    rows_w_standardized_values.append(row)



In [47]:
# Reassemble the per-row Series into a frame, then keep only the tract id and
# the standardized columns.
standardized_df = pd.DataFrame(rows_w_standardized_values)
standardized_df_reduced_columns = standardized_df[
    [CENSUS_TRACT_COLNAME, *(name for name in standardized_df.columns if "_standardized" in name)]
]
#standardized_df_reduced_columns.to_csv(os.path.join(DATA_DIR, "processed", "standardized.csv"), index=False)

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [206]:
# Persist the standardized features, then reload them from disk.
standardized_df_reduced_columns.to_csv(os.path.join(DATA_DIR, "processed", "standardized.csv"), index=False)
# Read the tract id back as text. Left to type inference it is parsed as an
# integer, which drops the leading zero of the "08..." identifiers.
standardized_df = pd.read_csv(
    os.path.join(DATA_DIR, "processed", "standardized.csv"),
    dtype={CENSUS_TRACT_COLNAME: str},
)

In [15]:
# Split the standardized frame into the PCA feature matrix and the tract ids.
pca_input_columns = [name for name in standardized_df_reduced_columns.columns if '_standardized' in name]
census_tracts = standardized_df_reduced_columns[CENSUS_TRACT_COLNAME]
pca_input_df = standardized_df_reduced_columns[pca_input_columns]

In [16]:
from sklearn.decomposition import PCA

# Fit PCA on the standardized features. Per scikit-learn's documentation, a
# float n_components between 0 and 1 keeps as many components as are needed to
# exceed that share of explained variance, so the component count depends on
# the data.
pca = PCA(n_components=0.85)
pca.fit(pca_input_df)

In [18]:
# Project the features onto the fitted components; the result gets a fresh
# default integer index.
pca_output_df = pd.DataFrame(data=pca.transform(pca_input_df))

In [19]:
# Label the component columns x0, x1, ... (one per fitted component).
pca_output_df.columns = [f"x{position}" for position, _ in enumerate(pca.components_)]

In [22]:
# Add an empty tract-id column (populated in a later cell) and display the frame.
pca_output_df[CENSUS_TRACT_COLNAME] = None
pca_output_df

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,census_tract
0,1.225187,-0.149232,0.368477,0.295765,-0.017200,-0.046010,0.216196,0.075602,-0.119034,0.104189,0.509474,0.165538,
1,1.302083,-0.001440,0.244756,0.268733,-0.192988,-0.111020,0.193172,0.111338,-0.124874,0.167984,-0.043474,0.092288,
2,0.969652,0.064561,0.062836,0.115468,-0.161904,0.033649,-0.075587,-0.169447,0.097774,0.160711,0.016590,-0.063087,
3,0.611850,0.261121,-0.000724,0.079005,-0.083322,0.033510,-0.235503,-0.216684,0.190810,0.158696,-0.247446,-0.038649,
4,0.622973,-0.561071,0.427328,-0.146777,0.022960,-0.040714,0.066930,0.076440,0.219344,0.108046,-0.067912,-0.105341,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
645,0.121072,-0.407953,0.076673,-0.056382,-0.120624,0.056911,-0.005085,0.019283,-0.014072,-0.021790,-0.019763,0.024556,
646,0.052439,-0.306938,0.040640,-0.051133,-0.082369,0.042198,-0.006720,0.013653,-0.013948,-0.015917,-0.020361,0.021622,
647,0.109254,-0.390559,0.070469,-0.055478,-0.114036,0.054377,-0.005367,0.018314,-0.014051,-0.020779,-0.019866,0.024051,
648,0.274948,-0.634429,0.157460,-0.068150,-0.206391,0.089897,-0.001418,0.031904,-0.014351,-0.034957,-0.018421,0.031136,


In [29]:
# NOTE(review): looks like a leftover manual check of `.at` assignment. "101"
# is a hard-coded placeholder for row 0, not a value derived from the data;
# confirm it can be removed.
pca_output_df.at[0, CENSUS_TRACT_COLNAME] = "101"

In [34]:
# Attach the tract ids by position. pca_output_df has a fresh 0..n-1 index,
# while census_tracts keeps the index of the frame it was sliced from, which
# need not be gap-free. Looking up census_tracts[idx] by label would then fail
# or pick the wrong row, so the values are copied positionally instead.
pca_output_df[CENSUS_TRACT_COLNAME] = census_tracts.to_numpy()

In [40]:
# Sum every PCA component column (x0, x1, ...) into one score per row. The
# columns are discovered from the frame rather than listed as x0..x11, because
# the number of components depends on the fitted PCA.
component_columns = [
    col for col in pca_output_df.columns
    if isinstance(col, str) and col.startswith("x") and col[1:].isdigit()
]

# Add left to right in column order so the result matches a chained `+`
# (NaN in any component propagates into the index).
vulnerability_index = pca_output_df[component_columns[0]]
for col in component_columns[1:]:
    vulnerability_index = vulnerability_index + pca_output_df[col]

pca_output_df["vulnerability_index"] = vulnerability_index

In [42]:
# Write the component scores, tract ids and index to the "final" folder. The
# frame's integer index is written as the first CSV column because
# index=False is not passed.
pca_output_df.to_csv(os.path.join(DATA_DIR, "final", "vulnerability_index_unverified.csv"))
