<a href="https://colab.research.google.com/github/atharvanaik10/cs498-css/blob/main/cs498_rp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q ydata-profiling
!pip install -q matplotlib

[0m

In [2]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport

# Creating the Dataset

This is a dataset of factors believed to influence homelessness across communities. The dictionary is included in the data folder in Google Drive.

Tasks
1. Find dataset for all counties
2. Write a script based on API calls for each county to download the excel sheets for 2010-2017

Source: https://www.huduser.gov/portal/datasets/hpmd.html?q=datasets%2Fhpmd.html

In [3]:
data_filepath = 'https://raw.githubusercontent.com/atharvanaik10/cs498-css/main/data/05b_analysis_file_update.csv'
df_full = pd.read_csv(data_filepath)

df_il = df_full[df_full['cocnumber'].str.contains('IL')]
df_il

Unnamed: 0,year,cocnumber,pit_tot_shelt_pit_hud,pit_tot_unshelt_pit_hud,pit_tot_hless_pit_hud,pit_ind_shelt_pit_hud,pit_ind_unshelt_pit_hud,pit_ind_hless_pit_hud,pit_perfam_shelt_pit_hud,pit_perfam_unshelt_pit_hud,...,sub_high_cost_rent75,sub_high_cost_homeval75,sub_high_rent_share75,tight_high_cost_rental_mkt,sub_tight_high_cost_rent,sub_west_coast_all_urb,sub_west_census,major_city,suburban,rural
840,2010,IL-500,211.0,1.0,212.0,123.0,1.0,124.0,88.0,0.0,...,1,1,1,3,1,0,0,0,1,0
841,2011,IL-500,179.0,3.0,182.0,110.0,3.0,113.0,69.0,0.0,...,0,0,0,0,0,0,0,0,1,0
842,2012,IL-500,193.0,3.0,196.0,120.0,3.0,123.0,73.0,0.0,...,1,1,1,3,1,0,0,0,1,0
843,2013,IL-500,180.0,5.0,185.0,106.0,5.0,111.0,74.0,0.0,...,0,0,0,0,0,0,0,0,1,0
844,2014,IL-500,161.0,5.0,166.0,103.0,5.0,108.0,58.0,0.0,...,1,1,1,3,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
987,2013,IL-520,263.0,75.0,338.0,121.0,39.0,160.0,142.0,36.0,...,0,0,0,0,0,0,0,0,0,1
988,2014,IL-520,279.0,46.0,325.0,142.0,33.0,175.0,137.0,13.0,...,1,1,1,3,1,0,0,0,0,1
989,2015,IL-520,316.0,30.0,346.0,166.0,26.0,192.0,150.0,4.0,...,0,0,0,0,0,0,0,0,0,1
990,2016,IL-520,332.0,19.0,351.0,159.0,16.0,175.0,173.0,3.0,...,1,1,1,3,1,0,0,0,0,1


Since this dataset does not contain any health/healthcare data, we append a health dataset from Illinois below.

The first step is to create a master health rankings dataset for all the counties in Illinois for all the years.

In [4]:
# Load yearly datasets
years = range(2010, 2018)
file_path_template = 'https://raw.githubusercontent.com/atharvanaik10/cs498-css/main/data/health_data/{}_county_ranking.xls'# Assuming the files are named like 'health_data_2010.xlsx', etc.

# Initialize an empty list to store each year's DataFrame
health_dfs = []

for year in years:
    file_path = file_path_template.format(year)
    # Load the "Subrankings" sheet from the current year's file
    df = pd.read_excel(file_path, sheet_name='Outcomes & Factors SubRankings', header=[0,1])
    # Create new column names by combining the top-level category with the subcolumn name
    df.columns = [f'{col[0]}_{col[1]}'.replace(' ', '_') if col[1] else col[0].replace(' ', '_') for col in df.columns]
    # Add a 'year' column to the DataFrame
    df['year'] = year
    # Append the DataFrame to the list
    health_dfs.append(df)

In [5]:
# Combine into master health ranking dataset and drop irrelevant columns
df_health_rankings = pd.concat(health_dfs, ignore_index=True)
df_health_rankings.drop(0, axis='index', inplace=True)
df_health_rankings.drop(['Unnamed:_0_level_0_FIPS', 'Unnamed:_1_level_0_State'], axis=1, inplace=True)
df_health_rankings.rename(columns={'Unnamed:_2_level_0_County' : 'county'}, inplace=True)

# We note that the "Mortality" column was changed to "Length of Life" and
# the "Morbidity" column was changed to "Quality of Life". Hence, we combine
# these two columns into one.
df_health_rankings['Mortality_Z-Score'] = df_health_rankings['Mortality_Z-Score'].combine_first(df_health_rankings['Length_of_Life_Z-Score'])
df_health_rankings['Mortality_Rank'] = df_health_rankings['Mortality_Rank'].combine_first(df_health_rankings['Length_of_Life_Rank'])
df_health_rankings['Morbidity_Z-Score'] = df_health_rankings['Morbidity_Z-Score'].combine_first(df_health_rankings['Quality_of_Life_Z-Score'])
df_health_rankings['Morbidity_Rank'] = df_health_rankings['Morbidity_Rank'].combine_first(df_health_rankings['Quality_of_Life_Rank'])
df_health_rankings.drop(['Length_of_Life_Z-Score', 'Length_of_Life_Rank', 'Quality_of_Life_Z-Score', 'Quality_of_Life_Rank'], axis=1, inplace=True)

df_health_rankings

Unnamed: 0,county,Mortality_Z-Score,Mortality_Rank,Morbidity_Z-Score,Morbidity_Rank,Health_Behaviors_Z-Score,Health_Behaviors_Rank,Clinical_Care_Z-Score,Clinical_Care_Rank,Social_&_Economic_Factors_Z-Score,Social_&_Economic_Factors_Rank,Physical_Environment_Z-Score,Physical_Environment_Rank,year
1,Adams,-0.315922,26.0,0.018905,55.0,-0.177479,9.0,-0.128418,8.0,-0.222109,18.0,-0.003727,53.0,2010
2,Alexander,1.449328,99.0,0.600000,100.0,0.324788,99.0,-0.132972,6.0,0.814184,101.0,0.070877,94.0,2010
3,Bond,-0.218783,36.0,0.296697,81.0,0.062052,74.0,0.005012,60.0,-0.082260,42.0,-0.046851,13.0,2010
4,Boone,-0.539522,11.0,0.075576,63.0,-0.248704,4.0,0.005551,61.0,0.242444,82.0,-0.005828,50.0,2010
5,Brown,-0.282644,30.0,-0.197342,27.0,0.073038,80.0,-0.052093,36.0,-0.222639,16.0,-0.067829,2.0,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
819,Whiteside,-0.101659,44.0,-0.395511,15.0,-0.095026,24.0,-0.070668,27.0,-0.037313,44.0,-0.024960,40.0,2017
820,Will,-0.700882,8.0,-0.180036,37.0,-0.150657,11.0,-0.022934,38.0,-0.234465,17.0,0.102365,98.0,2017
821,Williamson,0.323895,81.0,0.073420,60.0,0.093817,78.0,-0.005888,45.0,0.076363,69.0,0.015021,67.0,2017
822,Winnebago,0.247607,73.0,0.421506,91.0,0.210368,96.0,-0.001822,48.0,0.409720,96.0,0.060289,90.0,2017


Now we have two datasets, one for homelessness and other economic factors that is split by CoC, and one for health rankings that is split by county. Thankfully in most cases, CoC boundaries map neatly to existing county boundaries. We create a CoC to county mapping below and merge our datasets into a master dataset that is separated by CoC that includes the average health factors of all the counties in that CoC.

In [6]:
coc_to_county = {
    'IL-500' : ['McHenry'],
    'IL-501' : ['Winnebago', 'Boone'],
    'IL-502' : ['Lake'],
    'IL-503' : ['Champaign'],
    'IL-504' : ['Madison'],
    'IL-506' : ['Will', 'Kendall', 'Grundy'],
    'IL-507' : ['Peoria', 'Fulton', 'Tazewell', 'Woodford'],
    'IL-508' : ['McHenry'],
    'IL-509' : ['DeKalb'],
    'IL-510' : ['Cook'],
    'IL-511' : ['Cook'],
    'IL-512' : ['DeWitt', 'Ford', 'Iroquois', 'Kankakee', 'Livingston', 'Logan',
                'Mason', 'Menard', 'McLean', 'Piatt', 'Vermilion'],
    'IL-513' : ['Sangamon'],
    'IL-514' : ['DuPage'],
    'IL-515' : ['Calhoun', 'Christian', 'Clark', 'Clay', 'Coles', 'Crawford',
                'Cumberland', 'Douglas', 'Edgar', 'Effingham', 'Fayette',
                'Greene', 'Jasper', 'Jersey', 'Macoupin', 'Montgomery',
                'Moultrie', 'Shelby'],
    'IL-516' : ['Macon'],
    'IL-517' : ['Kane'],
    'IL-518' : ['Bureau', 'Carroll', 'Jo Daviess', 'Henry', 'Knox', 'LaSalle',
                'Lee', 'Marshall', 'Mercer', 'Ogle', 'Putnam', 'Rock Island',
                'Stark', 'Stephenson', 'Whiteside'],
    'IL-519' : ['Adams', 'Brown', 'Cass', 'Hancock', 'Henderson', 'McDonough',
                'Morgan', 'Pike', 'Schuyler', 'Scott', 'Warren'],
    'IL-520' : ['Alexander', 'Bond', 'Clinton', 'Edwards', 'Franklin',
                'Gallatin', 'Hamilton', 'Hardin', 'Jackson', 'Jefferson',
                'Johnson', 'Lawrence', 'Marion', 'Massac', 'Monroe', 'Perry',
                'Pope', 'Pulaski', 'Randolph', 'Richland', 'Saline', 'Union',
                'Wabash', 'Washington', 'Wayne', 'White', 'Williamson'],
}

In [7]:
# Calculate mean stats by CoC
coc_avgs = []
for coc, counties in coc_to_county.items():
  subset = df_health_rankings[df_health_rankings['county'].isin(counties)]
  avg_stats = subset.groupby('year').mean(numeric_only=True).reset_index()
  avg_stats['cocnumber'] = coc
  coc_avgs.append(avg_stats)

# combine all averages
coc_avg_df = pd.concat(coc_avgs, ignore_index=True)

df = pd.merge(df_il, coc_avg_df, on=['cocnumber', 'year'], how='left')
df

Unnamed: 0,year,cocnumber,pit_tot_shelt_pit_hud,pit_tot_unshelt_pit_hud,pit_tot_hless_pit_hud,pit_ind_shelt_pit_hud,pit_ind_unshelt_pit_hud,pit_ind_hless_pit_hud,pit_perfam_shelt_pit_hud,pit_perfam_unshelt_pit_hud,...,Morbidity_Z-Score,Morbidity_Rank,Health_Behaviors_Z-Score,Health_Behaviors_Rank,Clinical_Care_Z-Score,Clinical_Care_Rank,Social_&_Economic_Factors_Z-Score,Social_&_Economic_Factors_Rank,Physical_Environment_Z-Score,Physical_Environment_Rank
0,2010,IL-500,211.0,1.0,212.0,123.0,1.0,124.0,88.0,0.0,...,-0.262275,19.000000,-0.077647,29.000000,-0.065311,27.000000,-0.431845,5.000000,0.009561,70.000000
1,2011,IL-500,179.0,3.0,182.0,110.0,3.0,113.0,69.0,0.0,...,-0.319520,16.000000,-0.116947,16.000000,-0.011183,50.000000,-0.368764,7.000000,-0.050696,11.000000
2,2012,IL-500,193.0,3.0,196.0,120.0,3.0,123.0,73.0,0.0,...,-0.274948,20.000000,-0.264598,4.000000,-0.067671,29.000000,-0.398543,7.000000,0.002239,60.000000
3,2013,IL-500,180.0,5.0,185.0,106.0,5.0,111.0,74.0,0.0,...,-0.097183,42.000000,-0.245293,5.000000,-0.115609,17.000000,-0.356753,7.000000,-0.023845,25.000000
4,2014,IL-500,161.0,5.0,166.0,103.0,5.0,108.0,58.0,0.0,...,-0.137073,35.000000,-0.199592,6.000000,-0.091232,23.000000,-0.412663,6.000000,0.063221,97.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,2013,IL-520,263.0,75.0,338.0,121.0,39.0,160.0,142.0,36.0,...,0.123743,62.481481,0.037166,61.037037,0.051943,66.259259,0.108449,64.962963,0.033150,72.925926
148,2014,IL-520,279.0,46.0,325.0,142.0,33.0,175.0,137.0,13.0,...,0.136255,65.777778,0.053163,63.074074,0.060579,66.814815,0.102835,63.222222,0.020539,67.703704
149,2015,IL-520,316.0,30.0,346.0,166.0,26.0,192.0,150.0,4.0,...,0.157189,67.777778,0.061233,66.370370,0.068371,69.518519,0.111391,66.000000,0.022888,67.814815
150,2016,IL-520,332.0,19.0,351.0,159.0,16.0,175.0,173.0,3.0,...,0.211328,67.185185,0.083181,67.925926,0.065990,70.370370,0.110339,64.185185,0.013939,60.222222


In [8]:
# Store the data
df.to_pickle("data_raw.pkl")

# Preprocessing for Time Series

In [8]:
# set the Time Series index to year
df.set_index("year", inplace=True)
df

Unnamed: 0_level_0,cocnumber,pit_tot_shelt_pit_hud,pit_tot_unshelt_pit_hud,pit_tot_hless_pit_hud,pit_ind_shelt_pit_hud,pit_ind_unshelt_pit_hud,pit_ind_hless_pit_hud,pit_perfam_shelt_pit_hud,pit_perfam_unshelt_pit_hud,pit_perfam_hless_pit_hud,...,Morbidity_Z-Score,Morbidity_Rank,Health_Behaviors_Z-Score,Health_Behaviors_Rank,Clinical_Care_Z-Score,Clinical_Care_Rank,Social_&_Economic_Factors_Z-Score,Social_&_Economic_Factors_Rank,Physical_Environment_Z-Score,Physical_Environment_Rank
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010,IL-500,211.0,1.0,212.0,123.0,1.0,124.0,88.0,0.0,88.0,...,-0.262275,19.000000,-0.077647,29.000000,-0.065311,27.000000,-0.431845,5.000000,0.009561,70.000000
2011,IL-500,179.0,3.0,182.0,110.0,3.0,113.0,69.0,0.0,69.0,...,-0.319520,16.000000,-0.116947,16.000000,-0.011183,50.000000,-0.368764,7.000000,-0.050696,11.000000
2012,IL-500,193.0,3.0,196.0,120.0,3.0,123.0,73.0,0.0,73.0,...,-0.274948,20.000000,-0.264598,4.000000,-0.067671,29.000000,-0.398543,7.000000,0.002239,60.000000
2013,IL-500,180.0,5.0,185.0,106.0,5.0,111.0,74.0,0.0,74.0,...,-0.097183,42.000000,-0.245293,5.000000,-0.115609,17.000000,-0.356753,7.000000,-0.023845,25.000000
2014,IL-500,161.0,5.0,166.0,103.0,5.0,108.0,58.0,0.0,58.0,...,-0.137073,35.000000,-0.199592,6.000000,-0.091232,23.000000,-0.412663,6.000000,0.063221,97.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013,IL-520,263.0,75.0,338.0,121.0,39.0,160.0,142.0,36.0,178.0,...,0.123743,62.481481,0.037166,61.037037,0.051943,66.259259,0.108449,64.962963,0.033150,72.925926
2014,IL-520,279.0,46.0,325.0,142.0,33.0,175.0,137.0,13.0,150.0,...,0.136255,65.777778,0.053163,63.074074,0.060579,66.814815,0.102835,63.222222,0.020539,67.703704
2015,IL-520,316.0,30.0,346.0,166.0,26.0,192.0,150.0,4.0,154.0,...,0.157189,67.777778,0.061233,66.370370,0.068371,69.518519,0.111391,66.000000,0.022888,67.814815
2016,IL-520,332.0,19.0,351.0,159.0,16.0,175.0,173.0,3.0,176.0,...,0.211328,67.185185,0.083181,67.925926,0.065990,70.370370,0.110339,64.185185,0.013939,60.222222


In [36]:
time_series_df = df.copy()

In [42]:
# removing whatever NAN values we can using linear interpolation and backfill
missing_value_columns = time_series_df.columns[time_series_df.isna().any()].tolist()
new_cols = []
for col in missing_value_columns:
    new_col_name = col+" linear"
    new_cols.append(new_col_name)
    time_series_df[new_col_name] = time_series_df[col].interpolate(method='linear')
    time_series_df[new_col_name] = time_series_df[new_col_name].bfill()
time_series_df[new_cols[1]]

  time_series_df[new_col_name] = time_series_df[col].interpolate(method='linear')


year
2010    51.0
2011    51.0
2012    57.0
2013    25.0
2014    25.0
        ... 
2013    23.0
2014    39.0
2015    27.0
2016    21.0
2017    19.0
Name: pit_vet_hless_pit_hud linear, Length: 152, dtype: float64

In [44]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [45]:
# using the scaling thing from later code
# Select only numeric columns
numeric_columns = time_series_df.select_dtypes(include=['number']).columns

numeric_columns = numeric_columns[1::]

# Extract numeric data
numeric_data = time_series_df[numeric_columns]

# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform only the numeric data
scaled_data_numeric = scaler.fit_transform(numeric_data)

# Replace the scaled numeric values back into the original DataFrame
time_series_df[numeric_columns] = scaled_data_numeric
time_series_df

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


Unnamed: 0_level_0,cocnumber,pit_tot_shelt_pit_hud,pit_tot_unshelt_pit_hud,pit_tot_hless_pit_hud,pit_ind_shelt_pit_hud,pit_ind_unshelt_pit_hud,pit_ind_hless_pit_hud,pit_perfam_shelt_pit_hud,pit_perfam_unshelt_pit_hud,pit_perfam_hless_pit_hud,...,d_dem_soc_singparent_xt linear linear linear,d_dem_soc_vet_xt linear linear linear,d_env_wea_avgtemp_noaa linear linear linear,d_env_wea_avgtemp_summer_noaa linear linear linear,d_env_wea_precip_noaa linear linear linear,d_env_wea_precip_annual_noaa linear linear linear,dem_soc_ed_lessbach_xt linear linear linear,dem_health_ins_acs5yr_2012 linear linear linear linear,dem_health_ins_acs5yr_2012 linear linear linear linear linear,dem_health_ins_acs5yr_2012 linear linear linear linear linear linear
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010,IL-500,211.0,-0.341225,-0.297729,-0.261507,-0.307872,-0.280874,-0.301240,-0.499818,-0.321306,...,-1.008305,0.146269,0.742571,0.682745,0.154821,-1.447763,-0.361123,,,
2011,IL-500,179.0,-0.335668,-0.316476,-0.281342,-0.302182,-0.291923,-0.332647,-0.499818,-0.352320,...,-1.008305,0.146269,0.742571,0.682745,0.154821,-1.447763,-0.361123,,,
2012,IL-500,193.0,-0.335668,-0.307727,-0.266084,-0.302182,-0.281878,-0.326035,-0.499818,-0.345791,...,-1.008305,0.146269,0.742571,0.682745,0.154821,-1.447763,-0.361123,,,
2013,IL-500,180.0,-0.330112,-0.314601,-0.287445,-0.296493,-0.293932,-0.324382,-0.499818,-0.344159,...,-1.008305,0.146269,0.742571,0.682745,0.154821,-1.447763,-0.361123,,,
2014,IL-500,161.0,-0.330112,-0.326474,-0.292022,-0.296493,-0.296946,-0.350831,-0.499818,-0.370276,...,-1.008305,0.146269,0.742571,0.682745,0.154821,-1.447763,-0.443126,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013,IL-520,263.0,-0.135638,-0.218994,-0.264558,-0.199770,-0.244712,-0.211977,0.732720,-0.174394,...,-0.367813,-1.658817,0.492483,0.891546,-2.120900,-0.331516,1.285271,,,
2014,IL-520,279.0,-0.216206,-0.227117,-0.232517,-0.216839,-0.229645,-0.220242,-0.054734,-0.220100,...,-0.398551,-1.791218,0.972924,0.896728,-2.482247,0.209736,1.236832,,,
2015,IL-520,316.0,-0.260657,-0.213995,-0.195899,-0.236752,-0.212569,-0.198753,-0.362869,-0.213570,...,-0.429290,-1.923619,1.453365,0.901910,-2.843594,0.750988,1.188393,,,
2016,IL-520,332.0,-0.291217,-0.210870,-0.206579,-0.265200,-0.229645,-0.160734,-0.397106,-0.177658,...,-0.460028,-2.056019,1.933806,0.907091,-3.204942,1.292240,1.188393,,,


# Data Exploration

### Using `ydata-profiling`

In [None]:
profile = ProfileReport(df, title="Time-series Report", tsmode=True, sortby='year')
profile.to_file("df_report.html")

### Manual exploration

In [None]:
coc_numbers = ['IL-500',
 'IL-501',
 'IL-502',
 'IL-503',
 'IL-504',
 'IL-506',
 'IL-507',
 'IL-508',
 'IL-509',
 'IL-511',
 'IL-512',
 'IL-513',
 'IL-514',
 'IL-515',
 'IL-516',
 'IL-517',
 'IL-518',
 'IL-519',
 'IL-520']

# data visualization for the total sheltered population per county in Illinois
for i in range(len(coc_numbers)):
  df_co = df_il[df_il['cocnumber']== coc_numbers[i]]
  plt.plot(df_co['year'],df_co['pit_tot_shelt_pit_hud'] / df_co['pit_tot_pit_hud'])
plt.legend(coc_numbers);
plt.ylabel('ratio sheltered')
plt.xlabel('year')
plt.title('Ratio of Sheltered Population per CoC')

In [None]:
# data visualization of the ratio of total population that is unsheltered per county in Illinois
for i in range(len(coc_numbers)):
  df_co = df_il[df_il['cocnumber']== coc_numbers[i]]
  plt.plot(df_co['year'],df_co['pit_tot_unshelt_pit_hud'] / df_co['pit_tot_pit_hud'])
plt.legend(coc_numbers);
plt.ylabel('ratio homeless')
plt.xlabel('year')
plt.title('Ratio of Unsheltered Population per CoC')

In [None]:
# data visualization of the ratio of total population that is homeless per county in Illinois
for i in range(len(coc_numbers)):
  df_co = df_il[df_il['cocnumber']== coc_numbers[i]]
  plt.plot(df_co['year'],df_co['pit_tot_hless_pit_hud'] / df_co['pit_tot_pit_hud'])
plt.legend(coc_numbers);
plt.ylabel('ratio homeless')
plt.xlabel('year')
plt.title('Ratio of Homeless Population per CoC')

In [None]:
# @title Mortality_Z-Score

from matplotlib import pyplot as plt
df_health_rankings['Mortality_Z-Score'].plot(kind='hist', bins=20, title='Mortality_Z-Score')
plt.gca().spines[['top', 'right',]].set_visible(False)

In [None]:
# @title Mortality_Z-Score vs Mortality_Rank

from matplotlib import pyplot as plt
df_health_rankings.plot(kind='scatter', x='Mortality_Z-Score', y='Mortality_Rank', s=32, alpha=.8)
plt.gca().spines[['top', 'right',]].set_visible(False)
plt.title("Mortality_Z-Score vs Mortality_Rank")

Now that we have the master dataset, we create a mapping from counties to CoCs and append by average ranking and average z-score by the counties in each CoC. (TODO).

# Correlation Analysis

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Replace NaN values with 0
df.fillna(0, inplace=True)
df

In [None]:
# Select only numeric columns
numeric_columns = df.select_dtypes(include=['number']).columns

numeric_columns = numeric_columns[1::]

# Extract numeric data
numeric_data = df[numeric_columns]

# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform only the numeric data
scaled_data_numeric = scaler.fit_transform(numeric_data)

# Replace the scaled numeric values back into the original DataFrame
df[numeric_columns] = scaled_data_numeric
df

In [None]:
# Perform PCA
pca = PCA(n_components=4)
pca.fit(scaled_data_numeric)

In [None]:
# display PCA

# Explained variance ratio
explained_variance = pca.explained_variance_ratio_
print("Explained Variance Ratio:", explained_variance)

# Cumulative explained variance
cumulative_variance = explained_variance.cumsum()
print("Cumulative Explained Variance:", cumulative_variance)

# Determine the number of components to retain
n_components = len(cumulative_variance[cumulative_variance <= 0.95])
print("Number of components to retain:", n_components)

# Apply dimensionality reduction
pca = PCA(n_components=n_components)
pca_data = pca.fit_transform(scaled_data_numeric)

# Create a DataFrame for the principal components
pca_df = pd.DataFrame(data=pca_data, columns=[f"PC{i+1}" for i in range(n_components)])

# Concatenate original DataFrame with PCA DataFrame if needed
# pca_df = pd.concat([df, pca_df], axis=1)

# Plotting
plt.figure(figsize=(8, 6))
plt.bar(range(1, n_components + 1), explained_variance, alpha=0.5, align='center', label='Individual Explained Variance')
plt.step(range(1, n_components + 1), cumulative_variance, where='mid', label='Cumulative Explained Variance')
plt.xlabel('Principal Components')
plt.ylabel('Explained Variance Ratio')
plt.title('Scree Plot')
plt.legend()
plt.show()

In [None]:
# get component list

# Get the principal component vectors
components = pca.components_


# Create a DataFrame to display the component loadings
component_loadings = pd.DataFrame(components, columns=df[numeric_columns].columns)

# Display the component loadings
print("Component Loadings:")
print(component_loadings)

In [None]:
# another way to print
# Print loadings for each component
count = 0
for i, component in enumerate(df[numeric_columns].iterrows(), 1):
    count +=1
    print(f"Principal Component {i} Loadings:")
    print(component[1])
    print()  # for better readability
    if(count > 5):
      break

# Correlation Matrix

In [None]:
# Compute pairwise correlation of columns
correlation_matrix = df[numeric_columns].corr()

# Print correlation matrix
print("Correlation Matrix:")
print(correlation_matrix)

In [None]:
# use a single column (unsheltered)
specific_column = 'pit_tot_unshelt_pit_hud'  # Replace 'column_name' with the name of your specific column
correlations_with_specific_column = correlation_matrix[specific_column]

plt.figure(figsize=(15, 10))
sns.barplot(x=correlations_with_specific_column.index, y=correlations_with_specific_column.values)
plt.title(f'Correlations with {specific_column}')
plt.xlabel('Variables')
plt.ylabel('Correlation')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()





In [None]:
top_pos_correlations = correlations_with_specific_column.sort_values(ascending=False).head(30)

# Remove the correlation of the specific column with itself
top_pos_correlations = top_pos_correlations[top_pos_correlations.index != specific_column]
plt.figure(figsize=(10, 6))
sns.barplot(x=top_pos_correlations.index, y=top_pos_correlations.values)
plt.title(f'Top 30 Most Positively Correlated with {specific_column}')
plt.xlabel('Variables')
plt.ylabel('Absolute Correlation')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


In [None]:
top_neg_correlations = correlations_with_specific_column.sort_values(ascending=True).head(30)

# Remove the correlation of the specific column with itself
top_neg_correlations = top_neg_correlations[top_neg_correlations.index != specific_column]
plt.figure(figsize=(10, 6))
sns.barplot(x=top_neg_correlations.index, y=top_neg_correlations.values)
plt.title(f'Top 30 Most Negatively Correlated with {specific_column}')
plt.xlabel('Variables')
plt.ylabel('Absolute Correlation')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:

# Set up the matplotlib figure
plt.figure(figsize=(10, 8))

# Plot heatmap
sns.heatmap(correlations_with_specific_column, annot=True, cmap='coolwarm', fmt=".2f")

# Add title and rotate tick labels
plt.title("Correlation Matrix")
plt.xticks(rotation=45)
plt.yticks(rotation=45)

# Show plot
plt.show()