# Examining the Homeless Population of the United States
By Andrew Watkins

## Machine Learning

In [1]:
#Data Wrangling from previous section
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Name the files to use
file_hic_state = 'data/2007-2017-HIC-Counts-by-State.xlsx'
file_pit_state = 'data/2007-2017-PIT-Counts-by-State.xlsx'

# Load spreadsheet
hic_state = pd.ExcelFile(file_hic_state)
pit_state = pd.ExcelFile(file_pit_state)

#HIC
df_hic = pd.DataFrame()

#concat the rest of the sheets to the original df
for sheet in range(2007, 2018): 
    excel_sheet = hic_state.parse(str(sheet), header=1)
    excel_sheet['year'] = str(sheet)
    excel_sheet.set_index(['year','State'], inplace=True)
    df_hic = pd.concat([df_hic, excel_sheet], axis=0, ignore_index=False, sort=True)
    
#Columns were named differenlty before 2014 so we will use all the named variations and then combine them.
columns_to_use = [ "Total Year-Round Beds (ES)",
                   "Total Year-Round ES Beds",
                   "Total Year-Round Beds (TH)",
                   "Total Year-Round TH Beds",
                   "Total Year-Round Beds (SH)",
                   "Total Year-Round SH Beds",
                   "Total Year-Round Beds (PSH)",
                   "Total Year-Round PSH Beds",
                   "Total Year-Round Beds (RRH)",
                   "Total Year-Round RRH Beds",
                   "Total Year-Round Beds (DEM)",
                   "Total Year-Round Beds (OPH)",
]
df_hic = df_hic[columns_to_use]
df_hic.fillna(0.0, inplace=True)


#Here we combine the coumns that contain the same informaiton but were named differently. 
df_hic['Total Year-Round Beds (ES)'] = df_hic['Total Year-Round Beds (ES)'] + df_hic['Total Year-Round ES Beds']
df_hic['Total Year-Round Beds (TH)'] = df_hic['Total Year-Round Beds (TH)'] + df_hic['Total Year-Round TH Beds']
df_hic['Total Year-Round Beds (SH)'] = df_hic['Total Year-Round Beds (SH)'] + df_hic['Total Year-Round SH Beds']
df_hic['Total Year-Round Beds (PSH)'] = df_hic['Total Year-Round Beds (PSH)'] + df_hic['Total Year-Round PSH Beds']
df_hic['Total Year-Round Beds (RRH)'] = df_hic['Total Year-Round Beds (RRH)'] + df_hic['Total Year-Round RRH Beds']

#We drop the extra columns we no longer need.
cols_to_drop = ['Total Year-Round ES Beds',
                'Total Year-Round TH Beds',
                'Total Year-Round SH Beds',
                'Total Year-Round PSH Beds',
                'Total Year-Round RRH Beds']
df_hic.drop(cols_to_drop, axis=1, inplace=True)

#PIT
#Create a new DF with the rest of the sheets. Which are the homeless population in each state from 2007-2017
df_pit = pd.DataFrame()

#concat the rest of the sheets to the original df
for sheet in range(2007, 2018):
    excel_sheet = pit_state.parse(str(sheet))
    excel_sheet['year'] = str(sheet)
    excel_sheet.set_index(['year','State'], inplace=True)
    #rename the columns
    cols_to_use = []
    for column in excel_sheet.columns:
        if column.__contains__(','):
            cols_to_use.append(column.split(',')[0])
        else:
            cols_to_use.append(column)
    excel_sheet.columns = cols_to_use
    df_pit = pd.concat([df_pit, excel_sheet], axis=0, ignore_index=False, sort=True)
    
#There are only two with the note. So we just drop them directly. 
df_pit.drop(level=1, inplace=True, index='Note: The number of CoCs in 2016 was 402. However, MO-604 merged in 2016 and covers territory in both MO and KS, contributing to the PIT count in both states. ')
df_pit.drop(level=1, inplace=True, index='Note: The number of CoCs in 2017 was 399. However, MO-604 merged in 2016 and covers territory in both MO and KS, contributing to the PIT count in both states. ')

#Grab the 2 columns that we need for now
df_pit = df_pit[['Total Homeless', 'Number of CoCs']]
df_pit.fillna(0, inplace=True)
df_pit.replace(to_replace='.', value='0', inplace=True)
df_pit = df_pit.astype(float, copy=False)

#Merge both DF (PIT and HIC) and drop some unecessary columns
df = pd.concat([df_hic, df_pit], axis=1)
df.drop(index=['Total', 'KS*', ' ', 'MP'], level=1, inplace=True)

#Create an aggregate column of the total of all the beds.
df['Total Beds'] = df_hic['Total Year-Round Beds (ES)'] + df_hic['Total Year-Round Beds (TH)'] + df_hic['Total Year-Round Beds (SH)'] + df_hic['Total Year-Round Beds (PSH)'] + df_hic['Total Year-Round Beds (RRH)'] + df_hic['Total Year-Round Beds (DEM)'] + df_hic['Total Year-Round Beds (OPH)']
df.fillna(0, inplace=True)

ML Models
-Linear Regression
-Logistic Regression (How to divide/classify the total amount of beds?)
-Decision Tree
-SVM
-Naive Bayes
-K-Means
-Random Forests

# Section:  Linear Regression

In [2]:
#additional imports needed for ML
from sklearn.model_selection import train_test_split

In [3]:
#We now need to create a train-test split in our data that we will use on all our models.
X=df['Total Homeless'].values.reshape(-1,1)
y=df['Total Beds'].values.reshape(-1,1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [4]:
#drop CA and NY
df_min = df.drop(index=['CA', 'NY'], level=1)

#We now need to create a train-test split in our data that we will use on all our models.
X_min=df_min['Total Homeless'].values.reshape(-1,1)
y_min=df_min['Total Beds'].values.reshape(-1,1)

X_train_min, X_test_min, y_train_min, y_test_min = train_test_split(X_min, y_min, test_size = 0.2, random_state=42)

## Linear Regression

In [5]:
from sklearn import linear_model
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_curve
from patsy.builtins import *

In [6]:
# Create linear regression object
model = linear_model.LinearRegression()

# Train the model using the training sets and check score
model.fit(X_train, y_train)

#Predict Output
y_pred = model.predict(X_test)

# Compute and print R^2 and RMSE
print("R^2: {}".format(model.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.8942869051025861
Root Mean Squared Error: 5203.300913404746


In [7]:
model = ols("Q('Total Homeless') ~ Q('Total Beds')", df).fit()
print(model.summary())

                             OLS Regression Results                            
Dep. Variable:     Q('Total Homeless')   R-squared:                       0.820
Model:                             OLS   Adj. R-squared:                  0.820
Method:                  Least Squares   F-statistic:                     2696.
Date:                 Tue, 05 Mar 2019   Prob (F-statistic):          1.37e-222
Time:                         14:27:55   Log-Likelihood:                -6211.8
No. Observations:                  594   AIC:                         1.243e+04
Df Residuals:                      592   BIC:                         1.244e+04
Df Model:                            1                                         
Covariance Type:             nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept       -1054.4702    41

 
 
### We now do the same operations with the min df.
 
 

In [8]:
# Create linear regression object
model = linear_model.LinearRegression()

# Train the model using the training sets and check score
model.fit(X_train_min, y_train_min)

#Predict Output
y_pred_min = model.predict(X_test_min)

# Compute and print R^2 and RMSE
print("R^2: {}".format(model.score(X_test_min, y_test_min)))
rmse_min = np.sqrt(mean_squared_error(y_test_min, y_pred_min))
print("Root Mean Squared Error: {}".format(rmse_min))

R^2: 0.6921948861246401
Root Mean Squared Error: 5248.060219215974


In [9]:
model = ols("Q('Total Homeless') ~ Q('Total Beds')", df_min).fit()
print(model.summary())

                             OLS Regression Results                            
Dep. Variable:     Q('Total Homeless')   R-squared:                       0.665
Model:                             OLS   Adj. R-squared:                  0.665
Method:                  Least Squares   F-statistic:                     1134.
Date:                 Tue, 05 Mar 2019   Prob (F-statistic):          1.18e-137
Time:                         14:27:55   Log-Likelihood:                -5678.6
No. Observations:                  572   AIC:                         1.136e+04
Df Residuals:                      570   BIC:                         1.137e+04
Df Model:                            1                                         
Covariance Type:             nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept         341.0452    30

## Decission Tree

In [10]:
#Import Library
from sklearn import tree
from sklearn.metrics import accuracy_score

In [11]:
# Create tree object 
model = tree.DecisionTreeRegressor() #for regression

#Train the model using the training sets and check score
model.fit(X_train, y_train)

#Predict Output
y_pred= model.predict(X_test)

# Compute and print R^2 and RMSE
print("R^2: {}".format(model.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.7557748670573805
Root Mean Squared Error: 7908.784208559684


In [12]:
# Create tree object 
model = tree.DecisionTreeRegressor()

#Train the model using the training sets and check score
model.fit(X_train_min, y_train_min)

#Predict Output
y_pred_min = model.predict(X_test_min)

# Compute and print R^2 and RMSE
print("R^2: {}".format(model.score(X_test_min, y_test_min)))
rmse = np.sqrt(mean_squared_error(y_test_min, y_pred_min))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.5423021831180048
Root Mean Squared Error: 6399.562940239509


## KNN

In [13]:
#Import Library
from sklearn.neighbors import KNeighborsClassifier

In [14]:
# Create KNeighbors classifier object model 
model = KNeighborsClassifier(n_neighbors=6) # default value for n_neighbors is 5

# Train the model using the training sets and check score
model.fit(X_train, y_train.ravel())

#Predict Output
predicted= model.predict(X_test)

# Compute and print R^2 and RMSE
print("R^2: {}".format(model.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.0
Root Mean Squared Error: 7908.784208559684


WHat threshold or category should I create?

In [40]:
# Create KNeighbors classifier object model 
model = KNeighborsClassifier(n_neighbors=6) # default value for n_neighbors is 5

# Train the model using the training sets and check score
model.fit(X_train_min, y_train_min.ravel())

#Predict Output
y_pred = model.predict(X_test_min)

# Compute and print R^2 and RMSE
print("R^2: {}".format(model.score(X_test_min, y_test_min)))
rmse = np.sqrt(mean_squared_error(y_test_min, y_pred_min))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.0
Root Mean Squared Error: 6399.562940239509


## Random Forest

In [39]:
#Import Library
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [17]:
# Create Random Forest object
model = RandomForestClassifier()

# Train the model using the training sets and check score
model.fit(X_train, y_train.ravel())

#Predict Output
y_pred = model.predict(X_test)

# Compute and print R^2 and RMSE
print("R^2: {}".format(model.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.0
Root Mean Squared Error: 7939.82140169137


In [38]:
# Create Random Forest object
model = RandomForestClassifier()

# Train the model using the training sets and check score
model.fit(X_train_min, y_train_min.ravel())

#Predict Output
y_pred = model.predict(X_test_min)

# Compute and print R^2 and RMSE
print("R^2: {}".format(model.score(X_test_min, y_test_min)))
rmse = np.sqrt(mean_squared_error(y_test_min, y_pred_min))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.008695652173913044
Root Mean Squared Error: 6399.562940239509


# Section: Logistic Regression

To be able to do some logistic regression we need to create or determine a threshold for our data to be able to determine if the threshold is met. The threshold we are going to use is 'Able', we want to classify if the state is able to handle the Homeless popualtion based on the beds they have. 

In [20]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Total Year-Round Beds (ES),Total Year-Round Beds (TH),Total Year-Round Beds (SH),Total Year-Round Beds (PSH),Total Year-Round Beds (RRH),Total Year-Round Beds (DEM),Total Year-Round Beds (OPH),Total Beds
year,State,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2007,AK,1095.0,663.0,0.0,489.0,0.0,0.0,0.0,2247.0
2007,AL,1766.0,2607.0,0.0,2420.0,0.0,0.0,0.0,6793.0
2007,AR,1483.0,1109.0,0.0,1538.0,0.0,0.0,0.0,4130.0
2007,AZ,3736.0,5597.0,0.0,3019.0,0.0,0.0,0.0,12352.0
2007,CA,20181.0,30897.0,0.0,26787.0,0.0,0.0,0.0,77865.0


In [74]:
#We now need to create a train-test split in our data that we will use on all our models.
#X=df[['Total Year-Round Beds (ES)','Total Year-Round Beds (TH)','Total Year-Round Beds (SH)','Total Year-Round Beds (PSH)','Total Year-Round Beds (RRH)','Total Year-Round Beds (DEM)','Total Year-Round Beds (OPH)']]#.values.reshape(-1,1)
#y=df['Total Beds']#.values.reshape(-1,1)
X=df_test
y=df_test.Program

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42, shuffle=False)

In [75]:
print(X.shape)
print(y.shape)

(4158, 2)
(4158,)


In [77]:
#print(y_pred)
print('X_train: ',X_train)
print('y_train: ',y_train)
print('X_test: ',X_test)
print('y_test: ',y_test)

X_train:                                  Program     Beds
year State                                      
2007 AK      Total Year-Round Beds (ES)   1095.0
     AL      Total Year-Round Beds (ES)   1766.0
     AR      Total Year-Round Beds (ES)   1483.0
     AZ      Total Year-Round Beds (ES)   3736.0
     CA      Total Year-Round Beds (ES)  20181.0
     CO      Total Year-Round Beds (ES)   2780.0
     CT      Total Year-Round Beds (ES)   2331.0
     DC      Total Year-Round Beds (ES)   3001.0
     DE      Total Year-Round Beds (ES)    515.0
     FL      Total Year-Round Beds (ES)   8913.0
     GA      Total Year-Round Beds (ES)   4176.0
     GU      Total Year-Round Beds (ES)     79.0
     HI      Total Year-Round Beds (ES)   1081.0
     IA      Total Year-Round Beds (ES)   1338.0
     ID      Total Year-Round Beds (ES)    746.0
     IL      Total Year-Round Beds (ES)   5204.0
     IN      Total Year-Round Beds (ES)   3966.0
     KS      Total Year-Round Beds (ES)   1727.0
     KY   

## Logistic Regression

In [25]:
#Import Library
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve

In [76]:
# Create logistic regression object
model = LogisticRegression()

# Train the model using the training sets and check score
model.fit(X_train, y_train)

#Predict Output
y_pred = model.predict(X_test)

#Print Accuracy
print('Accuracy: ', accuracy_score(y_pred, y_test))

ValueError: could not convert string to float: 'Total Year-Round Beds (DEM)'

In [27]:
print(y_pred)

[  2687.  42906.  42906.  37496.   7556.   7556.   3646.  12056.  42906.
  37496.  37496.  37496.  33441.  33214. 104568. 104568.  42906.   9363.
   3646.  37496.  42906.  18557.    258.  38884.   5140.  17018.  95752.
  42906.  37496.  15029.  17127.   3646.  14366.  19995.   2894.   7744.
  12771.   2074.   4595.  17018.  37496.   4595.  37496.  15029.   4662.
  11207.    258.  29804.  37496.  27597.  33441.  13557.   3139.  42906.
  37496.  42906.  42906.  16021.   3646.   7556.  27597.  12056.  42906.
   7556.  37496.  37496.  37496.   3253.  12771. 104568.  42906.   9363.
   3646.   4595.  42906. 104568.   3646.  27597.   4595.  37496. 104568.
  37496.   5323.  37496.   5667.   3646.  14366.  37496.   3646.   7744.
  12771.   3646.   3646.   3139.   1728.   3646.  37496.  29935.  16021.
  26164.   3646.  16021.   7556.  27597.  33441.  14366.   3139.  37496.
   3646.  15029.  12771.   3646.   3646.   7556.   3646.  12056.   7556.
   3139.  37496.]


## KNN

In [28]:
#Import Library
from sklearn.neighbors import KNeighborsClassifier

In [29]:
# Create KNeighbors classifier object model 
model = KNeighborsClassifier(n_neighbors=6) # default value for n_neighbors is 5

# Train the model using the training sets and check score
model.fit(X_train, y_train)

#Predict Output
y_pred = model.predict(X_test)

#Print Accuracy
print('Accuracy: ', accuracy_score(y_pred, y_test))

Accuracy:  0.0


## K-Means

In [30]:
#Import Library
from sklearn.cluster import KMeans

In [31]:
# Create KNeighbors classifier object model 
model = KMeans(n_clusters=100, random_state=0)

# Train the model using the training sets and check score
model.fit(X_train)

#Predict Output
y_pred = model.predict(X_test)

#Print Accuracy
print('Accuracy: ', accuracy_score(y_pred, y_test))

Accuracy:  0.0


## Descision Tree

In [32]:
# Create tree object 
model = tree.DecisionTreeRegressor() #for regression

#Train the model using the training sets and check score
model.fit(X_train, y_train)

#Predict Output
y_pred= model.predict(X_test)

#Print Accuracy
print('Accuracy: ', accuracy_score(y_pred, y_test))

Accuracy:  0.0


## SVM

In [33]:
#Import Library
from sklearn import svm

In [34]:
# Create SVM classification object 
model = svm.SVC()

# Train the model using the training sets and check score
model.fit(X_train, y_train)

#Predict Output
predicted= model.predict(X_test)

#Print Accuracy
print('Accuracy: ', accuracy_score(y_pred, y_test))

Accuracy:  0.0


## Naive Bayes

In [35]:
#Import Library
from sklearn.naive_bayes import GaussianNB

In [36]:
# Create SVM classification object 
model = GaussianNB()

# Train the model using the training sets and check score
model.fit(X_train, y_train)

#Predict Output
y_pred= model.predict(X_test)

#Print Accuracy
print('Accuracy: ', accuracy_score(y_pred, y_test))

Accuracy:  0.0


## Random Forest

In [37]:
# Create Random Forest object
model = RandomForestClassifier()

# Train the model using the training sets and check score
model.fit(X_train, y_train)

#Predict Output
y_pred = model.predict(X_test)

#Print Accuracy
print('Accuracy: ', accuracy_score(y_pred, y_test))

Accuracy:  0.0
