<a href="https://colab.research.google.com/github/ananyaa06/Create-A-Thon-Fantastic-4/blob/main/Homelessness_Prediction_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data source (this page also links to the data dictionary): https://www.huduser.gov/portal/datasets/hpmd.html

In [None]:
import pandas as pd

In [None]:
# Read the analysis-file CSV published on huduser.gov into a DataFrame.
dataset = pd.read_csv(
    filepath_or_buffer="https://www.huduser.gov/portal/sites/default/files/xls/05b_analysis_file_update.csv"
)

# FUNCTIONS

In [None]:
def generate_data(outcome_var, data=None, nan_threshold=300):
  """Build a modelling frame: cleaned feature columns followed by one outcome column.

  Steps:
    1. Split the columns into identifiers, outcome candidates and features.
    2. Drop every feature with more than `nan_threshold` missing values.
    3. Fill the remaining missing feature values with the rounded column mean.
    4. Append `outcome_var` as the last column and drop rows that still hold NaN.

  Args:
    outcome_var: Name of the outcome column to predict. Must be one of the
      outcome candidates identified below.
    data: Source DataFrame. Defaults to the notebook-level `dataset`.
      The frame passed in is never modified.
    nan_threshold: A feature is discarded when its NaN count is strictly
      greater than this value.

  Returns:
    A new DataFrame whose last column is `outcome_var`.

  Raises:
    KeyError: If `outcome_var` is not one of the outcome candidates.
  """
  source_df = dataset if data is None else data
  column_names = list(source_df.keys())

  # CREATE LISTS FOR OUTCOME VARIABLES
  # NOTE: three groups are selected by column POSITION, so this relies on the
  # source file keeping its current column order.
  outcomes = column_names[2:5] + ["pit_miss",
                                  "odd_flag",
                                  "pit_hless_balance",
                                  "pit_shelt_balance",
                                  "pit_unshelt_balance",
                                  "unbalance_flag",
                                  "pit_shelt_pit_hud_share",
                                  "pit_unshelt_pit_hud_share",
                                  "pit_hless_pit_hud_share",
                                  "missing"]
  secondary_outcomes = column_names[5:14] + column_names[17:22]
  identifiers = ["year", "cocnumber", "coctag", "panelvar", "state_abr"]

  if outcome_var not in outcomes + secondary_outcomes:
    raise KeyError(f"{outcome_var!r} is not one of the available outcome columns")

  # ISOLATE TRAINING FEATURES FROM TOTAL DATASET (returns a new frame)
  features_df = source_df.drop(columns=identifiers + outcomes + secondary_outcomes)

  # IDENTIFY (AND DROP) FEATURES WITH MANY NAN VALUES
  nan_counts = features_df.isna().sum()
  features_df = features_df.drop(columns=nan_counts[nan_counts > nan_threshold].index)

  # FILL IN THE FEW REMAINING NAN VALUES WITH THE ROUNDED COLUMN-WISE AVERAGE
  # One frame-level fillna replaces the per-column `df[col].fillna(inplace=True)`
  # pattern, which acts on a temporary object under pandas Copy-on-Write and
  # leaves the frame unchanged. Columns whose mean is itself NaN are skipped.
  fill_values = features_df.mean(numeric_only=True).round().dropna()
  features_df = features_df.fillna(fill_values)

  # ADDING OUTCOME VAR TO THE END OF THE DATASET
  features_df[outcome_var] = source_df[outcome_var]

  # DROP THE ROWS WHERE THE OUTCOME VAR (OR ANY UNFILLED FEATURE) IS NAN
  return features_df.dropna()


In [None]:
def get_train_test_data(data, test_size=0.25, random_state=6):
  """Split a frame into standardised train/test features and their targets.

  The last column of `data` is treated as the target; every other column is a
  feature. The scaler is fitted on the training features only and then applied
  to the test features, so no test-set statistics reach the training data.

  Args:
    data: DataFrame whose last column is the variable to predict.
    test_size: Passed to `train_test_split`; share of rows held out for testing.
    random_state: Passed to `train_test_split`; seed that makes the split repeatable.

  Returns:
    Tuple `(X_train, X_test, y_train, y_test)` of arrays.
  """
  from sklearn.model_selection import train_test_split
  from sklearn.preprocessing import StandardScaler

  # Features are all columns but the last; the target is the last column.
  X = data.iloc[:, :-1].values
  y = data.iloc[:, -1].values

  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=test_size, random_state=random_state
  )

  # Fit on train only, then reuse the fitted scaling for the test features.
  sc = StandardScaler()
  X_train = sc.fit_transform(X_train)
  X_test = sc.transform(X_test)

  return X_train, X_test, y_train, y_test

In [None]:
# function for training and testing different models

def train_and_test_func(X_train, X_test, y_train, y_test):
  """Fit several linear regressors and print each one's score on the test split.

  Every estimator is created with its default settings, fitted on
  `(X_train, y_train)` and scored with `.score(X_test, y_test)`; the value is
  printed next to the model label under an "R2 Score:" header. Nothing is
  returned.
  """
  from sklearn.linear_model import LinearRegression, ElasticNet, Ridge, Lasso, BayesianRidge

  label_width = 20

  # (printed label, estimator class) in the order they are reported.
  candidates = (
      ("Linear Regression:", LinearRegression),
      ("ElasticNet:", ElasticNet),
      ("Ridge:", Ridge),
      ("Lasso:", Lasso),
      ("Bayesian Ridge:", BayesianRidge),
  )

  print("Model:".ljust(label_width), "R2 Score:")
  print()

  for label, estimator_cls in candidates:
    estimator = estimator_cls()
    estimator.fit(X_train, y_train)
    print(label.ljust(label_width), estimator.score(X_test, y_test))

  print()

# MODEL EXPERIMENTATION

In [None]:
# PRINT NAN COUNT FOR EACH OF THE POSSIBLE OUTCOME VARIABLES

all_columns = list(dataset.keys())

# First outcome group: three columns picked by position plus columns picked by name.
outcomes = all_columns[2:5] + [
    "pit_miss",
    "odd_flag",
    "pit_hless_balance",
    "pit_shelt_balance",
    "pit_unshelt_balance",
    "unbalance_flag",
    "pit_shelt_pit_hud_share",
    "pit_unshelt_pit_hud_share",
    "pit_hless_pit_hud_share",
    "missing",
]

# Second outcome group: two positional slices of the column list.
secondary_outcomes = all_columns[5:14] + all_columns[17:22]

possible_outcomes_df = dataset[outcomes + secondary_outcomes]

#--
print("Key:".ljust(35), "NaN Count")
print()

# Count missing values once for the whole frame, then print one row per column.
for key, nan_count in possible_outcomes_df.isna().sum().items():
  print(key.ljust(35), nan_count)


Key:                                NaN Count

pit_tot_shelt_pit_hud               14
pit_tot_unshelt_pit_hud             14
pit_tot_hless_pit_hud               14
pit_miss                            0
odd_flag                            0
pit_hless_balance                   0
pit_shelt_balance                   0
pit_unshelt_balance                 0
unbalance_flag                      0
pit_shelt_pit_hud_share             14
pit_unshelt_pit_hud_share           14
pit_hless_pit_hud_share             14
missing                             0
pit_ind_shelt_pit_hud               14
pit_ind_unshelt_pit_hud             14
pit_ind_hless_pit_hud               14
pit_perfam_shelt_pit_hud            14
pit_perfam_unshelt_pit_hud          14
pit_perfam_hless_pit_hud            14
pit_ind_chronic_hless_pit_hud       14
pit_perfam_chronic_hless_pit_hud    1137
pit_vet_hless_pit_hud               387
hou_pol_totalind_hud                2259
hou_pol_totalday_hud                2259
hou_pol_totalexit

In [None]:
# GENERATE DATA BASED ON OUTCOME VARIABLE
# generate_data() returns the cleaned features with the chosen outcome appended as the last column.
DATA = generate_data(outcome_var = "pit_ind_chronic_hless_pit_hud") # this outcome variable can be adjusted to whatever we want

In [None]:
# Split DATA into scaled train/test feature arrays and the matching target arrays.
X_train, X_test, y_train, y_test = get_train_test_data(DATA)

In [None]:
# Fit each linear model on the training split and print its score on the test split.
train_and_test_func(X_train, X_test, y_train, y_test)

Model:               R2 Score:

Linear Regression:   0.8288487927438104
ElasticNet:          0.7354505066786698
Ridge:               0.8206148114592273
Lasso:               0.8192637107304898
Bayesian Ridge:      0.8188413218809069



# Code Graveyard

In [None]:
# options for our predicted variables
column_names = list(dataset.keys())

# Outcome columns that are referenced by name rather than by position.
named_outcomes = [
    "pit_miss",
    "odd_flag",
    "pit_hless_balance",
    "pit_shelt_balance",
    "pit_unshelt_balance",
    "unbalance_flag",
    "pit_shelt_pit_hud_share",
    "pit_unshelt_pit_hud_share",
    "pit_hless_pit_hud_share",
    "missing",
]

outcomes = column_names[2:5] + named_outcomes
secondary_outcomes = column_names[5:14] + column_names[17:22]

# Show both groups, one list per line.
for outcome_group in (outcomes, secondary_outcomes):
  print(outcome_group)

['pit_tot_shelt_pit_hud', 'pit_tot_unshelt_pit_hud', 'pit_tot_hless_pit_hud', 'pit_miss', 'odd_flag', 'pit_hless_balance', 'pit_shelt_balance', 'pit_unshelt_balance', 'unbalance_flag', 'pit_shelt_pit_hud_share', 'pit_unshelt_pit_hud_share', 'pit_hless_pit_hud_share', 'missing']
['pit_ind_shelt_pit_hud', 'pit_ind_unshelt_pit_hud', 'pit_ind_hless_pit_hud', 'pit_perfam_shelt_pit_hud', 'pit_perfam_unshelt_pit_hud', 'pit_perfam_hless_pit_hud', 'pit_ind_chronic_hless_pit_hud', 'pit_perfam_chronic_hless_pit_hud', 'pit_vet_hless_pit_hud', 'hou_pol_totalind_hud', 'hou_pol_totalday_hud', 'hou_pol_totalexit_hud', 'hou_pol_numret6mos_hud', 'hou_pol_numret12mos_hud']


In [None]:
# Subset of `dataset` holding every candidate outcome column from both lists.
possible_outcomes_df = dataset[outcomes + secondary_outcomes]

In [None]:
# isolate training features (dropping identifiers and outcome columns)
identifier_columns = ["year", "cocnumber", "coctag", "panelvar", "state_abr"]
features_df = dataset.drop(columns=identifier_columns + outcomes + secondary_outcomes)

In [None]:
# Identifying features with lots of NaN (missing) values:
# count the missing entries per column once, then keep the names above the cut-off of 300.
missing_per_feature = features_df.isna().sum()
NaN_features = list(missing_per_feature[missing_per_feature > 300].index)

In [None]:
# Dropping the NaN features.
# Rebinding (instead of inplace=True) together with errors="ignore" lets this
# cell be re-run without a KeyError once the columns are already gone.
features_df = features_df.drop(columns=NaN_features, errors="ignore")
# features_df

In [None]:
# filling the remaining NaNs with the (rounded) mean of the column
# A single frame-level fillna is used because `df[col].fillna(..., inplace=True)`
# operates on a temporary object under pandas Copy-on-Write and would leave
# features_df unchanged. Columns whose mean is NaN are skipped.
column_fill_values = features_df.mean(numeric_only=True).round().dropna()
features_df = features_df.fillna(column_fill_values)

In [None]:
# Adding our outcome var to the end of the dataset
# `outcome_var` is assigned here so this cell runs on a fresh kernel instead of
# raising NameError when no earlier cell has defined it.
outcome_var = "pit_tot_shelt_pit_hud"
features_df[outcome_var] = possible_outcomes_df[outcome_var]
# features_df

In [None]:
# Outcome column used by the cells in this "Code Graveyard" section.
# NOTE(review): confirm this assignment runs before any cell that reads `outcome_var`.
outcome_var = "pit_tot_shelt_pit_hud"

In [None]:
# Drop the rows that still contain NaN (e.g. the 14 rows missing the outcome var).
# The result gets its own name: rebinding `dataset` here would replace the raw
# table that the cells above (and generate_data) read from, breaking any re-run.
final_df = features_df.dropna()

In [None]:
# I'VE COMMENTED THIS OUT FOR NOW BECAUSE 0 COLUMNS WERE LABEL ENCODED
# (still keeping the code in case we need it later)

# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# le_count = 0
# for col in DATA.columns[1:]:
#     if DATA[col].dtype == 'object':
#         if len(list(DATA[col].unique())) <= 2:
#             le.fit(DATA[col])
#             DATA[col] = le.transform(DATA[col])
#             le_count += 1
# print('{} columns were label encoded.'.format(le_count))

0 columns were label encoded.


In [None]:
# I'VE COMMENTED THIS OUT FOR NOW BECAUSE 0 COLUMNS WERE ONE-HOT ENCODED
# (still keeping the code in case we need it later)

# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder

# ct_count = 0
# for col in DATA.columns[1:]:
#     if DATA[col].dtype == 'object':
#         if len(list(DATA[col].unique())) >= 2:
#           DATA = pd.concat([DATA,pd.get_dummies(DATA[col], prefix=col)], axis=1)
#           DATA.drop([col],axis=1, inplace=True)
#           ct_count += 1
# print('{} columns were one-hot encoded.'.format(ct_count))


0 columns were one-hot encoded.
