# Feature Matrix and Machine Learning
_Calvin Whealton_

This notebook combines the cleaned variables into a feature matrix. The features in the matrix include the population density, median income, US GDP (quarterly), and time series of month-over-month change in Zillow Housing Value Index (ZHVI). This feature matrix will be used in the machine learning portion of this project.

In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
import os
import matplotlib.pyplot as plt
import datetime
from scipy.stats import norm
import pickle

## Datasets for Zip Codes, Housing Data, and Peak Flows

This section loads many of the input files. Initially, only the zip codes with flood and housing data are considered as viable.

### Zip Code Shapefile

In [None]:
#shapefile is too large to be uploaded to github
#shapefile available from https://drive.google.com/file/d/1yTwgTfbYZirtNQOIfgQVDY4Tc-QKDVTb/view?usp=sharing
os.chdir('/Users/calvinwhealton/Documents/GitHub/floods_housing_zipcode/data/geo_data/tl_2019_us_zcta510_clipped48contig')
zips_shapefile = gpd.read_file('clipped48contig.shp')

In [None]:
zips_shapefile.head()

In [None]:
zips_shapefile.shape

### Housing Data

Zillow Housing Value Index (ZHVI) that was processed to be a month-over-month percentage change. Values are indexed by the last day of the month. Only the year-month will be used when referencing the time later in the combining of data.

In [None]:
os.chdir('/Users/calvinwhealton/Documents/GitHub/floods_housing_zipcode/data/processed_data')
housing = pd.read_csv('zillow_mon_pct_val.csv')

In [None]:
housing.head()

In [None]:
housing.columns

In [None]:
# strip the day from the date column names (keep the first 7 characters, 'YYYY-MM')
# all affected columns are renamed in a single call
year_month_names = {col: col[0:7] for col in housing.columns[2:295]}
housing = housing.rename(columns=year_month_names)

In [None]:
housing.shape

In [None]:
housing.head()

### National Flood Insurance Claims Dataset

In [None]:
os.chdir('/Users/calvinwhealton/Documents/GitHub/floods_housing_zipcode/data/processed_data')
claims = pd.read_csv('ts_claims_month.csv')

### Flood Gages

Time series of return periods of floods. The return period is the inverse of the exceedance probability of the flood.

In [None]:
os.chdir('/Users/calvinwhealton/Documents/GitHub/floods_housing_zipcode/data/processed_data')
ret_pers = pd.read_csv('ts_rps_2020-08-15.csv')

In [None]:
os.chdir('/Users/calvinwhealton/Documents/GitHub/floods_housing_zipcode/data/processed_data')
zip_gage = pd.read_csv('zip_gage_dist_2020-08-10.csv')
zip_gage.head()

## Step 1: Finding Zip Codes with Floods

Loop through each zip code. Floods with return periods greater than 50 years are identified for gages associated with those zip codes. Return periods are assigned based on the log normal distribution fit with log-space method of moments.

Once the floods are identified for each gage, the mean day of flood for each year is determined across the gages. This is taken as the date of the flood.

In [None]:
# empty frame that will hold one row per identified flood:
# zip code, year and month of the flood, and the flood return period
ml_columns = ['zip', 'year', 'month', 'flood_rp']
data_ml = pd.DataFrame(columns=ml_columns)

In [None]:
# zip codes that appear in BOTH the shapefile and the housing table
shape_zips = set(zips_shapefile['GEOID10'].astype(int).values)
housing_zips = set(housing['GEOID10_str'].values)
zips_with_shape_housing = shape_zips & housing_zips

In [None]:
# used in extracting gage numbers
gage_num_cols = ['gage0','gage1','gage2','gage3','gage4','gage5','gage6','gage7','gage8','gage9']

# directory with one peak-flow file per gage, named '<gage number padded to 8 digits>.csv'
peak_flow_dir = '/Users/calvinwhealton/Documents/GitHub/floods_housing_zipcode/data/gage_data/peak_flows'

# change into the directory once (previously done for every gage); the working
# directory is deliberately left here because later cells use relative paths
os.chdir(peak_flow_dir)

# list the directory once and keep it as a set for fast membership tests
available_gage_files = set(os.listdir(peak_flow_dir))

# rows are collected in a list and turned into a dataframe once at the end
# (DataFrame.append copied the whole frame per row and no longer exists in pandas >= 2.0)
flood_records = []

# loop over zip codes
for z in list(zips_with_shape_housing):

    # set of gages pre-processed as relevant for zip code
    gages_for_zip = zip_gage.loc[zip_gage['GEOID10']==int(z),gage_num_cols]

    # zip code without an entry in the zip-gage table: nothing to look up
    if len(gages_for_zip) == 0:
        continue

    dates_check = []
    rps_check = []

    # loop over gages for the zip code
    for g in gages_for_zip.iloc[0].values:

        # pad the gage number to account for leading 0s
        gage_name = str(g).zfill(8)+'.csv'
        if gage_name not in available_gage_files:
            continue

        gage_data = pd.read_csv(gage_name, comment='#')

        # get peak flows and dates
        peaks = gage_data['peak_va'].values
        dates = gage_data['peak_dt'].values

        # drop nan values (missing flood record)
        keepers = ~np.isnan(peaks)
        peaks = peaks[keepers]
        dates = dates[keepers]

        # nothing recorded for this gage
        if len(peaks) == 0:
            continue

        # log-space mean and standard deviation for log-normal distribution
        ls_mean = np.mean(np.log(peaks))
        ls_sd = np.std(np.log(peaks))

        # calculation of return period, rp(x) = 1/(1- cdf(x))
        rps = 1/(1-norm.cdf(np.log(peaks),ls_mean,ls_sd))

        # because interested in extremes, dropping floods below 50-yr return period (2% exceedance)
        keep_extremes = (rps > 50)
        if not keep_extremes.any():
            continue
        rps_extremes = rps[keep_extremes]

        # imputing a month-day or day when it is 00
        # 00 would indicate that the value is either fairly old (e.g. 1863-00-00)
        # or that the gage does not automatically record values
        dates_extremes = (pd.Series(dates[keep_extremes])
                          .str.replace('-00-00', '-06-15', regex=False)
                          .str.replace('-00', '-15', regex=False))

        # convert to date-time index and keep only floods from 1997 through 2018
        datetime_extremes = pd.DatetimeIndex(dates_extremes)
        in_window = np.logical_and(datetime_extremes.year > 1996, datetime_extremes.year < 2019)

        # appending to list of dates and return periods for the zip code
        dates_check.extend(datetime_extremes[in_window])
        rps_check.extend(rps_extremes[in_window])

    # aggregate once per zip code, after all of its gages have been read
    # (this used to be recomputed inside the gage loop)
    if len(rps_check) == 0:
        continue

    # identifying dates and typical return periods
    temp_dates = pd.DataFrame({'date': dates_check, 'rps': rps_check})
    temp_dates['log_rp'] = np.log(temp_dates['rps'])
    temp_dates['year'] = pd.DatetimeIndex(temp_dates['date']).year

    # identified floods (one per calendar year):
    # mean date across gages and geometric mean of the return periods
    by_year = temp_dates.groupby('year')
    flood_dates = pd.DatetimeIndex(by_year['date'].agg(lambda x: x.mean()))
    flood_rps = np.exp(by_year['log_rp'].agg(lambda x: x.mean()))

    # reference information includes the zip code, year, and month
    for c in range(len(flood_dates)):
        flood_records.append({'zip': z,
                              'year': flood_dates.year.values[c],
                              'month': flood_dates.month.values[c],
                              'flood_rp': flood_rps.values[c]})

# build the dataframe in one step; re-running the cell no longer duplicates rows
data_ml = pd.DataFrame(flood_records, columns=['zip','year','month','flood_rp'])
    

In [None]:
# check to make sure it all looks okay
data_ml.head(100)

In [None]:
data_ml.reset_index(inplace=True)

In [None]:
data_ml.to_csv('data_ml_after_floods.csv')

## Step 2: Adding Housing Value

The mean day of the flood (year-month) is used to find the accompanying housing price data for the zip code. The months from 12 before to 12 after the flood are extracted. There is processing to handle NaNs.

In [None]:
# used in naming columns
housing_price_cols = ['h-12','h-11', 'h-10','h-09','h-08','h-07','h-06','h-05','h-04','h-03','h-02','h-01','h+00','h+01','h+02','h+03','h+04','h+05','h+06','h+07','h+08','h+09','h+10','h+11','h+12']

In [None]:
len(housing_price_cols)

In [None]:
# reading-in data already calculated
data_ml = pd.read_csv('data_ml_after_floods.csv')

In [None]:
# initializing housing price data in the machine learning dataframe
for h in housing_price_cols:
    data_ml[h] = 0

In [None]:
data_ml.head()

In [None]:
# loop to extract the housing data and paste it in to the machine learning dataframe
# take the year-month of the mean day of flood for the year and use that as the zero index
# take 12 months before and after that day (25 months in total)

# the housing columns were initialised with integer zeros; store the values as floats
data_ml[housing_price_cols] = data_ml[housing_price_cols].astype(float)

for ind in data_ml.index:
    # monthly period of the flood; adding an integer moves by whole months and
    # rolls over year boundaries correctly (the manual arithmetic used '>= 12'
    # and therefore produced a non-existent month '00' in place of December)
    flood_month = pd.Period(year=int(data_ml.loc[ind,'year']),
                            month=int(data_ml.loc[ind,'month']),
                            freq='M')
    date_list = [(flood_month + offset).strftime('%Y-%m') for offset in range(-12, 13)]

    # extracting housing price data for the zip code of this flood
    # reindex returns NaN for any month that is not a column of the housing table,
    # so such gaps are handled by the NaN processing that follows
    zip_rows = housing.loc[housing['GEOID10_str'].values==int(data_ml.loc[ind,'zip'])]
    housing_use = zip_rows.reindex(columns=date_list).iloc[0].values

    data_ml.loc[ind,housing_price_cols] = housing_use

In [None]:
# check that it looks okay
data_ml.head()

In [None]:
# dropping rows (flood events) that have more than one nan value
# these indicate large gaps in the housing price data for that zip code over the time period
nan_count_per_row = data_ml.isnull().sum(axis=1)
data_ml = data_ml.loc[nan_count_per_row <= 1]

In [None]:
data_ml.shape

In [None]:
data_ml.head()

In [None]:
data_ml.to_csv('data_ml_with_housing.csv')

In [None]:
data_ml = pd.read_csv('data_ml_with_housing.csv')

In [None]:
# cleaning up rows with exactly one missing (NaN) value
# when it is between two other values, the value is linearly interpolated
# when it is on the edge (first or last month), the value from the neighbor is used
first_col = housing_price_cols[0]
last_col = housing_price_cols[-1]

for i in data_ml.index:
    if data_ml.loc[i].isnull().sum() != 1:
        continue

    # position of the missing month within the housing columns
    missing = [h for h, col in enumerate(housing_price_cols) if pd.isnull(data_ml.loc[i, col])]
    if len(missing) == 0:
        # the single NaN is not in a housing column; nothing to impute here
        continue

    h = missing[0]
    col = housing_price_cols[h]

    if col == first_col:
        # no earlier month available: copy the following month
        data_ml.loc[i, col] = data_ml.loc[i, housing_price_cols[h+1]]
    elif col == last_col:
        # no later month available: copy the preceding month
        data_ml.loc[i, col] = data_ml.loc[i, housing_price_cols[h-1]]
    else:
        # interior month: midpoint of the two neighbouring months
        data_ml.loc[i, col] = 0.5*(data_ml.loc[i, housing_price_cols[h+1]] + data_ml.loc[i, housing_price_cols[h-1]])

In [None]:
# verifying that no NaNs remain
data_ml.isnull().sum().sum()

In [None]:
data_ml.to_csv('data_ml_with_housing_imputedNaN.csv')

## Step 3: GDP Information

GDP is taken as representative of the overall national economy. Disasters in a recession might not be the same as disasters during a boom.

In [None]:
data_ml = pd.read_csv('data_ml_with_housing_imputedNaN.csv')

In [None]:
data_ml.head()

In [None]:
os.chdir('/Users/calvinwhealton/Documents/GitHub/floods_housing_zipcode/data')
gdp = pd.read_csv('A191RL1Q225SBEA.csv')

In [None]:
gdp.head()

In [None]:
# parse the DATE strings once and derive the helper columns from the parsed index
gdp_dates = pd.DatetimeIndex(gdp['DATE'])
gdp['datetime'] = gdp_dates
gdp['year'] = gdp_dates.year
gdp['month'] = gdp_dates.month

In [None]:
gdp.head()

In [None]:
data_ml['GDP'] = 0

In [None]:
# loop to obtain the GDP value of the quarter that contains each flood
# NOTE(review): this assumes each GDP observation is dated on the FIRST day of its
# quarter (the FRED convention, e.g. 2010-01-01 for Q1 2010) - confirm against the file.
# Under that convention the previous rule ("first observation on or after the flood
# month") returned the FOLLOWING quarter for floods in the 2nd and 3rd month of a quarter.

# the GDP column was initialised with integer zeros; store the values as floats
data_ml['GDP'] = data_ml['GDP'].astype(float)

# make sure "last observation on or before" is well defined
gdp_by_date = gdp.sort_values('datetime')

for i in data_ml.index:
    # make a datetime object from flood year-month
    d = datetime.datetime(int(data_ml.loc[i,'year']), int(data_ml.loc[i,'month']), 1)

    # latest quarter that started on or before the flood month
    on_or_before = gdp_by_date.loc[gdp_by_date['datetime'] <= d,'A191RL1Q225SBEA']
    data_ml.loc[i,'GDP'] = on_or_before.values[-1]

In [None]:
data_ml.head()

In [None]:
os.chdir('/Users/calvinwhealton/Documents/GitHub/floods_housing_zipcode/data/processed_data')
data_ml.to_csv('data_ml_clean_housing_gpd.csv')

## Step 4: Demographic Data

Appending the median household income and population density for the zip code. These are the last two features considered in this problem.

In [None]:
os.chdir('/Users/calvinwhealton/Documents/GitHub/floods_housing_zipcode/data/processed_data')
data_ml = pd.read_csv('data_ml_clean_housing_gpd.csv')

In [None]:
os.chdir('/Users/calvinwhealton/Documents/GitHub/floods_housing_zipcode/data')
zcta_cousub = pd.read_csv('zcta_countysub_uscensus.txt')

In [None]:
zcta_cousub.head()

In [None]:
# make a dataframe with the minimal data needed
zips_key_vals = pd.DataFrame({'zips':zips_shapefile['GEOID10'].astype(int).values,
                              'area':zips_shapefile['ALAND10'].values })


In [None]:
zips_key_vals.head()

In [None]:
# separate dataframe for population
# ZPOP is selected before averaging so that only this column is aggregated
# (averaging every column was done twice and fails on non-numeric columns in pandas >= 2.0)
zpop_by_zcta = zcta_cousub.groupby('ZCTA5')['ZPOP'].mean()
pop_df = pd.DataFrame({'zips': zpop_by_zcta.index,
                       'zpop': zpop_by_zcta})

In [None]:
# merging values in a series of operations
zips_key_vals2 = pd.merge(left=zips_key_vals, right = pop_df, left_on = 'zips', right_on = 'zips')

In [None]:
zips_key_vals2.head()

In [None]:
zips_key_vals2['pop_dens'] = zips_key_vals2['zpop']/zips_key_vals2['area']

In [None]:
os.chdir('/Users/calvinwhealton/Documents/GitHub/floods_housing_zipcode/data/processed_data')
zip_medinc = pd.read_csv('zips_med_inc.csv')

In [None]:
zip_medinc.head()

In [None]:
zips_key_vals3 = pd.merge(left=zips_key_vals2, right = zip_medinc, left_on = 'zips', right_on = 'zip')

In [None]:
zips_key_vals3.head()

In [None]:
# adding data to the machine learning dataframe
# lookup table indexed by zip code; if a zip code appears more than once only the
# first entry is used (the row-by-row assignment raised an error in that case)
demog_lookup = zips_key_vals3.drop_duplicates(subset='zips').set_index('zips')

# zip codes without demographic data receive NaN
data_ml['pop_dens'] = data_ml['zip'].map(demog_lookup['pop_dens'])
data_ml['med_inc'] = data_ml['zip'].map(demog_lookup['med_hh_inc'])

Checking to make sure all the data looks valid. No input/output problems

In [None]:
data_ml.head()

In [None]:
np.max(data_ml['year'].values)

In [None]:
# drop the leftover index columns accumulated by the repeated to_csv/read_csv round trips
# errors='ignore' keeps the cell runnable when some of them are absent
# ('GPD' was removed from the list: no column of that name is created in this notebook,
# so asking for it raised a KeyError; the 'GDP' feature itself must be kept)
leftover_index_cols = ['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1']
data_ml = data_ml.drop(columns=leftover_index_cols, errors='ignore')

In [None]:
data_ml.head()

In [None]:
# only keep the complete cases
# no imputation of missing demographic data
data_ml.dropna(inplace=True)

In [None]:
data_ml.shape

In [None]:
os.chdir('/Users/calvinwhealton/Documents/GitHub/floods_housing_zipcode/data/processed_data')
data_ml.to_csv('data_ml_flood_hou_gdp_pop.csv')

# Machine Learning

The goal of the machine learning part of this project is to predict the typical response, and the range of responses, one might expect following a flood for the zip code based on the attributes given above. The selected algorithm is K Nearest Neighbors (KNN); some processing of the features will be required.

_Flood Return Period_ : Log-transformed because a 50-year and 200-year flood are in some sense the same distance away from a 100-year flood. They are both a factor of 2. There could be some threshold effects around the 100-year flood, a common value used in designs, but it is difficult to accurately assess the 100-year flood even with 100 years of data.

_Median Household Income_ : Log-transformed. The values are spread over roughly an order of magnitude. It is anticipated that the impact of a flood on an area with 20 kUSD/yr and one with 50 kUSD/yr would be more significant than 120 kUSD/yr vs 150 kUSD/yr.

_Population Density_ : Log-transformed. The values are roughly spread over 6 orders of magnitude. This variable is a proxy for how urban or rural an area is.

_Housing Values_ : The variable has already been transformed in to month-over-month percentage increase in the housing value. This will be further condensed into a single sum of squares difference between the pre-flood time series and the location desired to be estimated.

_GDP_ : Scaled based on range of values. Zero still assumed to map to zero.

In [None]:
os.chdir('/Users/calvinwhealton/Documents/GitHub/floods_housing_zipcode/data/processed_data')
data_ml = pd.read_csv('data_ml_flood_hou_gdp_pop.csv')

## User-Defined Transformers

In [None]:
from sklearn import base
class ColumnSelectTransformer(base.BaseEstimator, base.TransformerMixin):
    '''
    Pulls the columns listed in col_names out of X
    Returns them as a 2-D float array, one array column per name, in the given order
    Stateless: fitting learns nothing from the data
    '''

    def __init__(self, col_names):
        self.col_names = col_names

    def fit(self, X, y=None):
        # nothing to learn
        return self

    def transform(self, X):
        # one output column per requested name, filled in order
        selected = np.zeros((X.shape[0], len(self.col_names)))
        for position, name in enumerate(self.col_names):
            selected[:, position] = X[name]
        return selected

In [None]:
class LogTransformer(base.BaseEstimator, base.TransformerMixin):
    '''
    Replaces every value with its natural logarithm
    Stateless: fitting learns nothing from the data
    '''

    def __init__(self):
        # no parameters to store
        pass

    def fit(self, X, y=None):
        # nothing to learn
        return self

    def transform(self, X):
        logged = np.log(X)
        return logged

In [None]:
class TimeSeriesRescaler(base.BaseEstimator, base.TransformerMixin):
    '''
    Rescales time series rows by one common factor
    The factor is the standard deviation of all values seen in fit
    Values are only divided, the mean is not shifted
    '''

    def __init__(self):
        # no parameters to store
        pass

    def fit(self, X, y=None):
        # a single standard deviation over every value of X
        self.std = np.std(X)
        return self

    def transform(self, X):
        # returns a list with one rescaled row per input row
        rescaled = []
        for row in X:
            rescaled.append(row/self.std)
        return rescaled

In [None]:
class MoveRefScale(base.BaseEstimator, base.TransformerMixin):
    '''
    Shifts values by a reference and divides them by a measure of spread
    ref = reference value subtracted from the data
          when None, the mean of the data passed to fit is used
    scaler = how the spread used for normalization is measured on the data passed to fit
          'std'     = standard deviation
          'min_max' = maximum minus minimum
          'iqr'     = 75% quantile minus 25% quantile
    fit raises ValueError for any other scaler value
    '''

    def __init__(self,ref=None,scaler='std'):
        self.scaler = scaler
        self.ref = ref

    def fit(self, X, y=None):
        # reference: user supplied value, or the mean of the fitting data
        if self.ref is None:
            self.ref_use = np.mean(X)
        else:
            self.ref_use = self.ref

        # spread: exactly one of the supported options
        if self.scaler == 'std':
            self.scale_value = np.std(X)
        elif self.scaler == 'min_max':
            self.scale_value = np.max(X) - np.min(X)
        elif self.scaler == 'iqr':
            self.scale_value = np.quantile(X,0.75) - np.quantile(X,0.25)
        else:
            # previously an unknown option left scale_value undefined and the
            # failure only surfaced later, inside transform
            raise ValueError("scaler must be 'std', 'min_max' or 'iqr', got %r" % (self.scaler,))
        return self

    def transform(self, X):
        return (X-self.ref_use)/self.scale_value

The general manipulation of the data requires the use of pipelines and feature unions.

In [None]:
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline

# the 12 months before the flood, oldest first
pre_flood_cols = ['h-12', 'h-11','h-10','h-09', 'h-08','h-07','h-06','h-05','h-04','h-03','h-02','h-01']

# flood return period: log, centered on the 100-year flood, scaled by interquartile range
rp_steps = [
    ('cst_rp', ColumnSelectTransformer(col_names=['flood_rp'])),
    ('lt_rp', LogTransformer()),
    ('mrs_rp', MoveRefScale(ref=np.log(100), scaler='iqr')),
]
pipe_rp = Pipeline(rp_steps)

# GDP: zero stays zero, scaled by standard deviation
gdp_steps = [
    ('cst_gdp', ColumnSelectTransformer(col_names=['GDP'])),
    ('mrs_gdp', MoveRefScale(ref=0.0, scaler='std')),
]
pipe_gdp = Pipeline(gdp_steps)

# median household income: log, centered on the mean, scaled by standard deviation
inc_steps = [
    ('cst_inc', ColumnSelectTransformer(col_names=['med_inc'])),
    ('lt_inc', LogTransformer()),
    ('mrs_inc', MoveRefScale(ref=None, scaler='std')),
]
pipe_inc = Pipeline(inc_steps)

# population density: log, centered on the mean, scaled by standard deviation
popden_steps = [
    ('cst_pden', ColumnSelectTransformer(col_names=['pop_dens'])),
    ('lt_pden', LogTransformer()),
    ('mrs_pden', MoveRefScale(ref=None, scaler='std')),
]
pipe_popden = Pipeline(popden_steps)

# pre-flood housing time series: rescaled by a common standard deviation
# (step names kept as in the original definition)
houTS_steps = [
    ('cst_gdp', ColumnSelectTransformer(col_names=pre_flood_cols)),
    ('tsr_gdp', TimeSeriesRescaler()),
]
pipe_houTS = Pipeline(houTS_steps)

In [None]:
X_ml = data_ml[['flood_rp','GDP','med_inc','pop_dens','h-12', 'h-11','h-10','h-09', 'h-08','h-07','h-06','h-05','h-04','h-03','h-02','h-01']].copy()
y_ml = data_ml[['h+00','h+01','h+02','h+03','h+04','h+05','h+06','h+07','h+08','h+09','h+10','h+11','h+12']]

In [None]:
X_ml.shape

In [None]:
y_ml.shape

In [None]:
union = FeatureUnion([
        ('rp',pipe_rp),
        ('gdp',pipe_gdp),
        ('inc',pipe_inc),
        ('popden',pipe_popden),
        ('houTS', pipe_houTS)
    ])

In [None]:
union.fit(X_ml)

In [None]:
class KNNMixedTSConsts(base.BaseEstimator, base.RegressorMixin):
    '''
    Nearest-neighbor estimator for features that mix scalar values and a time series
    neighbors = number of neighbors used for each prediction
    ts_inds = column indices of the time series (assumed to be in correct order)
    weights = weights for the parts of the distance
              (the time series is collapsed to a single distance and then weighted)
    '''

    def __init__(self,neighbors,ts_inds, weights):
        self.neighbors = neighbors
        self.ts_inds = ts_inds
        self.weights = weights

    def fit(self, X, y):
        # the "model" is simply the stored training data
        self.X = X
        self.y = y
        return self

    def predict(self, X):
        '''
        Returns an array of shape (number_of_predictions, length_of_time_series, 3)
        last axis: 0 = mean, 1 = 10% quantile, 2 = 90% quantile of the neighbors' series
        (the two quantiles bound an 80% interval)
        '''
        stored_series = np.array(self.y)
        n_queries = X.shape[0]
        series_length = self.y.shape[1]

        results = np.zeros((n_queries, series_length, 3))

        for q in range(n_queries):

            # distance from this query to every stored sample
            distances = dist_calc(X[q,:], self.X, self.ts_inds, self.weights)

            # indices of the closest stored samples
            nearest = np.argsort(distances)[0:self.neighbors]
            neighbor_series = stored_series[nearest]

            # summaries taken down the columns, one value per time step
            results[q,:,0] = np.mean(neighbor_series, axis=0)
            results[q,:,1] = np.quantile(neighbor_series, 0.1, axis=0)
            results[q,:,2] = np.quantile(neighbor_series, 0.9, axis=0)

        return results
    
    
def dist_calc(X_fitting, X_mat, ts_inds, weights):
    '''
    Weighted distance between one sample and every row of X_mat
    X_fitting = feature vector of the sample
    X_mat = 2-D array of stored feature vectors (one row per sample)
    ts_inds = column indices holding the time series
    weights = one weight per scalar feature followed by one weight for the time series
    The leading columns are treated as scalar features (weighted squared difference each);
    the time series columns are collapsed into one weighted sum of squared differences.
    Returns the mean of these components for each row of X_mat
    '''
    mat_shape = np.array(X_mat).shape
    n_samples = mat_shape[0]
    n_ts = len(ts_inds)
    n_components = mat_shape[1] - n_ts + 1
    last = n_components - 1

    components = np.zeros((n_samples, n_components))

    # scalar features sit in the leading columns
    for k in range(last):
        components[:, k] = weights[k]*((np.array(X_mat[:, k]) - X_fitting[k])**2)

    # time series: squared difference per time step, summed, then weighted
    if n_components > 0:
        ts_squared = np.zeros((n_samples, n_ts))
        for slot, column in enumerate(ts_inds):
            ts_squared[:, slot] = ((np.array(X_mat[:, column]) - X_fitting[column])**2)
        components[:, last] = weights[last]*np.sum(ts_squared, axis=1)

    # mean across the components, one value per stored sample
    return np.mean(components, axis=1)

In [None]:
knn_pipe = Pipeline([
                    ('union_feature', union),
                    ('mix_knn', KNNMixedTSConsts(neighbors=50, ts_inds = np.arange(4,16), weights = [1., 1., 1., 1., 0.1]))
])

Checking to make sure things work

In [None]:
knn_pipe.fit(X_ml,y_ml)

In [None]:
knn_pipe.predict(X_ml.iloc[0:100,:])

Making a version of random search to try and find the optimal parameters. The hyper parameters that can be optimized are:

_number of neighbors_ : The number of neighbors to choose. Somewhat arbitrary. Probably the minimum value should be 5 but the maximum could be over 100. Setting the range to be 5-200 and sampling on a logarithmic scale.

_weights for features_ : Weights only need to be relative to one feature, which is chosen to be the flood return period. The weights essentially account for the usefulness of the distance in the other feature dimensions, which also depends somewhat on the transformations used to obtain those other features. As a safe range, values between 0.01 and 100 will be sampled on a log-scale.

In [None]:
# shuffling data and splitting in to train and test arrays
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

X_ml_sh, y_ml_sh = shuffle(X_ml, y_ml, random_state=0)

X_train, X_test, y_train, y_test = train_test_split(X_ml_sh, y_ml_sh, test_size=0.35, random_state=1)

In [None]:
# random candidate values for the hyper parameters
# every draw is preceded by its own seed so that the results are reproducible
num_hypers_test = 300


def _log_uniform_draws(seed, low, high, size):
    '''Seed the global numpy generator, then draw `size` values log-uniformly in [low, high].'''
    np.random.seed(seed=seed)
    return np.exp(np.random.uniform(np.log(low), np.log(high), size))


# number of neighbors: 5 to 200, rounded to whole numbers
num_neighbors = np.around(_log_uniform_draws(3, 5, 200, num_hypers_test)).astype(int)

# feature weights: 0.01 to 100
weight_gdp = _log_uniform_draws(4, 0.01, 100, num_hypers_test)
weight_medinc = _log_uniform_draws(5, 0.01, 100, num_hypers_test)
weight_popdens = _log_uniform_draws(6, 0.01, 100, num_hypers_test)
weight_houts = _log_uniform_draws(7, 0.01, 100, num_hypers_test)

In [None]:
def calc_mse_knn_mix(predictions, reality):
    '''
    Calculates the mean squared error of the mean prediction and the coverage of the interval
    predictions = array of shape (number_of_predictions, length_of_time_series, 3)
                  last axis: 0 = mean, 1 = 10% estimate, 2 = 90% estimate
    reality = observed values, one row per prediction and one column per time step
    Returns (mse, int_acc)
    mse = mean squared difference between the mean prediction and reality
    int_acc = fraction of observed values that fall inside [10% estimate, 90% estimate];
              close to 0.8 when the 80% interval is well calibrated
    '''
    observed = np.array(reality)

    mean_preds = predictions[:,:,0]
    lower = predictions[:,:,1]
    upper = predictions[:,:,2]

    # evaluating if 80% interval covers 80% of outcomes
    # (the previous version counted the values OUTSIDE the interval, i.e. the miss rate)
    covered = np.logical_and(observed >= lower, observed <= upper)
    int_acc = float(np.mean(covered))

    mse = np.mean((mean_preds - observed)**2)

    return mse, int_acc

In [None]:
# defining the pipelines and the feature union before the loop testing hyper parameters

def _logged_feature_pipe(column, tag, ref, scaler):
    '''Select one column, take its logarithm, then shift by ref and scale (steps named with tag).'''
    return Pipeline([
        ('cst_' + tag, ColumnSelectTransformer(col_names=[column])),
        ('lt_' + tag, LogTransformer()),
        ('mrs_' + tag, MoveRefScale(ref=ref, scaler=scaler)),
    ])


# log-transformed features
pipe_rp = _logged_feature_pipe('flood_rp', 'rp', np.log(100), 'iqr')
pipe_inc = _logged_feature_pipe('med_inc', 'inc', None, 'std')
pipe_popden = _logged_feature_pipe('pop_dens', 'pden', None, 'std')

# GDP is not log-transformed; zero stays zero
pipe_gdp = Pipeline([
    ('cst_gdp', ColumnSelectTransformer(col_names=['GDP'])),
    ('mrs_gdp', MoveRefScale(ref=0.0, scaler='std')),
])

# pre-flood housing time series (step names kept as in the original definition)
pipe_houTS = Pipeline([
    ('cst_gdp', ColumnSelectTransformer(col_names=['h-12', 'h-11','h-10','h-09', 'h-08','h-07','h-06','h-05','h-04','h-03','h-02','h-01'])),
    ('tsr_gdp', TimeSeriesRescaler()),
])

# feature union, in the column order expected by the estimator
union = FeatureUnion([
    ('rp', pipe_rp),
    ('gdp', pipe_gdp),
    ('inc', pipe_inc),
    ('popden', pipe_popden),
    ('houTS', pipe_houTS),
])

In [None]:
# cell to test all the hyper parameter values
%%time
mse_acc_knn_mix = np.zeros((num_hypers_test, 2))

for i in range(num_hypers_test):
    
    hyper_loop = [num_neighbors[i], 1, weight_gdp[i], weight_medinc[i], weight_popdens[i], weight_houts[i]]
    
    # set up pipeline from feature union with knn mixed estimator
    knn_pipe = Pipeline([
                    ('union_feature', union),
                    ('mix_knn', KNNMixedTSConsts(neighbors=hyper_loop[0], ts_inds = np.arange(4,16), weights = hyper_loop[1:len(hyper_loop)]))
        ])
    
    # fitting the pipeline
    knn_pipe.fit(X_train,y_train)
    
    # predicting results
    knn_mix_preds = knn_pipe.predict(X_test)
    
    # store error metrics
    mse_acc_knn_mix[i,0], mse_acc_knn_mix[i,1] = calc_mse_knn_mix(knn_mix_preds, y_test)
    

In [None]:
np.min(mse_acc_knn_mix[:,0])

In [None]:
min_config_ind = np.argsort(mse_acc_knn_mix[:,0])[0]

In [None]:
hyper_params_opt = [num_neighbors[min_config_ind], 1.0, weight_gdp[min_config_ind], weight_medinc[min_config_ind], weight_popdens[min_config_ind], weight_houts[min_config_ind]]


In [None]:
hyper_params_opt

The results show that the weight on GDP is very high relative to the others. This could suggest that the response to natural disasters is largely controlled by the overall national economic environment. The median income of the zip code has a very low weight, on par with the weight for the time series. However, the time series component is on average about 10 times as large because it is the sum of 12 squared differences (one per pre-flood month), so the time series and the population density have a similar effective weight.

The number of neighbors is relatively small at 6.

Making a series of plots to show the results

In [None]:
plt.scatter(x=num_neighbors, y=mse_acc_knn_mix[:,0])
plt.xlabel('Number of Neighbors')
plt.ylabel('Mean Squared Error')
plt.xscale('log')
plt.show()

In [None]:
plt.scatter(x=weight_gdp, y=mse_acc_knn_mix[:,0])
plt.xlabel('GDP Weight')
plt.ylabel('Mean Squared Error')
plt.xscale('log')
plt.show()

In [None]:
plt.scatter(x=weight_medinc, y=mse_acc_knn_mix[:,0])
plt.xlabel('Median Income Weight')
plt.ylabel('Mean Squared Error')
plt.xscale('log')
plt.show()

In [None]:
plt.scatter(x=weight_popdens, y=mse_acc_knn_mix[:,0])
plt.xlabel('Population Density Weight')
plt.ylabel('Mean Squared Error')
plt.xscale('log')
plt.show()

In [None]:
plt.scatter(x=weight_houts, y=mse_acc_knn_mix[:,0])
plt.xlabel('Pre-Flood Housing Weight')
plt.ylabel('Mean Squared Error')
plt.xscale('log')
plt.show()

In [None]:
# fitting the final model for pickling
# the weights are everything after the number of neighbors; slicing with [1:] avoids
# depending on hyper_loop, a variable left over from the search loop
knn_pipe_opt = Pipeline([
                    ('union_feature', union),
                    ('mix_knn', KNNMixedTSConsts(neighbors=hyper_params_opt[0], ts_inds = np.arange(4,16), weights = hyper_params_opt[1:]))
        ])

# the final model is fit on all of the (shuffled) data
knn_pipe_opt.fit(X_ml_sh,y_ml_sh)

In [None]:
opt_preds = knn_pipe_opt.predict(X_ml_sh)
calc_mse_knn_mix(opt_preds, y_ml_sh)

In [None]:
os.chdir('/Users/calvinwhealton/Documents/GitHub/floods_housing_zipcode/pickled_models')
filename = 'mix_knn_opt.sav'

# the with-block closes the file even if pickling fails
with open(filename, 'wb') as model_file:
    pickle.dump(knn_pipe_opt, model_file)

## Other Models for Comparison

These are a couple of other models used for comparison. Essentially, does the nearest neighbors model provide any more information than these simpler alternatives?

### Testing 1: Average

The model for the prediction time period is simply the mean of the training time period.

In [None]:
# baseline: predict every post-flood month with the mean of the 12 pre-flood months
pre_flood_series = np.array(X_ml_sh)[:, 4:16]
post_flood_series = np.array(y_ml_sh)

sse_avg_mod = 0
for pre_row, post_row in zip(pre_flood_series, post_flood_series):
    sse_avg_mod += np.sum((np.mean(pre_row) - post_row)**2)

In [None]:
mse_avg_mod = sse_avg_mod/(np.array(y_ml_sh).shape[0]*np.array(y_ml_sh).shape[1])

In [None]:
mse_avg_mod

### Testing 2: Autoregressive model

An admittedly poor AR (Auto-Regressive) model. The time series for fitting the model is the same as the pre-flood time series, and based on that the subsequent post-flood series is estimated. This is performed for each zip code flood series.

In [None]:
# NOTE(review): AR is deprecated in recent statsmodels releases in favour of
# AutoReg; kept here so the fitted model is unchanged -- confirm installed version
from statsmodels.tsa.ar_model import AR

# convert once, outside the loop
X_ar_window = np.array(X_ml_sh)[:,4:16]
y_ar_actual = np.array(y_ml_sh)

sse_ar_mod = 0

for i in range(X_ml_sh.shape[0]):
    # fit on the series of row i (previously row 0 was used for every
    # iteration, so one and the same forecast was scored against all rows)
    series_i = X_ar_window[i]
    model_ar = AR(series_i)
    model_fitted = model_ar.fit()

    # out-of-sample forecast starting right after the fitted series;
    # `end` is inclusive, so this yields one prediction per column of y_ml_sh
    preds = model_fitted.predict(
                        start=len(series_i),
                        end=len(series_i) + y_ar_actual.shape[1] - 1,
                        dynamic=False)

    sse_ar_mod += np.sum((preds - y_ar_actual[i,:])**2)

# mean squared error over every entry of y_ml_sh
mse_ar_mod = sse_ar_mod/(y_ar_actual.shape[0]*y_ar_actual.shape[1])

In [None]:
# display the mean squared error of the AR baseline
mse_ar_mod

### Testing 3: KNN with No Flood Information

This model assumes that floods have nothing to do with the housing market. Therefore, random time periods and zip codes are chosen across the housing price dataset. The other variables, namely the GDP, median income, population density, and housing prices in the 12 months before the prediction were included as predictors. The same series of transformations is applied and a similar hyper-parameter optimization will be performed.

The random sampling of zip codes and time periods implies that some of these might be repeated or there might be some overlap.

In [None]:
# number of random (zip code, year, month) samples drawn for the no-flood dataset
# revised down to 200000 (from 500000) for the sake of time
num_random_zip_date = 200000 #500000

In [None]:
# Draw num_random_zip_date independent (zip, year, month) triples, with replacement.
# Years run 1997-2018: housing data starts in 1996, so starting in 1997
# guarantees 12 months of history before any sampled month.
zip_pool = np.array(list(zips_with_shape_housing))
year_pool = np.arange(1997, 2019)
month_pool = np.arange(1,13)

# draw order (zip, year, month) is kept so the random stream is consumed identically
sam_zip = np.random.choice(zip_pool, size=num_random_zip_date, replace=True)
sam_yr = np.random.choice(year_pool, size=num_random_zip_date, replace=True)
sam_mo = np.random.choice(month_pool, size=num_random_zip_date, replace=True)

In [None]:
# one row per random sample: zip code, year and month columns
data_ml_noFlood = pd.DataFrame(dict(zip=sam_zip, year=sam_yr, month=sam_mo))

In [None]:
# create one placeholder column (filled with 0) per housing price column;
# the values are overwritten row by row in the extraction loop
for h in housing_price_cols:
    data_ml_noFlood[h] = 0

In [None]:
# loop to extract the housing data and paste it in to the machine learning dataframe
# the randomly sampled year-month of each row is used as the zero index
# take 12 months before and after that month (25 year-month labels in total)
for ind in data_ml_noFlood.index:
    yr_check = int(data_ml_noFlood.loc[ind,'year'])
    mo_check = int(data_ml_noFlood.loc[ind,'month'])

    # count months on a single axis so offsets roll over year boundaries
    # correctly; the previous `mo_check + i >= 12` test turned December of the
    # current year into the non-existent month '00' of the following year
    mo_zero = yr_check*12 + (mo_check - 1)
    date_list = []
    for i in range(-12,13):
        yr_i, mo_i = divmod(mo_zero + i, 12)
        date_list.append(str(yr_i) + '-' + str(mo_i + 1).zfill(2))

    # extracting housing price data
    # first row of `housing` whose GEOID10_str equals the sampled zip code
    housing_temp = housing.loc[housing['GEOID10_str'].values==int(data_ml_noFlood.loc[ind,'zip']), date_list]
    housing_use = housing_temp.iloc[0].values

    data_ml_noFlood.loc[ind,housing_price_cols] = housing_use

In [None]:
# dropping rows (samples) with more than one nan value
# more than one nan indicates gaps in the housing price data for that zip code over the time period
# NOTE(review): 202426 is a hard-coded row cap; it exceeds num_random_zip_date = 200000,
# so it is presumably left over from an earlier, larger run -- confirm
data_ml_noFlood = data_ml_noFlood.iloc[0:202426]
data_ml_noFlood = data_ml_noFlood.loc[data_ml_noFlood.isnull().sum(axis=1) <= 1]

In [None]:
# rows and columns remaining after dropping samples with more than one nan
data_ml_noFlood.shape

In [None]:
# cleaning up rows with exactly one missing (NaN) housing value
# when it is between two other values, the value is linearly interpolated
# when it is on the edge, the value from the neighbor is used
# fixes: the first-column branch referenced `data_mlFlood` instead of
# data_ml_noFlood, and the edge test was hard-coded to 'h-11'/'h+12'; the edges
# are now the first and last entries of housing_price_cols, so a NaN in the
# first column is no longer averaged with the last column through index -1
last_h = len(housing_price_cols) - 1

for i in data_ml_noFlood.index:
    nan_flags = data_ml_noFlood.loc[i,housing_price_cols].isnull().values
    if nan_flags.sum() == 1:
        # position of the single missing value within housing_price_cols
        h = int(np.flatnonzero(nan_flags)[0])
        if h == 0:
            fill_val = data_ml_noFlood.loc[i,housing_price_cols[1]]
        elif h == last_h:
            fill_val = data_ml_noFlood.loc[i,housing_price_cols[last_h-1]]
        else:
            fill_val = 0.5*(data_ml_noFlood.loc[i,housing_price_cols[h+1]] + data_ml_noFlood.loc[i,housing_price_cols[h-1]])
        data_ml_noFlood.loc[i,housing_price_cols[h]] = fill_val

In [None]:
# placeholder GDP column, filled row by row below
data_ml_noFlood['GDP'] = 0
gdp_value_col = 'A191RL1Q225SBEA'

# loop to obtain the gdps
for i in data_ml_noFlood.index:
    # datetime for the first day of the sampled year-month
    row_year = int(data_ml_noFlood.loc[i,'year'])
    row_month = int(data_ml_noFlood.loc[i,'month'])
    d = datetime.datetime(row_year, row_month, 1)

    # first GDP record whose datetime is on or after that day
    # (this puts the sample in the right quarter)
    on_or_after = gdp['datetime'] >= d
    data_ml_noFlood.loc[i,'GDP'] = gdp.loc[on_or_after, gdp_value_col].values[0]

In [None]:
# attach population density and median household income to each sample by zip code
# the original cell tested membership against zips_key_vals3['zip'] but then
# indexed zips_key_vals3['zips']; only one of those columns can exist, so use
# whichever is actually present
# NOTE(review): confirm the real column name and hard-code it
zip_key_col = 'zips' if 'zips' in zips_key_vals3.columns else 'zip'

# one row per zip code so the lookup index is unique
zip_lookup = zips_key_vals3.drop_duplicates(subset=zip_key_col).set_index(zip_key_col)

# vectorized lookup instead of scanning the whole table once per sample;
# zip codes that are absent from the lookup table are left as NaN
data_ml_noFlood['pop_dens'] = data_ml_noFlood['zip'].map(zip_lookup['pop_dens'])
data_ml_noFlood['med_inc'] = data_ml_noFlood['zip'].map(zip_lookup['med_hh_inc'])

In [None]:
# Per-feature pipelines and their feature union for the no-flood model,
# defined once before the hyper-parameter loop.
# Union output order: GDP, median income, population density, then the
# 12 pre-period housing columns.

# the 12 housing columns before the zero index (h-12 ... h-01)
pre_period_cols_nf = ['h-12', 'h-11','h-10','h-09', 'h-08','h-07','h-06','h-05','h-04','h-03','h-02','h-01']

# GDP: select column, then MoveRefScale with ref=0.0
pipe_gdp_nf = Pipeline(steps=[
    ('cst_gdp', ColumnSelectTransformer(col_names=['GDP'])),
    ('mrs_gdp', MoveRefScale(ref=0.0, scaler='std')),
])

# median income: select column, log transform, then MoveRefScale with ref=None
pipe_inc_nf = Pipeline(steps=[
    ('cst_inc', ColumnSelectTransformer(col_names=['med_inc'])),
    ('lt_inc', LogTransformer()),
    ('mrs_inc', MoveRefScale(ref=None,scaler='std')),
])

# population density: same steps as median income
pipe_popden_nf = Pipeline(steps=[
    ('cst_pden',  ColumnSelectTransformer(col_names=['pop_dens'])),
    ('lt_pden', LogTransformer()),
    ('mrs_pden', MoveRefScale(ref=None,scaler='std')),
])

# housing time series: select the pre-period columns, then TimeSeriesRescaler
# (step names carry a 'gdp' suffix; kept as-is because they are runtime identifiers)
pipe_houTS_nf = Pipeline(steps=[
    ('cst_gdp', ColumnSelectTransformer(col_names=pre_period_cols_nf)),
    ('tsr_gdp', TimeSeriesRescaler()),
])

# feature union
union_nf = FeatureUnion(transformer_list=[
    ('gdp',pipe_gdp_nf),
    ('inc',pipe_inc_nf),
    ('popden',pipe_popden_nf),
    ('houTS', pipe_houTS_nf),
])

In [None]:
# features (3 constants + 12 pre-period housing columns) and targets (13 housing
# columns from the zero index onward) for the no-flood model
feature_cols_nf = ['GDP','med_inc','pop_dens','h-12', 'h-11','h-10','h-09', 'h-08','h-07','h-06','h-05','h-04','h-03','h-02','h-01']
target_cols_nf = ['h+00','h+01','h+02','h+03','h+04','h+05','h+06','h+07','h+08','h+09','h+10','h+11','h+12']

# build from the randomly sampled no-flood dataframe; the previous version read
# from data_ml, so the "no flood" model was built from a different dataframe
# than the one prepared in this section
# rows still missing a feature or target (e.g. zip codes with no population
# density / income match) are dropped so the transformers never see NaN
data_ml_nf_complete = data_ml_noFlood.dropna(subset=feature_cols_nf + target_cols_nf)

X_ml_nf = data_ml_nf_complete[feature_cols_nf].copy()
y_ml_nf = data_ml_nf_complete[target_cols_nf]
X_train_nf, X_test_nf, y_train_nf, y_test_nf = train_test_split(X_ml_nf, y_ml_nf, test_size=0.35, random_state=111)

In [None]:
# preview the first rows of the no-flood training features
X_train_nf.head()

In [None]:
def _log_uniform_draws(seed, low, high, size):
    """Seed the global numpy RNG, then draw `size` values log-uniformly between `low` and `high`."""
    np.random.seed(seed=seed)
    return np.exp(np.random.uniform(np.log(low), np.log(high), size))


# Population density is the reference feature this time (its weight is fixed
# in the search loop); all other weights are relative to it.
# The neighbor range is capped at 100 given the previous results.
num_hypers_test_nf = 200

# one fixed seed per hyper-parameter so each draw is reproducible on its own
num_neighbors_nf = np.around(_log_uniform_draws(3, 5, 100, num_hypers_test_nf)).astype(int)
weight_gdp_nf = _log_uniform_draws(4, 0.01, 100, num_hypers_test_nf)
weight_medinc_nf = _log_uniform_draws(5, 0.01, 100, num_hypers_test_nf)
weight_houts_nf = _log_uniform_draws(7, 0.01, 100, num_hypers_test_nf)

In [None]:
# one row per hyper-parameter set, two columns for the pair of values
# returned by calc_mse_knn_mix
mse_shape_nf = (num_hypers_test_nf, 2)
mse_acc_knn_mix_nf = np.zeros(mse_shape_nf)          # scored on the no-flood test split
mse_acc_knn_mix_nf_flood = np.zeros(mse_shape_nf)    # reserved for scores on the flood data

In [None]:
# this calculation was aborted early due to time constraints
# ran about 40 cases
%%time

for i in range(num_hypers_test_nf):
    
    hyper_loop = [num_neighbors_nf[i], weight_gdp_nf[i], weight_medinc_nf[i], 1.0, weight_houts_nf[i]]
    
    # set up pipeline from feature union with knn mixed estimator
    knn_pipe_nf = Pipeline([
                    ('union_feature', union_nf),
                    ('mix_knn', KNNMixedTSConsts(neighbors=hyper_loop[0], ts_inds = np.arange(3,15), weights = hyper_loop[1:len(hyper_loop)]))
        ])
    
    # fitting the pipeline
    knn_pipe_nf.fit(X_train_nf,y_train_nf)
    
    # predicting results
    # one is for predicting the general model (x data and y data)
    # one if for predicting the flood errors explicitly from that model
    # model is fit to the non-flood data
    knn_mix_preds_nf = knn_pipe_nf.predict(X_test_nf)
    knn_mix_preds_nf_flood = knn_pipe_nf.predict(X_ml_sh)
    
    mse_acc_knn_mix_nf[i,0], mse_acc_knn_mix_nf[i,1] = calc_mse_knn_mix(knn_mix_preds_nf, y_test_nf)

In [None]:
mse_acc_knn_mix_nf