In [68]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import sys, os, time, math, csv
import itertools
import collections

import numpy as np
np.warnings.filterwarnings('ignore')
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt

import scipy.optimize

from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

import MigrationModels
import MigrationEvaluationMethods

#simple-maps
sys.path.append(os.path.join(os.getcwd(),"simple-maps"))
from simplemaps.SimpleFigures import simpleMap, differenceMap, simpleBinnedMap

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [178]:
import keras
import keras.backend as K
from keras.optimizers import SGD, Adam
from keras.models import Model
from keras.layers import Dense, Dropout, AlphaDropout, Input, BatchNormalization, Activation
from keras.wrappers.scikit_learn import KerasRegressor
from keras.callbacks import EarlyStopping

from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib

Using TensorFlow backend.


# Initial setup

In [2]:
# Years covered by the analysis: 2004 through 2014 (the "+ 1" makes the upper bound inclusive).
years = range(2004, 2014 + 1)

In [3]:
# County boundary shapefile path and the name of its key field.
# Not referenced in the visible cells -- presumably consumed by the simplemaps plotting helpers (confirm).
shapefile_fn = "data/intermediate/boundary_shapefiles/cb_2015_us_county_500k.shp"
shapefile_key = "GEOID"

# Load data

### Migration data

In [4]:
# Load one migration matrix per entry of `years`; get_full_dataset indexes them as
# matrix[origin, destination].
migration_matrices = [
    np.load("data/processed/migration/migration_matrix_%d.npy" % (year))
    for year in years
]

# Zero the diagonal in place so same-index (origin == destination) entries are ignored.
for migration_matrix in migration_matrices:
    np.fill_diagonal(migration_matrix, 0.0)

# One county identifier per line. The context manager guarantees the file is closed
# even if reading raises.
with open("data/processed/county_intersection_list_2004_2014.txt") as f:
    county_list = f.read().strip().split("\n")

# Position of every county identifier inside county_list (used as the matrix index).
county_fips_to_idx = {fips: idx for idx, fips in enumerate(county_list)}
num_counties = len(county_list)

In [5]:
# One county identifier per line. The context manager guarantees the file is closed
# even if reading raises.
with open("data/processed/hurricane_affected_counties.csv", "r") as f:
    flooded_counties = f.read().strip().split("\n")

# Set membership keeps the filter linear in the number of counties; the list itself is
# kept because its order defines the order of flooded_county_idxs.
flooded_county_set = set(flooded_counties)
unflooded_counties = [fips for fips in county_list if fips not in flooded_county_set]

# Translate identifiers into row/column indices of the per-county matrices.
# A KeyError here means a listed county is missing from county_list.
flooded_county_idxs = np.array([county_fips_to_idx[fips] for fips in flooded_counties])
unflooded_county_idxs = np.array([county_fips_to_idx[fips] for fips in unflooded_counties])

all_county_idxs = np.arange(num_counties)

### Per county features

In [6]:
# Read the per-county population table, keeping FIPS codes as strings, and index it by FIPS.
# NOTE(review): rows are used positionally below; nothing here aligns them with
# county_list -- confirm the CSV is already in that order.
population_by_year = pd.read_csv(
    "data/processed/county_population_2004_2014.csv",
    dtype={"FIPS": str},
).set_index("FIPS")

# One (num_rows, 1) column vector per year, taken from the POPESTIMATE<year> column.
population_vectors_by_year = [
    population_by_year["POPESTIMATE%d" % (year)].values.reshape(-1, 1)
    for year in years
]

### Extra features

In [7]:
# County-to-county distance matrix; get_full_dataset indexes it as distances[origin, destination].
distances = np.load("data/processed/county_distance_matrix.npy")

In [8]:
# One intervening-opportunities matrix per year, computed by MigrationModels from that
# year's population vector and the distance matrix.
intervening_population_opportunities = []
for i, population_vector in enumerate(population_vectors_by_year):
    s = MigrationModels.getInterveningOpportunities(population_vector, distances)
    intervening_population_opportunities.append(s)

# Experiments

In [9]:
def get_full_dataset(year_idx, origin_list, destination_list, model=None):
    """Assemble the inputs for one year restricted to the given origins and destinations.

    Parameters
    ----------
    year_idx : int
        Position in the per-year lists (population, migration, intervening opportunities).
    origin_list, destination_list : index arrays
        County indices selecting the rows (origins) and columns (destinations).
    model : optional
        Stored unchanged under the "model" key; run_traditional_models reads it.

    Returns
    -------
    dict
        Keys "origin_pop", "destination_pop", "S", "D", "T", "beta" and "model".
        "beta" is the coefficient of a no-intercept linear regression of each origin's
        total outgoing flow (row sums of T) on the origin population.
    """

    def _submatrix(matrix):
        # Rows restricted to the origins, columns to the destinations, as floats.
        return matrix[origin_list, :][:, destination_list].astype(float)

    populations = population_vectors_by_year[year_idx]
    origin_pop = populations[origin_list, :].astype(float)
    destination_pop = populations[destination_list, :].astype(float)

    T = _submatrix(migration_matrices[year_idx])

    # Fit "outgoing migrants = beta * population" without an intercept.
    t_model = LinearRegression(fit_intercept=False)
    t_model.fit(origin_pop, T.sum(axis=1))

    return {
        "origin_pop": origin_pop,
        "destination_pop": destination_pop,
        "S": _submatrix(intervening_population_opportunities[year_idx]),
        "D": _submatrix(distances),
        "T": T,
        "beta": t_model.coef_[0],
        "model": model,
    }

In [10]:
def get_pairs_from_full_dataset(args):
    """Flatten the per-county matrices into one sample per (origin, destination) pair.

    Parameters
    ----------
    args : dict
        Must provide "origin_pop" and "destination_pop" (one value per county, either
        1-D or a single-column 2-D array) and the matrices "S", "D" and "T", each with
        at least len(origin_pop) rows and len(destination_pop) columns.

    Returns
    -------
    X : ndarray of shape (num_rows * num_cols, 4)
        Columns are origin population, destination population, D and S.
    Y : ndarray of shape (num_rows * num_cols,)
        The matching entries of T.

    Pairs are laid out row-major: the sample for origin i and destination j sits at
    index i * num_cols + j. The arrays are built with vectorised NumPy operations
    instead of a Python loop over every pair, and no longer depend on implicit
    conversion of size-1 arrays to scalars.
    """
    origin_pop = np.asarray(args["origin_pop"], dtype=float)
    destination_pop = np.asarray(args["destination_pop"], dtype=float)

    num_rows, num_cols = origin_pop.shape[0], destination_pop.shape[0]
    num_entries = num_rows * num_cols
    num_features = 4

    def _pair_values(matrix):
        # Row-major flattening of the (num_rows, num_cols) block gives index i*num_cols + j.
        block = np.asarray(matrix, dtype=float)[:num_rows, :num_cols]
        return block.reshape(num_entries)

    X = np.zeros((num_entries, num_features), dtype=float)
    # Each origin value is repeated for all of its destinations ...
    X[:, 0] = np.repeat(origin_pop.reshape(-1), num_cols)
    # ... while the destination values cycle once per origin.
    X[:, 1] = np.tile(destination_pop.reshape(-1), num_rows)
    X[:, 2] = _pair_values(args["D"])
    X[:, 3] = _pair_values(args["S"])

    # Copy into a fresh array so Y never aliases the caller's T matrix.
    Y = np.zeros(num_entries, dtype=float)
    Y[:] = _pair_values(args["T"])

    return X, Y

In [11]:
def run_traditional_models(alpha, args):
    """Predict the migration matrix with the traditional model named in args["model"].

    Parameters
    ----------
    alpha : float or array-like
        Model parameter forwarded to the extended radiation and gravity models; it is
        not used by the plain radiation model ("rad").
    args : dict
        As produced by get_full_dataset; reads "model", "origin_pop",
        "destination_pop", "S", "D", "T" and "beta".

    Returns
    -------
    (T, T_pred)
        The observed matrix from args and the prediction obtained by row-normalising
        the model output and passing it through MigrationModels.productionFunction.

    Raises
    ------
    ValueError
        If args["model"] is not one of "extrad", "rad", "gravpow" or "gravexp".
        (Previously this fell through and failed later on an unbound local.)
    """
    model = args["model"]
    origin_pop, destination_pop = args["origin_pop"], args["destination_pop"]
    S, D, T = args["S"], args["D"], args["T"]
    beta = args["beta"]

    if model == "extrad":
        P = MigrationModels.extendedRadiationModel(origin_pop, destination_pop, S, alpha)
    elif model == "rad":
        P = MigrationModels.radiationModel(origin_pop, destination_pop, S)
    elif model == "gravpow":
        P = MigrationModels.gravityModel(origin_pop, destination_pop, D, alpha, decay="power")
    elif model == "gravexp":
        P = MigrationModels.gravityModel(origin_pop, destination_pop, D, alpha, decay="exponential")
    else:
        raise ValueError(
            "Unknown model %r; expected one of 'extrad', 'rad', 'gravpow', 'gravexp'" % (model,)
        )

    P = MigrationModels.row_normalize(P)
    T_pred = MigrationModels.productionFunction(origin_pop, P, beta=beta)

    return T, T_pred

def fit_traditional_models(alpha, args):
    """Objective for scipy.optimize.minimize: the negated CPC score of the model.

    Runs run_traditional_models(alpha, args) and scores observed against predicted
    flows with MigrationEvaluationMethods.cpc. The sign is flipped so that minimising
    this function maximises the score.
    """
    observed, predicted = run_traditional_models(alpha, args)
    return -MigrationEvaluationMethods.cpc(observed, predicted)
    
def evaluate_traditional_models(alpha, beta, args):
    """Evaluate a traditional model for the given alpha, overriding the production beta.

    Parameters
    ----------
    alpha : float or array-like
        Model parameter forwarded to run_traditional_models.
    beta : float
        Value used as "beta" instead of the one stored in args.
    args : dict
        As produced by get_full_dataset. It is no longer modified: the override is
        applied to a shallow copy so the caller's dict keeps its own "beta".

    Returns
    -------
    Whatever MigrationEvaluationMethods.evaluate_all(T, T_pred, D) returns.
    """
    # Shallow copy: the arrays are shared, only the "beta" entry differs.
    local_args = dict(args)
    local_args["beta"] = beta

    T, T_pred = run_traditional_models(alpha, local_args)
    D = local_args["D"]

    return MigrationEvaluationMethods.evaluate_all(T, T_pred, D)

## Extrad model

### Flooded to Unflooded

In [12]:
# Fit the extended radiation model on flooded -> unflooded flows for year index 1
# (2005 in `years`).
args = get_full_dataset(1, flooded_county_idxs, unflooded_county_idxs, model="extrad")
opt_result = scipy.optimize.minimize(
    fit_traditional_models,
    x0=[1.0],
    args=(args,),
    bounds=[(0, 3)],
)
# Naming in this notebook is swapped relative to the function parameters:
# `beta_flooded` is the optimised model parameter (passed as `alpha` to
# fit_traditional_models), while `alpha` is the production coefficient args["beta"].
alpha, beta_flooded = args["beta"], opt_result.x[0]

In [13]:
# Display the production coefficient (args["beta"]) and the optimised model parameter from the fit above.
alpha, beta_flooded

(0.16908589967733545, 0.13012304380629758)

### Unflooded to Unflooded

In [168]:
# Repeat the extended radiation fit on unflooded -> unflooded flows for every year.
# As elsewhere in this notebook, `alpha` is the production coefficient args["beta"]
# and `beta_unflooded` is the optimised model parameter.
alphas, betas_unflooded = [], []
num_years = len(years)
for i in range(num_years):
    print(i, num_years)
    args = get_full_dataset(i, unflooded_county_idxs, unflooded_county_idxs, model="extrad")
    opt_result = scipy.optimize.minimize(
        fit_traditional_models,
        x0=[1.0],
        args=(args,),
        bounds=[(0, 3)],
    )
    alpha, beta_unflooded = args["beta"], opt_result.x[0]

    alphas.append(alpha)
    betas_unflooded.append(beta_unflooded)

alphas, betas_unflooded = np.array(alphas), np.array(betas_unflooded)

0 11
1 11
2 11
3 11
4 11
5 11
6 11
7 11
8 11
9 11
10 11


In [173]:
# Mean over years of the per-year production coefficients (args["beta"]) for unflooded -> unflooded flows.
alphas.mean()

0.03179085324317379

In [174]:
# Mean over years of the optimised model parameter for unflooded -> unflooded flows.
betas_unflooded.mean()

0.3535117898490684

In [175]:
# Persist the fitted parameters as a one-row CSV. The context manager guarantees the
# file is flushed and closed even if formatting or writing raises.
# Columns: mean unflooded production coefficient, flooded model parameter,
# mean unflooded model parameter.
with open("output/extrad_params.txt", "w") as f:
    f.write("alpha,beta_affected,beta_unaffected\n")
    f.write("%f,%f,%f" % (alphas.mean(), beta_flooded, betas_unflooded.mean()))

## ML Models

In [182]:
def cpc_loss(y_true, y_pred):
    """Keras loss: 1 - 2 * sum(min(y_true, y_pred)) / (sum(y_true) + sum(y_pred)).

    The sums run over every element of the tensors passed in. The name suggests the
    complement of the CPC score used by MigrationEvaluationMethods.cpc (not verified here).
    """
    overlap = K.sum(K.minimum(y_true, y_pred))
    total = K.sum(y_true) + K.sum(y_pred)
    return 1.0 - (2.0 * overlap) / total

def baseline_model():
    """Build and compile the feed-forward regressor used for the ML experiments.

    Architecture: 4 inputs -> Dense(128, relu) -> Dense(128, relu) -> Dense(1, relu).
    Compiled with Adam (lr=0.001), cpc_loss as the training loss, and "mse" plus
    cpc_loss as reported metrics.

    Returns
    -------
    keras.models.Model
        The compiled model; takes no arguments so it can be used as a `build_fn`.
    """
    inputs = Input(shape=(4,))

    # Two identical hidden layers.
    hidden = inputs
    for _ in range(2):
        hidden = Dense(128, activation="relu")(hidden)

    # A ReLU on the output keeps predictions non-negative.
    outputs = Dense(1, activation="relu")(hidden)

    model = Model(inputs=[inputs], outputs=[outputs])
    model.compile(
        loss=cpc_loss,
        metrics=["mse", cpc_loss],
        optimizer=Adam(lr=0.001),
    )
    return model

### Flooded to Unflooded

In [183]:
# Build the training set: flooded -> unflooded county pairs for year index 1 (2005 in `years`).
args = get_full_dataset(1, flooded_county_idxs, unflooded_county_idxs, model="dl")
beta = args["beta"]
x_train, y_train = get_pairs_from_full_dataset(args)

y_train_binary = y_train.copy().astype(int)
mask = y_train > 0

# Split the pair indices into those with and without observed migration.
positive_indices = np.flatnonzero(mask)
negative_indices = np.flatnonzero(~mask)

num_positive = len(positive_indices)
num_negative = len(negative_indices)
# NOTE(review): with max() the sample is never smaller than the original set of
# negatives, so this never downsamples -- confirm whether min() was intended.
new_num_negative = max(num_negative, 40 * num_positive)

# NOTE(review): no random seed is set, so this draw differs between runs.
negative_indices = np.random.choice(negative_indices, size=new_num_negative, replace=True)

new_indices = np.concatenate((positive_indices, negative_indices))

y_train = y_train[new_indices]

# Standardise the features; the fitted scaler is saved below alongside the model.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train[new_indices])

# Start from a fresh Keras session, then train, stopping early on the "cpc_loss"
# metric and restoring the best weights seen.
K.clear_session()
early_stopping = EarlyStopping(monitor="cpc_loss", patience=100, restore_best_weights=True)
model = KerasRegressor(
    build_fn=baseline_model,
    epochs=500,
    batch_size=2 ** 12,
    callbacks=[early_stopping],
    verbose=0,
)
history = model.fit(x_train, y_train)

model.model.save("output/dl_flooded.h5")
joblib.dump(scaler, "output/scaler_flooded.p")

In [None]:
# Unflooded -> unflooded: build the training set for year index 2 (2006 in `years`).
args = get_full_dataset(2, unflooded_county_idxs, unflooded_county_idxs, model="dl")
beta = args["beta"]
x_train, y_train = get_pairs_from_full_dataset(args)

y_train_binary = y_train.copy().astype(int)
mask = y_train > 0

# Split the pair indices into those with and without observed migration.
positive_indices = np.where(mask)[0]
negative_indices = np.where(~mask)[0]

num_positive = positive_indices.shape[0]
num_negative = negative_indices.shape[0]
# NOTE(review): with max() the sample is never smaller than the original set of
# negatives, so this never downsamples -- confirm whether min() was intended.
new_num_negative = max(num_negative, 40 * num_positive)

# NOTE(review): no random seed is set, so this draw differs between runs.
negative_indices = np.random.choice(negative_indices, size=new_num_negative, replace=True)

new_indices = np.concatenate([
    positive_indices,
    negative_indices
])

x_train = x_train[new_indices]
y_train = y_train[new_indices]

# Standardise the features; the fitted scaler is saved below alongside the model.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)

K.clear_session()
# The monitored name must match a metric Keras actually logs. baseline_model registers
# the cpc_loss function as a metric, which is logged as "cpc_loss". The former
# "cpcLoss" never matched, so early stopping and best-weight restore never triggered.
early_stopping = EarlyStopping(monitor="cpc_loss", patience=100, restore_best_weights=True)
model = KerasRegressor(
    build_fn=baseline_model,
    epochs=500,
    batch_size=2**12,
    callbacks=[early_stopping],
    verbose=0
)
history = model.fit(x_train, y_train)

model.model.save("output/dl_unflooded.h5")
joblib.dump(scaler, "output/scaler_unflooded.p")

In [None]:
# Re-save the current model and scaler; these are the same two calls that end the
# preceding training cell, so running this cell is only needed if that save was interrupted.
model.model.save("output/dl_unflooded.h5")
joblib.dump(scaler, "output/scaler_unflooded.p")