## Spatial Features Vs Urban Footprint Vs Population Size

In [1]:
import sklearn
import pandas as pd
import numpy as np
import csv
import scipy.stats as stats
from statistics import pstdev
from statistics import mean
from sklearn import preprocessing
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.datasets import make_regression
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from itertools import product
import copy


### Read the Features data

In [2]:
spfeas = pd.read_excel('blz_spfeas_v3.xlsx', sheet_name=0)
spfeas['OBJECTID'] = spfeas['OBJECTID'].astype(int)
spfeas = spfeas.set_index('OBJECTID')
spfeas.head()

Unnamed: 0_level_0,FID,Administra,Administ_1,Area,Urban_Rura,CTV_2018,ED_2018,Cluster_Nu,fourier_sc31_variance_mean,fourier_sc31_variance_std,...,sfs_sc51_std_sum,sfs_sc71_max_ratio_of_orthgonal_angles_mean,sfs_sc71_max_ratio_of_orthgonal_angles_std,sfs_sc71_max_ratio_of_orthgonal_angles_sum,sfs_sc31_std_mean,sfs_sc31_std_std,sfs_sc31_std_sum,sfs_sc71_min_line_length_mean,sfs_sc71_min_line_length_std,sfs_sc71_min_line_length_sum
OBJECTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,Toledo District,BLZ006,Toledo Rural,Rural,Cayes,217,665,1.87101,4.929879,...,42799652.0,140.70989,1.0652,4286729472,1.193974,1.408849,36374452.0,874775.350589,330970.94884,26650047676400
2,1,Stann Creek District,BLZ005,Stann Creek Rural,Rural,Cayes,201,602,2.608672,6.141846,...,92699568.0,140.376936,4.624142,7790784512,1.375928,1.689124,76362680.0,824859.818661,380083.243379,45778923946000
3,2,,,,,Cayes,126,0,6.380702,8.840365,...,10851945.0,139.21994,8.347895,461388384,2.41311,2.69909,7997279.0,614737.965726,486655.09781,2037301248000
4,3,,,,,Cayes,122,0,2.894536,6.357571,...,65703932.0,139.365071,12.484677,5233991680,1.457598,1.841355,54741532.0,814288.179976,388871.179731,30581389787100
5,4,,,,,Cayes,123,0,4.566548,7.459406,...,44759044.0,140.065461,2.836504,2403786496,1.953999,2.401075,33534290.0,692178.805971,461589.477238,11879088914400


### Load GHS

In [3]:
guf = pd.read_csv('blz_ghs.csv')
guf['OBJECTID'] = guf['OBJECTID'].astype(int)
guf = guf.set_index('OBJECTID')
guf.head()

Unnamed: 0_level_0,Administra,Administ_1,Area,Urban_Rura,CTV_2018,ED_2018,Cluster_Nu,Shape_Leng,Shape_Area,ghs_count,ghs_sum,ghs_mean
OBJECTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,Toledo District,BLZ006,Toledo Rural,Rural,Cayes,217,665,294022.358507,2934572000.0,2191097.0,2191297.0,1.000091
2,Stann Creek District,BLZ005,Stann Creek Rural,Rural,Cayes,201,602,431082.304407,5345558000.0,4016469.0,4019169.0,1.000672
3,,,,,Cayes,126,0,69417.179782,319206400.0,241847.0,241847.0,1.0
4,,,,,Cayes,122,0,247650.43079,3617347000.0,2735984.0,2735984.0,1.0
5,,,,,Cayes,123,0,240469.134053,1652992000.0,1249474.0,1250674.0,1.00096


### Merge the two 

In [4]:
spfeas_guf = spfeas.merge(guf, left_on='OBJECTID', right_on="OBJECTID", how='outer')
spfeas_guf = spfeas_guf.round(3)
spfeas_guf.head()

Unnamed: 0_level_0,FID,Administra_x,Administ_1_x,Area_x,Urban_Rura_x,CTV_2018_x,ED_2018_x,Cluster_Nu_x,fourier_sc31_variance_mean,fourier_sc31_variance_std,...,Area_y,Urban_Rura_y,CTV_2018_y,ED_2018_y,Cluster_Nu_y,Shape_Leng,Shape_Area,ghs_count,ghs_sum,ghs_mean
OBJECTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,Toledo District,BLZ006,Toledo Rural,Rural,Cayes,217,665,1.871,4.93,...,Toledo Rural,Rural,Cayes,217,665,294022.359,2934572000.0,2191097.0,2191297.0,1.0
2,1,Stann Creek District,BLZ005,Stann Creek Rural,Rural,Cayes,201,602,2.609,6.142,...,Stann Creek Rural,Rural,Cayes,201,602,431082.304,5345558000.0,4016469.0,4019169.0,1.001
3,2,,,,,Cayes,126,0,6.381,8.84,...,,,Cayes,126,0,69417.18,319206400.0,241847.0,241847.0,1.0
4,3,,,,,Cayes,122,0,2.895,6.358,...,,,Cayes,122,0,247650.431,3617347000.0,2735984.0,2735984.0,1.0
5,4,,,,,Cayes,123,0,4.567,7.459,...,,,Cayes,123,0,240469.134,1652992000.0,1249474.0,1250674.0,1.001


### Filter Dataset by Builtup Surface. 

Select Rows where  builtup is greater than or equal to 10 percent

In [5]:
spfeas_guf.shape

(723, 452)

### Analysis

Get the list of dependent variables from the DataFrame to store in list y_vars

In [6]:
y_var = list(spfeas_guf.axes[1])[451]
y_var

'ghs_mean'

Get a list of all independent variables from the DataFrame in list all_x

In [7]:
all_x = list(spfeas_guf.axes[1])[8:440]

#Check
all_x

['fourier_sc31_variance_mean',
 'fourier_sc31_variance_std',
 'fourier_sc31_variance_sum',
 'fourier_sc71_mean_mean',
 'fourier_sc71_mean_std',
 'fourier_sc71_mean_sum',
 'fourier_sc51_mean_mean',
 'fourier_sc51_mean_std',
 'fourier_sc51_mean_sum',
 'fourier_sc31_mean_mean',
 'fourier_sc31_mean_std',
 'fourier_sc31_mean_sum',
 'fourier_sc51_variance_mean',
 'fourier_sc51_variance_std',
 'fourier_sc51_variance_sum',
 'fourier_sc71_variance_mean',
 'fourier_sc71_variance_std',
 'fourier_sc71_variance_sum',
 'gabor_sc3_filter_5_mean',
 'gabor_sc3_filter_5_std',
 'gabor_sc3_filter_5_sum',
 'gabor_sc3_filter_4_mean',
 'gabor_sc3_filter_4_std',
 'gabor_sc3_filter_4_sum',
 'gabor_sc3_filter_7_mean',
 'gabor_sc3_filter_7_std',
 'gabor_sc3_filter_7_sum',
 'gabor_sc3_filter_6_mean',
 'gabor_sc3_filter_6_std',
 'gabor_sc3_filter_6_sum',
 'gabor_sc3_filter_1_mean',
 'gabor_sc3_filter_1_std',
 'gabor_sc3_filter_1_sum',
 'gabor_sc3_mean_mean',
 'gabor_sc3_mean_std',
 'gabor_sc3_mean_sum',
 'gabor_sc

### Compute Coorelation of features with population density

Store all features with the least correlation (stat. significance (p < 0.05)) 

The Pearson correlation coefficient **measures the linear relationship
between two datasets.** Strictly speaking, Pearson's correlation requires
that each dataset be **normally distributed, and not necessarily zero-mean.**

Like other correlation coefficients, this one varies between -1 and +1
with 0 implying no correlation. Correlations of -1 or +1 imply an exact
linear relationship. Positive correlations imply that as x increases, so
does y. Negative correlations imply that as x increases, y decreases.

The p-value roughly indicates **the probability of an uncorrelated system**
producing datasets that have a Pearson correlation at least as extreme
as the one computed from these datasets. 

***The p-values are not entirely
reliable but are probably reasonable for datasets larger than 500 or so.***

In [8]:
spfeas_guf[y_var] = spfeas_guf[y_var].fillna(0)
spfeas_guf[y_var].isnull().values.any()

False

In [9]:
y_dict = {}
x = []

for x_var in all_x:
    
    #Calculate the Pearson statistics, 
    # returns the Pearson value and p value
    
    p = stats.skew(spfeas_guf[x_var])
    
    #print(y_var, x_var, p)
    # print back for mike
    #print (y_var + " , " + x_var + " , " +  p)

In [70]:
y_dict = {}
x = []

for x_var in all_x:
    
    #Calculate the Pearson statistics, 
    # returns the Pearson value and p value
    
    p = stats.pearsonr(spfeas_guf[x_var],spfeas_guf[y_var])
    
    # print back for mike
    #print (y_var + " , " + x_var + " , " +  str(p[0]) + " , " + str(p[1]))
    
    #If p < 0.05 append to list x
    if p[1] < 0.05:
        x.append([x_var,(p[0])])

#List x is made into a DataFrame 
# which is sorted by the absolute values of the Pearson values
x_df = pd.DataFrame(x,columns=["x_var","abs_r2"]).sort_values("abs_r2",ascending=True)


#The dependent variable dictionary is given an entry 
# where the key is the name of the dependent variable
# and the value is a list of top 200 most significant values

y_dict[y_var] = list(x_df["x_var"][0:200])
#y_dict[y_var]
#Print out each dependent variable and 
#the number of x values that remain to check completion

In [71]:
x_df.head(10)

Unnamed: 0,x_var,abs_r2
314,ndvi_sc7_variance_mean,-0.681168
384,sfs_sc51_max_line_length_std,-0.675907
317,ndvi_sc5_variance_mean,-0.67321
417,sfs_sc51_std_std,-0.670797
390,sfs_sc71_max_line_length_std,-0.667861
308,ndvi_sc3_mean_mean,-0.665118
393,sfs_sc71_mean_std,-0.66479
320,ndvi_sc5_mean_mean,-0.664284
311,ndvi_sc7_mean_mean,-0.663283
402,sfs_sc51_mean_std,-0.66273


In [32]:
#check 

for key in y_dict.keys():
    print(key,len(y_dict[key]))


ghs_mean 200


### Correlation Significance

For each dependent variable y in the list of all dependent values, calibrate the model.
Add new key to the output dictionary where y is the dependent variable curently being processed and the values are empty for now

In [33]:
#Initialize the output dictionary, Y_D, 
# with each key being a dependent variable and the values being the results of the analyses

Y_D = {}

Y_D[y_var]={}

#Dictionary Models is used to store each result object for later use if needed

Models ={}

#Get independent variables from the variable dictionary and store in list x_vars
x_vars = y_dict[y_var]


vars_df = pd.DataFrame()

vars_df[y_var] = spfeas_guf[y_var]


for x in x_vars:
    vars_df[x] = spfeas_guf[x]

In [34]:
stats.describe(vars_df['ghs_mean'])

DescribeResult(nobs=723, minmax=(1.0, 101.0), mean=16.570911479944677, variance=638.4296127262253, skewness=1.630291157307923, kurtosis=1.452901986170958)

### Scale/Normalize Data

In [35]:
#minmax_scaler = preprocessing.MinMaxScaler()
standard_scaler = preprocessing.StandardScaler()

names = vars_df.columns
scaled_df = standard_scaler.fit_transform(vars_df)
scaled_df = pd.DataFrame(scaled_df, columns=names)
scaled_df.head()

Unnamed: 0,ghs_mean,gabor_sc7_filter_11_mean,gabor_sc7_filter_9_mean,gabor_sc5_filter_9_mean,gabor_sc7_filter_13_mean,gabor_sc5_filter_11_mean,gabor_sc5_filter_7_mean,gabor_sc7_filter_7_mean,gabor_sc7_filter_5_mean,gabor_sc5_filter_5_mean,...,sfs_sc31_max_line_length_mean,gabor_sc5_filter_2_std,gabor_sc5_filter_14_std,orb_sc71_kurtosis_mean,orb_sc31_max_std,gabor_sc5_filter_10_std,gabor_sc5_filter_8_std,gabor_sc5_filter_12_std,gabor_sc5_filter_4_std,hog_sc7_kurtosis_std
0,-0.616677,-3.287058,-3.272931,-3.27257,-3.292668,-3.277816,-3.268348,-3.261398,-3.26202,-3.266306,...,-2.208609,-0.146938,-0.119918,-0.265775,-1.05274,-0.121226,-0.152803,-0.088578,-0.10751,-0.648048
1,-0.616638,-3.174185,-3.159913,-3.160689,-3.179829,-3.166262,-3.156661,-3.148873,-3.149904,-3.155004,...,-2.072974,0.382919,0.407614,-0.101278,-0.852036,0.386531,0.345487,0.441425,0.435588,-0.668433
2,-0.616677,-2.307744,-2.298957,-2.302038,-2.310518,-2.305233,-2.299794,-2.292073,-2.294768,-2.299711,...,-1.530234,1.441195,1.536159,-0.063758,-0.486104,1.443054,1.356361,1.567259,1.523584,-0.603503
3,-0.616677,-3.135207,-3.120357,-3.121008,-3.141944,-3.126785,-3.116751,-3.109489,-3.109719,-3.114829,...,-2.060971,0.345637,0.381875,-0.251131,-1.031454,0.364926,0.31805,0.420588,0.397518,-0.633033
4,-0.616638,-2.695897,-2.68564,-2.686909,-2.698327,-2.690737,-2.683999,-2.677472,-2.678936,-2.683016,...,-1.842915,0.597499,0.646315,0.080052,-0.857104,0.613477,0.555974,0.686027,0.659037,-0.668294


In [36]:
# #Create a new dataframe for scaled and centered values
# scaled_df = pd.DataFrame()

# #Scale and center the values
# scaled_df[y_var] = scale(pop_count, with_mean=True, with_std=True)


# for x in x_vars:
#     scaled_df[x] = scale(spfeas_world_pop_merged[x], with_mean=True, with_std=True)

# scaled_df = scaled_df.round(3)

In [37]:


# Y = preprocessing.minmax_scale(vars_df[y_var])
# name='pop_sum'
# Y = pd.DataFrame(Y)
# #X = pd.DataFrame()

# X = scaler.fit_transform(vars_df[x_vars])
# X = pd.DataFrame(X, columns=x_vars)
# Y.head()

In [38]:
#X.head()

### Scale the variables

### Set Elastic net's parameters

In [39]:
enet_result = ElasticNetCV(max_iter=1e8,
                    alphas = [0.0005, 0.001, 0.01, 0.03, 0.05, 0.1],
                    l1_ratio =[.1, .5, .7, .9, .95, .99, 1],
                    verbose= False,
                    n_jobs = -1, 
                    cv=5, 
                    selection = 'random',
                    fit_intercept=False)


In [40]:
scaled_df.shape

(723, 201)

In [41]:
# Fit the mode

In [42]:
#Fit the model with the scaled data
enet_result.fit(scaled_df[x_vars],scaled_df[y_var])
#Append the model to the Models dictionary
Models[y_var] = enet_result


In [43]:
enet_result

ElasticNetCV(alphas=[0.0005, 0.001, 0.01, 0.03, 0.05, 0.1], copy_X=True, cv=5,
       eps=0.001, fit_intercept=False,
       l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1], max_iter=100000000.0,
       n_alphas=100, n_jobs=-1, normalize=False, positive=False,
       precompute='auto', random_state=None, selection='random',
       tol=0.0001, verbose=False)

In [44]:
opt_alpha, opt_l1_ratio = enet_result.alpha_, enet_result.l1_ratio_

#Print update to ensure that the script is progressing properly
print("R2: {:.2f} Alpha: {} l1_ratio: {}"
      .format(enet_result.score(scaled_df[x_vars],scaled_df[y_var]),
              enet_result.alpha_, enet_result.l1_ratio_))

R2: 0.70 Alpha: 0.1 l1_ratio: 0.1


Record the overall R squared score and optimal alpha 
and l1 ratio values and store them in the output dictionary


In [45]:
Y_D[y_var]['Total R2'] = enet_result.score(scaled_df[x_vars],scaled_df[y_var])
Y_D[y_var]['Alpha'] = opt_alpha
Y_D[y_var]['l1_ratio'] = opt_l1_ratio

### Ten Fold Cross validated regression

In [50]:
#Create a list R2s to store out of sample R squared values

R2s = []

#Specify the number of trials to run

trials = 10

#Run the number of trials specified in trials, 
#for each trial 66% of the observations are randomly selected to train the model
#Testing is done on the remaining 33% of observations and the R squared values are recorded

for i in range(trials):    
    X_train, X_test, y_train, y_test = train_test_split(scaled_df[x_vars],scaled_df[y_var], test_size=0.34)
    enet_regr = ElasticNetCV(max_iter=1e8,
                    alphas = [opt_alpha],
                    l1_ratio =[opt_l1_ratio],
                    n_jobs = -1, 
                    cv=5, 
                    selection = 'random',
                    fit_intercept=False)
    enet_regr.fit(X_train,y_train)
    R2s.append(enet_regr.score(X_test,y_test))

#print("Mean R2: {:.2f} StDev: {:.4f}".format(mean(R2s),pstdev(R2s)))
#Record the out of sample R squared values
Y_D[y_var]['Sampling']={'trials':trials,'R2':mean(R2s),'StDev':pstdev(R2s),'R2s':R2s}
#coefs = [i for i in zip(list(scaled_df[x_vars].axes[1]),enet_result.coef_)]
#remaining = [i for i in coefs if abs(i[1])>0.0]
#Y_D[y_var]["Coefficients"]=remaining

In [51]:
Y_D[y_var]

{'Total R2': 0.6991194573596675,
 'Alpha': 0.1,
 'l1_ratio': 0.1,
 'Sampling': {'trials': 10,
  'R2': 0.6713192756912206,
  'StDev': 0.023774726717250057,
  'R2s': [0.6961748560096979,
   0.6632594490409282,
   0.6796491226434911,
   0.6829699237910698,
   0.6756039104722994,
   0.6507783724228104,
   0.6805422697001263,
   0.6103889748153356,
   0.691402650463971,
   0.6824232275524764]},
 'Coefficients': [('gabor_sc7_filter_11_mean', 0.03995779596893736),
  ('gabor_sc7_filter_9_mean', 0.03795331640474447),
  ('gabor_sc5_filter_9_mean', 0.023476706021465604),
  ('gabor_sc7_filter_13_mean', 0.03144773799159832),
  ('gabor_sc5_filter_11_mean', 0.021220828590185304),
  ('gabor_sc5_filter_7_mean', 0.021657351815607337),
  ('gabor_sc7_filter_7_mean', 0.03121511728895684),
  ('gabor_sc7_filter_5_mean', 0.028170059644529014),
  ('gabor_sc5_filter_5_mean', 0.021249430230141413),
  ('mean_sc7_mean_mean', 0.01872806707648392),
  ('gabor_sc5_filter_3_mean', 0.02060640781475518),
  ('gabor_sc7_fi

In [49]:
y_df = pd.DataFrame([i for i in zip(list(scaled_df[x_vars].axes[1]),enet_result.coef_)], 
                    columns=["features","Coeff"]).sort_values("Coeff", ascending=False)

y_df.head(25)

Unnamed: 0,features,Coeff
80,hog_sc5_skew_std,0.062598
129,sfs_sc31_mean_mean,0.045858
0,gabor_sc7_filter_11_mean,0.039958
27,gabor_sc7_filter_12_mean,0.039622
1,gabor_sc7_filter_9_mean,0.037953
84,sfs_sc31_std_std,0.036235
3,gabor_sc7_filter_13_mean,0.031448
6,gabor_sc7_filter_7_mean,0.031215
188,orb_sc51_variance_std,0.030833
7,gabor_sc7_filter_5_mean,0.02817


## ONLY HOG

In [66]:
filter_var = [col for col in scaled_df if col.startswith('hog')]
X_train, X_test, y_train, y_test = train_test_split(scaled_df[filter_var],scaled_df[y_var], test_size=0.34)
enet_result.fit(X_train,y_train)

Models[y_var] = enet_result

opt_alpha, opt_l1_ratio = enet_result.alpha_, enet_result.l1_ratio_


print("R2: {:.2f} Alpha: {} l1_ratio: {}"
      .format(enet_result.score(scaled_df[filter_hog],scaled_df[y_var]),
              enet_result.alpha_, enet_result.l1_ratio_))


R2: 0.53 Alpha: 0.03 l1_ratio: 0.1


## GABOR

In [64]:
filter_var = [col for col in scaled_df if col.startswith('gabor')]
Models[y_var] = enet_result
X_train, X_test, y_train, y_test = train_test_split(scaled_df[filter_var],scaled_df[y_var], test_size=0.34)
enet_result.fit(X_train,y_train)

Models[y_var] = enet_result

opt_alpha, opt_l1_ratio = enet_result.alpha_, enet_result.l1_ratio_


print("R2: {:.2f} Alpha: {} l1_ratio: {}"
      .format(enet_result.score(scaled_df[filter_var],scaled_df[y_var]),
              enet_result.alpha_, enet_result.l1_ratio_))


R2: 0.68 Alpha: 0.001 l1_ratio: 0.95


## LBPM

In [62]:
filter_var = [col for col in scaled_df if col.startswith('lbpm')]
Models[y_var] = enet_result
enet_result.fit(scaled_df[filter_var],scaled_df[y_var])

Models[y_var] = enet_result

opt_alpha, opt_l1_ratio = enet_result.alpha_, enet_result.l1_ratio_


print("R2: {:.2f} Alpha: {} l1_ratio: {}"
      .format(enet_result.score(scaled_df[filter_var],scaled_df[y_var]),
              enet_result.alpha_, enet_result.l1_ratio_))


R2: 0.51 Alpha: 0.001 l1_ratio: 0.1


In [63]:
filter_var = [col for col in scaled_df if col.startswith('sfs')]
Models[y_var] = enet_result
enet_result.fit(scaled_df[filter_var],scaled_df[y_var])

Models[y_var] = enet_result

opt_alpha, opt_l1_ratio = enet_result.alpha_, enet_result.l1_ratio_


print("R2: {:.2f} Alpha: {} l1_ratio: {}"
      .format(enet_result.score(scaled_df[filter_var],scaled_df[y_var]),
              enet_result.alpha_, enet_result.l1_ratio_))


R2: 0.56 Alpha: 0.01 l1_ratio: 0.1
