## Spatial Features Vs Urban Footprint Vs Population Size

Steps: 

    1) read zonal stats for spatial features (Independent vars) and GUF (Dependent var)
    2) merge the two files with id
    3) Locate the dependant and Independent variables (exclude columns like name)
    4) Standardize the dataset with μ=0 and σ=1  (using StandardScaler from sklearn)
    5) Run Pearson Correlation (Dependent vs Independent vars)
    6) Create a new dataset by selecting those independent variables with high statistical signficance 
    7) Split the new dataset into train(2/3) and test(1/3) sets
    7) Run Elastic Net with 10 fold cross validation
    
    

In [1]:
import sklearn
import pandas as pd
import numpy as np
import csv
import scipy.stats as stats
from statistics import pstdev
from statistics import mean
from sklearn import preprocessing
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.datasets import make_regression
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from itertools import product
import copy


### Read the Features data

In [2]:
spfeas = pd.read_excel('blz_spfeas_v3.xlsx', sheet_name=0)
spfeas['OBJECTID'] = spfeas['OBJECTID'].astype(int)
spfeas = spfeas.set_index('OBJECTID')
spfeas.head()

Unnamed: 0_level_0,FID,Administra,Administ_1,Area,Urban_Rura,CTV_2018,ED_2018,Cluster_Nu,fourier_sc31_variance_mean,fourier_sc31_variance_std,...,sfs_sc51_std_sum,sfs_sc71_max_ratio_of_orthgonal_angles_mean,sfs_sc71_max_ratio_of_orthgonal_angles_std,sfs_sc71_max_ratio_of_orthgonal_angles_sum,sfs_sc31_std_mean,sfs_sc31_std_std,sfs_sc31_std_sum,sfs_sc71_min_line_length_mean,sfs_sc71_min_line_length_std,sfs_sc71_min_line_length_sum
OBJECTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,Toledo District,BLZ006,Toledo Rural,Rural,Cayes,217,665,1.87101,4.929879,...,42799652.0,140.70989,1.0652,4286729472,1.193974,1.408849,36374452.0,874775.350589,330970.94884,26650047676400
2,1,Stann Creek District,BLZ005,Stann Creek Rural,Rural,Cayes,201,602,2.608672,6.141846,...,92699568.0,140.376936,4.624142,7790784512,1.375928,1.689124,76362680.0,824859.818661,380083.243379,45778923946000
3,2,,,,,Cayes,126,0,6.380702,8.840365,...,10851945.0,139.21994,8.347895,461388384,2.41311,2.69909,7997279.0,614737.965726,486655.09781,2037301248000
4,3,,,,,Cayes,122,0,2.894536,6.357571,...,65703932.0,139.365071,12.484677,5233991680,1.457598,1.841355,54741532.0,814288.179976,388871.179731,30581389787100
5,4,,,,,Cayes,123,0,4.566548,7.459406,...,44759044.0,140.065461,2.836504,2403786496,1.953999,2.401075,33534290.0,692178.805971,461589.477238,11879088914400


In [3]:
print(spfeas.columns)

Index(['FID', 'Administra', 'Administ_1', 'Area', 'Urban_Rura', 'CTV_2018',
       'ED_2018', 'Cluster_Nu', 'fourier_sc31_variance_mean',
       'fourier_sc31_variance_std',
       ...
       'sfs_sc51_std_sum', 'sfs_sc71_max_ratio_of_orthgonal_angles_mean',
       'sfs_sc71_max_ratio_of_orthgonal_angles_std',
       'sfs_sc71_max_ratio_of_orthgonal_angles_sum', 'sfs_sc31_std_mean',
       'sfs_sc31_std_std', 'sfs_sc31_std_sum', 'sfs_sc71_min_line_length_mean',
       'sfs_sc71_min_line_length_std', 'sfs_sc71_min_line_length_sum'],
      dtype='object', length=440)


### Load GUF

In [4]:
guf = pd.read_csv('blz_guf.csv')
guf['OBJECTID'] = guf['OBJECTID'].astype(int)
guf = guf.set_index('OBJECTID')
guf.head()

Unnamed: 0_level_0,Administra,Administ_1,Area,Urban_Rura,CTV_2018,ED_2018,Cluster_Nu,Shape_Leng,Shape_Area,guf_count,guf_sum,guf_mean
OBJECTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,Toledo District,BLZ006,Toledo Rural,Rural,Cayes,217,665,294022.358507,2934572000.0,19997649,15555,0.000778
2,Stann Creek District,BLZ005,Stann Creek Rural,Rural,Cayes,201,602,431082.304407,5345558000.0,29936453,105315,0.003518
3,,,,,Cayes,126,0,69417.179782,319206400.0,2201594,57375,0.026061
4,,,,,Cayes,122,0,247650.43079,3617347000.0,24922037,6120,0.000246
5,,,,,Cayes,123,0,240469.134053,1652992000.0,11387124,205530,0.018049


### Merge population data with urbanfootprint and featuures

In [5]:
spfeas_guf = spfeas.merge(guf, left_on='OBJECTID', right_on="OBJECTID", how='outer')

In [6]:
spfeas_guf = spfeas_guf.round(3)
spfeas_guf.head()

Unnamed: 0_level_0,FID,Administra_x,Administ_1_x,Area_x,Urban_Rura_x,CTV_2018_x,ED_2018_x,Cluster_Nu_x,fourier_sc31_variance_mean,fourier_sc31_variance_std,...,Area_y,Urban_Rura_y,CTV_2018_y,ED_2018_y,Cluster_Nu_y,Shape_Leng,Shape_Area,guf_count,guf_sum,guf_mean
OBJECTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,Toledo District,BLZ006,Toledo Rural,Rural,Cayes,217,665,1.871,4.93,...,Toledo Rural,Rural,Cayes,217,665,294022.359,2934572000.0,19997649,15555,0.001
2,1,Stann Creek District,BLZ005,Stann Creek Rural,Rural,Cayes,201,602,2.609,6.142,...,Stann Creek Rural,Rural,Cayes,201,602,431082.304,5345558000.0,29936453,105315,0.004
3,2,,,,,Cayes,126,0,6.381,8.84,...,,,Cayes,126,0,69417.18,319206400.0,2201594,57375,0.026
4,3,,,,,Cayes,122,0,2.895,6.358,...,,,Cayes,122,0,247650.431,3617347000.0,24922037,6120,0.0
5,4,,,,,Cayes,123,0,4.567,7.459,...,,,Cayes,123,0,240469.134,1652992000.0,11387124,205530,0.018


### Filter Dataset by Builtup Surface. 

Select Rows where  builtup is greater than or equal to 10 percent

In [7]:
#filter_by_builtup.shape

In [8]:
spfeas_guf.shape

(723, 452)

### Analysis

Get the list of dependent variables from the DataFrame to store in list y_vars

In [9]:
#y_var = list(filter_by_builtup.axes[1])[440]

y_var = list(spfeas_guf.axes[1])[451]
y_var

'guf_mean'

Get a list of all independent variables from the DataFrame in list all_x

In [10]:
all_x = list(spfeas_guf.axes[1])[8:440]

#Check
all_x

['fourier_sc31_variance_mean',
 'fourier_sc31_variance_std',
 'fourier_sc31_variance_sum',
 'fourier_sc71_mean_mean',
 'fourier_sc71_mean_std',
 'fourier_sc71_mean_sum',
 'fourier_sc51_mean_mean',
 'fourier_sc51_mean_std',
 'fourier_sc51_mean_sum',
 'fourier_sc31_mean_mean',
 'fourier_sc31_mean_std',
 'fourier_sc31_mean_sum',
 'fourier_sc51_variance_mean',
 'fourier_sc51_variance_std',
 'fourier_sc51_variance_sum',
 'fourier_sc71_variance_mean',
 'fourier_sc71_variance_std',
 'fourier_sc71_variance_sum',
 'gabor_sc3_filter_5_mean',
 'gabor_sc3_filter_5_std',
 'gabor_sc3_filter_5_sum',
 'gabor_sc3_filter_4_mean',
 'gabor_sc3_filter_4_std',
 'gabor_sc3_filter_4_sum',
 'gabor_sc3_filter_7_mean',
 'gabor_sc3_filter_7_std',
 'gabor_sc3_filter_7_sum',
 'gabor_sc3_filter_6_mean',
 'gabor_sc3_filter_6_std',
 'gabor_sc3_filter_6_sum',
 'gabor_sc3_filter_1_mean',
 'gabor_sc3_filter_1_std',
 'gabor_sc3_filter_1_sum',
 'gabor_sc3_mean_mean',
 'gabor_sc3_mean_std',
 'gabor_sc3_mean_sum',
 'gabor_sc

### Compute Coorelation of features with population density

Store all features with the least correlation (stat. significance (p < 0.05)) 

The Pearson correlation coefficient **measures the linear relationship
between two datasets.** Strictly speaking, Pearson's correlation requires
that each dataset be **normally distributed, and not necessarily zero-mean.**

Like other correlation coefficients, this one varies between -1 and +1
with 0 implying no correlation. Correlations of -1 or +1 imply an exact
linear relationship. Positive correlations imply that as x increases, so
does y. Negative correlations imply that as x increases, y decreases.

The p-value roughly indicates **the probability of an uncorrelated system**
producing datasets that have a Pearson correlation at least as extreme
as the one computed from these datasets. 

***The p-values are not entirely
reliable but are probably reasonable for datasets larger than 500 or so.***

In [11]:
spfeas_guf[y_var] = spfeas_guf[y_var].fillna(0)
spfeas_guf[y_var].isnull().values.any()

False

In [12]:
y_dict = {}
x = []

for x_var in all_x:
    
    #Calculate the Pearson statistics, 
    # returns the Pearson value and p value
    
    p = stats.skew(spfeas_guf[x_var])
    
    #print(y_var, x_var, p)
    # print back for mike
    #print (y_var + " , " + x_var + " , " +  p)

In [33]:
y_dict = {}
x = []

for x_var in all_x:
    
    #Calculate the Pearson statistics, 
    # returns the Pearson value and p value
    
    p = stats.pearsonr(spfeas_guf[x_var],spfeas_guf[y_var])
    
    #print back for mike
    print (y_var + " , " + x_var + " , " +  str(p[0]) + " , " + str(p[1]))
    
    #If p < 0.05 append to list x
    if p[1] < 0.05:
        x.append([x_var,abs(p[0])])

#List x is made into a DataFrame 
# which is sorted by the absolute values of the Pearson values
x_df = pd.DataFrame(x,columns=["x_var","abs_r2"]).sort_values("abs_r2",ascending=False)


#The dependent variable dictionary is given an entry 
# where the key is the name of the dependent variable
# and the value is a list of top 200 most significant values

y_dict[y_var] = list(x_df["x_var"][0:200])
#y_dict[y_var]
#Print out each dependent variable and 
#the number of x values that remain to check completion

guf_mean , fourier_sc31_variance_mean , 0.5531049153284179 , 3.5946710523852053e-59
guf_mean , fourier_sc31_variance_std , -0.6942343564705571 , 4.552340233930312e-105
guf_mean , fourier_sc31_variance_sum , -0.2504564341649655 , 8.375848718813707e-12
guf_mean , fourier_sc71_mean_mean , 0.7798984906413732 , 7.302622441699035e-149
guf_mean , fourier_sc71_mean_std , -0.6303140966908208 , 2.511384043908478e-81
guf_mean , fourier_sc71_mean_sum , -0.23728684299027752 , 1.0346860813575387e-10
guf_mean , fourier_sc51_mean_mean , 0.7889228438322723 , 1.2467057744687107e-154
guf_mean , fourier_sc51_mean_std , -0.6799596376807764 , 3.063311194116338e-99
guf_mean , fourier_sc51_mean_sum , -0.23891688124834207 , 7.640700334534671e-11
guf_mean , fourier_sc31_mean_mean , 0.8027781328674961 , 4.638625859734778e-164
guf_mean , fourier_sc31_mean_std , -0.7182128859013315 , 1.1424101839172244e-115
guf_mean , fourier_sc31_mean_sum , -0.24078827391621258 , 5.3797609073378695e-11
guf_mean , fourier_sc51_var

In [34]:
x_df.head(50)

Unnamed: 0,x_var,abs_r2
165,hog_sc5_kurtosis_mean,0.882898
189,hog_sc7_kurtosis_mean,0.87347
205,hog_sc3_mean_std,0.857705
393,sfs_sc71_mean_std,0.855795
416,sfs_sc51_std_std,0.855571
207,lac_sc3_lac_mean,0.855243
196,hog_sc3_variance_std,0.855034
384,sfs_sc51_max_line_length_std,0.851945
401,sfs_sc51_mean_std,0.847836
377,sfs_sc31_max_ratio_of_orthgonal_angles_mean,0.847779


In [35]:
#check 

for key in y_dict.keys():
    print(key,len(y_dict[key]))


guf_mean 200


### Correlation Significance

For each dependent variable y in the list of all dependent values, calibrate the model.
Add new key to the output dictionary where y is the dependent variable curently being processed and the values are empty for now

In [36]:
#Initialize the output dictionary, Y_D, 
# with each key being a dependent variable and the values being the results of the analyses

Y_D = {}

Y_D[y_var]={}

#Dictionary Models is used to store each result object for later use if needed

Models ={}

#Get independent variables from the variable dictionary and store in list x_vars
x_vars = y_dict[y_var]


vars_df = pd.DataFrame()

vars_df[y_var] = spfeas_guf[y_var]


for x in x_vars:
    vars_df[x] = spfeas_guf[x]

In [37]:
vars_df.head()

Unnamed: 0_level_0,guf_mean,hog_sc5_kurtosis_mean,hog_sc7_kurtosis_mean,hog_sc3_mean_std,sfs_sc71_mean_std,sfs_sc51_std_std,lac_sc3_lac_mean,hog_sc3_variance_std,sfs_sc51_max_line_length_std,sfs_sc51_mean_std,...,lbpm_sc7_mean_mean,orb_sc71_kurtosis_mean,orb_sc31_skew_mean,mean_sc5_variance_std,lbpm_sc3_mean_mean,lsr_sc31_line_mean_mean,orb_sc31_max_std,orb_sc71_skew_mean,gabor_sc3_mean_std,gabor_sc3_filter_7_std
OBJECTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.001,8572.192,16205.395,0.03,3.611,2.171,5.032,0.02,8.963,2.139,...,2.479,12832.994,139.765,41.083,0.456,0.01,0.212,180.055,6.29,6.289
2,0.004,14905.479,36392.528,0.034,3.9,2.543,5.12,0.021,10.37,2.498,...,2.476,19176.402,236.6,61.521,0.457,0.025,0.41,268.037,7.223,7.224
3,0.026,42670.318,111379.225,0.05,6.27,4.127,5.442,0.031,15.748,4.135,...,2.454,20623.241,369.236,102.843,0.456,0.043,0.771,289.146,10.871,10.872
4,0.0,18403.911,46792.398,0.037,4.205,2.718,5.157,0.023,10.53,2.726,...,2.459,13397.685,147.771,60.128,0.454,0.02,0.233,187.422,7.259,7.258
5,0.018,24319.113,52112.358,0.044,5.75,3.668,5.008,0.028,14.556,3.617,...,2.463,26168.934,315.683,69.789,0.455,0.025,0.405,365.633,9.25,9.25


### Scale/Normalize Data

In [38]:
#minmax_scaler = preprocessing.MinMaxScaler()
standard_scaler = preprocessing.StandardScaler()

names = vars_df.columns
scaled_df = standard_scaler.fit_transform(vars_df)
scaled_df = pd.DataFrame(scaled_df, columns=names)
scaled_df.head()

Unnamed: 0,guf_mean,hog_sc5_kurtosis_mean,hog_sc7_kurtosis_mean,hog_sc3_mean_std,sfs_sc71_mean_std,sfs_sc51_std_std,lac_sc3_lac_mean,hog_sc3_variance_std,sfs_sc51_max_line_length_std,sfs_sc51_mean_std,...,lbpm_sc7_mean_mean,orb_sc71_kurtosis_mean,orb_sc31_skew_mean,mean_sc5_variance_std,lbpm_sc3_mean_mean,lsr_sc31_line_mean_mean,orb_sc31_max_std,orb_sc71_skew_mean,gabor_sc3_mean_std,gabor_sc3_filter_7_std
0,-0.868299,-1.466496,-1.162112,0.228516,-0.442054,-1.010173,-0.769269,-1.098023,-0.642303,-1.153309,...,9.68485,-0.265775,-0.833535,-0.306638,3.295182,-1.341674,-1.05274,-0.314308,2.085719,2.09257
1,-0.868268,-1.43131,-1.140364,0.425437,-0.303468,-0.697011,-0.75704,-0.977374,-0.337376,-0.863187,...,9.401002,-0.101278,-0.80151,0.149147,3.57999,-1.3261,-0.852036,-0.156482,2.656299,2.66543
2,-0.868036,-1.277054,-1.059579,1.213121,0.833038,0.636455,-0.712293,0.229116,0.828153,0.459734,...,7.319452,-0.063758,-0.757646,1.070662,3.295182,-1.307411,-0.486104,-0.118615,4.887247,4.900503
3,-0.86831,-1.411873,-1.12916,0.573127,-0.157208,-0.54969,-0.751898,-0.736076,-0.3027,-0.678932,...,7.792531,-0.251131,-0.830887,0.118082,2.725567,-1.331291,-1.031454,-0.301092,2.678315,2.686262
4,-0.86812,-1.37901,-1.123429,0.917739,0.583678,0.250053,-0.772604,-0.132831,0.569821,0.041119,...,8.170995,0.080052,-0.775356,0.33353,3.010374,-1.3261,-0.857104,0.018591,3.895918,3.906729


In [39]:
# #Create a new dataframe for scaled and centered values
# scaled_df = pd.DataFrame()

# #Scale and center the values
# scaled_df[y_var] = scale(pop_count, with_mean=True, with_std=True)


# for x in x_vars:
#     scaled_df[x] = scale(spfeas_world_pop_merged[x], with_mean=True, with_std=True)

# scaled_df = scaled_df.round(3)

In [40]:


# Y = preprocessing.minmax_scale(vars_df[y_var])
# name='pop_sum'
# Y = pd.DataFrame(Y)
# #X = pd.DataFrame()

# X = scaler.fit_transform(vars_df[x_vars])
# X = pd.DataFrame(X, columns=x_vars)
# Y.head()

In [41]:
#X.head()

### Scale the variables

### Set Elastic net's parameters

In [42]:
enet_result = ElasticNetCV(max_iter=1e8,
                    alphas = [0.0005, 0.001, 0.01, 0.03, 0.05, 0.1],
                    l1_ratio =[.1, .5, .7, .9, .95, .99, 1],
                    verbose= False,
                    n_jobs = -1, 
                    cv=5, 
                    selection = 'random',
                    fit_intercept=False)


In [43]:
scaled_df.shape

(723, 201)

In [44]:
# Fit the mode

In [45]:
#Fit the model with the scaled data
enet_result.fit(scaled_df[x_vars],scaled_df[y_var])
#Append the model to the Models dictionary
Models[y_var] = enet_result


In [46]:
enet_result

ElasticNetCV(alphas=[0.0005, 0.001, 0.01, 0.03, 0.05, 0.1], copy_X=True, cv=5,
       eps=0.001, fit_intercept=False,
       l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1], max_iter=100000000.0,
       n_alphas=100, n_jobs=-1, normalize=False, positive=False,
       precompute='auto', random_state=None, selection='random',
       tol=0.0001, verbose=False)

In [47]:
opt_alpha, opt_l1_ratio = enet_result.alpha_, enet_result.l1_ratio_

#Print update to ensure that the script is progressing properly
print("R2: {:.2f} Alpha: {} l1_ratio: {}"
      .format(enet_result.score(scaled_df[x_vars],scaled_df[y_var]),
              enet_result.alpha_, enet_result.l1_ratio_))

R2: 0.90 Alpha: 0.01 l1_ratio: 0.1


Record the overall R squared score and optimal alpha 
and l1 ratio values and store them in the output dictionary


In [48]:
Y_D[y_var]['Total R2'] = enet_result.score(scaled_df[x_vars],scaled_df[y_var])
Y_D[y_var]['Alpha'] = opt_alpha
Y_D[y_var]['l1_ratio'] = opt_l1_ratio

### Ten Fold Cross validated regression

In [49]:
#Create a list R2s to store out of sample R squared values

R2s = []

#Specify the number of trials to run

trials = 10

#Run the number of trials specified in trials, 
#for each trial 66% of the observations are randomly selected to train the model
#Testing is done on the remaining 33% of observations and the R squared values are recorded

for i in range(trials):    
    X_train, X_test, y_train, y_test = train_test_split(scaled_df[x_vars],scaled_df[y_var], test_size=0.34)
    enet_regr = ElasticNetCV(max_iter=1e8,
                    alphas = [opt_alpha],
                    l1_ratio =[opt_l1_ratio],
                    n_jobs = -1, 
                    cv=5, 
                    selection = 'random',
                    fit_intercept=False)
    enet_regr.fit(X_train,y_train)
    R2s.append(enet_regr.score(X_test,y_test))

#print("Mean R2: {:.2f} StDev: {:.4f}".format(mean(R2s),pstdev(R2s)))
#Record the out of sample R squared values
Y_D[y_var]['Sampling']={'trials':trials,'R2':mean(R2s),'StDev':pstdev(R2s),'R2s':R2s}
#coefs = [i for i in zip(list(scaled_df[x_vars].axes[1]),enet_result.coef_)]
#remaining = [i for i in coefs if abs(i[1])>0.0]
#Y_D[y_var]["Coefficients"]=remaining

In [50]:
Y_D[y_var]

{'Total R2': 0.9002250532222461,
 'Alpha': 0.01,
 'l1_ratio': 0.1,
 'Sampling': {'trials': 10,
  'R2': 0.8612523446563781,
  'StDev': 0.023065739096964293,
  'R2s': [0.8801373602764369,
   0.8640005926204498,
   0.8691959046162878,
   0.8651283991546372,
   0.8260838403089498,
   0.8141135569186908,
   0.8788506747647068,
   0.8879644449102,
   0.8496406743013571,
   0.8774079986920645]}}

In [51]:
y_df = pd.DataFrame([i for i in zip(list(scaled_df[x_vars].axes[1]),enet_result.coef_)], 
                    columns=["features","Coeff"]).sort_values("Coeff", ascending=False)

y_df.head(25)

Unnamed: 0,features,Coeff
128,lbpm_sc5_max_std,0.217399
69,fourier_sc71_mean_mean,0.172057
159,mean_sc3_variance_std,0.14494
88,hog_sc5_mean_std,0.143438
55,sfs_sc31_std_std,0.135279
56,gabor_sc7_filter_14_mean,0.130393
98,ndvi_sc3_variance_mean,0.099113
89,hog_sc5_skew_std,0.090964
147,sfs_sc71_min_line_length_std,0.090292
17,gabor_sc7_filter_11_mean,0.089561


# Result

At national Scale, with GUF we can explain, 86% varriation of built up surface/human settlement



HOG, NDIV, SFS and MEAN are significant

## HOG

In [54]:
filter_var = [col for col in scaled_df if col.startswith('hog')]
X_train, X_test, y_train, y_test = train_test_split(scaled_df[filter_var],scaled_df[y_var], test_size=0.34)
enet_result.fit(X_train,y_train)

Models[y_var] = enet_result

opt_alpha, opt_l1_ratio = enet_result.alpha_, enet_result.l1_ratio_


print("R2: {:.2f} Alpha: {} l1_ratio: {}"
      .format(enet_result.score(scaled_df[filter_var],scaled_df[y_var]),
              enet_result.alpha_, enet_result.l1_ratio_))


R2: 0.83 Alpha: 0.001 l1_ratio: 0.1


## GABOR

In [55]:
filter_var = [col for col in scaled_df if col.startswith('gabor')]
X_train, X_test, y_train, y_test = train_test_split(scaled_df[filter_var],scaled_df[y_var], test_size=0.34)
enet_result.fit(X_train,y_train)

Models[y_var] = enet_result

opt_alpha, opt_l1_ratio = enet_result.alpha_, enet_result.l1_ratio_


print("R2: {:.2f} Alpha: {} l1_ratio: {}"
      .format(enet_result.score(scaled_df[filter_var],scaled_df[y_var]),
              enet_result.alpha_, enet_result.l1_ratio_))

R2: 0.76 Alpha: 0.0005 l1_ratio: 0.1


In [57]:
filter_var = [col for col in scaled_df if col.startswith('lbpm')]
X_train, X_test, y_train, y_test = train_test_split(scaled_df[filter_var],scaled_df[y_var], test_size=0.34)
enet_result.fit(X_train,y_train)

Models[y_var] = enet_result

opt_alpha, opt_l1_ratio = enet_result.alpha_, enet_result.l1_ratio_


print("R2: {:.2f} Alpha: {} l1_ratio: {}"
      .format(enet_result.score(scaled_df[filter_var],scaled_df[y_var]),
              enet_result.alpha_, enet_result.l1_ratio_))

R2: 0.80 Alpha: 0.0005 l1_ratio: 1.0
