## Spatial Features Vs Urban Footprint Vs Population Size

In [1]:
import sklearn
import pandas as pd
import numpy as np
import csv
import scipy.stats as stats
from statistics import pstdev
from statistics import mean
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from itertools import product
import copy


### Read the Features data

In [2]:
!ls spfeas

Belize_spfeas.xlsx


In [3]:
spfeas = pd.read_excel('spfeas/Belize_spfeas.xlsx', sheet_name=0)

In [4]:
spfeas['OBJECTID'] = spfeas['OBJECTID'].astype(int)
spfeas = spfeas.set_index('OBJECTID')

In [5]:
spfeas.head()

Unnamed: 0_level_0,FID,Administra,Administ_1,Area,Urban_Rura,CTV_2018,ED_2018,Cluster_Nu,fourier_sc31_variance_mean,fourier_sc31_variance_std,...,sfs_sc51_std_sum,sfs_sc71_max_ratio_of_orthgonal_angles_mean,sfs_sc71_max_ratio_of_orthgonal_angles_std,sfs_sc71_max_ratio_of_orthgonal_angles_sum,sfs_sc31_std_mean,sfs_sc31_std_std,sfs_sc31_std_sum,sfs_sc71_min_line_length_mean,sfs_sc71_min_line_length_std,sfs_sc71_min_line_length_sum
OBJECTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,Toledo District,BLZ006,Toledo Rural,Rural,Cayes,217,665,1.87101,4.929879,...,42799652.0,140.70989,1.0652,4286729472,1.193974,1.408849,36374452.0,874775.350589,330970.94884,26650047676400
2,1,Stann Creek District,BLZ005,Stann Creek Rural,Rural,Cayes,201,602,2.608672,6.141846,...,92699568.0,140.376936,4.624142,7790784512,1.375928,1.689124,76362680.0,824859.818661,380083.243379,45778923946000
3,2,,,,,Cayes,126,0,6.380702,8.840365,...,10851945.0,139.21994,8.347895,461388384,2.41311,2.69909,7997279.0,614737.965726,486655.09781,2037301248000
4,3,,,,,Cayes,122,0,2.894536,6.357571,...,65703932.0,139.365071,12.484677,5233991680,1.457598,1.841355,54741532.0,814288.179976,388871.179731,30581389787100
5,4,,,,,Cayes,123,0,4.566548,7.459406,...,44759044.0,140.065461,2.836504,2403786496,1.953999,2.401075,33534290.0,692178.805971,461589.477238,11879088914400


Check data

Convert the GN_Code column to an integer index 

### Load the Global Urban Foot Print Data

In [6]:
#guf = pd.read_csv("guf.csv")

In [7]:
#guf['gid'] = guf['gid'].astype(int)
#guf = guf.set_index('gid')

In [8]:
#guf.head()

### Load World Pop Data

In [9]:
world_pop = pd.read_csv("world_pop/blz_ppp_v2b_2015_UNadj_qgis.csv")

In [10]:
world_pop['OBJECTID'] = world_pop['OBJECTID'].astype(int)
world_pop = world_pop.set_index('OBJECTID')

In [11]:
world_pop.head()

Unnamed: 0_level_0,Administra,Administ_1,Area,Urban_Rura,CTV_2018,ED_2018,Cluster_Nu,Shape_Leng,Shape_Area,_count,_sum,_mean
OBJECTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,Toledo District,BLZ006,Toledo Rural,Rural,Cayes,217,665,294022.358507,2934572000.0,535.0,6.136642,0.01147
2,Stann Creek District,BLZ005,Stann Creek Rural,Rural,Cayes,201,602,431082.304407,5345558000.0,1246.0,262.607626,0.210761
3,,,,,Cayes,126,0,69417.179782,319206400.0,1931.0,43.015828,0.022276
4,,,,,Cayes,122,0,247650.43079,3617347000.0,12463.0,108.512611,0.008707
5,,,,,Cayes,123,0,240469.134053,1652992000.0,1942.0,1265.180093,0.651483


In [12]:
# Total Population size

In [13]:
np.sum(world_pop['_sum'])

344026.97329303942

### Merge population data with urbanfootprint and featuures

In [14]:
#guf_spfeas_merged = spfeas.merge(guf, left_on='gnd_c', right_on='code', how='outer')

In [15]:
spfeas_world_pop_merged = spfeas.merge(world_pop, left_on='OBJECTID', right_on='OBJECTID', how='outer')

In [16]:
spfeas_world_pop_merged = spfeas_world_pop_merged.round(3)
spfeas_world_pop_merged.head()

Unnamed: 0_level_0,FID,Administra_x,Administ_1_x,Area_x,Urban_Rura_x,CTV_2018_x,ED_2018_x,Cluster_Nu_x,fourier_sc31_variance_mean,fourier_sc31_variance_std,...,Area_y,Urban_Rura_y,CTV_2018_y,ED_2018_y,Cluster_Nu_y,Shape_Leng,Shape_Area,_count,_sum,_mean
OBJECTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,Toledo District,BLZ006,Toledo Rural,Rural,Cayes,217,665,1.871,4.93,...,Toledo Rural,Rural,Cayes,217,665,294022.359,2934572000.0,535.0,6.137,0.011
2,1,Stann Creek District,BLZ005,Stann Creek Rural,Rural,Cayes,201,602,2.609,6.142,...,Stann Creek Rural,Rural,Cayes,201,602,431082.304,5345558000.0,1246.0,262.608,0.211
3,2,,,,,Cayes,126,0,6.381,8.84,...,,,Cayes,126,0,69417.18,319206400.0,1931.0,43.016,0.022
4,3,,,,,Cayes,122,0,2.895,6.358,...,,,Cayes,122,0,247650.431,3617347000.0,12463.0,108.513,0.009
5,4,,,,,Cayes,123,0,4.567,7.459,...,,,Cayes,123,0,240469.134,1652992000.0,1942.0,1265.18,0.651


### Filter Dataset by Builtup Surface. 

Select Rows where  builtup is greater than or equal to 10 percent

In [17]:
#filter_by_builtup = guf_spfeas_world_pop_merged[guf_spfeas_world_pop_merged['PCNT_built']>=0.1]

In [18]:
#filter_by_builtup.head()

In [19]:
#np.sum(filter_by_builtup['stats_sum_y'])

In [20]:
#pop_density = filter_by_builtup['stats_sum_y']/filter_by_builtup['GN_area']

In [21]:
pop_count = spfeas_world_pop_merged['_sum']

In [22]:
#filter_by_builtup.shape, pop_count.shape

In [23]:
import matplotlib.pyplot as plt
plt.subplots(figsize=(8,8))
y = pop_count
x = spfeas_world_pop_merged['fourier_sc31_variance_mean']
plt.scatter(x, y)

z = np.polyfit(x, y, 1)
p = np.poly1d(z)
plt.plot(x,p(x),"r--")
plt.show()

<Figure size 800x800 with 1 Axes>

In [24]:
spfeas_world_pop_merged.shape

(723, 452)

### Analysis

Get the list of dependent variables from the DataFrame to store in list y_vars

In [25]:
y_var = list(spfeas_world_pop_merged.axes[1])[450]
y_var

'_sum'

Get a list of all independent variables from the DataFrame in list all_x

In [26]:
all_x = list(spfeas_world_pop_merged.axes[1])[8:440]

#Check
all_x

['fourier_sc31_variance_mean',
 'fourier_sc31_variance_std',
 'fourier_sc31_variance_sum',
 'fourier_sc71_mean_mean',
 'fourier_sc71_mean_std',
 'fourier_sc71_mean_sum',
 'fourier_sc51_mean_mean',
 'fourier_sc51_mean_std',
 'fourier_sc51_mean_sum',
 'fourier_sc31_mean_mean',
 'fourier_sc31_mean_std',
 'fourier_sc31_mean_sum',
 'fourier_sc51_variance_mean',
 'fourier_sc51_variance_std',
 'fourier_sc51_variance_sum',
 'fourier_sc71_variance_mean',
 'fourier_sc71_variance_std',
 'fourier_sc71_variance_sum',
 'gabor_sc3_filter_5_mean',
 'gabor_sc3_filter_5_std',
 'gabor_sc3_filter_5_sum',
 'gabor_sc3_filter_4_mean',
 'gabor_sc3_filter_4_std',
 'gabor_sc3_filter_4_sum',
 'gabor_sc3_filter_7_mean',
 'gabor_sc3_filter_7_std',
 'gabor_sc3_filter_7_sum',
 'gabor_sc3_filter_6_mean',
 'gabor_sc3_filter_6_std',
 'gabor_sc3_filter_6_sum',
 'gabor_sc3_filter_1_mean',
 'gabor_sc3_filter_1_std',
 'gabor_sc3_filter_1_sum',
 'gabor_sc3_mean_mean',
 'gabor_sc3_mean_std',
 'gabor_sc3_mean_sum',
 'gabor_sc

### Compute Coorelation of features with population density

Store all features with the least correlation (stat. significance (p < 0.05)) 

The Pearson correlation coefficient measures the linear relationship
between two datasets. Strictly speaking, Pearson's correlation requires
that each dataset be normally distributed, and not necessarily zero-mean.

Like other correlation coefficients, this one varies between -1 and +1
with 0 implying no correlation. Correlations of -1 or +1 imply an exact
linear relationship. Positive correlations imply that as x increases, so
does y. Negative correlations imply that as x increases, y decreases.

The p-value roughly indicates the probability of an uncorrelated system
producing datasets that have a Pearson correlation at least as extreme
as the one computed from these datasets. The p-values are not entirely
reliable but are probably reasonable for datasets larger than 500 or so.

In [27]:
y_dict = {}
x = []

for x_var in all_x:
    
    #Calculate the Pearson statistics, 
    # returns the Pearson value and p value
    
    p = stats.skew(spfeas_world_pop_merged[x_var])
    
    print(y_var, x_var, p)
    # print back for mike
    #print (y_var + " , " + x_var + " , " +  p)

_sum fourier_sc31_variance_mean -1.4616923765132483
_sum fourier_sc31_variance_std 1.5229818957230805
_sum fourier_sc31_variance_sum 7.394662134735632
_sum fourier_sc71_mean_mean -0.6771168726779779
_sum fourier_sc71_mean_std 3.0014159816521495
_sum fourier_sc71_mean_sum 7.862943729619969
_sum fourier_sc51_mean_mean -0.5599512064399966
_sum fourier_sc51_mean_std 2.0478761429705132
_sum fourier_sc51_mean_sum 7.8997584889507655
_sum fourier_sc31_mean_mean -0.40134411736103376
_sum fourier_sc31_mean_std 1.0693539416211884
_sum fourier_sc31_mean_sum 7.9379202480789575
_sum fourier_sc51_variance_mean -1.540668291729564
_sum fourier_sc51_variance_std 1.896037348436762
_sum fourier_sc51_variance_sum 7.292575318101691
_sum fourier_sc71_variance_mean -1.5661091142446384
_sum fourier_sc71_variance_std 2.2590809559440994
_sum fourier_sc71_variance_sum 7.260798343531496
_sum gabor_sc3_filter_5_mean 0.5520925849606606
_sum gabor_sc3_filter_5_std 1.4834270344090212
_sum gabor_sc3_filter_5_sum 7.2743

_sum lbpm_sc7_skew_mean -0.22949881460081964
_sum lbpm_sc7_skew_std -0.034573763180087404
_sum lbpm_sc7_skew_sum 7.42566394053456
_sum lbpm_sc7_kurtosis_mean -0.06845368360518306
_sum lbpm_sc7_kurtosis_std 0.020182739702669292
_sum lbpm_sc7_kurtosis_sum 6.866326848772566
_sum lbpm_sc7_max_mean 1.5975666246533897
_sum lbpm_sc7_max_std 0.034850823288314826
_sum lbpm_sc7_max_sum 13.361106031549898
_sum lbpm_sc3_skew_mean -0.5873692750177618
_sum lbpm_sc3_skew_std -0.09213583816319569
_sum lbpm_sc3_skew_sum 8.18617591466823
_sum lbpm_sc3_kurtosis_mean -0.4215198731629711
_sum lbpm_sc3_kurtosis_std -1.3288106205449028
_sum lbpm_sc3_kurtosis_sum 7.511332356300035
_sum lbpm_sc5_max_mean 1.4459311095735556
_sum lbpm_sc5_max_std -0.05991697607347857
_sum lbpm_sc5_max_sum 13.28826873179224
_sum lbpm_sc7_variance_mean 2.5561784540883092
_sum lbpm_sc7_variance_std 0.7506429172945557
_sum lbpm_sc7_variance_sum 14.543536870927081
_sum lbpm_sc5_kurtosis_mean -0.22312626691864282
_sum lbpm_sc5_kurtosi

In [30]:
y_dict = {}
x = []

for x_var in all_x:
    
    #Calculate the Pearson statistics, 
    # returns the Pearson value and p value
    
    p = stats.pearsonr(spfeas_world_pop_merged[x_var],pop_count)
    
    # print back for mike
    print (y_var + " , " + x_var + " , " +  str(p[0]) + " , " + str(p[1]))
    
    #If p < 0.05 append to list x
    if p[1] < 0.05:
        x.append([x_var,abs(p[0])])

#List x is made into a DataFrame 
# which is sorted by the absolute values of the Pearson values
x_df = pd.DataFrame(x,columns=["x_var","abs_r2"]).sort_values("abs_r2",ascending=False)


#The dependent variable dictionary is given an entry 
# where the key is the name of the dependent variable
# and the value is a list of top 200 most significant values

y_dict[y_var] = list(x_df["x_var"][0:200])

#Print out each dependent variable and 
#the number of x values that remain to check completion

_sum , fourier_sc31_variance_mean , 0.138978962701 , 0.000177754405549
_sum , fourier_sc31_variance_std , -0.122948261419 , 0.000923769029304
_sum , fourier_sc31_variance_sum , 0.165513164032 , 7.69074989603e-06
_sum , fourier_sc71_mean_mean , 0.20207218502 , 4.23682220943e-08
_sum , fourier_sc71_mean_std , -0.165147188919 , 8.05989317448e-06
_sum , fourier_sc71_mean_sum , 0.16392965013 , 9.41347615991e-06
_sum , fourier_sc51_mean_mean , 0.199896728647 , 5.94476351768e-08
_sum , fourier_sc51_mean_std , -0.185236459538 , 5.28654095921e-07
_sum , fourier_sc51_mean_sum , 0.16661144862 , 6.67733747124e-06
_sum , fourier_sc31_mean_mean , 0.200597705875 , 5.33231592559e-08
_sum , fourier_sc31_mean_std , -0.199079867812 , 6.74444128035e-08
_sum , fourier_sc31_mean_sum , 0.169134838726 , 4.80962457153e-06
_sum , fourier_sc51_variance_mean , 0.129060035023 , 0.000503653692574
_sum , fourier_sc51_variance_std , -0.114716450745 , 0.0020053488473
_sum , fourier_sc51_variance_sum , 0.16078842902 , 

In [31]:
#check 

for key in y_dict.keys():
    print(key,len(y_dict[key]))


_sum 200


### Correlation Significance

For each dependent variable y in the list of all dependent values, calibrate the model.
Add new key to the output dictionary where y is the dependent variable curently being processed and the values are empty for now

In [32]:
#Initialize the output dictionary, Y_D, 
# with each key being a dependent variable and the values being the results of the analyses

Y_D = {}

Y_D[y_var]={}

#Dictionary Models is used to store each result object for later use if needed

Models ={}

#Get independent variables from the variable dictionary and store in list x_vars
x_vars = y_dict[y_var]


vars_df = pd.DataFrame()

vars_df[y_var] = spfeas_world_pop_merged[y_var]


for x in x_vars:
    vars_df[x] = spfeas_world_pop_merged[x]

In [33]:
vars_df.head()

Unnamed: 0_level_0,_sum,orb_sc51_variance_std,gabor_sc7_filter_14_mean,orb_sc31_variance_std,gabor_sc5_filter_14_mean,orb_sc71_variance_std,gabor_sc7_filter_12_mean,gabor_sc5_filter_12_mean,orb_sc31_mean_std,mean_sc7_variance_mean,...,hog_sc7_max_sum,hog_sc7_variance_sum,hog_sc7_mean_sum,hog_sc3_mean_sum,mean_sc7_mean_std,hog_sc3_skew_sum,hog_sc5_skew_sum,hog_sc3_kurtosis_sum,hog_sc3_skew_mean,sfs_sc31_std_std
OBJECTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,6.137,0.05,14.12,0.017,6.724,0.089,13.998,6.698,0.006,31.713,...,1633047.625,109074.211,400523.625,277890.344,4.584,2208210000.0,4975729000.0,49103220000.0,72.472,1.409
2,262.608,0.165,19.366,0.045,9.212,0.367,19.153,9.171,0.013,43.306,...,3444789.75,219205.703,883100.125,658231.625,5.259,5473134000.0,12725780000.0,125809900000.0,98.618,1.689
3,43.016,0.355,54.734,0.101,25.966,0.737,54.097,25.844,0.024,121.695,...,426383.469,22852.414,121547.195,102249.82,7.913,863249900.0,2047544000.0,20087430000.0,260.487,2.699
4,108.513,0.058,19.639,0.019,9.357,0.108,19.503,9.329,0.007,44.129,...,2570719.0,154279.344,684993.438,522591.188,5.289,4353483000.0,10308280000.0,100527200000.0,115.926,1.841
5,1265.18,0.166,36.086,0.04,17.15,0.457,35.695,17.069,0.011,80.619,...,1813205.25,106684.805,483537.156,378277.469,6.736,3064286000.0,7068155000.0,69332640000.0,178.558,2.401


In [34]:
vars_df.isnull().values.any()

False

## Normalize

In [35]:
from sklearn import preprocessing


#norm_df = preprocessing.normalize(vars_df)

#names = vars_df.columns
#scaled_df = scaler(vars_df)
#norm_df = pd.DataFrame(norm_df, columns=names

### Min-MaxScaler

In [38]:

# Y = preprocessing.minmax_scale(vars_df[y_var])
# name='pop_sum'
# Y = pd.DataFrame(Y)
# #X = pd.DataFrame()

# X = scaler.fit_transform(vars_df[x_vars])
# X = pd.DataFrame(X, columns=x_vars)
# Y.head()

In [39]:
quantile_scaler = preprocessing.scale()

In [71]:
minmax_scaler = preprocessing.MinMaxScaler()
robust_scaler = preprocessing.RobustScaler()
names = vars_df.columns
scaled_df = robust_scaler.fit_transform(vars_df)
scaled_df = pd.DataFrame(scaled_df, columns=names)
scaled_df.head()

Unnamed: 0,_sum,orb_sc51_variance_std,gabor_sc7_filter_14_mean,orb_sc31_variance_std,gabor_sc5_filter_14_mean,orb_sc71_variance_std,gabor_sc7_filter_12_mean,gabor_sc5_filter_12_mean,orb_sc31_mean_std,mean_sc7_variance_mean,...,hog_sc7_max_sum,hog_sc7_variance_sum,hog_sc7_mean_sum,hog_sc3_mean_sum,mean_sc7_mean_std,hog_sc3_skew_sum,hog_sc5_skew_sum,hog_sc3_kurtosis_sum,hog_sc3_skew_mean,sfs_sc31_std_std
0,-0.430752,-0.345786,-0.919301,-0.356546,-0.91517,-0.352174,-0.936556,-0.91112,-0.583333,-0.91941,...,21.953652,25.798419,20.287623,17.621932,2.76151,17.410232,17.077269,17.439645,-1.942371,-0.531157
1,0.112525,-0.14596,-0.868722,-0.200557,-0.866133,-0.110435,-0.886081,-0.862443,-0.4375,-0.87149,...,46.336765,51.867312,44.769367,41.791172,3.383057,43.21885,43.752854,44.760972,-1.886833,-0.323442
2,-0.352632,0.184188,-0.527724,0.111421,-0.535921,0.211304,-0.543927,-0.534264,-0.208333,-0.547471,...,5.713909,5.389118,6.13478,6.460646,5.826888,6.778572,6.998494,7.104818,-1.543005,0.425816
3,-0.213891,-0.331885,-0.86609,-0.345404,-0.863275,-0.335652,-0.882654,-0.859333,-0.5625,-0.868088,...,34.573189,36.49879,34.719153,33.171739,3.410681,34.368222,35.431862,35.755803,-1.850069,-0.210682
4,2.236252,-0.144222,-0.707517,-0.228412,-0.709679,-0.032174,-0.72411,-0.706985,-0.479167,-0.717258,...,24.378282,25.23283,24.499009,24.001152,4.743094,24.177355,24.279375,24.644949,-1.717032,0.204748


### Scale the variables

In [60]:
# #Create a new dataframe for scaled and centered values
# scaled_df = pd.DataFrame()

# #Scale and center the values
# scaled_df[y_var] = scale(pop_count, with_mean=True, with_std=True)


# for x in x_vars:
#     scaled_df[x] = scale(spfeas_world_pop_merged[x], with_mean=True, with_std=True)

# scaled_df = scaled_df.round(3)

### Set Elastic net's parameters

In [61]:

result = ElasticNetCV(max_iter=1e8,
                    alphas = [0.0005, 0.001, 0.01, 0.03, 0.05, 0.1],
                    l1_ratio =[.1, .5, .7, .9, .95, .99, 1],
                    verbose= False,
                    n_jobs = -1, 
                    cv=5, 
                    selection = 'random',
                    fit_intercept=False)


In [62]:
scaled_df.shape

(723, 201)

In [63]:
# Fit the mode

In [64]:
#Fit the model with the scaled data
result.fit(scaled_df[x_vars],scaled_df[y_var])
#Append the model to the Models dictionary
Models[y_var] = result


In [57]:
result

ElasticNetCV(alphas=[0.0005, 0.001, 0.01, 0.03, 0.05, 0.1], copy_X=True, cv=5,
       eps=0.001, fit_intercept=False,
       l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1], max_iter=100000000.0,
       n_alphas=100, n_jobs=-1, normalize=False, positive=False,
       precompute='auto', random_state=None, selection='random',
       tol=0.0001, verbose=False)

In [66]:
opt_alpha, opt_l1_ratio = result.alpha_, result.l1_ratio_

#Print update to ensure that the script is progressing properly
print("R2: {:.2f} Alpha: {} l1_ratio: {}"
      .format(result.score(scaled_df[x_vars],scaled_df[y_var]),
              result.alpha_, result.l1_ratio_))

R2: 0.15 Alpha: 0.1 l1_ratio: 1.0


Record the overall R squared score and optimal alpha 
and l1 ratio values and store them in the output dictionary


In [67]:
Y_D[y_var]['Total R2'] = result.score(scaled_df[x_vars],scaled_df[y_var])
Y_D[y_var]['Alpha'] = opt_alpha
Y_D[y_var]['l1_ratio'] = opt_l1_ratio

### Ten Fold Cross validated regression

In [68]:
#Create a list R2s to store out of sample R squared values

R2s = []

#Specify the number of trials to run

trials = 10

#Run the number of trials specified in trials, 
#for each trial 34% of the observations are randomly selected to train the model
#Testing is done on the remaining 66% of observations and the R squared values are recorded

for i in range(trials):    
    X_train, X_test, y_train, y_test = train_test_split(scaled_df[x_vars],scaled_df[y_var], test_size=0.66)
    test_regr = ElasticNetCV(max_iter=1e8,
                    alphas = [opt_alpha],
                    l1_ratio =[opt_l1_ratio],
                    n_jobs = -1, 
                    cv=5, 
                    selection = 'random',
                    fit_intercept=False)
    test_regr.fit(X_train,y_train)
    R2s.append(test_regr.score(X_test,y_test))

#print("Mean R2: {:.2f} StDev: {:.4f}".format(mean(R2s),pstdev(R2s)))
#Record the out of sample R squared values
Y_D[y_var]['Sampling']={'trials':trials,'R2':mean(R2s),'StDev':pstdev(R2s),'R2s':R2s}
coefs = [i for i in zip(list(scaled_df[x_vars].axes[1]),result.coef_)]
remaining = [i for i in coefs if abs(i[1])>0.0]
Y_D[y_var]["Coefficients"]=remaining

In [69]:
Y_D[y_var]

{'Total R2': 0.15022938368209371,
 'Alpha': 0.10000000000000001,
 'l1_ratio': 1.0,
 'Sampling': {'trials': 10,
  'R2': 0.096955524794979292,
  'StDev': 0.017565504229756054,
  'R2s': [0.086740999540138919,
   0.082111583872249039,
   0.11241073515881117,
   0.099827436202382547,
   0.10642146275462971,
   0.10990039845788258,
   0.10751541003271736,
   0.08799013315539439,
   0.057371647471978322,
   0.11926544130360894]},
 'Coefficients': [('orb_sc51_variance_std', 0.24427371896237485),
  ('gabor_sc7_filter_14_mean', 0.16888980880893451),
  ('orb_sc71_variance_std', 0.0043338986510297215),
  ('orb_sc71_variance_mean', 0.027288639264800969),
  ('gabor_sc3_filter_14_std', 0.010756878176986657),
  ('hog_sc5_skew_std', 0.079192058310745425),
  ('ndvi_sc7_mean_sum', 0.010316078489919304),
  ('ndvi_sc5_mean_sum', 0.012295296197832076),
  ('ndvi_sc3_mean_sum', 0.0099508733046463262),
  ('lsr_sc31_line_mean_sum', -0.012671149613084758),
  ('hog_sc7_variance_sum', 0.016257601560910714),
  ('ho

In [65]:
# Lasso Regression

In [None]:
# Lasso
from sklearn.linear_model import Lasso