# Geothermal and Machine Learning Sandbox

# Prepare Nevada PFA Geothermal Resources Dataset

In [1]:
import numpy as np
import matplotlib.pyplot as plt

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

import datetime
import time

from pathlib import Path
import os, sys

from tqdm.notebook import trange, tqdm

## import local module for preprocessing PFA data

In [4]:
# The local preprocessing module is imported from the current working directory,
# so that directory has to be on the module search path.
myPath = os.getcwd()
if myPath not in sys.path:
    # Guarded so that re-running this cell does not stack duplicate entries.
    sys.path.append(myPath)

import PFA_preprocessing as pfa

In [5]:
myPath

'/home/srbrown/Desktop/UNR/Ingeneous_ML_Sandbox/Essentials/Geothermal_ML_git_archive/modules/data_preprocessing'

## load PFA data

In [6]:
# Folder holding the pandas dataframes, written relative to myPath.
# (A previous layout used '/../../../datasets/pandas_dataframes/' instead.)
dataPath = f'{myPath}/../../datasets/pandas_dataframes/'

In [7]:
dataPath

'/home/srbrown/Desktop/UNR/Ingeneous_ML_Sandbox/Essentials/Geothermal_ML_git_archive/modules/data_preprocessing/../../datasets/pandas_dataframes/'

In [8]:
# PFA feature dataframe, stored in the archive under the key 'df_features'.
filename = 'PFA_dataframe_grid_march2021_df.h5'
df_features = pd.read_hdf(f'{dataPath}{filename}', key='df_features')

In [9]:
# Structural-settings lookup table; note the stored key is spelled 'df_SSLookup'
# (capital L) while the notebook variable is df_SSlookup.
filename = 'PFA_structuralSettingsLookup_march2021_df.h5'
df_SSlookup = pd.read_hdf(f'{dataPath}{filename}', key='df_SSLookup')

In [10]:
df_features.tail()

Unnamed: 0,row,column,id_rc,X_83UTM11,Y_83UTM11,NullInfo,TrainCodeNeg,TrainCodePos,TrainCodePosT130,PosSite130_Id,...,GeodeticStrainRate_Error,QuaternarySlipRate_Error,FaultRecency_Error,Earthquakes_Error,Heatflow_Error,HorizGravityGradient2_Confidence,HorizMagneticGradient2_Confidence,Hillshade-100m,DEM-30m,Fairway
1727995,999.0,1723.0,R1000C1724,717077.690182,4508464.0,nullValue,12,12,12,14,...,0.537311,0.788923,0.403113,0.453099,22.451099,0.190986,4,180,1703,-9999.0
1727996,999.0,1724.0,R1000C1725,717327.690182,4508464.0,nullValue,12,12,12,14,...,0.536572,0.788923,0.403113,0.468096,22.432301,0.203718,4,180,1705,-9999.0
1727997,999.0,1725.0,R1000C1726,717577.690182,4508464.0,nullValue,12,12,12,14,...,0.535831,0.788923,0.403113,0.468096,22.4135,0.190986,4,180,1703,-9999.0
1727998,999.0,1726.0,R1000C1727,717827.690182,4508464.0,nullValue,12,12,12,14,...,0.535089,0.788923,0.403113,0.468096,22.3946,0.165521,4,180,1702,-9999.0
1727999,999.0,1727.0,R1000C1728,718077.690182,4508464.0,nullValue,12,12,12,14,...,0.534346,0.788923,0.403113,0.468096,22.375601,0.165521,4,180,1702,-9999.0


In [11]:
df_features.shape

(1728000, 54)

In [12]:
df_SSlookup.head()

Unnamed: 0,Local_polygon_Id,Local_polygon_overlap_Id,Label,Label2,Label3
0,1,1,AZ,,
1,2,2,FT,,
2,3,3,SO,,
3,4,4,SO,,
4,7,7,SO,,


In [13]:
df_SSlookup.shape

(375, 5)

In [14]:
df_features.columns

Index(['row', 'column', 'id_rc', 'X_83UTM11', 'Y_83UTM11', 'NullInfo',
       'TrainCodeNeg', 'TrainCodePos', 'TrainCodePosT130', 'PosSite130_Id',
       'PosSite130_Distance', 'PosSite_Id', 'PosSite_Distance', 'NegSite_Id',
       'NegSite_Distance', 'Local_polygon_Id', 'Local_polygon_overlap_Id',
       'Local-StructuralSetting', 'Local-QuaternaryFaultRecency',
       'Local-QuaternaryFaultSlipDilation', 'Local-QuaternaryFaultSlipRate',
       'QuaternaryFaultTraces', 'GeodeticStrainRate', 'QuaternarySlipRate',
       'FaultRecency', 'FaultSlipDilationTendency2', 'Earthquakes',
       'HorizGravityGradient2', 'HorizMagneticGradient2', 'GravityDensity',
       'MagneticDensity', 'Heatflow', 'GeochemistryTemperature2',
       'Tufa_Distance', 'Travertine_Distance', 'Silica_Distance',
       'TufaOrTravertine_Distance', 'FavorableStructuralSettings_Distance',
       'Local-StructuralSetting_Error', 'Local-QuaternaryFaultRecency_Error',
       'Local-QuaternaryFaultSlipDilation_Error',
 

## preprocess data

In [15]:
# Extra feature columns handed to the preprocessing call via `extraFeatures`.
# Other settings tried earlier: ['HorizMagneticGradient2', 'DEM-30m'] and None.
extraFeatures = [
    'GravityDensity',
    'MagneticDensity',
    'GeochemistryTemperature2',
    'Silica_Distance',
    'TufaOrTravertine_Distance',
    'DEM-30m',
    'FavorableStructuralSettings_Distance',
]

In [16]:
# Transform specification passed to the preprocessing routine as `transformFeatures`.
# The three lists have one entry per feature; presumably entry i of 'transforms'
# and 'params' applies to entry i of 'features' (confirm in PFA_preprocessing).
# Earlier variants of this cell used an empty dict, or only
# {'features': ['QuaternaryFaultTraces'], 'transforms': ['distance_edt'], 'params': [20]}.
transformDict = {
    'features': [
        'Local-StructuralSetting',
        'Local-QuaternaryFaultRecency',
        'Local-QuaternaryFaultSlipDilation',
        'Local-QuaternaryFaultSlipRate',
        'QuaternaryFaultTraces',
    ],
    # four gaussian-filtered features followed by one distance transform
    'transforms': ['gaussianFilter'] * 4 + ['distance_edt'],
    'params': [10] * 4 + [20],
}

# All-numerical preprocessing with prescaling switched on.
# Earlier experiments called pfa.preprocess_features_LocalNumerical or
# pfa.preprocess_features_LocalCategorical here instead, and one run of this
# call used withMean=False.
dfn, dfInfo, nullIndexes, scaler = pfa.preprocess_features_AllNumerical(
    df_features,
    transformFeatures=transformDict,
    extraFeatures=extraFeatures,
    prescaleFeatures=True,
    withMean=True,
)

In [17]:
# Number of feature columns produced by preprocessing.
nFeatures = dfn.shape[1]
print(nFeatures)

20


In [18]:
# dfn = dfn.drop(dfn.columns[0:4], axis=1)

In [19]:
# categorical structural settings

# dfc, dfInfo, nullIndexes, scaler = pfa.preprocess_features_LocalCategorical(df_features, df_SSlookup,
#                                                                              resetLocal=None,
#                                                                              extraFeatures=extraFeatures)
# dfc, dfInfo, nullIndexes, scaler = pfa.preprocess_features_LocalCategorical(df_features, df_SSlookup,
#                                                                              df_features, 
#                                                                              resetLocal='random')

In [20]:
# print (len(scaler))
# print ('')
# print (scaler[0].scale_)
# print (scaler[0].mean_)
# print (scaler[0].var_)

In [21]:
dfn.columns

Index(['Local-StructuralSetting', 'Local-QuaternaryFaultRecency',
       'Local-QuaternaryFaultSlipDilation', 'Local-QuaternaryFaultSlipRate',
       'QuaternaryFaultTraces', 'HorizGravityGradient2',
       'HorizMagneticGradient2', 'GeodeticStrainRate', 'QuaternarySlipRate',
       'FaultRecency', 'FaultSlipDilationTendency2', 'Earthquakes', 'Heatflow',
       'GravityDensity', 'MagneticDensity', 'GeochemistryTemperature2',
       'Silica_Distance', 'TufaOrTravertine_Distance', 'DEM-30m',
       'FavorableStructuralSettings_Distance'],
      dtype='object')

## select benchmark sites based on trainCode distance

### set random number seed

In [22]:
seed = 10

In [23]:
# Seed NumPy's global generator immediately before the benchmark selection;
# presumably makeBenchmarks draws from it when randomize=True.
np.random.seed(seed)

# Benchmark sites for trainCode=2 without class balancing
# (the balanced variant of this call used balance=True).
X_pfa, y_pfa, XyInfo = pfa.makeBenchmarks(
    dfn,
    dfInfo,
    nullIndexes,
    trainCode=2,
    randomize=True,
    balance=False,
)


Number of (+):  415   ; Number of (-):  310
Index(['Local-StructuralSetting', 'Local-QuaternaryFaultRecency',
       'Local-QuaternaryFaultSlipDilation', 'Local-QuaternaryFaultSlipRate',
       'QuaternaryFaultTraces', 'HorizGravityGradient2',
       'HorizMagneticGradient2', 'GeodeticStrainRate', 'QuaternarySlipRate',
       'FaultRecency', 'FaultSlipDilationTendency2', 'Earthquakes', 'Heatflow',
       'GravityDensity', 'MagneticDensity', 'GeochemistryTemperature2',
       'Silica_Distance', 'TufaOrTravertine_Distance', 'DEM-30m',
       'FavorableStructuralSettings_Distance', 'labels'],
      dtype='object')
Index(['row', 'column', 'id_rc', 'X_83UTM11', 'Y_83UTM11', 'NullInfo',
       'TrainCodeNeg', 'TrainCodePos', 'TrainCodePosT130', 'PosSite130_Id',
       'PosSite130_Distance', 'PosSite_Id', 'PosSite_Distance', 'NegSite_Id',
       'NegSite_Distance', 'Local_polygon_Id', 'Local_polygon_overlap_Id'],
      dtype='object')


In [24]:
# option to save all data info so that we can extract row,col locations of benchmark sites for plotting

# X_pfa, y_pfa, XyInfo = pfa.makeBenchmarks(dfn, dfInfo, nullIndexes, 
#                                           trainCode=1, randomize=False, balance=False)

# # hf5File = 'benchmark_sites_february2021_tc2_df.h5'
# hf5File = 'benchmark_sites_february2021_tc1_df.h5'

# XyInfo.to_hdf(hf5File, 'XyInfo', format='table', mode='a')

In [25]:
print( X_pfa.shape, y_pfa.shape)

(725, 20) (725,)


In [26]:
X_pfa.head()

Unnamed: 0,Local-StructuralSetting,Local-QuaternaryFaultRecency,Local-QuaternaryFaultSlipDilation,Local-QuaternaryFaultSlipRate,QuaternaryFaultTraces,HorizGravityGradient2,HorizMagneticGradient2,GeodeticStrainRate,QuaternarySlipRate,FaultRecency,FaultSlipDilationTendency2,Earthquakes,Heatflow,GravityDensity,MagneticDensity,GeochemistryTemperature2,Silica_Distance,TufaOrTravertine_Distance,DEM-30m,FavorableStructuralSettings_Distance
1629211,0.605907,1.214984,3.19154,1.120474,-0.784042,0.956721,-0.383781,-1.553196,3.963975,-2.485171,0.451266,-1.352377,0.856456,-1.447685,0.042378,7.965358,-1.153277,-1.748728,0.071722,-1.439888
792748,2.75382,0.973401,0.219568,2.368457,0.743874,2.838552,-0.631091,-1.014735,-1.16264,1.180387,1.218827,0.250527,-1.224496,-1.678404,0.500513,-0.185826,0.993788,1.746996,-0.003075,0.524243
1427233,-0.532809,-0.58623,-0.4029,-0.584992,1.248917,1.6279,-0.435304,-1.246016,-0.087931,0.565757,-0.099213,-2.18292,0.426947,-1.332326,1.416784,-0.185826,0.055923,1.52673,-0.137194,1.095007
630421,0.627393,1.615954,-0.183165,1.500129,-1.064303,-1.286707,-0.662004,-0.813587,-1.262306,0.897017,0.459076,0.14234,-0.693443,0.97486,0.844115,-0.185826,1.71907,1.030348,0.948657,-1.130844
741007,-0.108018,-0.38465,-0.362674,-0.394128,-0.965216,-0.928128,-0.971141,-0.811781,-1.198261,0.90934,0.206954,-0.170018,-0.52882,-0.178733,1.416784,-0.185826,1.529526,1.298969,0.732003,-0.038541


In [27]:
y_pfa.head()

1629211    1
792748     0
1427233    0
630421     0
741007     0
Name: labels, dtype: int64

In [28]:
XyInfo

Unnamed: 0,row,column,id_rc,X_83UTM11,Y_83UTM11,NullInfo,TrainCodeNeg,TrainCodePos,TrainCodePosT130,PosSite130_Id,PosSite130_Distance,PosSite_Id,PosSite_Distance,NegSite_Id,NegSite_Distance,Local_polygon_Id,Local_polygon_overlap_Id
1629211,942.0,1435.0,R943C1436,645077.690182,4.494214e+06,notNull,12,1,1,14,84.074090,14,84.074090,4,10574.081138,315,0
792748,458.0,1324.0,R459C1325,617327.690182,4.373214e+06,notNull,2,12,12,76,109719.824992,47,28269.390720,48,224.802810,24,0
1427233,825.0,1633.0,R826C1634,694577.690182,4.464964e+06,notNull,1,12,12,14,57577.584385,25,49848.864916,31,83.994037,0,0
630421,364.0,1429.0,R365C1430,643577.690182,4.349714e+06,notNull,2,12,12,13,88734.532066,55,38087.135264,41,173.133020,37,0
741007,428.0,1423.0,R429C1424,642077.690182,4.365714e+06,notNull,2,12,12,13,104234.526475,47,37025.554552,33,218.270209,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1486117,860.0,37.0,R861C38,295577.690182,4.473714e+06,notNull,12,2,2,3,283.719223,3,283.719223,62,59488.571007,0,0
1358832,786.0,624.0,R787C625,442327.690182,4.455214e+06,notNull,12,2,12,57,17594.785904,68,143.548189,5,63599.126130,0,0
727151,420.0,1391.0,R421C1392,634077.690182,4.363714e+06,notNull,1,12,12,13,101327.914872,47,36583.022500,57,63.243804,0,0
641753,371.0,665.0,R372C666,452577.690182,4.351464e+06,notNull,12,2,2,18,154.440589,18,154.440589,56,59532.903211,147,0


In [29]:
dfInfo

Unnamed: 0,row,column,id_rc,X_83UTM11,Y_83UTM11,NullInfo,TrainCodeNeg,TrainCodePos,TrainCodePosT130,PosSite130_Id,PosSite130_Distance,PosSite_Id,PosSite_Distance,NegSite_Id,NegSite_Distance,Local_polygon_Id,Local_polygon_overlap_Id
0,0.0,0.0,R1C1,286327.690182,4.258714e+06,notNull,12,12,12,9,82070.110132,61,34166.449170,22,122848.502942,0,0
1,0.0,1.0,R1C2,286577.690182,4.258714e+06,notNull,12,12,12,9,81993.349012,61,34115.759156,22,122726.185082,0,0
2,0.0,2.0,R1C3,286827.690182,4.258714e+06,notNull,12,12,12,9,81917.278930,61,34066.828392,22,122604.254961,0,0
3,0.0,3.0,R1C4,287077.690182,4.258714e+06,notNull,12,12,12,9,81841.901813,61,34019.664470,22,122482.713736,0,0
4,0.0,4.0,R1C5,287327.690182,4.258714e+06,notNull,12,12,12,9,81767.219578,61,33974.274746,22,122361.562567,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1727995,999.0,1723.0,R1000C1724,717077.690182,4.508464e+06,nullValue,12,12,12,14,73443.286671,14,73443.286671,31,49055.906980,0,0
1727996,999.0,1724.0,R1000C1725,717327.690182,4.508464e+06,nullValue,12,12,12,14,73688.591551,14,73688.591551,31,49171.357287,0,0
1727997,999.0,1725.0,R1000C1726,717577.690182,4.508464e+06,nullValue,12,12,12,14,73933.927886,14,73933.927886,31,49287.805239,0,0
1727998,999.0,1726.0,R1000C1727,717827.690182,4.508464e+06,nullValue,12,12,12,14,74179.295362,14,74179.295362,31,49405.243782,0,0


## write dataframes to hdf file archive

In [30]:
print( X_pfa.shape, y_pfa.shape, XyInfo.shape)

(725, 20) (725,) (725, 17)


In [31]:
# Name of the output archive.
filename = 'PFA_features&labels_march2021_tc2_imbalanced_SmoothLSS_FSS_df.h5'

# The archive goes in the folder one level above dataPath.
hf5File = f'{dataPath}/../{filename}'

In [32]:
hf5File

'/home/srbrown/Desktop/UNR/Ingeneous_ML_Sandbox/Essentials/Geothermal_ML_git_archive/modules/data_preprocessing/../../datasets/pandas_dataframes//../PFA_features&labels_march2021_tc2_imbalanced_SmoothLSS_FSS_df.h5'

In [33]:
####################################################
# Everything below is written into the single HDF5 file hf5File, one key per object.
# `key` is passed by keyword: newer pandas releases deprecate positional
# arguments to to_hdf other than the path.

####################################################
# add benchmark data
X_pfa.to_hdf(hf5File, key='X', format='table', mode='a')
y_pfa.to_hdf(hf5File, key='y', format='table', mode='a')
XyInfo.to_hdf(hf5File, key='XyInfo', format='table', mode='a')

####################################################
# add whole study area dataframes and info for inference
dfn.to_hdf(hf5File, key='dfn', format='table', mode='a')
dfInfo.to_hdf(hf5File, key='dfInfo', format='table', mode='a')

# nullIndexes and the scaler statistics are wrapped in pandas objects so they can
# be stored with to_hdf as well.
dfNullIndexes = pd.Series(nullIndexes)
# Rows 0, 1, 2 hold scale_, mean_ and var_ of the first scaler, in that order.
dfScaler = pd.DataFrame(np.array([scaler[0].scale_, scaler[0].mean_, scaler[0].var_]))

dfNullIndexes.to_hdf(hf5File, key='nullIndexes', format='table', mode='a')
dfScaler.to_hdf(hf5File, key='scaler', format='table', mode='a')
                        

## check contents of archive

In [34]:
import h5py


def keys(f):
    """Return the names yielded by ``f.keys()`` as a plain list.

    ``f`` is an open h5py file (or any object exposing a ``keys()`` method).
    """
    return list(f.keys())


# Open the archive read-only inside a context manager so the handle is closed
# even if listing the keys raises.
with h5py.File(hf5File, 'r') as f:
    key_list = keys(f)

print(key_list)

['X', 'XyInfo', 'dfInfo', 'dfn', 'nullIndexes', 'scaler', 'y']


In [35]:
# Read the benchmark tables back from the archive to check the round trip.
X_tst, y_tst, XyInfo_tst = (
    pd.read_hdf(hf5File, key=archiveKey) for archiveKey in ('X', 'y', 'XyInfo')
)

In [36]:
xx_tst = pd.read_hdf(hf5File, key='nullIndexes')

In [37]:
xx_tst

0         209080
1         209081
2         209082
3         209083
4         209084
          ...   
30523    1727995
30524    1727996
30525    1727997
30526    1727998
30527    1727999
Length: 30528, dtype: int64

In [38]:
print( X_tst.shape, y_tst.shape, XyInfo_tst.shape)

(725, 20) (725,) (725, 17)


In [39]:
X_tst.head()

Unnamed: 0,Local-StructuralSetting,Local-QuaternaryFaultRecency,Local-QuaternaryFaultSlipDilation,Local-QuaternaryFaultSlipRate,QuaternaryFaultTraces,HorizGravityGradient2,HorizMagneticGradient2,GeodeticStrainRate,QuaternarySlipRate,FaultRecency,FaultSlipDilationTendency2,Earthquakes,Heatflow,GravityDensity,MagneticDensity,GeochemistryTemperature2,Silica_Distance,TufaOrTravertine_Distance,DEM-30m,FavorableStructuralSettings_Distance
1629211,0.605907,1.214984,3.19154,1.120474,-0.784042,0.956721,-0.383781,-1.553196,3.963975,-2.485171,0.451266,-1.352377,0.856456,-1.447685,0.042378,7.965358,-1.153277,-1.748728,0.071722,-1.439888
792748,2.75382,0.973401,0.219568,2.368457,0.743874,2.838552,-0.631091,-1.014735,-1.16264,1.180387,1.218827,0.250527,-1.224496,-1.678404,0.500513,-0.185826,0.993788,1.746996,-0.003075,0.524243
1427233,-0.532809,-0.58623,-0.4029,-0.584992,1.248917,1.6279,-0.435304,-1.246016,-0.087931,0.565757,-0.099213,-2.18292,0.426947,-1.332326,1.416784,-0.185826,0.055923,1.52673,-0.137194,1.095007
630421,0.627393,1.615954,-0.183165,1.500129,-1.064303,-1.286707,-0.662004,-0.813587,-1.262306,0.897017,0.459076,0.14234,-0.693443,0.97486,0.844115,-0.185826,1.71907,1.030348,0.948657,-1.130844
741007,-0.108018,-0.38465,-0.362674,-0.394128,-0.965216,-0.928128,-0.971141,-0.811781,-1.198261,0.90934,0.206954,-0.170018,-0.52882,-0.178733,1.416784,-0.185826,1.529526,1.298969,0.732003,-0.038541


In [40]:
y_tst.head()

1629211    1
792748     0
1427233    0
630421     0
741007     0
Name: labels, dtype: int64

In [41]:
XyInfo_tst.head()

Unnamed: 0,row,column,id_rc,X_83UTM11,Y_83UTM11,NullInfo,TrainCodeNeg,TrainCodePos,TrainCodePosT130,PosSite130_Id,PosSite130_Distance,PosSite_Id,PosSite_Distance,NegSite_Id,NegSite_Distance,Local_polygon_Id,Local_polygon_overlap_Id
1629211,942.0,1435.0,R943C1436,645077.690182,4494214.0,notNull,12,1,1,14,84.07409,14,84.07409,4,10574.081138,315,0
792748,458.0,1324.0,R459C1325,617327.690182,4373214.0,notNull,2,12,12,76,109719.824992,47,28269.39072,48,224.80281,24,0
1427233,825.0,1633.0,R826C1634,694577.690182,4464964.0,notNull,1,12,12,14,57577.584385,25,49848.864916,31,83.994037,0,0
630421,364.0,1429.0,R365C1430,643577.690182,4349714.0,notNull,2,12,12,13,88734.532066,55,38087.135264,41,173.13302,37,0
741007,428.0,1423.0,R429C1424,642077.690182,4365714.0,notNull,2,12,12,13,104234.526475,47,37025.554552,33,218.270209,0,0


In [42]:
hf5File

'/home/srbrown/Desktop/UNR/Ingeneous_ML_Sandbox/Essentials/Geothermal_ML_git_archive/modules/data_preprocessing/../../datasets/pandas_dataframes//../PFA_features&labels_march2021_tc2_imbalanced_SmoothLSS_FSS_df.h5'