# imports

In [1]:
# Import the modules of interest
import pandas as pd
import subprocess
import time
import datetime
import ee as ee
ee.Initialize()


# Initialise variables

In [2]:
# Input the name of the username that serves as the home folder for asset storage
usernameFolderString = 'acottam'

# Input the name of the project folder inside which all of the assets will be stored
# !! You should create this folder immediately under the home asset directory before the script is run
projectFolder = 'ETH_Global_Forest_Cover'

# Input the name of a folder used to hold the bootstrap collections
bootstrapCollFolder = 'Bootstrap_Collections_RegTest'

# Input the normal wait time (in seconds) for "wait and break" cells
normalWaitTime = 5

# Input a longer wait time (in seconds) for "wait and break" cells
longWaitTime = 10

# Input the Cloud Storage Bucket that will hold the bootstrap collections when uploading them to Earth Engine
# !! This bucket should be pre-created before running this script
bucketOfInterest = 'crowther_examples'

# Specify the column names where the latitude and longitude information is stored
latString = 'Lat'
longString = 'Long'

# Input the name of the property that holds the CV fold assignment (cross-validation)
cvFoldString = 'CV_Fold'

# Input the name of the classification property
classProperty = 'treecover'

# Input a list of the covariates being used
covariateList = ["CHELSA_Annual_Mean_Temperature",
  "CHELSA_Annual_Precipitation",
  "EarthEnvTopoMed_Elevation",
  "SG_Depth_to_bedrock",
  "CHELSA_Precipitation_Seasonality",
  "CHELSA_Mean_Temperature_of_Warmest_Quarter",
  "CHELSA_Precipitation_of_Driest_Quarter",
  "SG_Sand_Content_000cm",
  "SG_Sand_Content_005cm",
  "EarthEnvTopoMed_Northness",
  "EarthEnvTopoMed_Eastness"];

# Input the name of the folder inside which you want to store the cross validation collections and results
cvCollFolder = 'CV_Colls_RegTest'

# Input the name of the image collection inside which you'll store the bootstrapped images
# AC this doesnt need to be created before hand
bootstrapImageColl = 'Bootstrap_Images_RegTest'

# Load the composite on which to perform the mapping, and subselect the bands of interest
compositeToClassify = ee.Image("users/devinrouth/ETH_Composites/CrowtherLab_Composite_30ArcSec").select(covariateList)

# Input the header text that will name each bootstrapped dataset
fileNameHeader = 'ForestPotential2_BootstrapColl_RegTest_'

# Generate the seeds for bootstrapping
seedsToUseForBootstrapping = list(range(1, 6))

# Write the name of a local staging area folder for outputted CSV's
# holdingFolder = '/Users/DevinRouth/Downloads/ForestPotential2_BootstrapData'
holdingFolder = '/home/ubuntu/environment/jupyter/notebooks/crowther/downloads'

# Load a geometry to use for the export
# exportingGeometry = ee.Geometry.Polygon([-180, 88, 0, 88, 180, 88, 180, -88, 0, -88, -180, -88], None, False);
exportingGeometry = ee.Geometry.Polygon(
        [[[8.283486006858624, 47.492568637670196],
          [8.283486006858624, 47.011581664842936],
          [9.162392256858624, 47.011581664842936],
          [9.162392256858624, 47.492568637670196]]], None, False);

# Determine k for k fold CV
k = 4

# Input the number of points to use for each bootstrap model
# !! This should be chosen carefully, as it will determine the size of the bootstrap collections and models
# bootstrapModelSize = 100
bootstrapModelSize = 2

# Input the model type; i.e., is this a classification (on categorical data) or a regression (on continuous data)?
# !! Options should be inputted are 'CLASSIFICATION' or 'REGRESSION'
modelType = 'REGRESSION'

# Input the title of the CSV that will hold all of the data that has been given a CV fold assignment
titleOfCSVWithCVAssignments = "CV_Fold_Collection_RegTest"

# Input the title of the CV Accuracy Feature Collection
cvAccuracyFCNameString = "CV_Accuracy_FC_RegTest"


In [3]:
# Specify the necessary arguments to upload the files to a Cloud Storage bucket
# I.e., create bash variables in order to create/check/delete Earth Engine Assets

# Specify main bash functions being used
# !! You must specify the full path of the executable if the executable is not scoped from root
# bashFunction_EarthEngine = '/Users/DevinRouth/Library/Python/3.7/bin/earthengine'
bashFunction_EarthEngine = 'earthengine' # tested and works fine!
bashFunctionGSUtil = 'gsutil'

# Specify the arguments to these functions
arglist_preEEUploadTable = ['--no-use_cloud_api','upload','table']
arglist_postEEUploadTable = ['--x_column', longString, '--y_column', latString]
arglist_preGSUtilUploadFile = ['cp']
formattedBucketOI = 'gs://'+bucketOfInterest
assetIDStringPrefix = '--asset_id='
arglist_CreateCollection = ['--no-use_cloud_api','create','collection']
arglist_CreateFolder = ['--no-use_cloud_api','create','folder']
arglist_Detect = ['--no-use_cloud_api','asset','info']
arglist_Delete = ['--no-use_cloud_api','rm','-r']
stringsOfInterest = ['Asset does not exist or is not accessible']

# Compose the arguments into lists that can be run via the subprocess module
bashCommandList_Detect = [bashFunction_EarthEngine]+arglist_Detect
bashCommandList_Delete = [bashFunction_EarthEngine]+arglist_Delete
bashCommandList_CreateCollection = [bashFunction_EarthEngine]+arglist_CreateCollection
bashCommandList_CreateFolder = [bashFunction_EarthEngine]+arglist_CreateFolder



# Read in Forest potential sample data

In [4]:
%%time
# Import the raw CSV being bootstrapped
rawPointCollection = pd.read_csv('20200303_ForestPotential_Samples.csv',float_precision='round_trip')


CPU times: user 534 ms, sys: 31.9 ms, total: 565 ms
Wall time: 571 ms


## Clean sample data

In [5]:
# Print basic information on the csv
print(rawPointCollection.columns)
print(rawPointCollection.shape)


Index(['system:index', 'CHELSA_Annual_Mean_Temperature',
       'CHELSA_Annual_Precipitation',
       'CHELSA_Mean_Temperature_of_Warmest_Quarter',
       'CHELSA_Precipitation_Seasonality',
       'CHELSA_Precipitation_of_Driest_Quarter', 'EarthEnvTopoMed_Eastness',
       'EarthEnvTopoMed_Elevation', 'EarthEnvTopoMed_Northness', 'Lat', 'Long',
       'Resolve_Biome', 'SG_Depth_to_bedrock', 'SG_Sand_Content_000cm',
       'SG_Sand_Content_005cm', 'shrubcover', 'treecover', '.geo'],
      dtype='object')
(75772, 18)


In [6]:
%%time
# Remove the "system:index" and rename the ".geo" column to "geo" and shuffle the data frame while setting a new index
# (to ensure geographic clumps of points are not clumped in anyway)
preppedCollection = rawPointCollection.drop(['system:index','.geo'], axis=1).sample(frac=1).reset_index(drop=True)
print(preppedCollection.columns)
print(preppedCollection.shape)
print(preppedCollection.dtypes)


Index(['CHELSA_Annual_Mean_Temperature', 'CHELSA_Annual_Precipitation',
       'CHELSA_Mean_Temperature_of_Warmest_Quarter',
       'CHELSA_Precipitation_Seasonality',
       'CHELSA_Precipitation_of_Driest_Quarter', 'EarthEnvTopoMed_Eastness',
       'EarthEnvTopoMed_Elevation', 'EarthEnvTopoMed_Northness', 'Lat', 'Long',
       'Resolve_Biome', 'SG_Depth_to_bedrock', 'SG_Sand_Content_000cm',
       'SG_Sand_Content_005cm', 'shrubcover', 'treecover'],
      dtype='object')
(75772, 16)
CHELSA_Annual_Mean_Temperature                float64
CHELSA_Annual_Precipitation                   float64
CHELSA_Mean_Temperature_of_Warmest_Quarter    float64
CHELSA_Precipitation_Seasonality              float64
CHELSA_Precipitation_of_Driest_Quarter        float64
EarthEnvTopoMed_Eastness                      float64
EarthEnvTopoMed_Elevation                     float64
EarthEnvTopoMed_Northness                     float64
Lat                                           float64
Long                   

In [7]:
# Drop NAs
print('Original Collection with NA values included')
print(preppedCollection.shape)
print(preppedCollection.isna().sum())
print('\n')
print('Cleaned Collection with NA values excluded')
preppedCollection = preppedCollection.dropna(how='any')
print(preppedCollection.shape)
print(preppedCollection.isna().sum())


Original Collection with NA values included
(75772, 16)
CHELSA_Annual_Mean_Temperature                3121
CHELSA_Annual_Precipitation                   3121
CHELSA_Mean_Temperature_of_Warmest_Quarter    3121
CHELSA_Precipitation_Seasonality              3121
CHELSA_Precipitation_of_Driest_Quarter        3121
EarthEnvTopoMed_Eastness                      3094
EarthEnvTopoMed_Elevation                     3094
EarthEnvTopoMed_Northness                     3094
Lat                                              0
Long                                             0
Resolve_Biome                                  153
SG_Depth_to_bedrock                           3093
SG_Sand_Content_000cm                         3093
SG_Sand_Content_005cm                         3093
shrubcover                                       0
treecover                                        0
dtype: int64


Cleaned Collection with NA values excluded
(72524, 16)
CHELSA_Annual_Mean_Temperature                0
CHELSA_Ann

In [8]:
# Make a list of the k-fold CV assignments to use
kList = list(range(1,k+1))
print(kList)


[1, 2, 3, 4]


In [9]:
# Add fold assignments to each of the points, stratified by biome
preppedCollection[cvFoldString] = (preppedCollection.groupby('Resolve_Biome').cumcount() % k) + 1
print(preppedCollection.columns)
print(preppedCollection.shape)


Index(['CHELSA_Annual_Mean_Temperature', 'CHELSA_Annual_Precipitation',
       'CHELSA_Mean_Temperature_of_Warmest_Quarter',
       'CHELSA_Precipitation_Seasonality',
       'CHELSA_Precipitation_of_Driest_Quarter', 'EarthEnvTopoMed_Eastness',
       'EarthEnvTopoMed_Elevation', 'EarthEnvTopoMed_Northness', 'Lat', 'Long',
       'Resolve_Biome', 'SG_Depth_to_bedrock', 'SG_Sand_Content_000cm',
       'SG_Sand_Content_005cm', 'shrubcover', 'treecover', 'CV_Fold'],
      dtype='object')
(72524, 17)


In [10]:
# Test to ensure that the CV fold assignment has been done correctly
test = preppedCollection.loc[preppedCollection['Resolve_Biome'] == 7]
print(test.shape)
test[cvFoldString].value_counts()

(4559, 17)


3    1140
2    1140
1    1140
4    1139
Name: CV_Fold, dtype: int64

In [11]:
# Write the CSV to disk and upload it to Earth Engine as a Feature Collection
localPathToCVAssignedData = holdingFolder+'/'+titleOfCSVWithCVAssignments+'.csv'
preppedCollection.to_csv(localPathToCVAssignedData,index=False)


In [12]:
# Format the bash call to upload the file to the Google Cloud Storage bucket
gsutilBashUploadList = [bashFunctionGSUtil]+arglist_preGSUtilUploadFile+[localPathToCVAssignedData]+[formattedBucketOI]
subprocess.run(gsutilBashUploadList)
print(titleOfCSVWithCVAssignments+' uploaded to a GCSB!')

# Wait for a short period to ensure the command has been received by the server
time.sleep(normalWaitTime/2)


CV_Fold_Collection_RegTest uploaded to a GCSB!


In [13]:
# Wait for the GSUTIL uploading process to finish before moving on
while not all(x in subprocess.run([bashFunctionGSUtil,'ls',formattedBucketOI],stdout=subprocess.PIPE).stdout.decode('utf-8') for x in [titleOfCSVWithCVAssignments]):
    print('Not everything is uploaded...')
    time.sleep(normalWaitTime)
print('Everything is uploaded; moving on...')


Everything is uploaded; moving on...


In [14]:
# Upload the file into Earth Engine as a table asset
assetIDForCVAssignedColl = 'users/'+usernameFolderString+'/'+projectFolder+'/'+titleOfCSVWithCVAssignments
earthEngineUploadTableCommands = [bashFunction_EarthEngine]+arglist_preEEUploadTable+[assetIDStringPrefix+assetIDForCVAssignedColl]+[formattedBucketOI+'/'+titleOfCSVWithCVAssignments+'.csv']+arglist_postEEUploadTable
subprocess.run(earthEngineUploadTableCommands)
print('Upload to EE queued!')

# Wait for a short period to ensure the command has been received by the server
time.sleep(normalWaitTime/2)


Upload to EE queued!


In [15]:
# !! Break and wait
while any(x in str(ee.batch.Task.list()) for x in ['RUNNING','READY']):
    print('You have jobs running! ',datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    time.sleep(normalWaitTime)
print('Moving on...')


You have jobs running!  2020-04-05 18:36:39
You have jobs running!  2020-04-05 18:36:45
You have jobs running!  2020-04-05 18:36:50
You have jobs running!  2020-04-05 18:36:56
You have jobs running!  2020-04-05 18:37:02
You have jobs running!  2020-04-05 18:37:07
You have jobs running!  2020-04-05 18:37:13
You have jobs running!  2020-04-05 18:37:18
You have jobs running!  2020-04-05 18:37:23
You have jobs running!  2020-04-05 18:37:29
You have jobs running!  2020-04-05 18:37:34
You have jobs running!  2020-04-05 18:37:40
You have jobs running!  2020-04-05 18:37:45
You have jobs running!  2020-04-05 18:37:51
You have jobs running!  2020-04-05 18:37:56
You have jobs running!  2020-04-05 18:38:02
You have jobs running!  2020-04-05 18:38:07
Moving on...


# Classification

## Load the training data in GEE

In [16]:
# Load the collection with the pre-assigned K-Fold assignments
fcOI = ee.FeatureCollection(assetIDForCVAssignedColl)


## Create the Random Forest Classifiers

In [17]:
# Instantiate a selection of random forest classifiers to determine the best model (using features for wrapping, so the entire process can be processed via an export task)
rf_VP2 = ee.Feature(ee.Geometry.Point([0,0])).set('cName','rf_VP2','c',ee.Classifier.smileRandomForest(
    numberOfTrees=25,
    variablesPerSplit=2,
    bagFraction=0.632
).setOutputMode(modelType))

rf_VP3 = ee.Feature(ee.Geometry.Point([0,0])).set('cName','rf_VP3','c',ee.Classifier.smileRandomForest(
    numberOfTrees=25,
    variablesPerSplit=3,
    bagFraction=0.632
).setOutputMode(modelType))

rf_VP4 = ee.Feature(ee.Geometry.Point([0,0])).set('cName','rf_VP4','c',ee.Classifier.smileRandomForest(
    numberOfTrees=25,
    variablesPerSplit=4,
    bagFraction=0.632
).setOutputMode(modelType))

rf_VP5 = ee.Feature(ee.Geometry.Point([0,0])).set('cName','rf_VP5','c',ee.Classifier.smileRandomForest(
    numberOfTrees=25,
    variablesPerSplit=5,
    bagFraction=0.632
).setOutputMode(modelType))

rf_VP6 = ee.Feature(ee.Geometry.Point([0,0])).set('cName','rf_VP6','c',ee.Classifier.smileRandomForest(
    numberOfTrees=25,
    variablesPerSplit=6,
    bagFraction=0.632
).setOutputMode(modelType))

rf_VP7 = ee.Feature(ee.Geometry.Point([0,0])).set('cName','rf_VP7','c',ee.Classifier.smileRandomForest(
    numberOfTrees=25,
    variablesPerSplit=7,
    bagFraction=0.632
).setOutputMode(modelType))

rf_VP8 = ee.Feature(ee.Geometry.Point([0,0])).set('cName','rf_VP8','c',ee.Classifier.smileRandomForest(
    numberOfTrees=25,
    variablesPerSplit=8,
    bagFraction=0.632
).setOutputMode(modelType))

rf_VP9 = ee.Feature(ee.Geometry.Point([0,0])).set('cName','rf_VP9','c',ee.Classifier.smileRandomForest(
    numberOfTrees=25,
    variablesPerSplit=9,
    bagFraction=0.632
).setOutputMode(modelType))

rf_VP10 = ee.Feature(ee.Geometry.Point([0,0])).set('cName','rf_VP10','c',ee.Classifier.smileRandomForest(
    numberOfTrees=25,
    variablesPerSplit=10,
    bagFraction=0.632
).setOutputMode(modelType))

# Wrap all of the models into a feature collection for function mapping
classifierList = [rf_VP2,
                rf_VP3,
                rf_VP4,
                rf_VP5,
                rf_VP6,
                rf_VP7,
                rf_VP8,
                rf_VP9,
                rf_VP10]



## Create the folder to house the cross validation feature collections

In [18]:
# Turn the folder string into an assetID and perform the deletion
assetIDToCreate_Folder = 'users/'+usernameFolderString+'/'+projectFolder+'/'+cvCollFolder
print(assetIDToCreate_Folder,'being created...')

# Create the folder within Earth Engine
subprocess.run(bashCommandList_CreateFolder+[assetIDToCreate_Folder])
while any(x in subprocess.run(bashCommandList_Detect+[assetIDToCreate_Folder],stdout=subprocess.PIPE).stdout.decode('utf-8') for x in stringsOfInterest):
    print('Waiting for asset to be created...')
    time.sleep(normalWaitTime)
print('Asset created!')


# Sleep to allow the server time to receive incoming requests
time.sleep(normalWaitTime/2)


users/acottam/ETH_Global_Forest_Cover/CV_Colls_RegTest being created...
Asset created!


In [19]:
# According to the model/data type (classification/categorical versus regression/continuous), change variables that are used in the rest of the script
if modelType == 'CLASSIFICATION':
    categoricalLevels = [int(n) for n in list(ee.Dictionary(fcOI.aggregate_histogram(classProperty)).keys().getInfo())]
    print('Categorical levels are\n')
    print(categoricalLevels)
    pyramidingPolicy = 'mode'
    print("The pyramiding policy will be 'mode'.")
    accuracyMetricString = 'OverallAccuracy'
    print("The accuracy type used for crossvalidation will be 'overall accuracy'.")
else:
    print('No need to compute categorical levels!')
    print("The pyramiding policy will be 'mean'.")
    pyramidingPolicy = 'mean'
    print("The accuracy type used for cross validation will be 'coefficient of determination'(i.e., R^2).")
    accuracyMetricString = 'R2'


No need to compute categorical levels!
The pyramiding policy will be 'mean'.
The accuracy type used for cross validation will be 'coefficient of determination'(i.e., R^2).


In [20]:
# Define the R^2 function for use with continuous valued models (i.e., regression based models)
def coefficientOfDetermination(fcOI,propertyOfInterest,propertyOfInterest_Predicted):
    # Compute the mean of the property of interest
    propertyOfInterestMean = ee.Number(ee.Dictionary(ee.FeatureCollection(fcOI).select([propertyOfInterest]).reduceColumns(ee.Reducer.mean(),[propertyOfInterest])).get('mean'));
    
    # Compute the total sum of squares
    def totalSoSFunction(f):
        return f.set('Difference_Squared',ee.Number(ee.Feature(f).get(propertyOfInterest)).subtract(propertyOfInterestMean).pow(ee.Number(2)))
    totalSumOfSquares = ee.Number(ee.Dictionary(ee.FeatureCollection(fcOI).map(totalSoSFunction).select(['Difference_Squared']).reduceColumns(ee.Reducer.sum(),['Difference_Squared'])).get('sum'))
    
    # Compute the residual sum of squares
    def residualSoSFunction(f):
        return f.set('Residual_Squared',ee.Number(ee.Feature(f).get(propertyOfInterest)).subtract(ee.Number(ee.Feature(f).get(propertyOfInterest_Predicted))).pow(ee.Number(2)))
    residualSumOfSquares = ee.Number(ee.Dictionary(ee.FeatureCollection(fcOI).map(residualSoSFunction).select(['Residual_Squared']).reduceColumns(ee.Reducer.sum(),['Residual_Squared'])).get('sum'))
    
    # Finalize the calculation
    r2 = ee.Number(1).subtract(residualSumOfSquares.divide(totalSumOfSquares))
    
    return ee.Number(r2)


## Train and classify the cross validation feature collections

In [21]:
# Make a feature collection from the k-fold assignment list
# !! Note: this is used within the scope of the function below, so this should be defined
# !! explicitly in order for the computeCVAccuracy function to run
kFoldAssignmentFC = ee.FeatureCollection(ee.List(kList).map(lambda n: ee.Feature(ee.Geometry.Point([0,0])).set('Fold',n)))

# Define a function to take a feature with a classifier of interest
def computeCVAccuracy(featureWithClassifier):
    # Pull the classifier from the feature
    cOI = ee.Classifier(featureWithClassifier.get('c'))
    
    # Create a function to map through the fold assignments and compute the overall accuracy 
    # for all validation folds
    def computeAccuracyForFold(foldFeature):
        # Organize the training and validation data
        foldNumber = ee.Number(ee.Feature(foldFeature).get('Fold'))
        trainingData = fcOI.filterMetadata(cvFoldString,'not_equals',foldNumber)
        validationData = fcOI.filterMetadata(cvFoldString,'equals',foldNumber)
        # Train the classifier and classify the validation dataset
        trainedClassifier = cOI.train(trainingData,classProperty,covariateList)
        outputtedPropName = classProperty+'_Predicted'
        classifiedValidationData = validationData.classify(trainedClassifier,outputtedPropName)
        # Create a central if/then statement that determines the type of accuracy values that are returned
        if modelType == 'CLASSIFICATION':
            # Compute the overall accuracy of the classification
            errorMatrix = classifiedValidationData.errorMatrix(classProperty,outputtedPropName,categoricalLevels)
            overallAccuracy = ee.Number(errorMatrix.accuracy())
            return foldFeature.set(accuracyMetricString,overallAccuracy)
        else:
            # Compute the R^2 of the regression
            r2ToSet = coefficientOfDetermination(classifiedValidationData,classProperty,outputtedPropName)
            return foldFeature.set(accuracyMetricString,r2ToSet)
    
    # Compute the accuracy values of the classifier across all folds
    accuracyFC = kFoldAssignmentFC.map(computeAccuracyForFold)
    meanAccuracy = accuracyFC.aggregate_mean(accuracyMetricString)
    tsdAccuracy = accuracyFC.aggregate_total_sd(accuracyMetricString)
    
    # Compute the feature to return
    featureToReturn = featureWithClassifier.select(['cName']).set('Mean_'+accuracyMetricString,meanAccuracy,'StDev_'+accuracyMetricString,tsdAccuracy)
    return featureToReturn


## Export the classified features 

In [22]:
# !! Export the accuracy FC's individually for memory purposes
for featureWithClassifier in classifierList:
    accuracyFC = ee.FeatureCollection(ee.Feature(computeCVAccuracy(featureWithClassifier)))
    classifierName = str(featureWithClassifier.get('cName').getInfo())
    finalClassifierFCExport = ee.batch.Export.table.toAsset(
        collection=accuracyFC,
        description=classifierName,
        assetId='users/'+usernameFolderString+'/'+projectFolder+'/'+cvCollFolder+'/'+classifierName
    );
    finalClassifierFCExport.start()
    print('users/'+usernameFolderString+'/'+projectFolder+'/'+cvCollFolder+'/'+classifierName+' started!')
print('All CV jobs queued; moving on...')


users/acottam/ETH_Global_Forest_Cover/CV_Colls_RegTest/rf_VP2 started!
users/acottam/ETH_Global_Forest_Cover/CV_Colls_RegTest/rf_VP3 started!
users/acottam/ETH_Global_Forest_Cover/CV_Colls_RegTest/rf_VP4 started!
users/acottam/ETH_Global_Forest_Cover/CV_Colls_RegTest/rf_VP5 started!
users/acottam/ETH_Global_Forest_Cover/CV_Colls_RegTest/rf_VP6 started!
users/acottam/ETH_Global_Forest_Cover/CV_Colls_RegTest/rf_VP7 started!
users/acottam/ETH_Global_Forest_Cover/CV_Colls_RegTest/rf_VP8 started!
users/acottam/ETH_Global_Forest_Cover/CV_Colls_RegTest/rf_VP9 started!
users/acottam/ETH_Global_Forest_Cover/CV_Colls_RegTest/rf_VP10 started!
All CV jobs queued; moving on...


In [23]:
# !! Break and wait
while any(x in str(ee.batch.Task.list()) for x in ['RUNNING','READY']):
    print('You have jobs running! ',datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    time.sleep(normalWaitTime)
print('Moving on...')


You have jobs running!  2020-04-05 18:38:32
You have jobs running!  2020-04-05 18:38:37
You have jobs running!  2020-04-05 18:38:43
You have jobs running!  2020-04-05 18:38:48
You have jobs running!  2020-04-05 18:38:54
You have jobs running!  2020-04-05 18:39:00
You have jobs running!  2020-04-05 18:39:05
You have jobs running!  2020-04-05 18:39:10
You have jobs running!  2020-04-05 18:39:16
You have jobs running!  2020-04-05 18:39:21
You have jobs running!  2020-04-05 18:39:27
You have jobs running!  2020-04-05 18:39:32
You have jobs running!  2020-04-05 18:39:38
You have jobs running!  2020-04-05 18:39:43
You have jobs running!  2020-04-05 18:39:49
You have jobs running!  2020-04-05 18:39:54
You have jobs running!  2020-04-05 18:40:00
You have jobs running!  2020-04-05 18:40:05
You have jobs running!  2020-04-05 18:40:11
You have jobs running!  2020-04-05 18:40:16
You have jobs running!  2020-04-05 18:40:21
You have jobs running!  2020-04-05 18:40:27
You have jobs running!  2020-04-

## Combine the features into a feature collection and export as an asset

In [24]:
# Create/export a feature collection specifically to hold all of the accuracy values
cvAccuracyFC = []
for featureWithClassifier in classifierList:
    cvAccuracyFC.append(ee.Feature(ee.FeatureCollection('users/'+usernameFolderString+'/'+projectFolder+'/'+cvCollFolder+'/'+str(featureWithClassifier.get('cName').getInfo())).first()))
cvAccuracyFC = ee.FeatureCollection(cvAccuracyFC)

cvAccuracyFCExport = ee.batch.Export.table.toAsset(
    collection=cvAccuracyFC,
    description=cvAccuracyFCNameString,
    assetId='users/'+usernameFolderString+'/'+projectFolder+'/'+cvCollFolder+'/'+cvAccuracyFCNameString
);
cvAccuracyFCExport.start()


In [25]:
# !! Break and wait
while any(x in str(ee.batch.Task.list()) for x in ['RUNNING','READY']):
    print('You have jobs running! ',datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    time.sleep(normalWaitTime)
print('Moving on...')


You have jobs running!  2020-04-05 18:46:19
You have jobs running!  2020-04-05 18:46:24
You have jobs running!  2020-04-05 18:46:30
You have jobs running!  2020-04-05 18:46:35
Moving on...


In [26]:
# Print the full set of accuracy values within the feature collection
print(ee.FeatureCollection('users/'+usernameFolderString+'/'+projectFolder+'/'+cvCollFolder+'/'+cvAccuracyFCNameString).sort('Mean_'+accuracyMetricString,False).getInfo())


{'type': 'FeatureCollection', 'columns': {'Mean_R2': 'Float', 'StDev_R2': 'Float', 'cName': 'String', 'system:index': 'String'}, 'version': 1586112398304843, 'id': 'users/acottam/ETH_Global_Forest_Cover/CV_Colls_RegTest/CV_Accuracy_FC_RegTest', 'properties': {'system:asset_size': 9055}, 'features': [{'type': 'Feature', 'geometry': {'type': 'Point', 'coordinates': [0, 0]}, 'id': '00000000000000000003', 'properties': {'Mean_R2': 0.5718027734974569, 'StDev_R2': 0.003659647133458033, 'cName': 'rf_VP5'}}, {'type': 'Feature', 'geometry': {'type': 'Point', 'coordinates': [0, 0]}, 'id': '00000000000000000001', 'properties': {'Mean_R2': 0.5713517965109007, 'StDev_R2': 0.0031138620284438764, 'cName': 'rf_VP3'}}, {'type': 'Feature', 'geometry': {'type': 'Point', 'coordinates': [0, 0]}, 'id': '00000000000000000000', 'properties': {'Mean_R2': 0.570778092745803, 'StDev_R2': 0.00402692397342983, 'cName': 'rf_VP2'}}, {'type': 'Feature', 'geometry': {'type': 'Point', 'coordinates': [0, 0]}, 'id': '0000

In [27]:
# Print the info on the best model
mostAccurateModelFeature = ee.Feature(ee.FeatureCollection('users/'+usernameFolderString+'/'+projectFolder+'/'+cvCollFolder+'/'+cvAccuracyFCNameString).sort('Mean_'+accuracyMetricString,False).sort('Mean_'+accuracyMetricString,False).first())
bestModelName = mostAccurateModelFeature.get('cName').getInfo()
bestModelMeanAccuracy = mostAccurateModelFeature.get('Mean_'+accuracyMetricString).getInfo()
bestModelAccuracyStDev = mostAccurateModelFeature.get('StDev_'+accuracyMetricString).getInfo()
print('Best Model','\n',bestModelName)
print('\n')
print('Model Cross Validated Mean '+accuracyMetricString,'\n',bestModelMeanAccuracy)
print('\n')
print('Model Cross Validated Standard Deviation '+accuracyMetricString,'\n',bestModelAccuracyStDev)


Best Model 
 rf_VP5


Model Cross Validated Mean R2 
 0.5718027734974569


Model Cross Validated Standard Deviation R2 
 0.003659647133458033


# Bootstrapping collection creation

In [28]:
# Check the number of points within each biome
preppedCollection['Resolve_Biome'].value_counts()
# results = preppedCollection['Resolve_Biome'].value_counts()
# df1 = results.to_frame()
# resolveBiomesDict = [[1, 'Tropical and subtropical moist broadleaf forests'],
# [2,  'Tropical and subtropical dry broadleaf forests'],
# [3, 'Tropical and subtropical coniferous forests'],
# [4, 'Temperate broadleaf and mixed forests'],
# [5, 'Temperate conifer forests'],
# [6, 'Boreal forests or taiga'],
# [7, 'Tropical and subtropical grasslands, savannas, and shrublands'],
# [8, 'Temperate grasslands, savannas, and shrublands'],
# [9, 'Flooded Grasslands and Savannas'],
# [10, 'Montane grasslands and shrublands'],
# [11, 'Tundra'],
# [12, 'Mediterranean forests, woodlands, and scrub'],
# [13, 'Deserts and xeric shrublands'],
# [14, 'Mangroves']]
# df2 = pd.DataFrame(resolveBiomesDict, columns=['id','name']).set_index('id')
# df1 = df1.join(df2)
# print(df1[['name','Resolve_Biome']])


6.0     25999
13.0    14443
11.0    14367
1.0      6233
7.0      4559
10.0     2599
5.0      1992
4.0      1473
12.0      489
8.0       184
9.0        79
2.0        75
14.0       29
3.0         3
Name: Resolve_Biome, dtype: int64

In [29]:
# Input the dictionary of values for each of the biomes
# This was computed using the following script
# https://code.earthengine.google.com/d98223f98f6f11073aa21b059bf667d6
# Missing assests in that script - not sure what it does
# These look like percentages!! Maybe its the area of biome as a percentage?
biomeDict = {
    1: 14.900835665820974,
    2: 2.941697660221864,
    3: 0.526059731441294,
    4: 9.56387696566245,
    5: 2.865354077500338,
    6: 11.519674266872787,
    7: 16.26999434439293,
    8: 8.047078485979089,
    9: 0.861212221078014,
    10: 3.623974712557433,
    11: 6.063922959332467,
    12: 2.5132866428302836,
    13: 20.037841544639985,
    14: 0.26519072167008,
}


In [30]:
print(dir(preppedCollection.groupby('Resolve_Biome', group_keys=False)))
# print(preppedCollection.groupby('Resolve_Biome', group_keys=False).head())

['CHELSA_Annual_Mean_Temperature', 'CHELSA_Annual_Precipitation', 'CHELSA_Mean_Temperature_of_Warmest_Quarter', 'CHELSA_Precipitation_Seasonality', 'CHELSA_Precipitation_of_Driest_Quarter', 'CV_Fold', 'EarthEnvTopoMed_Eastness', 'EarthEnvTopoMed_Elevation', 'EarthEnvTopoMed_Northness', 'Lat', 'Long', 'Resolve_Biome', 'SG_Depth_to_bedrock', 'SG_Sand_Content_000cm', 'SG_Sand_Content_005cm', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_accessors', '_add_numeric_operations', '_agg_examples_doc', '_agg_see_also_doc', '_aggregate', '_aggregate_frame', '_aggregate_item_by_item', '_aggregate_multiple_funcs', '_apply_filter', '_

In [39]:
# Perform an example stratified sample by biome
# AC This is pandas code - why not use GEE?
# print(dir(preppedCollection.groupby('Resolve_Biome', group_keys=False)))
# print(preppedCollection.groupby('Resolve_Biome', group_keys=False).first())
stratSample = preppedCollection.groupby('Resolve_Biome', group_keys=False).apply(lambda x: x.sample(n=int(round((biomeDict.get(x.name)/100)*bootstrapModelSize)), replace=True, random_state=1))
print(stratSample.columns)
print(stratSample.shape)


Index(['CHELSA_Annual_Mean_Temperature', 'CHELSA_Annual_Precipitation',
       'CHELSA_Mean_Temperature_of_Warmest_Quarter',
       'CHELSA_Precipitation_Seasonality',
       'CHELSA_Precipitation_of_Driest_Quarter', 'EarthEnvTopoMed_Eastness',
       'EarthEnvTopoMed_Elevation', 'EarthEnvTopoMed_Northness', 'Lat', 'Long',
       'Resolve_Biome', 'SG_Depth_to_bedrock', 'SG_Sand_Content_000cm',
       'SG_Sand_Content_005cm', 'shrubcover', 'treecover', 'CV_Fold'],
      dtype='object')
(0, 17)


In [32]:
%%time
# Run a for loop to create multiple bootstrap iterations and upload them to the Google Cloud Storage Bucket

# Create an empty list to store all of the file name strings being uploaded (for later use)
fileNameList = []

for n in seedsToUseForBootstrapping:
    # Perform the subsetting
    stratSample = preppedCollection.groupby('Resolve_Biome', group_keys=False).apply(lambda x: x.sample(n=int(round((biomeDict.get(x.name)/100)*bootstrapModelSize)), replace=True, random_state=n))
    
    # Format the title of the CSV and export it to a holding location
    titleOfBootstrapCSV = fileNameHeader+str(n).zfill(3)
    fileNameList.append(titleOfBootstrapCSV)
    fullLocalPath = holdingFolder+'/'+titleOfBootstrapCSV+'.csv'
    stratSample.to_csv(holdingFolder+'/'+titleOfBootstrapCSV+'.csv',index=False)
    
    # Format the bash call to upload the files to the Google Cloud Storage bucket
    gsutilBashUploadList = [bashFunctionGSUtil]+arglist_preGSUtilUploadFile+[fullLocalPath]+[formattedBucketOI]
    subprocess.run(gsutilBashUploadList)
    print(titleOfBootstrapCSV+' uploaded to a GCSB!')
    

ForestPotential2_BootstrapColl_RegTest_001 uploaded to a GCSB!
ForestPotential2_BootstrapColl_RegTest_002 uploaded to a GCSB!
ForestPotential2_BootstrapColl_RegTest_003 uploaded to a GCSB!
ForestPotential2_BootstrapColl_RegTest_004 uploaded to a GCSB!
ForestPotential2_BootstrapColl_RegTest_005 uploaded to a GCSB!
CPU times: user 157 ms, sys: 32 ms, total: 189 ms
Wall time: 6.62 s


In [33]:
# Wait for the GSUTIL uploading process to finish before moving on
while not all(x in subprocess.run([bashFunctionGSUtil,'ls',formattedBucketOI],stdout=subprocess.PIPE).stdout.decode('utf-8') for x in fileNameList):
    print('Not everything is uploaded...')
    time.sleep(5)
print('Everything is uploaded; moving on...')


Everything is uploaded; moving on...


In [34]:
# Create a folder to house the bootstrapped feature collection

# Turn the folder string into an assetID and perform the deletion
assetIDToCreate_Folder = 'users/'+usernameFolderString+'/'+projectFolder+'/'+bootstrapCollFolder
print(assetIDToCreate_Folder,'being created...')

# Create the image collection before classifying each of the bootstrap images
subprocess.run(bashCommandList_CreateFolder+[assetIDToCreate_Folder])
while any(x in subprocess.run(bashCommandList_Detect+[assetIDToCreate_Folder], stdout=subprocess.PIPE).stdout.decode('utf-8') for x in stringsOfInterest):
    print('Waiting for asset to be created...')
    time.sleep(normalWaitTime)
print('Asset created!')


# Sleep to allow the server time to receive incoming requests
time.sleep(normalWaitTime/2)


users/acottam/ETH_Global_Forest_Cover/Bootstrap_Collections_RegTest being created...
Asset created!


In [35]:
# Loop through the file names and upload each of them to Earth Engine
for f in fileNameList:
    assetIDForBootstrapColl = 'users/'+usernameFolderString+'/'+projectFolder+'/'+bootstrapCollFolder
    gsStorageFileLocation = formattedBucketOI
    earthEngineUploadTableCommands = [bashFunction_EarthEngine]+arglist_preEEUploadTable+[assetIDStringPrefix+assetIDForBootstrapColl+'/'+f]+[gsStorageFileLocation+'/'+f+'.csv']+arglist_postEEUploadTable
    subprocess.run(earthEngineUploadTableCommands)
    print(f+' EarthEngine Ingestion started!')
print('All files are being ingested.')


ForestPotential2_BootstrapColl_RegTest_001 EarthEngine Ingestion started!
ForestPotential2_BootstrapColl_RegTest_002 EarthEngine Ingestion started!
ForestPotential2_BootstrapColl_RegTest_003 EarthEngine Ingestion started!
ForestPotential2_BootstrapColl_RegTest_004 EarthEngine Ingestion started!
ForestPotential2_BootstrapColl_RegTest_005 EarthEngine Ingestion started!
All files are being ingested.


In [36]:
# !! Break and wait
while any(x in str(ee.batch.Task.list()) for x in ['RUNNING','READY']):
    print('You have jobs running! ',datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    time.sleep(normalWaitTime)
print('Moving on...')


You have jobs running!  2020-04-05 18:47:30
You have jobs running!  2020-04-05 18:47:35
You have jobs running!  2020-04-05 18:47:41
You have jobs running!  2020-04-05 18:47:47
Moving on...


## Using the best model, load then train all of the bootstrapped collections and make maps with each of them within an assigned image collection

In [37]:
# Create an image collection to house the outputted bootstrapped images

# Turn the folder string into an assetID and perform the creation
assetIDToCreate_Collection = 'users/'+usernameFolderString+'/'+projectFolder+'/'+bootstrapImageColl
print(assetIDToCreate_Collection,'being created...')

# Create the image collection before classifying each of the bootstrap images
subprocess.run(bashCommandList_CreateCollection+[assetIDToCreate_Collection])
while any(x in subprocess.run(bashCommandList_Detect+[assetIDToCreate_Collection],stdout=subprocess.PIPE).stdout.decode('utf-8') for x in stringsOfInterest):
    print('Waiting for asset to be created...')
    time.sleep(normalWaitTime)
print('Asset created!')


# Sleep to allow the server time to receive incoming requests
time.sleep(normalWaitTime/2)


users/acottam/ETH_Global_Forest_Cover/Bootstrap_Images_RegTest being created...
Asset created!


In [38]:
# Load the best model from the classifier list
classifierToBootstrap = ee.Classifier(ee.Feature(ee.FeatureCollection(classifierList).filterMetadata('cName','equals',bestModelName).first()).get('c'))

# Run a for loop to create multiple bootstrap iterations
for n in seedsToUseForBootstrapping:
    
    # Format the title of the CSV and export it to a holding location
    titleOfColl = fileNameHeader+str(n).zfill(3)
    collectionPath = 'users/'+usernameFolderString+'/'+projectFolder+'/'+bootstrapCollFolder+'/'+titleOfColl
    
    # Load the collection from the path
    fcToTrain = ee.FeatureCollection(collectionPath)
    
    # Train the classifier with the collection
    trainedClassifer = classifierToBootstrap.train(fcToTrain,classProperty,covariateList)
    
    # Classify the image
    classifiedImage = compositeToClassify.classify(trainedClassifer,classProperty+'_Predicted')
    
    # Queue the export
    # !! The current pyramiding policy is set to mode as the current map is categorical;
    # !! Make sure to change this argument when dealing with continuous datasets
    bootstrapImageExport = ee.batch.Export.image.toAsset(
        image=classifiedImage,
        description=titleOfColl,
        assetId=assetIDToCreate_Collection+'/'+titleOfColl,
        crs='EPSG:4326',
        crsTransform='[0.008333333333333333,0,-180,0,-0.008333333333333333,90]',
        region=exportingGeometry.getInfo()['coordinates'],
        maxPixels=int(1e13),
        pyramidingPolicy={".default": pyramidingPolicy}
    );
    bootstrapImageExport.start()
    print(titleOfColl+' queued!')

EEException: Collection.loadTable: Collection asset 'users/acottam/ETH_Global_Forest_Cover/Bootstrap_Collections_RegTest/ForestPotential2_BootstrapColl_RegTest_001' not found.

In [None]:
# # !! Break and wait
# while any(x in str(ee.batch.Task.list()) for x in ['RUNNING','READY']):
#     print('You have jobs running! ',datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
#     time.sleep(longWaitTime)
# print('Moving on...')
