# Cloud COGs to Image Collection

In [46]:
import ee
import pandas as pd
from pathlib import Path
from datetime import datetime, timezone
import os
import re

from google.auth.transport.requests import AuthorizedSession

ee.Authenticate()  #  or !earthengine authenticate --auth_mode=gcloud
session = AuthorizedSession(ee.data.get_persistent_credentials())

## Cleanup existing imageCollection, if needed

imageCollections cannot be deleted until all images inside them are deleted. For a cloud-backed image collection with hundreds or thousands of tiles, this can take a while

bash earthengine CLI
```
for i in `earthengine ls projects/akveg-map/assets/reflectance_vhr/ortho_toa_images`; do earthengine rm $i; done
for i in `earthengine ls projects/akveg-map/assets/reflectance_vhr/ortho_toa_images_cloud`; do earthengine rm $i; done
for i in `earthengine ls projects/akveg-map/assets/reflectance_vhr/landsat_ccdc_sr`; do earthengine rm $i; done

# earthengine rm projects/akveg-map/assets/s2_2019_2023_gMedian_v20240311
```

## Create empty image collections as target
TODO Automate creation of empty image collection.

For now, manually create empty image collection with earthengine CLI

```
earthengine create collection projects/akveg-map/assets/reflectance_vhr/ortho_toa_images
earthengine create collection projects/akveg-map/assets/reflectance_vhr/ortho_toa_images_cloud
earthengine create collection projects/akveg-map/assets/reflectance_vhr/landsat_ccdc_sr
```

## Create list of cloud geotiffs in bucket and image collection

Create list of cogs in a bucket folder to load into an imageCollection.

Create list of cogs that have already been loaded into the imageCollection.

Run in bash in a conda env with gsutil and earthengine command line installed and authenticated.

TODO: Configure it to run directly in python

```
cd /data/gis/raster_base/Alaska/AKVegMap/EVWHS

gsutil ls gs://akveg-data/vhr/ortho_toa_images/*.tif > vhr_toa_cogs.txt
earthengine ls projects/akveg-map/assets/reflectance_vhr/ortho_toa_images > vhr_toa_cogs_inColl.txt

gsutil ls gs://akveg-data/vhr/ortho_toa_images_cloud/*.tif > vhr_cloud_cogs.txt
earthengine ls projects/akveg-map/assets/reflectance_vhr/ortho_toa_images_cloud > vhr_cloud_cogs_inColl.txt

gsutil ls gs://akveg-data/vhr/landsat_ccdc_sr/*.tif > landsat_ccdc_sr_cogs.txt
earthengine ls projects/akveg-map/assets/reflectance_vhr/landsat_ccdc_sr > landsat_ccdc_sr_cogs_inColl.txt

```

In [59]:
# Function to read a text file and return a list of paths
def read_paths_from_file(filepath):
    with open(filepath, 'r') as file:
        paths = file.read().splitlines()
    return paths

# Function to extract the last part of the path and remove the extension
def get_filename_without_extension(path):
    return os.path.splitext(os.path.basename(path))[0]


# Function to save a list of paths to a text file
def save_paths_to_file(paths, filepath):
    with open(filepath, 'w') as file:
        for path in paths:
            file.write(f"{path}\n")

def in_list_1_not_list_2(csv1_path, csv2_path):
    cogList = read_paths_from_file(csv1_path)
    cogListIC = read_paths_from_file(csv2_path)
    
    filenames1 = [get_filename_without_extension(path) for path in cogList]
    filenames2 = [get_filename_without_extension(path) for path in cogListIC]
    
    # Convert lists to pd Series
    filenames_series1 = pd.Series(filenames1, index=cogList)
    filenames_series2 = pd.Series(filenames2, index=cogListIC)
    
    # Find filenames in list1 that are not in list2
    unique_to_list1 = filenames_series1[~filenames_series1.isin(filenames2)]
    
    print("\nPaths in list1 that are not in list2 based on filenames:")
    print(unique_to_list1.index.tolist()[:5])
    print(len(unique_to_list1.index.tolist()))
    
    return unique_to_list1

In [60]:
# Apply the functions to for full list and already loaded list of vhr_toa_cogs
unique_to_list1 = in_list_1_not_list_2('/data/gis/raster_base/Alaska/AKVegMap/EVWHS/vhr_toa_cogs.txt',
                                       '/data/gis/raster_base/Alaska/AKVegMap/EVWHS/vhr_toa_cogs_inColl.txt')

save_paths_to_file(unique_to_list1.index.tolist(), '/data/gis/raster_base/Alaska/AKVegMap/EVWHS/vhr_toa_cogs_notInColl.txt')


Paths in list1 that are not in list2 based on filenames:
[]
0


In [61]:
##Now do it for cloudCogs
unique_to_list1 = in_list_1_not_list_2('/data/gis/raster_base/Alaska/AKVegMap/EVWHS/vhr_cloud_cogs.txt',
                                       '/data/gis/raster_base/Alaska/AKVegMap/EVWHS/vhr_cloud_cogs_inColl.txt')

save_paths_to_file(unique_to_list1.index.tolist(), '/data/gis/raster_base/Alaska/AKVegMap/EVWHS/vhr_cloud_cogs_notInColl.txt')


Paths in list1 that are not in list2 based on filenames:
[]
0


In [62]:
# Now do it for CCDC
unique_to_list1 = in_list_1_not_list_2('/data/gis/raster_base/Alaska/AKVegMap/EVWHS/landsat_ccdc_sr_cogs.txt',
                                       '/data/gis/raster_base/Alaska/AKVegMap/EVWHS/landsat_ccdc_sr_cogs_inColl.txt')

save_paths_to_file(unique_to_list1.index.tolist(), '/data/gis/raster_base/Alaska/AKVegMap/EVWHS/landsat_ccdc_sr_cogs_notInColl.txt')


Paths in list1 that are not in list2 based on filenames:
['gs://akveg-data/vhr/landsat_ccdc_sr/ccdc_20070621_214244.tif', 'gs://akveg-data/vhr/landsat_ccdc_sr/ccdc_20070811_215942.tif', 'gs://akveg-data/vhr/landsat_ccdc_sr/ccdc_20100823_213936.tif', 'gs://akveg-data/vhr/landsat_ccdc_sr/ccdc_20120822_212842.tif', 'gs://akveg-data/vhr/landsat_ccdc_sr/ccdc_20130618_214811.tif']
70


## Open list of geotiffs to ingest

In [63]:
toa_cogs = pd.read_csv('/data/gis/raster_base/Alaska/AKVegMap/EVWHS/vhr_toa_cogs_notInColl.txt', header=None,names=['tif'])
print(toa_cogs[0:2])
print(len(toa_cogs.index))

cloud_cogs = pd.read_csv('/data/gis/raster_base/Alaska/AKVegMap/EVWHS/vhr_cloud_cogs_notInColl.txt', header=None,names=['tif'])
print(cloud_cogs[0:2])
print(len(cloud_cogs.index))

landsat_ccdc_sr_cogs = pd.read_csv('/data/gis/raster_base/Alaska/AKVegMap/EVWHS/landsat_ccdc_sr_cogs_notInColl.txt', header=None,names=['tif'])
print(landsat_ccdc_sr_cogs[0:2])
print(len(landsat_ccdc_sr_cogs.index))


Empty DataFrame
Columns: [tif]
Index: []
0
Empty DataFrame
Columns: [tif]
Index: []
0
                                                 tif
0  gs://akveg-data/vhr/landsat_ccdc_sr/ccdc_20070...
1  gs://akveg-data/vhr/landsat_ccdc_sr/ccdc_20070...
70


## Setup parameters

In [64]:
import json
# from urllib.parse import urlparse
import os
from pprint import pprint

# Earth Engine enabled Cloud Project.
project_folder = 'akveg-map'
# collection = 'reflectance_vhr/ortho_toa_images'

## View list of cogs to ingest
Optional, can skip when list is long.

In [65]:
for cog in toa_cogs['tif']:
    print(cog)

## Function to load list of gcs cogs to GEE imageCollection
comment out pprint and most print except when troubleshooting

In [66]:
#TODO Fix date extraction from filename to work for different filename formats
def load_gcs_cogs_to_collection(cogs, project_folder, collection, yyyymmdd_part, hhmmss_part):
    # Request body as a dictionary.
    for cog in cogs['tif']:
      fileOnly = os.path.split(cog)[1]
      # print(fileOnly)

      cogName = fileOnly[:-4]
      print(cogName)
      
      # parts = fileOnly.split('_')
      # Split on both underscore and period
      parts = re.split(r'[_\.]', fileOnly)
      print(parts)
      
      yyyymmdd_hhmmss = f"{parts[yyyymmdd_part]}_{parts[hhmmss_part]}"
      # yyyymmdd_hhmmss = f"{parts[3]}_{parts[4]}"
      print(yyyymmdd_hhmmss)

      dt = datetime.strptime(yyyymmdd_hhmmss, "%Y%m%d_%H%M%S").replace(tzinfo=timezone.utc)
      dt = f"{dt:%Y-%m-%dT%H:%M:%SZ}"  # Format string for ISO 8601 + Z
      print(dt)
        
      request = {
        'type': 'IMAGE',
        'gcs_location': {
          'uris': cog
        },
        'properties': {
        #   'source': 'https://code.earthengine.google.com/067b10ee56537817756a3177a9138aee',
            'yyyymmdd_hhmmss': yyyymmdd_hhmmss
        },
        'startTime': dt,#'2023-01-01T00:00:00.000000000Z',
        'endTime': dt,#'2024-01-01T00:00:00.000000000Z',
      }

      # pprint(json.dumps(request))

      # A folder (or ImageCollection) name and the new asset name.
      asset_id = collection+'/'+cogName
      # print(project_folder)
      # print(asset_id)
        
      url = 'https://earthengine.googleapis.com/v1alpha/projects/{}/assets?assetId={}'
      print(url)

      response = session.post(
        url = url.format(project_folder, asset_id),
        data = json.dumps(request)
      )

      # pprint(json.loads(response.content))
    print('done')


## Run it

In [56]:
load_gcs_cogs_to_collection(toa_cogs, project_folder, 'reflectance_vhr/ortho_toa_images', 2, 3)


done


In [57]:
load_gcs_cogs_to_collection(cloud_cogs, project_folder, 'reflectance_vhr/ortho_toa_images_cloud', 3, 4)


done


In [67]:
load_gcs_cogs_to_collection(landsat_ccdc_sr_cogs, project_folder, 'reflectance_vhr/landsat_ccdc_sr', 1, 2)


ccdc_20070621_214244
['ccdc', '20070621', '214244', 'tif']
20070621_214244
2007-06-21T21:42:44Z
https://earthengine.googleapis.com/v1alpha/projects/{}/assets?assetId={}
ccdc_20070811_215942
['ccdc', '20070811', '215942', 'tif']
20070811_215942
2007-08-11T21:59:42Z
https://earthengine.googleapis.com/v1alpha/projects/{}/assets?assetId={}
ccdc_20100823_213936
['ccdc', '20100823', '213936', 'tif']
20100823_213936
2010-08-23T21:39:36Z
https://earthengine.googleapis.com/v1alpha/projects/{}/assets?assetId={}
ccdc_20120822_212842
['ccdc', '20120822', '212842', 'tif']
20120822_212842
2012-08-22T21:28:42Z
https://earthengine.googleapis.com/v1alpha/projects/{}/assets?assetId={}
ccdc_20130618_214811
['ccdc', '20130618', '214811', 'tif']
20130618_214811
2013-06-18T21:48:11Z
https://earthengine.googleapis.com/v1alpha/projects/{}/assets?assetId={}
ccdc_20130618_214818
['ccdc', '20130618', '214818', 'tif']
20130618_214818
2013-06-18T21:48:18Z
https://earthengine.googleapis.com/v1alpha/projects/{}/asse