In [2]:
!curl -O https://storage.googleapis.com/gcp-public-data-landsat/index.csv.gz
!gunzip index.csv.gz
!mamba install -y -c conda-forge python-fmask
!pip install lcmap-pyccd
!mkdir tmp_scenedir

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  694M  100  694M    0     0  60.6M      0  0:00:11  0:00:11 --:--:-- 74.3M

                  __    __    __    __
                 /  \  /  \  /  \  /  \
                /    \/    \/    \/    \
███████████████/  /██/  /██/  /██/  /████████████████████████
              /  / \   / \   / \   / \  \____
             /  /   \_/   \_/   \_/   \    o \__,
            / _/                       \_____/  `
            |/
        ███╗   ███╗ █████╗ ███╗   ███╗██████╗  █████╗
        ████╗ ████║██╔══██╗████╗ ████║██╔══██╗██╔══██╗
        ██╔████╔██║███████║██╔████╔██║██████╔╝███████║
        ██║╚██╔╝██║██╔══██║██║╚██╔╝██║██╔══██╗██╔══██║
        ██║ ╚═╝ ██║██║  ██║██║ ╚═╝ ██║██████╔╝██║  ██║
        ╚═╝     ╚═╝╚═╝  ╚═╝╚═╝     ╚═╝╚═════╝ ╚═╝  ╚═╝

        mamba (0.6.3) supported by @QuantStack

        GitHub:  https://github.com/mamba

In [3]:
import pandas as pd
import os
import glob
import subprocess
from rasterio.io import MemoryFile
from rio_cogeo.cogeo import cog_translate
from rio_cogeo.profiles import cog_profiles
import tempfile

import fsspec
# from dask.distributed import Client
# import dask

In [4]:
# Load the Landsat scene index (index.csv, fetched and decompressed by the
# setup cell) into memory, then preview the first rows to check the columns.
df = pd.read_csv('index.csv')
df.head()

Unnamed: 0,SCENE_ID,PRODUCT_ID,SPACECRAFT_ID,SENSOR_ID,DATE_ACQUIRED,COLLECTION_NUMBER,COLLECTION_CATEGORY,SENSING_TIME,DATA_TYPE,WRS_PATH,WRS_ROW,CLOUD_COVER,NORTH_LAT,SOUTH_LAT,WEST_LON,EAST_LON,TOTAL_SIZE,BASE_URL
0,LT50250341988273XXX03,LT05_L1TP_025034_19880929_20161003_01_T2,LANDSAT_5,TM,1988-09-29,1,T2,1988-09-29T16:19:16.2370440Z,L1TP,25,34,91.0,38.44254,36.51232,-94.10922,-91.40392,110461152,gs://gcp-public-data-landsat/LT05/01/025/034/L...
1,LT50570111995180PAC00,LT05_L1GS_057011_19950629_20170107_01_T2,LANDSAT_5,TM,1995-06-29,1,T2,1995-06-29T19:02:49.1410380Z,L1GS,57,11,100.0,70.74043,68.4876,-127.33729,-120.27468,113044382,gs://gcp-public-data-landsat/LT05/01/057/011/L...
2,LC80030082018057LGN00,LC08_L1GT_003008_20180226_20180308_01_T2,LANDSAT_8,OLI_TIRS,2018-02-26,1,T2,2018-02-26T14:22:03.7781520Z,L1GT,3,8,100.0,74.63525,72.21839,-38.16706,-29.14978,652516143,gs://gcp-public-data-landsat/LC08/01/003/008/L...
3,LE71321082000008SGS00,LE07_L1GT_132108_20000108_20170215_01_T2,LANDSAT_7,ETM,2000-01-08,1,T2,2000-01-08T04:12:17.8195827Z,L1GT,132,108,27.0,-67.15247,-69.39869,63.88054,70.24175,186849087,gs://gcp-public-data-landsat/LE07/01/132/108/L...
4,LT51340461999059BKT00,LT05_L1TP_134046_19990228_20161219_01_T1,LANDSAT_5,TM,1999-02-28,1,T1,1999-02-28T03:46:17.5930500Z,L1TP,134,46,1.0,21.19646,19.28073,93.16942,95.40497,128934199,gs://gcp-public-data-landsat/LT05/01/134/046/L...


In [None]:
# Spatial search window in degrees. The filter cell compares x0/x1 against
# WEST_LON/EAST_LON and y0/y1 against SOUTH_LAT/NORTH_LAT, so the order is
# (west, south, east, north).
x0, y0, x1, y1 = -124.763068, 45.543541, -116.915989, 49.002494

# Temporal search window, in the same ISO-8601 "...Z" text form as SENSING_TIME.
time_start = '2003-02-20T00:00:00Z'
time_end = '2009-10-11T23:59:59Z'

In [None]:
# Select scenes sensed strictly inside the time window whose footprint lies
# entirely inside the bounding box (every comparison is exclusive).
# NOTE(review): SENSING_TIME is compared as loaded from the CSV (no datetime
# parsing), which relies on the ISO-8601 text sorting chronologically — confirm.
in_time_window = (df['SENSING_TIME'] > time_start) & (df['SENSING_TIME'] < time_end)
in_lat_range = (df['NORTH_LAT'] < y1) & (df['SOUTH_LAT'] > y0)
in_lon_range = (df['WEST_LON'] > x0) & (df['EAST_LON'] < x1)

df_wa = df[in_time_window & in_lat_range & in_lon_range]

In [None]:
# Filesystem object for the 'gs' protocol; the helpers below use it for every
# remote listing, download and upload.
fs = fsspec.filesystem('gs')

# Local scratch directories (relative to the notebook's working directory).
tmp_scenedir = 'tmp_scenedir'
tmp_fmask_dir = 'tmp_fmask'
tmp_pub_dir = 'tmp_pub'

# Create them up front; exist_ok makes the cell safe to re-run.
for _scratch_dir in (tmp_scenedir, tmp_fmask_dir, tmp_pub_dir):
    os.makedirs(_scratch_dir, exist_ok=True)


def download_scene(base_url, download_dir):
    """Download every object listed under `base_url` into `download_dir`.

    Parameters
    ----------
    base_url : str
        Remote prefix of the scene; it is passed to `fs.ls` to enumerate the
        objects to fetch.
    download_dir : str
        Local directory that receives the files. May be relative or absolute.

    Each object is saved under its own base name (the text after the last
    '/' of its remote path).
    """
    for remote_path in fs.ls(base_url):
        target = remote_path.split('/')[-1]
        print(f'downloading {target}')
        # os.path.join keeps an absolute `download_dir` absolute. Prefixing
        # './' (as before) turned e.g. '/tmp/abc' into the cwd-relative
        # './/tmp/abc', so files never reached the requested directory.
        fs.get_file(remote_path, os.path.join(download_dir, target))
        

def clean_tempdir(dirname=tmp_scenedir):
    """Delete the contents of `dirname`, leaving the directory itself in place.

    Parameters
    ----------
    dirname : str
        Directory to empty. Defaults to the module-level `tmp_scenedir`.

    Only entries matched by the glob pattern `{dirname}/*` are removed, so
    names starting with '.' are not touched. Files and symlinks are unlinked;
    real sub-directories are removed recursively (os.unlink alone raises on a
    directory, which previously aborted the cleanup).
    """
    import shutil  # local import so this block does not depend on the import cell

    for entry in glob.glob(f'{dirname}/*'):
        if os.path.isdir(entry) and not os.path.islink(entry):
            shutil.rmtree(entry)
        else:
            os.unlink(entry)


def run_fmask(base_url, scendir):
    """Run the `fmask_usgsLandsatStacked.py` command-line tool on one scene.

    Parameters
    ----------
    base_url : str
        Remote URL of the scene; its last '/'-separated component names the
        output file.
    scendir : str
        Local directory holding the scene files (passed as `--scenedir`).

    Returns
    -------
    str
        Path of the mask, `{tmp_fmask_dir}/<scene>_MASK.img`.

    Raises
    ------
    subprocess.CalledProcessError
        If the tool exits with a non-zero status (`check=True`).
    """
    scene_name = base_url.rsplit('/', 1)[-1]
    output_fname = f'{tmp_fmask_dir}/{scene_name}_MASK.img'
    print(output_fname)

    # TODO(review): open question for Jon — were the default fmask parameters
    # used here, or were they customized?
    command = [
        'fmask_usgsLandsatStacked.py', '--verbose',
        '--output', output_fname,
        '--tempdir', tmp_fmask_dir,
        '--scenedir', scendir,
    ]
    subprocess.run(command, check=True)

    return output_fname


def translate(fo, out_file):
    """Convert a raster source (`fo`) to a cloud optimized GeoTIFF on local disk.

    `fo` is handed to `cog_translate` unchanged as the source. The COG is
    built in a `MemoryFile` using the "deflate" profile from `cog_profiles`,
    and its bytes are then written to the local path `out_file`.
    """
    profile = cog_profiles.get("deflate")
    with MemoryFile() as memfile:
        # cog_translate takes its destination as a dataset path, so hand it
        # the in-memory file's name rather than the object itself.
        cog_translate(fo, memfile.name, profile, in_memory=True)
        print(f"writing cog to {out_file}")
        with open(out_file, "wb") as sink:
            sink.write(memfile.read())
    

def publish(base_url, input_fname):
    """Convert a local mask to a COG and upload it through `fs`.

    Parameters
    ----------
    base_url : str
        Remote URL of the scene; its last '/'-separated component names the
        uploaded object.
    input_fname : str
        Local path of the mask file to convert (e.g. the '.img' file
        returned by `run_fmask`).

    The COG is written next to `input_fname` with a '.TIF' extension,
    uploaded to
    `carbonplan-climatetrace/v1/landsat/cloudmasks/<scene>_MASK.TIF`, and the
    local COG is then deleted. `input_fname` itself is left in place.
    Exceptions from the conversion or the upload propagate to the caller.
    """
    target = base_url.split('/')[-1]
    # Swap only the extension: str.replace('.img', ...) would also rewrite
    # any '.img' that appears earlier in the path (e.g. in a directory name).
    output_fname = os.path.splitext(input_fname)[0] + '.TIF'
    cloud_uri = f'carbonplan-climatetrace/v1/landsat/cloudmasks/{target}_MASK.TIF'

    try:
        with open(input_fname, mode='rb') as f:
            translate(f, output_fname)

        print(f'putting file to: {cloud_uri}')
        fs.put_file(output_fname, cloud_uri)
    finally:
        # Remove the local COG even when conversion or upload fails, so a
        # failed scene does not leave a stray file behind.
        if os.path.exists(output_fname):
            os.unlink(output_fname)

def process_scene(base_url):
    """Download one scene, run fmask on it, and publish the resulting mask.

    Parameters
    ----------
    base_url : str
        Remote URL of the scene; forwarded to `download_scene`, `run_fmask`
        and `publish`.

    The scene is downloaded into a fresh temporary directory that is removed
    (together with everything in it) when processing ends, whether it
    succeeds or raises. Exceptions from any step propagate to the caller.
    """
    # TemporaryDirectory deletes the directory itself on exit. The previous
    # mkdtemp + clean_tempdir pair only unlinked the files inside it and left
    # one empty directory behind per scene.
    with tempfile.TemporaryDirectory() as scene_dir:
        print(scene_dir)
        try:
            download_scene(base_url, scene_dir)
            mask_fname = run_fmask(base_url, scene_dir)
            publish(base_url, mask_fname)
        finally:
            print('cleaning up')

In [None]:
# Process the selected scenes one at a time. The run stops at the first
# failure: the offending URL is printed and the exception is re-raised.
tasks = []
for base_url in df_wa['BASE_URL']:
    try:
        process_scene(base_url)
    except Exception:
        print('failed: ', base_url)
        raise