In [None]:
# !curl -O https://storage.googleapis.com/gcp-public-data-landsat/index.csv.gz
# !gunzip index.csv.gz
# !mamba install -y -c conda-forge python-fmask
# !pip install lcmap-pyccd
# !mkdir tmp_scenedir

In [None]:
import pandas as pd
import os
import glob
import subprocess
from rasterio.io import MemoryFile
from rio_cogeo.cogeo import cog_translate
from rio_cogeo.profiles import cog_profiles
import tempfile

import fsspec

# from dask.distributed import Client
# import dask

In [None]:
# Load the scene index from the local "index.csv" file and preview the
# first rows as the cell output.
df = pd.read_csv(filepath_or_buffer="index.csv")
df.head()

In [None]:
# Bounding box corners: x values are compared against the *_LON columns and
# y values against the *_LAT columns when the index is filtered.
x0 = -124.763068
y0 = 45.543541
x1 = -116.915989
y1 = 49.002494

# Sensing-time window (exclusive on both ends when used in the filter).
time_start = "2003-02-20T00:00:00Z"
time_end = "2009-10-11T23:59:59Z"

In [None]:
# Select index rows whose sensing time lies strictly inside the time window
# and whose four extent columns lie strictly inside the bounding box.
# NOTE(review): SENSING_TIME is compared against string bounds, so this
# presumably relies on the column holding ISO-formatted strings -- confirm.
in_time_window = (time_start < df["SENSING_TIME"]) & (df["SENSING_TIME"] < time_end)
in_lat_range = (df["NORTH_LAT"] < y1) & (df["SOUTH_LAT"] > y0)
in_lon_range = (df["WEST_LON"] > x0) & (df["EAST_LON"] < x1)

df_wa = df[in_time_window & in_lat_range & in_lon_range]

In [None]:
# Filesystem object for the "gs" protocol; used below for listing,
# downloading and uploading.
fs = fsspec.get_filesystem_class("gs")()

# Local working directories (relative to the current working directory).
tmp_scenedir = "tmp_scenedir"
tmp_fmask_dir = "tmp_fmask"
tmp_pub_dir = "tmp_pub"

# Create each working directory if it does not exist yet.
for _workdir in (tmp_scenedir, tmp_fmask_dir, tmp_pub_dir):
    os.makedirs(_workdir, exist_ok=True)


def download_scene(base_url, download_dir):
    """Download every entry listed under `base_url` into `download_dir`.

    Args:
        base_url: Remote location passed to `fs.ls` to enumerate the scene's
            files.
        download_dir: Local directory that receives the files. May be
            relative or absolute.

    Each remote entry is saved under its final path component (the text
    after the last "/").
    """
    for remote_path in fs.ls(base_url):
        target = remote_path.rsplit("/", 1)[-1]
        print(f"downloading {target}")
        # Join instead of prefixing "./": with an absolute `download_dir`
        # (e.g. one returned by tempfile.mkdtemp) the old "./{dir}/{file}"
        # form resolved relative to the current working directory.
        fs.get_file(remote_path, os.path.join(download_dir, target))


def clean_tempdir(dirname=tmp_scenedir):
    """Unlink every path matched by the glob pattern ``{dirname}/*``.

    The directory itself is left in place. The default `dirname` is the
    value `tmp_scenedir` had when this function was defined.
    """
    for leftover in glob.glob(f"{dirname}/*"):
        os.unlink(leftover)


def run_fmask(base_url, scendir):
    """Run `fmask_usgsLandsatStacked.py` on a downloaded scene directory.

    Args:
        base_url: Scene URL; its last "/"-separated component names the
            output file.
        scendir: Local directory passed to the tool as `--scenedir`.

    Returns:
        Path of the mask file, ``{tmp_fmask_dir}/{scene}_MASK.img``.

    Raises:
        subprocess.CalledProcessError: if the tool exits with a non-zero
            status (`check=True`).
    """
    scene_name = base_url.rsplit("/", 1)[-1]
    output_fname = f"{tmp_fmask_dir}/{scene_name}_MASK.img"
    print(output_fname)

    # Question for Jon: did you use the default parameters here or did you customize a bit?
    command = ["fmask_usgsLandsatStacked.py", "--verbose"]
    command += ["--output", output_fname]
    command += ["--tempdir", tmp_fmask_dir]
    command += ["--scenedir", scendir]
    subprocess.run(command, check=True)

    return output_fname


def translate(fo, out_file):
    """Convert the raster in `fo` to a cloud optimized GeoTIFF at `out_file`.

    Args:
        fo: Source handed straight to `cog_translate` (the caller in this
            file passes a file object opened in binary mode).
        out_file: Local path that receives the COG bytes; it is opened with
            the built-in `open`, so the result lands on the local disk.

    The conversion uses the "deflate" COG profile and is staged in a
    rasterio `MemoryFile` before being copied to `out_file`.
    """
    profile = cog_profiles.get("deflate")
    with MemoryFile() as memfile:
        # The in-memory dataset's name is what cog_translate writes to.
        cog_translate(fo, memfile.name, profile, in_memory=True)
        print(f"writing cog to {out_file}")
        with open(out_file, "wb") as sink:
            sink.write(memfile.read())


def publish(base_url, input_fname):
    """Convert a mask file to a COG and upload it via `fs`.

    Args:
        base_url: Scene URL; its last "/"-separated component names the
            uploaded object.
        input_fname: Local path of the mask to convert (e.g. ``*.img``).

    The COG is written next to `input_fname` with a ``.TIF`` extension,
    uploaded to
    ``carbonplan-climatetrace/v1/landsat/cloudmasks/{scene}_MASK.TIF``,
    and the local ``.TIF`` is removed afterwards, also when conversion or
    upload raises. `input_fname` itself is left in place.
    """
    target = base_url.split("/")[-1]
    # Swap only the file extension; str.replace would also rewrite any
    # ".img" occurring elsewhere in the path (e.g. in a directory name).
    stem, _ = os.path.splitext(input_fname)
    output_fname = f"{stem}.TIF"
    cloud_uri = f"carbonplan-climatetrace/v1/landsat/cloudmasks/{target}_MASK.TIF"

    try:
        with open(input_fname, mode="rb") as f:
            translate(f, output_fname)

        print(f"putting file to: {cloud_uri}")
        fs.put_file(output_fname, cloud_uri)
    finally:
        # Remove the intermediate COG even on failure; it may not exist if
        # the conversion failed before writing anything.
        if os.path.exists(output_fname):
            os.unlink(output_fname)


def process_scene(base_url):
    """Download one scene, run fmask on it, and publish the resulting mask.

    Args:
        base_url: Scene URL handed to `download_scene`, `run_fmask` and
            `publish`.

    The scene is downloaded into a fresh temporary directory that is
    removed, together with its contents, when processing finishes or
    fails. Any exception from the three steps propagates to the caller.
    """
    # TemporaryDirectory deletes the directory itself on exit; unlinking
    # only the files inside a mkdtemp() directory left one empty directory
    # behind per processed scene.
    with tempfile.TemporaryDirectory() as scene_dir:
        print(scene_dir)
        try:
            download_scene(base_url, scene_dir)
            mask_fname = run_fmask(base_url, scene_dir)
            publish(base_url, mask_fname)
        finally:
            print("cleaning up")

In [None]:
tasks = []
# Process the selected scenes one at a time, stopping at the first failure.
for base_url in df_wa["BASE_URL"]:
    try:
        process_scene(base_url)
    except Exception:
        # Report which scene broke, then re-raise the active exception
        # unchanged so the original traceback is shown.
        print("failed: ", base_url)
        raise