In [None]:
import sys
sys.path.append("..")
import csv
import shutil
from pathlib import Path

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import rasterio

import utils.data as dt

This notebook takes three parameters:

* Which directory contains all the scenes that we want to extract preprocessing-relevant summaries and masks for?
* Where do we want to save our results?
* What is the full path to the labels from which we will construct our masks?

For the first question, we will use one of the following directories (on the Azure machine):

* Bing Recent: `/datadrive/glaciers/bing_glaciers/bing_glacial_lakes`
* Landsat 2015: `/datadrive/snake/lakes/le7-2015/{split}`
* Landsat all: `/datadrive/snake/lakes/imagery`

Technically we only need Landsat all, but Landsat 2015 is convenient because it is the only data that we can train on (the rest would be used purely for inference).

Landsat:
```
papermill -p in_dir /datadrive/snake/lakes/le7-2015/splits/train -p out_dir /datadrive/snake/lakes/le7-2015/splits/train/processed preprocess.ipynb -
papermill -p in_dir /datadrive/snake/lakes/le7-2015/splits/val -p out_dir /datadrive/snake/lakes/le7-2015/splits/val/processed preprocess.ipynb -
papermill -p in_dir /datadrive/snake/lakes/le7-2015/splits/test -p out_dir /datadrive/snake/lakes/le7-2015/splits/test/processed preprocess.ipynb -
```

Bing:
```
papermill -p in_dir /datadrive/glaciers/bing_glaciers/bing_glacial_lakes/splits/train -p out_dir /datadrive/glaciers/bing_glaciers/bing_glacial_lakes/splits/train/processed preprocess.ipynb -
```

In [None]:
# Default parameter values; `in_dir` can be overridden from the command line
# with papermill's `-p in_dir ...`.
# Directory whose `*.tif` files are the scenes to preprocess.
in_dir = "/datadrive/snake/lakes/le7-2015/splits/train"
# Vector labels (read with geopandas) from which the masks are built.
label_path = "/datadrive/snake/lakes/GL_3basins_2015.shp"

In [None]:
# Resolve the two input locations once so that later cells can share them.
paths = {"in": Path(in_dir), "label": Path(label_path)}

# Start from a clean slate by dropping the outputs of any previous run. Each
# directory is checked on its own, so a partially completed earlier run (only
# one of the two directories present) does not raise FileNotFoundError from
# rmtree or FileExistsError from mkdir.
# NOTE(review): `images` is where the final cell moves the processed scenes,
# so removing it discards those scenes -- confirm that this is intended.
for stale_name in ("images", "labels"):
    stale_dir = paths["in"] / stale_name
    if stale_dir.exists():
        shutil.rmtree(stale_dir)

# The label rasters written by the scene loop go here.
(paths["in"] / "labels").mkdir(parents=True)

Next, we read in the label data and set up the writer to which we will save summary statistics.

In [None]:
# Every GeoTIFF directly inside the input directory is treated as one scene.
scene_list = list(paths["in"].glob("*.tif"))
y = gpd.read_file(paths["label"])

# Header: the scene name, then mean_0..mean_10 followed by sdev_0..sdev_10.
fields = ["scene"] + [f"{s}_{i}" for s in ["mean", "sdev"] for i in range(11)]

# The file is opened in append mode, so only emit the header when the file is
# new or empty; otherwise every run would insert another header row in the
# middle of the data. `newline=""` lets the csv module control line endings.
stats_path = paths["in"] / "statistics.csv"
needs_header = not stats_path.exists() or stats_path.stat().st_size == 0
f = open(stats_path, "a", newline="")  # closed after the scene loop
writer = csv.writer(f)
if needs_header:
    writer.writerow(fields)

Finally, we can loop over all the scenes in `in_dir` and save the relevant statistics and masks.

In [None]:
# For each scene: compute the preprocessing result, write its third element as
# a label raster next to the scene, and append the first two elements
# (flattened) to the statistics CSV.
# NOTE(review): presumably result is (means, sdevs, mask) -- confirm against
# utils.data.preprocessor.
try:
    for scene in scene_list:
        # The context manager closes the raster dataset even if a step fails.
        with rasterio.open(scene) as img:
            result = dt.preprocessor(img, y)
            out_path = paths["in"] / f"labels/{scene.stem}-labels.tif"
            dt.save_raster(result[2], img.meta, img.transform, out_path)
            writer.writerow([str(scene.stem)] + list(np.hstack(result[:2])))
finally:
    # Always flush and release the statistics file, even on error.
    f.close()

In [None]:
# Move the processed scenes into an `images` subdirectory of the input
# directory. `exist_ok=True` keeps the cell from failing if the directory is
# already there.
images_dir = paths["in"] / "images"
images_dir.mkdir(parents=True, exist_ok=True)

# A plain loop instead of a list comprehension: the moves are side effects and
# the returned destination paths are not needed.
for s in scene_list:
    shutil.move(str(s), images_dir)

In [None]:
# Visual sanity check: show each 2-D slice of the third element of the result
# from the last processed scene. `plt` is imported in the first cell.
if not scene_list:
    # `result` is only defined once the scene loop has processed a scene.
    print("No scenes were processed; nothing to plot.")
else:
    for s in result[2]:
        fig, ax = plt.subplots()
        ax.imshow(s)
        plt.show()
        # Release the figure explicitly so many slices do not pile up in memory.
        plt.close(fig)