In [40]:
%matplotlib inline
import sys, os, time
import pandas as pd
import numpy as np
import json

from collections import defaultdict

import matplotlib
import matplotlib.pyplot as plt

# Five hex colours, in index order, wrapped in a discrete matplotlib colormap.
# NOTE(review): presumably one colour per high-resolution land-cover class
# index -- the mapping from index to class is not shown here; confirm.
highres_colors = [
    "#000000",
    "#0000FF",
    "#008000",
    "#80FF80",
    "#806060",
]
highres_cmap = matplotlib.colors.ListedColormap(highres_colors)

import fiona
import fiona.transform
import rasterio
import rasterio.mask
import shapely
import shapely.geometry

In [2]:
# Raster class codes that get_nlcd_stats() tallies, in output order.
NLCD_CLASSES = [
    0,
    11, 12,
    21, 22, 23, 24,
    31,
    41, 42, 43,
    51, 52,
    71, 72, 73, 74,
    81, 82,
    90, 95,
    255,
]
# Code -> position in NLCD_CLASSES; any code not listed falls back to index 0.
NLCD_CLASSES_TO_IDX = defaultdict(
    lambda: 0,
    zip(NLCD_CLASSES, range(len(NLCD_CLASSES))),
)
# All valid positions into NLCD_CLASSES.
NLCD_CLASS_IDX = range(len(NLCD_CLASSES))

In [3]:
def humansize(nbytes):
    """Format a byte count as a short human-readable string.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (PB) is reached, then rendered with at most two decimals; trailing
    zeros and a dangling decimal point are removed.

    Args:
        nbytes: Size in bytes (int or float).

    Returns:
        A string such as ``'1.5 KB'`` or ``'0 B'``.
    """
    units = ('B', 'KB', 'MB', 'GB', 'TB', 'PB')
    last_unit = len(units) - 1
    unit_idx = 0
    while nbytes >= 1024 and unit_idx < last_unit:
        nbytes = nbytes / 1024.
        unit_idx += 1
    number = '%.2f' % nbytes
    # "1.50" -> "1.5", "1.00" -> "1." -> "1"
    number = number.rstrip('0')
    number = number.rstrip('.')
    return '%s %s' % (number, units[unit_idx])

In [4]:
def get_nlcd_stats(data):
    """Count how many elements of ``data`` equal each code in ``NLCD_CLASSES``.

    Args:
        data: Array-like supporting element-wise ``==`` and ``.sum()``
            (e.g. a numpy array of class codes).

    Returns:
        A numpy array with one count per entry of ``NLCD_CLASSES``, in the
        same order.
    """
    return np.array([(data == code).sum() for code in NLCD_CLASSES])

In [5]:
def get_lc_stats(data):
    """Count how many elements of ``data`` equal each tracked land-cover label.

    The tracked labels are 1-6 and 15, in that order.

    Args:
        data: Array-like supporting element-wise ``==`` and ``.sum()``
            (e.g. a numpy array of labels).

    Returns:
        A numpy array of seven counts, one per tracked label.
    """
    labels = (1, 2, 3, 4, 5, 6, 15)
    return np.array([(data == label).sum() for label in labels])

In [6]:
def get_random_string(n):
    """Return ``n`` uppercase ASCII letters drawn with replacement.

    Uses numpy's global random state (``np.random.choice``), so results are
    reproducible after ``np.random.seed``.

    Args:
        n: Number of characters to generate.

    Returns:
        A string of length ``n`` made of the letters A-Z.
    """
    letters = [chr(code) for code in range(ord("A"), ord("Z") + 1)]
    picked = np.random.choice(letters, n, replace=True)
    return ''.join(picked)

In [7]:
def bounds_intersection(bound0, bound1):
    """Intersect two axis-aligned bounding boxes.

    Args:
        bound0: ``(left, bottom, right, top)`` of the first box.
        bound1: ``(left, bottom, right, top)`` of the second box.

    Returns:
        ``(left, bottom, right, top)`` built from the larger of the two
        left/bottom values and the smaller of the two right/top values.
        No check is made that the boxes actually overlap.
    """
    l0, b0, r0, t0 = bound0
    l1, b1, r1, t1 = bound1
    return (max(l0, l1), max(b0, b1), min(r0, r1), min(t0, t1))

In [8]:
# Lookup from the first CSV column to the second. make_dataset() indexes it
# with a new NAIP filename to obtain the corresponding old NAIP filename.
new_to_old_map = {}
# `with` guarantees the handle is closed even if a malformed row raises.
with open("data/2013_2014-to-2011_2012.csv", "r") as f:
    f.readline()  # discard the header row
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines (or a header-only file) previously crashed with
            # IndexError on parts[1]; skip them instead.
            continue
        parts = line.split(",")
        new_to_old_map[parts[0]] = parts[1]

In [12]:
# Dataset identifiers processed by the patch-sampling loop; each one is
# combined with "train"/"val" to locate "splits/<state>_<ds>.txt".
# The trailing comments record the tile count noted for each entry.
states = [
    "de_1m_2013", # 107 tiles
    "ny_1m_2013", # 407 tiles
    "md_1m_2013", # 691 tiles
    "pa_1m_2013", # 2239 tiles
    "wv_1m_2014", # 292 tiles
    "va_1m_2014"  # 1238 tiles
]

## Sample patches from the train and val files

In [23]:
# Sampling configuration. `samples_per_tile` and `sample_size` are also read
# as globals by make_dataset(); the other values only feed the estimates
# printed below.
num_tiles = 25
samples_per_tile = 500
sample_size = 240
num_channels = 6
num_bytes_per_channel = 1
average_tile_size = 7000

# Back-of-the-envelope numbers for the sampling plan.
patch_pixels = sample_size*sample_size
tile_pixels = average_tile_size*average_tile_size
patches_per_axis = average_tile_size/sample_size
total_samples = num_tiles * samples_per_tile

print("Number of samples", total_samples)
print(
    "Number of samples_per_tile that will give complete coverage",
    patches_per_axis * patches_per_axis,
)
print(
    "Expected fraction of each tile that will be sampled",
    (samples_per_tile * patch_pixels) / tile_pixels,
)
print(
    "Size of sampled data",
    humansize(total_samples * patch_pixels * num_channels * num_bytes_per_channel),
)

Number of samples 15000
Number of samples_per_tile that will give complete coverage 850.6944444444445
Expected fraction of each tile that will be sampled 0.5877551020408164
Size of sampled data 8.05 GB


In [47]:
def make_dataset(fns, state, output_dir):
    """Sample random square patches from aligned NAIP / land-cover / NLCD tiles.

    For each land-cover tile in ``fns`` the matching NAIP and NLCD rasters are
    derived from its filename (via ``new_to_old_map``), all three rasters are
    cropped to the intersection of their bounds, and ``samples_per_tile``
    windows of ``sample_size`` x ``sample_size`` pixels are drawn at random.
    Every window is stacked channel-wise (NAIP, land cover, NLCD) and written
    to ``output_dir`` as an ``.npy`` file with a leading batch axis.

    Reads the module-level globals ``samples_per_tile``, ``sample_size`` and
    ``new_to_old_map``; draws from numpy's global random state.

    Args:
        fns: Iterable of land-cover raster filenames.
        state: Identifier used as the prefix of every output filename.
        output_dir: Existing directory that receives the ``.npy`` patches.

    Returns:
        A tuple ``(patch_fns, patch_metadata, patch_shapes)`` of equal-length
        lists: the saved file paths; ``(naip_fn, x, y, lc_counts, nlcd_counts)``
        tuples where the counts are comma-joined strings; and the JSON-encoded
        EPSG:4326 footprint of each patch.

    Raises:
        KeyError: If a derived NAIP filename is missing from ``new_to_old_map``.
        AssertionError: If the three rasters do not share the same CRS.
        ValueError: If a cropped tile is smaller than ``sample_size``.
    """
    patch_fns = []
    patch_metadata = []
    patch_shapes = []

    for i, lc_fn in enumerate(fns):
        print(i, len(fns))

        # Derive sibling rasters from the land-cover filename.
        # NOTE(review): the [:-7] / [:-4] slices presumably drop a "_lc.tif"
        # and a ".mrf" suffix respectively -- confirm against the split files.
        new_naip_fn = lc_fn.replace("resampled-lc", "esri-naip")[:-7] + ".mrf"
        old_naip_fn = new_to_old_map[new_naip_fn]
        nlcd_fn = old_naip_fn.replace("esri-naip", "resampled-nlcd")[:-4] + "_nlcd.tif"

        # Context managers close all three datasets even if a read or an
        # assertion fails part-way through.
        with rasterio.open(new_naip_fn, "r") as naip_f, \
                rasterio.open(lc_fn, "r") as lc_f, \
                rasterio.open(nlcd_fn, "r") as nlcd_f:
            crs = naip_f.crs["init"]
            assert lc_f.crs["init"] == crs
            assert nlcd_f.crs["init"] == crs

            # Crop every raster to the area covered by all three.
            bounds = bounds_intersection(
                bounds_intersection(naip_f.bounds, lc_f.bounds),
                nlcd_f.bounds,
            )
            left, bottom, right, top = bounds
            geom = shapely.geometry.mapping(
                shapely.geometry.box(left, bottom, right, top, ccw=True)
            )

            naip_data, _ = rasterio.mask.mask(naip_f, [geom], crop=True)
            lc_data, _ = rasterio.mask.mask(lc_f, [geom], crop=True)
            nlcd_data, _ = rasterio.mask.mask(nlcd_f, [geom], crop=True)

        _, height, width = naip_data.shape
        if height < sample_size or width < sample_size:
            raise ValueError(
                "Tile %s is %dx%d after cropping, smaller than sample_size=%d"
                % (new_naip_fn, height, width, sample_size)
            )

        # Loop-invariant part of the output filename.
        tile_name = os.path.basename(new_naip_fn)[:-4]

        for j in range(samples_per_tile):

            # randint's upper bound is exclusive, so +1 makes the last valid
            # offset reachable and lets a tile of exactly sample_size work.
            y = np.random.randint(0, height - sample_size + 1)
            x = np.random.randint(0, width - sample_size + 1)

            merged = np.concatenate([
                naip_data[:, y:y+sample_size, x:x+sample_size],
                lc_data[:, y:y+sample_size, x:x+sample_size],
                nlcd_data[:, y:y+sample_size, x:x+sample_size],
            ])

            # NOTE(review): indices 4 and 5 assume a 4-band NAIP raster
            # followed by single-band land cover and NLCD -- confirm.
            lc_string = ','.join(map(str, get_lc_stats(merged[4,:,:])))
            nlcd_string = ','.join(map(str, get_nlcd_stats(merged[5:,:])))

            # NOTE(review): pixel offsets are added directly to CRS
            # coordinates, which assumes one CRS unit per pixel and that the
            # cropped array starts exactly at (left, top) -- confirm.
            t_left = left + x
            t_right = left + x + sample_size
            t_top = top - y
            t_bottom = top - y - sample_size
            t_geom = shapely.geometry.mapping(
                shapely.geometry.box(t_left, t_bottom, t_right, t_top, ccw=True)
            )
            t_geom = fiona.transform.transform_geom(crs, 'epsg:4326', t_geom)

            output_fn = "%s-%s-%d.npy" % (state, tile_name, j)
            output_path = os.path.join(output_dir, output_fn)

            # Saved with a leading axis of length 1.
            np.save(output_path, merged[np.newaxis].data)
            patch_fns.append(output_path)
            patch_metadata.append((
                new_naip_fn,
                x, y,
                lc_string,
                nlcd_string
            ))
            patch_shapes.append(json.dumps(t_geom))

    return patch_fns, patch_metadata, patch_shapes

In [48]:
# For every state and split: read the tile list, sample patches into the
# output directory, then write three index files next to the split file
# (per-patch metadata CSV, list of patch paths, list of patch footprints).
for state in states:
    for ds in ["train", "val"]:
        print(state, ds)
        output_dir = "/mnt/blobfuse/cnn-minibatches/cvpr_splits/%s_%s/" % (state, ds)
        os.makedirs(output_dir, exist_ok=True)

        # `with` closes each handle even when reading/writing raises.
        with open("splits/%s_%s.txt" % (state, ds), "r") as f:
            # Drop blank entries so an empty split file yields no tiles
            # instead of a single "" filename.
            fns = [fn for fn in f.read().strip().split("\n") if fn.strip()]

        patch_fns, patch_metadata, patch_shapes = make_dataset(fns, state, output_dir)

        # NOTE: the last two fields are themselves comma-joined count lists,
        # so rows have more comma-separated values than the six format slots.
        with open("splits/%s_%s_metadata.csv" % (state, ds), "w") as f:
            for patch_fn, metadata in zip(patch_fns, patch_metadata):
                f.write("%s,%s,%d,%d,%s,%s\n" % (
                    patch_fn,
                    *metadata
                ))

        with open("splits/%s_%s_patches.txt" % (state, ds), "w") as f:
            f.write("\n".join(patch_fns))

        with open("splits/%s_%s_shapes.txt" % (state, ds), "w") as f:
            f.write("\n".join(patch_shapes))

de_1m_2013 train
0 25
1 25
2 25
3 25
4 25
5 25
6 25
7 25
8 25
9 25
10 25
11 25
12 25
13 25
14 25
15 25
16 25
17 25
18 25
19 25
20 25
21 25
22 25
23 25
24 25
de_1m_2013 val
0 5
1 5
2 5
3 5
4 5
ny_1m_2013 train
0 25
1 25
2 25
3 25
4 25
5 25
6 25
7 25
8 25
9 25
10 25
11 25
12 25
13 25
14 25
15 25
16 25
17 25
18 25
19 25
20 25
21 25
22 25
23 25
24 25
ny_1m_2013 val
0 5
1 5
2 5
3 5
4 5
md_1m_2013 train
0 25
1 25
2 25
3 25
4 25
5 25
6 25
7 25
8 25
9 25
10 25
11 25
12 25
13 25
14 25
15 25
16 25
17 25
18 25
19 25
20 25
21 25
22 25
23 25
24 25
md_1m_2013 val
0 5
1 5
2 5
3 5
4 5
pa_1m_2013 train
0 25
1 25
2 25
3 25
4 25
5 25
6 25
7 25
8 25
9 25
10 25
11 25
12 25
13 25
14 25
15 25
16 25
17 25
18 25
19 25
20 25
21 25
22 25
23 25
24 25
pa_1m_2013 val
0 5
1 5
2 5
3 5
4 5
wv_1m_2014 train
0 25
1 25
2 25
3 25
4 25
5 25
6 25
7 25
8 25
9 25
10 25
11 25
12 25
13 25
14 25
15 25
16 25
17 25
18 25
19 25
20 25
21 25
22 25
23 25
24 25
wv_1m_2014 val
0 5
1 5
2 5
3 5
4 5
va_1m_2014 train
0 25
1 25
2 25
3 25
4 2