In [40]:
%matplotlib inline
import sys, os, time
import pandas as pd
import numpy as np
import json

from collections import defaultdict

import matplotlib
import matplotlib.pyplot as plt

# Five hex colours wrapped in a matplotlib ListedColormap.
# NOTE(review): presumably used to render the high-resolution land-cover labels,
# one colour per class index -- the class each entry stands for is not shown here; confirm.
highres_colors = [
    "#000000",  # black
    "#0000FF",  # blue
    "#008000",  # green
    "#80FF80",  # light green
    "#806060",  # muted grey-brown
]
highres_cmap = matplotlib.colors.ListedColormap(highres_colors)

import fiona
import fiona.transform
import rasterio
import rasterio.mask
import shapely
import shapely.geometry

In [2]:
# Raster class codes handled by this notebook, in the order used by get_nlcd_stats.
NLCD_CLASSES = [
    0,
    11, 12,
    21, 22, 23, 24,
    31,
    41, 42, 43,
    51, 52,
    71, 72, 73, 74,
    81, 82,
    90, 95,
    255,
]

# Position of every code inside NLCD_CLASSES (0 .. len - 1).
NLCD_CLASS_IDX = range(len(NLCD_CLASSES))

# code -> position lookup. Codes that are not listed fall back to position 0,
# i.e. they become indistinguishable from code 0.
NLCD_CLASSES_TO_IDX = defaultdict(lambda: 0)
NLCD_CLASSES_TO_IDX.update(zip(NLCD_CLASSES, NLCD_CLASS_IDX))

In [3]:
def humansize(nbytes):
    """Format a byte count as a short human-readable string.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (PB) is reached, then printed with at most two decimals; trailing
    zeros and a dangling decimal point are removed.

    Example: 1536 -> '1.5 KB'.
    """
    units = ('B', 'KB', 'MB', 'GB', 'TB', 'PB')
    last_unit = len(units) - 1

    unit_idx = 0
    while nbytes >= 1024 and unit_idx < last_unit:
        nbytes /= 1024.
        unit_idx += 1

    number = '%.2f' % nbytes
    number = number.rstrip('0')
    number = number.rstrip('.')
    return '%s %s' % (number, units[unit_idx])

In [4]:
def get_nlcd_stats(data):
    """Count how many entries of `data` equal each code in NLCD_CLASSES.

    Returns a 1-D array with one count per code, in NLCD_CLASSES order.
    Values of `data` that are not listed in NLCD_CLASSES are not counted.
    """
    return np.array([(data == code).sum() for code in NLCD_CLASSES])

In [5]:
def get_lc_stats(data):
    """Count how many entries of `data` equal each of the label values
    1, 2, 3, 4, 5, 6 and 15.

    Returns a length-7 array of counts in that order; any other value in
    `data` is ignored.
    """
    label_values = (1, 2, 3, 4, 5, 6, 15)
    return np.array([(data == label).sum() for label in label_values])

In [6]:
def get_random_string(n):
    """Return a string of `n` uppercase ASCII letters drawn uniformly, with
    replacement, from NumPy's global random state."""
    letters = [ch.upper() for ch in "abcdefghijklmnopqrstuvwxyz"]
    picks = np.random.choice(letters, n, replace=True)
    return ''.join(picks)

In [7]:
def bounds_intersection(bound0, bound1):
    """Intersect two (left, bottom, right, top) bounding boxes.

    Returns the overlap as a (left, bottom, right, top) tuple. No check is
    made that the boxes overlap: for disjoint inputs the result has
    left > right and/or bottom > top.
    """
    l0, b0, r0, t0 = bound0
    l1, b1, r1, t1 = bound1
    return (max(l0, l1), max(b0, b1), min(r0, r1), min(t0, t1))

In [8]:
# Lookup built from a CSV whose first line is a header: column 0 -> column 1.
# NOTE(review): judging by the file name this maps 2013/2014 tile names to
# their 2011/2012 counterparts -- confirm against the CSV.
new_to_old_map = {}
# `with` guarantees the handle is closed even if a malformed row raises.
with open("data/2013_2014-to-2011_2012.csv", "r") as f:
    f.readline()  # skip the header row
    for line in f:
        line = line.strip()
        if not line:
            # tolerate blank lines (and a header-only file) instead of
            # failing with IndexError on parts[1]
            continue
        parts = line.split(",")
        new_to_old_map[parts[0]] = parts[1]

In [12]:
# State/year identifiers that are sampled from below. The tile counts in the
# trailing comments match the per-state totals printed by the
# `state_year_fns` summary later in this notebook.
states = [
    "de_1m_2013", # 107 tiles
    "ny_1m_2013", # 407 tiles
    "md_1m_2013", # 691 tiles
    "pa_1m_2013", # 2239 tiles
    "wv_1m_2014", # 292 tiles
    "va_1m_2014"  # 1238 tiles
]

## Find data that we can sample from

In [2]:
# count of classes per tile
# Every row after the header is "<filename>,<int>,<int>,...". The result is
#   fns    -- 1-D array of filenames
#   counts -- 2-D integer array, one row per filename, one column per count
fns = []
counts = []
# `with` guarantees the handle is closed even if a malformed row raises.
with open("data/resampled-lc_counts.csv", "r") as f:
    header = f.readline().strip().split(",")
    for line in f:
        line = line.strip()
        if not line:
            # tolerate blank lines instead of failing in int('')
            continue
        parts = line.split(",")
        fns.append(parts[0])
        counts.append(np.array(list(map(int, parts[1:]))))
fns = np.array(fns)
counts = np.array(counts)

In [3]:
# These naip files have blacked out areas
# Rows of the CSV with num_zeros > 0 are selected and each NAIP filename is
# rewritten to the matching land-cover filename: "esri-naip" becomes
# "resampled-lc" and the last four characters are swapped for "_lc.tif".
naip_zero_df = pd.read_csv("data/naip_num_zeros.csv")
has_zero_pixels = naip_zero_df.num_zeros > 0
bad_fns = {
    naip_fn.replace("esri-naip", "resampled-lc")[:-4] + "_lc.tif"
    for naip_fn in naip_zero_df[has_zero_pixels].naip_fn.tolist()
}

In [7]:
# Keep the tiles whose last count column is not positive, then drop every
# tile listed in `bad_fns`.
mask = np.logical_not(counts[:, -1] > 0)
good_fns = [fn for fn in fns[mask] if fn not in bad_fns]

In [15]:
# Count rows for the usable tiles, both as a dict (filename -> row) and as a
# matrix whose rows follow the order of `good_fns`.
#
# The full table is indexed once up front; looking each filename up with
# `counts[fns == fn]` rescanned every filename per tile (quadratic in the
# number of tiles).
first_row_by_fn = {}
for fn, row in zip(fns, counts):
    # setdefault keeps the first row of a repeated filename, which is what
    # `counts[fns == fn][0]` selected
    first_row_by_fn.setdefault(fn, row)

good_fns = np.array(good_fns)
# .copy() so the stored rows do not alias the `counts` matrix
good_fns_counts_map = {fn: first_row_by_fn[fn].copy() for fn in good_fns}
good_fns_counts = np.array([good_fns_counts_map[fn] for fn in good_fns])

In [16]:
# Shape check: (number of usable tiles, number of count columns per tile).
good_fns_counts.shape

(4975, 7)

In [11]:
# Group the usable tiles by the 10th "/"-separated component of their path.
# NOTE(review): index 9 is tied to the directory depth of the stored paths;
# the summary printed below suggests that component is the state/year folder
# (e.g. "md_1m_2013") -- confirm if the data is ever relocated.
state_year_fns = defaultdict(list)
for tile_fn in good_fns:
    state_year = tile_fn.split("/")[9]
    state_year_fns[state_year].append(tile_fn)

In [12]:
# Number of usable tiles found for each state/year key.
for state_year, tile_fns in state_year_fns.items():
    print(state_year, len(tile_fns))

md_1m_2013 691
de_1m_2013 107
va_1m_2014 1238
md_1m_2015 1
wv_1m_2014 292
ny_1m_2013 407
pa_1m_2013 2239


In [28]:
# Reference class distribution per state: add up the count rows of every
# usable tile in the state, normalise by the grand total and drop the last
# entry, leaving six proportions per state.
state_year_expected_class_dist = {}
for state in states:
    # Deliberately not called `counts`: that name holds the per-tile count
    # matrix loaded from the CSV, and rebinding it here to a 7-vector breaks
    # re-running the cells that index it as a 2-D array.
    class_totals = np.zeros((7), dtype=int)
    for fn in state_year_fns[state]:
        class_totals += good_fns_counts_map[fn]
    class_probs = class_totals / class_totals.sum()
    state_year_expected_class_dist[state] = class_probs[:-1]

## Sample 50 tiles per state to split into (train, val, test)

In [29]:
from scipy import stats

In [35]:
def do_test(fns, expected_dist):
    """Compare the pooled class proportions of a set of tiles with a reference.

    The count rows stored in the global `good_fns_counts_map` are summed over
    `fns`, normalised by their total, and the last entry is dropped. The
    remaining proportions are passed to `scipy.stats.ks_2samp` together with
    `expected_dist`.

    Returns the `ks_2samp` result, which callers unpack as
    (statistic, p-value).

    NOTE(review): `ks_2samp` treats both arguments as samples of
    observations, so the class proportions themselves act as the
    "observations" here -- confirm this is the intended comparison.
    """
    totals = np.zeros((7), dtype=int)
    for tile_fn in fns:
        totals += good_fns_counts_map[tile_fn]
    observed = (totals / totals.sum())[:-1]
    return stats.ks_2samp(observed, expected_dist)

In [51]:
# Draw `num_total` tiles per state, split them into train / test / val, print
# the p-values returned by do_test for every split against the state-wide
# class distribution, and write each split to splits/<state>_<split>.txt
# (one filename per line).
num_total = 50
num_train, num_test = 25, 20
num_val = num_total - num_train - num_test

# Seeded generator local to this cell, so a fresh Restart & Run All writes
# the same split files again and the global NumPy random state is untouched.
SPLIT_SEED = 0
split_rng = np.random.RandomState(SPLIT_SEED)

# The split files are written below; make sure their directory exists.
os.makedirs("splits", exist_ok=True)

state_year_splits = {}
for state in states:

    # Deliberately not called `fns`: that name holds the array of all
    # filenames loaded from the counts CSV, and earlier cells index it.
    state_fns = state_year_fns[state]
    # Raises ValueError when a state has fewer than `num_total` usable tiles.
    all_fns = split_rng.choice(state_fns, size=num_total, replace=False)
    split_rng.shuffle(all_fns)

    train_end = num_train
    test_end = num_train + num_test
    train_fns = all_fns[:train_end]
    test_fns = all_fns[train_end:test_end]
    val_fns = all_fns[test_end:]
    print(len(train_fns), len(test_fns), len(val_fns))

    expected_dist = state_year_expected_class_dist[state]
    statistic, p1 = do_test(train_fns, expected_dist)
    statistic, p2 = do_test(test_fns, expected_dist)
    statistic, p3 = do_test(val_fns, expected_dist)
    print(state, p1, p2, p3)

    state_year_splits[state] = (train_fns, test_fns, val_fns)

    named_splits = (("train", train_fns), ("test", test_fns), ("val", val_fns))
    for split_name, split_fns in named_splits:
        # `with` closes the file even if the write fails
        with open("splits/%s_%s.txt" % (state, split_name), "w") as f:
            f.write("\n".join(split_fns))

25 20 5
de_1m_2013 0.9999565148992584 0.9999565148992584 0.8095573106166531
25 20 5
ny_1m_2013 0.8095573106166531 0.8095573106166531 0.8095573106166531
25 20 5
md_1m_2013 0.9999565148992584 0.9999565148992584 0.9999565148992584
25 20 5
pa_1m_2013 0.8095573106166531 0.8095573106166531 0.9999565148992584
25 20 5
wv_1m_2014 0.8095573106166531 0.8095573106166531 0.8095573106166531
25 20 5
va_1m_2014 0.8095573106166531 0.9999565148992584 0.8095573106166531
