
# Set up

In [2]:
import ipyparallel

# connect to the ipyparallel cluster
rc = ipyparallel.Client()
# view addressing every engine of the cluster
all_engines = rc[:]
# load-balanced view; used further down to farm out one task per shapefile
lbv = rc.load_balanced_view()

# number of engines in the view
print len(all_engines)

48


In [6]:
%%px --local

# numeric packages
import numpy as np
import pandas as pd

# filesystem and OS
import sys, os, time
import glob

# plotting
from matplotlib import pyplot as plt
import matplotlib
%matplotlib inline

import seaborn as sns
sns.set_style("whitegrid", {'axes.grid' : False})

# compression
import gzip
import cPickle as pickle
import copy

# geo stuff
import geopandas as gpd
from shapely.geometry import Point

# widgets and interaction
from ipywidgets import FloatProgress
from IPython.display import display, clear_output

import warnings
warnings.filterwarnings('ignore')

# these magics ensure that external modules that are modified are also automatically reloaded
# %load_ext autoreload
%reload_ext autoreload

In [7]:
%%px --local

# path to shapefiles

shapefiles_path = "/home/data/urban-atlas/shapefiles/"

# city name -> shapefile path, for every .shp found three directory levels
# below shapefiles_path; the city name is the file name without its first
# "_"-separated token and without the ".shp" suffix, tokens joined by spaces
shapefiles = dict(
    (" ".join(shp.split("/")[-1].split("_")[1:]).replace(".shp", ""), shp)
    for shp in glob.glob("%s/*/*/*.shp" % shapefiles_path)
)


In [8]:
%%px --local

# path to save data

outPath = "/home/data/urban-atlas/extracted-data"

# This cell is executed by every engine at once (%%px), so checking
# os.path.exists() before os.makedirs() can race: another process may create
# the directory between the two calls. Attempt the creation and only re-raise
# when the directory is still missing afterwards.
try:
    os.makedirs(outPath)
except OSError:
    if not os.path.isdir(outPath):
        raise

In [5]:
%%px --local

# land-use class names; the position in this list is the integer label
classes = [
    "Agricultural + Semi-natural areas + Wetlands",
    "Airports",
    "Construction sites",
    "Continuous Urban Fabric (S.L. > 80%)",
    "Discontinuous Dense Urban Fabric (S.L. : 50% -  80%)",
    "Discontinuous Low Density Urban Fabric (S.L. : 10% - 30%)",
    "Discontinuous Medium Density Urban Fabric (S.L. : 30% - 50%)",
    "Discontinuous Very Low Density Urban Fabric (S.L. < 10%)",
    "Fast transit roads and associated land",
    "Forests",
    "Green urban areas",
    "Industrial, commercial, public, military and private units",
    "Isolated Structures",
    "Land without current use",
    "Mineral extraction and dump sites",
    "Other roads and associated land",
    "Port areas",
    "Railways and associated land",
    "Sports and leisure facilities",
    "Water bodies",
]

# lookup tables in both directions: name -> label and label -> name
class2label = dict(zip(classes, range(len(classes))))
label2class = dict(enumerate(classes))

In [9]:
%%px --local

def load_shapefile(shapefile):
    """Read one Urban Atlas shapefile and return it in lon/lat coordinates.

    Parameters
    ----------
    shapefile : str
        Path to the ``.shp`` file. The city name used in log messages is the
        file name without its first "_"-separated token and without the
        ".shp" suffix (the same rule used for the keys of ``shapefiles``).

    Returns
    -------
    (gdf, prj)
        ``gdf``: GeoDataFrame with upper-cased column names (``geometry``
        excepted), ``SHAPE_AREA`` and ``SHAPE_LEN`` divided by 1e6 and 1e3
        (m^2 -> km^2 and m -> km, assuming the source CRS is metric), and
        geometries re-projected to WGS84 lon/lat.
        ``prj``: whatever ``satimg.read_prj`` returns for the ``.prj`` file
        next to the shapefile.
        ``(None, None)`` when the shapefile cannot be read.
    """
    # NOTE(review): `satimg` is not imported in any cell visible in this
    # notebook -- confirm it is in scope wherever this function runs.
    try:
        gdf = gpd.GeoDataFrame.from_file(shapefile)
    except Exception as err:
        # Exception (not a bare except) so that interrupts still propagate;
        # the cause is reported instead of being dropped
        print("--> %s: error reading file! (%s)" % (shapefile, err))
        return None, None

    # same naming rule as the `shapefiles` dict: keeps multi-word city names
    # whole and drops the file extension
    city = " ".join(shapefile.split("/")[-1].split("_")[1:]).replace(".shp", "")
    gdf.columns = [c.upper() if c != "geometry" else c for c in gdf.columns]

    # fall back to geometry-derived values; this has to happen before the
    # re-projection below, while coordinates are still in the source CRS
    if 'SHAPE_AREA' not in gdf.columns:
        gdf['SHAPE_AREA'] = gdf['geometry'].apply(lambda p: p.area)
    if 'SHAPE_LEN' not in gdf.columns:
        gdf['SHAPE_LEN'] = gdf['geometry'].apply(lambda p: p.length)

    # convert area & length to km
    gdf['SHAPE_AREA'] = gdf['SHAPE_AREA'] / 1.0e6  # convert to km^2
    gdf['SHAPE_LEN'] = gdf['SHAPE_LEN'] / 1.0e3    # convert to km

    # named so that it does not shadow the notebook-level `classes` list
    item_classes = gdf['ITEM'].unique()
    print("%s: %d polygons | %d land use classes" % (city, len(gdf), len(item_classes)))

    # read in projection file associated with shapefile
    prjfile = shapefile.replace(".shp", ".prj")
    prj = satimg.read_prj(prjfile)

    # change coordinate system from northing/easting to lonlat
    targetcrs = {u'ellps': u'WGS84', u'datum': u'WGS84', u'proj': u'longlat'}
    gdf.to_crs(crs=targetcrs, inplace=True)

    return gdf, prj

# Generate locations to extract imagery at

Our sampling strategy has the following goals:
* ensure that a uniform $100 \times 100$ "main grid" covering $25\,\mathrm{km} \times 25\,\mathrm{km}$ is completely sampled (except where there are no ground-truth polygons). We generate samples in this grid first, and label the image sampled in each grid cell with the class of the polygon that has the largest intersection area with that cell.
* ensure that the resulting dataset is balanced with respect to the land use classes. The trouble is that the classes are highly imbalanced among the polygons in the dataset (e.g., many more polygons are agricultural land and isolated structures than airports).
* sample additional polygons apart from the ones in the initial grid, such that only polygons above a certain threshold size are considered (so that we can ensure that the sampled images contain a large enough area of the class they represent). 
* to ensure a better match between labels and sampled images, sample more images from polygons with larger areas.

In [30]:
%%px --local

# footprint of one sampled image in km^2 at zoom level 17
# (presumably 224 px per side at ~1.19 m/px -- TODO confirm)
img_area = (224 * 1.19 / 1000) ** 2
# minimum fraction (0-1, not a percentage) of an image that a polygon of a
# given class should be able to cover
thresh_frac = 0.25
# polygons with a smaller area (km^2) than this are filtered out before sampling
thresh_area = img_area * thresh_frac

n_classes = len(classes)

N_SAMPLES_PER_CITY = 25000
# floor division keeps the per-class budget an integer even where "/" means
# true division (Python 3, or `from __future__ import division`)
N_SAMPLES_PER_CLASS = N_SAMPLES_PER_CITY // n_classes
MAX_SAMPLES_PER_POLY = 50

In [17]:
img_area

0.07105423360000002

In [51]:
# try the pipeline on a single city first
shapefile = shapefiles['madrid']

gdf, proj = load_shapefile(shapefile)

# keep only polygons at least as large as the area threshold
gdf_sel = gdf[gdf.SHAPE_AREA>=thresh_area]
# polygon counts before / after the area filter
print len(gdf), len(gdf_sel)

# remaining polygons per land-use class
gdf_sel.groupby("ITEM").apply(len)

madrid.shp: 79135 polygons | 19 land use classes
79135 20688


ITEM
Agricultural + Semi-natural areas + Wetlands                    5866
Airports                                                           8
Construction sites                                               648
Continuous Urban Fabric (S.L. > 80%)                             549
Discontinuous Dense Urban Fabric (S.L. : 50% -  80%)            1525
Discontinuous Low Density Urban Fabric (S.L. : 10% - 30%)       1498
Discontinuous Medium Density Urban Fabric (S.L. : 30% - 50%)    1547
Discontinuous Very Low Density Urban Fabric (S.L. < 10%)         563
Fast transit roads and associated land                            89
Forests                                                         2675
Green urban areas                                                643
Industrial, commercial, public, military and private units      3460
Isolated Structures                                               61
Mineral extraction and dump sites                                379
Other roads and associated la

In [52]:
%%px --local

def fn_select_polygons(df, n_samples=1000, max_samples=None, random_state=None):
    """Pick the polygons of one class to sample and how many samples each gets.

    Every polygon starts with a weight equal to its area divided by the
    smallest area in ``df`` (truncated to an integer). If the weights already
    add up to at most ``n_samples`` they are used as sample counts directly.
    Otherwise polygons are grouped into percentile bins of that weight
    (0-20-50-70-90-95-100) and a random 20%, 50%, 70%, 90%, 95% and 100% of
    the respective bins is kept, so larger polygons are more likely to
    survive; the kept weights are then rescaled to sum to about ``n_samples``
    (each count is rounded up, so the total can slightly exceed it).

    Parameters
    ----------
    df : DataFrame
        Polygons of a single class; must have a ``SHAPE_AREA`` column.
        It is not modified.
    n_samples : number
        Target number of samples for the whole class.
    max_samples : int or None
        Optional cap on the number of samples of any single polygon.
    random_state : int, numpy RandomState or None
        Seed / generator for the random sub-sampling. ``None`` keeps the
        previous behaviour of drawing from numpy's global random state.

    Returns
    -------
    DataFrame
        A copy of the selected rows of ``df`` with an integer ``samples``
        column added.
    """
    samples_per_poly = (df.SHAPE_AREA / float(df.SHAPE_AREA.min())).astype(int)

    if samples_per_poly.sum() > n_samples:
        # a single generator shared by all bins, so one seed fixes the whole draw
        rng = random_state
        if isinstance(random_state, (int, np.integer)):
            rng = np.random.RandomState(random_state)

        pvec = np.array([0.0, 0.2, 0.5, 0.7, 0.9, 0.95, 1])
        bins = np.percentile(samples_per_poly, pvec * 100)
        last_bin = len(bins) - 2

        kept = []
        for i in range(len(bins) - 1):
            lower, upper = bins[i], bins[i + 1]
            # bins are half-open, except the last one which includes its upper edge
            if i == last_bin:
                in_bin = (samples_per_poly >= lower) & (samples_per_poly <= upper)
            else:
                in_bin = (samples_per_poly >= lower) & (samples_per_poly < upper)
            members = samples_per_poly[in_bin]
            if len(members) == 0:
                continue
            kept.append(members.sample(frac=pvec[i + 1], random_state=rng))
        ret = pd.concat(kept)

        # rescale so that the kept weights add up to (roughly) n_samples
        ret_scaled = np.ceil(ret.astype(float) / ret.sum() * n_samples).astype(int)
        # .loc replaces the removed .ix indexer; copy so the caller's frame is untouched
        ret_df = df.loc[ret_scaled.index].copy()
        ret_df['samples'] = ret_scaled.values
    else:
        # copy instead of adding the column to the caller's frame
        ret_df = df.copy()
        ret_df['samples'] = samples_per_poly.values

    # clamp # samples per polygon if specified
    if max_samples is not None:
        ret_df['samples'] = ret_df['samples'].clip(upper=max_samples)
    ret_df['samples'] = ret_df['samples'].astype(int)
    return ret_df

In [53]:
# per land-use class, pick the polygons to sample and how many samples each gets
select_polygons = gdf_sel.groupby("ITEM")\
    .apply(lambda x: fn_select_polygons(x, n_samples=N_SAMPLES_PER_CLASS, max_samples=MAX_SAMPLES_PER_POLY))


In [54]:
# per land-use class: all polygons, polygons passing the area threshold, and
# the total number of samples allotted by fn_select_polygons
# (axis is passed by keyword; the positional form is not accepted by newer pandas)
pd.concat([gdf.groupby("ITEM").apply(len),
           gdf_sel.groupby("ITEM").apply(len),
           select_polygons.groupby("ITEM").apply(lambda x: x['samples'].sum())
          ], axis=1).rename(columns={0:"#polys", 1:"#polys>thres_area", 2:"#samples"})

Unnamed: 0,#polys,#polys>thres_area,#samples
Agricultural + Semi-natural areas + Wetlands,9302,5866.0,4659.0
Airports,11,8.0,125.0
Construction sites,2189,648.0,1400.0
Continuous Urban Fabric (S.L. > 80%),13470,549.0,682.0
Discontinuous Dense Urban Fabric (S.L. : 50% - 80%),12208,1525.0,1851.0
Discontinuous Low Density Urban Fabric (S.L. : 10% - 30%),5091,1498.0,1952.0
Discontinuous Medium Density Urban Fabric (S.L. : 30% - 50%),6227,1547.0,1833.0
Discontinuous Very Low Density Urban Fabric (S.L. < 10%),2547,563.0,842.0
Fast transit roads and associated land,98,89.0,984.0
Forests,3522,2675.0,2695.0


In [21]:
%%px --local

# given a list of polygons of the same type, generate locations for sampling images

def fn_sample_locations(df, sample_on_boundary=False):
    """Generate (lon, lat) sampling locations for polygons of one class.

    Parameters
    ----------
    df : DataFrame
        Needs a 'geometry' column (polygon objects) and a 'samples' column
        (number of locations wanted for each polygon).
    sample_on_boundary : bool
        False (default): every polygon contributes its centroid, and each
        polygon asking for more than one sample additionally gets
        ``samples - 1`` points from
        ``satimg.generate_locations_within_polygon``.
        True: for each polygon, ``min(number of exterior vertices, samples)``
        vertices of its exterior ring are drawn at random with replacement
        (numpy's global random state).

    Returns
    -------
    DataFrame
        Columns "lon" and "lat", one row per location; empty when ``df`` is.
    """
    # nothing to sample: return an empty, correctly shaped frame instead of
    # failing while building a two-column frame from a 1-D empty array
    if len(df) == 0:
        return pd.DataFrame(np.empty((0, 2)), columns=["lon", "lat"])

    polygons = df['geometry']
    nsamples = df['samples']

    if not sample_on_boundary:
        centroids = np.array([(p.centroid.coords.xy[0][0], p.centroid.coords.xy[1][0])
                              for p in polygons])
        wants_more = nsamples > 1
        if wants_more.sum() > 0:
            # the centroid accounts for one sample, hence m-1 extra points
            extra = [satimg.generate_locations_within_polygon(p, nSamples=m-1, strict=True)
                     for p, m in zip(polygons[wants_more], nsamples[wants_more])]
            extra = np.vstack(extra).squeeze()
            locs = np.vstack([extra, centroids])
        else:
            locs = centroids
    else:
        # materialise each vertex list: len() and indexing are needed below,
        # and zip() only returns a list on Python 2
        boundaries = [list(zip(p.exterior.coords.xy[0], p.exterior.coords.xy[1]))
                      for p in polygons]
        locs = np.array([b[l] for b, m in zip(boundaries, nsamples)
                         for l in np.random.choice(np.arange(0, len(b)), min([len(b), m]))])
    ret = pd.DataFrame(locs, columns=["lon", "lat"])
    return ret


In [55]:
# one group per land-use class; road and railway classes are sampled on the
# polygon boundary, every other class from centroids / polygon interiors
locations = select_polygons.groupby("ITEM")\
                .apply(lambda x: fn_sample_locations(x,
                        sample_on_boundary = ('road' in x['ITEM'].iloc[0].lower() or 'railway' in x['ITEM'].iloc[0].lower())
            ))

    
# saving is disabled here; NOTE(review): `city` is not defined in this cell
# locations.to_csv("%s/samples_%s.csv"%(outPath, city))

# (number of locations, number of coordinate columns)
print locations.shape

# number of sampled locations per class
locations.reset_index().groupby("ITEM").apply(len)

(26937, 2)


ITEM
Agricultural + Semi-natural areas + Wetlands                    4659
Airports                                                         125
Construction sites                                              1400
Continuous Urban Fabric (S.L. > 80%)                             682
Discontinuous Dense Urban Fabric (S.L. : 50% -  80%)            1851
Discontinuous Low Density Urban Fabric (S.L. : 10% - 30%)       1952
Discontinuous Medium Density Urban Fabric (S.L. : 30% - 50%)    1833
Discontinuous Very Low Density Urban Fabric (S.L. < 10%)         842
Fast transit roads and associated land                           984
Forests                                                         2695
Green urban areas                                               1241
Industrial, commercial, public, military and private units      3203
Isolated Structures                                               61
Mineral extraction and dump sites                               1249
Other roads and associated la

# Generate locations for all cities in the Urban Atlas dataset

It takes ~30 seconds per city, so this will take a while...

In [56]:
outPath

'/home/adalbert/data/urban-atlas/extracted-data'

In [69]:
def fn_generate_locations(shapefile, force=True):
    """Generate and save the additional sampling locations for one city.

    The city name is derived from the shapefile name (first "_"-separated
    token and ".shp" suffix removed) and the result is written to
    ``<outPath>/<city>/additional_sample_locations.csv``.

    Parameters
    ----------
    shapefile : str
        Path to the city's ``.shp`` file.
    force : bool
        When False and the output file already exists, nothing is computed.

    Returns
    -------
    str or None
        A short message when the city is skipped (output already present,
        unreadable shapefile, or no polygon above the area threshold);
        None after the CSV has been written.
    """
    city = " ".join(shapefile.split("/")[-1].split("_")[1:]).replace(".shp", "")

    savefile = "%s/%s/additional_sample_locations.csv" % (outPath, city)
    if os.path.exists(savefile) and not force:
        return "Sample file already exists %s" % savefile

    gdf, _ = load_shapefile(shapefile)
    if gdf is None:
        # load_shapefile reports an unreadable file as (None, None)
        return "Could not read shapefile %s" % shapefile

    gdf_sel = gdf[gdf.SHAPE_AREA >= thresh_area]
    if len(gdf_sel) == 0:
        return "No polygons above the area threshold in %s" % shapefile

    # select polygons to sample
    select_polygons = gdf_sel.groupby("ITEM")\
                        .apply(lambda x: fn_select_polygons(x,
                                            n_samples=N_SAMPLES_PER_CLASS,
                                            max_samples=MAX_SAMPLES_PER_POLY))
    if "ITEM" not in select_polygons.columns:
        select_polygons.reset_index(inplace=True)

    # make sure all polygons are ok
    # some polygons have their geometries messed up in the previous step??
    select_polygons['geometry'] = select_polygons['geometry']\
        .apply(lambda p: p.buffer(0) if not p.is_valid else p)

    def _sample_class(group):
        # roads and railways are sampled on the polygon boundary
        item = group['ITEM'].iloc[0].lower()
        return fn_sample_locations(
            group, sample_on_boundary=('road' in item or 'railway' in item))

    # sample locations from each polygon
    locations = select_polygons.groupby("ITEM").apply(_sample_class)

    print("--> selected %d sampling locations." % len(locations))

    # the per-city folder may not exist yet; tolerate another engine creating
    # it at the same moment
    savedir = os.path.dirname(savefile)
    try:
        os.makedirs(savedir)
    except OSError:
        if not os.path.isdir(savedir):
            raise
    locations.to_csv(savefile)

In [78]:
# one task per shapefile, submitted asynchronously through the load-balanced view
res_locs = lbv.map_async(fn_generate_locations, shapefiles.values())

In [85]:
res_locs.progress

299

In [76]:
# run a single city in the local kernel to inspect its log output
# (entry 47 of the values list was kielce in the recorded run)
fn_generate_locations(shapefiles.values()[47])

kielce.shp: 19232 polygons | 18 land use classes


Self-intersection at or near point 20.458340738537622 51.053626277262069


--> selected 9599 sampling locations.


In [None]:
res_locs.result()

In [None]:
# read the sample locations of one city from the notebook's output folder
# (the location fn_generate_locations writes to) rather than from a
# user-specific absolute path
# NOTE(review): sample_locations_raster_25.csv is produced outside this
# notebook -- confirm it also lives under outPath
bucuresti_dir = "%s/bucuresti" % outPath
tr_df = pd.read_csv("%s/additional_sample_locations.csv" % bucuresti_dir)
ts_df = pd.read_csv("%s/sample_locations_raster_25.csv" % bucuresti_dir)


In [None]:
tr_df['ITEM'].value_counts()

In [None]:
ts_df.head()

In [None]:
ts_df['class'].value_counts()

In [None]:
sys.path.append("/home/adalbert/nbserver/satsense/keras-models/")

import keras_utils as ku


In [None]:
ku.balanced_df(ts_df, nrows=100)['class'].value_counts()