
# Set up

In [5]:
import ipyparallel

# Connect to an already-running IPython parallel cluster. If no controller
# connection file can be found this raises IOError (see the output below).
rc = ipyparallel.Client()
# Direct view addressing every engine of the cluster.
all_engines = rc[:]
# Load-balanced view, used further down to farm out one task per shapefile.
lbv = rc.load_balanced_view()

# Number of engines available.
print len(all_engines)

Waiting for connection file: ~/.ipython/profile_default/security/ipcontroller-client.json


IOError: Connection file '~/.ipython/profile_default/security/ipcontroller-client.json' not found.
You have attempted to connect to an IPython Cluster but no Controller could be found.
Please double-check your configuration and ensure that a cluster is running.

In [7]:
%%px --local

# numeric packages
import numpy as np
import pandas as pd

# filesystem and OS
import sys, os, time
import glob

# plotting
from matplotlib import pyplot as plt
import matplotlib
%matplotlib inline

import seaborn as sns
sns.set_style("whitegrid", {'axes.grid' : False})

# compression
import gzip
import cPickle as pickle
import copy

# geo stuff
import geopandas as gpd
from shapely.geometry import Point

# widgets and interaction
from ipywidgets import FloatProgress
from IPython.display import display, clear_output

import warnings
warnings.filterwarnings('ignore')

# these magics ensure that external modules that are modified are also automatically reloaded
%load_ext autoreload
%autoreload 2

In [3]:
%%px --local

# path to shapefiles

# Root folder of the shapefiles; the glob below looks for *.shp files two
# directory levels beneath it.
shapefiles_path = "/home/adalbert/data/urban-atlas/shapefiles/"

shapefiles = glob.glob("%s/*/*/*.shp"%shapefiles_path)
# Map city key -> shapefile path. The key is the file name with its first
# "_"-separated token dropped, the remaining tokens joined by spaces, and the
# ".shp" suffix removed.
shapefiles = {" ".join(f.split("/")[-1].split("_")[1:]).replace(".shp",""):f for f in shapefiles}


In [8]:
%%px --local

# path to save data

# Root output folder; each city gets its own sub-folder underneath it.
outPath = "/home/adalbert/data/urban-atlas/extracted-data"

# Create the output folder (and any missing parents) on first use.
if not os.path.exists(outPath):
    os.makedirs(outPath)

In [5]:
%%px --local

# Land-use class names, one per line. The position of a name in this list is
# its integer label, so the order must not change once data has been written.
# The names are compared against the ITEM column of the shapefiles.
classes = '''Agricultural + Semi-natural areas + Wetlands
Airports
Construction sites
Continuous Urban Fabric (S.L. > 80%)
Discontinuous Dense Urban Fabric (S.L. : 50% -  80%)
Discontinuous Low Density Urban Fabric (S.L. : 10% - 30%)
Discontinuous Medium Density Urban Fabric (S.L. : 30% - 50%)
Discontinuous Very Low Density Urban Fabric (S.L. < 10%)
Fast transit roads and associated land
Forests
Green urban areas
Industrial, commercial, public, military and private units
Isolated Structures
Land without current use
Mineral extraction and dump sites
Other roads and associated land
Port areas
Railways and associated land
Sports and leisure facilities
Water bodies'''.split("\n")

# Lookups in both directions between a class name and its integer label.
class2label = {c:i for i,c in enumerate(classes)}
label2class = {i:c for i,c in enumerate(classes)}

# Construct ground truth rasters for validation

We also compute summary statistics within square windows of side L = 25, 30 and 50 km centered on the city center:
* percentage of polygons per class 
* percentage of classified area per class
* percentage of classified area vs total area

In [6]:
%%px --local

# satellite imagery modules

import sys
sys.path.append("/home/adalbert/nbserver/satellite-image-tools/satimage-processing/")
import satimg 

In [7]:
%%px --local

def load_shapefile(shapefile):
    '''
    Read a land-use shapefile and reproject it to lon/lat (WGS84).

    All column names except "geometry" are upper-cased. SHAPE_AREA and
    SHAPE_LEN are computed from the geometries when the file does not provide
    them, and are then divided by 1e6 / 1e3 (assumes the native units are
    metres, so that the results are km^2 / km -- TODO confirm per shapefile).

    Parameters
    ----------
    shapefile : str
        Path to the .shp file. A .prj file with the same stem is read too.

    Returns
    -------
    (gdf, prj)
        The reprojected GeoDataFrame and whatever satimg.read_prj returns for
        the projection file, or (None, None) if the shapefile cannot be read.
    '''
    # read in shapefile
    try:
        gdf = gpd.GeoDataFrame.from_file(shapefile)
    except Exception as err:
        # Catch Exception rather than everything, so that KeyboardInterrupt /
        # SystemExit still propagate, and report why the read failed.
        print("--> %s: error reading file! (%s)" % (shapefile, err))
        return None, None

    # Same city key as used for the `shapefiles` dict: all "_"-separated tokens
    # after the first one, without the ".shp" suffix.
    city = " ".join(shapefile.split("/")[-1].split("_")[1:]).replace(".shp", "")
    gdf.columns = [c.upper() if c != "geometry" else c for c in gdf.columns]
    if 'SHAPE_AREA' not in gdf.columns:
        gdf['SHAPE_AREA'] = gdf['geometry'].apply(lambda p: p.area)
    if 'SHAPE_LEN' not in gdf.columns:
        gdf['SHAPE_LEN'] = gdf['geometry'].apply(lambda p: p.length)

    # convert area & length to km (must happen before reprojecting to degrees)
    gdf['SHAPE_AREA'] = gdf['SHAPE_AREA'] / 1.0e6 # convert to km^2
    gdf['SHAPE_LEN']  = gdf['SHAPE_LEN'] / 1.0e3 # convert to km

    # local name chosen so it does not shadow the notebook-level `classes` list
    item_classes = gdf['ITEM'].unique()
    print("%s: %d polygons | %d land use classes" % (city, len(gdf), len(item_classes)))

    # read in projection file associated with shapefile
    prjfile = shapefile.replace(".shp", ".prj")
    prj = satimg.read_prj(prjfile)

    # change coordinate system from northing/easting to lonlat
    targetcrs = {u'ellps': u'WGS84', u'datum': u'WGS84', u'proj': u'longlat'}
    gdf = gdf.to_crs(crs=targetcrs)

    return gdf, prj

In [30]:
# City used for the interactive exploration in the cells below; must be a key
# of the `shapefiles` dict.
city = "bucuresti"

# read in shapefile
shapefile = shapefiles[city]

# gdf: polygons in lon/lat; prj: projection info read from the matching .prj file
gdf, prj = load_shapefile(shapefile)

bucuresti.shp: 12292 polygons | 18 land use classes


In [31]:
gdf['ITEM'].value_counts()

Continuous Urban Fabric (S.L. > 80%)                            7126
Industrial, commercial, public, military and private units      1978
Agricultural + Semi-natural areas + Wetlands                     797
Discontinuous Dense Urban Fabric (S.L. : 50% -  80%)             592
Land without current use                                         359
Green urban areas                                                329
Isolated Structures                                              310
Construction sites                                               181
Water bodies                                                     173
Sports and leisure facilities                                    140
Forests                                                          128
Mineral extraction and dump sites                                 59
Other roads and associated land                                   50
Railways and associated land                                      34
Discontinuous Medium Density Urban

In [10]:
%%px --local

def get_bounds(gdf):
    '''
    Overall bounding box of all geometries in the data frame.

    Reads the `bounds` 4-tuple of every entry in gdf['geometry'] and returns
    (xmin, ymin, xmax, ymax), where the x extremes are taken over positions
    0 and 2 of the tuples and the y extremes over positions 1 and 3.
    '''
    boxes = gdf['geometry'].apply(lambda geom: list(geom.bounds))
    boxes = np.array(boxes.values.tolist())
    # columns 0 and 2 hold x values, columns 1 and 3 hold y values
    x_values = boxes[:, [0, 2]]
    y_values = boxes[:, [1, 3]]
    return x_values.min(), y_values.min(), x_values.max(), y_values.max()


def compute_stats(gdf, prj=""):
    '''
    Print and return extent / coverage statistics for a geo data frame.

    The lon/lat bounding box of `gdf` is converted to x/y with
    satimg.lonlat2xy (assumed to return metres -- the /1e3 factors and the
    "km" labels below rely on that).

    Returns (L, frac_classified): L is the bounding-box diagonal divided by
    sqrt(2), in km; frac_classified is the summed SHAPE_AREA of all polygons
    divided by the bounding-box area.
    '''
    west, south, east, north = get_bounds(gdf)
    x_lo, y_lo = satimg.lonlat2xy((west, south), prj=prj)
    x_hi, y_hi = satimg.lonlat2xy((east, north), prj=prj)
    width = x_hi - x_lo
    height = y_hi - y_lo

    box_area = width / 1.0e3 * height / 1.0e3
    # side of the square that has the same diagonal as the bounding box
    L = np.sqrt(width**2 + height**2) / 1.0e3 / np.sqrt(2)
    classified_area = gdf['SHAPE_AREA'].sum()
    frac_classified = classified_area/box_area

    print("Spatial extent: %2.2f km." % L)
    print("Land use classified area: %2.3f km^2 (%2.2f of total area covered within bounds %2.3f km^2)"%(classified_area, frac_classified, box_area))

    return L, frac_classified

In [11]:
# Sanity-check the extent / coverage statistics on the city loaded above.
L, frac_classified = compute_stats(gdf, prj=prj)

Spatial extent: 79.73 km.
Land use classified area: 1799.517 km^2 (0.35 of total area covered within bounds 5116.257 km^2)


In [12]:
label2class

{0: 'Agricultural + Semi-natural areas + Wetlands',
 1: 'Airports',
 2: 'Construction sites',
 3: 'Continuous Urban Fabric (S.L. > 80%)',
 4: 'Discontinuous Dense Urban Fabric (S.L. : 50% -  80%)',
 5: 'Discontinuous Low Density Urban Fabric (S.L. : 10% - 30%)',
 6: 'Discontinuous Medium Density Urban Fabric (S.L. : 30% - 50%)',
 7: 'Discontinuous Very Low Density Urban Fabric (S.L. < 10%)',
 8: 'Fast transit roads and associated land',
 9: 'Forests',
 10: 'Green urban areas',
 11: 'Industrial, commercial, public, military and private units',
 12: 'Isolated Structures',
 13: 'Land without current use',
 14: 'Mineral extraction and dump sites',
 15: 'Other roads and associated land',
 16: 'Port areas',
 17: 'Railways and associated land',
 18: 'Sports and leisure facilities',
 19: 'Water bodies'}

In [52]:
%%px --local

from geopy.geocoders import Nominatim
from shapely.geometry import Polygon

def get_city_center(shapefile):
    '''
    Geocode the city named in a shapefile path.

    The file name is split on "_": the first two characters of the first
    token are used as country code, the remaining tokens (up to the first
    ".") as the city name. Both are sent to the Nominatim geocoder.

    Returns ((latitude, longitude), country_code), or (None, None) when the
    geocoder finds nothing.
    '''
    geolocator = Nominatim()
    name_tokens = shapefile.split("/")[-1].split("_")
    country_code = name_tokens[0][:2]
    city = " ".join(name_tokens[1:]).split(".")[0]

    query = city + "," + country_code
    location = geolocator.geocode(query)
    if location is None:
        return None, None
    return (location.latitude, location.longitude), country_code


def filter_gdf_by_polygon(gdf, polygon):
    '''
    Return the rows of `gdf` whose geometry intersects `polygon`.

    Works in two passes: the spatial index first narrows the rows down to
    those it reports for the polygon's bounding box, then an exact
    `intersects` test is applied to these candidates only.
    '''
    # coarse pass: candidate row positions from the spatial index
    candidate_rows = list(gdf.sindex.intersection(polygon.bounds))
    candidates = gdf.iloc[candidate_rows]
    # exact pass: keep only the candidates that really intersect
    hits = candidates.intersects(polygon)
    return candidates[hits]


def filter_gdf_by_centered_window(gdf0, center=None, window=None):
    '''
    Keep only the polygons intersecting a box of size `window` around `center`.

    With window=None the input frame is returned unchanged. Otherwise the box
    comes from satimg.bounding_box_at_location(center, window), whose result
    is unpacked as (latmin, lonmin, latmax, lonmax).
    '''
    if window is None:
        return gdf0

    south, west, north, east = satimg.bounding_box_at_location(center, window)
    # polygon corners are given as (lon, lat), i.e. (x, y)
    corners = [(west, south), (east, south), (east, north), (west, north)]
    return filter_gdf_by_polygon(gdf0, Polygon(corners))
    
    
def construct_class_raster(gdf, bbox, grid_size=(100,100)):
    '''
    Rasterize the land-use polygons onto a regular lon/lat grid.

    Parameters
    ----------
    gdf : GeoDataFrame
        Polygons with an "ITEM" column holding the land-use class name.
    bbox : tuple
        (latmin, lonmin, latmax, lonmax) of the area to rasterize.
    grid_size : tuple
        Number of grid cells along (longitude, latitude).

    Returns
    -------
    raster : ndarray of shape (grid_size_lon, grid_size_lat, len(classes))
        For every cell, the fraction of its classified area that belongs to
        each class, ordered as in the notebook-level `classes` list. Cells
        without any classified area stay all-zero.
    locations : DataFrame
        One row per cell with classified area: grid indices, cell-centroid
        lon/lat and the name of the class covering the largest area.
    '''
    grid_size_lon, grid_size_lat = grid_size
    latmin_grid, lonmin_grid, latmax_grid, lonmax_grid = bbox
    latv = np.linspace(latmin_grid, latmax_grid, grid_size_lat+1)
    lonv = np.linspace(lonmin_grid, lonmax_grid, grid_size_lon+1)

    raster = np.zeros((grid_size_lon, grid_size_lat, len(classes)))
    locations = []
    for i in range(grid_size_lon):
        clear_output(wait=True)
        print("%d / %d" % (i, grid_size_lon))
        for j in range(grid_size_lat):
            cell_poly = Polygon([(lonv[i],latv[j]), (lonv[i+1],latv[j]),
                                 (lonv[i+1],latv[j+1]), (lonv[i],latv[j+1])])
            gdf_frame = filter_gdf_by_polygon(gdf, cell_poly)
            if len(gdf_frame) == 0:
                continue
            # Area of each class inside the cell. The 6400**2 factor is a
            # constant scale that cancels in the normalisation below.
            areas_per_class = gdf_frame.groupby("ITEM").apply(
                lambda x: x.intersection(cell_poly)
                           .apply(lambda y: y.area*(6400**2)).sum())
            classified_area = areas_per_class.sum()
            if classified_area > 0:
                areas_per_class = areas_per_class / float(classified_area)
                raster[i,j,:] = [areas_per_class[c] if c in areas_per_class
                                 else 0 for c in classes]
                # also save sampling locations
                # only if we can get ground truth label for the cell
                # idxmax() always returns the index LABEL (the class name);
                # argmax() returns a position in recent pandas versions.
                cell_class = areas_per_class.idxmax()
                loc = (i, j,
                       cell_poly.centroid.xy[0][0],
                       cell_poly.centroid.xy[1][0],
                       cell_class)
                locations.append(loc)

    locations = pd.DataFrame(locations,
                    columns=["grid-i", "grid-j", "lon", "lat", "class"])
    return raster, locations

In [77]:
%%px --local

# Number of raster cells along each side of a window.
grid_cell = 100
# Passed as grid_size to construct_class_raster, which unpacks it as (lon, lat).
grid_size = (grid_cell, grid_cell)
# Side lengths (km) of the square windows around the city center.
window_km_vec = [25, 30, 50]


In [78]:
def fn_generate_stats(shapefile):
    '''
    Compute and save land-use statistics and ground-truth rasters for a city.

    Everything is written to <outPath>/<city>/:
      * basic_stats.csv                      spatial extent, fraction classified
      * stats_class_window_<km>.csv          per-class coverage for each window
      * ground_truth_class_raster_<km>.npz   class-fraction raster per window
      * sample_locations_raster_<km>.csv     labelled grid-cell centroids

    Parameters
    ----------
    shapefile : str
        Path to the city's shapefile; the city name is taken from the file
        name (all "_"-separated tokens after the first, without ".shp").

    Returns
    -------
    str or None
        A status message when the city is skipped or its shapefile cannot be
        read; None after a complete run.
    '''
    city = " ".join(shapefile.split("/")[-1].split("_")[1:]).replace(".shp","")

    # weird issues with several cities, skip
    if city in ["limoges", "linz"]:
        return "Error for city %s"%city

    print("Processing %s" % city)

    savedir = "%s/%s/"%(outPath, city)
    if not os.path.exists(savedir):
        os.makedirs(savedir)

    # Skip the city only if BOTH per-window outputs exist for EVERY window.
    # (Counting files that contain 'raster' does not work: every window writes
    # two such files, so a finished city has 2 * len(window_km_vec) of them.)
    expected_outputs = [pattern % (savedir, w)
                        for w in window_km_vec
                        for pattern in ("%s/ground_truth_class_raster_%d.npz",
                                        "%s/sample_locations_raster_%d.csv")]
    if all(os.path.exists(f) for f in expected_outputs):
        return "Already processed!"

    gdf, prj = load_shapefile(shapefile)
    if gdf is None:
        return "Error reading shapefile %s"%shapefile

    # there's some weird issue with the shapefile for Graz
    # lat and lon are inverted?
    # Swap the axes BEFORE computing bounds / center, so that everything
    # derived below is consistent with the corrected geometry.
    axes_swapped = city in ["graz"]
    if axes_swapped:
        gdf['geometry'] = gdf['geometry'].apply(
                lambda p: Polygon(list(zip(p.exterior.coords.xy[1],
                                           p.exterior.coords.xy[0]))))

    lonmin, latmin, lonmax, latmax = get_bounds(gdf)
    bounds_center = ((latmin+latmax)/2.0, (lonmin+lonmax)/2.0)

    if axes_swapped:
        # for Graz the center of the bounding box is used, not the geocoder
        city_center = bounds_center
    else:
        city_center, _ = get_city_center(shapefile)
        if city_center is None:
            # geocoder found nothing: fall back to the bounding-box center
            city_center = bounds_center

    # compute spatial extent of city and fraction of land classified
    L, frac_classified = compute_stats(gdf, prj=prj)
    df = pd.DataFrame([L, frac_classified],
                      index=["spatial extent", "pct land classified"]).T
    df.to_csv("%s/basic_stats.csv"%savedir)

    # number of polygons per class in the whole city (same for every window)
    polys_per_class_total = gdf.groupby("ITEM").size()

    for window_km in window_km_vec:
        window = (window_km, window_km)
        gdf_window = filter_gdf_by_centered_window(gdf, center=city_center, window=window)

        # compute stats
        area_per_class = gdf_window.groupby("ITEM")["SHAPE_AREA"].sum()
        class_coverage_by_area = area_per_class / float(window[0]*window[1])
        class_coverage_by_poly = gdf_window.groupby("ITEM").size() / polys_per_class_total
        class_coverage_by_area_classified = area_per_class / gdf_window['SHAPE_AREA'].sum()

        # format and save stats
        stats_df = pd.concat([class_coverage_by_area, class_coverage_by_poly, class_coverage_by_area_classified], axis=1)
        stats_df.columns = ["pct area", "pct polygons", "pct classified area"]
        stats_df['window km'] = window_km
        # one row per known class, in canonical order (NaN where absent)
        stats_df = stats_df.reindex(classes)
        stats_df.to_csv("%s/stats_class_window_%d.csv"%(savedir,window_km))

        # compute raster for given window size
        bbox = satimg.bounding_box_at_location(city_center, window)
        raster, locations_df = construct_class_raster(gdf_window, bbox, grid_size=grid_size)
        np.savez_compressed("%s/ground_truth_class_raster_%d.npz"%(savedir,window_km), raster)
        locations_df.to_csv("%s/sample_locations_raster_%d.csv"%(savedir,window_km))

In [54]:
# city_center, country_code = get_city_center(shapefile)
# lonmin, latmin, lonmax, latmax = get_bounds(gdf)
# bounds_gdf = Polygon([(lonmin,latmin), (lonmax,latmin), (lonmax,latmax), (lonmin,latmax)])
# window = (window_km_vec[0], window_km_vec[0])
# gdf_window = filter_gdf_by_centered_window(gdf, center=city_center, window=window)
# bbox = satimg.bounding_box_at_location(city_center, window)
# raster, locations_df = construct_class_raster(gdf_window, bbox, grid_size=grid_size)


In [79]:
# Dispatch one fn_generate_stats task per shapefile through the load-balanced
# view. The call is asynchronous; progress is polled via `res.progress` below.
res = lbv.map_async(fn_generate_stats, shapefiles.values())

In [408]:
res.progress

101

In [57]:
# res.result()

# Generate locations to extract imagery at

Our sampling strategy has the following goals:
* ensure that a uniform $100 \times 100 ~ (25km \times 25km)$ "main grid" is completely sampled (except for where there are no ground truth polygons). We generate samples in this grid first, and assign the ground truth label of the image sampled in each grid cell to the class of the polygon that has the maximum intersection area with that cell; 
* ensure that the resulting dataset is balanced with respect to the land use classes. The trouble is that the classes are highly imbalanced among the polygons in the dataset (e.g., many more polygons are agricultural land and isolated structures than airports).
* sample additional polygons apart from the ones in the initial grid, such that only polygons above a certain threshold size are considered (so that we can ensure that the sampled images contain a large enough area of the class they represent). 
* to ensure a closer match between labels and sampled images, sample more images from polygons with larger areas.

In [463]:
%%px --local

# Ground area covered by one sampled image (presumably 224 px at 1.19 m/px).
img_area = (224 * 1.19/ 1000)**2 # in km^2, at zoom level 17
# a polygon must cover at least this FRACTION (0-1) of the image area to be sampled
thresh_frac = 0.25
thresh_area = img_area * thresh_frac
# print "Threshold area: %2.2f km^2"%thresh_area

n_classes = len(classes)

N_SAMPLES_PER_CITY  = 25000
# floor division: the per-class quota must stay an integer sample count
N_SAMPLES_PER_CLASS = N_SAMPLES_PER_CITY // n_classes
# intended cap for the `max_samples` argument of fn_select_polygons
MAX_SAMPLES_PER_POLY= 50

In [366]:
# Keep only the polygons whose area reaches the threshold area.
gdf_sel = gdf[gdf.SHAPE_AREA>=thresh_area]
# number of polygons before / after the area filter
print len(gdf), len(gdf_sel)

# polygons left per class after filtering
gdf_sel.groupby("ITEM").apply(len)

12292 4674


ITEM
Agricultural + Semi-natural areas + Wetlands                     594
Airports                                                           4
Construction sites                                                94
Continuous Urban Fabric (S.L. > 80%)                            2245
Discontinuous Dense Urban Fabric (S.L. : 50% -  80%)             284
Discontinuous Medium Density Urban Fabric (S.L. : 30% - 50%)       2
Fast transit roads and associated land                             3
Forests                                                          117
Green urban areas                                                123
Industrial, commercial, public, military and private units       857
Isolated Structures                                                2
Land without current use                                          69
Mineral extraction and dump sites                                 42
Other roads and associated land                                    7
Railways and associated land 

In [471]:
%%px --local

def fn_select_polygons(df, n_samples=1000, max_samples=None, random_state=None):
    '''
    Choose which polygons to sample and how many samples to draw from each.

    Every polygon starts with a sample count proportional to its area
    (area / smallest area in `df`, truncated to int). If these counts sum to
    more than `n_samples`, the polygons are split into percentile bins of
    that count, a growing fraction of each bin is kept at random (20% of the
    smallest ones up to 100% of the largest), and the counts of the kept
    polygons are rescaled (rounding up) to sum to roughly `n_samples`.

    Parameters
    ----------
    df : DataFrame
        Polygons with a positive SHAPE_AREA column. It is NOT modified.
    n_samples : int
        Target total number of samples.
    max_samples : int or None
        Optional upper bound on the samples per polygon.
    random_state : int, numpy RandomState or None
        Forwarded to pandas `sample` to make the selection reproducible.

    Returns
    -------
    DataFrame
        A copy of the selected rows of `df` with an integer 'samples' column.
    '''
    samples_per_poly = (df.SHAPE_AREA/float(df.SHAPE_AREA.min())).astype(int)

    if samples_per_poly.sum() > n_samples:
        pvec = np.array([0.0, 0.2, 0.5, 0.7, 0.9, 0.95, 1])
        bins = np.percentile(samples_per_poly, pvec*100)
        cnts, _ = np.histogram(samples_per_poly, bins)

        ret = []
        x = samples_per_poly
        last_bin = len(bins) - 2
        for i in range(len(bins)-1):
            if cnts[i] == 0:
                continue
            # bins are half-open, except the last one which includes its upper edge
            below_upper = (x <= bins[i+1]) if i == last_bin else (x < bins[i+1])
            y = x[(x >= bins[i]) & below_upper]
            # keep a larger share of the polygons in the higher bins
            y = y.sample(frac=pvec[i+1], random_state=random_state)
            ret.append(y)
        ret = pd.concat(ret)
        ret_scaled = (ret.astype(float) / ret.sum() * n_samples)\
                        .apply(np.ceil).astype(int)
        # copy, so that adding the column never writes into the caller's frame
        ret_df = df.loc[ret_scaled.index].copy()
        ret_df['samples'] = ret_scaled.values
    else:
        ret_df = df.copy()
        ret_df['samples'] = samples_per_poly.values

    # clamp # samples per polygon if specified
    if max_samples is not None:
        ret_df['samples'] = ret_df['samples'].clip(upper=max_samples)
    ret_df['samples'] = ret_df['samples'].astype(int)
    return ret_df

In [383]:
# Select polygons class by class, aiming at N_SAMPLES_PER_CLASS samples each.
# NOTE(review): `max_samples` is not passed here, so MAX_SAMPLES_PER_POLY is
# not applied -- confirm whether that is intended.
select_polygons = gdf_sel.groupby("ITEM")\
    .apply(lambda x: fn_select_polygons(x, n_samples=N_SAMPLES_PER_CLASS))


In [394]:
# Per class, side by side: polygons in the city (column 0), polygons above
# the area threshold (column 1), and total samples allocated (column 2).
pd.concat([gdf.groupby("ITEM").apply(len),
           gdf_sel.groupby("ITEM").apply(len),
           select_polygons.groupby("ITEM").apply(lambda x: x['samples'].sum())
          ], 1)

Unnamed: 0,0,1,2
Agricultural + Semi-natural areas + Wetlands,797,594.0,1479.0
Airports,4,4.0,433.0
Construction sites,181,94.0,287.0
Continuous Urban Fabric (S.L. > 80%),7126,2245.0,2263.0
Discontinuous Dense Urban Fabric (S.L. : 50% - 80%),592,284.0,1180.0
Discontinuous Low Density Urban Fabric (S.L. : 10% - 30%),4,,
Discontinuous Medium Density Urban Fabric (S.L. : 30% - 50%),25,2.0,4.0
Fast transit roads and associated land,3,3.0,6.0
Forests,128,117.0,1287.0
Green urban areas,329,123.0,585.0


In [472]:
%%px --local

def fn_sample_locations(df, sample_on_boundary=False):
    '''
    Generate (lon, lat) sampling locations for the polygons in `df`.

    Parameters
    ----------
    df : DataFrame
        Must provide a 'geometry' column (polygons) and a 'samples' column
        (number of locations wanted per polygon).
    sample_on_boundary : bool
        False: every polygon contributes its centroid; polygons asking for
        m > 1 samples additionally get m-1 locations from
        satimg.generate_locations_within_polygon.
        True: locations are vertices of the polygon's exterior ring, picked
        at random, at most as many as the ring has vertices.

    Returns
    -------
    DataFrame with columns "lon" and "lat".
    '''
    polygons = df['geometry']
    nsamples = df['samples']

    if sample_on_boundary:
        picked = []
        for poly, m in zip(polygons, nsamples):
            # materialise the vertex list: it is indexed below, which a lazy
            # zip object does not support
            vertices = list(zip(poly.exterior.coords.xy[0], poly.exterior.coords.xy[1]))
            # NOTE(review): np.random.choice draws WITH replacement by default,
            # so the same vertex can be picked more than once.
            chosen = np.random.choice(np.arange(0, len(vertices)), min([len(vertices), m]))
            picked.extend(vertices[k] for k in chosen)
        locs = np.array(picked)
    else:
        centroids = np.array([(p.centroid.coords.xy[0][0], p.centroid.coords.xy[1][0])
                              for p in polygons])
        wants_more = nsamples > 1
        if wants_more.sum() > 0:
            # the centroid counts as one sample, hence m-1 additional ones
            interior = [satimg.generate_locations_within_polygon(p, nSamples=m-1, strict=True)
                        for p, m in zip(polygons[wants_more], nsamples[wants_more])]
            interior = np.vstack(interior).squeeze()
            locs = np.vstack([interior, centroids])
        else:
            locs = centroids

    ret = pd.DataFrame(locs, columns=["lon", "lat"])
    return ret


In [403]:
# Sample locations class by class. Classes whose name contains "road" or
# "railway" are sampled on the polygon boundary, all others from the centroid
# and interior.
locations = select_polygons.groupby("ITEM")\
                .apply(lambda x: fn_sample_locations(x,
                        sample_on_boundary = ('road' in x['ITEM'].iloc[0].lower() or 'railway' in x['ITEM'].iloc[0].lower())
            ))

    
# locations.to_csv("%s/samples_%s.csv"%(outPath, city))

print locations.shape

locations.reset_index().groupby("ITEM").apply(len)

(12195, 2)


ITEM
Agricultural + Semi-natural areas + Wetlands                    1479
Airports                                                         433
Construction sites                                               287
Continuous Urban Fabric (S.L. > 80%)                            2263
Discontinuous Dense Urban Fabric (S.L. : 50% -  80%)            1180
Discontinuous Medium Density Urban Fabric (S.L. : 30% - 50%)       4
Fast transit roads and associated land                             6
Forests                                                         1287
Green urban areas                                                585
Industrial, commercial, public, military and private units      1568
Isolated Structures                                                2
Land without current use                                         131
Mineral extraction and dump sites                                123
Other roads and associated land                                 1013
Railways and associated land 

# Generate locations for all cities in the Urban Atlas dataset

It takes ~30 seconds per city, so this will take a while...

In [407]:
outPath

'/home/adalbert/data/urban-atlas/extracted-data'

In [473]:
def fn_generate_locations(shapefile):
    '''
    Generate the additional (non-grid) sampling locations for one city.

    Polygons at least `thresh_area` large are selected per class with
    fn_select_polygons, locations are drawn with fn_sample_locations (on the
    boundary for road / railway classes) and the result is written to
    <outPath>/<city>/additional_sample_locations.csv.

    Parameters
    ----------
    shapefile : str
        Path to the city's shapefile.

    Returns
    -------
    str or None
        A status message if the output already exists or the shapefile
        cannot be read; None after the file has been written.
    '''
    city = " ".join(shapefile.split("/")[-1].split("_")[1:]).replace(".shp","")

    savedir = "%s/%s" % (outPath, city)
    savefile = "%s/additional_sample_locations.csv" % savedir
    if os.path.exists(savefile):
        return "Sample file already exists %s" % savefile

    gdf, _ = load_shapefile(shapefile)
    if gdf is None:
        # load_shapefile signals an unreadable file with (None, None)
        return "Error reading shapefile %s" % shapefile
    gdf_sel = gdf[gdf.SHAPE_AREA>=thresh_area]

    # select polygons to sample
    select_polygons = gdf_sel.groupby("ITEM")\
                        .apply(lambda x: fn_select_polygons(x, n_samples=N_SAMPLES_PER_CLASS))
    if "ITEM" not in select_polygons.columns:
        select_polygons.reset_index(inplace=True)

    # make sure all polygons are ok
    # some polygons have their geometries messed up in the previous step??
    select_polygons['geometry'] = select_polygons['geometry'].apply(
                                    lambda p: p if p.is_valid else p.buffer(0))

    # sample locations from each polygon
    locations = select_polygons.groupby("ITEM")\
                .apply(lambda x: fn_sample_locations(x,
                        sample_on_boundary = 'road' in x['ITEM'].iloc[0].lower() \
                                                or 'railway' in x['ITEM'].iloc[0].lower()))

    print("--> selected %d sampling locations." % len(locations))
    # the city folder may not exist yet if the stats step was never run
    if not os.path.exists(savedir):
        os.makedirs(savedir)
    locations.to_csv(savefile)

In [474]:
# Dispatch one fn_generate_locations task per shapefile through the
# load-balanced view; progress is polled via `res_locs.progress` below.
res_locs = lbv.map_async(fn_generate_locations, shapefiles.values())

In [478]:
res_locs.progress

299

# Compute statistics on locations generated for a few cities

In [1]:
# Cities for which location statistics are computed below; each name is used
# as a sub-folder of outPath.
cities = ["bucuresti", "berlin", "barcelona", "paris", "athina", \
          "firenze", "dublin", "london", "tallinn", "bremen"]

In [2]:
def load_locations(workdir="."):
    '''
    Load and merge the grid and the additional sampling locations of a city.

    Parameters
    ----------
    workdir : str
        Folder containing sample_locations_raster_25.csv and
        additional_sample_locations.csv. Defaults to the current working
        directory, so existing calls without arguments keep working.

    Returns
    -------
    DataFrame
        Columns lon, lat, grid-i, grid-j, class with a fresh 0..n-1 index.
        Grid samples come first; grid-i / grid-j are NaN for the additional
        samples.
    '''
    grid_file = os.path.join(workdir, "sample_locations_raster_25.csv")
    more_file = os.path.join(workdir, "additional_sample_locations.csv")

    # "Unnamed: N" columns are the unnamed index levels written by to_csv
    grid_locations_df = pd.read_csv(grid_file).drop("Unnamed: 0", axis=1)
    more_locations_df = pd.read_csv(more_file)\
                            .rename(columns={"ITEM":"class"})\
                            .drop("Unnamed: 1", axis=1)
    print("Grid samples: %d. Additional samples: %d" %
            (len(grid_locations_df), len(more_locations_df)))

    # additional samples do not belong to any grid cell
    more_locations_df['grid-i'] = np.nan
    more_locations_df['grid-j'] = np.nan
    columns = ["lon", "lat", "grid-i", "grid-j", "class"]
    locations = pd.concat([grid_locations_df[columns], more_locations_df[columns]])
    locations = locations.reset_index(drop=True)

    return locations

In [13]:
# Collect the sampling locations of every city that has an img/ folder.
stats_df = []
for city in cities:
    workdir = "%s/%s" % (outPath, city)
    # load_locations() reads its CSV files from the current working directory,
    # hence the chdir. Note that this changes the cwd of the whole kernel.
    os.chdir(workdir)
    # only cities with an ./img folder are included
    if os.path.isdir("./img"):
        print city,
        locations = load_locations()
        stats_df.append(locations)
# stack the per-city frames into a single one
stats_df = pd.concat(stats_df, 0)

bucuresti Grid samples: 9745. Additional samples: 12213
berlin Grid samples: 10000. Additional samples: 49325
barcelona Grid samples: 5944. Additional samples: 18299
athina Grid samples: 8738. Additional samples: 18544
dublin Grid samples: 8514. Additional samples: 22391
tallinn Grid samples: 7000. Additional samples: 16414


In [14]:
stats_df.shape

(187127, 5)

In [None]:
# List every downloaded image, expected at <outPath>/<city>/img/<class>/<name>.jpg.
# os.path.join supplies the separator between outPath and the city (outPath has
# no trailing slash, so plain string concatenation would never match anything).
files = [f for c in cities
           for f in glob.glob(os.path.join(outPath, c, "img", "*", "*.jpg"))]
# drop the images whose file name contains 'grid'
files = [f for f in files if 'grid' not in os.path.basename(f)]
# building from a dict keeps the 'filename' column even when no file is found
files_df = pd.DataFrame({"filename": files})
# class and city are encoded in the path: .../<city>/img/<class>/<file>
files_df['class'] = files_df['filename'].apply(lambda x: x.split("/")[-2])
files_df['city'] = files_df['filename'].apply(lambda x: x.split("/")[-4])
