In [1]:
import numpy as np
import pandas as pd

import sys, os, time
import glob

from matplotlib import pyplot as plt
%matplotlib inline

from IPython.display import clear_output

# these magics ensure that external modules that are modified are also automatically reloaded
%load_ext autoreload
%autoreload 2

In [6]:
# for geospatial data
import geopandas as gpd
import shapely

# for interactive plotting
import mplleaflet
import seaborn as sns


# Set up data access

In [8]:

# Root data folder: holds the locations_<city>.csv files and the samples/ tree.
imgPath = "/home/adalbert/data/urban-environments/"

# Folder with one sub-folder of shapefiles per city.
shapefilesDir = "/home/adalbert/data/urban-environments/city-shapefiles/"


In [9]:
# Index the per-city location files by city name, e.g.
# "locations_rio-de-janeiro.csv" -> "rio-de-janeiro".
# The fixed prefix/suffix are sliced off (instead of splitting on "_" and
# "."), so a city name containing either character is kept whole.
locationFiles = {
    os.path.basename(p)[len("locations_"):-len(".csv")]: p
    for p in glob.glob(os.path.join(imgPath, "locations_*.csv"))
}

print(locationFiles)

{'beijing': '/home/adalbert/data/urban-environments/locations_beijing.csv', 'doha': '/home/adalbert/data/urban-environments/locations_doha.csv', 'riyadh': '/home/adalbert/data/urban-environments/locations_riyadh.csv', 'boston': '/home/adalbert/data/urban-environments/locations_boston.csv', 'US': '/home/adalbert/data/urban-environments/locations_US.csv', 'rio-de-janeiro': '/home/adalbert/data/urban-environments/locations_rio-de-janeiro.csv', 'mexico-city': '/home/adalbert/data/urban-environments/locations_mexico-city.csv', 'san-francisco-bay': '/home/adalbert/data/urban-environments/locations_san-francisco-bay.csv', 'los-angeles': '/home/adalbert/data/urban-environments/locations_los-angeles.csv'}


In [None]:
# Load the OSM "admin" shapefile of the city under study (San Francisco Bay).
# For Boston, use "boston/boston_massachusetts_osm_admin.shp" instead.
adminShapefile = shapefilesDir + "san-francisco-bay/san-francisco-bay_california_osm_admin.shp"
gdf = gpd.read_file(adminShapefile)


Function to gather info on image samples

In [None]:
def read_info(imgPath, city):
    """Collect per-sample pixel values and image file paths for one city.

    Scans ``<imgPath>/samples/<city>/<sample ID>/`` for

    * image files (``*.jpg`` / ``*.png``) named ``<source>_...``; every
      distinct source becomes a column holding the path of that image, and
    * ``pixel_data.csv`` files, of which only the first row is used.

    Parameters
    ----------
    imgPath : str
        Root data folder that contains the ``samples`` directory.
    city : str
        Name of the city sub-folder under ``samples``.

    Returns
    -------
    pandas.DataFrame
        One row per sample that has both pixel data and at least one image,
        indexed and sorted by ``"sample ID"``.  Holds the columns read from
        ``pixel_data.csv``, a derived ``"nightlight class"`` column
        (0, 1 or 2) and one file-path column per image source.

    Raises
    ------
    ValueError
        If no ``pixel_data.csv`` file is found for ``city``.
    """
    sampleDir = os.path.join(imgPath, "samples", city)

    # read in info on image files: the parent folder name is the sample ID,
    # the file-name prefix (up to the first "_") is the image source
    imgFiles = glob.glob(os.path.join(sampleDir, "*", "*.jpg")) + \
               glob.glob(os.path.join(sampleDir, "*", "*.png"))
    records = [(int(os.path.basename(os.path.dirname(f))),
                os.path.basename(f).split("_")[0],
                f) for f in imgFiles]
    df = pd.DataFrame(records, columns=["sample ID", "source", "filename"])
    # wide format: one row per sample, one column of file paths per source
    df = df.pivot(index="sample ID", columns="source", values="filename").reset_index()

    # read in low-res pixel data (nightlights, population)
    valFiles = glob.glob(os.path.join(sampleDir, "*", "pixel_data.csv"))
    if not valFiles:
        raise ValueError("No pixel_data.csv files found under %s" % sampleDir)
    valuesDf = pd.DataFrame([pd.read_csv(f).to_dict("records")[0] for f in valFiles])

    # negative population values are counted as missing
    nMissing = int((valuesDf['population value'] < 0).sum())
    print("There are %d missing values for population." % nMissing)

    # convert nightlights values to low/medium/high according to the poverty mapping paper
    valuesDf['nightlight class'] = valuesDf['nightlight value'].apply(
        lambda x: 0 if x < 3 else 1 if x < 34 else 2)

    # merge dataframes (inner join: samples lacking either part are dropped)
    valuesDf = pd.merge(valuesDf, df, on="sample ID")
    return valuesDf.set_index("sample ID").sort_index()

In [None]:
valuesDF = read_info(imgPath, "san-francisco-bay")

def _row_to_point(srs):
    """Build a shapely Point (lon, lat) from one sample row."""
    return shapely.geometry.Point(srs['img lon'], srs['img lat'])

# turn the table into a GeoDataFrame whose geometry is each sample's location
valuesDF = gpd.GeoDataFrame(valuesDF, geometry=valuesDF.apply(_row_to_point, axis='columns'))

valuesDF.head()

Plot sample locations on a map

In [None]:
# Density estimate of the sample locations

f = plt.figure(figsize=(8, 4))
ax = f.gca()

# the samples themselves are drawn faintly (white, alpha=0.1)
valuesDF.plot(ax=ax, alpha=0.1, linewidth=0.25, color='white')

def _point_xy(srs):
    """Return the x / y coordinates of one row's geometry as a Series."""
    return pd.Series({'x': srs.geometry.x, 'y': srs.geometry.y})

sampleXY = valuesDF.apply(_point_xy, axis='columns')
sns.kdeplot(data=sampleXY, ax=ax, alpha=1)
ax.set_axis_off()


In [None]:
# Map of the samples coloured by 'population value', binned into 6 quantile classes

f, ax = plt.subplots(figsize=(10, 6))
# the city shapes are drawn fully transparent (alpha=0)
gdf.plot(ax=ax, alpha=0)
kw = {
    'column': 'population value',
    'k': 6,
    'cmap': 'YlGn',
    'alpha': 1,
    'legend': True,
    'linewidth': 0.5,
}
valuesDF.plot(scheme='QUANTILES', ax=ax, **kw)
ax.set_axis_off()
ax.set_title("Population Density")

In [None]:
# Interactive map (mplleaflet) of up to 500 randomly chosen sample locations.

# np.random.choice(len(...)) yields row POSITIONS, while valuesDF is indexed by
# "sample ID" labels, so the rows must be picked with .iloc.  Sampling is done
# without replacement so that no location is drawn twice.
nShown = min(500, len(valuesDF))
idx = np.random.choice(len(valuesDF), nShown, replace=False)
shownRows = valuesDF.iloc[idx]

f = plt.figure(figsize=(15, 8))
ax = f.gca()
ax.scatter(shownRows['img lon'], shownRows['img lat'])
gdf.plot(ax=ax)
mplleaflet.display(fig=f)

Plot a list of image thumbnails on a canvas

In [None]:
def plot_canvas(data, cmap="jet", gridSize=None, normalize=False):
    """Tile a stack of images into one grid and show it with ``plt.imshow``.

    Parameters
    ----------
    data : array-like
        Array of shape (n, height, width) or (n, height, width, 3).
    cmap : str
        Colormap passed to ``plt.imshow``.
    gridSize : tuple of int, optional
        ``(rows, columns)`` of the grid.  By default a square grid with
        ``ceil(sqrt(n))`` cells per side is used.
    normalize : bool
        If True, rescale ``data`` linearly to [0, 1] before tiling.

    Raises
    ------
    ValueError
        If ``gridSize`` has fewer cells than there are images.
    """
    data = np.asarray(data)

    # normalize data for display
    if normalize:
        lo, hi = data.min(), data.max()
        if hi == lo:
            # constant input: avoid 0/0, which would fill the canvas with NaNs
            data = np.zeros(data.shape, dtype=float)
        else:
            data = (data - lo) / (hi - lo)

    nImages = data.shape[0]
    if gridSize is None:
        # force the grid to be square
        n = int(np.ceil(np.sqrt(nImages)))
        m = n
    else:
        n, m = gridSize
    if n * m < nImages:
        raise ValueError("gridSize %dx%d has only %d cells for %d images"
                         % (n, m, n * m, nImages))

    padding = (((0, n * m - nImages),           # blank tiles to fill the grid
                (0, 1), (0, 1))                 # one-pixel gap after each tile
               + ((0, 0),) * (data.ndim - 3))   # don't pad the last dimension (if there is one)
    data = np.pad(data, padding, mode='constant', constant_values=1)  # pad with ones

    # tile the images:
    # (n*m, h, w, ...) -> (n, m, h, w, ...) -> (n, h, m, w, ...) -> (n*h, m*w, ...)
    data = data.reshape((n, m) + data.shape[1:])
    data = data.transpose((0, 2, 1, 3) + tuple(range(4, data.ndim)))
    data = data.reshape((n * data.shape[1], m * data.shape[3]) + data.shape[4:])

    plt.imshow(data, cmap=cmap); plt.axis('off')

Function to plot samples on each row

In [None]:
import matplotlib.gridspec as gridspec
from scipy.misc import imread
import skimage

def plot_sample(rows, gdf=None, cols=["nightlights", "population", "googlemaps"]):
    """Show the images of several samples as a grid, one sample per row.

    Parameters
    ----------
    rows : pandas.DataFrame
        One row per sample.  Every name in ``cols`` must be a column holding
        an image file path; ``'img lon'`` / ``'img lat'`` are read as well
        when ``gdf`` is given.  The row index is used as the row label.
    gdf : GeoDataFrame, optional
        If given, an extra "location" column is appended in which ``gdf`` is
        plotted with the sample's position marked on it.
    cols : sequence of str
        Image columns to display, left to right.

    Notes
    -----
    Clears the cell output, creates the subplots with ``plt.subplot`` and
    finishes with ``plt.show()``.  A cell whose image path is missing (null)
    is left empty, and nothing at all is drawn when ``rows`` is empty.
    """
    cols = list(cols)  # private copy of the default; also accepts tuples
    nRows = len(rows)
    if nRows == 0:
        return  # an empty grid cannot be laid out
    headers = cols if gdf is None else cols + ["location"]
    nCols = len(headers)

    clear_output(wait=True)
    gs1 = gridspec.GridSpec(nRows, nCols)
    gs1.update(wspace=0.05, hspace=0.05)  # set the spacing between axes
    axes = np.empty((nRows, nCols), dtype=object)
    for i in range(nRows):
        for j in range(nCols):
            axes[i, j] = plt.subplot(gs1[i * nCols + j])

    for ax in axes.flatten():
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        ax.grid(False)

    # column titles on the first row, sample labels on the first column
    for ax, c in zip(axes[0], headers):
        ax.set_title(c, fontsize=16)

    for ax, i in zip(axes[:, 0], rows.index):
        ax.set_ylabel("%s" % i, rotation=0, size='large', labelpad=20)

    # plot grid of images
    for j, (idx, r) in enumerate(rows.iterrows()):
        for i, c in enumerate(cols):
            path = r[c]
            if pd.isnull(path):
                continue  # this sample has no image for this source
            axes[j, i].imshow(imread(path))

    # plot geographical location on last column
    if gdf is not None:
        for i in range(nRows):
            gdf.plot(ax=axes[i, -1])
            axes[i, -1].scatter(rows.iloc[i]['img lon'], rows.iloc[i]['img lat'])
    plt.show()

In [None]:
# Preview the samples at positions 1 and 2 of valuesDF
plt.figure(figsize=(12,8))
previewRows = pd.DataFrame(valuesDF.iloc[[1,2]])
plot_sample(previewRows)

In [None]:
# import matplotlib.gridspec as gridspec
# from scipy.misc import imread
# import skimage

# def read_sample_canvas(rows, cols=["nightlights", "population", "googlemaps"], res=(400,400)):
#     # read in data
#     nRows = len(rows)
#     nCols = len(cols)
#     data = np.zeros((nRows*nCols,) + res + (3,))
#     for i,(idx,r) in enumerate(rows.iterrows()):
#         for j,c in enumerate(cols):
#             img = imread(r[c])
#             if len(img.shape)==2 or img.shape[2] == 1:
#                 img = np.round((img.astype(float) / img.max() * 255))
#                 img = skimage.color.gray2rgb(img)
#             img = skimage.transform.resize(img, res, mode="wrap")
#             data[i*nCols+j] = img
#     return data

# # this gives some weird looking results when upsampling is too extreme (for nightlights and population images)

# canvas = read_sample_canvas(valuesDF[:2])
# plot_canvas(canvas, gridSize=(2,3))

# Set up interactive plots

In [None]:
from IPython.html import widgets
from IPython.display import display

In [None]:
# samples of the city currently selected in the dropdown (Boston at start-up)
valDF = read_info(imgPath,"boston")

# dropdown to change city
def func_change_city(change):
    """Reload ``valDF`` for the newly selected city and redraw the samples.

    ``change`` is the notification dict delivered through
    ``cityWidget.observe``; only changes of the widget's ``value`` are
    acted upon.
    """
    global valDF
    if change['type'] != 'change' or change['name'] != 'value':
        return
    city = change['new']
    print("Reading in data for %s" % city)
    valDF = read_info(imgPath, city)
    # reset the sample selection, then redraw as if the button was clicked
    textWidget.value = "0,1,2"
    func(btn)

# a sorted list (rather than a dict view) gives a stable order of the options
cityWidget = widgets.Dropdown(description="City: ", options=sorted(locationFiles), value="boston")
cityWidget.observe(func_change_city)

# textbox to plot specific samples (if empty, will plot 3 random samples)
textWidget = widgets.Text()
textWidget.value = "0,1,2"

# plot button
def func(btn):
    """Plot the samples listed in ``textWidget`` (3 random ones if it is blank).

    The text box holds comma-separated sample IDs, i.e. labels of
    ``valDF``'s index.  Surrounding blanks and empty entries (such as a
    trailing comma) are ignored.
    """
    clear_output(wait=True)
    plt.figure(figsize=(8,8))
    tokens = [t.strip() for t in textWidget.value.split(",")]
    idx = [int(t) for t in tokens if t]
    if not idx:
        idx = np.random.choice(valDF.index, 3)
    # label-based lookup; .loc replaces the DataFrame.ix indexer, which
    # current pandas no longer provides
    plot_sample(valDF.loc[idx])

btn = widgets.Button(description="Plot sample")
btn.on_click(func)

# Lay the controls out on two tabs: city selection and sample plotting
page1 = widgets.Box(children=[cityWidget])
page2 = widgets.Box(children=[textWidget, btn])
tabs = widgets.Tab(children=[page1, page2])
for tabNo, tabTitle in enumerate(['Geography', 'Samples']):
    tabs.set_title(tabNo, tabTitle)

In [None]:
# render the tabbed control panel (city dropdown / sample text box and button)
display(tabs)

In [None]:
# Show the "googlemaps" image of the sample at position 400 of valDF
samplePath = valDF.iloc[400]["googlemaps"]
plt.imshow(imread(samplePath))
plt.axis("off")

In [None]:
# scratch cell: a bare literal, it only echoes 1 as the cell output
1

In [None]:
# Table of 5 rows of random numbers in [0, 1) under two score headings,
# ordered by decreasing "light use score" and rounded to 3 decimals
scoreColumns = ["economic score", "light use score"]
randomScores = pd.DataFrame(np.random.sample((5,2)), columns=scoreColumns)
randomScores.sort_values("light use score", ascending=False).round(3)