In [3]:
import pandas as pd
import numpy as np

# Read the zip code prefix as text so prefixes such as 01001 keep their leading zero.
geo = pd.read_csv(
    'olist_geolocation_dataset.csv',
    dtype={'geolocation_zip_code_prefix': str},
)

In [None]:
# Preview the first rows of the loaded geolocation table.
geo.head()

: 

: 

: 

In [None]:
# Store the leading 1, 2, 3 and 4 characters of every zip code prefix in their own
# columns, so we can explore further how zip codes work.
for n_chars in (1, 2, 3, 4):
    digits_col = 'geolocation_zip_code_prefix_{}_digits'.format(n_chars)
    geo[digits_col] = geo['geolocation_zip_code_prefix'].str[0:n_chars]
geo.head()

: 

: 

: 

In [None]:
# Summary statistics of how many coordinate rows each zip code prefix has.
(
    geo['geolocation_zip_code_prefix']
    .value_counts()
    .to_frame()
    .describe()
    .round(2)
)

: 

: 

: 

There are 19,051 different zip code prefixes. On average there are 52.6 coordinates for each prefix, but there is one prefix with 1,146 coordinates available.

There are some outlier coordinates in the dataset that fall outside of Brazilian territory. Let's guarantee that all coordinates are within a rectangle delimited by the limits of Brazil.

In [None]:
# Removing some outliers: keep only coordinates inside the bounding box of Brazil.
# The limits are the extreme points of the territory, converted from
# degrees/minutes/seconds to decimal degrees (south and west are negative).

# Brazil's most northern spot is at 5 deg 16′ 27.8″ N latitude;
# its most southern spot is at 33 deg 45′ 04.21″ S latitude.
inside_lat = geo.geolocation_lat.between(-33.75116944, 5.27438888)

# Its most western spot is at 73 deg 58′ 58.19″ W longitude;
# its most eastern spot is at 34 deg 47′ 35.33″ W longitude.
inside_lng = geo.geolocation_lng.between(-73.98283055, -34.79314722)

# between() is inclusive on both ends, matching the original >= / <= comparisons.
# .copy() makes the filtered frame independent of the unfiltered one, so adding
# columns to `geo` afterwards does not raise pandas' SettingWithCopyWarning.
geo = geo[inside_lat & inside_lng].copy()

: 

: 

: 

Then we take the longitude/latitude coordinates and transform them to Mercator x/y coordinates.

In [None]:
from datashader.utils import lnglat_to_meters as webm

# Convert longitude/latitude (degrees) into the x/y coordinates used by the maps below.
x, y = webm(geo.geolocation_lng, geo.geolocation_lat)

# assign() returns a new frame with both columns added. Series values are aligned on
# the index (and arrays are taken positionally), so no extra pd.Series() wrapping is
# needed, and the filtered frame is not mutated in place (no SettingWithCopyWarning).
geo = geo.assign(x=x, y=y)

: 

: 

: 

In [None]:
# Show the first three rows, now including the x and y columns.
geo.head(3)

: 

: 

: 

### Zip Codes in Brazil

Finally, we plot the coordinates on a map. We see there is a relationship between the zip code prefix and the location of that zip code. They start in Sao Paulo, with prefix 01001, and then increase counterclockwise, finishing in Rio Grande do Sul (south of Brazil), with prefix 99990.

In [None]:
# transforming the prefix to int for plotting purposes
geo['geolocation_zip_code_prefix'] = geo['geolocation_zip_code_prefix'].astype(int)
geo['geolocation_zip_code_prefix_1_dgits'] = geo['geolocation_zip_code_prefix_1_digits'].astype(int)
geo['geolocation_zip_code_prefix_2_dgits'] = geo['geolocation_zip_code_prefix_2_digits'].astype(int)
geo['geolocation_zip_code_prefix_3_dgits'] = geo['geolocation_zip_code_prefix_3_digits'].astype(int)
geo['geolocation_zip_code_prefix_4_dgits'] = geo['geolocation_zip_code_prefix_4_digits'].astype(int)

: 

: 

: 

In [None]:
# `brazil` is a second name for the same DataFrame object as `geo` (no copy is made).
brazil = geo
# Column handed to the datashader aggregators in the plotting cells.
agg_name = 'geolocation_zip_code_prefix'
# Summary statistics of that column for the whole country.
brazil[agg_name].describe().to_frame()

: 

: 

: 

In [None]:
# plot with holoviews + datashader - bokeh with map background
import holoviews as hv
import geoviews as gv
import datashader as ds
from colorcet import fire, rainbow, bgy, bjy, bkr, kb, kr
from datashader.colors import colormap_select, Greys9
from holoviews.streams import RangeXY
from holoviews.operation.datashader import datashade, dynspread, rasterize
from bokeh.io import push_notebook, show, output_notebook
from IPython.display import display

# Enable Bokeh output inside the notebook and select Bokeh as the HoloViews backend.
output_notebook()
hv.extension('bokeh')

# Default plot/style options for Overlay and QuadMesh elements created below.
%opts Overlay[width=800 height=600 toolbar='above' xaxis=None yaxis=None]
%opts QuadMesh [tools=['hover'] colorbar=True] (alpha=0 hover_alpha=0.2)

# Passed to dynspread() inside plot_map as `threshold` and `max_px` respectively.
T = 0.05
PX = 1

def plot_map(data, label, agg_data, agg_name, cmap):
    """Build an interactive map: background tiles, datashaded points and a hover grid.

    Parameters
    ----------
    data : table with 'x' and 'y' columns plus the `agg_name` column.
    label : title given to the returned overlay.
    agg_data : datashader aggregator (for example ``ds.min(agg_name)``) used both
        for the shaded points and for the hover grid.
    agg_name : name of the column of `data` carried along as value dimension.
    cmap : colormap applied to the shaded points and to the hover grid.

    Returns
    -------
    The overlay ``tiles * spread points * hover grid``, relabelled with `label`.
    """
    tile_url = "http://server.arcgisonline.com/ArcGIS/rest/services/Canvas/World_Dark_Gray_Base/MapServer/tile/{Z}/{Y}/{X}.png"
    point_layer = hv.Points(gv.Dataset(data, kdims=['x', 'y'], vdims=[agg_name]))

    # Shaded point layer, spread using the module-level T / PX settings.
    shaded = datashade(point_layer, element_type=gv.Image, aggregator=agg_data, cmap=cmap)
    spread_layer = dynspread(shaded, threshold=T, max_px=PX)

    # 50x25 rasterized grid, turned into a QuadMesh that serves as the hover layer.
    coarse_grid = rasterize(point_layer, aggregator=agg_data, width=50, height=25, streams=[RangeXY])
    hover_layer = hv.util.Dynamic(coarse_grid, operation=hv.QuadMesh).options(cmap=cmap)

    return (gv.WMTS(tile_url) * spread_layer * hover_layer).relabel(label)

: 

: 

: 

In [None]:
# Interactive map of every point in `brazil`, aggregated with ds.min over the zip code prefix.
display(plot_map(brazil, 'Zip Codes in Brazil', ds.min(agg_name), agg_name, cmap=rainbow))

: 

: 

: 

In [None]:
# plot with datashader - image with black background
import datashader as ds
from datashader import transfer_functions as tf
from functools import partial
from datashader.utils import export_image
from IPython.core.display import HTML
from colorcet import fire, rainbow, bgy, bjy, bkr, kb, kr

# Background colour of the exported images.
background = "black"
# Colormap helper; `reverse` is False as long as the background is "black".
cm = partial(colormap_select, reverse=(background!="black"))
# Image exporter preconfigured with the background colour and the "export" output path.
export = partial(export_image, background = background, export_path="export")
# Inject CSS that makes elements with class "container" use the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Default canvas width (pixels) used by create_map.
W = 700

def create_map(data, cmap, data_agg, export_name='img'):
    """Rasterize the x/y points of `data` with datashader and export the shaded image.

    Parameters
    ----------
    data : table with 'x' and 'y' columns (plus any column `data_agg` refers to).
    cmap : colormap passed to ``tf.shade``.
    data_agg : datashader reduction applied to the points (e.g. ``ds.mean(col)``).
    export_name : name handed to the module-level ``export`` helper.

    Returns
    -------
    Whatever ``export(img, export_name)`` returns.
    """
    x_lo, x_hi = data.x.min(), data.x.max()
    y_lo, y_hi = data.y.min(), data.y.max()

    # The same margin (1/50 of the x extent) is added on all four sides.
    margin = (x_hi - x_lo) / 50
    x_range = (x_lo - margin, x_hi + margin)
    y_range = (y_lo - margin, y_hi + margin)
    aspect = (y_range[1] - y_range[0]) / (x_range[1] - x_range[0])

    # Default: fixed width W, height follows the aspect ratio.
    width = int(W)
    height = int(width * aspect)
    if aspect > 1.5:
        # Tall extents: fix the height instead and derive the width from it.
        height = 550
        width = int(height / aspect)

    canvas = ds.Canvas(plot_width=width, plot_height=height, x_range=x_range, y_range=y_range)
    shaded = tf.shade(canvas.points(data, 'x', 'y', data_agg), cmap=cmap, how='eq_hist')
    return export(shaded, export_name)

: 

: 

: 

In [None]:
# Static image of all of Brazil, aggregated with ds.mean over the zip code prefix; exported as 'brazil_zip_codes'.
create_map(brazil, rainbow, ds.mean(agg_name), 'brazil_zip_codes')

: 

: 

: 

#### Zip Codes in States

Let's look at the state of Sao Paulo (SP) to see how zip code prefixes work at a regional level. We see that:

- zip code prefixes in Sao Paulo state range from 01001 to 19990
- zip codes starting with 0 are in the Sao Paulo metro region
- zip codes starting with 1 are in the interior of the state

In [None]:
def filter_data(level, name, data=None):
    """Select the rows where column `level` equals `name` and trim coordinate outliers.

    Parameters
    ----------
    level : name of the column to filter on (e.g. 'geolocation_state').
    name : value the column must equal; it is compared with ``==``, so its type
        has to match the column's dtype.
    data : optional DataFrame with `level`, 'x' and 'y' columns. Defaults to the
        notebook-level ``geo`` frame, which keeps existing two-argument calls working.

    Returns
    -------
    DataFrame with the matching rows whose x, and then y, lie within the
    0.1% - 99.9% quantile range.
    """
    if data is None:
        data = geo
    df = data[data[level] == name]
    # remove outliers: trim x first, then trim y using quantiles of the x-trimmed rows
    df = df[df.x.between(df.x.quantile(0.001), df.x.quantile(0.999))]
    df = df[df.y.between(df.y.quantile(0.001), df.y.quantile(0.999))]

    return df

: 

: 

: 

In [None]:
# Rows whose state is 'SP', with extreme x/y coordinates trimmed by filter_data.
sp = filter_data('geolocation_state', 'SP')
agg_name = 'geolocation_zip_code_prefix'
# Summary statistics of the zip code prefix within the state.
sp[agg_name].describe().to_frame().round(2)

: 

: 

: 

In [None]:
# Interactive map of the SP subset, aggregated with ds.min over the zip code prefix.
plot_map(sp, 'Zip Codes in Sao Paulo State', ds.min(agg_name), agg_name, cmap=rainbow)

: 

: 

: 

In [None]:
# Static image of the SP subset, aggregated with ds.mean; exported as 'sp_zip_codes'.
create_map(sp, rainbow, ds.mean(agg_name), 'sp_zip_codes')

: 

: 

: 

#### Zip Codes in Large Cities

Let's look at the city of Sao Paulo to see how zip code prefixes work at a city level. We see that:
- zip code prefixes in Sao Paulo city range from 01001 to 09540
- zip code prefixes are somehow related to neighborhoods or city districts

In [None]:
# Rows whose city is 'sao paulo', with extreme x/y coordinates trimmed by filter_data.
saopaulo = filter_data('geolocation_city', 'sao paulo')
agg_name = 'geolocation_zip_code_prefix'
# Summary statistics of the zip code prefix within the city.
saopaulo[agg_name].describe().to_frame()

: 

: 

: 

In [None]:
# Interactive map of the Sao Paulo city subset, aggregated with ds.min over the zip code prefix.
plot_map(saopaulo, 'Zip Codes in Sao Paulo City', ds.min(agg_name), agg_name, cmap=rainbow)

: 

: 

: 

In [None]:
# Static image of the Sao Paulo city subset, aggregated with ds.mean; exported as 'sao_paulo_zip_codes'.
create_map(saopaulo, rainbow, ds.mean(agg_name), 'sao_paulo_zip_codes')

: 

: 

: 

#### Zip Codes in Small Cities

Let's look at the city of Atibaia to see how zip code prefixes work at a city level. We see that:
- the zip code prefix of Atibaia city is between 12940 and 12954
- but there are other neighboring cities with the same zip code prefix
- to have more detail and go down to a city level we would probably need more zip code digits (the 4th and 5th digit)

In [None]:
# Rows whose city is 'atibaia'. Selected directly from `geo`, so unlike the
# filter_data subsets no quantile-based outlier trimming is applied here.
atibaia = geo[geo['geolocation_city'] == 'atibaia']
agg_name = 'geolocation_zip_code_prefix'
# Summary statistics of the zip code prefix within the city.
atibaia[agg_name].describe().to_frame().round(2)

: 

: 

: 

In [None]:
# Interactive map of the Atibaia subset, aggregated with ds.min over the zip code prefix.
plot_map(atibaia, 'Zip Codes in Atibaia', ds.min(agg_name), agg_name, cmap=rainbow)

: 

: 

: 

In [None]:
# Static image of the Atibaia subset, aggregated with ds.mean; exported as 'atibaia_zip_codes'.
create_map(atibaia, rainbow, ds.mean(agg_name), 'atibaia_zip_codes')

: 

: 

: 

#### Coverage of zip code digits

What does each digit in the zip code mean? Let's see how it behaves, from 1 to 5 digits.

**Zip codes starting with 2 are all from Rio de Janeiro (RJ) and Espírito Santo (ES) States**

In [None]:
# Zip code: 2
df = filter_data('geolocation_zip_code_prefix_1_digits', 2)
create_map(df, cm(Greys9), ds.count(), 'zip_code_2')

: 

: 

: 

: 

: 

: 

: 

: 

: 