# Intro

The data was fetched from http://overpass-turbo.eu/ (query: `amenity=bar`)

using the script https://github.com/Guts/Paris-Beer-Week/blob/master/data/raw_data/getOpenBeerMap.py

![](dataset-cover.jpg)

![](beer%20map.png)

# 1. Map Reduce & DAG

In [1]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import plotly.figure_factory as ff

# Switch plotly to offline notebook mode so the iplot() calls below render inline.
# NOTE(review): connected=True is expected to load plotly.js from the CDN instead
# of embedding it in the notebook — confirm for the installed plotly version.
init_notebook_mode(connected=True)

import unicodedata

def fixUnicode(str):
    """Reduce a unicode string from the JSON file to its ASCII form.

    The text is first decomposed with NFKD normalisation, then encoded to
    ASCII; any code point that has no ASCII encoding is dropped
    ('ignore' error handler). The encoded result is returned.

    The parameter is named ``str`` (shadowing the builtin) for backward
    compatibility with existing callers.
    """
    decomposed = unicodedata.normalize('NFKD', str)
    return decomposed.encode('ascii', 'ignore')

In [2]:
# Location of the beer map JSON file read by Spark below
path = "/user/mathias/beermap.json"
# Load the file into a DataFrame; no schema is passed, so the JSON reader derives it
beerMap = sqlContext.read.json(path)

# Print the resulting schema (nested `geometry` and `properties` structs)
beerMap.printSchema()

root
 |-- geometry: struct (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |    |-- type: string (nullable = true)
 |-- id: long (nullable = true)
 |-- properties: struct (nullable = true)
 |    |-- BEERS: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- BREWER: string (nullable = true)
 |    |-- NAME: string (nullable = true)
 |    |-- OSM_ID: long (nullable = true)
 |    |-- TYPE: string (nullable = true)
 |-- type: string (nullable = true)



In [3]:
# Flatten every bar's BEERS list into a single RDD of beer names and strip
# the non-ASCII characters from each name
allBeer = beerMap.rdd.flatMap(lambda row: row.properties['BEERS']).map(fixUnicode)

# MAP: emit one (beer, 1) pair per occurrence
allBeerMap = allBeer.map(lambda beer: (beer, 1))

# Preview the first ten pairs under a header row
table = ff.create_table(
    [('Key', 'Value')] + allBeerMap.take(10)
)
iplot(table, filename='map_table')

In [4]:
# REDUCE: add up the 1s emitted for each beer to get its occurrence count
allBeerReduce = allBeerMap.reduceByKey(lambda left, right: left + right)

# Preview the first ten (beer, count) pairs under a header row
table = ff.create_table(
    [('Key', 'Value')] + allBeerReduce.take(10)
)
iplot(table, filename='reduce_table')

# 2. Spark SQL

In [5]:
# Register the DataFrame as a temporary table so it can be queried with SQL
beerMap.registerTempTable("beerMap")

# SQL: one output row per (bar name, beer) pair, obtained by exploding each
# bar's beer list
allBeer = sqlContext.sql("SELECT properties.name, beer FROM beerMap LATERAL VIEW explode(properties.beers) beersTable AS beer")

# Fix unicode characters in both columns.
# Go through `.rdd` explicitly: DataFrame.map only exists in Spark 1.x (it was
# removed in 2.0), while `.rdd.map` behaves the same there and keeps working on
# later versions. This also matches the `beerMap.rdd.flatMap` usage earlier in
# the notebook.
allBeer = allBeer.rdd.map(lambda r: [fixUnicode(r.name), fixUnicode(r.beer)])

# Print table: header row followed by the first ten (bar, beer) rows
table = ff.create_table([('Bar', 'Beer')] + allBeer.take(10))
iplot(table, filename='bar_beer_table')

In [6]:
# Which beer appears most often? Explode each bar's beer list, count the rows
# per beer, and sort by that count in descending order.
sql = """SELECT count(*) as count, beer 
FROM beerMap 
LATERAL VIEW explode(properties.beers) beersTable AS beer 
GROUP BY beer
ORDER BY count(*) DESC"""

# Run the query against the registered "beerMap" table
countBeer = sqlContext.sql(sql)

# Collect the result as a pandas DataFrame (columns: count, beer) for plotting
df = countBeer.toPandas()

In [7]:
# Bar chart: one bar per beer, with the bar height taken from the count column
trace = go.Bar(
    x=df['beer'],
    y=df['count']
)
data = [trace]

iplot(data, filename='beer-bar')

In [8]:
# Where can Chimay be found? Select the name and coordinates of every bar whose
# exploded beer list contains an entry exactly equal to 'Chimay'.
chimay = sqlContext.sql("SELECT properties.name, geometry.coordinates FROM beerMap LATERAL VIEW explode(properties.beers) beersTable AS beer where beer = 'Chimay'")
# Collect the matching rows as a pandas DataFrame
pdf = chimay.toPandas()

# Print table: render the pandas DataFrame as a plotly table
table = ff.create_table(pdf)
iplot(table, filename='chimay_table')