# Exploratory Data Analysis

In [35]:
from pymongo import MongoClient
import matplotlib.pyplot as plt
from bson.son import SON

In [75]:
# Open mongodb
client = MongoClient('localhost:27017')
db = client.elements

### How many elements does the dataset contain? 

In [76]:
print("Nodes: {}".format(db.nodes.count()))
print("Ways: {}".format(db.ways.count()))
print("Relations: {}".format(db.relations.count()))

Nodes: 333959
Ways: 64160
Relations: 2138


### Who are the users in the dataset? 

In [77]:
users_nodes = db.nodes.distinct('user')
users_ways = db.ways.distinct('user')
users_relations = db.relations.distinct('user')

users = users_nodes + users_ways + users_relations

print("{} users contributed to the map".format(len(set(users))))

814 users contributed to the map


#### Who are users that contribute the most? 

In [78]:
pipeline = [
    {"$unwind": "$user"},
    {"$group": {"_id": "$user", "count": {"$sum": 1}}},
    {"$sort": SON([("count", -1), ("_id", -1)])}
]

print(list(db.nodes.aggregate(pipeline)))

[{u'count': 188421, u'_id': u'q_un_go'}, {u'count': 76483, u'_id': u'skyper'}, {u'count': 5392, u'_id': u'malta-dinger'}, {u'count': 5222, u'_id': u'release_candidate'}, {u'count': 5006, u'_id': u'mgoe'}, {u'count': 4330, u'_id': u'rurseekatze'}, {u'count': 3129, u'_id': u'daloop'}, {u'count': 3035, u'_id': u'ferrero2'}, {u'count': 2403, u'_id': u'dddos'}, {u'count': 1906, u'_id': u'Tronikon'}, {u'count': 1868, u'_id': u'd003232'}, {u'count': 1848, u'_id': u'jokien'}, {u'count': 1828, u'_id': u'ralfhergert'}, {u'count': 1779, u'_id': u'rene78'}, {u'count': 1673, u'_id': u'klausis'}, {u'count': 1632, u'_id': u'Pico4life'}, {u'count': 1312, u'_id': u'woodpeck_repair'}, {u'count': 1291, u'_id': u'blan'}, {u'count': 1252, u'_id': u'EdAllen'}, {u'count': 1002, u'_id': u'FahRadler'}, {u'count': 934, u'_id': u'Grauer Elefant'}, {u'count': 915, u'_id': u'Geoflo'}, {u'count': 893, u'_id': u'GP4Flo'}, {u'count': 828, u'_id': u'kebekus'}, {u'count': 808, u'_id': u'jacobbraeutigam'}, {u'count': 66

### How many keys there are for every collection?

In [87]:
pipeline = [
    {"$unwind": "$tags"},
    {"$unwind": "$tags.key"},
    {"$group": {"_id": "$tags.key", "count": {"$sum": 1}}},
    {"$sort": SON([("count", -1), ("_id", -1)])}
]

most_popular_keys = list(db.nodes.aggregate(pipeline))[1:100]

most_popular_keys

[{u'_id': u'addr:street', u'count': 2796},
 {u'_id': u'source', u'count': 1658},
 {u'_id': u'wheelchair', u'count': 1630},
 {u'_id': u'level', u'count': 1278},
 {u'_id': u'addr:housenumber', u'count': 1212},
 {u'_id': u'entrance', u'count': 1170},
 {u'_id': u'barrier', u'count': 1079},
 {u'_id': u'sloped_curb', u'count': 809},
 {u'_id': u'railway', u'count': 688},
 {u'_id': u'amenity', u'count': 659},
 {u'_id': u'highway', u'count': 593},
 {u'_id': u'website', u'count': 584},
 {u'_id': u'power', u'count': 433},
 {u'_id': u'tourism', u'count': 400},
 {u'_id': u'source:addr', u'count': 395},
 {u'_id': u'noexit', u'count': 305},
 {u'_id': u'FIXME', u'count': 297},
 {u'_id': u'name', u'count': 278},
 {u'_id': u'material', u'count': 271},
 {u'_id': u'shop', u'count': 247},
 {u'_id': u'railway:switch', u'count': 229},
 {u'_id': u'tactile_paving', u'count': 228},
 {u'_id': u'source:addr:housenumber', u'count': 220},
 {u'_id': u'vending', u'count': 219},
 {u'_id': u'operator', u'count': 198},


In [90]:
pipeline = [
    {"$unwind": "$tags"},
    {"$unwind": "$tags.value"},
    {"$group": {"_id": "$tags.value", "count": {"$sum": 1}}},
    {"$sort": SON([("count", -1), ("_id", -1)])}
]

most_popular_values = list(db.nodes.aggregate(pipeline))[1:100]

most_popular_values

[{u'_id': u'yes', u'count': 2278},
 {u'_id': u'0', u'count': 1231},
 {u'_id': u'no', u'count': 773},
 {u'_id': u'both', u'count': 768},
 {u'_id': u'gate', u'count': 734},
 {u'_id': u'maps4bw (LGL, www_lgl-bw_de)', u'count': 698},
 {u'_id': u'main', u'count': 653},
 {u'_id': u'survey', u'count': 494},
 {u'_id': u'level_crossing', u'count': 434},
 {u'_id': u'voice', u'count': 368},
 {u'_id': u'tower', u'count': 357},
 {u'_id': u'limited', u'count': 338},
 {u'_id': u'information', u'count': 328},
 {u'_id': u'crossing', u'count': 296},
 {u'_id': u'bench', u'count': 278},
 {u'_id': u'survey 2015-12-06', u'count': 222},
 {u'_id': u'Bing', u'count': 220},
 {u'_id': u'traffic_signals', u'count': 214},
 {u'_id': u'underground', u'count': 192},
 {u'_id': u'turning_circle', u'count': 176},
 {u'_id': u'entrance', u'count': 162},
 {u'_id': u'wood', u'count': 158},
 {u'_id': u'default', u'count': 153},
 {u'_id': u'container', u'count': 137},
 {u'_id': u'waste_basket', u'count': 136},
 {u'_id': u'bol

In [104]:
amenities = db.nodes.find({'tags.key': 'amenity'})

print("There are {} amenities".format(amenities.count()))

for amenity in amenities:
    print(amenity['tags']['value'])

There are 659 amenities
telephone
post_box
parking
telephone
post_box
telephone
telephone
post_box
bicycle_parking
telephone
bicycle_parking
post_box
telephone
telephone
telephone
post_box
post_office
recycling
recycling
post_box
telephone
telephone
telephone
telephone
telephone
telephone
post_box
post_box
bicycle_parking
bicycle_parking
bicycle_parking
bicycle_parking
bicycle_parking
bicycle_parking
bicycle_parking
bicycle_parking
bicycle_parking
bicycle_parking
restaurant
taxi
bicycle_parking
bicycle_parking
taxi
bicycle_parking
telephone
parking
parking
bench
bench
bench
bench
bench
waste_basket
bench
dentist
taxi
telephone
post_box
bench
bench
bench
bench
bench
bench
bench
bench
bench
bench
vending_machine
bench
bench
bench
fountain
bench
car_wash
bench
bench
bench
bench
bench
shelter
bench
bench
bench
bench
fountain
bench
bench
bench
bench
bench
bench
bench
bench
atm
bench
bench
drinking_water
doctors
fire_station
bench
bench
bench
bench
bench
taxi
bench
bench
bench
bench
bench
be

In [112]:
# What's are most popular amenities

pipeline = [
    {"$unwind": "$tags"},
    {"$unwind": "$tags.key"},
    {"$group": {"_id": {
        "key": "$amenity"}, "count": {"$sum": 1}}},
    {"$sort": SON([("count", -1), ("_id", -1)])}
]

most_popular_values = list(db.nodes.aggregate(pipeline))[1:100]

most_popular_values

[]