In [12]:
import pprint as pprint
import xml.etree.cElementTree as ET
import re
from collections import defaultdict
import time
import codecs
import json
from pymongo import MongoClient

## Load File and Create Sample File

In [2]:
OSM_FILE = "chicago.osm"

In [4]:
#Create a sample file from the full chicago OSM data
SAMPLE_FILE = "chicago_sample.osm"

k = 150

def get_element(OSM_FILE, tags=('node', 'way', 'relation')):
    context = ET.iterparse(OSM_FILE, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()
            
            
with open (SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n ')

    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')

# Explore OSM Data

#### Key value check

In [3]:
# Before you process the data and add it into MongoDB, you should
# check the "k" value for each "<tag>" and see if they can be valid keys in MongoDB,
# as well as see if there are any other potential problems.

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

#determine if a tag matches one of the regular expression conditions above 
# and add a count to the dict for that condition

def key_type(element, keys):
    if element.tag == "tag":
       
        for tag in element.iter('tag'):
            tagk = tag.attrib['k']
            if lower.match(tagk):
                keys['lower'] += 1
            elif problemchars.match(tagk):
                keys['problemchars'] += 1
            elif lower_colon.match(tagk):
                keys['lower_colon'] += 1
            #if none of the searches as true add a count to 'other'
            else:
                keys['other'] += 1

    return keys


def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys

In [6]:
process_map(OSM_FILE)

{'lower': 1946125, 'lower_colon': 3088289, 'other': 1596539, 'problemchars': 0}

#### Tag Type Count | Map Parser

In [7]:
def get_element(filename):
   
    context = ET.iterparse(filename, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end':
            yield elem
            root.clear()


def count_tags (filename):
    tags_dict = defaultdict(int)
    for i, elem in enumerate(get_element(filename)):
        tags_dict[elem.tag] += 1
    return tags_dict

In [8]:
count_tags(OSM_FILE)

defaultdict(int,
            {'bounds': 1,
             'member': 85629,
             'nd': 9927164,
             'node': 8399408,
             'osm': 1,
             'relation': 4592,
             'tag': 6630953,
             'way': 1187563})

#### User IDs

In [19]:
def get_user(element):
    return

def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        if element.tag == 'node':
            for node in element.iter('node'):
                userid = node.attrib['uid']
                users.add(userid)

    return users

In [11]:
users = process_map(OSM_FILE)
pprint.pprint(users[0:6])

TypeError: 'set' object has no attribute '__getitem__'

#### Number of Unique Users

In [13]:
def get_user(element):
    return

def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        if element.tag == 'node':
            for node in element.iter('node'):
                userid = node.attrib['uid']
                users.add(userid)

    return users

In [14]:
len(process_map(OSM_FILE))

2017

# Data Audit

In [21]:
"""
Your task in this exercise has two steps:
- audit the OSMFILE and change the variable 'mapping' to reflect the changes needed to fix 
    the unexpected street types to the appropriate ones in the expected list.
    You have to add mappings only for the actual problems you find in this OSMFILE,
    not a generalized solution, since that may and will depend on the particular area you are auditing.
- write the update_name function, to actually fix the street name.
    The function takes a string with street name as an argument and should return the fixed name
    We have provided a simple test so that you see what exactly is expected
"""

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons", "Circle", "South Avenue", "North", "South","East","West","Lane","Route"]

mapping = {"St": "Street", "St.": "Street", "street": "Street","st": "Street",
           "Dr": "Drive", "Dr.": "Drive","Ct":"Court", 
          "Blvd":"Boulevard","blvd":"Boulevard","Blvd.":"Boulevard", 
           "Ct.":"Court","HWY":"Highway",
           "Ln":"Lane","Ter":"Terrace","Ave.":"Avenue","Ave":"Avenue",
           "road":"Road","rd":"Road","Rd.":"Road","Rd": "Road","place":"Place","US":"U.S.",
           "Pl":"Place","Rte":"Route","IL":"Illinois","W":"West","Trl":"Trail",
           "avenue":"Avenue","Pkwy":"Parkway","Cir":"Circle",
            "N":"North","N.":"North","E":"East","S":"South","S":"South","S.":"South",
           "W.":"West","E.":"East","W":"West"
          }


def audit_street_type(street_types, street_name):
    """this function audit street name type"""
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


def is_street_name(elem):
    """this function if an attribute includes a street"""
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    """this function return all street name types"""
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])

    return street_types


def update_name(name, mapping):
    """this function update street name and make street name to be consistent"""
    name = name.replace(",", "")
    for word in name.split(" "):
        if word in mapping.keys():
            name = name.replace(word, mapping[word])

    return name


def output(osmfile):
    """This function print updated street name"""
    st_types = audit(osmfile)
    #assert len(st_types) == 3
    pprint.pprint(dict(st_types))

    for st_type, ways in st_types.iteritems():
        for name in ways:
            better_name = update_name(name, mapping)
            print name, "=>", better_name
    print "Pass"


In [22]:
output(OSM_FILE)

{'104': set(['North Sherman Avenue #104']),
 '127': set(['Skokie Boulevard #127']),
 '14': set(['Route 14', 'U.S. 14']),
 '1425': set(['N Lake Shore Dr #1425']),
 '176': set(['IL Route 176', 'Route 176']),
 '1E': set(['West Waveland Avenue Apt 1E']),
 '20': set(['US 20', 'West US Highway 20']),
 '200': set(['Lake Street #200']),
 '201': set(['Route 30, Suite 201']),
 '2105': set(['North Michigan Avenue # 2105']),
 '22': set(['E Route 22', 'Il 22']),
 '231': set(['East US 231']),
 '25': set(['Larkin Ave #25']),
 '30': set(['US 30']),
 '31': set(['Route 31']),
 '321': set(['East Woodfield Road  #321']),
 '34': set(['US 34', 'West Route 34']),
 '38': set(['Route 38']),
 '400': set(['South Michigan Avenue # 400']),
 '47': set(['HWY 47', 'South Route 47']),
 '500': set(['Thayer Court #500']),
 '510': set(['W Madison St #510']),
 '53': set(['Route 53']),
 '59': set(['HWY 59',
            'IL 59',
            'Illinois 59',
            'North Route 59',
            'Route 59',
            'Ro

# Data Setup

In [23]:
"""
Your task is to wrangle the data and transform the shape of the data
into the model we mentioned earlier. The output should be a list of dictionaries
that look like this:
{
"id": "2406124091",
"type: "node",
"visible":"true",
"created": {
          "version":"2",
          "changeset":"17206049",
          "timestamp":"2013-08-03T16:43:42Z",
          "user":"linuxUser16",
          "uid":"1219059"
        },
"pos": [41.9757030, -87.6921867],
"address": {
          "housenumber": "5157",
          "postcode": "60625",
          "street": "North Lincoln Ave"
        },
"amenity": "restaurant",
"cuisine": "mexican",
"name": "La Cabana De Don Luis",
"phone": "1 (773)-271-5176"
}
You have to complete the function 'shape_element'.
We have provided a function that will parse the map file, and call the function with the element
as an argument. You should return a dictionary, containing the shaped data for that element.
We have also provided a way to save the data in a file, so that you could use
mongoimport later on to import the shaped data into MongoDB. You could also do some cleaning
before doing that, like in the previous exercise, but for this exercise you just have to
shape the structure.
In particular the following things should be done:
- you should process only 2 types of top level tags: "node" and "way"
- all attributes of "node" and "way" should be turned into regular key/value pairs, except:
    - attributes in the CREATED array should be added under a key "created"
    - attributes for latitude and longitude should be added to a "pos" array,
      for use in geospacial indexing. Make sure the values inside "pos" array are floats
      and not strings.
- if second level tag "k" value contains problematic characters, it should be ignored
- if second level tag "k" value starts with "addr:", it should be added to a dictionary "address"
- if second level tag "k" value does not start with "addr:", but contains ":", you can process it
  same as any other tag.
- if there is a second ":" that separates the type/direction of a street,
  the tag should be ignored, for example:
<tag k="addr:housenumber" v="5158"/>
<tag k="addr:street" v="North Lincoln Avenue"/>
<tag k="addr:street:name" v="Lincoln"/>
<tag k="addr:street:prefix" v="North"/>
<tag k="addr:street:type" v="Avenue"/>
<tag k="amenity" v="pharmacy"/>
  should be turned into:
{...
"address": {
    "housenumber": 5158,
    "street": "North Lincoln Avenue"
}
"amenity": "pharmacy",
...
}
- for "way" specifically:
  <nd ref="305896090"/>
  <nd ref="1719825889"/>
should be turned into
"node_ref": ["305896090", "1719825889"]
"""


lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
addresschars = re.compile(r'addr:(\w+)')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

def shape_element(element):
    #node = defaultdict(set)
    node = {}
    if element.tag == "node" or element.tag == "way" :
        #create the dictionary based on exaclty the value in element attribute.
        node = {'created':{}, 'type':element.tag}
        for k in element.attrib:
            try:
                v = element.attrib[k]
            except KeyError:
                continue
            if k == 'lat' or k == 'lon':
                continue
            if k in CREATED:
                node['created'][k] = v
            else:
                node[k] = v
        try:
            node['pos']=[float(element.attrib['lat']),float(element.attrib['lon'])]
        except KeyError:
            pass
        
        if 'address' not in node.keys():
            node['address'] = {}
        #Iterate the content of the tag
        for stag in element.iter('tag'):
            #Init the dictionry

            k = stag.attrib['k']
            v = stag.attrib['v']
            #Checking if indeed prefix with 'addr' and no ':' afterwards
            if k.startswith('addr:'):
                if len(k.split(':')) == 2:
                    content = addresschars.search(k)
                    if content:
                        node['address'][content.group(1)] = v
            else:
                node[k]=v
        if not node['address']:
            node.pop('address',None)
        #Special case when the tag == way,  scrap all the nd key
        if element.tag == "way":
            node['node_refs'] = []
            for nd in element.iter('nd'):
                node['node_refs'].append(nd.attrib['ref'])
#         if  'address' in node.keys():
#             pprint.pprint(node['address'])
        return node
    else:
        return None
    
def process_map(file_in, pretty = False):
    """
    Process the osm file to json file to be prepared for input file to monggo
    """
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data


In [24]:
OSM_data = process_map(OSM_FILE)

In [32]:
pprint.pprint(OSM_data[0:6])

[{'_id': ObjectId('58079e975536b53cabe049db'),
  'created': {'changeset': '7781188',
              'timestamp': '2011-04-06T05:17:15Z',
              'uid': '207745',
              'user': 'NE2',
              'version': '54'},
  'exit_to': 'Joliet Road',
  'highway': 'motorway_junction',
  'id': '219850',
  'pos': [41.7585879, -87.9101245],
  'ref': '276C',
  'type': 'node'},
 {'_id': ObjectId('58079e975536b53cabe049dc'),
  'created': {'changeset': '7781188',
              'timestamp': '2011-04-06T05:18:47Z',
              'uid': '207745',
              'user': 'NE2',
              'version': '47'},
  'exit_to': 'North I-294 ; Tri-State Tollway;  Wisconsin',
  'highway': 'motorway_junction',
  'id': '219851',
  'pos': [41.7593116, -87.9076432],
  'ref': '277A',
  'type': 'node'},
 {'_id': ObjectId('58079e975536b53cabe049dd'),
  'created': {'changeset': '485405',
              'timestamp': '2009-04-13T11:21:51Z',
              'uid': '18480',
              'user': 'nickvet419',
       

In [26]:
client = MongoClient('localhost', 27017)

In [27]:
db = client.Final_osm

In [28]:
db.osm_final.insert(OSM_data)

  if __name__ == '__main__':


[ObjectId('58079e975536b53cabe049db'),
 ObjectId('58079e975536b53cabe049dc'),
 ObjectId('58079e975536b53cabe049dd'),
 ObjectId('58079e975536b53cabe049de'),
 ObjectId('58079e975536b53cabe049df'),
 ObjectId('58079e975536b53cabe049e0'),
 ObjectId('58079e975536b53cabe049e1'),
 ObjectId('58079e975536b53cabe049e2'),
 ObjectId('58079e975536b53cabe049e3'),
 ObjectId('58079e975536b53cabe049e4'),
 ObjectId('58079e975536b53cabe049e5'),
 ObjectId('58079e975536b53cabe049e6'),
 ObjectId('58079e975536b53cabe049e7'),
 ObjectId('58079e975536b53cabe049e8'),
 ObjectId('58079e975536b53cabe049e9'),
 ObjectId('58079e975536b53cabe049ea'),
 ObjectId('58079e975536b53cabe049eb'),
 ObjectId('58079e975536b53cabe049ec'),
 ObjectId('58079e975536b53cabe049ed'),
 ObjectId('58079e975536b53cabe049ee'),
 ObjectId('58079e975536b53cabe049ef'),
 ObjectId('58079e975536b53cabe049f0'),
 ObjectId('58079e975536b53cabe049f1'),
 ObjectId('58079e975536b53cabe049f2'),
 ObjectId('58079e975536b53cabe049f3'),
 ObjectId('58079e975536b5

In [80]:
#Collection Stats
print db.command("dbstats")

{u'storageSize': 727793664.0, u'ok': 1.0, u'avgObjSize': 248.1989511598606, u'db': u'Final_osm', u'indexes': 1, u'objects': 9586971, u'collections': 1, u'numExtents': 0, u'dataSize': 2379476147.0, u'indexSize': 87064576.0}


In [33]:
#Testing find method
def find():
    nodes = db.osm_final.find({'id':'219850'})
    for a in nodes:
        pprint.pprint(a)
        
find()

{u'_id': ObjectId('58079e975536b53cabe049db'),
 u'created': {u'changeset': u'7781188',
              u'timestamp': u'2011-04-06T05:17:15Z',
              u'uid': u'207745',
              u'user': u'NE2',
              u'version': u'54'},
 u'exit_to': u'Joliet Road',
 u'highway': u'motorway_junction',
 u'id': u'219850',
 u'pos': [41.7585879, -87.9101245],
 u'ref': u'276C',
 u'type': u'node'}


In [34]:
#Show 5 data that have street
def street_5():
    result  = db.osm_final.aggregate([
            {'$match': {'address.street':{'$exists':1}}},
            {'$limit' : 5}
        ])
    for a in result:
        pprint.pprint(a)

street_5()

{u'_id': ObjectId('58079eb05536b53cabe57cfa'),
 u'address': {u'city': u'Evanston',
              u'housenumber': u'1022',
              u'postcode': u'60201',
              u'street': u'Central Street'},
 u'created': {u'changeset': u'34237971',
              u'timestamp': u'2015-09-25T04:19:04Z',
              u'uid': u'185986',
              u'user': u'jonwchgo',
              u'version': u'9'},
 u'id': u'249656713',
 u'name': u'Central',
 u'network': u'CTA',
 u'operator': u'Chicago Transit Authority',
 u'pos': [42.064201, -87.685828],
 u'railway': u'station',
 u'type': u'node',
 u'website': u'http://www.transitchicago.com/travel_information/station.aspx?StopId=34',
 u'wheelchair': u'no',
 u'wikipedia': u'en:Central (CTA Purple Line station)'}
{u'_id': ObjectId('58079eb05536b53cabe58bea'),
 u'address': {u'city': u'Chicago',
              u'housenumber': u'1358',
              u'postcode': u'60626',
              u'state': u'IL',
              u'street': u'West Morse Avenue'},
 u'creat

In [84]:
#Examine make up of postal codes

def postal_codes():
    result  = db.osm_final.aggregate([{"$match":{"address.postcode":{"$exists":1}}}, 
                   {"$group":{"_id":"$address.postcode", "count":{"$sum":1}}}, 
                   {"$sort":{"count":1}}])
    for a in result:
        pprint.pprint(a)

postal_codes()

{u'_id': u'46373', u'count': 1}
{u'_id': u'60487', u'count': 1}
{u'_id': u'60425', u'count': 1}
{u'_id': u'60409-9998', u'count': 1}
{u'_id': u'60191-1983', u'count': 1}
{u'_id': u'60827-6427', u'count': 1}
{u'_id': u'6051', u'count': 1}
{u'_id': u'Il', u'count': 1}
{u'_id': u'606', u'count': 1}
{u'_id': u'60203', u'count': 1}
{u'_id': u'46404', u'count': 1}
{u'_id': u'60712-2716', u'count': 1}
{u'_id': u'60654-5799', u'count': 1}
{u'_id': u'60627', u'count': 1}
{u'_id': u'60016-5670', u'count': 1}
{u'_id': u'60016-5608', u'count': 1}
{u'_id': u'46406', u'count': 1}
{u'_id': u'601412', u'count': 1}
{u'_id': u'60521-2101', u'count': 1}
{u'_id': u'IL 60605-1226', u'count': 1}
{u'_id': u'60422', u'count': 1}
{u'_id': u'60469', u'count': 1}
{u'_id': u'60483', u'count': 1}
{u'_id': u'60401', u'count': 1}
{u'_id': u'0201', u'count': 1}
{u'_id': u'60696', u'count': 1}
{u'_id': u'60076-2000', u'count': 1}
{u'_id': u'60546-1262', u'count': 1}
{u'_id': u'60077-3495', u'count': 1}
{u'_id': u'IL 6

In [35]:
#Show the top 5 of contributed users
def users_5():
    result  = db.osm_final.aggregate([
            {'$match': {'created.user':{'$exists':1}}},
            {'$group': {'_id':'$created.user',
                        'count':{'$sum':1}}},
            {'$sort': {'count':-1}},
            {'$limit' : 5}
        ])
    for a in result:
        pprint.pprint(a)

users_5()

{u'_id': u'chicago-buildings', u'count': 5635534}
{u'_id': u'Umbugbene', u'count': 1101042}
{u'_id': u'alexrudd (NHD)', u'count': 232740}
{u'_id': u'woodpeck_fixbot', u'count': 225702}
{u'_id': u'patester24', u'count': 109091}


In [36]:
#Show the restaurant's name, the food they serve, and contact number

def restaurant_5():
    result  = db.osm_final.aggregate([
           {'$match': {'amenity':'restaurant',
                        'name':{'$exists':1}}},
            {'$project':{'_id':'$name',
                         'cuisine':'$cuisine',
                         'contact':'$phone'}}
        ])
    for a in result:
        pprint.pprint(a)

restaurant_5()


{u'_id': u'Coffee Beanery'}
{u'_id': u'Panera Bread', u'cuisine': u'sandwich'}
{u'_id': u'Char House Grill', u'cuisine': u'burger'}
{u'_id': u"Clarke's", u'cuisine': u'burger'}
{u'_id': u"Chili's", u'cuisine': u'southwest'}
{u'_id': u'Bubba Gump Shrimp Co.'}
{u'_id': u"Capi's Italian Kitchen"}
{u'_id': u"Ada's Famous Deli & Restaurant"}
{u'_id': u'Outback', u'cuisine': u'steak_house'}
{u'_id': u"Winberie's"}
{u'_id': u'Yen Yen', u'cuisine': u'Cantonese;American'}
{u'_id': u'Pompei Pizza & Pasta',
 u'contact': u'847-259-056',
 u'cuisine': u'italian'}
{u'_id': u"Domino's Pizza"}
{u'_id': u'Sushi Kamon', u'cuisine': u'Sushi;Japanese'}
{u'_id': u'Hotshots'}
{u'_id': u'Baskin 31 Robbins'}
{u'_id': u'Buffalo'}
{u'_id': u"Branko's Submarines", u'cuisine': u'burger'}
{u'_id': u"Portillo's", u'contact': u'847-933-0700', u'cuisine': u'sandwich'}
{u'_id': u'Village Inn Pizza', u'cuisine': u'pizza'}
{u'_id': u"Pizano's Pizza & Pasta", u'cuisine': u'pizza;pasta'}
{u'_id': u"Portillo's Hot Dogs"}
{u

In [40]:
# Sort cities by count, descending

def cities():
    result  = db.osm_final.aggregate([{"$match":{"address.city":{"$exists":1}}}, 
                   {"$group":{"_id":"$address.city", "count":{"$sum":1}}}, 
                   {"$sort":{"count":1}}])

    for a in result:
            pprint.pprint(a)

cities()

{u'_id': u'Griffith', u'count': 1}
{u'_id': u'http://www.woodlawngrant-apts.com/', u'count': 1}
{u'_id': u'Lynwood', u'count': 1}
{u'_id': u'Markham', u'count': 1}
{u'_id': u'Dyer', u'count': 1}
{u'_id': u'Porter', u'count': 1}
{u'_id': u'Addon', u'count': 1}
{u'_id': u'Prospect Heights', u'count': 1}
{u'_id': u'Saint John', u'count': 1}
{u'_id': u'elmhurst', u'count': 1}
{u'_id': u'Stone Park', u'count': 1}
{u'_id': u'Dixmoor', u'count': 1}
{u'_id': u'Itasca', u'count': 1}
{u'_id': u'Hebron', u'count': 1}
{u'_id': u'Orland Hills', u'count': 1}
{u'_id': u'Glenwood', u'count': 1}
{u'_id': u'Chciago', u'count': 1}
{u'_id': u'60706', u'count': 1}
{u'_id': u'CHicago', u'count': 1}
{u'_id': u'mount prospect', u'count': 1}
{u'_id': u'Green Oaks', u'count': 1}
{u'_id': u'Flossmoor', u'count': 1}
{u'_id': u'Posen', u'count': 1}
{u'_id': u'Park Forest', u'count': 1}
{u'_id': u'Montgomery', u'count': 1}
{u'_id': u'Hawthorn Woods', u'count': 1}
{u'_id': u'Olympia Fields', u'count': 1}
{u'_id': u'

In [54]:
def find_2():
    result = db.osm_final.find({'city':'Evanston'})
    for a in result:
        pprint.pprint(a)
        
find_2()

In [55]:
# Number of documents
db.osm_final.find().count()

9586971

In [56]:
# Number of nodes
db.osm_final.find({"type":"node"}).count()

8399395

In [57]:
# Number of ways
db.osm_final.find({"type":"way"}).count()

1187370

In [92]:
# Number of unique users

def unique_Users():
    result = db.osm_final.aggregate([{'$group':{'_id':'$created.user','count':{'$sum':1}}}, 
                        {'$group':{'_id':'$created.user', 'count':{"$sum":1}}}
                         ])
    for a in result:
        pprint.pprint(a)

unique_Users()

{u'_id': None, u'count': 2274}


In [82]:
# Top 5 contributing user

def con_user():
    result = db.osm_final.aggregate([{"$group":{"_id":"$created.user", "count":{"$sum":1}}}, 
                                     {"$sort":{"count":-1}}, 
                                     {"$limit":5}]
                                   )     
    for a in result:
        pprint.pprint(a)
        
        
con_user()

{u'_id': u'chicago-buildings', u'count': 5635534}
{u'_id': u'Umbugbene', u'count': 1101042}
{u'_id': u'alexrudd (NHD)', u'count': 232740}
{u'_id': u'woodpeck_fixbot', u'count': 225702}
{u'_id': u'patester24', u'count': 109091}


In [85]:
# Number of users appearing only once (having 1 post)
def num_user():
    result = db.osm_final.aggregate([{"$group":{"_id":"$created.user", "count":{"$sum":1}}}, 
                        {"$group":{"_id":"$count", "num_users":{"$sum":1}}}, 
                        {"$sort":{"_id":1}}, {"$limit":1}])
    for a in result:
        pprint.pprint(a)

num_user()

{u'_id': 1, u'num_users': 503}


In [88]:
# Total Number of Amenities
# Covering an assortment of community facilities including toilets, 
# telephones, banks, pharmacies and schools.
def amenities():
    result = db.osm_final.aggregate([{"$match":{"amenity":{"$exists":True}}}, 
                        {"$group":{"_id":"$amenity", "count":{"$sum":1}}}, 
                        {'$group':{'_id':'$amenity', 'count':{'$sum':'$count'}}}
                         ])
    for a in result:
        pprint.pprint(a)

amenities()

{u'_id': None, u'count': 31821}


In [91]:
# Total Number of Unique Amenities
# Covering an assortment of community facilities including toilets, 
# telephones, banks, pharmacies and schools.
def amenities_unique():
    result = db.osm_final.aggregate([{'$group':{'_id':'$amenity','count':{'$sum':1}}}, 
                        {'$group':{'_id':'$amenity', 'count':{"$sum":1}}}
                         ])
    for a in result:
        pprint.pprint(a)

amenities_unique()

{u'_id': None, u'count': 145}


In [94]:
# Top 10 Amenities
def amenities_top():
    result = db.osm_final.aggregate([{'$match':{'amenity':{'$exists':True}}}, 
                        {'$group':{'_id':'$amenity', 'count':{"$sum":1}}},
                                     {'$sort':{'count':-1}},
                                     {'$limit':10}
                         ])
    for a in result:
        pprint.pprint(a)

amenities_top()

{u'_id': u'parking', u'count': 12577}
{u'_id': u'place_of_worship', u'count': 4305}
{u'_id': u'school', u'count': 3423}
{u'_id': u'restaurant', u'count': 1967}
{u'_id': u'fast_food', u'count': 1369}
{u'_id': u'fuel', u'count': 805}
{u'_id': u'bank', u'count': 595}
{u'_id': u'cafe', u'count': 455}
{u'_id': u'grave_yard', u'count': 450}
{u'_id': u'pharmacy', u'count': 370}


In [97]:
# One Cuisine
def Cuisine_top():
    result = db.osm_final.aggregate([{'$match':{'cuisine':{'$exists':True}}}, 
                        {'$group':{'_id':'$cuisine', 'count':{"$sum":1},'name':{'$push':'$name'}}},
                                     {'$sort':{'count':-1}},
                                     {'$limit':1}
                         ])
    for a in result:
        pprint.pprint(a)

Cuisine_top()

{u'_id': u'burger',
 u'count': 494,
 u'name': [u"McDonald's",
           u'Char House Grill',
           u"Clarke's",
           u"McDonald's",
           u"McDonald's",
           u"Branko's Submarines",
           u"Wendy's",
           u"McDonald's",
           u"McDonald's",
           u"Wendy's",
           u"McDonald's",
           u'Burger King',
           u"McDonald's",
           u"McDonald's",
           u"McDonald's",
           u"McDonald's",
           u"McDonald's",
           u"McDonald's",
           u"McDonald's",
           u"McDonald's",
           u"McDonald's",
           u'White Castle',
           u"McDonald's",
           u'Dear Franks',
           u'Burger King',
           u"Culver's",
           u"Culver's",
           u"Culver's",
           u"Culver's",
           u"Culver's",
           u"Culver's",
           u"Culver's",
           u"Culver's",
           u"Culver's",
           u"Culver's",
           u"Culver's",
           u"Culver's",
           u"B