# Read Data

In [23]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from collections import defaultdict
import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow
import re
import pprint

OSM_FILE = "shanghai_china.osm"  
SAMPLE_FILE = "sample.osm"
#iterate the tag of the file and count the total number of these tags
k = 10 
unique_tags = ['node', 'nd', 'bounds', 'member', 'tag', 'relation', 'way', 'osm']
tags = {}
for event,elem in ET.iterparse(OSM_FILE):
    #if elem.tag=='node':
    if elem.tag not in tags:
        tags[elem.tag] = 1
    else:
        tags[elem.tag] = tags[elem.tag] + 1

In [5]:
tags

{'bounds': 1,
 'member': 47782,
 'nd': 3516830,
 'node': 2978675,
 'osm': 1,
 'relation': 2298,
 'tag': 1097138,
 'way': 361842}

In [24]:
#a function to get specific element whose tag is node, way or relation
def get_element(osm_file, tags=( 'node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()

In [8]:
#extract some data from the original dataset and write these data to sample.osm
with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')

In [18]:
#check the child tag of each element in the sample file
children = {}
for event,elem in ET.iterparse(SAMPLE_FILE):
    for child in elem:
        if child.tag not in children:
            children[child.tag] = 1
        else:
            children[child.tag] = children[child.tag] + 1

In [19]:
children

{'member': 5088,
 'nd': 355784,
 'node': 297868,
 'relation': 230,
 'tag': 109538,
 'way': 36184}

# Audit Street Name

In [1]:
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json

#regular expression used to check the dataset
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
housenumber_type_re = re.compile(r'^\d+')
postcode_type_re = re.compile(r'^\d+')

expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]

# UPDATE THIS VARIABLE
mapping = { "St":"Street",
            "St.":"Street",
            "street":"Street",
            "Ave":"Avenue",
            "AVE":"Avenue",
            "Ave.":"Avenue",
            "ave":"Avenue",
            "garden":"Garder",
            "Rd. ":"Road",
            "Rd.":"Road",
            "Rd,":"Road",
            "Rd":"Road",
            "lu":"Road",
            "lu ":"Road",
            "Lu":"Road",
            "rd":"Road",
            "road":"Road"}



In [27]:

def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)

#find out the element whose key is "addr:street"
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types

#update the street name according to mapping
def update_name(name, mapping):
    m = street_type_re.search(name)
    other_street_types = [ ]
    if m:
        street_type= m.group( )
    if street_type in mapping.keys( ):
        name = re.sub(street_type,mapping[street_type],name)
    else:
        other_street_types.append(street_type)
    return name

In [18]:
#audit housenumber
def audit_housenumber(housenumber_types, housenumber): 
    m = housenumber_type_re.search(housenumber)
    if m:
        housenumber_type = m.group()
        if housenumber_type not in expected:
            housenumber_types[housenumber_type].add(housenumber)

def is_housenumber(elem):
    return (elem.attrib['k'] == "addr:housenumber")

def audit_h(osmfile):
    osm_file = open(osmfile, "r")
    housenumber_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way": 
            for tag in elem.iter("tag"):
                if is_housenumber(tag):
                    audit_housenumber(housenumber_types, tag.attrib['v'])
    return housenumber_types

In [61]:
import sys
reload(sys)
sys.setdefaultencoding('utf-8') 

hanzi=re.compile(u'[\u4e00-\u9fa5]');

#update house number
def update_number(housenumber): 
    if '-' in housenumber:
        convert = housenumber.split('-')[0]
        return convert
    elif ';' in housenumber:
        convert = housenumber.split(';')[0]
        return convert
    elif ' ' in housenumber:
        convert = housenumber.split()[0]
        return convert
    elif re.findall(ur"[\u4e00-\u9fa5]",housenumber.decode('utf-8')):
        convert = hanzi.sub("", housenumber.decode('utf-8'))
        return convert
    else:
        return housenumber


In [17]:
#audit postcode
def audit_postcode(postcode_types, postcode): 
    m = postcode_type_re.search(postcode)
    if m:
        postcode_type = m.group()
        if postcode_type not in expected:
            postcode_types[postcode_type].add(postcode)

def is_postcode(elem):
    return (elem.attrib['k'] == "addr:postcode")

def audit_p(osmfile):
    osm_file = open(osmfile, "r")
    postcode_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way": 
            for tag in elem.iter("tag"):
                if is_postcode(tag):
                    audit_postcode(postcode_types, tag.attrib['v'])
    return postcode_types

In [63]:
#update postcode
def update_postcode(postcode):
    if len(postcode) > 6:
        return postcode[:6]
    elif re.findall(ur"[\u4e00-\u9fa5]",postcode.decode('utf-8')):
        convert = hanzi.sub("", housenumber.decode('utf-8'))
        return convert
    else:
        return postcode

In [28]:
from collections import defaultdict
OSM_FILE = "shanghai_china.osm"
st_types = audit(OSM_FILE)

#update the street name
for st_type, ways in st_types.iteritems():
    for name in ways:
        better_name = update_name(name, mapping)
        print name, "=>", better_name


In [62]:
#update the house number
hn_types = audit_h(OSM_FILE)
for hn_type, numbers in hn_types.iteritems():
    for number in numbers:
        better_number = update_number(number)
        print number, "=>", better_number

In [64]:
#update the postcode
pc_types = audit_p(OSM_FILE)
for pc_type, ways in pc_types.iteritems():
        for code in ways:
            better_code = update_postcode(code)
            print code, "=>", better_code

In [89]:
#define the structure of the data I want to extract
def shape_element(element):
    node = {}
    created={"version":None,
            "changeset":None,
            "timestamp":None,
            "user":None,
            "uid":None}
    pos = [None,None]
    address={}
    name ={}
    building={}
    roof={}
    node_refs = []
    if element.tag == "node" or element.tag == "way" :
        node['type']=element.tag
        for key in element.attrib:
            m = re.search(problemchars, element.attrib[key])
            if m:
                continue
            elif key in CREATED:
                created[key] = element.attrib[key]
            elif key == 'lat':
                pos[0] = float(element.attrib[key])
            elif key == 'lon':
                pos[1] = float(element.attrib[key])
            else:
                node[key] = element.attrib[key]    
        if created:
            node['created'] = created
        if pos:
            node['pos'] = pos
        #iterate the tag and extract key-value pair    
        for tag in element.iter('tag'):
            match = re.search(problemchars, tag.attrib['k'])
            if match:
                continue
            else:
                #if 'k' is like attr1:attr2:attr3, only extract the first two attributes.
                if "addr:" in tag.attrib['k']:
                    key_split = []
                    key_split = tag.attrib['k'].split(":")
                    if len(key_split) <= 2:
                        key = key_split[1]
                        address[key] = tag.attrib['v']
                elif "name:" in tag.attrib['k']:
                    key_split = []
                    key_split = tag.attrib['k'].split(":")
                    if len(key_split) <= 2:
                        key = key_split[1]
                        name[key] = tag.attrib['v']
                elif "building:" in tag.attrib['k']:
                    key_split = []
                    key_split = tag.attrib['k'].split(":")
                    if len(key_split) <= 2:
                        key = key_split[1]
                        building[key] = tag.attrib['v']
                elif "roof:" in tag.attrib['k']:
                    key_split = []
                    key_split = tag.attrib['k'].split(":")
                    if len(key_split) <= 2:
                        key = key_split[1]
                        roof[key] = tag.attrib['v']
                elif ":" not in tag.attrib['k']:
                    node[tag.attrib['k']] = tag.attrib['v']
        # add values like attr1:{attr2:""} to node
        if address:
            node['address'] = address 
        if name:
            node['name'] = name
        if building:
            node['building'] = building
        if roof:
            node['roof'] = roof 
        if element.tag == "way":
            for child in element:
                if child.tag == 'nd':
                    if child.get('ref'):
                        node_refs.append(child.attrib['ref'])
        if node_refs:
            node['node_refs'] = node_refs
        return node
    else:
        return None

In [90]:
def process_map(file_in, pretty = False):
    # a function to process the dataset and write it to a json file
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data

In [91]:
# process the dataset and check one example
SAMPLE_FILE = "sample.osm"
data = process_map(SAMPLE_FILE, True)
pprint.pprint(data[-100])

{'created': {'changeset': '43600625',
             'timestamp': '2016-11-13T10:11:41Z',
             'uid': '1804331',
             'user': 'daxigua',
             'version': '1'},
 'highway': 'motorway',
 'id': '453004842',
 'maxspeed': '120',
 'name': {'en': 'Tongyang Expressway'},
 'node_refs': ['3919826273', '3283358899', '3283358900'],
 'oneway': 'yes',
 'pos': [None, None],
 'ref': 'S19',
 'type': 'way'}


In [28]:
# start mongodb 
#1. cd /usr/local/Cellar/mongodb/3.2.11/bin/    Reference: http://cnodejs.org/topic/5504f44c73263b0e4eef9ca0
#2. sudo ./mongod
#3. open another terminal
#4. cd /usr/local/Cellar/mongodb/3.2.11/bin/
#5. sudo mongo
#6. cd kris/Desktop/project3/   Reference: https://discussions.udacity.com/t/unable-to-import-json-file-syntaxerror-missing-before-statement-shell-1-14/41246
#7. mongoimport --host=127.0.0.1 --db osm --collection osm_data --type json --file sample.osm.json


In [1]:
#Reference: https://discussions.udacity.com/t/pymongo-producing-null-output-p3/41237/2

from pymongo import MongoClient
from pprint import pprint
import pymongo

#use mongodb to count the total number of the documents
client=MongoClient("localhost", 27017)
db = client.osm
coll = db.osm_data

print "Number of documents"                                                
print  coll.find().count()


Number of documents
334052


In [68]:
# find the number of nodes whose type is "way"
way = coll.find({"type":"way"})
print "Number of ways"
print way.count()

Number of ways
36162


In [69]:
# find the number of nodes whose type is "node"
node = coll.find({"type":"node"})
print "Number of nodes"
print node.count()

Number of nodes
297858


In [97]:
# find the number of unique users who create the node
created_user = coll.distinct("created.user")
print "Number of unique created user"
print len(created_user)

Number of unique created user
1263


In [3]:
from pprint import PrettyPrinter

pprinter = PrettyPrinter()

In [99]:
#count the number of record created by unique users and display the top 10 contributor
#Top 10 contributing user
top_10_contributor = coll.aggregate([{'$group':{'_id':'$created.user',
                                               'count':{'$sum':1}}},
                                    {'$sort':{'count':-1}},
                                    {'$limit':10}])
[pprinter.pformat(c) for c in top_10_contributor]

["{u'_id': u'Chen Jia', u'count': 52675}",
 "{u'_id': u'aighes', u'count': 18555}",
 "{u'_id': u'katpatuka', u'count': 13479}",
 "{u'_id': u'XBear', u'count': 12720}",
 "{u'_id': u'yangfl', u'count': 11303}",
 "{u'_id': u'Holywindon', u'count': 10270}",
 "{u'_id': u'dkt', u'count': 10220}",
 "{u'_id': u'u_kubota', u'count': 9398}",
 "{u'_id': u'jamesks', u'count': 8529}",
 "{u'_id': u'zzcolin', u'count': 8364}"]

In [102]:
#count the number of different amenities and display the top 10 amenity
#Top 10 amenity
top_10_amenity = coll.aggregate([{'$match':{'amenity':{'$exists':1}}},
                                 {'$group':{'_id':'$amenity',
                                           'count':{'$sum':1}}},
                                {'$sort':{'count':-1}},
                                {'$limit':10}])
[pprinter.pformat(a) for a in top_10_amenity]

["{u'_id': u'bicycle_rental', u'count': 238}",
 "{u'_id': u'restaurant', u'count': 132}",
 "{u'_id': u'parking', u'count': 126}",
 "{u'_id': u'school', u'count': 122}",
 "{u'_id': u'bank', u'count': 57}",
 "{u'_id': u'toilets', u'count': 40}",
 "{u'_id': u'fuel', u'count': 33}",
 "{u'_id': u'hospital', u'count': 32}",
 "{u'_id': u'fast_food', u'count': 31}",
 "{u'_id': u'cafe', u'count': 30}"]

In [19]:
#count the number of the banks and display the top 5 amenity
#Top 5 bank 
top_5_bank = coll.aggregate([{'$match':{'amenity':'bank','name.en':{'$exists':1}}},
                                   {'$group':{'_id':'$name.en',
                                             'count':{'$sum':1}}},
                                   {'$sort':{'count':-1}},
                                   {'$limit':5}])
[pprinter.pformat(b) for b in top_5_bank]

["{u'_id': u'China Merchants Bank', u'count': 2}",
 "{u'_id': u'ICBC', u'count': 2}",
 "{u'_id': u'Bank of China', u'count': 1}",
 "{u'_id': u'BANK OF NANJING', u'count': 1}",
 "{u'_id': u'CCB', u'count': 1}"]

In [131]:
#count the number of different highways and display the top 10 highway
#Top 10 highway
top_10_highway = coll.aggregate([{'$match':{'highway':{'$exists':1}}},
                                   {'$group':{'_id':'$highway',
                                             'count':{'$sum':1}}},
                                   {'$sort':{'count':-1}},
                                   {'$limit':10}])
[pprinter.pformat(h) for h in top_10_highway]

["{u'_id': u'residential', u'count': 4204}",
 "{u'_id': u'tertiary', u'count': 2831}",
 "{u'_id': u'service', u'count': 2437}",
 "{u'_id': u'unclassified', u'count': 2264}",
 "{u'_id': u'secondary', u'count': 2156}",
 "{u'_id': u'primary', u'count': 1551}",
 "{u'_id': u'motorway', u'count': 1271}",
 "{u'_id': u'motorway_link', u'count': 798}",
 "{u'_id': u'footway', u'count': 748}",
 "{u'_id': u'bus_stop', u'count': 576}"]