In [70]:
import codecs
import collections
import json
import os
import pprint
import pymongo
import re

import xml.etree.ElementTree as ET

osmfilenm = 'data/perth_australia.osm'

In [66]:
# Count tags

def count_tags(filename):
    tags = defaultdict(int)

    for event, node in ET.iterparse(filename):
        if event == 'end': 
            tags[node.tag] += 1
        node.clear()             
    return tags


count_tags(osmfilenm)

defaultdict(<type 'int'>, {'node': 1000411, 'nd': 1236818, 'bounds': 1, 'member': 10565, 'tag': 368028, 'relation': 1505, 'way': 123824, 'osm': 1})

In [67]:
# Count character types

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    if element.tag == "tag":
        k_value = element.attrib['k']
        if lower.search(k_value) is not None:
            keys['lower'] += 1
        elif lower_colon.search(k_value) is not None:
            keys['lower_colon'] += 1
        elif problemchars.search(k_value) is not None:
            keys["problemchars"] += 1
        else:
            keys['other'] += 1

    return keys


def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys


process_map(osmfilenm)

{'lower': 328169, 'lower_colon': 39185, 'other': 674, 'problemchars': 0}

In [68]:
# Audit street names

street_type_re = re.compile(r'\S+\.?$', re.IGNORECASE)
street_types = defaultdict(int)

def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()

        street_types[street_type] += 1

        
def print_sorted_dict(d):
    keys = d.keys()
    keys = sorted(keys, key=lambda s: s.lower())
    for k in keys:
        v = d[k]
        print "%s: %d" % (k, v) 

        
def is_street_name(elem):
    return (elem.tag == "tag") and (elem.attrib['k'] == "addr:street")


def audit(filename):
    osm_file = open(filename, "r")
    for event, elem in ET.iterparse(osm_file):
        if is_street_name(elem):
            audit_street_type(street_types, elem.attrib['v'])   
    print_sorted_dict(street_types)

    
audit(osmfilenm)

(LANEWAY): 1
Ave: 6
Avenue: 328
Avenuet: 1
Beaufort: 1
Boulevard: 185
Boulevarde: 23
Broadway: 2
Circle: 62
Circuit: 31
Close: 129
Corner: 9
Court: 304
Courtyard: 12
Cove: 18
Cres: 2
Crescent: 108
Cross: 153
Crossway: 21
Crs: 1
CRT: 7
Crt: 8
Ct: 2
Dale: 9
Drive: 391
drive: 1
E/ENT: 3
E/Ent: 1
East: 7
Edge: 8
Elbow: 55
Entrance: 153
ENTRANCE: 1
Esplanade: 9
Fairway: 2
Gap: 14
Garden: 8
Gardens: 135
Gate: 3
gate: 1
Gates: 2
Gelderland: 21
Grade: 13
Grange: 1
Green: 22
Grove: 87
Haven: 3
Heights: 3
Highgate: 1
Highway: 89
Hill: 25
Lane: 163
Laneway): 1
Link: 3
Loop: 29
Mews: 112
Morrison: 1
North: 9
Oxford: 1
Parade: 45
Parkway: 5
Pass: 48
Place: 306
plaza: 1
Promenade: 2
Quarry: 10
Quays: 1
Ramble: 13
Rd: 13
RD: 14
Retreat: 74
Ridgeway: 19
Rise: 98
Road: 936
Road,: 1
Sava: 6
Square: 54
St: 20
ST: 10
st: 1
St): 1
Street: 1203
Tce: 1
Terrace: 96
Terriace: 1
Trail: 12
Turn: 3
University: 16
Vale: 32
View: 35
Vista: 1
W/ENT: 3
WA: 1
Way: 399
WAY: 11
West: 12
Wharf: 1


In [71]:
# Check proposed street name updates

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

expected = ["Avenue", "Beaufort", "Boulevard", "Boulevarde", "Broadway", "Circle", "Circuit", "Close", "Corner", "Court",
            "Courtyard", "Cove", "Crescent", "Cross", "Crossway", "Dale", "Drive", "East", "Edge", "Elbow", "Entrance",
            "Elbow", "Esplanade", "Fairway", "Gap", "Garden", "Gardens", "Gate", "Gates", "Gelderland", "Grade", "Grange",
            "Green", "Grove", "Haven", "Heights", "Highgate", "Highway", "Hill", "Lane", "Laneway", "Link", "Loop", "Mews",
            "Morrison", "North", "Oxford", "Parade", "Parkway", "Pass", "Place", "Promenade", "Quarry", "Quays", "Ramble",
            "Road", "Retreat", "Ridgeway", "Rise", "Sava", "Square", "Street", "Terrace", "Trail", "Turn", "University", 
            "Vale", "View", "Vista", "Way", "West", "Wharf"]

mapping = { "ave" : "Avenue",
            "avenuet" : "Avenue",
            "cres" : "Crescent",
            "crs" : "Cross",
            "crt" : "Court",
            "ct" : "Court",
            "rd" : "Road",
            "st" : "Street",
            "tce" : "Terrace",
            "terriace" : "Terrace",
            "wa" : "Way"}


def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = collections.defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])

    return street_types


def update_name(name, mapping):
    after = []
    
    for part in name.split(" "):
        part = part.strip(",\.").lower()
        
        if part in mapping.keys():
            part = mapping[part]
        after.append(part.capitalize())
    
    return " ".join(after)


ret_st_types = audit(osmfilenm)

for st_type, ways in ret_st_types.iteritems():
    for name in ways:
        proposed_name = update_name(name, mapping)
        print name, "=>", proposed_name

TILLIA CRT => Tillia Court
POPLAR CRT => Poplar Court
SORREL CRT => Sorrel Court
Barrow Cres => Barrow Crescent
Hero Cres => Hero Crescent
Fitzgerald St (corner View St) => Fitzgerald Street (corner View St)
Scarborough Beach Rd => Scarborough Beach Road
Morilla Rd => Morilla Road
Rockeby Rd => Rockeby Road
Runyon Rd => Runyon Road
Cypress Rd => Cypress Road
Reynolds Rd => Reynolds Road
TARATA WY E/ENT (LANEWAY) => Tarata Wy E/ent (laneway)
Starling st => Starling Street
Boas Avenuet => Boas Avenue
Beaufort St WA => Beaufort Street Way
Beasley Road, => Beasley Road
CYPRESS RD => Cypress Road
REYNOLDS RD => Reynolds Road
TARATA WY E/ENT => Tarata Wy E/ent
TARATA WAY E/ENT => Tarata Way E/ent
Edjudina gate => Edjudina Gate
Bradshaw Crs => Bradshaw Cross
TARATA WAY SW ENTRANCE => Tarata Way Sw Entrance
Tillia Crt => Tillia Court
Sorrel Crt => Sorrel Court
Hay St => Hay Street
William St => William Street
Hill St => Hill Street
Foundry St => Foundry Street
Newcastle St => Newcastle Street


In [72]:
# Clean data and write to a json file

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]
LOCATION = ['lat','lon']

def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def update_name(name, mapping):
    match = street_type_re.search(name)
    match = match.group(0)
    
    if match in mapping.keys():
        name = name.replace(match,mapping[match])
        
    return name


def clean_street_name(street_name):
    street_name = street_name.title()
    street_name = street_name.split(",")[0]
    street_name = street_name.split("(")[0]
    street_name = street_name.strip()
    street_name = update_name(street_name, mapping)

    return street_name


def is_postal_code(elem):
    return (elem.attrib['k'] == "addr:postcode")


def clean_postal_code(postcode):
    return postcode.replace(" ","")


def shape_element(element):
    node = {}
    node['created'] = dict()

    if element.tag == "node" or element.tag == "way" :
        node['type'] = element.tag
        attributes = element.attrib

        for key, value in attributes.items():
            if key in CREATED:
                node['created'][key] = value
            elif key in LOCATION:
                node.setdefault('pos',[]).insert(0,float(value))
            else:
                node[key] = value

        for child in element:
            if child.tag == 'nd':
                node.setdefault('node_refs',[]).append(child.attrib['ref'])
            elif child.tag == 'tag':
                k = child.attrib['k']
                v = child.attrib['v']

                if is_street_name(child):
                    v = clean_street_name(v)

                if is_postal_code(child):
                    v = clean_postal_code(v)
                    if len(v) > 6:
                        continue

                if problemchars.search(k) or k.count(":") > 1:
                    pass
                elif k.startswith('addr:'):
                    node.setdefault('address',{})[ k.split(":")[1] ] = v
                else:
                    node[k] = v
                    
        return node
    else:
        return None


def process_map(file_in, pretty = False):
    file_out = "{0}.json".format(file_in)
    data = []
    
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data


data = process_map(osmfilenm, True)
pprint.pprint(data[0])

{'created': {'changeset': '12341071',
             'timestamp': '2012-07-19T17:00:36Z',
             'uid': '722137',
             'user': 'OSMF Redaction Account',
             'version': '9'},
 'id': '2306306',
 'pos': [-32.0388963, 115.7735345],
 'type': 'node'}


In [81]:
def process_map(file_in, limiter, pretty = False):     
    #initialize pymongo connection and db.collection schema
    from pymongo import MongoClient
    client = MongoClient("mongodb://localhost:27017")
    db = 'data/perth_australia.osm'
    elements = db.elements
    elements.drop()
    
    #process the osm doc
    file_out = "{0}.json".format(file_in.split('.')[0])
    i=0
    with codecs.open(file_out, "w") as fo:
        for event, element in ET.iterparse(file_in):
            if i > limiter and limiter != -1:
                break
            i+=1
            el = shape_element(element)
            if el:
                #write to mongo
                elements.insert(el)
                #write to file
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
                    element.clear()
    return file_out

data = process_map(osmfilenm, True)

AttributeError: 'str' object has no attribute 'elements'

In [73]:
## Load json data in pymongo

from pymongo import MongoClient

def get_db(db_name):
    client = MongoClient("mongodb://localhost:27017")
    db = client[db_name]
    return db

def get_collection(db, collection):
    collections_db = db[collection]
    return collections_db

def insert_data(json_data, db_collection):
    with open(json_data, 'r') as f:
        for each_line in f.readlines():
            db_collection.insert(json.loads(each_line))
    print("Complete.")

def map_aggregate(db, collection, pipeline):
    db_collection = db[collection]
    result = db_collection.aggregate(pipeline)
    return result

In [79]:
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")
db = 'data/perth_australia.osm'
elements = db.elements

count = elements.find().count()
print 'elements.find().count()'
print 'number of elements: {}\n'.format(count)

elements.find().count()
number of elements: 0



In [77]:
nodes = elements.find({"type":"node"}).count()
print 'elemnts.find({"type":"node"}).count()'
print 'number of node elements: {}\n'.format(nodes)

In [76]:
# Top user contributions overall grouped by city

def make_city_pipeline(city):
    pipeline = [{"$match":{"created.user":{"$exists":1},
                                          "city_name":city}},
                 {"$group": {"_id": {"City":"$city_name",
                                     "User":"$created.user"},
                            "count": {"$sum": 1}}},                            
                 {"$project": {'_id':0,
                               "City":"$_id.City",
                               "User":"$_id.User",
                               "Count":"$count"}},
                 {"$sort": {"Count": -1}},
                 {"$limit" : 5 }]
    return pipeline

pipeline = make_city_pipeline('perth')
result1 = map_aggregate(db, 'cities', pipeline)
pprint.pprint(result1)

<pymongo.command_cursor.CommandCursor object at 0x00000000C4D4D630>


In [45]:
db.perth.aggregate( [ { $group : { "_id" : "$type", "count" : { $sum : 1 } } } ] )

SyntaxError: invalid syntax (<ipython-input-45-6205809cd3e2>, line 1)

In [50]:
OSMFILE = "data/perth_australia.osm"

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

expected = ["Avenue", "Beaufort", "Boulevard", "Boulevarde", "Broadway", "Circle", "Circuit", "Close", "Corner", "Court",
            "Courtyard", "Cove", "Crescent", "Cross", "Crossway", "Dale", "Drive", "East", "Edge", "Elbow", "Entrance",
            "Elbow", "Esplanade", "Fairway", "Gap", "Garden", "Gardens", "Gate", "Gates", "Gelderland", "Grade", "Grange",
            "Green", "Grove", "Haven", "Heights", "Highgate", "Highway", "Hill", "Lane", "Laneway", "Link", "Loop", "Mews",
            "Morrison", "North", "Oxford", "Parade", "Parkway", "Pass", "Place", "Promenade", "Quarry", "Quays", "Ramble",
            "Road", "Retreat", "Ridgeway", "Rise", "Sava", "Square", "Street", "Terrace", "Trail", "Turn", "University", 
            "Vale", "View", "Vista", "Way", "West", "Wharf"]

street_mapping = {"ave" : "Avenue",
            "avenuet" : "Avenue",
            "cres" : "Crescent",
            "crs" : "Cross",
            "crt" : "Court",
            "ct" : "Court",
            "rd" : "Road",
            "st" : "Street",
            "tce" : "Terrace",
            "terriace" : "Terrace",
            "wa" : "Way"}

postal_code_range = [6000,7000]
postal_code_default = 6000

def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)

def audit_postal_code(invalid_postal_codes, postal_code):
    try:
        if not (postal_code_range[0] <= int(postal_code) <= postal_code_range[1]):
            raise ValueError
    except ValueError:
        invalid_postal_codes[postal_code] += 1

def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")

def is_postal_code(elem):
    return (elem.attrib['k'] == "addr:postcode")

def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    invalid_postal_codes = defaultdict(int)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
                elif is_postal_code(tag):
                    audit_postal_code(invalid_postal_codes, tag.attrib['v'])

    return [invalid_postal_codes, street_types]

#standardizes street types with a replacement map
def update_name(name, mapping):
    name = name.split(' ')
    type = name[-1]
    if type in mapping:
        name[-1] = mapping[type]
    
    name = ' '.join(name)
    name = name.title()

    return name

#checks if postal code within valid range, if not replaces with 11000 default
def update_postal_code(postal_code):
    try:
        if not (postal_code_range[0] <= int(postal_code) <= postal_code_range[1]):
            raise ValueError
        else:
            return int(postal_code)
    except ValueError:
        return postal_code_default

def shape_element(e):
    node = {}
    node['created'] = {}
    node['pos'] = [0,0]
    if e.tag == "way":
        node['node_refs'] = []
    if e.tag == "node" or e.tag == "way" :
        node['type'] = e.tag
        #attributes
        for k, v in e.attrib.iteritems():
            #latitude
            if k == 'lat':
                try:
                    lat = float(v)
                    node['pos'][0] = lat
                except ValueError:
                    pass
            #longitude
            elif k == 'lon':
                try:
                    lon = float(v)
                    node['pos'][1] = lon
                except ValueError:
                    pass
            #creation metadata
            elif k in CREATED:
                node['created'][k] = v
            else:
                node[k] = v
        #children
        for tag in e.iter('tag'):
            k = tag.attrib['k']
            v = tag.attrib['v']
            if problemchars.match(k):
                continue
            elif lower_colon.match(k):
                k_split = k.split(':')
                #address fields
                if k_split[0] == 'addr':
                    k_item = k_split[1]
                    if 'address' not in node:
                        node['address'] = {}
                    #streets
                    if k_item == 'street':
                        v = update_name(v, street_mapping)                    
                    #postal codes
                    if k_item == 'postcode':
                        v = update_postal_code(v)
                    node['address'][k_item] = v
                    continue
            node[k] = v
        #way children
        if e.tag == "way":
            for n in e.iter('nd'):
                ref = n.attrib['ref']
                node['node_refs'].append(ref);
        return node
    else:
        return None

def process_map(file_in, pretty = False):
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data

def audit_report():
    audit_data = audit(OSMFILE)
    pprint.pprint(audit_data[0])
    pprint.pprint(audit_data[1])
    pprint.pprint(dict(audit_data[2]))

'''
PRINT OUT AUDIT REPORT
'''
#audit_report()

'''
PROCESS DATA AND OUTPUT JSON
'''
process_map(OSMFILE, False)

[{'created': {'changeset': '12341071',
   'timestamp': '2012-07-19T17:00:36Z',
   'uid': '722137',
   'user': 'OSMF Redaction Account',
   'version': '9'},
  'id': '2306306',
  'pos': [-32.0388963, 115.7735345],
  'type': 'node'},
 {'created': {'changeset': '12548730',
   'timestamp': '2012-07-30T14:14:36Z',
   'uid': '142807',
   'user': 'SDavies',
   'version': '3'},
  'id': '21390176',
  'pos': [-31.9624823, 115.9130943],
  'type': 'node'},
 {'created': {'changeset': '12543711',
   'timestamp': '2012-07-30T06:41:54Z',
   'uid': '189263',
   'user': 'wildmyron',
   'version': '3'},
  'id': '21390184',
  'pos': [-31.9566591, 115.9019806],
  'type': 'node'},
 {'created': {'changeset': '12543711',
   'timestamp': '2012-07-30T06:41:54Z',
   'uid': '189263',
   'user': 'wildmyron',
   'version': '4'},
  'id': '21390189',
  'pos': [-31.9550943, 115.8999843],
  'type': 'node'},
 {'created': {'changeset': '12543711',
   'timestamp': '2012-07-30T06:43:33Z',
   'uid': '189263',
   'user': 'wil