# In [1]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
from pprint import pprint
import codecs
import json

# Path of the OSM XML extract that main() hands to process_map().
OSMFILE = "Cincinnati.xml"

# Trailing run of non-whitespace characters (starting at a word boundary);
# audit_street_type() treats this match as the street type.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

# Patterns describing the shape of a tag key.
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# A tag whose key contains any of these characters is skipped by
# shape_element().
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Key-prefix patterns.
address_regex = re.compile(r'^addr\:')
street_regex = re.compile(r'^street')
gnis_regex = re.compile(r'^gnis\:')

# Element attributes that shape_element() groups under the "created" key.
CREATED = ["version", "changeset", "timestamp", "user", "uid"]

# Street types that audit_street_type() does not report.
expected = [
    "Street",
    "Avenue",
    "Boulevard",
    "Drive",
    "Court",
    "Place",
    "Square",
    "Lane",
    "Road",
    "Trail",
    "Parkway",
    "Commons",
]

# Abbreviated / misspelled street type -> canonical spelling.
mapping = {
    "St": "Street",
    "Ave": "Avenue",
    "Avnue": "Avenue",
    "Rd.": "Road",
}

# Inclusive [low, high] range of postal codes considered valid, and the
# value substituted for a code outside that range.
expectedZip = [45201, 45999]
mappedZip = 45201

def audit_postal_code(invalid_postal_codes, postal_code, zip_range=None):
    """Count ``postal_code`` in ``invalid_postal_codes`` if it is not valid.

    A postal code is valid when ``int(postal_code)`` succeeds and the result
    lies inside the inclusive range ``zip_range``.

    Args:
        invalid_postal_codes: Mapping of raw postal-code value to occurrence
            count, updated in place. ``audit`` passes a ``defaultdict(int)``;
            a plain dict needs the key to exist already.
        postal_code: Raw postal-code value to check.
        zip_range: Optional ``(low, high)`` inclusive bounds. When omitted,
            the module-level ``expectedZip`` is used.
    """
    if zip_range is None:
        zip_range = expectedZip
    low, high = zip_range[0], zip_range[1]

    try:
        is_valid = low <= int(postal_code) <= high
    except ValueError:
        # Not parseable as an integer (e.g. "OH 45202" or "45202-1234").
        is_valid = False

    if not is_valid:
        invalid_postal_codes[postal_code] += 1

def audit_street_type(street_types, street_name):
    """Record ``street_name`` under its street type when the type is unexpected.

    The street type is whatever ``street_type_re`` matches in ``street_name``.
    Names with no match, or whose type is listed in ``expected``, are ignored.

    Args:
        street_types: Mapping of street type to the set of street names seen
            with it; updated in place (``audit`` passes a ``defaultdict(set)``).
        street_name: Street name to inspect.
    """
    found = street_type_re.search(street_name)
    if not found:
        return

    suffix = found.group()
    if suffix in expected:
        return

    street_types[suffix].add(street_name)

def audit_phone_number(invalid_phone, phone_number):
    """Count ``phone_number`` in ``invalid_phone`` if it is not well-formed.

    A number is accepted when it is exactly 12 characters long and begins
    with ``'+513'``; anything else is tallied.

    The previous check compared the first three characters against the
    four-character string ``'+513'``, which can never be equal, so every
    number was reported as invalid.

    Args:
        invalid_phone: Mapping of raw phone value to occurrence count, updated
            in place. ``audit`` passes a ``defaultdict(int)``; a plain dict
            needs the key to exist already.
        phone_number: Raw phone value to check.
    """
    # NOTE(review): the accepted shape (12 characters, '+513' prefix) is taken
    # literally from the original condition -- confirm it is the intended
    # canonical format.
    if len(phone_number) != 12 or not phone_number.startswith('+513'):
        invalid_phone[phone_number] += 1

def is_postal_code(elem):
    """Return True when the element's ``k`` attribute is ``addr:postcode``."""
    key = elem.attrib['k']
    return key == "addr:postcode"

def is_street_name(elem):
    """Return True when the element's ``k`` attribute is ``addr:street``."""
    key = elem.attrib['k']
    return key == "addr:street"

def is_phone_number(elem):
    """Return True when the element's ``k`` attribute is ``phone``."""
    key = elem.attrib['k']
    return key == "phone"

def audit(osmfile):
    """Scan an OSM XML file and collect suspicious address and phone values.

    Every ``tag`` child of each ``node`` and ``way`` element is inspected:
    postal codes, street names and phone numbers are passed to the matching
    ``audit_*`` helper.

    Args:
        osmfile: Path of the OSM XML file to read.

    Returns:
        A list ``[street_types, invalid_postal_codes, invalid_phone]`` where
        ``street_types`` is a ``defaultdict(set)`` and the other two are
        ``defaultdict(int)`` counters.
    """
    invalid_postal_codes = defaultdict(int)
    street_types = defaultdict(set)
    invalid_phone = defaultdict(int)

    # The context manager closes the file even if parsing raises.
    with open(osmfile, "rb") as osm_file:
        # Listen for "end" events: on "start" only the opening tag is
        # guaranteed to have been read, so the <tag> children may not be
        # attached to the element yet and would be silently missed.
        for _, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag not in ("node", "way"):
                continue
            for tag in elem.iter("tag"):
                if is_postal_code(tag):
                    audit_postal_code(invalid_postal_codes, tag.attrib['v'])
                elif is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
                elif is_phone_number(tag):
                    audit_phone_number(invalid_phone, tag.attrib['v'])

    return [street_types, invalid_postal_codes, invalid_phone]

def update_postal_code(postal_code, zip_range=None, default=None):
    """Return ``postal_code`` as an int, or a fallback when it is not valid.

    A postal code is valid when ``int(postal_code)`` succeeds and the result
    lies inside the inclusive range ``zip_range``. Values that cannot be
    parsed as an integer (for example ``"45202-1234"``) fall back as well.

    Args:
        postal_code: Raw postal-code value.
        zip_range: Optional ``(low, high)`` inclusive bounds. When omitted,
            the module-level ``expectedZip`` is used.
        default: Optional fallback value. When omitted, the module-level
            ``mappedZip`` is used.

    Returns:
        The integer postal code when valid, otherwise the fallback value.
    """
    if zip_range is None:
        zip_range = expectedZip

    try:
        code = int(postal_code)
    except ValueError:
        code = None

    if code is not None and zip_range[0] <= code <= zip_range[1]:
        return code
    return mappedZip if default is None else default

def update_name(name, mapping, regex):
    """Return ``name`` with its matched street type replaced via ``mapping``.

    ``regex`` locates the street type inside ``name``. If the matched text is
    a key of ``mapping``, every match of ``regex`` in ``name`` is replaced by
    the mapped value; otherwise ``name`` is returned unchanged.

    The function previously fell off the end without a ``return``, so callers
    always received ``None``.

    Args:
        name: Street name to clean.
        mapping: Dict of street type as written -> replacement text.
        regex: Compiled pattern that matches the street type.

    Returns:
        The cleaned street name.
    """
    m = regex.search(name)
    if m:
        street_type = m.group()
        if street_type in mapping:
            name = regex.sub(mapping[street_type], name)
    return name
            
def update_phone_number(phone_number):
    """Normalise a phone number to ``'513'`` plus its last seven characters.

    Spaces, parentheses and hyphens are removed first. The result is the
    ``'513'`` prefix followed by the last seven characters of what remains.

    Two defects of the previous version are fixed:

    * ``phone_number.translate(None, ' ()-')`` is the Python 2 byte-string
      form; it raises ``TypeError`` for unicode strings and on Python 3.
    * Only the last six characters were kept, which dropped one digit of a
      seven-digit local number.

    Args:
        phone_number: Raw phone value.

    Returns:
        The normalised phone number string.
    """
    stripped = ''.join(ch for ch in phone_number if ch not in ' ()-')
    return '513' + stripped[-7:]

def shape_element(element):
    """Convert a ``node`` or ``way`` element into a dict ready for JSON output.

    Layout of the returned dict:

    * ``type``: the element tag.
    * ``created``: sub-dict holding the attributes listed in ``CREATED``.
    * ``pos``: ``[lat, lon]`` as floats, when both attributes are present.
    * ``node_refs``: the ``ref`` of each ``nd`` child, in document order.
    * ``address``: sub-dict built from tags whose key starts with ``addr:``
      (prefix removed). A plain ``street`` value wins; otherwise the
      ``street:prefix`` / ``street:name`` / ``street:type`` components that
      are present are joined with spaces.
    * every other attribute (except ``lat``/``lon``) and every other tag,
      stored under its own key.

    ``tag`` children without both ``k`` and ``v``, and tags whose key matches
    ``problemchars``, are skipped.

    Args:
        element: An ElementTree element.

    Returns:
        The shaped dict, or ``None`` when ``element`` is neither a ``node``
        nor a ``way``.
    """
    if element.tag not in ("node", "way"):
        return None

    node = {'type': element.tag}
    address = {}

    # Attributes: bookkeeping fields go under 'created', coordinates are
    # handled separately below, everything else is copied as-is.
    for attr_name in element.attrib:
        if attr_name in CREATED:
            node.setdefault('created', {})[attr_name] = element.get(attr_name)
        elif attr_name not in ('lat', 'lon'):
            node[attr_name] = element.get(attr_name)

    if 'lat' in element.attrib and 'lon' in element.attrib:
        node['pos'] = [float(element.get('lat')), float(element.get('lon'))]

    # Children: <nd> references and <tag> key/value pairs.
    for child in element:
        if child.tag == 'nd':
            refs = node.setdefault('node_refs', [])
            if 'ref' in child.attrib:
                refs.append(child.get('ref'))
            continue
        if child.tag != 'tag' or 'k' not in child.attrib or 'v' not in child.attrib:
            continue

        key = child.get('k')
        val = child.get('v')
        if problemchars.search(key):
            continue
        if address_regex.search(key):
            address[key.replace('addr:', '')] = val
        else:
            node[key] = val

    if address:
        shaped_address = {}
        street_full = None
        street_parts = {}
        for key, val in address.items():
            if street_regex.search(key):
                if key == 'street':
                    street_full = val
                elif 'street:' in key:
                    street_parts[key.replace('street:', '')] = val
            else:
                shaped_address[key] = val

        if street_full:
            shaped_address['street'] = street_full
        elif street_parts:
            # Join only the components that exist; indexing all three
            # unconditionally raised KeyError for partially tagged streets.
            pieces = [street_parts[part]
                      for part in ('prefix', 'name', 'type')
                      if part in street_parts]
            if pieces:
                shaped_address['street'] = ' '.join(pieces)
        node['address'] = shaped_address

    return node


def process_map(file_in, pretty = False):
    """Shape every element of ``file_in`` and write the results as JSON.

    The output file is named ``file_in`` with ``.json`` appended. Each element
    for which ``shape_element`` returns a truthy value is written as one JSON
    document followed by a newline.

    Args:
        file_in: Path of the OSM XML file to read.
        pretty: When true, each JSON document is indented by two spaces.

    Returns:
        The list of shaped elements, in the order they were written.
    """
    out_path = "{0}.json".format(file_in)
    indent = 2 if pretty else None
    shaped_elements = []
    with codecs.open(out_path, "w") as sink:
        for _, element in ET.iterparse(file_in):
            shaped = shape_element(element)
            if not shaped:
                continue
            shaped_elements.append(shaped)
            sink.write(json.dumps(shaped, indent=indent) + "\n")
    return shaped_elements
def main():
    """Run ``process_map`` on the file named by ``OSMFILE``."""
    process_map(OSMFILE)


if __name__ == '__main__':
    main()