# Wrangle New York OpenStreetMap Data

In [1]:
import xml.etree.cElementTree as ET
import pprint
import re
from collections import defaultdict

## Tag names and counts

In [2]:
def count_tags(filename):
    """Count how often each element tag occurs in an XML file.

    Args:
        filename: Source passed straight to ``ET.iterparse``.

    Returns:
        dict mapping each tag name to the number of elements carrying it.
    """
    tags = {}
    # With no ``events`` argument iterparse reports one "end" event per
    # element, so every element is counted exactly once.
    for _event, element in ET.iterparse(filename):
        # dict.get replaces the ``in tags.keys()`` test, which builds a fresh
        # list of keys for every element under Python 2.
        tags[element.tag] = tags.get(element.tag, 0) + 1
    return tags

In [3]:
# Count the element tags in the sample extract and pretty-print the totals.
tags = count_tags('newyork_sample.osm')
pprint.pprint(tags)

{'member': 28,
 'nd': 14329,
 'node': 11447,
 'osm': 1,
 'relation': 9,
 'tag': 9721,
 'way': 1794}


## Check Tags

We first check the "k" value of each tag to see whether there are any potential problems. Three regular expressions test for certain patterns in the "k" value, which sorts every key into one of four categories:
- "lower", for tags that contain only lowercase letters and are valid
- "lower_colon", for otherwise valid tags with a colon in their names
- "problemchars", for tags with problematic characters
- "other", for other tags that do not fall into the other three categories



In [4]:
# Keys made up only of lowercase letters and underscores.
lower = re.compile(r'^([a-z]|_)*$')
# Two lowercase/underscore runs separated by exactly one colon.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Matches any single character from the listed punctuation/whitespace set.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

In [5]:
def key_type(element, keys):
    """Tally the "k" attribute of a <tag> element under one category.

    The key is tested against ``lower``, ``lower_colon`` and
    ``problemchars`` in that order; the first pattern that matches decides
    the category, and a key matching none of them is counted as 'other'.
    Elements that are not <tag> leave ``keys`` untouched.

    Args:
        element: XML element to inspect.
        keys: dict of counters, updated in place.

    Returns:
        The same ``keys`` dict.
    """
    if element.tag != "tag":
        return keys

    k = element.get("k")
    ordered_patterns = (
        ('lower', lower),
        ('lower_colon', lower_colon),
        ('problemchars', problemchars),
    )
    for category, pattern in ordered_patterns:
        if pattern.search(k):
            keys[category] += 1
            break
    else:
        # No pattern matched.
        keys['other'] += 1

    return keys

def process_map(filename):
    """Classify every tag key in an OSM file.

    Each parsed element is passed to ``key_type``, which updates the
    counters for <tag> elements.

    Returns:
        dict with the counts for 'lower', 'lower_colon', 'problemchars'
        and 'other'.
    """
    categories = ("lower", "lower_colon", "problemchars", "other")
    counts = dict.fromkeys(categories, 0)
    for _event, element in ET.iterparse(filename):
        counts = key_type(element, counts)
    return counts

In [6]:
# Classify every tag key in the sample extract and show the per-category counts.
keys = process_map('newyork_sample.osm')
pprint.pprint(keys)

{'lower': 3760, 'lower_colon': 5835, 'other': 105, 'problemchars': 21}


## Find unique users
Find out how many unique users have contributed to the map.

In [7]:
def get_users(filename):
    """Return the set of distinct "uid" attribute values in an XML file.

    Elements without a "uid" attribute, or with an empty one, are ignored.
    """
    uids = (elem.get("uid") for _event, elem in ET.iterparse(filename))
    return {uid for uid in uids if uid}

In [8]:
# Collect the distinct "uid" values found in the sample extract.
users = get_users('newyork_sample.osm')

In [9]:
# Number of distinct uids collected above.
len(users)

433

## Audit street names

In [10]:
# Matches the last whitespace-free token of a street name (its street type),
# e.g. "St." in "Main St."
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

# Some expected common street types
expected = ["Street", "Avenue", "Boulevard", 
            "Drive", "Court", "Place", 
            "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]

In [11]:
# Group all the street names not expected
def audit_street_type(street_types, street_name):
    """Record ``street_name`` under its street type when that type is unexpected.

    The street type is whatever ``street_type_re`` finds in the name. Names
    with no match, or whose type is listed in ``expected``, are skipped.

    Args:
        street_types: Mapping of street type -> set of street names,
            updated in place (a ``defaultdict(set)`` in this notebook).
        street_name: Street name to examine.
    """
    match = street_type_re.search(street_name)
    if not match:
        return

    found_type = match.group()
    if found_type in expected:
        return

    street_types[found_type].add(street_name)

In [12]:
# Check if it is a street name
def is_street_name(elem):
    """Return True when the element's "k" attribute equals "addr:street".

    Raises:
        KeyError: If the element has no "k" attribute.
    """
    key = elem.attrib['k']
    return key == "addr:street"

In [13]:
def audit(filename):
    """Collect the street names whose street type is not in ``expected``.

    Every <tag> child of a node or way element that ``is_street_name``
    accepts has its "v" value passed to ``audit_street_type``.

    Args:
        filename: Path of the OSM XML file to scan.

    Returns:
        defaultdict(set) mapping street type -> set of street names.
    """
    street_types = defaultdict(set)
    # Binary mode hands the raw bytes to the XML parser; the context manager
    # closes the file even when parsing raises.
    with open(filename, "rb") as osm_file:
        # Listen for "end" events: at "start" only the attributes are
        # guaranteed, and the child <tag> elements may not have been parsed
        # yet, so street names could be silently skipped.
        for _event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag in ("node", "way"):
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
    return street_types

In [14]:
# Audit the street names of the sample extract.
street_types = audit('newyork_sample.osm')

In [15]:
# Print all the street types and names
# We should be able to identify all problem street names
pprint.pprint(street_types)

defaultdict(<type 'set'>, {'West': set(['Henry Hudson Parkway West']), 'Way': set(['Ilyssa Way']), 'Circle': set(['Covington Circle']), 'East': set(['Van Cortlandt Avenue East']), 'Highway': set(['Kings Highway', 'New Highway']), 'North': set(['Grand Central Parkway Sr North', 'Horace Harding Expressway Service Road North']), 'Park': set(['Harding Park']), 'Loop': set(['Leewood Loop', 'Hart Loop']), 'Path': set(['Carlls Path', 'Carlls Straight Path']), 'A': set(['Avenue A']), 'B': set(['Avenue B']), 'I': set(['Avenue I']), 'H': set(['Avenue H']), 'K': set(['Avenue K']), 'J': set(['Avenue J']), 'L': set(['Avenue L']), 'P': set(['Avenue P']), 'Green': set(['Dover Green']), 'T': set(['Avenue T']), 'W': set(['Somerset Road W']), 'Turnpike': set(['Union Turnpike', 'Hempstead Turnpike']), 'Z': set(['Avenue Z']), 'South': set(['Juniper Boulevard South']), 'Cir': set(['Winnecomac Cir']), 'Ballfields': set(['John Golden Ballfields']), 'Terrace': set(['Susan Terrace']), 'Slip': set(['Catherine S

In [16]:
# Street-type corrections passed to update_name
# (street type as found in the data: corrected street type)
mapping = { "AVENUE": "Avenue",
            "AVenue": "Avenue",
            "Ave": "Avenue",
            "Ave.": "Avenue",
            "Avene": "Avenue",
            "Blvd": "Boulevard",
            "Ct": "Court",
            "DRIVE": "Drive",
            "Dr": "Drive",
            "Pkwy": "Parkway",
            "Plz": "Plaza",
            "ROAD": "Road",
            "Rd": "Road",
            "STREET": "Street",            
            "St": "Street",
            "St.": "Street",
            "Steet": "Street",
            "Trce": "Terrace",
            "Tpke": "Turnpike",
            "avenue": "Avenue",
            "street": "Street"
            }


In [17]:
def update_name(name, mapping):
    """Replace the trailing street type of ``name`` using ``mapping``.

    Args:
        name: Street name to clean.
        mapping: dict of street type as found -> corrected street type.

    Returns:
        The street name with its street type replaced, or ``name`` unchanged
        when ``street_type_re`` finds no street type or the type is not a key
        of ``mapping``.
    """
    m = street_type_re.search(name)
    if m is None:
        return name

    street_type = m.group()
    # Membership test on the dict itself; ``mapping.keys()`` builds a list
    # on every call under Python 2.
    if street_type not in mapping:
        return name

    # Splice the replacement into the span already matched instead of running
    # the regex again through sub(), which would also interpret backslash
    # escapes in the replacement text.
    return name[:m.start()] + mapping[street_type] + name[m.end():]

In [18]:
# Fix the street names
# Preview only: print every audited street name next to the result of
# update_name; nothing is written back to the data here.
for st_type, ways in street_types.items():
        for name in ways:
            better_name = update_name(name, mapping)
            print name, "=>", better_name

Henry Hudson Parkway West => Henry Hudson Parkway West
Ilyssa Way => Ilyssa Way
Covington Circle => Covington Circle
Van Cortlandt Avenue East => Van Cortlandt Avenue East
Kings Highway => Kings Highway
New Highway => New Highway
Grand Central Parkway Sr North => Grand Central Parkway Sr North
Horace Harding Expressway Service Road North => Horace Harding Expressway Service Road North
Harding Park => Harding Park
Leewood Loop => Leewood Loop
Hart Loop => Hart Loop
Carlls Path => Carlls Path
Carlls Straight Path => Carlls Straight Path
Avenue A => Avenue A
Avenue B => Avenue B
Avenue I => Avenue I
Avenue H => Avenue H
Avenue K => Avenue K
Avenue J => Avenue J
Avenue L => Avenue L
Avenue P => Avenue P
Dover Green => Dover Green
Avenue T => Avenue T
Somerset Road W => Somerset Road W
Union Turnpike => Union Turnpike
Hempstead Turnpike => Hempstead Turnpike
Avenue Z => Avenue Z
Juniper Boulevard South => Juniper Boulevard South
Winnecomac Cir => Winnecomac Cir
John Golden Ballfields => Joh

## Generate the csv

In [19]:
import csv
import codecs
import cerberus
import schema

In [20]:
# File paths
# Input OSM extract
OSM_PATH = "newyork_sample.osm"

# Output CSV files opened for writing by process_map
NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

In [21]:
# Key starts with "<part>:<part>", each part made of letters, digits or
# underscores; only the start of the key is anchored.
LOWER_COLON = re.compile(r'^([a-zA-Z0-9]|_)+:([a-zA-Z0-9]|_)+')
# Matches any single character from the listed punctuation/whitespace set.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
# Schema object taken from the imported `schema` module; it is the default
# schema of validate_element.
SCHEMA = schema.schema

In [22]:
# Column names, in order, for each CSV file.
# Make sure the field order in the CSVs matches the column order in the SQL table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

In [23]:
# ================================================== #
#               Helper Functions                     #
# ================================================== #

def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict.

    Args:
        element: Parsed XML element; only 'node' and 'way' elements are shaped.
        node_attr_fields: Attribute names copied from a node element.
        way_attr_fields: Attribute names copied from a way element.
        problem_chars: Compiled regex; a tag whose key matches it is dropped.
        default_tag_type: Type given to tags whose key does not start with a
            "<type>:<key>" prefix.

    Returns:
        ``{'node': attribs, 'node_tags': tags}`` for a node,
        ``{'way': attribs, 'way_nodes': way_nodes, 'way_tags': tags}`` for a
        way, and None for any other element.

    Raises:
        KeyError: If the element lacks a requested attribute, a child <tag>
            lacks "k" or "v", or a child <nd> lacks "ref".
    """
    if element.tag == 'node':
        attr_fields = node_attr_fields
    elif element.tag == 'way':
        attr_fields = way_attr_fields
    else:
        # Other element types are not shaped.
        return None

    attribs = {field: element.attrib[field] for field in attr_fields}
    element_id = element.attrib.get('id')

    # Secondary tags are handled the same way for both node and way elements.
    # NOTE(review): tag values are copied verbatim; the street-name fixes of
    # update_name are not applied here - confirm whether that is intended.
    tags = []
    for tag in element.iter('tag'):
        key = tag.attrib['k']
        if problem_chars.search(key):
            continue

        if LOWER_COLON.search(key):
            # Split on the first colon only: the prefix is the type and the
            # rest (further colons included) stays in the key. Unlike the
            # earlier regex extraction this keeps underscores in the type
            # ("is_in:state" -> "is_in"), keeps every key segment
            # ("a:b:c:d" -> "b:c:d"), and cannot fail on keys such as
            # "addr:street-name".
            tag_type, _, key = key.partition(':')
        else:
            tag_type = default_tag_type

        tags.append({
            'id': element_id,
            'key': key,
            'value': tag.attrib['v'],
            'type': tag_type,
        })

    if element.tag == 'node':
        return {'node': attribs, 'node_tags': tags}

    # position is the zero-based index of the <nd> reference within the way.
    way_nodes = [
        {'id': element_id, 'node_id': nd.attrib['ref'], 'position': position}
        for position, nd in enumerate(element.iter('nd'))
    ]
    return {'way': attribs, 'way_nodes': way_nodes, 'way_tags': tags}

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield each fully parsed element whose tag is listed in ``tags``.

    The root element is taken from the first parser event and cleared after
    every yielded element, once the consumer asks for the next one.
    """
    events = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(events)
    for kind, node in events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()
            
def validate_element(element, validator, schema=SCHEMA):
    """Raise an Exception if element does not match schema.

    The message names the first field reported in ``validator.errors``
    together with its pretty-printed errors.
    """
    if validator.validate(element, schema) is True:
        return

    field, errors = next(validator.errors.iteritems())
    template = "\nElement of type '{0}' has the following errors:\n{1}"
    raise Exception(template.format(field, pprint.pformat(errors)))
        
class UnicodeDictWriter(csv.DictWriter, object):
    """csv.DictWriter variant that UTF-8 encodes unicode values before writing"""

    def writerow(self, row):
        # Encode unicode values only; every other value is passed through as is.
        encoded = {}
        for field, value in row.items():
            if isinstance(value, unicode):
                value = value.encode('utf-8')
            encoded[field] = value
        super(UnicodeDictWriter, self).writerow(encoded)

    def writerows(self, rows):
        # Route every row through writerow so it gets the same encoding step.
        for entry in rows:
            self.writerow(entry)

In [24]:
# ================================================== #
#               Main Function                        #
# ================================================== #

# -*- coding: utf-8 -*-

def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s).

    Args:
        file_in: OSM XML source handed to ``get_element``.
        validate: When truthy, every shaped element is checked with
            ``validate_element`` before it is written.
    """
    # Binary mode: UnicodeDictWriter already emits UTF-8 byte strings, and the
    # Python 2 csv module requires the 'b' flag. Without it, platforms that
    # translate newlines write an extra carriage return per row, which shows
    # up as an empty row between records.
    with codecs.open(NODES_PATH, 'wb') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'wb') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'wb') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'wb') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'wb') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            if not el:
                # Nothing was shaped for this element.
                continue

            if validate:
                validate_element(el, validator)

            if element.tag == 'node':
                nodes_writer.writerow(el['node'])
                node_tags_writer.writerows(el['node_tags'])
            elif element.tag == 'way':
                ways_writer.writerow(el['way'])
                way_nodes_writer.writerows(el['way_nodes'])
                way_tags_writer.writerows(el['way_tags'])

In [25]:
# Convert the sample extract to CSV, validating each shaped element.
process_map(OSM_PATH, validate=True)

In [26]:
import glob

In [27]:
def clear_empty(filenames):
    """Write a copy of each CSV file with its empty rows removed.

    For every path in ``filenames`` the rows that contain at least one
    non-empty cell are copied to a new file whose name is the original name
    minus its last four characters plus "_noempty.csv".

    Args:
        filenames: Iterable of CSV file paths.
    """
    for filename in filenames:
        # Context managers close both files even if reading or writing
        # raises. Binary mode is what the Python 2 csv module expects.
        with open(filename, 'rb') as source, \
             open(filename[:-4] + '_noempty.csv', 'wb') as target:
            writer = csv.writer(target)
            # any(row) is False for a row with no cells or only empty cells.
            writer.writerows(row for row in csv.reader(source) if any(row))

In [28]:
# Skip the outputs of an earlier run so that re-running this cell does not
# create files such as "nodes_noempty_noempty.csv".
filenames = [path for path in glob.glob("*.csv")
             if not path.endswith("_noempty.csv")]
clear_empty(filenames)