# OpenStreetMap Case Study

## Introduction
<p>I am to investigate data set from a location of my choosing from openstreetmap, identify problems, clean it and store the data in SQL. In addition, I am to propose ideas on how to improve the data. This investigation is a practice project needed to complete the Data Analyst Nanodegree from Udacity.
</p>

### Location
<p>I chose Auckland, New Zealand as my location for my investigation because I have been planning to take a trip here for sometime now. I would like to take the opportunity to get myself familiar with the place by using it as an example for this project</p>
- [www.openstreetmap.org/node/292806332](https://www.openstreetmap.org/node/292806332)

In [4]:
from collections import defaultdict
import xml.etree.cElementTree as ET
import re
import pprint

file_sample = "auckland_new-zealand-sample.osm"
file_actual = "auckland_new-zealand.osm"

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

expected = ["Avenue", "Crescent", "Drive", "Highway", "Lane", "Place", "Road", "Street", "Way"]

mapping = {
    "street": "Street",
    "st": "Street",
    "st.": "Street",
    "rd": "Road",
    "road": "Road",
    "strret": "Street",
    "cr": "Crescent",
    "cresent": "Crescent",
    "crest": "Crescent",
    "hwy": "Highway",
    "ave": "Avenue",
    "plc,": "Place",
    "beach": "Beach",
    "way": "Way",
    "ln": "Lane"
}

def update_name(name, mapping):
    name_a = name.split(" ")

    for w in range(len(name_a)):
        if name_a[w].lower() in mapping.keys():
            name_a[w] = mapping[name_a[w].lower()]
    name = " ".join(name_a)
    
    return name
            
def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            new_name = update_name(street_name, mapping)
            street_types[street_type].add(new_name)
    
def is_street_name(elem):
    return (elem.attrib['k'] == 'addr:street')

def audit(osmfile):
    osm_file = open(osmfile, 'r')
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=('start',)):
        if elem.tag == "way" or elem.tag == 'node':
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types

st_types = audit(actual_sample)

pprint.pprint(dict(st_types))

{'0632': set(['15 Arrenway Dr, Rosedale, Auckland 0632']),
 u'1010\u65b0\u897f\u862d': set([u'38 Lorne St, Auckland, 1010\u65b0\u897f\u862d']),
 '16': set(['State Highway 16']),
 '2': set(['State Highway 2']),
 '22': set(['State Highway 22']),
 '26': set(['26']),
 'Auckland': set(['Exmouth Road, Northcote, Auckland']),
 'Ave': set(['Brennan Avenue',
             'Delta Avenue',
             'Erson Avenue',
             'Gillies Avenue',
             'Vitasovich Avenue',
             'Waverley Avenue']),
 'Broadway': set(['Broadway']),
 'Circle': set(['Leybourne Circle']),
 'Close': set(['Challen Close', 'Court Town Close', 'Regia Close']),
 'Coronation': set(['Coronation']),
 'Court': set(['Fantail Court', 'Palm Court', 'Palmgreen Court']),
 'Cove': set(['Clearwater Cove']),
 'Cr': set(['Marjorie Jayne Crescent']),
 'Cresent': set(['Tawa Crescent']),
 'Crest': set(['The Crescent']),
 'East': set(['Customs Street East',
              'Durham Street East',
              'Greenlane East',

## Problems Encountered in Your Map
<p>Running the data set through the code, I explored the data and took notes of some problems that I have encountered in the map that I have chosen.</p>
- Mispelled Names
- Incorrect Capitalization
- Abbreviated Names
- Problematic Format

In [2]:
import csv
import codecs
import cerberus
import schema

NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

SCHEMA = schema.schema

NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

def shape_element(element, node_attr_fields=NODE_FIELDS, 
                  way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, 
                  default_tag_type='regular'):
    node_attribs = {}
    way_attribs = {}
    way_nodes = []
    tags = []
    
    if element.tag == 'node':
        for elem in NODE_FIELDS:
            if element.get(elem):
                node_attribs[elem] = element.attrib[elem]
            else:
                return 
        for elem in element:
            item = {}
            if PROBLEMCHARS.match(elem.attrib['k']):
                continue
            elif LOWER_COLON.match(elem.attrib['k']):
                item['id'] = element.attrib['id']
                item['key'] = elem.attrib['k'].split(':')[1]
                item['type'] = elem.attrib['k'].split(':')[0]
                if is_street_name(elem):
                    item['value'] = update_name(elem.attrib['v'], mapping)
                else:
                    item['value'] = elem.attrib['v']
            else:
                item['id'] = element.attrib['id']
                item['key'] = elem.attrib['k']
                item['type'] = 'regular'
                if is_street_name(elem):
                    item['value'] = update_name(elem.attrib['v'], mapping)
                else:
                    item['value'] = elem.attrib['v']
            tags.append(item)
        return {'node': node_attribs, 'node_tags': tags}
    
    if element.tag == 'way':
        i = 0
        for elem in element.attrib:
            if elem in WAY_FIELDS:
                way_attribs[elem] = element.attrib[elem]
        for elem in element:
            item= {}
            item_nd = {}
            if elem.tag == "tag":
                if LOWER_COLON.match(elem.attrib["k"]):
                    item["id"] = element.attrib["id"]
                    item["key"] = elem.attrib["k"].split(":", 1)[1]
                    item["type"] = elem.attrib["k"].split(":", 1)[0]
                    if is_street_name(elem):
                        item['value'] = update_name(elem.attrib['v'], mapping)
                    else:
                        item["value"] = elem.attrib["v"]
                else:
                    item["id"] = element.attrib["id"]
                    item["key"] = elem.attrib["k"]
                    item["type"] = "regular"
                    if is_street_name(elem):
                        item['value'] = update_name(elem.attrib['v'], mapping)
                    else:
                        item["value"] = elem.attrib["v"]
                tags.append(item)
                
            if elem.tag == "nd":
                item_nd["id"] = int(element.attrib["id"])
                item_nd["node_id"] = int(elem.attrib["ref"])
                item_nd["position"] = i
                i += 1
                
                way_nodes.append(item_nd)
        return {"way": way_attribs, "way_nodes": way_nodes, "way_tags": tags}
                    

def get_element(osm_file, tags=('node', 'way', 'relation')):
    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()
            
def validate_element(element, validator, schema=SCHEMA):
    """Raise ValidationError if element does not match schema"""
    if validator.validate(element, schema) is not True:
        field, errors = next(validator.errors.iteritems())
        message_string = "\nElement of type '{0}' has the following errors:\n{1}"
        error_string = pprint.pformat(errors)
        
        raise Exception(message_string.format(field, error_string))
            
class UnicodeDictWriter(csv.DictWriter, object):
    def writerow(self,row):
        super(UnicodeDictWriter, self).writerow({
            k: (v.encode('utf-8') if isinstance(v, unicode) else v) for k, v, in row.iteritems()
        })
        
    def writerows(self, rows):
        for row in rows:
            self.writerow(row)            
    
def process_map(file_in, validate):
    
    with codecs.open(NODES_PATH, "w") as nodes_file, \
    codecs.open(NODE_TAGS_PATH, "w") as node_tags_file, \
    codecs.open(WAYS_PATH, "w") as ways_file, \
    codecs.open(WAY_NODES_PATH, "w") as way_nodes_file, \
    codecs.open(WAY_TAGS_PATH, "w") as way_tags_file:
        
        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(node_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)
        
        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()
        
        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            if el:
                if validate is True:
                    validate_element(el, validator)
                if element.tag == 'node':
                    nodes_writer.writerow(el['node'])
                    node_tags_writer.writerows(el['node_tags'])
                elif element.tag == "way":
                    ways_writer.writerow(el['way'])
                    way_nodes_writer.writerows(el["way_nodes"])
                    way_tags_writer.writerows(el["way_tags"])
#         pprint.pprint(el)
    
process_map(file_actual, validate=True)

In [None]:
import sqlite3
import csv
from pprint import pprint

sqlite_file = 'project.db'
db = sqlite3.connect(sqlite_file) # connects to database
cur = db.cursor()

cur.execute('''DROP TABLE IF EXISTS nodes_tags''')
cur.execute('''DROP TABLE IF EXISTS ways''')
cur.execute('''DROP TABLE IF EXISTS ways_nodes''')
cur.execute('''DROP TABLE IF EXISTS ways_tags''')
cur.execute('''DROP TABLE IF EXISTS nodes''')
db.commit()

cur.execute('''CREATE TABLE nodes_tags(id INTEGER, key TEXT, value TEXT, type TEXT)''')
cur.execute('''CREATE TABLE ways(id INTEGER, user TEXT, uid INTEGER, version TEXT, changeset INTEGER, timestamp TEXT)''')
cur.execute('''CREATE TABLE ways_nodes(id INTEGER, node_id INTEGER, position INTEGER)''')
cur.execute('''CREATE TABLE ways_tags(id INTEGER, key TEXT, value TEXT, type TEXT)''')
cur.execute('''CREATE TABLE nodes(id INTEGER, lat REAL, lon REAL, user TEXT, uid INTEGER, version TEXT, changeset INTEGER, timestamp TEXT)''')
db.commit()

FILE_CSV = ['nodes_tags.csv', 'ways.csv', 'ways_nodes.csv', 'ways_tags.csv', 'nodes.csv']

def read_csv(file_csv):
    with open(file_csv, 'rb') as f:
        dr = csv.DictReader(f)
        for x in dr:
            print (x)

        db = [(
            i['id'], 
            i['key'], 
            i['value'].decode('utf-8'), 
            i['type']) for i in dr]
    return db

nt_db = read_csv('nodes_tags.csv')

with open('ways.csv', 'rb') as f:
    dr = csv.DictReader(f)
    wa_db = [(i['id'], i['user'].decode('utf-8'), i['uid'], i['version'], i['changeset'], i['timestamp']) for i in dr]

with open('ways_nodes.csv', 'rb') as f:
    dr = csv.DictReader(f)
    wn_db = [(i['id'], i['node_id'], i['position']) for i in dr]
    
with open('ways_tags.csv', 'rb') as f:
    dr = csv.DictReader(f)
    wt_db = [(i['id'], i['key'], i['value'].decode('utf-8'), i['type']) for i in dr]
        
with open('nodes.csv', 'rb') as f:
    dr = csv.DictReader(f)
    no_db = [(i['id'], i['lat'], i['lon'], i['user'].decode('utf-8'), i['uid'], i['version'], i['changeset'], i['timestamp']) for i in dr]

cur.executemany('INSERT INTO nodes_tags(id, key, value, type) VALUES (?, ?, ?, ?);', nt_db)
cur.executemany('INSERT INTO ways(id, user, uid, version, changeset, timestamp) VALUES (?, ?, ?, ?, ?, ?);', wa_db)
cur.executemany('INSERT INTO ways_nodes(id, node_id, position) VALUES (?, ?, ?);', wn_db)
cur.executemany('INSERT INTO ways_tags(id, key, value, type) VALUES (?, ?, ?, ?);', wt_db)
cur.executemany('INSERT INTO nodes(id, lat, lon, user, uid, version, changeset, timestamp) VALUES (?, ?, ?, ?, ?, ?, ?, ?);', no_db)
db.commit()

cur.execute('SELECT * FROM nodes_tags')
all_rows = cur.fetchone()
pprint(all_rows)

db.close()

{'value': 'node', 'type': 'regular', 'id': '715398', 'key': 'class'}
{'value': 'node', 'type': 'regular', 'id': '715399', 'key': 'class'}
{'value': 'traffic_signals', 'type': 'regular', 'id': '715515', 'key': 'highway'}
{'value': 'Merkaartor 0.13', 'type': 'regular', 'id': '715597', 'key': 'created_by'}
{'value': 'traffic_signals', 'type': 'regular', 'id': '715705', 'key': 'highway'}
{'value': 'traffic_signals', 'type': 'regular', 'id': '715726', 'key': 'highway'}
{'value': 'traffic_signals', 'type': 'regular', 'id': '715742', 'key': 'highway'}
{'value': '437', 'type': 'regular', 'id': '715784', 'key': 'ref'}
{'value': 'South Eastern Highway', 'type': 'regular', 'id': '715784', 'key': 'name'}
{'value': 'motorway_junction', 'type': 'regular', 'id': '715784', 'key': 'highway'}
{'value': '438', 'type': 'regular', 'id': '715804', 'key': 'ref'}
{'value': 'Mount Wellington Highway', 'type': 'regular', 'id': '715804', 'key': 'name'}
{'value': 'motorway_junction', 'type': 'regular', 'id': '715

In [10]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

def key_type(element,keys):
    if element.tag == 'tag':
        query = element.attrib['k']
        if lower.search(query):
            keys['lower'] += 1
        elif lower_colon.search(query):
            keys['lower_colon'] += 1
        elif problemchars.search(query):
            keys['problemchars'] += 1
        else:
            keys['other'] += 1
        pass
    return keys

def process_map(filename):
    keys = {'lower': 0, 'lower_colon': 0, 'problemchars': 0, 'other': 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)
    return keys

keys = process_map(file_current)
print keys

def count_tags(filename):
    tags = {}
    
    for i, elem in ET.iterparse(filename):
        if elem.tag not in tags.keys():
            tags[elem.tag] = 1
        else:
            tags[elem.tag] += 1
    return tags

tags = count_tags(file_current)
print tags

{'problemchars': 0, 'lower': 88852, 'other': 76919, 'lower_colon': 1960}
