In [1]:
import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

# Full OSM extract that the sampling loop below reads from.
OSM_FILE = "melbourne_australia.osm"  # Replace this with your osm file
# Destination file for the down-sampled extract.
SAMPLE_FILE = "sample1.osm"

# Sampling stride: only elements whose running index is a multiple of k are written.
k = 100 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield each fully parsed element whose tag is listed in `tags`.

    The file is parsed incrementally; once the consumer asks for the next
    element, the root element is cleared before parsing continues.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    parse_events = iter(ET.iterparse(osm_file, events=('start', 'end')))
    # The first event delivered is the start of the document root.
    root = next(parse_events)[1]
    for kind, node in parse_events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()


# Build the sample extract: keep one out of every k top-level elements.
# The handle is opened in binary mode, so everything written to it is a byte
# string; ET.tostring(..., encoding='utf-8') already returns encoded bytes.
with open(SAMPLE_FILE, 'wb') as output:
    output.write(b'<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write(b'<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write(b'</osm>')

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-


import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET

import cerberus

import schema

# Input OSM XML file that process_map() is run on.
OSM_PATH = "sample.osm"

# Output CSV paths, one file per writer created in process_map().
NODES_PATH = "nodes1.csv"
NODE_TAGS_PATH = "nodes_tags1.csv"
WAYS_PATH = "ways1.csv"
WAY_NODES_PATH = "ways_nodes1.csv"
WAY_TAGS_PATH = "ways_tags1.csv"

# Keys that begin with "<lowercase/underscore>:<lowercase/underscore>"; only the
# start of the key is anchored, so anything may follow the matched prefix.
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
# A single punctuation or whitespace character regarded as problematic in a tag key.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Validation schema, taken from the local `schema` module.
SCHEMA = schema.schema

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,NODE_TAGS_FIELDS=NODE_TAGS_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict.

    Args:
        element: parsed XML element; only 'node' and 'way' tags are shaped.
        node_attr_fields: attribute names copied from a node element.
        way_attr_fields: attribute names copied from a way element.
        NODE_TAGS_FIELDS: unused; kept so existing callers keep working.
        problem_chars: compiled regex; a tag whose key contains a match
            anywhere is left out of the result.
        default_tag_type: 'type' value for tags whose key does not start
            with a "prefix:rest" form recognised by LOWER_COLON.

    Returns:
        {'node': attribs, 'node_tags': [tag dicts]} for a node,
        {'way': attribs, 'way_nodes': [nd dicts], 'way_tags': [tag dicts]}
        for a way, and None for any other element.

    Raises:
        KeyError: if the element lacks one of the requested attributes.
    """
    if element.tag == 'node':
        node_attribs = {field: element.attrib[field] for field in node_attr_fields}
        tags = _shape_tags(element, problem_chars, default_tag_type)
        return {'node': node_attribs, 'node_tags': tags}

    if element.tag == 'way':
        way_attribs = {field: element.attrib[field] for field in way_attr_fields}
        # position is the zero-based index of each <nd> child, in document order.
        way_nodes = [
            {'id': element.attrib['id'],
             'node_id': nd.attrib['ref'],
             'position': position}
            for position, nd in enumerate(element.findall('nd'))
        ]
        tags = _shape_tags(element, problem_chars, default_tag_type)
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}

    return None


def _shape_tags(element, problem_chars, default_tag_type):
    """Return one dict per direct <tag> child of element, skipping problem keys."""
    shaped = []
    for tag in element.findall('tag'):
        key = tag.attrib['k']
        # search(), not match(): a problematic character anywhere in the key
        # disqualifies the tag, not only one in the first position.
        if problem_chars.search(key):
            continue
        if LOWER_COLON.match(key):
            # "addr:street" -> type "addr", key "street"; any further colons
            # stay inside the key.
            tag_type, key = key.split(':', 1)
        else:
            tag_type = default_tag_type
        shaped.append({
            'id': element.attrib['id'],
            'key': key,
            'value': tag.attrib['v'],
            'type': tag_type,
        })
    return shaped


# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield each fully parsed element whose tag is listed in `tags`.

    After the consumer resumes the generator, the root element is cleared
    before parsing continues.
    """
    parse_events = ET.iterparse(osm_file, events=('start', 'end'))
    # The first event delivered is the start of the document root.
    root = next(parse_events)[1]
    for kind, node in parse_events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise an Exception describing the first reported error when validation fails.

    Nothing happens (and None is returned) when
    validator.validate(element, schema) returns True.
    """
    if validator.validate(element, schema) is True:
        return

    # Only the first (field, errors) pair is reported.
    field, errors = next(validator.errors.iteritems())
    template = "\nElement of type '{0}' has the following errors:\n{1}"
    details = pprint.pformat(errors)
    raise Exception(template.format(field, details))


class UnicodeDictWriter(csv.DictWriter, object):
    """csv.DictWriter variant that UTF-8 encodes unicode values before writing"""

    def writerow(self, row):
        # Build an encoded copy of the row; non-unicode values pass through as-is.
        encoded = {}
        for field, value in row.iteritems():
            if isinstance(value, unicode):
                value = value.encode('utf-8')
            encoded[field] = value
        super(UnicodeDictWriter, self).writerow(encoded)

    def writerows(self, rows):
        # Route every row through writerow() so each one gets encoded.
        for record in rows:
            self.writerow(record)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s).

    Args:
        file_in: path of the OSM XML file to read.
        validate: when exactly True, every shaped element is checked with
            validate_element() before it is written.

    Side effects:
        (Re)creates the five CSV files named by NODES_PATH, NODE_TAGS_PATH,
        WAYS_PATH, WAY_NODES_PATH and WAY_TAGS_PATH, each with a header row.
    """

    # The Python 2 csv module emits its own '\r\n' row terminators and expects
    # a binary-mode file; in text mode Windows would rewrite each terminator
    # as '\r\r\n', which shows up as a blank line after every row.
    with codecs.open(NODES_PATH, 'wb') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'wb') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'wb') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'wb') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'wb') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            # shape_element() may return nothing for an element; skip those.
            if el:
                if validate is True:
                    validate_element(el, validator)

                if element.tag == 'node':
                    nodes_writer.writerow(el['node'])
                    node_tags_writer.writerows(el['node_tags'])
                elif element.tag == 'way':
                    ways_writer.writerow(el['way'])
                    way_nodes_writer.writerows(el['way_nodes'])
                    way_tags_writer.writerows(el['way_tags'])


# Entry point: convert OSM_PATH to the CSV files with validation switched on.
if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    process_map(OSM_PATH, validate=True)


In [16]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
"""
Your task is to explore the data a bit more.
The first task is a fun one - find out how many unique users
have contributed to the map in this particular area!

The function process_map should return a set of unique user IDs ("uid")
"""

def get_user(element):
    """Placeholder: ignores `element` and always returns None."""
    return None


def process_map(filename):
    """Return the set of distinct "uid" attribute values on node and way elements.

    Args:
        filename: path (or file object) of the OSM XML document to parse.

    Returns:
        A set of uid strings. Elements that have no "uid" attribute are
        skipped instead of raising KeyError.
    """
    users = set()
    for _, element in ET.iterparse(filename):
        if element.tag in ('node', 'way'):
            uid = element.attrib.get('uid')
            if uid is not None:
                # set.add() already ignores duplicates.
                users.add(uid)

    return users


def test():
    """Print the uids collected from 'sample1.osm', followed by how many there are."""
    unique_users = process_map('sample1.osm')
    pprint.pprint(unique_users)
    print(len(unique_users))



# Run the user count when the cell is executed as the main module.
if __name__ == "__main__":
    test()

set(['100257',
     '1003880',
     '1009527',
     '101437',
     '1016361',
     '102147',
     '102893',
     '103120',
     '103574',
     '10359',
     '1038504',
     '104757',
     '106990',
     '107612',
     '10906',
     '109093',
     '109694',
     '1101474',
     '110263',
     '11037',
     '1105905',
     '11111',
     '111529',
     '111865',
     '112105',
     '1122112',
     '112555',
     '112648',
     '112728',
     '113717',
     '114310',
     '116933',
     '1171733',
     '118021',
     '118498',
     '1185702',
     '1187510',
     '1193827',
     '119892',
     '1199841',
     '1202665',
     '121124',
     '1211859',
     '121281',
     '1213459',
     '12140',
     '12254',
     '1227089',
     '122844',
     '1229950',
     '1230627',
     '1232327',
     '1240849',
     '12434',
     '1247041',
     '1247726',
     '1247732',
     '1279506',
     '1292902',
     '1293194',
     '129966',
     '130472',
     '1306',
     '13271',
     '1335798',
     '13

In [8]:

import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

# OSM XML file audited by this cell.
OSMFILE = "sample.osm"
# Matches the final run of non-whitespace characters in a street name
# (optionally followed by a period), i.e. the street-type token.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


# Abbreviated street types this cell looks for. Note that audit_street_type()
# below collects the names whose ending IS in this list.
expected = ["St","St.","Rd.","Av","Gr",'Stg',"Rd"]

# Abbreviation -> full street type. Every entry of `expected` needs a
# replacement here, otherwise update_name() leaves that name untouched.
mapping = { "St": "Street",
            "St.": "Street",
            "Rd.":"Road",
            "Rd": "Road",
            "Av":"Avenue",
           "Gr":'Grove',
           'Stg':"Street"
            }


def audit_street_type(street_types, street_name):
    """Record street_name under its trailing token when that token is in `expected`.

    street_types maps a trailing token to the set of names ending in it and is
    updated in place; names without a match are ignored.
    """
    match = street_type_re.search(street_name)
    if not match:
        return
    suffix = match.group()
    if suffix in expected:
        street_types[suffix].add(street_name)


def is_street_name(elem):
    """Tell whether the element's "k" attribute is exactly "addr:street"."""
    key = elem.attrib['k']
    return key == "addr:street"


def audit(osmfile):
    """Collect the street names of node/way elements, grouped by street type.

    Args:
        osmfile: path of the OSM XML file to scan.

    Returns:
        defaultdict(set) filled by audit_street_type() from the value of
        every "addr:street" tag found under a node or way.
    """
    street_types = defaultdict(set)
    # Binary mode lets the XML parser apply the encoding declared in the file;
    # the with-block closes the handle even if parsing fails.
    with open(osmfile, "rb") as osm_file:
        # "end" events: an element's children are only guaranteed to be
        # attached once its closing tag has been parsed.
        for event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
    return street_types


def update_name(name, mapping):
    """Return name with its trailing street-type token expanded via mapping.

    Only the token matched by street_type_re at the end of the name is
    replaced, so the same letters earlier in the name are left alone. The
    name is returned unchanged when nothing matches or the token has no
    entry in mapping.
    """
    m = street_type_re.search(name)
    if m is None:
        return name

    street_type = m.group()
    if street_type in mapping:
        # The match always runs to the end of the string, so everything
        # before m.start() is the part of the name to keep.
        return name[:m.start()] + mapping[street_type]
    return name


def test():
    st_types = audit(OSMFILE)
    
    pprint.pprint(dict(st_types))

    for st_type, ways in st_types.iteritems():
        for name in ways:
            better_name = update_name(name, mapping)
            print name, "=>", better_name


# Run the audit and print the proposed fixes when executed as the main module.
if __name__ == '__main__':
    test()

{'Av': set(['Intrepid Av']),
 'Gr': set(['McCarthy Gr']),
 'St': set(['Queen St']),
 'Stg': set(['Leigh Stg'])}
Intrepid Av => Intrepid Avenue
Leigh Stg => Leigh Street
McCarthy Gr => McCarthy Grove
Queen St => Queen Street


In [7]:

import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

# OSM XML file audited by this cell.
OSMFILE = "example.osm"
# Matches the final run of non-whitespace characters in a street name
# (optionally followed by a period), i.e. the street-type token.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


# Street types considered correct; audit_street_type() collects names whose
# ending is NOT in this list.
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]

# Abbreviation -> full street type, consumed by update_name().
# UPDATE THIS VARIABLE
mapping = { "St": "Street",
            "St.": "Street",
            "Rd.":"Road",
            "Ave":"Avenue"
            }


def audit_street_type(street_types, street_name):
    """Record street_name under its trailing token when that token is not in `expected`.

    street_types maps a trailing token to the set of names ending in it and is
    updated in place; names without a match are ignored.
    """
    match = street_type_re.search(street_name)
    if not match:
        return
    suffix = match.group()
    if suffix not in expected:
        street_types[suffix].add(street_name)


def is_street_name(elem):
    """Tell whether the element's "k" attribute is exactly "addr:street"."""
    tag_key = elem.attrib['k']
    return tag_key == "addr:street"


def audit(osmfile):
    """Collect the street names of node/way elements, grouped by street type.

    Args:
        osmfile: path of the OSM XML file to scan.

    Returns:
        defaultdict(set) filled by audit_street_type() from the value of
        every "addr:street" tag found under a node or way.
    """
    street_types = defaultdict(set)
    # Binary mode lets the XML parser apply the encoding declared in the file;
    # the with-block closes the handle even if parsing fails.
    with open(osmfile, "rb") as osm_file:
        # "end" events: an element's children are only guaranteed to be
        # attached once its closing tag has been parsed.
        for event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
    return street_types


def update_name(name, mapping):
    """Return name with its trailing street-type token expanded via mapping.

    Only the token matched by street_type_re at the end of the name is
    replaced, so the same letters earlier in the name are left alone. The
    name is returned unchanged when nothing matches or the token has no
    entry in mapping.
    """
    m = street_type_re.search(name)
    if m is None:
        return name

    street_type = m.group()
    if street_type in mapping:
        # The match always runs to the end of the string, so everything
        # before m.start() is the part of the name to keep.
        return name[:m.start()] + mapping[street_type]
    return name


def test():
    st_types = audit(OSMFILE)
    assert len(st_types) == 3
    pprint.pprint(dict(st_types))

    for st_type, ways in st_types.iteritems():
        for name in ways:
            better_name = update_name(name, mapping)
            print name, "=>", better_name
            if name == "West Lexington St.":
                assert better_name == "West Lexington Street"
            if name == "Baldwin Rd.":
                assert better_name == "Baldwin Road"


# Run the audit and its checks when executed as the main module.
if __name__ == '__main__':
    test()

{'Ave': set(['N. Lincoln Ave', 'North Lincoln Ave']),
 'Rd.': set(['Baldwin Rd.']),
 'St.': set(['West Lexington St.'])}
N. Lincoln Ave => N. Lincoln Avenue
North Lincoln Ave => North Lincoln Avenue
West Lexington St. => West Lexington Street
Baldwin Rd. => Baldwin Road


In [3]:
"""
Your task in this exercise has two steps:

- audit the OSMFILE and change the variable 'mapping' to reflect the changes needed to fix 
    the unexpected street types to the appropriate ones in the expected list.
    You have to add mappings only for the actual problems you find in this OSMFILE,
    not a generalized solution, since that may and will depend on the particular area you are auditing.
- write the update_name function, to actually fix the street name.
    The function takes a string with street name as an argument and should return the fixed name
    We have provided a simple test so that you see what exactly is expected
"""
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

# OSM XML file audited by this cell.
OSMFILE = "sample.osm"
# Matches the final run of non-whitespace characters of a value
# (optionally followed by a period).
code_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


# Postcodes treated as expected; audit_postal_type() collects every other value.
expected = ["3000","3001","3002"]


def audit_postal_type(postal_types, postal_code):
    """Record postal_code under its trailing token when that token is not in `expected`.

    postal_types maps a trailing token to the set of raw values ending in it
    and is updated in place; values without a match are ignored.
    """
    match = code_type_re.search(postal_code)
    if not match:
        return
    token = match.group()
    if token not in expected:
        postal_types[token].add(postal_code)


def is_postal_code(elem):
    """Tell whether the element's "k" attribute is exactly "addr:postcode"."""
    key = elem.attrib['k']
    return key == "addr:postcode"


def audit(osmfile):
    """Collect the postcode values of node/way elements, grouped by trailing token.

    Args:
        osmfile: path of the OSM XML file to scan.

    Returns:
        defaultdict(set) filled by audit_postal_type() from the value of
        every "addr:postcode" tag found under a node or way.
    """
    postal_types = defaultdict(set)
    # Binary mode lets the XML parser apply the encoding declared in the file;
    # the with-block closes the handle even if parsing fails.
    with open(osmfile, "rb") as osm_file:
        # "end" events: an element's children are only guaranteed to be
        # attached once its closing tag has been parsed.
        for event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_postal_code(tag):
                        audit_postal_type(postal_types, tag.attrib['v'])
    return postal_types



def test():
    """Run audit() on OSMFILE and pretty-print its result as a plain dict."""
    flagged_codes = audit(OSMFILE)
    pprint.pprint(dict(flagged_codes))
           


# Run the postcode audit when executed as the main module.
if __name__ == '__main__':
    test()

{'3003': set(['3003']),
 '3004': set(['3004']),
 '3006': set(['3006']),
 '3006;3130': set(['3006;3130']),
 '3008': set(['3008']),
 '3010': set(['3010']),
 '3011': set(['3011']),
 '3012': set(['3012']),
 '3013': set(['3013']),
 '3018': set(['3018']),
 '3019': set(['3019']),
 '3020': set(['3020']),
 '3021': set(['3021']),
 '3023': set(['3023']),
 '3025': set(['3025']),
 '3026': set(['3026']),
 '3027': set(['3027']),
 '3028': set(['3028']),
 '3029': set(['3029']),
 '3030': set(['3030']),
 '3031': set(['3031']),
 '3032': set(['3032']),
 '3033': set(['3033']),
 '3037': set(['3037']),
 '3038': set(['3038']),
 '3039': set(['3039']),
 '3040': set(['3040']),
 '3042': set(['3042']),
 '3043': set(['3043']),
 '3044': set(['3044']),
 '3045': set(['3045']),
 '3047': set(['3047']),
 '3051': set(['3051']),
 '3052': set(['3052']),
 '3053': set(['3053']),
 '3054': set(['3054']),
 '3055': set(['3055']),
 '3056': set(['3056']),
 '3057': set(['3057']),
 '3058': set(['3058']),
 '3060': set(['3060']),
 '3061

In [5]:
from pprint import pprint
import os
from hurry.filesize import size

# Directory whose files are listed. A raw string keeps every backslash of the
# Windows path literal instead of letting it start an escape sequence.
dirpath = r'C:\Users\AKANKSHA\Desktop\udacity\openstreet project\sqlite_windows'

# (filename, human-readable size) for every file found under dirpath.
files_list = []
for path, dirs, files in os.walk(dirpath):
    for filename in files:
        full_path = os.path.join(path, filename)
        files_list.append((filename, size(os.path.getsize(full_path))))

# The loop variable must not be called `size`: that would rebind the imported
# size() helper and break the loop above the next time it runs.
for filename, readable_size in files_list:
    print('{:.<40s}: {:5s}'.format(filename, readable_size))

.DS_Store...............................: 6K   
melbourne_australia.osm.................: 818M 
nodes.csv...............................: 3M   
nodes.db................................: 2M   
nodes_tags.csv..........................: 287K 
node_tags.db............................: 305K 
sample.osm..............................: 82M  
sample1.osm.............................: 8M   
sqlite3.exe.............................: 655K 
ways.csv................................: 309K 
ways.db.................................: 292K 
ways.db,nodes.db........................: 0B   
ways.db,nodes.db;.......................: 0B   
ways_nodes.csv..........................: 1M   
ways_tags.csv...........................: 483K 
ways_tags.db............................: 508K 
