# P3: Wrangle OpenStreetMap Data

In [4]:
import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

# Input extract and the name of the reduced sample derived from it.
# Point OSM_FILE at your own .osm file to sample a different area.
OSM_FILE = 'kolkata.osm'
SAMPLE_FILE = 'kolkata_sample.osm'

# Sampling stride: take every k-th top level element.
k = 10

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield the elements of ``osm_file`` whose tag is listed in ``tags``.

    The file is parsed incrementally with start and end events. The element
    delivered by the very first event is kept as the root; every later element
    is yielded once its end event arrives and its tag is one of ``tags``.
    When the consumer asks for the next item, the root is cleared so elements
    that were already handed out do not pile up in memory.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    parse_events = iter(ET.iterparse(osm_file, events=('start', 'end')))
    # The first event is the start of the document's root element.
    root = next(parse_events)[1]
    for kind, node in parse_events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        # Runs only after the caller is done with `node` and resumes us.
        root.clear()


# Build SAMPLE_FILE by copying every k-th top level element of OSM_FILE.
# The output file is opened in binary mode, so everything written to it must
# be a byte string: the literals carry a b prefix (a plain str on Python 2,
# bytes on Python 3) and ET.tostring(..., encoding='utf-8') already returns
# encoded bytes. Writing un-prefixed text literals here raises TypeError on
# Python 3.
with open(SAMPLE_FILE, 'wb') as output:
    output.write(b'<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write(b'<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write(b'</osm>')

In [19]:
import xml.etree.ElementTree as ET 
import re


# Regular expressions used by the audit code.

# Key made up solely of lowercase ASCII letters and underscores.
lower = re.compile(r'^([a-z]|_)*$')
# Same alphabet, but split into two parts by exactly one colon.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Any single character from the set treated as problematic in a key.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
# Final whitespace-free token of a string (taken as the street type).
street_type_re = re.compile(r'\b\S+\.?$', flags=re.IGNORECASE)

#Function to check valid Post Codes
def isPostCodeValid(postcode):
    """Return True when ``postcode`` consists of exactly six ASCII digits.

    A bare length check would also accept values such as ``'abcdef'`` or
    ``'7000 1'``, so every character is additionally required to be one of
    ``0``-``9``. ``None`` (no value available) is reported as invalid instead
    of raising.

    NOTE(review): the six-digit rule is taken from the original length check;
    it presumably reflects the PIN code format of the Kolkata data set.

    Args:
        postcode: the postcode text to check, or None.

    Returns:
        bool: True for a six-digit postcode, False otherwise.
    """
    if postcode is None:
        return False
    return len(postcode) == 6 and all(ch in '0123456789' for ch in postcode)


#Get street type
def get_street_type(street_name):
    """Return the trailing token of ``street_name`` matched by ``street_type_re``.

    Args:
        street_name: the street name text to inspect.

    Returns:
        The matched text when the pattern is found, otherwise ``False``.
    """
    found = street_type_re.search(street_name)
    return found.group() if found else False
            



In [21]:
def key_type(element, keys):
    """Classify the ``k`` attribute of a ``tag`` element and update ``keys``.

    Elements whose tag is not ``"tag"`` are ignored. For a ``tag`` element the
    key falls into the first category that applies:

    * ``lower``        - matches the ``lower`` pattern
    * ``lower_colon``  - matches the ``lower_colon`` pattern
    * ``problemchars`` - contains a character from ``problemchars`` anywhere
    * ``other``        - none of the above; the key is also printed

    Args:
        element: an ElementTree element.
        keys: dict holding the four counters above; it is updated in place.

    Returns:
        The same ``keys`` dict.
    """
    if element.tag == "tag":
        key_text = element.attrib['k']
        if lower.match(key_text):
            keys['lower'] += 1
        elif lower_colon.match(key_text):
            keys['lower_colon'] += 1
        # search(), not match(): match() only tests the first character, so a
        # problem character later in the key would be counted as "other".
        elif problemchars.search(key_text):
            keys['problemchars'] += 1
        else:
            keys['other'] += 1
            # Parenthesised single argument prints the same on Python 2 and 3.
            print(key_text)

    return keys

# Audit pass over the sample file: count tag keys per category and collect
# every distinct "key : value" pair whose key mentions 'addr'.
keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
# Abbreviated street type -> full word.
mapping = { "St": "Street",
            "St.": "Street",
            "Rd":"Road",
            "Rd.":"Road",
            "Ave":"Avenue",
            "Ave.":"Avenue"
            }
names=set()
et_parse_data=ET.iterparse('kolkata_sample.osm')
count=0
for _, elem in et_parse_data:
    count += 1
    keys = key_type(elem, keys)
    # iterparse reports an end event for every element, including each <tag>
    # itself. Handling only those events visits every tag exactly once;
    # scanning elem.iter('tag') on each event re-read the same tags when
    # their parent and the root closed, repeating the "wrong ->" lines.
    if elem.tag != 'tag':
        continue
    tag_key = elem.attrib['k']
    tag_value = elem.attrib['v']
    if 'addr' in tag_key:
        if 'postcode' in tag_key and not isPostCodeValid(tag_value):
            # Two spaces after the arrow keep the text identical to what the
            # Python 2 statement `print 'wrong -> ', value` produced.
            print('wrong ->  ' + tag_value)
        names.add(tag_key + ' : ' + tag_value)


police
police
school
school
marketplace
marketplace
fuel
fuel
hospital
hospital
restaurant
restaurant
restaurant
restaurant
pharmacy
pharmacy
hospital
hospital
university
university
fuel
fuel
bank
bank
library
library
atm
atm
bank
bank
office
office
townhall
townhall
college
college
restaurant
restaurant
fast_food
fast_food
community_centre
community_centre
atm
atm
cafe
cafe
cinema
cinema
pharmacy
pharmacy
college
college
cafe
cafe
place_of_worship
place_of_worship
phone_1
phone_2
phone_3
atm
atm
restaurant
restaurant
bus_station
bus_station
place_of_worship
place_of_worship
toilets
toilets
bus_station
bus_station
bank
bank
fast_food
fast_food
place_of_worship
place_of_worship
restaurant
restaurant
college
college
school
school
hospital
hospital
hospital
hospital
place_of_worship
place_of_worship
school
school
college
college
school
school
school
school
hospital
hospital
library
library
college
college
fuel
fuel
fuel
fuel
cinema
cinema
place_of_worship
place_of_worship
bank
bank
cinema

In [16]:
names

{'addr:city : Kolkata',
 'addr:city : Salt Lake (Bidhan Nagar)',
 'addr:city : Saltlake (Bidhannagar)',
 'addr:country : IN',
 'addr:housename : 6th Cross Road, DJ Block, Salt Lake CityKolkata, WB 700091',
 u'addr:housename : EE Block, Salt Lake City Kolkata, West Bengal 700091\u200e',
 u'addr:housename : HC Block, Sector 3, Bidhannagar, Kolkata, West Bengal, 700106\u200e',
 'addr:housename : IB 175, Sector III, Salt Lake City Kolkata',
 'addr:housename : Sector III IB 201, IB Block, Salt Lake City, Kolkata, West Bengal',
 'addr:housename : The French Loaf',
 'addr:housenumber : #4A',
 'addr:housenumber : 102',
 'addr:housenumber : 106/A/3',
 'addr:housenumber : 110/5/L',
 'addr:housenumber : 111',
 'addr:housenumber : 117',
 'addr:housenumber : 118',
 'addr:housenumber : 12/A',
 'addr:housenumber : 120',
 'addr:housenumber : 128',
 'addr:housenumber : 134',
 'addr:housenumber : 137',
 'addr:housenumber : 138',
 'addr:housenumber : 14',
 'addr:housenumber : 144',
 'addr:housenumber : 1