# P3 Wrangle Open Street Map Data

## 1 Initial Parse of the file

In [22]:
import xml.etree.cElementTree as ET
import pprint

In [21]:
filename_short = "/media/removable/Elements/Udacity Data Monging Course/dublin_short.osm"
filename_addr = "/media/removable/Elements/Udacity Data Monging Course/elephants_ear.osm"
filename = "/media/removable/Elements/Udacity Data Monging Course/dublin_ireland.osm"

In [53]:
 def count_tags_k(filename):
    # initialize dict objects and counter
    tag_count = {}
    tag_keys = {}
    counter = 0

    for _, element in ET.iterparse(filename, events=("start",)):
        # add to tag count
        if element.tag in tag_count:
            tag_count[element.tag] += 1
        else:
            tag_count[element.tag] = 1

        # if tag and has key, add the tag key to tag_keys dict
        if element.tag == 'tag' and 'k' in element.attrib:
            #add_tag(element.get('k'), tag_keys)
            if element.get('k') in tag_keys:
                tag_keys[element.get('k')] += 1
            else:
                tag_keys[element.get('k')] = 1

    # produces sorted-by-decreasing lists of tag key-count pairs
    tag_count = sorted(tag_count.items(), key=lambda t: t[1], reverse = True)
    tag_keys = sorted(tag_keys.items(), key=lambda t: t[1], reverse = True)

    # return values
    return tag_count, tag_keys

In [55]:
count_tags_k(filename)

([('nd', 1450408),
  ('node', 1068565),
  ('tag', 764612),
  ('way', 187706),
  ('member', 59217),
  ('relation', 3391),
  ('bounds', 1),
  ('osm', 1)],
 [('building', 99787),
  ('addr:street', 82432),
  ('addr:housenumber', 71061),
  ('highway', 66932),
  ('name', 45890),
  ('house', 42879),
  ('building:levels', 37375),
  ('building:roof:shape', 29419),
  ('maxspeed', 24675),
  ('addr:city', 13774),
  ('created_by', 13301),
  ('levels', 12457),
  ('name:ga', 12396),
  ('landuse', 10206),
  ('amenity', 9990),
  ('oneway', 7947),
  ('ref', 7858),
  ('natural', 7732),
  ('barrier', 6959),
  ('building:use', 6877),
  ('source', 6783),
  ('building:cladding', 5950),
  ('operator', 5782),
  ('name:en', 5725),
  ('service', 5581),
  ('building:roof', 4579),
  ('access', 4128),
  ('surface', 4095),
  ('addr:country', 3732),
  ('boundary', 3469),
  ('type', 3417),
  ('shop', 3388),
  ('foot', 3291),
  ('leisure', 3168),
  ('traffic_calming', 3089),
  ('building:roof_shape', 2635),
  ('admin_l

## 2 Audit tags

In [16]:
import re

In [17]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

In [18]:
def key_type(element, keys):
    if element.tag == "tag":
        if lower.search(element.attrib['k']):
            keys['lower'] += 1
        elif lower_colon.search(element.attrib['k']):
            keys['lower_colon'] += 1
        elif problemchars.search(element.attrib['k']):
            keys['problemchars'] += 1
        else:
            keys['other'] += 1
    return keys

In [19]:
def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys

In [23]:
keys = process_map(filename)
pprint.pprint(keys)

{'lower': 472762, 'lower_colon': 257464, 'other': 34386, 'problemchars': 0}


## 3 Check out user contributions

In [40]:
def process_map(filename):
    users = {}
    for _, element in ET.iterparse(filename):
        if 'user' in element.attrib.keys():
            if element.attrib['user'] not in users.keys():
                users[element.attrib['user']] = 1
            else:
                users[element.attrib['user']] += 1
    return users

In [41]:
users = process_map(filename)
users_sorted = sorted(users.items(), key=lambda t: t[1], reverse = True)
print "There are " + str(len(users)) + " contributors to the dataset."
print "The top contributors are:"
pprint.pprint(users_sorted[0:10])

There are 1122 contributors to the dataset.
The top contributors are:
[('Nick Burrett', 236906),
 ('mackerski', 183786),
 ('brianh', 156316),
 ('Dafo43', 150926),
 ('Conormap', 64014),
 ('Ignobilis', 52864),
 ('VictorIE', 48672),
 ('Autarch', 21808),
 ('wigs', 20793),
 ('Blazejos', 19574)]


## 4 Audit Addresses

### 4.1 Audit Street Addresses

#### 4.1.1 Import and predefine methods and variables 

In [10]:
from collections import defaultdict

In [11]:
get_last_word = re.compile(r'\b\S+\.?$', re.IGNORECASE)


expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons", "Centre", "Close", "Crescent", "Grove", "Garden", "Gardens", "Green",
            "Heights", "Hill", "Lawn", "Lawns", "Park", "Quay", "Row", "Terrace", "Wood", "Way", "Walk", "Upper",
            "Lower", "North", "South", "East", "West"]

mapping = { "St.": "Street",
            "St": "Street",
            "Sreet": "Street",
            "Rd.": "Road",
            "Ave": "Avenue",
            "Avevnue": "Avenue",
            "Center": "Centre",
            "Cente": "Centre"
            }

#### 4.1.2 Helper functions for Street Address Audit 

In [12]:
def audit_street_type(street_types, street_name):
    m = get_last_word.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)

In [13]:
def is_street_name(elem):
    return (elem.attrib['k'] == "adr:street")

In [14]:
def audit_streets(filename):
    osm_file = open(filename, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])

    return street_types

In [15]:
def update_streets(name, mapping, st_types):
    
    # Loop through dictionary and correct pre-defined mistakes:
    for st_type, value_set in st_types.iteritems():
        if name in value_set:
            # Ensure all words start with a capital letter: "lord edward's st" -> "Lord Edward'S St"
            name = name.title()
    
            # Rewrite "'S" to "'s": "Lord Edward'S St" -> "Lord Edward's St"
            name = re.sub('\'S', '\'s', name)
            
            # Correct common mistake from mapping dict: "Lord Edward's St -> "Lord Edward's Street"
            for key in mapping:       
                if name.find(key) != -1:
                    name = name[:name.find(key)]+mapping[key]+name[name.find(key)+len(str(key)):]
    return name

#### 4.1.3 Street Address Audit

In [16]:
st_types = audit_streets(filename)
pprint.pprint(dict(st_types))

{'1-13': set(['The Rise 1-13']),
 '1-9': set(['Manor Court 1-9']),
 '10-21': set(['Manor Court 10-21']),
 '14-28': set(['The Rise 14-28']),
 '26': set(['26']),
 '27-31': set(['Supple Park 27-31']),
 '32-39': set(['Supple Park 32-39']),
 '4': set(['Serpentine Avenue, Ballsbridge, Dublin 4']),
 '40-44': set(['Supple Park 40-44']),
 '48-': set(['Supple Park 48-']),
 'Abbey': set(['Fonthill Abbey', "Mary's Abbey", 'Seachnall Abbey']),
 'Airport': set(['Dublin Airport']),
 'Albany': set(['Mount Albany']),
 'Alders': set(['The Alders']),
 'Alley': set(['Copper Alley',
               'Crown Alley',
               'Lamb Alley',
               'Smock Alley',
               "Swift's Alley",
               'Thundercut Alley']),
 'Apartments': set(['Shelbourne Park Apartments']),
 'Archerswood': set(['Archerswood']),
 'Ardglas': set(['Ardglas']),
 'Arundel': set(['Arundel']),
 'Ashurst': set(['Ashurst']),
 'Aspencourt': set(['Aspencourt']),
 'Ave': set(['First Ave', 'Griffith Ave', 'Spruce Ave']),

#### 4.1.3 Street Address Update Function Test

In [20]:
for st_type, ways in st_types.iteritems():
    for name in ways:
        better_name = update_streets(name, mapping, st_type)
        print name, "=>", better_name

The Gables => The Gables
Mount Albany => Mount Albany
Westhaven => Westhaven
Brooklawn => Brooklawn
Estate Cottages => Estate Cottages
Kilternan Cottages => Kilternan Cottages
Jamestown Cottages => Jamestown Cottages
Foxside Cottages => Foxside Cottages
Emerald Cottages => Emerald Cottages
Shamrock Cottages => Shamrock Cottages
Mounttown Cottages => Mounttown Cottages
Saint Brocs Cottages => Saint Brocs Cottages
Ballyedmonduff Cottages => Ballyedmonduff Cottages
Dodsboro Cottages => Dodsboro Cottages
Stradbrook Cottages => Streetradbrook Cottages
Bohernabreena Cottages => Bohernabreena Cottages
Bank Side Cottages => Bank Side Cottages
Riverside Cottages => Riverside Cottages
Priestfield Cottages => Priestfield Cottages
Golden Ball Cottages => Golden Ball Cottages
Harold's Cross Cottages => Harold's Cross Cottages
Pembroke Cottages => Pembroke Cottages
Glenamuck Cottages => Glenamuck Cottages
Wayside Cottages => Wayside Cottages
Railway Cottages => Railway Cottages
Woodside Cottages => 

### 4.2 Postal Code Audit

#### 4.2.1 Helper functions for Postal Code Audit 

In [24]:
def audit_pcode_type(pcode_types, pcode):
    if pcode:
        if pcode.startswith("Dublin"):
            pcode_types["Dublin"].add(pcode)
        elif pcode.startswith("D") and len(str(pcode)) in (2,3):
            pcode_types["D"].add(pcode)
        elif len(str(pcode)) == 2 and pcode[0] == "0":
            pcode_types["0"].add(pcode)
        elif len(pcode.split(" ")) > 1:
            pcode_types["Two items"].add(pcode)
        elif len(str(pcode)) > 2:
            pcode_types["too long"].add(pcode)
        elif len(str(pcode)) <= 2:
            try:
                int(pcode)
                pass
            except:
                pcode_types[pcode].add(pcode)
        else:
            pcode_types[pcode].add(pcode)

In [25]:
def is_postal_code(elem):
    return (elem.attrib['k'] == "addr:postcode")

In [26]:
def audit_pcodes(filename):
    filename = open(filename, "r")
    pcode_types = defaultdict(set)
    for event, elem in ET.iterparse(filename, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_postal_code(tag):
                    audit_pcode_type(pcode_types, tag.attrib['v'])
    
    return pcode_types

In [28]:
def update_pcodes(pcode, pcode_types):
        
    # Loop through dictionary and correct formats to get just integer postcode
    for pcode_type, value_set in pcode_types.iteritems():
        if pcode in value_set:
            '''Order of corrections matters, as i.e. D01F5P2 -> D01 -> 1'''
            if pcode_type == "Dublin":
                pcode = re.sub("Dublin ", "", pcode)
            if pcode_type == "Two items":
                pcode = pcode.split(" ")
                pcode = pcode[0]
            if pcode_type == "too long":
                pcode = pcode[0:3]
            if pcode_type == "D":
                pcode = re.sub("0", "", pcode[1:])

    return pcode

#### 4.2.2 Postal Code Audit

In [21]:
pcode_types = audit_pcodes(filename)
pprint.pprint(dict(pcode_types))

{'D': set(['D01',
           'D02',
           'D03',
           'D04',
           'D05',
           'D06',
           'D07',
           'D08',
           'D11',
           'D12',
           'D13',
           'D14',
           'D15',
           'D18',
           'D6W']),
 'Dublin': set(['Dublin 14', 'Dublin 15', 'Dublin 2', 'Dublin 24']),
 'Two items': set(['A94 FA39',
                   'A94 PC95',
                   'A96 DF24',
                   'A96 XN50',
                   'A98 KC91',
                   'D01 V6V6',
                   'D01 WY49',
                   'D02 NH04',
                   'D07 X9YN',
                   'D08 P3K4',
                   'D15 A259',
                   'D15 A3A8',
                   'D15 A4ET',
                   'D15 A583',
                   'D15 A9WH',
                   'D15 ACP4',
                   'D15 AE72',
                   'D15 AF80',
                   'D15 AK27',
                   'D15 AP60',
                   'D15 AW77',
        

#### 4.1.3 Postal Code Update Function Test

In [26]:
for pcode_type, value_set in pcode_types.iteritems():
    for pcode in value_set:
        better_code = update_pcodes(pcode, pcode_types)
        print pcode, "=>", better_code

D01X2P2 => 1
D01F5P2 => 1
A96X259 => A96
Dublin 2 => 2
Dublin 14 => 14
Dublin 24 => 24
Dublin 15 => 15
D07 => 7
D06 => 6
D12 => 12
D04 => 4
D15 => 15
D02 => 2
D01 => 1
D13 => 13
D18 => 18
D05 => 5
D14 => 14
D08 => 8
D03 => 3
D6W => 6W
D11 => 11
D15 EC6A => D15
D08 P3K4 => D08
D15 XP60 => D15
D15 YY8W => D15
D15 F782 => D15
D15 E671 => D15
D15 K304 => D15
D15 PX44 => D15
D15 TRX7 => D15
D15 W590 => D15
D15 F673 => D15
D15 E8FX => D15
A98 KC91 => A98
D15 NY59 => D15
D15 AW9X => D15
D15 CR26 => D15
D15 XE63 => D15
D15 Y9WA => D15
D15 WF88 => D15
D15 YK8A => D15
D02 CC60 => D02
D18 KX90 => D18
D15 VF76 => D15
D15 FX09 => D15
D15 HF8N => D15
D15 YA32 => D15
D15 CR90 => D15
D01 V6V6 => D01
D15 XC6R => D15
D15 K004 => D15
D15 XR70 => D15
D18 T8N4 => D18
D15 YP92 => D15
D15 ACP4 => D15
D15 PK71 => D15
D15 DC2W => D15
D15 KR62 => D15
D15 ET61 => D15
D15 K6YR => D15
D15 F3803 => D15
D15 P5FN => D15
D15 R242 => D15
D07 X9YN => D07
D15 K8YW => D15
D15 HD96 => D15
D15 WTY7 => D15
D15 PW52 => D15
D1

## 5 Correct data, store as JSON and import into MongoDB

#### 5.1 Import and predefine methods and variables

In [2]:
import codecs
import json
from pymongo import MongoClient

In [23]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

#### 5.2 Define Helper functions

In [24]:
def shape_element(element):
    data = []
    node = {}
    node["pos"]=[]
    if element.tag == "node" or element.tag == "way" :
        node["type"] = element.tag
        for key in element.keys():
            if key in CREATED:
                if "created" not in node.keys():
                    node["created"] = {}
                node["created"][key] = element.attrib[key]
            elif key in ("lat", "lon"):
                try:
                    node['pos']=[float(element.attrib['lat']),float(element.attrib['lon'])]
                except KeyError:
                    pass
            else:
                node[key] = element.attrib[key]

        for tag in element.iter("tag"):
            if not(problemchars.search(tag.attrib['k'])):
                if tag.attrib['k'].startswith("addr:"):
                        addr_fields = tag.attrib['k'].split(':')
                        if len(addr_fields) == 2:
                            if 'address' not in node.keys():
                                node['address'] = {}
                            if addr_fields[1] == "street":
                                node["address"][addr_fields[1]] = update_streets(tag.attrib['v'], mapping, st_types)
                            elif addr_fields[1] == "postcode":
                                node["address"][addr_fields[1]] = update_pcodes(tag.attrib['v'], pcode_types)
                            else:
                                node["address"][addr_fields[1]] = tag.attrib['v']
                else:
                    node[tag.attrib['k']] = tag.attrib['v']
        
        for nd in element.iter("nd"):
            if "node_refs" not in node.keys(): 
                node["node_refs"] = []
            node["node_refs"].append(nd.attrib['ref'])
        
        return node
    else:
        return None

In [25]:
def process_map(file_in):
    data = []
    for _, element in ET.iterparse(file_in):
        el = shape_element(element)
        if el:
            data.append(el)
    return data

In [26]:
def write_json(file_in):
    process_map(file_in)
    file_out = "{0}.json".format(file_in)
    with codecs.open(file_out, "w") as fo:
        fo.write(json.dumps(data))

#### 5.3 Convert to JSON format and dump as file

In [27]:
data = process_map(filename)

In [28]:
pprint.pprint(data[0])

{'created': {'changeset': '5277755',
             'timestamp': '2010-07-21T10:37:10Z',
             'uid': '114310',
             'user': 'Joe E',
             'version': '3'},
 'id': '384519',
 'pos': [53.3763419, -6.3699187],
 'type': 'node'}


In [29]:
file_out = "{0}.json".format(file_in)
with codecs.open(file_out, "w") as fo:
    fo.write(json.dumps(data))

#### 5.4 Insert into MongoDB

In [30]:
def insert_data(jsonfile):
    
    print jsonfile

    client = MongoClient("mongodb://localhost:27017")
    db = client.examples

    with open(jsonfile) as f:
        data = json.loads(f.read())
        db.dublin.insert_many(data)


In [38]:
#jsonfile = "/home/pi/Downloads/dublin_ireland.json"
jsonfile = "{0}.json".format(filename)
insert_data(jsonfile)

/home/pi/Downloads/dublin_ireland.json


# 6 Query Data in MongoDB

#### 6.1 Setup Mongo Client and load Database

In [3]:
client = MongoClient("mongodb://localhost:27017")
db = client.examples

#### 6.2 Define aggregate function to save some typing

In [4]:
def aggregate(pipeline):
    return [doc for doc in db.dublin.aggregate(pipeline)]

#### 6.3 Nodes & Ways in Dataset

In [40]:
nodes = db.dublin.find({"type": "node"}).count()
ways = db.dublin.find({"type": "way"}).count()

print "There are " + str(nodes) + " nodes and " + str(ways) + " ways in the dataset"

There are 1068553 nodes and 187688 ways in the dataset


#### 6.4 User stats

In [39]:
list = db.dublin.distinct("created.user")
print len(list)

1108


In [76]:
superusers = [{"$group": {"_id": "$created.user",
                          "count": {"$sum": 1}}},
              {"$sort": {"count": -1}},
              {"$limit": 10}
                                ]
pprint.pprint(aggregate(superusers))

[{u'_id': u'Nick Burrett', u'count': 236755},
 {u'_id': u'mackerski', u'count': 183610},
 {u'_id': u'brianh', u'count': 154844},
 {u'_id': u'Dafo43', u'count': 150829},
 {u'_id': u'Conormap', u'count': 63958},
 {u'_id': u'Ignobilis', u'count': 52850},
 {u'_id': u'VictorIE', u'count': 48596},
 {u'_id': u'Autarch', u'count': 21800},
 {u'_id': u'wigs', u'count': 20792},
 {u'_id': u'Blazejos', u'count': 19569}]


##### 6.5 Most common amenities

In [81]:
amenities = [{"$match": {"amenity": {"$ne": None}}},
             {"$group": {"_id": "$amenity",
                          "count": {"$sum": 1}}},
             {"$sort": {"count": -1}},
             {"$limit": 10}
                                ]
pprint.pprint(aggregate(amenities))

[{u'_id': u'parking', u'count': 2134},
 {u'_id': u'pub', u'count': 698},
 {u'_id': u'restaurant', u'count': 650},
 {u'_id': u'fast_food', u'count': 586},
 {u'_id': u'cafe', u'count': 561},
 {u'_id': u'school', u'count': 540},
 {u'_id': u'post_box', u'count': 437},
 {u'_id': u'place_of_worship', u'count': 389},
 {u'_id': u'bench', u'count': 346},
 {u'_id': u'bicycle_parking', u'count': 302}]


##### 6.6 Religions for places of worship

In [57]:
religion = [{"$match": {"amenity": "place_of_worship",
                        "religion": {"$ne": None}}},
              {"$group": {"_id": "$religion",
                          "count": {"$sum": 1}}},
              {"$sort": {"count": -1}},
              {"$limit": 10}
                                ]
pprint.pprint(aggregate(religion))

[{u'_id': u'christian', u'count': 348},
 {u'_id': u'buddhist', u'count': 4},
 {u'_id': u'muslim', u'count': 2},
 {u'_id': u'jewish', u'count': 2},
 {u'_id': u'sikh', u'count': 1},
 {u'_id': u'bahai', u'count': 1},
 {u'_id': u'multifaith', u'count': 1},
 {u'_id': u'hindu', u'count': 1}]


##### 6.6 Street with most pubs

In [39]:
pub_street = [{"$match": {"amenity": "pub"}},
              {"$group": {"_id": "$address.street",
                          "count": {"$sum": 1}}},
              {"$sort": {"count": -1}},
              {"$limit": 10}
                                ]
pprint.pprint(aggregate(pub_street))

[{u'_id': None, u'count': 334},
 {u'_id': u'Main Street', u'count': 10},
 {u'_id': u'Thomas Street', u'count': 6},
 {u'_id': u'Duke Street', u'count': 5},
 {u'_id': u'Wexford Street', u'count': 5},
 {u'_id': u'Camden Street Lower', u'count': 5},
 {u'_id': u'Baggot Street Lower', u'count': 5},
 {u'_id': u"Harold's Cross Road", u'count': 5},
 {u'_id': u'Phibsborough Road', u'count': 5},
 {u'_id': u'Dame Street', u'count': 4}]


##### 6.7 Leisure resources

In [12]:
leisure = [{"$match": {"leisure": {"$ne": None}}},
              {"$group": {"_id": "$leisure",
                        "count": {"$sum": 1}}},
              {"$sort": {"count": -1}},
              {"$limit": 10}
                                ]
pprint.pprint(aggregate(leisure))

[{u'_id': u'pitch', u'count': 1304},
 {u'_id': u'park', u'count': 881},
 {u'_id': u'sports_centre', u'count': 241},
 {u'_id': u'garden', u'count': 205},
 {u'_id': u'playground', u'count': 162},
 {u'_id': u'golf_course', u'count': 98},
 {u'_id': u'picnic_table', u'count': 50},
 {u'_id': u'common', u'count': 32},
 {u'_id': u'recreation_ground', u'count': 30},
 {u'_id': u'swimming_pool', u'count': 23}]


##### 6.8 Listed cities

In [11]:
city = [{"$match": {"address.city": {"$ne": None}}},
              {"$group": {"_id": "$address.city",
                        "count": {"$sum": 1}}},
              {"$sort": {"count": -1}},
              {"$limit": 10}
                                ]
pprint.pprint(aggregate(city))

[{u'_id': u'Dublin', u'count': 7331},
 {u'_id': u'Lucan', u'count': 1322},
 {u'_id': u'Dublin 6', u'count': 990},
 {u'_id': u'Blanchardstown', u'count': 943},
 {u'_id': u'Dublin 1', u'count': 390},
 {u'_id': u'Dublin 8', u'count': 324},
 {u'_id': u'Dublin 7', u'count': 315},
 {u'_id': u'Dublin 2', u'count': 297},
 {u'_id': u'Dublin 3', u'count': 240},
 {u'_id': u'Dublin 6W', u'count': 196}]


In [43]:
amenities_wo_address = [{"$match": {"address.street": None,
                                    "amenity": {"$ne": None},
                                    "pos": {"$ne": None}}},
                        {"$group": {"_id": "Amenities w/o address but positional data",
                                    "count": {"$sum": 1}}}
                        ]
pprint.pprint(aggregate(amenities_wo_address))

[{u'_id': u'Amenities w/o address but positional data', u'count': 7342}]
