### Process Data
### Look at sample data

In [1]:
import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

OSM_FILE = "leeds.osm"  # Replace this with your osm file
SAMPLE_FILE = "sample.osm"

k = 10 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')

* explore common tags provided by users
* explore essoteric "ways" if any exist
* explore incomplete street names and fix
* source known postal codes for leeds city area
* common amenities
* expletives and swearwords


In [155]:
%pdb

Automatic pdb calling has been turned ON


In [2]:
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
from collections import defaultdict


### Audit Street Names

In [12]:
osmfile = 'leeds.osm'

expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]

mapping = { "St": "Street",
            "St.": "Street",
           "Rd.": "Road",
           "Ave": "Avenue",
           "Ave.": "Avenue",
           "Avenueue": "Avenue"
            }
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
addr = re.compile(r'addr:')
doubled_colon = re.compile(r':')
naptan = re.compile(re.compile(r'naptan'))

def update_name(name, mapping):
    for incorrect_value, correct in mapping.items():
        if re.search(incorrect_value, name):
            name = re.sub(incorrect_value, correct, name, count=1)
            return name

def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types

def test():
    st_types = audit(osmfile)
    #print pprint.pprint(st_types.keys())
    better_names = {}
    for st_type, ways in st_types.iteritems():
            for name in ways:
                better_name = update_name(name, mapping)
                if better_name:
                    better_names[name]=[better_name]
    print len(better_names)
    return better_names
    

test()

12


{'Buck Stone Close': ['Buck Streetne Close'],
 'Buck Stone Crescent': ['Buck Streetne Crescent'],
 'Buck Stone Gardens': ['Buck Streetne Gardens'],
 'Buck Stone Green': ['Buck Streetne Green'],
 'Buck Stone Mount': ['Buck Streetne Mount'],
 'Buck Stone Oval': ['Buck Streetne Oval'],
 'Buck Stone Rise': ['Buck Streetne Rise'],
 'Buck Stone View': ['Buck Streetne View'],
 'Buck Stone Way': ['Buck Streetne Way'],
 'Sheepscar Street South': ['Sheepscar Streeteet South'],
 "St Wilfrid's Circus": ["StreetWilfrid's Circus"],
 "St. Michael's Crescent": ["Street Michael's Crescent"]}

### Audit postcodes

In [13]:
import csv

OSMFILE = "leeds.osm"
postcode_pattern = re.compile(r'LS')
postcode_pattern_lower = re.compile(r'ls')

def get_standard_postcodes(postcode_csv):
    postcode_list=[]
    with open(postcode_csv, 'r') as f:
        read = csv.reader(f)
        f.next()
        for postcode in read:
            postcode_list.append(postcode[0])
        print "postcode list 0",postcode_list[0]
    return postcode_list

def is_postcode_element(elem):
    return (elem.attrib['k'] == "addr:postcode")

def is_bradford(elem):
    return re.match(r'BD',elem)

def audit_postcodes(osm_file, postcode_csv):
    number_of_postcodes = 0
    lower_case_postcodes = 0
    postcode_list = get_standard_postcodes(postcode_csv)
    print 'postcode list =',len(postcode_list), '\n'
    osm_file = open(OSMFILE, "r")
    non_standard_postcodes = set()
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_postcode_element(tag):
                    number_of_postcodes +=1

                    if tag.attrib['v'] not in postcode_list:
                        if not is_bradford(tag.attrib['v']):
                            non_standard_postcodes.add(tag.attrib['v'])
                    if re.match(postcode_pattern_lower, tag.attrib['v']):
                        lower_case_postcodes +=1
    print 'lower_case_postcodes=',lower_case_postcodes, "number of postcodes=", number_of_postcodes
    return non_standard_postcodes, lower_case_postcodes
    
audit_postcodes(OSMFILE, 'ls_postcodes.csv')

postcode list 0 LS1 1AZ
postcode list = 21382 

lower_case_postcodes= 0 number of postcodes= 1567


({'LS1',
  'LS1 4AA',
  'LS10 1DU',
  'LS10 1JU',
  'LS10 1LA',
  'LS11',
  'LS11 5EF',
  'LS14 6',
  'LS15 8',
  'LS17',
  'LS18 4ER',
  'LS2 7PJ',
  'LS26',
  'LS27',
  'LS3 1YL',
  'LS7',
  'LS7 4DP',
  'LS8',
  'LS8 4BD',
  'LS9 0HA'},
 0)

In [31]:
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
from collections import defaultdict

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
addr = re.compile(r'addr:')
doubled_colon = re.compile(r':')
naptan = re.compile(re.compile(r'naptan'))

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]
POS = ['lon','lat']

def yes_is_true(x):
    if x =="yes" or x=='Yes' or x=='YES':
        return True
    elif x=="no" or x=="No" or x=="NO"
        return False
    else:
        return x

def shape_element(element):

    node = defaultdict(dict)

    if element.tag == "node" or element.tag == "way" :


        node['type']=element.tag
        node['pos']=[0,0]
        node['node_refs']=[]
        # YOUR CODE HERE            
        for key, value in element.attrib.items():
            if key in CREATED:
                node['created'][key] = value
            elif key in POS:
                #print '\n', key, 'true'
                if key == 'lat':
                    node['pos'][0]=float(value)
                elif key == 'lon':
                    node['pos'][1]=float(value)
        for child in element.iter():
            #print child.tag, child.attrib
# put all child entries into respective dictionaries
            if child.tag=='tag':
                
                if re.match(addr,child.attrib['k']) and len(re.findall(doubled_colon,child.attrib['k']))<2:
                    node['address']
                    node['address'][re.sub(addr,'',child.attrib['k'])] = yes_is_true(child.attrib['v'])
                else:
                    if re.match(naptan, child.attrib['k']):
                        node['naptan:']
                        node['naptan:'][re.sub(naptan, '',child.attrib['k'])]=yes_is_true(child.attrib['v'])


                    else:    
                        node[child.attrib['k']] = yes_is_true(child.attrib['v'])
                
            if child.tag=='nd':
                node['node_refs'].append(child.attrib['ref'])
                

        if node['pos']==[0,0]:
            del node['pos']

        if node['k']=={}:
            del node['k']
        if node['v']=={}:
            del node['v']
        node = dict(node)

        if node['node_refs']==[]:
            del node['node_refs']        
        return node
    else:
        return None


def process_map(file_in, pretty = False):
    # You do not need to change this file
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        count =0
        for _, element in ET.iterparse(file_in):
            #print element.tag, element.attrib
            
            count +=1
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")

    return data

def test():
    # NOTE: if you are running this code on your computer, with a larger dataset, 
    # call the process_map procedure with pretty=False. The pretty=True option adds 
    # additional spaces to the output, making it significantly larger.
    data = process_map('leeds.osm', False)
    pprint.pprint(data[-1])

test()

{'created': {'changeset': '45753035',
             'timestamp': '2017-02-02T17:00:39Z',
             'uid': '4543415',
             'user': 'The_JF',
             'version': '1'},
 'golf': 'water_hazard',
 'natural': 'water',
 'node_refs': ['4656975166',
               '4656975167',
               '4656975168',
               '4656975169',
               '4656975166'],
 'type': 'way'}


### Mongo Queries

In [None]:
2. Data Overview

                                                
This section contains basic statistics about the dataset and the MongoDB queries used to gather them.



File sizes
                                                
leeds.osm ......... 117 MB
leeds.osm.json .... 117.2 MB



In [None]:
# Number of documents
                                                
db.char.find().count()                                                
1555851
                                                
# Number of nodes
                                                
> db.char.find({"type":"node"}).count()
1471349
                                                
# Number of ways
                                                
> db.char.find({"type":"way"}).count()
84502
                                                
# Number of unique users
                                                
> db.char.distinct({"created.user"}).length
336
                                                
# Top 1 contributing user
                                                
> db.char.aggregate([{"$group":{"_id":"$created.user", "count":{"$sum":1}}}, {"$sort":{"count":­1}}, {"$limit":1}])
[ { "_id" : "jumbanho", "count" : 823324 } ]                
                                                
# Number of users appearing only once (having 1 post)
                                                
> db.char.aggregate([{"$group":{"_id":"$created.user", "count":{"$sum":1}}}, {"$group":{"_id":"$count", "num_users":{"$sum":1}}}, {"$sort":{"_id":1}}, {"$limit":1}])
[ {"_id":1,"num_users":56} ]
# “_id” represents postcount

#postal codes, yes = true, naptan values, street types contain no abbreviations 


In [None]:
### pymongo defaults
###

In [9]:
def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db

def make_pipeline():
    # complete the aggregation pipeline
    pipeline = [
        db.leeds.find().count()
                ]
    return pipeline

if __name__ == '__main__':
    db = get_db('maps')
    pipeline = make_pipeline()
    result = (db, pipeline)
    pprint.pprint(result)

NameError: name 'pprint' is not defined

In [14]:
# Number of documents
                                                
db.char.find().count()                                                


import pprint

def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db

def find_simple():
    return db.leeds.find().count()

if __name__ == '__main__':
    get_db('maps')
    results = find_simple()
    pprint.pprint(results)

576209


In [None]:
db.leeds.find().count()
576209

In [15]:
def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db

def find_simple():
    return db.leeds.find({'type':'node'}).count()

if __name__ == '__main__':
    get_db('maps')
    results = find_simple()
    pprint.pprint(results)

488980


db.leeds.find({'type':'node'}).count()
488980



In [24]:
def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db

def find_simple():
    return db.leeds.find({'type':'way'}).count()

if __name__ == '__main__':
    get_db('maps')
    results = find_simple()
    pprint.pprint(results)

87188


In [25]:
db.leeds.find({'type':'way'}).count()
87188

87188

In [36]:
## largest contributer

def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db

def find():
    return db.leeds.aggregate([{"$group":{"_id":"$created.user", "count":{"$sum":1}}},
                              {"$sort":{"count":-1}}, {"$limit":1}]) 



if __name__ == '__main__':
    get_db('maps')
    results = find()
    pprint.pprint(results.next())
    


{u'_id': u'Arthtoach', u'count': 239755}


In [None]:
## distinct users

(from command line)

db.leeds.distinct('created.user').length

    669

In [79]:
## users with over 100 contributions

def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db

def find():
    return db.leeds.aggregate([{"$group":{"_id":"$created.user", 
                                          "count":{"$sum":1}}},
                               {"$match":{"count":{"$gte":100}}},
                               {"$group":{"_id":"count",
                                          "count":{"$sum":1}}}
                               
                              
                              
                              ])



 
if __name__ == '__main__':
    get_db('maps')
    results = find()
    count =0
    for result_line in results:
        pprint.pprint(result_line)
#        count +=1
#    print count


{u'_id': u'count', u'count': 140}


In [100]:
### who is submitting naptan data?

def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db

## top 5 naptan submitters
pipeline1 = db.leeds.aggregate([{"$match":{"naptan:":{"$exists":True}}},
                                {"$group": {"_id":"$created.user",
                                            "count":{"$sum":1}}},
                                {"$sort":{"count":-1}},
                                {"$limit":5}
                               ])


"""
{u'_id': u'NaPTAN', u'count': 2087}
{u'_id': u'JonS', u'count': 564}
{u'_id': u'Pobice', u'count': 207}
{u'_id': u'LeedsTracker', u'count': 65}
{u'_id': u'sc71', u'count': 63}
"""
pipeline2 = db.leeds.aggregate([{"$match":{"naptan:":{"$exists":True}}},
                                {"$group": {"_id":"$created.user",
                                            "count":{"$sum":1}}},
                                {"$group":{"_id":"$null",
                                          "count":{"$sum":1}}}
                               ])
#{u'_id': None, u'count': 65}


#number of records with naptan
pipeline3 = db.leeds.aggregate([{"$match":{"naptan:":{"$exists":True}}},

                                {"$group":{"_id":"$null",
                                          "count":{"$sum":1}}}
                               ])

#{u'_id': None, u'count': 3255}


 
if __name__ == '__main__':
    get_db('maps')
    results = pipeline1
    count =0
    for result_line in results:
        pprint.pprint(result_line)
        
    #print results




{u'_id': u'NaPTAN', u'count': 2087}
{u'_id': u'JonS', u'count': 564}
{u'_id': u'Pobice', u'count': 207}
{u'_id': u'LeedsTracker', u'count': 65}
{u'_id': u'sc71', u'count': 63}


In [None]:
## largest data points
## is there any similarities in who is submitting the largest pieces of data

In [135]:
## number of amenities

def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db

## number of amenities
pipeline1 = db.leeds.aggregate([{"$match":{"amenity":{"$exists":True}}}
                               
                               ])
                               
#{u'_id': None, u'count': 5043}


## top amenities
pipeline2 = db.leeds.aggregate([{"$match":{"amenity":{"$exists":True}}},
                               {"$group":{"_id":"$amenity",
                                          "count":{"$sum": 1}}},
                                {"$sort":{"count":-1}},
                                {"$limit": 20}
                               
                               ])
'''
{u'_id': u'parking', u'count': 879}
{u'_id': u'post_box', u'count': 696}
{u'_id': u'pub', u'count': 360}
{u'_id': u'fast_food', u'count': 347}
{u'_id': u'place_of_worship', u'count': 242}
{u'_id': u'telephone', u'count': 225}
{u'_id': u'bench', u'count': 224}
{u'_id': u'school', u'count': 221}
{u'_id': u'cafe', u'count': 215}
{u'_id': u'restaurant', u'count': 198}
{u'_id': u'atm', u'count': 145}
{u'_id': u'bicycle_parking', u'count': 112}
{u'_id': u'waste_basket', u'count': 108}
{u'_id': u'pharmacy', u'count': 77}
{u'_id': u'bank', u'count': 76}
{u'_id': u'post_office', u'count': 74}
{u'_id': u'recycling', u'count': 67}
{u'_id': u'fuel', u'count': 57}
{u'_id': u'bar', u'count': 53}
{u'_id': u'doctors', u'count': 50}
'''
#number of pubs with beer gardens
    
pipeline3 = db.leeds.aggregate([{"$match":{"beer_garden":True}},
                               {"$group":{"_id":"$null",
                                          "count":{"$sum": 1}}}

                               
                               ])
#{u'_id': None, u'count': 9}

    
if __name__ == '__main__':
    get_db('maps')
    results = pipeline3

    for result_line in results:
        pprint.pprint(result_line)
        #break

## top 5 amenities


{u'_id': None, u'count': 9}


In [None]:


#https://upload.wikimedia.org/wikipedia/commons/9/98/LS_postcode_area_map.svg

### Improvements

To improve the data overall, it may be useful to have local amenities and business go ahead and make updates, however with the dominance of Google and Facebook in these areas, it may be difficult without further incentives. The best bet for adding further ways and nodes to the map may be with hobbiests deploying generic bots and programs that can pick up data and map it in any given area for example, from photos.

Contributers with special with local SME knowledge who are already active on other open source project ie. Wikipedia should also. Government agencies particaularly in transportation could also be encouraged to open source their data via this route. The source for much of the naptan data is "naptan_import" which lends a belief that such  an inititative may already be underway at some level.

### Conclusion

It seems like the data for the Leeds area is in good shape from a consistency perspective. Street types in the UK are hetergenous to regions, however there were no misspelling. Some of the postcodes are incomplete, only providing the first portion eg. 'LS12', however this still provides some indication of where the node is marked so does not need to be cleaned.

