Los Angeles Open Street Map Analysis
------------------

In [2]:
import xml.etree.ElementTree as ET
import re
from pymongo import MongoClient

####Open Street Map Data File####

In [1]:
osm_file = "los-angeles_california.osm"

####Iterable XML parsing function####

In [51]:
def parse_xml(filename):
    
    lower = re.compile(r'^([a-z]|_)*$')
    lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
    problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
    CREATED = [ "version", "changeset", "timestamp", "user", "uid"]
    parsed_json = []

    for event, elem in ET.iterparse(filename):
        if elem.tag == 'node' or elem.tag == 'way':
            osm_json = {}
            osm_json['address'] = {}
            a = elem.attrib
            osm_json['created'] = {k.strip():v.strip() for k,v in a.items() if k in CREATED}
            osm_json['type'] = elem.tag
            for k,v in a.items():
                if k == 'visible':
                    osm_json['visible'] = a['visible']
                elif k == 'id':
                    osm_json['id'] = a['id']
                elif k == 'lat' or k == 'lon':
                    osm_json['pos'] = [float(a['lat']),float(a['lon'])]

            for tag in elem.iter('tag'):
                t = tag.attrib
                if problemchars.match(str(t['k'])):
                    pass
                elif re.search('^addr:',str(t['k'])):
                    if t['k'].count(':') == 1:
                        sk = t['k'].replace('addr:','')
                        try:
                            osm_json['address'][sk] = t['v']
                        except:
                            pass
                    else:
                        pass
                elif ':' in t['k']:
                    osm_json[t['k']] = t['v']
                else:
                    osm_json[t['k']] = t['v']

            for tag in elem.iter('way'):
                nd = []
                for x in tag:
                    if x.tag == 'nd':
                        nd_x = x.attrib['ref']
                        nd.append(nd_x)
                        osm_json.update({'node_refs':nd})

            if len(osm_json['address']) == 0:
                del osm_json['address']
                print (osm_json)
                parsed_json.append(osm_json)
            else:
                print (osm_json)
                parsed_json.append(osm_json)
    return parsed_json

####MongoDB connection, database, and collection setup####

In [3]:
client = MongoClient()
db = client.project3
db.osm_v2.insert_many(parse_xml(osm_file))

####Number of documents in collection####

In [4]:
db.osm_v2.count()

5533000

####Number of node-based documents####

In [5]:
db.osm_v2.find({"type":"node"}).count()

5268722

####Number of way-based documents####

In [6]:
db.osm_v2.find({"type":"way"}).count()

263722

####Number of unique users####

In [8]:
unique_users = db.osm_v2.aggregate([{"$group":{"_id":"$created.user","count":{"$sum":1}}},
                                 {"$group":{"_id":1,"count":{"$sum":1}}},
                                 {"$sort":{"count":-1}}])
for x in unique_users:
    print(x)

{'count': 2790, '_id': 1}


####Most common contributor####

In [9]:
first_user = db.osm_v2.aggregate([{"$group":{"_id":"$created.user","count":{"$sum":1}}},
                         {"$sort":{"count":-1}},
                        {"$limit":1}])
for x in first_user:
    print(x)

{'count': 546384, '_id': 'woodpeck_fixbot'}


####Top cities listed####

In [40]:
top_cities = db.osm_v2.aggregate([{'$match':{"address.city":{"$exists":1}}},
                             {"$group":{"_id":"$address.city","count":{"$sum":1}}},
                              {'$sort':{"count":-1}},
                              {"$limit":3}])
for x in top_cities:
    print(x)

{'count': 14114, '_id': 'San Diego'}
{'count': 12217, '_id': 'Lake Forest'}
{'count': 11252, '_id': 'Irvine'}


####Number of unique cities####

In [38]:
unique_cities = db.osm_v2.aggregate([{"$match":{"address.city":{"$exists":1}}},
                          {"$group":{"_id":"$address.city","count":{"$sum":1}}},
                          {"$group":{"_id":1,"count":{"$sum":1}}},
                          {"$sort":{"count":1}}])
for x in unique_cities:
    print(x)

{'count': 296, '_id': 1}


####Household numbers with non-digit characters####

In [19]:
non_digit_housenumbers = db.osm_v2.aggregate([{"$match":{"address.housenumber":{"$regex":"[A-Za-z]"}}},                       
                                           {"$group":{"_id":"$address.housenumber","count":{"$sum":1}}},
                                           {"$group":{"_id":1,"count":{"$sum":1}}},
                                           {"$sort":{"count":1}}])
for x in non_digit_housenumbers:
    print (x)

{'count': 101, '_id': 1}


####Long ZIP codes for cleaning####

In [15]:
long_zip = db.osm_v2.find({'address.postcode':{'$regex':'^\d.*-'}})
for x in long_zip:
    x['address']['postcode'] = re.sub('-.*','',str(x['address']['postcode']))
    db.osm_v2.save(x)

####Convert short street names to long street names####

In [22]:
short_streets = db.osm_v2.aggregate([{"$match":{"address.street":{"$regex":"[A-Za-z]\."}}}])
street_endings= {"Blvd.":"Boulevard ","St.":"Street ","Ave.":"Avenue ","Dr.":"Drive ","Ctr.":"Center ","Rd.":"Road ",
                 "Blvd ":"Boulevard ","St ":"Street ","Ave ":"Avenue ","Dr ":"Drive ","Ctr ":"Center ","Rd ":"Road "}

for x in short_streets:
    street = x['address']['street']
    for k,v in street_endings.items():
        if k in street:
            fixed_street = re.sub(k,v,str(street)).strip()
            x['address']['street'] = fixed_street
            db.osm_v2.save(x)

####Top amentities listed####

In [24]:
amenities = db.osm_v2.aggregate([{'$match':{"amenity":{'$exists':1}}},
                                {'$group':{'_id':'$amenity','count':{'$sum':1}}},
                                {'$sort':{'count':-1}},
                                {'$limit':5}])
for x in amenities:
    print(x)

{'count': 3892, '_id': 'place_of_worship'}
{'count': 3806, '_id': 'school'}
{'count': 2271, '_id': 'parking'}
{'count': 1873, '_id': 'restaurant'}
{'count': 1341, '_id': 'fast_food'}


####Top cuisines listed####

In [27]:
cuisines = db.osm_v2.aggregate([{'$match':{'amenity':'restaurant'}},
                        {'$group':{'_id':'$cuisine','count':{'$sum':1}}},
                        {'$sort':{'count':-1}},
                        {'$limit':5}])
for x in cuisines:
    print(x)

{'count': 795, '_id': None}
{'count': 173, '_id': 'american'}
{'count': 152, '_id': 'mexican'}
{'count': 87, '_id': 'pizza'}
{'count': 73, '_id': 'italian'}


####Top fast-food restaurants listed####

In [36]:
fastfood = db.osm_v2.aggregate([{'$match':{'amenity':'fast_food'}},
                         {'$group':{'_id':'$name','count':{'$sum':1}}},
                         {'$sort':{'count':-1}},
                         {'$limit':3}])
for x in fastfood:
    print(x)

{'count': 117, '_id': 'Subway'}
{'count': 113, '_id': "McDonald's"}
{'count': 67, '_id': 'Jack in the Box'}


####Quiz 1 solution####

In [None]:
def count_tags(filename):
        tag_dict = {}
        for event, elem in ET.iterparse(filename):
            if elem.tag not in tag_dict:
                tag_dict[elem.tag] = 1
            else:
                tag_dict[elem.tag] += 1           
        return tag_dict

####Quiz 3 solution####

In [None]:
def key_type(element, keys):
    if element.tag == "tag":
        k = element.get('k')
        if re.search(lower,str(k)):
            keys['lower'] += 1
        elif re.search(lower_colon,str(k)):
            keys['lower_colon'] += 1
        elif re.search(problemchars,str(k)):
            keys['problemchars'] += 1
        else:
            keys['other'] += 1
    return keys

####Quiz 4 solution####

In [None]:
def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        if element.tag == 'node':
            for tag in element.iter('node'):
                uid = tag.get('uid')
                users.add(uid)
        elif element.tag == 'way':
            for tag in element.iter('way'):
                uid = tag.get('uid')
                users.add(uid)
        elif element.tag == 'relation':
            for tag in element.iter('relation'):
                uid = tag.get('uid')
                users.add(uid)       
    return users

####Quiz 5 solution####

In [None]:
def update_name(name, mapping):
    if street_type_re.search(name):
        s = street_type_re.search(name).group()
        if s not in expected:
            return re.sub(street_type_re,mapping[s],name)

####Quiz 6 solution####

In [None]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

def shape_element(element):
    node = {}
    node['address'] = {}
    if element.tag == "node" or element.tag == "way":
        for tag in element.iter('tag'):
            t = tag.attrib
            if problemchars.match(str(t['k'])):
                pass
            elif re.findall('addr:',str(t['k'])):
                if t['k'].count(':') == 2:
                    pass
                elif t['k'].count(':') == 1:
                    stripped_t = t['k'].replace('addr:','')
                    node['address'].update({stripped_t:t['v']})
                else:
                    pass
            else:
                node[t['k']] = t['v']
                                                                       
        a = element.attrib
        node['type'] = element.tag
        node['id'] = a['id']
        
        for tag in element.iter('way'):
            nd = []
            for x in tag:
                if x.tag == 'nd':
                    nd_x = x.attrib['ref']
                    nd.append(nd_x)
                    node.update({'node_refs':nd})
        
        for k,v in a.items():
            if k == 'visible':
                node.update({k:v})
            elif k == 'lat' or k == 'lon':
                node['pos'] = [float(a['lat']),float(a['lon'])]
            elif k in CREATED:
                node['created'] = {'version':a['version'],
                           'changeset':a['changeset'],
                           'timestamp':a['timestamp'],
                           'user':a['user'],
                           'uid':a['uid']}
        for k,v in node.items():
            if v == {}:
                del node[k]
        return node
        print node
    else:
        return None