In [46]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

OSMFILE = "dallas_texas.osm"
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]

mapping = { "St"  : "Street",
            "St." : "Street",
            "Ave" : "Avenue",
           "Ave." : "Avenue",
            "Rd." : "Road",
            "Dr"  : "Drive",
           "Dr."  : "Drive",
           "Blvd" : "Boulevard",
           "blvd" : "Boulevard",
           "Pkwy" : "Parkway",
           "pkwy" : "Parkway",
           "RD"   : "Road",
           "Rd"   : "Road",
           "road" : "Road",
           "W"    : "West",
           "N."    : "North",
           "BLVD.": "Boulevard",
           "E."   : "East",
           "Av"   : "Avenue",
           "Hwy"  : "Highway",
           "Dr"   : "Drive",
           "dr"   : "Drive",
           "E"    : "East",
           "N"    : "North",
           "Fwy"  : "FreeWay",
           "Expy" : "ExpressWay",
      "Exressway" : "ExpressWay",
           "Fwy"  : "Freeway",
           "Trl"  : "Trail",
           "S."   : "South",
           "N."   : "North",
           "E."   : "East",
           "W."   : "West",
           "Ln"   : "Lane"
            }


def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    
    return street_types


def update_name(name, mapping):

    # YOUR CODE HERE
    m=street_type_re.search(name)
    
    first=re.search(r'^[a-zA-z]+\.',name)
  
    
    if m:
        last_part=m.group()
        for key in mapping:
            
            if last_part==key:
                name=re.sub(last_part,mapping[key],name)
                return name
                
        
    if first:
        first_re=first.group()
        
        for key in mapping:
            
            if first_re==key:
                name=re.sub(first_re,mapping[key],name)
        
    return name


def test():
    print "started at"+" "+( strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    st_types = audit(OSMFILE)
    #assert len(st_types) == 3
    #pprint.pprint(dict(st_types))
    print "finished at"+" "+( strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    for st_type, ways in st_types.iteritems():
        for name in ways:
            better_name = update_name(name, mapping)
            
            print name, "=>", better_name
           
        


if __name__ == '__main__':
    test()


started at 2016-11-04 02:19:05
finished at 2016-11-04 02:23:19
TX 66 => TX 66
Glasshouse Walk => Glasshouse Walk
Highpoint Ridge => Highpoint Ridge
Riva Ridge => Riva Ridge
Windsor Ridge => Windsor Ridge
Savannah Ridge => Savannah Ridge
Farm-to-Market Road 664 => Farm-to-Market Road 664
Jupiter Rd => Jupiter Road
Stone Canyon Rd => Stone Canyon Road
E Trinity Mills Rd => E Trinity Mills Road
N Murphy Rd => N Murphy Road
Flower Mound Rd => Flower Mound Road
US-175 Frontage Rd => US-175 Frontage Road
Preston Rd => Preston Road
Coit Rd => Coit Road
Glade Rd => Glade Road
W Randol Mill Rd => W Randol Mill Road
McDermott Rd => McDermott Road
N Support Rd => N Support Road
S. Hampton Rd => S. Hampton Road
E Belt Line Rd => E Belt Line Road
S Murphy Rd => S Murphy Road
Custer Rd => Custer Road
Interstate 30 Frontage Rd => Interstate 30 Frontage Road
W Wheatland Rd => W Wheatland Road
Abrams Rd => Abrams Road
Big Stone Gap Rd => Big Stone Gap Road
W Campbell Rd => W Campbell Road
Hall Rd => Ha

In [None]:
import xml.etree.cElementTree as ET
from collections import defaultdict
from time import gmtime, strftime
import re
import pprint
import pprint
import re
import codecs
import json
from collections import defaultdict

OSMFILE = "dallas_texas.osm"
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons","Tollway","Trail","Trace"]


mapping = { "St"  : "Street",
            "St." : "Street",
            "Ave" : "Avenue",
           "Ave." : "Avenue",
            "Rd." : "Road",
            "Dr"  : "Drive",
           "Dr."  : "Drive",
           "Blvd" : "Boulevard",
           "blvd" : "Boulevard",
           "Pkwy" : "Parkway",
           "pkwy" : "Parkway",
           "RD"   : "Road",
           "Rd"   : "Road",
           "road" : "Road",
           "W"    : "West",
           "N."    : "North",
           "BLVD.": "Boulevard",
           "E."   : "East",
           "Av"   : "Avenue",
           "Hwy"  : "Highway",
           "Dr"   : "Drive",
           "dr"   : "Drive",
           "E"    : "East",
           "N"    : "North",
           "Fwy"  : "FreeWay",
           "Expy" : "ExpressWay",
      "Exressway" : "ExpressWay",
           "Fwy"  : "Freeway",
           "Trl"  : "Trail",
           "S."   : "South",
           "N."   : "North",
           "E."   : "East",
           "W."   : "West",
           "Ln"   : "Lane"
            }



lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_[^addr])*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

#updating elements of the osm file and establishing a neat structure

def shape_element(element):
    node = {}
    created={}
    pos=[]
    address={}
    node_refs=[]
    if element.tag == "node" or element.tag == "way" :
        
        if element.tag=='node':
            node['type']="node"
            
            
        else:
            node['type']="way"
            for node_tag in element.iter("nd"):
                node_refs.append(node_tag.attrib["ref"])


        
        for tag in element.iter("tag"):          
            
            k_attribute=tag.attrib["k"]
            v_attribute=tag.attrib["v"]
            
            
            colon_lower=lower.search(k_attribute)
            colon_lower_colon=lower_colon.search(k_attribute) 
            prob_chars=problemchars.search(k_attribute)
       
            
        
               
            if colon_lower:
                colon=colon_lower.group()
                node[colon]=v_attribute
                        
            if colon_lower_colon:
                colon1=colon_lower_colon.group()
                if colon1.startswith("addr"):
                    v_attribute=update_name(v_attribute,mapping)
                    address[colon1[colon1.index(':')+1:]]=v_attribute
                    
            if prob_chars:
                
                continue
                
        for keys in node:
            for key1 in address:
                if keys==key1:
                    node.pop(key,None)
                    
        x=element.attrib      
        
        for vals in x:
            
            if vals in CREATED:
                created[vals]=x[vals]
                
            elif vals=='lat':
                
                pos.insert(0,float(x[vals]))
            
            elif vals=='lon':
                
                pos.insert(1,float(x[vals]))
            
            else:
                node[vals]=x[vals]
            
        if created:
            node['created']=created  
            
        if pos:
            node['pos']=pos
                
        if address:
            node['address']=address
            
        if node_refs:
            node['node_refs']=node_refs
        
                
        return node
    else:
        return None

#updating the elements of the osm file and putting it to a definite structure suitable for writing to a json file 


def process_map(file_in, pretty = False):
    # You do not need to change this file
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
            
    return data

#update street names by using keys and values in mapping dictionary

def update_name(name, mapping):
    
    m=street_type_re.search(name)
    
    first=re.search(r'^[a-zA-z]+\.',name)
  
    
    if m:
        last_part=m.group()
        for key in mapping:
            
            if last_part==key:
                name=re.sub(last_part,mapping[key],name)
                return name
                
        
    if first:
        first_re=first.group()
        
        for key in mapping:
            
            if first_re==key:
                name=re.sub(first_re,mapping[key],name)
        
    return name


   


def test():
   
    data = process_map(OSMFILE, True)
    pprint.pprint(data)
    


if __name__ == '__main__':
    test()
    

started at 2016-11-04 03:49:06
finished at  2016-11-04 04:08:57
[{'created': {'changeset': '641383',
              'timestamp': '2008-10-31T13:10:04Z',
              'uid': '9065',
              'user': 'brianboru',
              'version': '4'},
  'id': '26450261',
  'pos': [32.9901295, -97.0027785],
  'type': 'node'},
 {'created': {'changeset': '232647',
              'timestamp': '2007-03-09T23:15:37Z',
              'uid': '6514',
              'user': 'user_6514',
              'version': '1'},
  'id': '26450262',
  'pos': [32.9905615, -97.0033364],
  'type': 'node'},
 {'created': {'changeset': '641383',
              'timestamp': '2008-10-31T13:10:08Z',
              'uid': '9065',
              'user': 'brianboru',
              'version': '2'},
  'id': '26450263',
  'pos': [32.9890496, -96.9993453],
  'type': 'node'},
 {'created': {'changeset': '641383',
              'timestamp': '2008-10-31T13:10:04Z',
              'uid': '9065',
              'user': 'brianboru',
          

In [86]:
from pymongo import MongoClient
import pprint
pymongoclient=MongoClient("localhost", 27017)
db = pymongoclient.data_wrangling_project
coll=db.dallas_texas


def make_pipeline():
    # complete the aggregation pipeline
    match={"$match":{"name":{"$exists":1}, "website":{"$exists":1}}}
    group={"$group":{"_id":"$name","website":{"$addToSet":"$website"} ,"count":{"$sum":1}}}
    limit={"$limit":10}
    sort={"$sort":{"count":-1}}
    pipeline = [match,group,sort,limit]
    return pipeline

def aggregate(db, pipeline):
    return [doc for doc in db.aggregate(pipeline)]
          

if __name__ == '__main__':
    
    pipeline=make_pipeline()
    result = aggregate(coll, pipeline)
    pprint.pprint(result)




[{u'_id': u'Batteries Plus Bulbs',
  u'count': 22,
  u'website': [u'http://www.batteriesplus.com/']},
 {u'_id': u"McDonald's",
  u'count': 18,
  u'website': [u'http://www.mctexas.com/4777',
               u'http://mcdonalds.com',
               u'http://www.mcdonalds.com/',
               u'http://www.mctexas.com/23450',
               u'https://mylocalmcds.com/170campbell/',
               u'http://www.mcdonalds.com']},
 {u'_id': u'Dove Creek Villas Apartments',
  u'count': 15,
  u'website': [u'http://www.dovecreekvillas.com/']},
 {u'_id': u'James Avery Jewelry',
  u'count': 14,
  u'website': [u'http://www.jamesavery.com/custserv/store_details.jsp?storeId=713',
               u'http://www.jamesavery.com/custserv/store_details.jsp?storeId=692',
               u'http://www.jamesavery.com/custserv/store_details.jsp?storeId=704',
               u'http://www.jamesavery.com/custserv/store_details.jsp?storeId=907',
               u'http://www.jamesavery.com/custserv/store_details.jsp?storeId