## 地图区域

格拉斯哥


## 地图数据统计

#### 根节点tag统计

In [4]:
import xml.etree.cElementTree as ET
from collections import Counter
from collections import defaultdict
import re
import codecs
import csv

filename=r'd:\udacity\Glasgow.osm'
def count_tags(filename):
    data=[]
    for _,tree in ET.iterparse(filename,events=('start',)):
        data.append(tree.tag)
    return Counter(data)

{'nd': 562843, 'node': 493780, 'tag': 320815, 'way': 80094, 'member': 24036, 'relation': 1010, 'bounds': 1, 'osm': 1}

#### 检查k值标签，是否存在有问题的字符

In [5]:
NODES_PATH = "d:\\nodes.csv"  #node标签生成的文件名
NODE_TAGS_PATH = "d:\\nodes_tags.csv"  #node下的tag标签生成的文件名
WAYS_PATH = "d:\\ways.csv"  #way标签生成的文件名
WAY_NODES_PATH = "d:\\ways_nodes.csv" #way标签下的node生成的文件名
WAY_TAGS_PATH = "d:\\ways_tags.csv" #way标签下的tag生成的文件名

#“lower”，表示仅包含小写字母且有效的标记，
#“lower_colon”，表示名称中有冒号的其他有效标记，
#“problemchars”，表示字符存在问题的标记。
#“other”，表示不属于上述三大类别的其他标记。

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
def key_type(element,keys):
    if element.tag=='tag':
        if re.search(lower, element.attrib['k']):
            keys['lower']+=1
        elif re.search(lower_colon, element.attrib['k']):
            keys['lower_colon']+=1
        elif re.search(problemchars,element.attrib['k']):
            keys['problemchars']+=1
        else:
            keys['other']+=1
    return keys

def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys

In [6]:
print process_map(filename)

{'problemchars': 0, 'lower': 276700, 'other': 20718, 'lower_colon': 23397}


#### 完善街道名,将街道中的一些不合法的值去除

<font color=#0C3A6B>1. 提取所有街道名</font>

In [7]:
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")

def street_name_set(osm_file):
    streets = set()
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == 'way' or elem.tag == 'node':
            for tag in elem.iter('tag'):
                if is_street_name(tag):
                    streets.add(tag.attrib['v'].split(' ')[-1])
    return streets

print street_name_set(filename)

set(['', 'Court', 'Rigg', 'West', 'Candleriggs', 'Cross', 'Rd', 'street', 'Way', 'Gate', 'Dovehill', 'Quay', 'Gallowgate', 'Road,', 'Park', 'Strret', 'bank', 'Path', 'Bridgegate', 'Drygate', 'Crescent', 'Lane', 'Sreet', 'Roystonhill', 'Drive', 'St', 'downi', 'Broomielaw', 'Wharf', 'Place', 'Brae', 'Close', 'North', 'Saltmarket', 'Gardens', 'Road', 'Square', 'Parade', 'Estate', 'Circus', 'Craigpark', 'Wynd', 'Walk', 'Street', 'Trongate', 'Terrace', 'Green', 'Westercraigs', 'Avenue', 'road', 'Row'])


<font color=#0C3A6B>2. 检查所有街道名，发现以下错误    
      
 (1)使用缩写  
 (2)首字母未大写  
 (3)拼写错误 <font>

In [8]:
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
expected = [ 'Court', 'Rigg', 'West', 'Candleriggs', 'Cross', 'Rd', 'street', 'Way', 'Gate', 'Dovehill', 'Quay', 
            'Gallowgate', 'Road,', 'Park', 'Path', 'Bridgegate', 'Drygate', 'Crescent', 'Lane', 
            'Roystonhill', 'Drive', 'Broomielaw', 'Wharf', 'Place', 'Brae', 'Close', 'North', 'Saltmarket', 
            'Gardens', 'Road', 'Square', 'Parade', 'Estate', 'Circus', 'Craigpark', 'Wynd', 'Walk', 'Street', 'Trongate', 
            'Terrace', 'Green', 'Westercraigs', 'Avenue', 'Row']
mapping = { "St": "Street", "Rd": "Road", "street": "Street", "Strret":"Street", "bank": "Bank","Sreet": "Street",
           "road": "Road","downi":"Downy"}
def audit_street_type(street_types, street_name):
    m=street_type_re.search(street_name)
    if m:
        street_type=m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)
            


def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types

def update_name(name, mapping):
    if re.search(street_type_re,name):
        mapped=name.split(' ')
        mapped[-1]=mapping[mapped[-1]]
        for i in range(len(mapped)):
            mapped[i]=mapped[i].capitalize()
        name=' '.join(mapped)
    return name

In [9]:
st_types=audit(filename)
print st_types

defaultdict(<type 'set'>, {'Sreet': set(['Springbank Sreet']), 'St': set(['Ardgay St']), 'downi': set(['downi']), 'Strret': set(['Garfield Strret']), 'road': set(['pollokshaws road', 'Marihill road']), 'bank': set(['canal bank'])})


<font color=#0C3A6B>3. 列出改动内容<font>

In [10]:
name_maping={}
for st_type, ways in st_types.iteritems():
        for name in ways:
            better_name = update_name(name, mapping)
            name_maping[name]=better_name
            print name, "=>", better_name

Springbank Sreet => Springbank Street
Ardgay St => Ardgay Street
downi => Downy
Garfield Strret => Garfield Street
pollokshaws road => Pollokshaws Road
Marihill road => Marihill Road
canal bank => Canal Bank


<font color=#0C3A6B>4. 将改动内容写入<font>

In [21]:
def fix_name(name):
    if name in name_maping.keys():
        return name_maping[name]
    else:
        return name
                    
                    

#### 分割数据并清洗数据，并将数据写入CSV文件

In [22]:
#处理类型字段
def check_type(value):
    colon = lower_colon.search(value)
    problem = problemchars.search(value)
    if colon:
        return [value.split(':')[0],':'.join(value.split(':')[1:])]
    elif problem:
        return None
    else:
        return ['regular',value]

In [27]:
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=problemchars, default_tag_type='regular'):
    node_attribs = {}  
    way_attribs = {}  
    way_nodes = []    
    tags = []
    if element.tag=='node':
        for key in NODE_FIELDS:
                node_attribs[key]=element.attrib[key]
        for child in element.iter('tag'):
                node_tag_con={}
                node_tag_con['id']=node_attribs['id']
                node_tag_con['value']=fix_name(child.attrib['v'])
                if check_type(child.attrib['k']):
                    node_tag_con['key']=check_type(child.attrib['k'])[1]
                    node_tag_con['type']=check_type(child.attrib['k'])[0]
                tags.append(node_tag_con)
        return {'node': node_attribs, 'node_tags': tags}
    elif element.tag=='way':
        for key in WAY_FIELDS:
            way_attribs[key]=element.attrib[key]
            position=0
        for child in element.iter():
            Way_Node_con = {} #存放nd子标签
            Way_Tag_con = {}  #存放tag子标签
            if child.tag=='nd':
                Way_Node_con['id']=element.attrib['id']
                Way_Node_con['node_id'] = child.attrib['ref']
                Way_Node_con['position']=position
                position+=1
                way_nodes.append(Way_Node_con)
            elif child.tag=='tag':
                Way_Tag_con['id']=element.attrib['id']
                Way_Tag_con['value']=child.attrib['v']
                if check_type(child.attrib['k']):
                    Way_Tag_con['key']=check_type(child.attrib['k'])[1]
                    Way_Tag_con['type']=check_type(child.attrib['k'])[0]
                tags.append(Way_Tag_con)
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}
                

In [28]:
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag"""

    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()

class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input"""

    def writerow(self, row):
        super(UnicodeDictWriter, self).writerow({
            k: (v.encode('utf-8') if isinstance(v, unicode) else v) for k, v in row.iteritems()
        })

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)

In [29]:
def process_map(file_in):
    """Iteratively process each XML element and write to csv(s)"""

    with codecs.open(NODES_PATH, 'w') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'w') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()

        #validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            try:
                el = shape_element(element)
            except KeyError:
                print element.attrib.items()                          #部分项内容确实，防止报错并找出该项
            if el:
                if element.tag == 'node':
                    nodes_writer.writerow(el['node'])
                    node_tags_writer.writerows(el['node_tags'])
                elif element.tag == 'way':
                    ways_writer.writerow(el['way'])
                    way_nodes_writer.writerows(el['way_nodes'])
                    way_tags_writer.writerows(el['way_tags'])
                    
                    


In [30]:
process_map(filename)

[('changeset', '111345'), ('timestamp', '2007-06-24T00:47:44Z'), ('lon', '-4.2694446'), ('version', '1'), ('lat', '55.8610873'), ('id', '29392055')]
