In [91]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
from pprint_utf import pprint
import re
from collections import defaultdict
import csv
import codecs

### 获取探索样本

In [92]:
OSM_FILE = "beijing_china.osm"
SAMPLE_FILE = "sample.osm"

k = 10 # Parameter: taken every k-th top level element

def get_element(osm_file, tags = ("node", "way", "relation")):
    """Yield element if it is the right type of tag"""
    context = iter(ET.iterparse(osm_file, events = ('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()
            
with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')
    
    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding = 'utf-8'))
            
    output.write('</osm>')

### 对探索样本中部分字段数据质量进行审查

In [93]:
# 审查函数
def audit(osm_file, k_value):
    to_be_audited = {k_value: set()}
    for event, elem in ET.iterparse(osm_file, events = ('start',)):
        if elem.tag == 'way' or elem.tag == 'node':
            for tag in elem.iter('tag'):
                if tag.attrib['k'] == k_value:
                    to_be_audited[k_value].add(tag.attrib['v'])
    pprint(to_be_audited)

In [94]:
# 查看‘tag’标签中所有'k'的值并计数，从而方便进行审查
tag_name = {}
for _, elem in ET.iterparse(SAMPLE_FILE):
    for tag in elem.iter('tag'):
        if tag.attrib['k'] not in tag_name:
            tag_name[tag.attrib['k']] = 1
        else:
            tag_name[tag.attrib['k']] += 1
pprint(sorted(tag_name.iteritems(), key = lambda d:d[1], reverse = True))

[('highway', 23535),
 ('building', 15081),
 ('name', 12168),
 ('oneway', 6339),
 ('power', 4290),
 ('public_transport', 3528),
 ('source', 2985),
 ('name:en', 2826),
 ('railway', 2451),
 ('ref', 2436),
 ('bridge', 2394),
 ('layer', 2364),
 ('amenity', 2184),
 ('type', 1719),
 ('lanes', 1569),
 ('name:zh', 1344),
 ('shelter', 1191),
 ('bench', 1134),
 ('service', 1125),
 ('building:levels', 1068),
 ('landuse', 924),
 ('barrier', 903),
 ('gauge', 789),
 ('leisure', 651),
 ('electrified', 603),
 ('natural', 591),
 ('voltage', 588),
 ('shop', 558),
 ('name:zh_pinyin', 543),
 ('to', 522),
 ('from', 522),
 ('route', 510),
 ('tourism', 477),
 ('frequency', 441),
 ('surface', 441),
 ('int_name', 435),
 ('access', 405),
 ('operator', 393),
 ('tunnel', 390),
 ('addr:housenumber', 366),
 ('aeroway', 345),
 ('crossing', 342),
 ('man_made', 321),
 ('addr:street', 318),
 ('waterway', 312),
 ('network', 261),
 ('route_master', 237),
 ('alt_name', 234),
 ('int_ref', 225),
 ('maxheight', 225),
 ('subwa

In [95]:
# 审查'name:en'字段
audit(SAMPLE_FILE, "name:en")

{'name:en': set(['2nd Zhenwumiao Road',
                 '5th Ring road',
                 '60 Bar',
                 '6th Ring Road',
                 '7 Days Inn',
                 '7-Eleven',
                 '9th Dynasty',
                 'ANDINGMEN DONGDAJIE',
                 'APM',
                 'ASIA PACIFIC GARDEN HOTEL',
                 'Aimin Str',
                 'Airport shuttle shangdi stop',
                 'Alley cafe',
                 'American animals',
                 'An Li road',
                 'An Ning Zhuang Road',
                 'An Ning Zhuang West Road',
                 'Anding Lu',
                 'Andingmenwai Street',
                 'Anhuaqiao',
                 'Animal Sacrifice Pavilion',
                 'Anxiang North Road',
                 'Anyuanli Community',
                 'Anzhen Road',
                 'Aroma Zen Tea Club',
                 'Auspicious Phoenix',
                 'BJDW',
                 'Ba Da Chu Road',
      

审查“name:en”字段，发现某些地址的英文名称中，存在过度简写的情况。如：“Aimin Str”。
另外，某些英文名称中，存在拼音直译的情况。如：“Anding Lu”。

### 修正数据

#### 完善mapping

In [96]:
# 查找不规范命名
street_type_re = re.compile(r'\b\S+\.?$')
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]

def is_street_enname(elem):
    return (elem.attrib['k'] == 'name:en')

def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)

def audit(osm_file):
    osmfile = open(osm_file, 'r')
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osmfile, events = ("start",)):
        for tag in elem.iter('tag'):
            if is_street_enname(tag):
                audit_street_type(street_types, tag.attrib['v'])
    osmfile.close()
    pprint(sorted(street_types.iteritems(), key = lambda d:d[0]))

audit(OSM_FILE)

[('1',
  set(['Beijing Subway Line 1',
       'Building 1',
       'Cafeteria No. 1',
       'Capital Normal University - North Campus 1',
       'No. 3 Teaching Bldg Sector 1',
       'Residential Hall 1',
       'Seasons Park bldg 1',
       'Suncity bldg 1'])),
 ('10', set(['Hall 10', 'Residential Hall 10'])),
 ('101', set(['China National Highway 101'])),
 ('102', set(['China National Highway 102'])),
 ('104', set(['China National Highway 104'])),
 ('105', set(['China National Highway 105'])),
 ('106', set(['China National Highway 106'])),
 ('107', set(['China National Highway 107'])),
 ('108', set(['Bistrot 108', 'China National Highway 108'])),
 ('109', set(['China National Highway 109'])),
 ('11', set(['7 11', 'Residential Hall 11'])),
 ('111', set(['China National Highway 111'])),
 ('12', set(['Residential Hall 12'])),
 ('13', set(['Beijing Subway Line 13', 'Sirteen 13'])),
 ('1308', set(['1308'])),
 ('14',
  set(['International Office BIT, No. 14',
       'Residential Hall 14'

 ('Coffees', set(["Gloria Jean's Coffees"])),
 ('Coffetel', set(['James Joyce Coffetel'])),
 ('College',
  set(['Beijing Administrative College',
       'Beijing Information Vocational and Technical College',
       'Beijing Union College',
       'Imperial College',
       'Schwartman College'])),
 ('Committee', set(['Youth League Committee'])),
 ('Communication', set(['Bank of Communication'])),
 ('Communications', set(['Bank of Communications'])),
 ('Community', set(['Anyuanli Community', 'Huixinli Community'])),
 ('Compound',
  set(['Diplomatic Residence Compound', 'Russian Embassy Compound'])),
 ('Concept', set(['Three Gun Living Concept'])),
 ('Confucius', set(['Cafe Confucius'])),
 ('Continua', set(['Galeria Continua'])),
 ('CornerTower', set(['CornerTower'])),
 ('Corporation', set(['China Banknote Printing and Minting Corporation'])),
 ('Corridor', set(['Long Corridor'])),
 ('Costa', set(['Costa'])),
 ('Counter', set(['Burger Counter'])),
 ('County',
  set(['Dachang Hui Autonom

 ('Kuai)', set(['Bus 916 Express (Kuai)'])),
 ('LEGO', set(['LEGO'])),
 ('LIUFANG', set(['LIUFANG'])),
 ('LONGZE', set(['LONGZE'])),
 ('Lab',
  set(['Engineering Lab',
       'H.I.S. Science Lab',
       'High-Voltage Lab',
       'Wind Tunnel Lab'])),
 ('Laiguangying', set(['Laiguangying'])),
 ('Lake',
  set(['Houhai Lake',
       'Kunming Lake',
       'Nanhai Lake',
       'Qianhai Lake',
       'Qixing Lake',
       'West Lake',
       'Xihai Lake',
       'Yanqi Lake',
       'Zhonghai Lake'])),
 ("Laker's", set(["Laker's"])),
 ('Landgraf', set(['Der Landgraf'])),
 ('Landiaoshalong', set(['Landiaoshalong'])),
 ('Landmark', set(['Landmark'])),
 ('Langfang', set(['Langfang'])),
 ('Laojuntang', set(['Laojuntang'])),
 ('Laoshe', set(['Residence of Laoshe'])),
 ('Latvia', set(['Embassy of republic of Latvia'])),
 ('Law',
  set(['China University of Political Science and Law', 'School of Law'])),
 ('Lawson', set(['Lawson'])),
 ('Lee', set(['Mr. Lee'])),
 ('Lehongyuan', set(['Lehongyuan'

 ('Siqu', set(['Longjinyuan Siqu', 'Longyueyuan Siqu'])),
 ('Sister', set(['Yunnan Camillia Sister'])),
 ('Site', set(['Section Qiruizhi Government Former Site'])),
 ('Sitiao', set(['Caochang Sitiao', 'Xingjiekou Sitiao'])),
 ('Slope', set(['Bhodisattva Slope'])),
 ('Smoke', set(['The Big Smoke'])),
 ('Snacks', set(['Jiumen Snacks'])),
 (u'Sofa', set([u'Café de Sofa'])),
 ('Soho', set(['The Galaxy Soho'])),
 ('Songjiazhuang', set(['Songjiazhuang'])),
 ('Songyu', set(['Songyu'])),
 ('Songzhuangcun', set(['Songzhuangcun'])),
 ('Soshow', set(['Soshow'])),
 ('Source', set(['Water Source'])),
 ('South',
  set(['Baishiqiao South',
       'Beijing South',
       'Chaoyangmen South',
       'Ciqu South',
       'Fengyiqiao South',
       'GAOMIDIAN South',
       'Gaomidian South',
       'Houhai South',
       'LISHUIQIAO South',
       'Lishuiqiao South',
       'Outer Xizimen Rd South',
       'Qianhai South',
       'Tiantongyuan South',
       'Wangjing South',
       'Wenhuiyuan South',


 ('cuisin', set(['canton cuisin'])),
 ('dashanzilukoudong', set(['dashanzilukoudong'])),
 ('di', set(['cao chang di'])),
 ('dongfengqiaodong', set(['dongfengqiaodong'])),
 ('donglu', set(['Wenhuayuan donglu'])),
 ('dragon', set(['Beijing saga youth hostel. Happy dragon'])),
 (u'déjeuner)', set([u'wudaoku (quartier étudiant/pause déjeuner)'])),
 ('east', set(['east'])),
 ('eleven', set(['seven eleven'])),
 ('entry)', set(['ghost street (entry)'])),
 ('equipment', set(['Beijing photography equipment'])),
 ('erlongluxijie', set(['erlongluxijie'])),
 ('erun360.com', set(['erun360.com'])),
 ('exchange', set(['money exchange'])),
 ('expressway', set(['Jingtai expressway'])),
 ('farm', set(['Super farm'])),
 ('field', set(['High school soccer field'])),
 ('fine', set(['beijing fine'])),
 ('flore', set(['Cafe de flore'])),
 ('food', set(['Ghost street food', 'Very good local food'])),
 ('friend', set(['friend'])),
 ('fung', set(['Dumpling restaurant din tai fung'])),
 ('gallery', set(['intelli

In [97]:
# 根据审查数据完善mapping
mapping = {
    'Str': 'Street',
    'St': 'Street',
    'street': 'Street',
    'Rd': 'Road',
    'road': 'Road',
    'Lu': 'Road',
    'lu': 'Road',
    'Bld': 'Buiding',
    'Bldg': 'Buiding'
}

#### 修正name:en并生成csv文件

In [99]:
OSM_PATH = "beijing_china.osm"

NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

def update_name(name, mapping):
    """将地址名以mapping键值结尾修改为mapping对应的value"""
    name_to_be_fixed = mapping.keys()
    for word in name_to_be_fixed:
        if name.endswith(word):
            name = name.replace(word, mapping[word])
    return name

def fix_name(street_name, mapping):
    """匹配数据集中结尾可能不规范的地址名称，并进行更新"""
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            update_name(street_name, mapping)
            
def shape_tag(el, tag):
    """按SQL格式对一级tag标签进行处理"""
    tg = {
        'id'   : el.attrib['id'],
        'key'  : tag.attrib['k'],
        'value': tag.attrib['v'],
        'type' : 'regular'
    }
            
    if is_street_enname(tag):
        tg['value'] = fix_name(tag.attrib['v'], mapping)
        
    if LOWER_COLON.match(tg['key']):
        tg['type'], _, tg['key'] = tg['key'].partition(':')
        
    return tg

def shape_way_node(el, i, nd):
    """按SQL格式对way标签下的node进行处理"""
    return {
        'id'      : el.attrib['id'],
        'node_id' : nd.attrib['ref'],
        'position': i
    }

def shape_element(el, node_attr_fields = NODE_FIELDS, way_attr_fields = WAY_FIELDS):
    """Clean and shape node or way XML element to Python dict"""
    tags = [shape_tag(el,t) for t in el.iter('tag')]
    if el.tag == 'node':
        node_attribs = {f: el.attrib[f] for f in node_attr_fields}
        return {'node': node_attribs, 'node_tags': tags}
    elif el.tag == 'way':
        way_attribs = {f: el.attrib[f] for f in way_attr_fields}
        way_nodes = [shape_way_node(el, i, nd)
                     for i, nd
                     in enumerate(el.iter('nd'))]
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}
    
def get_element(osm_file, tags = ('node', 'way', 'relation')):
    """Yield element if it is the right type of tag"""
    context = ET.iterparse(osm_file, events=("start", "end"))
    _, root = next(context)
    for event, elem in context:
        if event == "end" and elem.tag in tags:
            yield elem
            root.clear()
            
class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input"""
    def writerow(self, row):
        super(UnicodeDictWriter, self).writerow({
            k:(v.encode('utf-8') if isinstance(v, unicode) else v) for k, v in row.iteritems()
        })
        
    def writerows(self, rows):
        for row in rows:
            self.writerow(row)
            
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s)"""    
    with codecs.open(NODES_PATH, 'w') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'w') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:
                
        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)
        
        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()
        
        for element in get_element(file_in, tags = ('node', 'way')):
            el = shape_element(element)
            if el:
                if element.tag == 'node':
                    nodes_writer.writerow(el['node'])
                    node_tags_writer.writerows(el['node_tags'])
                elif element.tag == 'way':
                    ways_writer.writerow(el['way'])
                    way_nodes_writer.writerows(el['way_nodes'])
                    way_tags_writer.writerows(el['way_tags'])

if __name__ == '__main__':
    process_map(OSM_PATH, validate = False)