# 提取数据样本,方便研究

In [1]:
%%time
import xml.etree.cElementTree as ET  # Use cElementTree or lxml if too slow
# import lxml.etree.ElementTree as ET

# Input: the full OSM XML extract that the sample is drawn from.
OSM_FILE = "shanghai_china.osm"  # Replace this with your osm file
# Output: the down-sampled OSM XML file produced by the script below.
SAMPLE_FILE = "sample.osm"

# Sampling stride used by the writer loop (`i % k == 0`).
k = 100 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way' )):
    """Lazily yield each fully parsed element whose tag name is in ``tags``.

    The file is parsed incrementally with ``ET.iterparse``. The first event
    delivers the document root; after every yielded element the root is
    cleared so the elements parsed so far are detached from the tree.

    Args:
        osm_file: Path or file object accepted by ``ET.iterparse``.
        tags: Tag names to yield; anything else is skipped.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    parser = iter(ET.iterparse(osm_file, events=('start', 'end')))
    root = next(parser)[1]
    for kind, node in parser:
        # Only completed elements of the requested types are of interest.
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()


# Write a down-sampled copy of OSM_FILE to SAMPLE_FILE, keeping every k-th
# element yielded by get_element().
#
# The output is opened in binary mode, so everything written must be bytes.
# ET.tostring(..., encoding='utf-8') already returns an encoded byte string;
# the fixed XML scaffolding uses bytes literals so the same code works on both
# Python 2 (where b'...' is a plain str) and Python 3 (where writing a str to
# a binary file raises TypeError).
with open(SAMPLE_FILE, 'wb') as output:
    output.write(b'<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write(b'<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write(b'</osm>')

CPU times: user 30.4 s, sys: 592 ms, total: 30.9 s
Wall time: 31.6 s


In [6]:
%%time
import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET

import cerberus

# import schema

# Input OSM XML extract processed by this cell.
OSM_PATH = "shanghai_china.osm"

# CSV files produced by process_map().
NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

# Keys that start with "<lowercase/underscore run>:<lowercase/underscore run>".
# Not used by shape_element(); kept so existing references keep working.
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
# Characters that make a tag key unusable; a key containing any of them is dropped.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# SCHEMA = schema.schema
# print SCHEMA

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']


def _split_key(raw_key, default_tag_type):
    """Split ``prefix:rest`` at the first colon and return ``(key, type)``."""
    prefix, colon, rest = raw_key.partition(':')
    if colon:
        return rest, prefix
    return raw_key, default_tag_type


def _shape_tags(element, problem_chars, default_tag_type):
    """Build the tag dicts for ``element``, dropping keys with problem characters."""
    shaped = []
    for child in element.iter('tag'):
        raw_key = child.attrib['k']
        # search(), not match(): a problematic character anywhere in the key
        # disqualifies the tag, not only one in the first position.
        if problem_chars.search(raw_key):
            print(raw_key)  # report the discarded key
            continue
        key, tag_type = _split_key(raw_key, default_tag_type)
        shaped.append({
            'id': element.attrib['id'],
            'key': key,
            'value': child.attrib['v'],
            'type': tag_type,
        })
    return shaped


def _apply_localized_name(tags, default_tag_type):
    """Replace the plain ``name`` value with ``name:zh``, else ``name:zh_pinyin``."""
    chinese_name = None
    pinyin_name = None
    # First pass: pick up the Chinese / pinyin variants of the name, if any.
    # Only the "name:" prefix counts, so e.g. "old_name:zh" is not mistaken
    # for the current name.
    for tag in tags:
        if tag['type'] != 'name':
            continue
        if tag['key'] == 'zh':
            chinese_name = tag['value']
        elif tag['key'] == 'zh_pinyin':
            pinyin_name = tag['value']

    replacement = chinese_name if chinese_name is not None else pinyin_name
    if replacement is None:
        return

    # Second pass: overwrite the plain name tag only. Prefixed keys such as
    # "source:name" also end in "name" after splitting but are left untouched.
    for tag in tags:
        if tag['key'] == 'name' and tag['type'] == default_tag_type:
            tag['value'] = replacement


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict

    Args:
        element: A parsed ``node`` or ``way`` element.
        node_attr_fields: Attribute names copied from a ``node`` element.
        way_attr_fields: Attribute names copied from a ``way`` element.
        problem_chars: Compiled regex; a tag whose ``k`` attribute contains a
            match anywhere is discarded (and its key printed).
        default_tag_type: ``type`` assigned to tags whose key has no colon.

    Returns:
        ``{'node': attribs, 'node_tags': tags}`` for a node,
        ``{'way': attribs, 'way_nodes': refs, 'way_tags': tags}`` for a way,
        and ``None`` for any other element.

        Each tag is a dict with ``id``, ``key``, ``value`` and ``type``; a key
        such as ``addr:street`` is split at the first colon into type ``addr``
        and key ``street``. When a ``name:zh`` (or, failing that,
        ``name:zh_pinyin``) tag is present, its value replaces the value of
        the plain ``name`` tag.
    """
    if element.tag not in ('node', 'way'):
        return None

    # Secondary tags are handled the same way for both node and way elements.
    tags = _shape_tags(element, problem_chars, default_tag_type)
    _apply_localized_name(tags, default_tag_type)

    if element.tag == 'node':
        node_attribs = {name: value for name, value in element.attrib.items()
                        if name in node_attr_fields}
        return {'node': node_attribs, 'node_tags': tags}

    way_attribs = {name: value for name, value in element.attrib.items()
                   if name in way_attr_fields}
    # One row per <nd> reference, numbered in document order.
    way_nodes = [
        {'id': element.attrib['id'], 'node_id': nd.attrib['ref'], 'position': position}
        for position, nd in enumerate(element.iter('nd'))
    ]
    return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}


# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Stream every completed element of ``osm_file`` whose tag is in ``tags``.

    Parsing is incremental (``ET.iterparse``); the document root is taken from
    the first event and cleared after each yielded element.
    """
    events = ET.iterparse(osm_file, events=('start', 'end'))
    root = next(events)[1]
    for kind, node in events:
        # Skip 'start' events and element types that were not requested.
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()


# def validate_element(element, validator, schema=SCHEMA):
#     """Raise ValidationError if element does not match schema"""
#     if validator.validate(element, schema) is not True:
#         field, errors = next(validator.errors.iteritems())
#         message_string = "\nElement of type '{0}' has the following errors:\n{1}"
#         error_string = pprint.pformat(errors)
        
#         raise Exception(message_string.format(field, error_string))


class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input

    On Python 2 the csv module is byte-oriented, so ``unicode`` values are
    UTF-8 encoded before being written. On Python 3 csv writes text natively
    and rows are passed through unchanged.
    """

    # ``unicode`` only exists on Python 2; its absence means no encoding step
    # is needed (encoding there would write the bytes repr into the file).
    try:
        _text_type = unicode
    except NameError:
        _text_type = None

    def writerow(self, row):
        """Write one row dict, UTF-8 encoding text values where required."""
        text_type = self._text_type
        if text_type is not None:
            row = {
                k: (v.encode('utf-8') if isinstance(v, text_type) else v)
                for k, v in row.items()
            }
        return super(UnicodeDictWriter, self).writerow(row)

    def writerows(self, rows):
        """Write each row through writerow() so every row gets the same handling."""
        for row in rows:
            self.writerow(row)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Stream the OSM file and write the shaped nodes and ways to the CSV files.

    Args:
        file_in: OSM XML input handed to get_element().
        validate: Accepted for interface compatibility; the schema validation
            step is currently disabled, so the value is not consulted.
    """

    with codecs.open(NODES_PATH, 'w') as node_csv, \
         codecs.open(NODE_TAGS_PATH, 'w') as node_tag_csv, \
         codecs.open(WAYS_PATH, 'w') as way_csv, \
         codecs.open(WAY_NODES_PATH, 'w') as way_node_csv, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tag_csv:

        node_out = UnicodeDictWriter(node_csv, NODE_FIELDS)
        node_tag_out = UnicodeDictWriter(node_tag_csv, NODE_TAGS_FIELDS)
        way_out = UnicodeDictWriter(way_csv, WAY_FIELDS)
        way_node_out = UnicodeDictWriter(way_node_csv, WAY_NODES_FIELDS)
        way_tag_out = UnicodeDictWriter(way_tag_csv, WAY_TAGS_FIELDS)

        # Every CSV starts with its header row.
        for out in (node_out, node_tag_out, way_out, way_node_out, way_tag_out):
            out.writeheader()

        # Created for the validation hook, which is disabled at the moment.
        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            shaped = shape_element(element)
            if not shaped:
                continue
            # Disabled validation hook:
            #     if validate is True:
            #         validate_element(shaped, validator)

            if element.tag == 'node':
                node_out.writerow(shaped['node'])
                node_tag_out.writerows(shaped['node_tags'])
            elif element.tag == 'way':
                way_out.writerow(shaped['way'])
                way_node_out.writerows(shaped['way_nodes'])
                way_tag_out.writerows(shaped['way_tags'])



    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
# NOTE(review): the validation branch inside process_map is commented out, so
# `validate=True` currently has no effect on this run.
process_map(OSM_PATH, validate=True)

CPU times: user 2min 7s, sys: 17.1 s, total: 2min 24s
Wall time: 2min 26s


In [15]:
# Scratch check of the name-override logic on one hand-written tag list.
chinese_name = None
pinyin_name = None
tags = [
    {'key': 'name', 'type': 'regular', 'id': '26466690', 'value': u'\u739b\u96c5\u9152\u5427 Maya Pub'},
    {'key': 'amenity', 'type': 'regular', 'id': '26466690', 'value': 'pub'},
    {'key': 'en', 'type': 'name', 'id': '26466690', 'value': 'Maya'},
    {'key': 'zh', 'type': 'name', 'id': '26466690', 'value': u'\u739b\u96c5\u9152\u5427'},
    {'key': 'street', 'type': 'addr', 'id': '26466690', 'value': u'\u767d\u6c99\u6cc9'},
    {'key': 'zh_pinyin', 'type': 'name', 'id': '26466690', 'value': u'M\u01cey\u01ce Ji\u01d4b\u0101'},
    {'key': 'housenumber', 'type': 'addr', 'id': '26466690', 'value': '94'},
]

# First pass: remember the Chinese and pinyin names if they are present.
for tag in tags:
    if tag['key'] == 'zh':
        chinese_name = tag['value']
    elif tag['key'] == 'zh_pinyin':
        pinyin_name = tag['value']

# Second pass: overwrite the name value, preferring Chinese over pinyin.
for tag in tags:
    if tag['key'] == 'name':
        if chinese_name is not None:
            tag['value'] = chinese_name
        elif pinyin_name is not None:
            tag['value'] = pinyin_name
        print(tag['value'])

# A single formatted string prints the same text on Python 2 and Python 3
# (print(a, b) would print a tuple on Python 2).
print(u'%s %s' % (chinese_name, pinyin_name))

玛雅酒吧
玛雅酒吧 Mǎyǎ Jiǔbā
