# Statistical overview of the chosen dataset from the selected OSMF (OpenStreetMap file) - 'Meddersheim and surrounding area':

0. Size of original file
1. Number of nodes, ways and other data
2. Number of unique users 

In [15]:
# -*- coding: utf-8 -*-
# get original file size from exported file "Meddersheim_Bad-Muenster_Bad-Kreuznach_original"
# get number of data tags from exported file "Meddersheim_Bad-Muenster_Bad-Kreuznach_original"
# get number of unique users for this dataset
'''
Created on Dec 6, 2015

@author: anita
'''

import xml.etree.ElementTree as ET
import pprint
import os
from stat import *

# path of the exported OSM XML file that is passed to read_file() below
street_map_file = "Meddersheim_Bad-Muenster_Bad-Kreuznach_original"
    
def read_file(street_map_file):
    """Print and return summary statistics for an OSM XML export.

    Reports the file size in MB, how often each XML tag occurs and how many
    distinct ``uid`` attribute values appear in the file.

    Args:
        street_map_file: Path of the OSM XML file to analyse.

    Returns:
        A ``(dic_tag, users)`` tuple: ``dic_tag`` maps each XML tag name to
        its number of occurrences, ``users`` is the set of distinct ``uid``
        values found on any element.
    """
    dic_tag = {}
    users = set()
    # st_size is in bytes; keep it as a float so the MB division is exact
    file_orig_size = float(os.stat(street_map_file).st_size)
    for event, element in ET.iterparse(street_map_file):
        # get user IDs (a set de-duplicates while parsing)
        uid = element.get('uid')
        if uid is not None:
            users.add(uid)
        # count xml-tags; dict.get() replaces has_key(), which Python 3 removed
        dic_tag[element.tag] = dic_tag.get(element.tag, 0) + 1
        # the element is finished ("end" event), so drop its content instead
        # of keeping the whole document tree in memory
        element.clear()
    pprint.pprint("0. Size of original file from OSM [MB]:")
    pprint.pprint(file_orig_size/1000000) #get size of file in [MB]
    pprint.pprint("1. Number of nodes, ways and other data:")
    pprint.pprint(dic_tag)
    pprint.pprint("2. Number if unique users:")
    pprint.pprint(len(users))
    return dic_tag, users

# print the statistics only when this code is executed as the main program
if __name__ == '__main__':
    read_file(street_map_file)

'Size of original file from OSM [MB]:'
50.786579
'Number of nodes, ways and other data:'
{'bounds': 1,
 'member': 24704,
 'meta': 1,
 'nd': 291833,
 'node': 218983,
 'note': 1,
 'osm': 1,
 'relation': 682,
 'tag': 94015,
 'way': 35578}
'Number if unique users:'
484


4. Call query for getting wineries from OpenStreetMap-data
5. Number of chosen types of nodes

In [3]:
import pprint
from pymongo import MongoClient

def get_db():
    """Connect to the local MongoDB server and return the ``test`` database.

    Returns:
        The ``test`` database handle of the client created for
        ``localhost:27017``.
    """
    client = MongoClient('localhost', 27017)
    # the collection itself is looked up by the callers, so no unused
    # collection handle is created here
    return client['test']

#getting wineries 
def range_query():
    """Return the filter document that selects entries tagged as shop 'winery'."""
    return {"tag.shop": "winery"}
    
#getting types of nodes  
def make_pipeline():
    """Build the aggregation pipeline that counts amenity types of nodes.

    Returns:
        A list of three pipeline stages: match documents of type ``node``
        that have a ``tag.amenity`` value, group them by that value while
        counting, and sort the groups by count in descending order.
    """
    # The previous $match literal listed the key "type" twice. In a dict
    # literal the later entry wins, so {"$ne": None} replaced "node" and the
    # restriction to nodes was silently lost. Only one "type" entry is kept.
    #
    # For checking which openGeoDB type is the right one, the same stages can
    # be used with "openGeoDB.type" in place of "tag.amenity".
    return [
        {"$match": {"type": "node", "tag.amenity": {"$ne": None}}},
        {"$group": {"_id": "$tag.amenity", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
    ]

def osm_sources(db, pipeline):
    """Run *pipeline* on the ``P3_Meddersheim_Germany`` collection of *db*.

    Returns:
        The documents produced by the aggregation, collected in a list.
    """
    collection = db.P3_Meddersheim_Germany
    return list(collection.aggregate(pipeline))

# Query the collection twice: once for winery documents, once for the
# amenity counts, and pretty-print both results.
if __name__ == '__main__':
    #read_file(street_map_file)
    db = get_db()
    pipeline = make_pipeline()
    query = range_query()
    # documents matching the winery filter; iterated and printed below
    result_id = db.P3_Meddersheim_Germany.find(query)
    # aggregation result as a list (see osm_sources)
    result = osm_sources(db, pipeline)
    for r in result_id:
        pprint.pprint(r)
    pprint.pprint(result)

{u'_id': ObjectId('5686ecf1cf4e09e2ffdb0808'),
 u'address': {u'city': u'Monzingen',
              u'country': u'DE',
              u'housenumber': u'23',
              u'postcode': u'55569',
              u'street': u'Soonwaldstra\xdfe'},
 u'contact': {u'fax': u'+49 6751 94705', u'phone': u'+49 6751 3847'},
 u'created': {u'changeset': u'13478987',
              u'timestamp': u'2012-10-13T13:23:36Z',
              u'uid': u'39424',
              u'user': u'atressel',
              u'version': u'7'},
 u'id': u'385151392',
 u'pos': [49.8019191, 7.5910752],
 u'tag': {u'landuse': u'farmyard',
          u'name': u'Weingut Jaeger',
          u'shop': u'winery',
          u'tourism': u'wine_cellar'},
 u'type': u'node'}
{u'_id': ObjectId('5686ecf1cf4e09e2ffdb2fa2'),
 u'created': {u'changeset': u'4214878',
              u'timestamp': u'2010-03-23T18:51:38Z',
              u'uid': u'234214',
              u'user': u'Anubis85',
              u'version': u'1'},
 u'id': u'672850551',
 u'pos': [49.82

# Problems encountered in your map

Encode the original XML data with 'utf8' so that German characters stay readable. The original py-file also declares '-*- coding: utf-8 -*-' at the beginning of the source file. For each element, 'shape_element' is called first to re-shape it; the re-shaped data is then written to a file in JSON format.

In [None]:
 # NOTE(review): json, shape_element, street_map_file and json_data_array are
 # not defined in this excerpt; they have to exist before it runs.
 for event, element in ET.iterparse(street_map_file):
    #shape element (see below for what is done)
    el = shape_element(element)
    if el != {}:
        # ensure_ascii = False keeps german characters readable in the output
        json_data = json.dumps(el, ensure_ascii = False, indent = 2).encode('utf8') +"\n"
        json_data_array.append(json_data)
 # Write the re-shaped data to the json-file once, after the loop. Writing
 # inside the loop re-created the file and rewrote every collected entry for
 # each parsed element. The with-block also closes the file on errors.
 with open("P3_Meddersheim_Germany",'w') as f:
    f.writelines(json_data_array)

Separate the 'addr:'-tag by splitting at the separator colon. The method is called from 'def shape_element(element):'

In [None]:
#separate definition in case 'addr:' is available in different data types
def address_element(child_name, child):
    """Store the value of a simple 'addr:' tag in ``add_address``.

    The 'addr:' text is removed from *child_name* and the rest is split at
    the colon. Only keys that consist of a single part after that are stored
    (with the 'v' attribute of *child* as value); keys with further colons
    are ignored.

    NOTE(review): ``add_address`` is not created in this function, so it is
    looked up in an outer scope - confirm where it is defined and whether it
    is reset for every element.

    Returns:
        The ``add_address`` dictionary.
    """
    parts = child_name.replace("addr:", "").split(":")
    if len(parts) < 2:
        key = parts[0]
        add_address[key] = child.get('v')
    return add_address

Define the arrays and dictionaries needed for further re-shaping, splitting and organizing the original XML data and for preparing it to be written in JSON format. Finally, set the type from the tag. I used this style of coding to try out two different ways of re-shaping data. This way is used only for 'addr:' tags. Below I used the separator colon for shaping the 'tag' tags. The main idea was to separate the 'contact:' data into its own dictionary.

In [None]:
def shape_element(element):
    """Re-shape one parsed XML element into a dictionary for the JSON export.

    Args:
        element: The XML element to re-shape; its tag, attributes and child
            elements are read.

    Returns:
        The dictionary ``node`` that is filled step by step below.
    """
    # result dictionary and the helper containers that are filled while the
    # element and its children are processed
    node = {}
    openGeoDB = {}
    auto_update = []
    is_in = []
    position = []
    node_refs = []
    relation_member_attrib = []
    relation_member = []
    add_created = {}
    tag_sep = []
    tag_sep_dict = {}
    relation_tag_attrib = {}
    child_name = None
    #set type of parsed data based on tag
    # only these seven tags get a "type" entry; other tags get none
    if element.tag =="bounds" or element.tag == "note" or element.tag == "meta" or element.tag == "osm" or\
    element.tag == "node" or element.tag == "way" or element.tag == "relation":
        node["type"] = element.tag

For completeness, also get the information from the tags in the XML file which appear only once.

In [None]:
# get information from tags which only appear once
    # the entry is stored under the tag name itself
    if element.tag =="bounds" or element.tag == "note" or element.tag == "meta" or element.tag == "osm":
        if element.tag == "note":
            # for <note> the element text is stored
            node[element.tag] = element.text
        else:
            # for the other three tags the attribute mapping is stored
            node[element.tag] = element.attrib
            #print node

Ensure to get all 'openGeoDB' tags, even those with small letters. Prepare for later separation of this tag and saving in own dictionary 'openGeoDB'.

In [None]:
#shape data
    # compile the pattern once before the loop, it is the same for every child;
    # IGNORECASE so that keys written in small letters are matched as well
    p = re.compile('openGeoDB:', re.IGNORECASE)
    for child in element:
        # the 'k' attribute of the child; None if the child has no 'k'
        child_name = child.get('k')

Put all data from 'is_in' into an array by splitting at the separator comma. There is another 'is_in' field within 'openGeoDB'. It is also put into an array and exported to JSON format as well.

In [None]:
#put data in an array, because it is a list of descriptive location
        # split the comma-separated value into a list and store it under 'is_in'
        if child_name == "is_in":
            is_in = child.get('v').split(',')
            node[child_name]=is_in       

The XML data from 'auto_update' is split at the separator comma and stored in the array 'auto_update'. The same is done for 'is_in', and both are stored within the dictionary 'openGeoDB'.
The position data is taken from the longitude and latitude information and put together in the array 'position'. It is also stored in the dictionary 'openGeoDB' within the dictionary 'node'.

In [None]:
# get all data which contain 'openGeoDB' also in small letters
        # keys containing 'openGeoDB:' (any letter case) are collected in the
        # dictionary openGeoDB; the matched text is removed from the key first
        elif child_name != None and re.search(p,child_name) != None:
            child_name = p.sub("", child_name)
            if child_name == 'auto_update':
                # comma-separated value stored as a list
                auto_update = child.get('v').split(',')
                openGeoDB[child_name]=auto_update
            elif child_name == "is_in":
                # comma-separated value stored as a list
                is_in = child.get('v').split(',')
                openGeoDB[child_name]=is_in
            elif child_name == 'lat':
                # lat and lon are appended in the order in which the tags
                # appear, so the order inside 'position' follows the input
                position.append(float(child.get('v')))
            elif child_name == 'lon':
                position.append(float(child.get('v')))
            else:
                openGeoDB[child_name] = child.get('v')
            # put latitude and longitude data in an array which are inside 'openGeoDB' data
            # (executed for every matching child, also while 'position' is still empty)
            openGeoDB['position'] = position
            node['openGeoDB'] = openGeoDB
            #print node

Put all reference information from separate xml-tags together in one array 'node_refs'.

In [None]:
#check all data which contain a 'ref'-tag
        # collect the 'ref' value of every child that has one in node_refs
        # NOTE(review): this test looks only at the 'ref' attribute, not at
        # child.tag, so a child with a 'ref' is handled here before the later
        # `child.tag == "member"` branch is tried - confirm that is intended.
        elif child.get('ref') != None:
                node_refs.append(child.get('ref'))
                node['node_refs'] = node_refs       

Here 'def address_element(child_name, child)' is called to re-shape the 'addr:' tags. Furthermore, the other information is split at the separator colon, and the value before the first separator and the value after the last separator are used for defining the information in JSON format.
Otherwise the tag's 'k' (key) and 'v' (value) information is used for creating key-value pairs in the key-value dictionary within the 'tag' dictionary. 'tag' is kept as a separate dictionary because at least 'type' is also available as a key in the node dictionary.
The main idea was to separate the 'contact:' data into its own dictionary, to reach email and phone information more easily from the contact dictionary.

In [None]:
#shape all data in 'tag'-tag because some names are the same like in nodes => type, so cluster 'tag' is kept
        elif child.tag == "tag":
            child_name = child.get('k')
            #shape address data in dictionary address
            if child_name != None and child.get('k').startswith("addr:"):
                add_address = address_element(child_name, child)
                node["address"]=add_address
            #shape all other data which use ':' as a separator depending on amount of this separator
            elif re.search(':', child_name) != None:
                tag_sep = child_name.split(':')
                i.append(len(tag_sep))
                if len(tag_sep) == 2:
                    tag_sep_dict[tag_sep[1]] = child.get('v') 
                    node[tag_sep[0]] = tag_sep_dict
                elif len(tag_sep) == 3:
                    tag_sep_dict[tag_sep[2]] = child.get('v') 
                    node[tag_sep[0]] = tag_sep_dict
                elif len(tag_sep) == 4:
                    tag_sep_dict[tag_sep[3]] = child.get('v') 
                    node[tag_sep[0]] = tag_sep_dict   
            else:
                relation_tag_attrib[child.get('k')] = child.get('v')
                node['tag'] = relation_tag_attrib         

Get member data and shape for json-format.

In [None]:
#shape 'member'-tag data in dicitionary 'member' inside dictionary node
        # each member is stored as a (type, ref, role) tuple in the list 'member'
        # NOTE(review): an earlier branch of this chain already takes every
        # child that has a 'ref' attribute - confirm this branch is reached.
        elif child.tag == "member":
                #print child.get('type')
                relation_member_attrib = child.get('type'), child.get('ref'), child.get('role')
                relation_member.append(relation_member_attrib)
                node['member'] = relation_member
        else:
            # fallback: any other child with a 'k' attribute is stored directly
            # in node as key-value pair
            if child_name != None:
                node[child_name]=child.get('v')

Put header-information together in a dictionary 'created'. Like done in lesson 6 - Case Study.

In [None]:
#shape tag header-data in 'CREATED' dictionary, see lesson 6 - Case Study - OpenStreetMap data
    # NOTE(review): CREATED is not defined in the visible code; presumably a
    # collection of attribute names - confirm its definition.
    for elem in element.attrib:
        for entry in CREATED:
            # attribute names listed in CREATED go into add_created
            if elem == entry and elem != None:
                if element.get(entry) != None:
                    add_created[elem] = element.get(entry)
            # 'id' is stored directly in node; this test sits inside the loop
            # over CREATED, so it is repeated for every non-matching entry
            elif elem == "id" and elem != None:
                if element.get(elem) != None:
                    node[elem] = element.get(elem) 

Shape longitude and latitude data in node dictionary in an array 'position'. These data define the position of each node.

In [None]:
#shape position data in positin array for each data-element
    if element.get != None and element.get('lat') or element.get('lon'):
        position = []
        position.append(float(element.get('lat')))   
        position.append(float(element.get('lon')))
        node["pos"]= position
    if len(add_created) > 1:
        node["created"] = add_created
    return node

# Other ideas about the datasets

1. The field 'is_in' is available twice in the dictionary node and seems to contain the same data. The second time it is stored in node => openGeoDB => is_in dictionary and could be checked if those data are really the same compared to what is directly stored in node => is_in.
2. Independent from the tag-type I re-shaped and exported the xml-data in json-format with encoding utf8 for reading german characters. The encoding for the Mongo DB is not yet implemented. But could be done later.
3. The contact data could be grouped according to the 'amenity or shop'-type and contacted for services (e. g. winery services and equipment, vineyard or restaurants support, holiday and tourism service).