# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
# Parse the example XML file into an ElementTree object.
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [3]:
# Walk the direct children of the root element and show each one's <name> text.
tree_root = document_tree.getroot()
for entry in tree_root:
    name_node = entry.find('name')
    print(name_node.text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# Echo the parsed object so the notebook displays its repr.
document_tree

<xml.etree.ElementTree.ElementTree at 0x4bf4eb8>

In [5]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    # iter() visits every descendant <city> of the country, not only its
    # direct children. It replaces getiterator(), which is deprecated.
    city_names = [subelement.find('name').text for subelement in element.iter('city')]
    # join() puts the separator only between names, so there is no trailing
    # ', ' to slice off afterwards.
    print('* ' + element.find('name').text + ': ' + ', '.join(city_names))

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [6]:
# Parse the exercise data file into an ElementTree object.
document = ET.parse( './data/mondial_database.xml' )

In [7]:
# Root element of the parsed document; the exercise queries search from here.
root = document.getroot()

In [8]:
# Exercise 1: ten countries with the lowest infant mortality rates.

# Keep only the countries that report an infant mortality rate; the others
# cannot be ranked. find() returns None when the child is missing, and
# "is not None" is the reliable test for that on an Element.
countries = [country for country in root.findall('country')
             if country.find('infant_mortality') is not None]

# Rank ascending by the numeric rate (the XML stores it as text).
countries = sorted(countries, key=lambda country: float(country.find('infant_mortality').text))

# Slice rather than index 0..9 so that fewer than ten ranked countries are
# printed as-is instead of raising IndexError.
for country in countries[:10]:
    print(country.find('name').text + ': ' + country.find('infant_mortality').text)


Monaco: 1.81
Japan: 2.13
Norway: 2.48
Bermuda: 2.48
Singapore: 2.53
Sweden: 2.6
Czech Republic: 2.63
Hong Kong: 2.73
Macao: 3.13
Iceland: 3.15


In [9]:
# Exercise 2: ten cities with the largest population.

# Collect [name, population] pairs for every city that reports a population.
cities = []
for city in root.iter('city'):
    # Consider every <population> figure found beneath the city and keep
    # the highest one.
    figures = [int(node.text) for node in city.iter('population')]
    largest = max(figures) if figures else 0
    if largest > 0:  # skip cities with no (or only zero) population data
        cities.append([city.find('name').text, largest])

# Sort descending by population.
cities = sorted(cities, key=lambda entry: entry[1], reverse=True)

# Slice rather than index 0..9 so a short list does not raise IndexError.
for name, population in cities[:10]:
    print(name + ': ' + str(population))

Shanghai: 22315474
Istanbul: 13710512
Delhi: 12877470
Mumbai: 12442373
Moskva: 11979529
Beijing: 11716620
São Paulo: 11152344
Tianjin: 11090314
Guangzhou: 11071424
Shenzhen: 10358381


In [10]:
# Exercise 3: ten ethnic groups with the largest overall populations.
ethgrp = {}  # ethnic group name -> estimated head count summed over countries

for country in root.iter('country'):
    # Pick the population figure with the most recent 'year' attribute.
    # yearpop is reset for every country so that a country without any
    # <population> element can never reuse the previous country's figure.
    yearpop = None
    year = 0
    for ppl in country.findall('population'):
        if int(ppl.attrib['year']) > year:
            year = int(ppl.attrib['year'])
            yearpop = ppl
    if yearpop is None:
        continue  # no population figure to scale the percentages by

    total = int(yearpop.text)
    for egp in country.findall('ethnicgroup'):
        # Head count of the group in this country: percentage of the latest
        # population, truncated to a whole number.
        members = int(float(egp.attrib['percentage']) / 100 * total)
        ethgrp[egp.text] = ethgrp.get(egp.text, 0) + members

# Sort descending by estimated head count.
eths = sorted(ethgrp.items(), key=lambda item: item[1], reverse=True)

# Slice rather than index 0..9 so a short list does not raise IndexError.
for name, count in eths[:10]:
    print(name + ': ' + str(count))

# Known limitation: overlapping sets and subsets are not reconciled. "Han
# Chinese" and "Chinese" are counted separately, as are "European" and
# French/Italian/etc. A larger project would want to address that.

Han Chinese: 1245058800
Indo-Aryan: 871815583
European: 494872201
African: 318325104
Dravidian: 302713744
Mestizo: 157734349
Bengali: 146776916
Russian: 131856989
Japanese: 126534212
Malay: 121993548


In [11]:
# Exercise 4: for each category, find the largest length/area/elevation
# listed, keep the element it belongs to, and print it.

def _largest_by(elements, tag):
    """Return ``(element, value)`` for the largest positive ``<tag>`` number.

    elements -- iterable of XML elements to scan
    tag      -- name of the child element holding the numeric text

    Elements without that child, or whose child has no text, are skipped.
    Returns ``(None, 0)`` when nothing qualifies; on ties the first element
    seen wins.
    """
    best, best_value = None, 0
    for element in elements:
        node = element.find(tag)
        if node is None or node.text is None:
            continue
        # float() throughout, so decimal figures are accepted and the
        # comparison uses the same type as the stored maximum.
        value = float(node.text)
        if value > best_value:
            best, best_value = element, value
    return best, best_value


longestriver, length = _largest_by(root.findall('river'), 'length')
biggestlake, area = _largest_by(root.findall('lake'), 'area')
highestport, elevation = _largest_by(root.findall('airport'), 'elevation')

# Each line shows: name, the winning figure as written in the XML, and a
# country code. The code comes from the river's <estuary>, the lake's
# <located> child, and the airport's own 'country' attribute.
if longestriver is not None:
    print('Longest River: ' + longestriver.find('name').text + ' ' +
          longestriver.find('length').text + ' ' +
          longestriver.find('estuary').attrib['country'])

if biggestlake is not None:
    print('Biggest Lake: ' + biggestlake.find('name').text + ' ' +
          biggestlake.find('area').text + ' ' +
          biggestlake.find('located').attrib['country'])

if highestport is not None:
    print('Highest Airport: ' + highestport.find('name').text + ' ' +
          highestport.find('elevation').text + ' ' +
          highestport.attrib['country'])

Longest River: Amazonas 6448 BR
Biggest Lake: Caspian Sea 386400 R
Highest Airport: El Alto Intl 4063 BOL


In [12]:
# Scratch check: comparing a float with an int.
1.1>1

True