# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [3]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [5]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

xml.etree.ElementTree.ElementTree

In [6]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [5]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [24]:
document = ET.parse( './data/mondial_database.xml' )
country="";
mortality=0;
mortalityList=[];

for element in document.iterfind('country'):
    country=element.find('name').text
    node=element.find('infant_mortality')
    if node != None:
        mortality=float(node.text)
        mortalityList.append((country,mortality))
sortedCountry_by_second = sorted(mortalityList, key=lambda tup: tup[1])[0:10]
print(sortedCountry_by_second)


[('Monaco', 1.81), ('Japan', 2.13), ('Norway', 2.48), ('Bermuda', 2.48), ('Singapore', 2.53), ('Sweden', 2.6), ('Czech Republic', 2.63), ('Hong Kong', 2.73), ('Macao', 3.13), ('Iceland', 3.15)]


In [40]:
city=""
population=0;
populationList=[];

for element in document.iterfind('country'):
    for subelement in element.getiterator('city'):
        city=subelement.find('name').text
        for pop in subelement.getiterator('population'):
            if pop.attrib['year']=='2011':
                population=float(pop.text)
                populationList.append((element.find('name').text+'-'+city,population))
sortedCity_by_second = sorted(populationList, key=lambda tup: tup[1],reverse=True)[0:10]
print(sortedCity_by_second)

[('India-Mumbai', 12442373.0), ('India-Delhi', 11034555.0), ('India-Bangalore', 8443675.0), ('United Kingdom-London', 8250205.0), ('Iran-Tehran', 8154051.0), ('Bangladesh-Dhaka', 7423137.0), ('India-Hyderabad', 6731790.0), ('India-Ahmadabad', 5577940.0), ('Angola-Luanda', 5000000.0), ('India-Chennai', 4646732.0)]


In [67]:
ethnicGroupDict={}
ethnicName="";
for element in document.iterfind('country'):
    country=element.find('name').text
    population=0;
    pct=0;
    for pop in element.getiterator('population'):
        if pop.attrib['year']=='2011':
            population=float(pop.text)
    for ethGroup in element.getiterator('ethnicgroup'):
        pct = float(ethGroup.attrib['percentage'])
        ethnicName = ethGroup.text
        if ethnicName in ethnicGroupDict:
           ethnicGroupDict[ethnicName] = ethnicGroupDict[ethnicName] + pct * population
        else:
           ethnicGroupDict[ethnicName] = pct * population
ethnicGroupDictSort=sorted(ethnicGroupDict.items(), key=lambda tup: tup[1],reverse=True)[0:10]
print(ethnicGroupDictSort)

[('Polish', 3760435896.3), ('Chinese', 735730628.0), ('Arab', 733887960.0), ('Indo-Aryan', 726213024.0), ('African', 443165508.7), ('Dravidian', 252157300.0), ('Slovene', 186602536.0), ('Ovimbundu', 185000000.0), ('Hungarian', 163970427.4), ('Kimbundu', 125000000.0)]


In [115]:
#long river
riverName=""
riverLength=0;
riverList=[]
for element in document.iterfind('river'):
    riverName= element.find('name').text
    if element.find('length') != None:
        riverLength= float(element.find('length').text)
    for located in element.getiterator('located'):
        country= located.attrib['country']
    riverList.append((country+"-"+riverName,riverLength))
sortedRiver = sorted(riverList, key=lambda tup: tup[1],reverse=True)[0]
print(sortedRiver)

('BR-Amazonas', 6448.0)


In [118]:
#largest lake
lakeName=""
lakeArea=0.0;
lakeList=[]
for element in document.iterfind('lake'):
    lakeName= element.find('name').text
    if element.find('area') != None:
        lakeArea= float(element.find('area').text)
    for located in element.getiterator('located'):
        country= located.attrib['country']
    lakeList.append((country+"-"+lakeName,lakeArea))
sortedLake = sorted(lakeList, key=lambda tup: tup[1],reverse=True)[0]
print(sortedLake)

('TM-Caspian Sea', 386400.0)


In [119]:
#airport highest elevation
airportName=""
airportElevation=0.0;
airportList=[]
for element in document.iterfind('airport'):
    airportName = element.attrib['country']+"-"+element.attrib['iatacode']
    if element.find('elevation').text != None:
       airportElevation = float(element.find('elevation').text)
       airportList.append((airportName,airportElevation))
sortedAirport = sorted(airportList, key=lambda tup: tup[1],reverse=True)[0]
print(sortedAirport)

('BOL-LPB', 4063.0)
