# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [31]:
from xml.etree import ElementTree as ET
import pandas as pd

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [14]:
document_tree = ET.parse( 'mondial_database_less.xml' )
document_tree

<xml.etree.ElementTree.ElementTree at 0x191c5c5c4a8>

In [18]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

SyntaxError: invalid syntax (<ipython-input-18-71a7702f86c3>, line 3)

In [11]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

SyntaxError: invalid syntax (<ipython-input-11-6645883cfe43>, line 3)

****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [27]:

document = ET.parse( 'mondial_database.xml' )
for element in document.iterfind('country'):
        country_list=element.find('name')
        infant_mortality=element.find('infant_mortality')
        if infant_mortality!=None:
            print(country_list.text, infant_mortality.text)

Albania 13.19
Greece 4.78
Macedonia 7.9
Serbia 6.16
Andorra 3.69
France 3.31
Spain 3.33
Austria 4.16
Czech Republic 2.63
Germany 3.46
Hungary 5.09
Italy 3.31
Liechtenstein 4.33
Slovakia 5.35
Slovenia 4.04
Switzerland 3.73
Belarus 3.64
Latvia 7.91
Lithuania 6
Poland 6.19
Ukraine 8.1
Russia 7.08
Belgium 4.18
Luxembourg 4.28
Netherlands 3.66
Bosnia and Herzegovina 5.84
Croatia 5.87
Bulgaria 15.08
Romania 10.16
Turkey 21.43
Denmark 4.1
Estonia 6.7
Faroe Islands 5.71
Finland 3.36
Norway 2.48
Sweden 2.6
Monaco 1.81
Gibraltar 6.29
Guernsey 3.47
Iceland 3.15
Ireland 3.74
San Marino 4.52
Jersey 3.86
Malta 3.59
Isle of Man 4.17
Moldova 12.93
Portugal 4.48
United Kingdom 4.44
Afghanistan 117.23
China 14.79
Iran 39
Pakistan 57.48
Tajikistan 35.03
Turkmenistan 38.13
Uzbekistan 19.84
Armenia 13.97
Georgia 16.68
Azerbaijan 26.67
Bahrain 9.68
Bangladesh 45.67
Myanmar 44.91
India 43.19
Bhutan 37.89
Brunei 10.48
Malaysia 13.69
Laos 54.53
Thailand 9.86
Cambodia 51.36
Vietnam 18.99
Kazakhstan 21.61
North 

In [49]:
document = ET.parse( 'mondial_database.xml' )
root = document.getroot()

citylist = []
poplist = []
year = []

for element in root.iter('country'):
    for city in element.iter('city'):
        cities = city.find('name').text
        for pop in city.iter():
            if pop.tag == 'population':
                poplist.append(int(pop.text)) 
                citylist.append(cities) 
                year.append(int(pop.attrib.get('year')))         
                

PopulationFrame=pd.DataFrame({'City':citylist, 'Year':year, 'Population':poplist})
PopulationFrame.loc[PopulationFrame.Year == 2011].drop_duplicates(['City']).sort_values(['Year','Population'],ascending=False).head(10)


Unnamed: 0,City,Population,Year
4303,Mumbai,12442373,2011
4399,Delhi,11034555,2011
4280,Bangalore,8443675,2011
3235,London,8250205,2011
3944,Tehran,8154051,2011
4201,Dhaka,7423137,2011
4414,Hyderabad,6731790,2011
4263,Ahmadabad,5577940,2011
8782,Luanda,5000000,2011
4353,Chennai,4646732,2011


In [None]:
document = ET.parse( 'mondial_database.xml' )
d = {}
for country in document.iterfind('country'):

    if country.find('./ethnicgroup[1][@percentage]') is None:
        pass
    else:
        lastpop = int(country.find('./population[last()]').text)
        ethnic = country.find('./ethnicgroup[1]')
        ethnicname = ethnic.text
        ethicperc = float(ethnic.get('percentage'))/100
        print country.find('name').text, ethnicname
        print lastpop * ethicperc
        d[country.find('name').text, ethnicname] = lastpop * ethicperc  
sorted_d = sorted(d.items(), key=operator.itemgetter(1), reverse=True)[:10]
sorted_d

In [None]:
d = {}
for river in document.iterfind('river'):
    name = river.get('id')
    print name
    country = river.get('country')
    print country
    length = river.find('./length')
    if length is None:
        pass
    else:
        print float(length.text)
        d[name, country]=float(length.text)
    
sorted_d = sorted(d.items(), key=operator.itemgetter(1), reverse=True)[:1]
sorted_d
# [(('river-Amazonas', 'CO BR PE'), 6448.0)]

# name and country of b) largest lake
d = {}
for lake in document.iterfind('lake'):
    name = lake.get('id')
    print name
    country = lake.get('country')
    print country
    area = lake.find('./area')
    if area is None:
        pass
    else:
        print float(area.text)
        d[name, country]=float(area.text)

sorted_d = sorted(d.items(), key=operator.itemgetter(1), reverse=True)[:1]
sorted_d

# Caspian Sea
#[(('lake-KaspischesMeer', 'R AZ KAZ IR TM'), 386400.0)]

# name and country of c) airport at highest elevation
d = {}
for airport in document.iterfind('airport'):
    name = airport.get('iatacode')
    print name
    country = airport.get('country')
    print country
    elevation = airport.findtext('./elevation')
    print type(elevation)
    if elevation is None:
        pass
    elif elevation=='':
        pass
    else:
        print float(elevation)
        d[name, country]=float(elevation)

sorted_d = sorted(d.items(), key=operator.itemgetter(1), reverse=True)[:1]
sorted_d      
# El Alto Intl, Bolivia
# [(('LPB', 'BOL'), 4063.0)]