# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

****
## XML exercise

Using the data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [1]:
from xml.etree import ElementTree as ET

In [2]:
# Load the Mondial XML database once; every later cell queries this tree.
mondialPath = './data/mondial_database.xml'
document = ET.parse(mondialPath)

In [3]:
# Map each country name to its infant mortality rate.
# Countries without an <infant_mortality> child are left out of the dict.
infantMortalityDict = {}
for country in document.iterfind('country'):
    # iterfind with a bare tag only visits direct children; if a country had
    # several matching children, the last one wins.
    for rateNode in country.iterfind('infant_mortality'):
        infantMortalityDict[country.find('name').text] = float(rateNode.text)

In [4]:
# Rank (country, rate) pairs by rate, lowest first, and show the ten best names.
infantMortalityTuples = sorted(infantMortalityDict.items(), key=lambda pair: pair[1])
[name for name, _rate in infantMortalityTuples[:10]]

['Monaco',
 'Japan',
 'Norway',
 'Bermuda',
 'Singapore',
 'Sweden',
 'Czech Republic',
 'Hong Kong',
 'Macao',
 'Iceland']

In [5]:
# Map each city name to its most recent population figure.
#
# Each <city> may carry several <population year="..."> children; the value
# with the greatest year is kept.  Cities with no usable (or a zero)
# population are skipped.  Cities sharing a name overwrite one another, the
# one visited last winning.

# Baseline year for the "latest estimate" search.  Kept as a module-level name
# because other cells in this notebook may read it; it is never modified here.
maxYear = 0

cityPopulationDict = {}
for country in document.iterfind('country'):
    # Cities hang either directly off the country or inside one of its
    # provinces; visit the direct ones first, then the provincial ones.
    cities = country.findall('city') + country.findall('province/city')
    for city in cities:
        latestYear = maxYear
        latestPopulation = None

        # Use a dedicated loop variable so the accumulator is not clobbered,
        # and advance latestYear so that only newer estimates replace it.
        for record in city.iterfind('population'):
            year = int(record.attrib['year'])
            if year > latestYear:
                latestYear = year
                latestPopulation = int(record.text)

        if latestPopulation:
            cityPopulationDict[city.find('name').text] = latestPopulation
        
        

In [6]:
# Rank (city, population) pairs, most populous first, and show the top ten names.
cityPopulationTuples = list(cityPopulationDict.items())
cityPopulationTuples.sort(key=lambda pair: pair[1], reverse=True)
[name for name, _count in cityPopulationTuples[:10]]

['Shanghai',
 'Istanbul',
 'Mumbai',
 'Moskva',
 'Beijing',
 'São Paulo',
 'Tianjin',
 'Guangzhou',
 'Delhi',
 'Shenzhen']

In [7]:
# Estimate the worldwide head count of every ethnic group.
#
# For each country the most recent <population year="..."> value is taken and
# split among its <ethnicgroup percentage="..."> children; the shares are then
# summed per group name across all countries.  Countries without a usable
# population figure contribute nothing.
ethnicPopulationDict = {}
for country in document.iterfind('country'):
    # Track the newest estimate locally instead of relying on a year variable
    # left behind by another cell.
    latestYear = 0
    populationValue = None
    for population in country.iterfind('population'):
        year = int(population.attrib['year'])
        if year > latestYear:
            latestYear = year
            populationValue = int(population.text)

    if populationValue:
        for ethnicgroup in country.iterfind('ethnicgroup'):
            # 'percentage' is on a 0-100 scale, so convert it to a fraction
            # before applying it to the country's population.
            share = float(ethnicgroup.attrib['percentage']) / 100.0
            groupName = ethnicgroup.text
            ethnicPopulationDict[groupName] = (
                ethnicPopulationDict.get(groupName, 0) + populationValue * share
            )
                      

In [8]:
# Rank (group, estimated population) pairs, largest first, and show the top ten names.
ethnicPopulationTuples = sorted(
    ethnicPopulationDict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
[group for group, _total in ethnicPopulationTuples[:10]]

['Han Chinese',
 'Indo-Aryan',
 'European',
 'African',
 'Dravidian',
 'Mestizo',
 'Bengali',
 'Russian',
 'Japanese',
 'Malay']

In [9]:
# Collect (name, country codes, length) for every river that records a length,
# then list name and country codes of the ten longest.
riverTuples = []
for river in document.iterfind('river'):
    lengthNode = river.find('length')
    if lengthNode is None:
        continue  # no length recorded for this river
    riverTuples.append(
        (river.find('name').text, river.attrib["country"], float(lengthNode.text))
    )
riverTuples.sort(key=lambda entry: entry[2], reverse=True)
[(name, countries) for name, countries, _length in riverTuples[:10]]

[('Amazonas', 'CO BR PE'),
 ('Jangtse', 'CN'),
 ('Hwangho', 'CN'),
 ('Lena', 'R'),
 ('Zaire', 'RCB ZRE'),
 ('Mekong', 'CN LAO THA K VN'),
 ('Irtysch', 'R KAZ CN'),
 ('Niger', 'RMM RN WAN RG'),
 ('Missouri', 'USA'),
 ('Jenissej', 'R')]

In [10]:
# Collect (name, country codes, area) for every lake that records an area,
# then list name and country codes of the ten largest.
lakeTuples = []
for lake in document.iterfind('lake'):
    areaNode = lake.find('area')
    if areaNode is None:
        continue  # no area recorded for this lake
    lakeTuples.append(
        (lake.find('name').text, lake.attrib["country"], float(areaNode.text))
    )
lakeTuples.sort(key=lambda entry: entry[2], reverse=True)
[(name, countries) for name, countries, _area in lakeTuples[:10]]

[('Caspian Sea', 'R AZ KAZ IR TM'),
 ('Lake Superior', 'CDN USA'),
 ('Lake Victoria', 'EAT EAK EAU'),
 ('Lake Huron', 'CDN USA'),
 ('Lake Michigan', 'USA'),
 ('Dead Sea', 'IL JOR WEST'),
 ('Lake Tanganjika', 'ZRE Z BI EAT'),
 ('Great Bear Lake', 'CDN'),
 ('Ozero Baikal', 'R'),
 ('Lake Malawi', 'MW MOC EAT')]

In [11]:
# Collect (name, country code, elevation) for every airport with a usable
# elevation, then list name and country code of the ten highest.
airportTuples = []
for airport in document.iterfind('airport'):
    elevationNode = airport.find('elevation')
    # An <elevation> element may be present but empty, so check its text too.
    if elevationNode is None or elevationNode.text is None:
        continue
    airportTuples.append(
        (airport.find('name').text, airport.attrib["country"], float(elevationNode.text))
    )
airportTuples.sort(key=lambda entry: entry[2], reverse=True)
[(name, countryCode) for name, countryCode, _elevation in airportTuples[:10]]

[('El Alto Intl', 'BOL'),
 ('Lhasa-Gonggar', 'CN'),
 ('Yushu Batang', 'CN'),
 ('Juliaca', 'PE'),
 ('Teniente Alejandro Velasco Astete Intl', 'PE'),
 ('Juana Azurduy De Padilla', 'BOL'),
 ('Mariscal Sucre Intl', 'EC'),
 ('Coronel Fap Alfredo Mendivil Duarte', 'PE'),
 ('Mayor General FAP Armando Revoredo Iglesias Airport', 'PE'),
 ('Licenciado Adolfo Lopez Mateos Intl', 'MEX')]