# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [3]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

### 1-10 countries with the lowest infant mortality rates

In [5]:
document = ET.parse( './data/mondial_database.xml' )

In [6]:
infants_rates = {}
for country in document.findall('country'):
    country_name = country.find('name').text
    country_infant_mortality = country.find('infant_mortality')
    if country_infant_mortality is not None :
        infants_rates[country_name] = float(country_infant_mortality.text)
infants_rates

{'Afghanistan': 117.23,
 'Albania': 13.19,
 'Algeria': 21.76,
 'American Samoa': 8.92,
 'Andorra': 3.69,
 'Angola': 79.99,
 'Anguilla': 3.4,
 'Antigua and Barbuda': 13.29,
 'Argentina': 9.96,
 'Armenia': 13.97,
 'Aruba': 11.74,
 'Australia': 4.43,
 'Austria': 4.16,
 'Azerbaijan': 26.67,
 'Bahamas': 12.5,
 'Bahrain': 9.68,
 'Bangladesh': 45.67,
 'Barbados': 10.93,
 'Belarus': 3.64,
 'Belgium': 4.18,
 'Belize': 20.31,
 'Benin': 57.09,
 'Bermuda': 2.48,
 'Bhutan': 37.89,
 'Bolivia': 38.61,
 'Bosnia and Herzegovina': 5.84,
 'Botswana': 9.38,
 'Brazil': 19.21,
 'British Virgin Islands': 13.45,
 'Brunei': 10.48,
 'Bulgaria': 15.08,
 'Burkina Faso': 76.8,
 'Burundi': 63.44,
 'Cambodia': 51.36,
 'Cameroon': 55.1,
 'Canada': 4.71,
 'Cape Verde': 24.28,
 'Cayman Islands': 6.21,
 'Central African Republic': 92.86,
 'Chad': 90.3,
 'Chile': 7.02,
 'China': 14.79,
 'Colombia': 15.02,
 'Comoros': 65.31,
 'Congo': 59.34,
 'Cook Islands': 14.33,
 'Costa Rica': 8.7,
 'Cote dIvoire': 60.16,
 'Croatia': 5

In [7]:
sorted_infants_rates = sorted(infants_rates.items(), key=lambda x: x[1])
sorted_infants_rates[0:10]

[('Monaco', 1.81),
 ('Japan', 2.13),
 ('Bermuda', 2.48),
 ('Norway', 2.48),
 ('Singapore', 2.53),
 ('Sweden', 2.6),
 ('Czech Republic', 2.63),
 ('Hong Kong', 2.73),
 ('Macao', 3.13),
 ('Iceland', 3.15)]

### 2- 10 cities with the largest population

I will assume that the population of any city = the latest census of this city

In [8]:
city_population = {}
for country in document.findall('country'):
    for city in country.iter('city'):
        city_name = city.find('name').text
        latest_population_year = 0
        for population in city.iter('population'):
            population_year = int(population.get('year'))
            if(population_year > latest_population_year):
                latest_population_year = population_year
                city_pop = float(population.text)
        city_population[city_name] = city_pop
city_population

{'Akure': 239124.0,
 u'\xc1guas Lindas de Goi\xe1s': 159138.0,
 'Yibin': 241019.0,
 'Szczecin': 409211.0,
 'Lianyungang': 354139.0,
 'Machakos': 150041.0,
 'Toledo': 287206.0,
 'Zhaodong': 179976.0,
 'Zinder': 322935.0,
 'Fayetteville': 200574.0,
 'Pomona': 149058.0,
 'Usak': 187886.0,
 'Sacaba': 169494.0,
 'Liverpool': 552267.0,
 'High Wycombe': 120256.0,
 'Livingstone': 1269848.0,
 'Novi Sad': 335701.0,
 'Mitu': 131545.0,
 'Choluteca': 120791.0,
 'Panshan': 362773.0,
 'Mito': 1020241.0,
 u'San Pedro Garza Garc\xeda': 122627.0,
 'Caen': 108793.0,
 u'Maring\xe1': 350653.0,
 'La Chorrera': 161470.0,
 'Terrassa': 214406.0,
 'Oshogbo': 250951.0,
 'Exeter': 113507.0,
 'Jeju': 399416.0,
 'Talara': 87622.0,
 'Ar Ramadi': 192556.0,
 'Saitama': 1223954.0,
 'San Fernando del Valle de Catamarca': 140556.0,
 'Sheikhupura': 280263.0,
 'Tonsberg': 8984.0,
 'Tougourt': 143270.0,
 'Windsor': 210891.0,
 'Aijal': 1727692.0,
 'Tanjungpinang': 174758.0,
 'Strasbourg': 272222.0,
 'Indaiatuba': 199592.0,
 

In [9]:
sorted_city_population = sorted(city_population.items(), key=lambda x: x[1], reverse=True)
sorted_city_population[0:10]

[('Shanghai', 22315474.0),
 ('Istanbul', 13710512.0),
 ('Mumbai', 12442373.0),
 ('Moskva', 11979529.0),
 ('Beijing', 11716620.0),
 (u'S\xe3o Paulo', 11152344.0),
 ('Tianjin', 11090314.0),
 ('Guangzhou', 11071424.0),
 ('Delhi', 11034555.0),
 ('Shenzhen', 10358381.0)]

### 3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)


I will assume that the population of any country = the latest census of this country

In [10]:
ethinc_population = {}
for country in document.findall('country'):
    latest_population_year = 0
    for country_population in country.findall('population'):
        population_year = int(country_population.get('year'))
        if(population_year > latest_population_year):
            latest_population_year = population_year
            latest_country_population = float(population.text)
    for country_ethinc_group in country.findall('ethnicgroup'):
        ethinc_group_name = country_ethinc_group.text
        ethinc_group_precentage = float(country_ethinc_group.get('percentage'))
        if ethinc_group_name in ethinc_population.keys():
            ethinc_population[ethinc_group_name] = ethinc_population[ethinc_group_name] +\
                ethinc_group_precentage * latest_country_population
        else :
            ethinc_population[ethinc_group_name] = ethinc_group_precentage * latest_country_population
ethinc_population

{'Acholi': 99880.0,
 'Afar': 916399.0,
 'African': 46657693.5,
 'African descent': 1248500.0,
 'African-white-Indian': 2247300.0,
 'Afro-Asian': 249700.0,
 'Afro-Chinese': 29964.0,
 'Afro-East Indian': 74910.0,
 'Afro-European': 377047.0,
 'Albanian': 5423484.0,
 'Alemannic': 2372150.0,
 'Americo-Liberians': 124850.0,
 'Amerindian': 4553778.9,
 'Amhara': 671693.0,
 'Andorran': 824010.0,
 'Arab': 15938351.0,
 'Arab Iranian': 574310.0,
 'Arab-Berber': 7393617.0,
 'Arabic': 34458.6,
 'Armenian': 2609365.0,
 'Asian': 2307977.1,
 'Assyrian': 124850.0,
 'Austrian': 2274767.0,
 'Aymara': 624250.0,
 'Azerbaijani': 599280.0,
 'Azeri': 2424587.0,
 'Baganda': 424490.0,
 'Bagisu': 124850.0,
 'Bahraini': 1573110.0,
 'Bakongo': 324610.0,
 'Baloch': 49940.0,
 'Banda': 674190.0,
 'Bantu': 2372150.0,
 'Baoule': 574310.0,
 'Bashkir': 29964.0,
 'Basogo': 199760.0,
 'Basques Bretons': 2497000.0,
 'Batobo': 74910.0,
 'Batswana': 2372150.0,
 'Baya': 848980.0,
 'Beja': 149820.0,
 'Belorussian': 2189869.0,
 '

In [11]:
sorted_ethinc_population = sorted(ethinc_population.items(), key=lambda x: x[1], reverse=True)
sorted_ethinc_population[0:10]

[('African', 46657693.5),
 ('European', 24241375.4),
 ('Mestizo', 21741379.0),
 ('Polynesian', 16642505.0),
 ('Arab', 15938351.0),
 ('Chinese', 11808313.0),
 ('Arab-Berber', 7393617.0),
 ('Black', 6379835.0),
 ('Malay', 6050231.0),
 ('Melanesian', 5770567.0)]

### 4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [12]:
longest_river_length = 0
for river in document.findall('river'):
    if(river.find('length') is not None):
        river_length = float(river.find('length').text)
    if(river_length > longest_river_length):
        longest_river_length = river_length
        longest_river_name = river.find('name').text
        if(river.find('source') is not None):
            longest_river_country_code = river.find('source').get('country')
        else:
            longest_river_country_code = river.get('country')
longest_river_country = document.find(".//country[@car_code='"+longest_river_country_code+"']")
longest_river_country_name = longest_river_country.find('name').text
print longest_river_name,longest_river_length,longest_river_country_name

Amazonas 6448.0 Peru


In [13]:
largest_lake_area = 0
for lake in document.findall('lake'):
    if(lake.find('area') is not None):
        lake_area = float(lake.find('area').text)
    if(lake_area > largest_lake_area):
        largest_lake_area = lake_area
        largest_lake_name = lake.find('name').text
        if(lake.find('located') is not None):
            largest_lake_country_code = lake.find('located').get('country')
        else:
            largest_lake_country_code = lake.get('country')
largest_lake_country = document.find(".//country[@car_code='"+largest_lake_country_code+"']")
largest_lake_country_name = largest_lake_country.find('name').text
print largest_lake_name,largest_lake_area,largest_lake_country_name

Caspian Sea 386400.0 Russia


In [14]:
highest_air_port_elevation = 0
for airport in document.findall('airport'):
    if(airport.find('elevation').text is not None):
        air_port_elevation = float(airport.find('elevation').text)
    if(air_port_elevation > highest_air_port_elevation):
        highest_air_port_elevation = air_port_elevation
        highest_air_port_name = airport.find('name').text
        highest_air_port_country_code = airport.get('country')
highest_air_port_country = document.find(".//country[@car_code='"+highest_air_port_country_code+"']")
highest_air_port_country_name = highest_air_port_country.find('name').text
print highest_air_port_name,highest_air_port_elevation,highest_air_port_country_name

El Alto Intl 4063.0 Bolivia
