# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [3]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [5]:
document = ET.parse( './data/mondial_database.xml' )

In [6]:
import pandas as pd

In [7]:
### 1. 10 countries with the lowest infant mortality rates

# define data frame to populate with extracted values from xml file
df1 = pd.DataFrame(columns = ['country', 'infantMortalityRate'])

# loop over countries to get name & infant mortality rate
i = 0
for country in document.findall('country'):
    countryName = country.find('name').text
    if country.find('infant_mortality') is None:
        infantMortalityRate = float('NaN')
    else:
        infantMortalityRate = float(country.find('infant_mortality').text)
    df1.loc[i] = [countryName, infantMortalityRate]
    i += 1

# get top 10 countries with highest infant mortality rates
df1.sort_values('infantMortalityRate', ascending=False)[:10]

Unnamed: 0,country,infantMortalityRate
194,Western Sahara,145.82
54,Afghanistan,117.23
189,Mali,104.34
226,Somalia,100.14
213,Central African Republic,92.86
230,Guinea-Bissau,90.92
214,Chad,90.3
192,Niger,86.27
195,Angola,79.99
201,Burkina Faso,76.8


In [8]:
# 2. 10 cities with the largest population

# define data frame to populate with extracted values from xml file
df2 = pd.DataFrame(columns = ['city', 'population'])

# loop over cities to get name & latest recorded population
i = 0
for country in document.findall('country'):
    for city in country.findall('city'):
        cityName = city.find('name').text
        year = 0
        cityPopulation = 0
        for population in city.findall('population'):
            if 'year' in population.attrib and int(population.attrib['year']) > year:
                cityPopulation = int(population.text)
        df2.loc[i] = [cityName, cityPopulation]
        i += 1

# get top 10 cities with largest populations
df2.sort_values('population', ascending=False)[:10]

Unnamed: 0,city,population
176,Seoul,9708483.0
164,Al Qahirah,8471859.0
80,Bangkok,7506700.0
128,Hong Kong,7055071.0
92,Ho Chi Minh,5968384.0
212,Singapore,5076700.0
163,Al Iskandariyah,4123869.0
216,New Taipei,3939305.0
177,Busan,3403135.0
107,Pyongyang,3255288.0


In [9]:
# 3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

# define dictionary to hold aggregated population of ethnic groups
ethnicPopulations = {}

# loop over countries to get ethnic groups & their populations
i = 0
for country in document.findall('country'):
    # get country's population
    year = 0
    pop = 0
    for population in country.findall('population'):
        if 'year' in population.attrib and int(population.attrib['year']) > year:
            countryPopulation = int(population.text)

    # calculate ethnic group populations for current country & update dictionary in the process
    for ethnicgroup in country.findall('ethnicgroup'):
        eg = ethnicgroup.text
        egPercentage = float(ethnicgroup.attrib['percentage'])
        egPopulation = egPercentage * countryPopulation
        if ethnicPopulations.has_key(eg):
            ethnicPopulations[eg] = ethnicPopulations[eg] + egPopulation
        else:
            ethnicPopulations[eg] = egPopulation

# get top 10 ethnic groups with largest overall populations
df3 = pd.DataFrame.from_dict(ethnicPopulations, orient = 'index')
df3.columns = ['population']
df3.sort_values('population', ascending=False)[:10]

Unnamed: 0,population
Han Chinese,124505900000.0
Indo-Aryan,87181560000.0
European,49487220000.0
African,31832510000.0
Dravidian,30271370000.0
Mestizo,15773440000.0
Bengali,14677690000.0
Russian,13185700000.0
Japanese,12653420000.0
Malay,12199360000.0


In [10]:
# 4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

# get countries and their codes
i = 0
countryCodes = pd.DataFrame(columns = ['country', 'code'])
for country in document.findall('country'):
    countryCode = country.attrib['car_code']
    countryCodes.loc[i] = [country.find('name').text, countryCode]
    i += 1
countryCodes = countryCodes.set_index('code')


# LONGEST RIVER
longestRiverLength = 0
longestRiverName = ""
longestRiverCountryCode = ""

for river in document.findall('river'):
    if river.find('length') is not None:
        riverLength = float(river.find('length').text)
        if longestRiverLength < riverLength:
            longestRiverLength = riverLength
            longestRiverName = river.find('name').text
            longestRiverCountryCode = river.attrib['country']

riverCountries = countryCodes.loc[str.split(longestRiverCountryCode)]
print "Longest River:", longestRiverName
print "  Length:", longestRiverLength
print "  Country:", list(riverCountries['country'])
print ""


# LARGEST LAKE
largestLakeArea = 0
largestLakeName = ""
largestLakeCountryCode = ""

for lake in document.findall('lake'):
    if lake.find('area') is not None:
        lakeArea = float(lake.find('area').text)
        if largestLakeArea < lakeArea:
            largestLakeArea = lakeArea
            largestLakeName = lake.find('name').text
            largestLakeCountryCode = lake.attrib['country']

lakeCountries = countryCodes.loc[str.split(largestLakeCountryCode)]
print "Largest Lake:", largestLakeName
print "  Area:", largestLakeArea
print "  Country:", list(lakeCountries['country'])
print ""


# HIGHEST-ELEVATED AIRPORT
highestAirportElevation = 0
highestAirportName = ""
highestAirportCountryCode = ""

for airport in document.findall('airport'):
    if airport.find('elevation') is not None and airport.find('elevation').text is not None:
        airportElevation = float(airport.find('elevation').text)
        if highestAirportElevation < airportElevation:
            highestAirportElevation = airportElevation
            highestAirportName = airport.find('name').text
            highestAirportCountryCode = airport.attrib['country']

airportCountries = countryCodes.loc[str.split(highestAirportCountryCode)]
print "Highest Airport:", highestAirportName
print "  Elevation:", highestAirportElevation
print "  Country:", list(airportCountries['country'])
print ""

Longest River: Amazonas
  Length: 6448.0
  Country: ['Colombia', 'Brazil', 'Peru']

Largest Lake: Caspian Sea
  Area: 386400.0
  Country: ['Russia', 'Azerbaijan', 'Kazakhstan', 'Iran', 'Turkmenistan']

Highest Airport: El Alto Intl
  Elevation: 4063.0
  Country: ['Bolivia']

