# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
# Load the smaller sample file that the two examples below walk through.
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [3]:
# Print the <name> of every element directly under the document root
# (one per country in the sample file).
for country_node in document_tree.getroot():
    name_node = country_node.find('name')
    print(name_node.text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    # NOTE: the comma sits inside the parentheses, so Python 2 prints a
    # one-element tuple here (e.g. ('* Albania:',)); left untouched so the
    # cell keeps producing the same output.
    print ('* ' + element.find('name').text + ':',)
    # Element.iter() replaces getiterator(), which is deprecated since
    # Python 2.7 and removed in Python 3.9. It visits every <city> below
    # the country, not only its direct children.
    city_names = [city.find('name').text for city in element.iter('city')]
    # join() yields the same comma-separated line as the former
    # "append ', ' then strip the last two characters" approach.
    print (', '.join(city_names))

('* Albania:',)
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
('* Greece:',)
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
('* Macedonia:',)
Skopje, Kumanovo
('* Serbia:',)
Beograd, Novi Sad, Niš
('* Montenegro:',)
Podgorica
('* Kosovo:',)
Prishtine
('* Andorra:',)
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

###### Question 1 - Lowest Mortality Rates

In [5]:
import pandas as pd

In [6]:
# Load the full data set that the exercise questions are answered from.
document = ET.parse( './data/mondial_database.xml' )

In [7]:
# Build {country name: infant mortality rate} from the <country> elements
# directly under the root. Countries without an <infant_mortality> child
# are left out of the mapping.
root = document.getroot()
tempDict1 = {}

for country_node in root.iterfind('country'):
    mortality_node = country_node.find('infant_mortality')
    if mortality_node is None:
        continue
    tempDict1[country_node.find('name').text] = float(mortality_node.text)

In [8]:
# Rank the countries by infant mortality, lowest first, and show the first ten.
seriesInfMort = pd.Series(tempDict1)
seriesInfMort.sort_values(ascending=True).iloc[:10]

Monaco            1.81
Japan             2.13
Norway            2.48
Bermuda           2.48
Singapore         2.53
Sweden            2.60
Czech Republic    2.63
Hong Kong         2.73
Macao             3.13
Iceland           3.15
dtype: float64

###### Question 2 - Largest Cities

In [9]:
# Build {city name: population} using, for each <city> anywhere in the
# tree, the <population> entry with the most recent 'year' attribute.
# A city with no <population> child is recorded with 0.
# NOTE(review): the mapping is keyed by name only, so cities that share a
# name overwrite each other (the last one visited wins).
root = document.getroot()
tempDict2 = {}

for city_node in root.iter('city'):
    latest_year = 0
    latest_population = 0
    for population_node in city_node.iterfind('population'):
        census_year = int(population_node.attrib['year'])
        if census_year > latest_year:
            latest_year = census_year
            latest_population = int(population_node.text)
    tempDict2[city_node.find('name').text] = latest_population

In [10]:
# Rank the cities by population, largest first, and show the first ten.
seriesCityPops = pd.Series(tempDict2)
seriesCityPops.sort_values(ascending=False).iloc[:10]

Shanghai     22315474
Istanbul     13710512
Mumbai       12442373
Moskva       11979529
Beijing      11716620
São Paulo    11152344
Tianjin      11090314
Guangzhou    11071424
Delhi        11034555
Shenzhen     10358381
dtype: int64

###### Question 3 - Most Populous Ethnicities

In [11]:
# Estimate the head count of every ethnic group: for each country take the
# most recent population figure and split it by the 'percentage' attribute
# of its <ethnicgroup> children, summing the shares per group name.
root = document.getroot()
tempDict3 = {}  # country name -> latest population figure
tempDict4 = {}  # ethnic group name -> estimated population over all countries

for element in root.iter('country'):
    country = element.find('name')
    maxYear = 0
    maxYearPop = 0
    for subelement in element.iterfind('population'):
        #take the population figure from the latest year available
        year = int(subelement.attrib['year'])
        if maxYear < year:
            maxYear = year
            maxYearPop = int(subelement.text)
    tempDict3[country.text] = maxYearPop
    for subelement in element.iterfind('ethnicgroup'):
        ethnicity = subelement.text
        groupPop = maxYearPop * float(subelement.attrib['percentage']) / 100
        # dict.get() replaces the has_key() branch: has_key() no longer
        # exists in Python 3, and a first occurrence simply starts from 0.
        tempDict4[ethnicity] = tempDict4.get(ethnicity, 0) + groupPop

In [12]:
# Rank the ethnic groups by estimated population, largest first, and show
# the first ten.
seriesEthnicPops = pd.Series(tempDict4)
seriesEthnicPops.sort_values(ascending=False).iloc[:10]

Han Chinese    1.245059e+09
Indo-Aryan     8.718156e+08
European       4.948722e+08
African        3.183251e+08
Dravidian      3.027137e+08
Mestizo        1.577344e+08
Bengali        1.467769e+08
Russian        1.318570e+08
Japanese       1.265342e+08
Malay          1.219936e+08
dtype: float64

In [13]:
# NOTE(review): this cell repeats the ethnic-group computation of the cell
# before it; both dictionaries are rebuilt from scratch.
# For each country take the most recent population figure and split it by
# the 'percentage' attribute of its <ethnicgroup> children, summing the
# shares per group name.
root = document.getroot()
tempDict3 = {}  # country name -> latest population figure
tempDict4 = {}  # ethnic group name -> estimated population over all countries

for element in root.iter('country'):
    country = element.find('name')
    maxYear = 0
    maxYearPop = 0
    for subelement in element.iterfind('population'):
        #take the population figure from the latest year available
        year = int(subelement.attrib['year'])
        if maxYear < year:
            maxYear = year
            maxYearPop = int(subelement.text)
    tempDict3[country.text] = maxYearPop
    for subelement in element.iterfind('ethnicgroup'):
        ethnicity = subelement.text
        groupPop = maxYearPop * float(subelement.attrib['percentage']) / 100
        # dict.get() replaces the has_key() branch: has_key() no longer
        # exists in Python 3, and a first occurrence simply starts from 0.
        tempDict4[ethnicity] = tempDict4.get(ethnicity, 0) + groupPop

In [14]:
# Rank the ethnic groups by estimated population, largest first, and show
# the first ten.
seriesEthnicPops = pd.Series(tempDict4)
seriesEthnicPops.sort_values(ascending=False).iloc[:10]

Han Chinese    1.245059e+09
Indo-Aryan     8.718156e+08
European       4.948722e+08
African        3.183251e+08
Dravidian      3.027137e+08
Mestizo        1.577344e+08
Bengali        1.467769e+08
Russian        1.318570e+08
Japanese       1.265342e+08
Malay          1.219936e+08
dtype: float64

###### Question 4 - Highest Airport, Longest River, and Largest Lake

In [15]:
# Build a lookup from each country's 'car_code' attribute to its name, so
# the country codes stored on airports, rivers and lakes can be translated.
countryDict = {}
for country_node in root.iter('country'):
    country_name = country_node.find('name').text
    countryDict[country_node.attrib['car_code']] = country_name

In [16]:
# Highest Airport
# Scan every <airport> and remember the one with the greatest <elevation>,
# then report its elevation, name and country.
root = document.getroot()

maxAirportElevation = 0
maxAirportElevationName = ""
maxAirportElevationCountry = ""
for element in root.iter('airport'):
    tempElement = element.find('elevation')
    # Skip airports whose <elevation> is missing or has no text.
    if tempElement is None or tempElement.text is None:
        continue
    airportElevation = float(tempElement.text)
    if airportElevation > maxAirportElevation:
        maxAirportElevation = airportElevation
        maxAirportElevationName = element.find('name').text
        maxAirportElevationCountry = element.attrib['country']

# Single-argument print(...) calls produce the same text under Python 2 and
# Python 3; the former print statements are a syntax error in Python 3.
print("The highest airport is " + str(maxAirportElevation) + " meters high.")
print("The highest airport is named " + maxAirportElevationName + ".")
# The 'country' attribute is split on whitespace in case it lists several
# codes; each code is translated through countryDict.
countries = maxAirportElevationCountry.split()
print(" ".join(["The highest airport is located in"] +
               [countryDict[i] for i in countries]))

The highest airport is 4063.0 meters high.
The highest airport is named El Alto Intl.
The highest airport is located in Bolivia


In [17]:
# Longest River
# Scan every <river> and remember the one with the greatest <length>,
# then report its length, name and countries.
root = document.getroot()

maxRiverLength = 0
maxRiverLengthName = ""
maxRiverLengthCountry = ""
for element in root.iter('river'):
    tempElement = element.find('length')
    # Skip rivers whose <length> is missing or has no text; float(None)
    # would raise a TypeError (same guard as the airport search).
    if tempElement is None or tempElement.text is None:
        continue
    riverLength = float(tempElement.text)
    if riverLength > maxRiverLength:
        maxRiverLength = riverLength
        maxRiverLengthName = element.find('name').text
        maxRiverLengthCountry = element.attrib['country']

# Single-argument print(...) calls produce the same text under Python 2 and
# Python 3; the former print statements are a syntax error in Python 3.
print("The longest river is " + str(maxRiverLength) + " kilometers long.")
print("The name of the longest river is named " + maxRiverLengthName + ".")
# The 'country' attribute holds whitespace-separated country codes; each
# one is translated through countryDict.
countries = maxRiverLengthCountry.split()
print(" ".join(["The longest river is located in the following countries:"] +
               [countryDict[i] for i in countries]))

The longest river is 6448.0 kilometers long.
The name of the longest river is named Amazonas.
The longest river is located in the following countries: Colombia Brazil Peru


In [18]:
# Largest Lake
# Scan every <lake> and remember the one with the greatest <area>,
# then report its size, name and countries.
root = document.getroot()

maxLakeSize = 0
maxLakeSizeName = ""
maxLakeSizeCountry = ""
for element in root.iter('lake'):
    tempElement = element.find('area')
    # Skip lakes whose <area> is missing or has no text; float(None)
    # would raise a TypeError (same guard as the airport search).
    if tempElement is None or tempElement.text is None:
        continue
    lakeSize = float(tempElement.text)
    if lakeSize > maxLakeSize:
        maxLakeSize = lakeSize
        maxLakeSizeName = element.find('name').text
        maxLakeSizeCountry = element.attrib['country']

# Single-argument print(...) calls produce the same text under Python 2 and
# Python 3; the former print statements are a syntax error in Python 3.
print("The largest lake size is " + str(maxLakeSize) + " square kilometers.")
print("The largest lake name is " + maxLakeSizeName + ".")
# The 'country' attribute holds whitespace-separated country codes; each
# one is translated through countryDict.
countries = maxLakeSizeCountry.split()
print(" ".join(["The largest lake is located in the following countries:"] +
               [countryDict[i] for i in countries]))

The largest lake size is 386400.0 square kilometers.
The largest lake name is Caspian Sea.
The largest lake is located in the following countries: Russia Azerbaijan Kazakhstan Iran Turkmenistan
