# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
# Parse the sample XML file used by the example cells below into an ElementTree
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [3]:
# List the name of every country (each direct child of the root element)
root_element = document_tree.getroot()
for country_node in root_element:
    print(country_node.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# Print each country followed by a comma-separated list of its cities
for country in document_tree.iterfind('country'):
    # Element.iter() is the supported replacement for getiterator(), which is
    # deprecated and no longer exists in Python 3.9+. It walks the whole
    # subtree, so every <city> below the country is visited.
    city_names = [city.find('name').text for city in country.iter('city')]
    # One single-argument print(...) produces the same line under the Python 2
    # print statement and the Python 3 print function.
    print('* ' + country.find('name').text + ': ' + ', '.join(city_names))

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [5]:
import pandas as pd

In [6]:
# Parse the Mondial database file that the exercise cells below read from
document = ET.parse( './data/mondial_database.xml' )

In [7]:
# Build two lookup tables in a single pass over the <country> elements:
#   country_code      maps car_code -> country name, for every country
#   country_mortality maps country name -> infant mortality rate, only for
#                     countries that have an <infant_mortality> child
# country_code is filled for all countries (not just those with a mortality
# figure) so that later code -> name lookups do not fail for the others.
country_mortality = {}
country_code = {}
for country in document.getroot().iterfind('country'):
    c_code = country.attrib["car_code"]
    c_name = country.find('name').text
    country_code[c_code] = c_name
    mortality = country.find('infant_mortality')
    if mortality is not None:
        country_mortality[c_name] = float(mortality.text)


In [8]:
# Turn the mapping into a one-column DataFrame (column label 0) indexed by
# country name, order it by ascending rate and keep the ten lowest
mortality_table = pd.DataFrame.from_dict(country_mortality, orient='index')
country_mortality_f = mortality_table.sort_values(by=[0], ascending=True).head(10)
country_mortality_f

Unnamed: 0,0
Monaco,1.81
Japan,2.13
Bermuda,2.48
Norway,2.48
Singapore,2.53
Sweden,2.6
Czech Republic,2.63
Hong Kong,2.73
Macao,3.13
Iceland,3.15


In [9]:
# Build a dictionary mapping city name -> {'population': ..., 'year': ...}
# from each city's most recent population figure.
# Cities without any <population> element keep the placeholder values 0 / 0.
city_size = {}
for country in document.getroot().iterfind('country'):
    # './/city' matches <city> elements at any depth below the country
    for city in country.findall('.//city'):
        city_name = city.find('name').text
        entry = {'population': 0, 'year': 0}
        estimates = city.findall('population')
        if estimates:
            # Choose the newest figure by its year attribute rather than
            # trusting document order. sorted() is stable, so when several
            # figures share a year the last one listed still wins.
            latest = sorted(estimates, key=lambda p: int(p.attrib["year"]))[-1]
            entry = {'population': int(latest.text), 'year': latest.attrib["year"]}
        # City names are not unique. When a name is seen again, keep the more
        # populous entry instead of letting the later city silently replace
        # the earlier one (on a tie the later city wins).
        known = city_size.get(city_name)
        if known is None or entry['population'] >= known['population']:
            city_size[city_name] = entry

In [10]:
# Load the per-city records into a DataFrame indexed by city name, rank it by
# population (largest first) and show the top ten
city_table = pd.DataFrame.from_dict(city_size, orient='index')
city_size_f = city_table.sort_values(by=["population"], ascending=False).head(10)
city_size_f

Unnamed: 0,year,population
Shanghai,2010,22315474
Istanbul,2012,13710512
Mumbai,2011,12442373
Moskva,2013,11979529
Beijing,2010,11716620
São Paulo,2010,11152344
Tianjin,2010,11090314
Guangzhou,2010,11071424
Delhi,2011,11034555
Shenzhen,2010,10358381


In [22]:
# Build a dictionary mapping country name ->
#   {'Total_Population': n, <ethnic group name>: estimated head count, ...}
# Head counts are the group's percentage applied to the country's population.
c_ethnic = {}
for country in document.getroot().iterfind('country'):
    c_name = country.find('name').text
    # population[last()] selects the final <population> child of the country,
    # which is used here as the latest figure
    latest_population = country.find("population[last()]")
    if latest_population is None:
        # Without a population figure the percentages cannot be converted
        continue
    popul = float(latest_population.text)
    groups = {"Total_Population": popul}
    # A plain tag name selects direct children. The earlier '.ethnicgroup'
    # spelling is not a documented path form.
    for group in country.findall("ethnicgroup"):
        groups[group.text] = float(group.attrib["percentage"]) * popul / 100.0
    c_ethnic[c_name] = groups
c_ethnic

{'Afghanistan': {'Hazara': 4944389.0,
  'Pashtun': 9888778.0,
  'Tajik': 6505775.0,
  'Total_Population': 26023100.0,
  'Uzbek': 1561386.0},
 'Albania': {'Albanian': 2660131.1,
  'Greek': 84004.14,
  'Total_Population': 2800138.0},
 'Algeria': {'Arab-Berber': 36692191.8,
  'European': 370628.2,
  'Total_Population': 37062820.0},
 'American Samoa': {'Caucasian': 1110.38,
  'Samoan': 49411.91,
  'Tongan': 2220.76,
  'Total_Population': 55519.0},
 'Andorra': {'African': 3905.75,
  'Andorran': 25777.95,
  'French': 1562.3,
  'Portuguese': 8592.65,
  'Spanish': 33589.45,
  'Total_Population': 78115.0},
 'Angola': {'Bakongo': 3169829.13,
  'European': 243833.01,
  'Kimbundu': 6095825.25,
  'Ovimbundu': 9021821.37,
  'Total_Population': 24383301.0},
 'Anguilla': {'Black': 11746.337,
  'Mulatto': 599.702,
  'Total_Population': 13037.0,
  'White': 482.369},
 'Antigua and Barbuda': {'Total_Population': 81799.0},
 'Argentina': {'European': 41389415.0, 'Total_Population': 42669500.0},
 'Armenia': 

In [12]:
# Convert the dict to a DataFrame (one row per country, one column per key,
# missing combinations filled with 0), append a 'Total' row holding the column
# sums, then rank the ethnic groups by that total and show the top ten
c_ethnic_f = pd.DataFrame.from_dict(c_ethnic, orient='index').fillna(0)
c_ethnic_f.loc['Total'] = c_ethnic_f.sum()
# axis is passed by keyword: the positional form drop(label, 1) is deprecated
# and is not accepted by pandas 2.0 and later
top_c_ethnic = c_ethnic_f.drop('Total_Population', axis=1).loc['Total'].sort_values(ascending=False)
top_c_ethnic.head(10)

Han Chinese    1.245059e+09
Indo-Aryan     8.718156e+08
European       4.948722e+08
African        3.183251e+08
Dravidian      3.027137e+08
Mestizo        1.577344e+08
Bengali        1.467769e+08
Russian        1.318570e+08
Japanese       1.265342e+08
Malay          1.219936e+08
Name: Total, dtype: float64

In [13]:
# Search the data set for the longest river.
# The country reported is the one named on the river's <source> element.
river_size = 0.0
river_name = None
river_country = None
for river in document.getroot().iterfind('river[length]'):
    length_text = river.find('length').text
    if length_text is None:
        # An empty <length/> element carries no value to compare
        continue
    length = float(length_text)
    if length > river_size:
        river_size = length
        river_name = river.find('name').text
        source_code = river.find('source').attrib["country"]
        # Fall back to the raw code if it is not present in country_code
        river_country = country_code.get(source_code, source_code)
if river_name is None:
    print("No river with a positive length was found")
else:
    print("Longest river: " + river_name)
    print("Length: " + str(river_size))
    print("Country of source: " + river_country)

Longest river: Amazonas
Length: 6448.0
Country of source: Peru


In [14]:
# Search the data set for the lake with the largest area.
# The countries reported are all the codes listed in the lake's `country`
# attribute, translated to names.
lake_size = 0.0
lake_name = None
lake_country = []
for lake in document.getroot().iterfind('lake[area]'):
    area_text = lake.find('area').text
    if area_text is None:
        # An empty <area/> element carries no value to compare
        continue
    area = float(area_text)
    if area > lake_size:
        lake_size = area
        lake_name = lake.find('name').text
        # Fall back to the raw code if it is not present in country_code
        lake_country = [country_code.get(code, code) for code in lake.attrib["country"].split()]
if lake_name is None:
    print("No lake with a positive area was found")
else:
    print("Largest laker: " + lake_name)
    print("Size: " + str(lake_size))
    print("Countries passed: " + str(lake_country))

Largest laker: Caspian Sea
Size: 386400.0
Countries passed: ['Russia', 'Azerbaijan', 'Kazakhstan', 'Iran', 'Turkmenistan']


In [15]:
# Search the data set for the airport at the highest elevation.
# The countries reported are the codes in the airport's `country` attribute,
# translated to names.
airport_height = 0.0
airport_name = None
airport_country = []
for airport in document.getroot().iterfind('airport[elevation]'):
    elevation_text = airport.find('elevation').text
    if elevation_text is None:
        # Some <elevation> elements are empty; they cannot be compared
        continue
    elevation = float(elevation_text)
    if elevation > airport_height:
        airport_height = elevation
        airport_name = airport.find('name').text
        # Fall back to the raw code if it is not present in country_code
        airport_country = [country_code.get(code, code) for code in airport.attrib["country"].split()]

if airport_name is None:
    print("No airport with a positive elevation was found")
else:
    print("Hightest elevation airport: " + airport_name)
    print("Elevation: " + str(airport_height))
    print("Country: " + str(airport_country))

Hightest elevation airport: El Alto Intl
Elevation: 4063.0
Country: ['Bolivia']


In [16]:
# Reusable search, generalised from the airport solution
def search_xml(s_item, s_feat, root=None, codes=None):
    """Print the `s_item` element whose `s_feat` child holds the largest number.

    Parameters
    ----------
    s_item : str
        Tag of the elements to search among the root's direct children,
        e.g. "airport".
    s_feat : str
        Tag of the numeric child element to maximise, e.g. "elevation".
        Elements whose `s_feat` child has no text are skipped.
    root : Element, optional
        Element to search under. Defaults to `document.getroot()`.
    codes : dict, optional
        Mapping from country code to country name. Defaults to the global
        `country_code`. Codes missing from the mapping are printed as-is.

    Returns
    -------
    None
        The result is printed: the search item and criteria, then either the
        name, value and countries of the best match, or "Item found: none"
        when no element has a usable value.
    """
    if root is None:
        root = document.getroot()
    if codes is None:
        codes = country_code

    best_node = None
    best_value = None
    for node in root.iterfind(s_item + '[' + s_feat + ']'):
        raw_value = node.find(s_feat).text
        if raw_value is None:
            continue
        value = float(raw_value)
        # Starting from None (not 0.0) lets a non-positive maximum be found
        if best_value is None or value > best_value:
            best_value = value
            best_node = node

    print("Seach item: " + s_item.upper())
    print("Seach criteria: " + s_feat.upper())
    if best_node is None:
        print("Item found: none")
        return
    # The `country` attribute may list several whitespace-separated codes
    best_countries = [codes.get(code, code) for code in best_node.attrib["country"].split()]
    print("Item found: " + best_node.find('name').text)
    print("Criteria: " + str(best_value))
    print("Country: " + str(best_countries))

In [17]:
# Airport with the largest <elevation> value, found with the generic helper
search_xml("airport","elevation")

Seach item: AIRPORT
Seach criteria: ELEVATION
Item found: El Alto Intl
Criteria: 4063.0
Country: ['Bolivia']


In [18]:
# Lake with the largest <area> value, found with the generic helper
search_xml("lake","area")

Seach item: LAKE
Seach criteria: AREA
Item found: Caspian Sea
Criteria: 386400.0
Country: ['Russia', 'Azerbaijan', 'Kazakhstan', 'Iran', 'Turkmenistan']


In [19]:
# River with the largest <length> value. search_xml reports every code in the
# river's `country` attribute, whereas the dedicated river cell above reports
# only the country of the <source> element, so the country lists differ.
search_xml("river","length")

Seach item: RIVER
Seach criteria: LENGTH
Item found: Amazonas
Criteria: 6448.0
Country: ['Colombia', 'Brazil', 'Peru']
