# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
# Parse the sample XML file into an ElementTree and keep a handle on its root
# element; the example cells below read `document_tree`.
document_tree = ET.parse( './data/mondial_database_less.xml' )
root = document_tree.getroot()

In [3]:
# print names of all countries: every direct child of the root is looked up
# for its <name> element and that element's text is printed on its own line
country_names = (node.find('name').text for node in document_tree.getroot())
for country_name in country_names:
    print(country_name)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    # The header goes on its own line. The original Python 2 style trailing
    # comma after print(...) did not suppress the newline in Python 3, so the
    # printed layout is unchanged by dropping it.
    print('* ' + element.find('name').text + ':')

    # Element.iter() replaces getiterator(), which was removed in Python 3.9.
    # It walks all descendants, so <city> elements nested deeper than direct
    # children of the country are included as well.
    city_names = [city.find('name').text for city in element.iter('city')]
    print(', '.join(city_names))

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


In [5]:
# Re-bind `root` to the root element of the sample tree (same call as in cell In [2]).
root = document_tree.getroot()

****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

****
## Answer-1
1. 10 countries with the lowest infant mortality rates

In [6]:
import pandas as pd

# Parse the exercise data file; the code below reads it through tree.getroot().
tree = ET.parse( './data/mondial_database.xml' )

# A generator: iterates over the tree and yields one pandas Series (data frame row) per match
def iter_tree(root, child, element):
    """Yield one pandas Series per named ``child`` element found under ``root``.

    Each Series has two entries: ``child`` -> text of the node's <name>
    element, and ``element`` -> text of the node's ``element`` sub-element
    converted to float (None when that sub-element is absent). Nodes without
    a <name> are skipped. The Series ``name`` is a running index starting at
    0 that counts only the rows actually yielded.
    """
    next_index = 0
    for node in root.iterfind(child):
        name_node = node.find('name')
        if name_node is None:
            # no <name>: nothing to label the row with
            continue

        value_node = node.find(element)
        value = float(value_node.text) if value_node is not None else None

        record = pd.Series({child: name_node.text, element: value})
        record.name = next_index
        next_index += 1

        yield record
    
# column names: the tag to iterate over and the numeric child to extract
child = 'country'
element = 'infant_mortality'

# Collect all rows first and build the data frame in a single step.
# DataFrame.append was removed in pandas 2.0, and appending row by row
# copied the whole frame on every iteration. Each Series' name becomes the
# row's index label, as it did with append.
rows = list(iter_tree(tree.getroot(), child, element))
df = pd.DataFrame(rows, columns=[child, element])

# query data frame: drop countries without a value, then take the 10 lowest
df[df[element].notnull()].sort_values(by=element).head(10)

Unnamed: 0,country,infant_mortality
38,Monaco,1.81
98,Japan,2.13
117,Bermuda,2.48
36,Norway,2.48
106,Singapore,2.53
37,Sweden,2.6
10,Czech Republic,2.63
78,Hong Kong,2.73
79,Macao,3.13
44,Iceland,3.15


## Answer-2
2. 10 cities with the largest population

In [7]:
import pandas as pd
from xml.etree import ElementTree as ET

# Parse the exercise data file
tree = ET.parse( './data/mondial_database.xml' )

# Tag names used to walk the tree; they double as the data frame column names
child = 'country'
element = 'city'
sub_elem = 'population'
# A generator: iterates over the tree and yields one pandas Series (data frame row) per matching city population entry
def iter_city_population(root, year='2011'):
    """Yield one pandas Series per city population entry recorded for ``year``.

    Walks every <country> under ``root`` that has a <name>, then every <city>
    anywhere below that country, and yields a row for each of the city's
    <population> children whose ``year`` attribute equals ``year``.

    Parameters:
        root: ElementTree element whose direct children include <country>.
        year: census/estimate year to select; compared as a string against
            the ``year`` attribute. Defaults to '2011'.

    Yields:
        pandas.Series with entries 'country', 'city' and 'population' (int).
        The Series ``name`` is a running index starting at 0.

    The tag names are spelled out here rather than read from the notebook
    globals ``child`` / ``element`` / ``sub_elem``, because other cells
    rebind those names.
    """
    wanted_year = str(year)
    count = 0
    for country_node in root.iterfind('country'):
        country_name = country_node.find('name')
        if country_name is None:
            continue

        # Element.iter() replaces getiterator(), which was removed in
        # Python 3.9; it visits all descendants, not just direct children.
        for city_node in country_node.iter('city'):
            city_name = city_node.find('name').text

            for pop_node in city_node.iterfind('population'):
                # .get() returns None when the entry has no year attribute,
                # so such entries are skipped instead of raising KeyError.
                if pop_node.get('year') != wanted_year:
                    continue

                row_s = pd.Series({
                    'country': country_name.text,
                    'city': city_name,
                    'population': int(pop_node.text),
                })
                row_s.name = count
                count += 1

                yield row_s
    
# Collect all rows first and build the data frame in a single step.
# DataFrame.append was removed in pandas 2.0, and appending row by row
# copied the whole frame on every iteration. Each Series' name becomes the
# row's index label, as it did with append.
rows = list(iter_city_population(tree.getroot()))
df = pd.DataFrame(rows, columns=[child, element, sub_elem])

# the 10 most populous cities, largest first
df.sort_values(by='population', ascending=False).head(10)

Unnamed: 0,country,city,population
529,India,Mumbai,12442373.0
554,India,Delhi,11034555.0
523,India,Bangalore,8443675.0
418,United Kingdom,London,8250205.0
487,Iran,Tehran,8154051.0
505,Bangladesh,Dhaka,7423137.0
558,India,Hyderabad,6731790.0
518,India,Ahmadabad,5577940.0
627,Angola,Luanda,5000000.0
542,India,Chennai,4646732.0


****
## Answer - 3
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [8]:
import pandas as pd
from xml.etree import ElementTree as ET

tree = ET.parse( './data/mondial_database.xml' )
root = tree.getroot()

# total population per country (latest estimate)
population = {}
# accumulated population per ethnic group, summed over all countries
eg_population = {}

for elem in root.findall('country'):
    country = elem.find('name').text

    # Assumption carried over from the original code: population entries are
    # listed oldest first, so the last one is the latest estimate.
    estimates = elem.findall('population')
    if not estimates:
        # Without a total population the percentages cannot be turned into
        # head counts; skip the country instead of failing with KeyError.
        continue
    population[country] = float(estimates[-1].text)

    # go through the ethnic groups of this country
    for item in elem.iterfind('ethnicgroup'):
        g_name = item.text
        percent = float(item.attrib['percentage']) / 100
        # work out the ethnic group population from percentage and total population
        g_population = population[country] * percent

        # add into the ethnic group's running total
        eg_population[g_name] = eg_population.get(g_name, 0.0) + g_population

print ("10 ethnicgroup with the largest population\n")
print ("Ethnicgroup: Population")
print ("=======================")
# sort by accumulated population, largest first, and keep the top 10
top_groups = sorted(eg_population.items(), key=lambda x: x[1], reverse=True)[:10]
for g_name, g_total in top_groups:
    print(g_name + ": ", int(g_total))

10 ethnicgroup with the largest population

Ethnicgroup: Population
Han Chinese:  1245058800
Indo-Aryan:  871815583
European:  494872219
African:  318325120
Dravidian:  302713744
Mestizo:  157734354
Bengali:  146776916
Russian:  131856996
Japanese:  126534212
Malay:  121993550


****
## Answer - 4
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [9]:
import pandas as pd
from xml.etree import ElementTree as ET

# Parse the exercise data file and grab its root element
tree = ET.parse( './data/mondial_database.xml' )
root = tree.getroot()

# Names shared by the helpers below. They are used as XML tag names
# (river, lake, airport, country, length, area, elevation), as the XML
# attribute name holding country codes (country), and as data frame
# column labels (all of them, including country_code).
river = 'river'
length = 'length'
country_code = 'country_code'
country = 'country'
lake = 'lake'
area = 'area'
airport = 'airport'
elevation = 'elevation'

def get_child_text(element, child):
    """Return the text of the first ``child`` sub-element of ``element``.

    Returns None when ``element`` has no such sub-element (and also when the
    sub-element exists but carries no text).
    """
    node = element.find(child)
    return None if node is None else node.text

def get_df_raw(idx, columns, values):
    """Build a data frame row: a pandas Series pairing ``columns`` with ``values``.

    ``columns`` and ``values`` are zipped positionally (extra items on the
    longer side are dropped) and ``idx`` becomes the Series name, i.e. the
    row label once the Series is placed in a data frame.
    """
    return pd.Series(dict(zip(columns, values)), name=idx)

def to_float(_str):
    """Convert ``_str`` to float, mapping None to 0.0."""
    return 0.0 if _str is None else float(_str)

def _iter_measured_feature(root, tag, measure):
    """Yield one data frame row per ``tag`` element directly under ``root``.

    Each row holds the element's <name> text (column ``tag``), the float
    value of its ``measure`` sub-element (0.0 when absent, via to_float) and
    the raw value of its country attribute (column ``country_code``). Rows
    are numbered from 0 in document order.

    Raises KeyError if an element lacks the country attribute.
    """
    for idx, elem in enumerate(root.findall(tag)):
        _columns = [tag, measure, country_code]
        _values = [
            get_child_text(elem, 'name'),
            to_float(get_child_text(elem, measure)),
            elem.attrib[country],
        ]
        yield get_df_raw(idx, _columns, _values)


def iter_river(root):
    """Yield one row (river name, length, country codes) per <river> under ``root``."""
    yield from _iter_measured_feature(root, river, length)


def iter_lake(root):
    """Yield one row (lake name, area, country codes) per <lake> under ``root``."""
    yield from _iter_measured_feature(root, lake, area)


def iter_airport(root):
    """Yield one row (airport name, elevation, country codes) per <airport> under ``root``."""
    yield from _iter_measured_feature(root, airport, elevation)

def iter_country(root):
    """Yield one row (car_code attribute, <name> text) per <country> under ``root``.

    Rows are numbered from 0 in document order; the columns are labelled
    with the module-level ``country_code`` and ``country`` names.
    """
    for position, node in enumerate(root.findall(country)):
        labels = [country_code, country]
        cells = [node.attrib['car_code'], get_child_text(node, 'name')]
        yield get_df_raw(position, labels, cells)
        
def get_countries(country_code):
    """Translate a whitespace-separated string of country codes into names.

    Each code is looked up in the module-level ``df_country`` frame and the
    'country' value of the first matching row is taken. Names are returned
    as a list in the order the codes appear.
    """
    return [
        df_country.loc[df_country.country_code == code, 'country'].iloc[0]
        for code in country_code.split()
    ]

# Each frame is built in one step from the collected rows. DataFrame.append
# was removed in pandas 2.0, and appending row by row copied the whole frame
# on every iteration. Each Series' name becomes the row's index label.
df_river = pd.DataFrame(list(iter_river(root)), columns=[river, length, country_code])

# df_country must exist before get_countries() is called below
df_country = pd.DataFrame(list(iter_country(root)), columns=[country_code, country])

# a) longest river
longest_river = df_river.sort_values(by=length, ascending=False).head(1)
lr_country_names = get_countries(longest_river.iloc[0]['country_code'])
print("\nLongest River: %s" % longest_river.iloc[0]['river'])
print("River Length: %f" % longest_river.iloc[0]['length'])
print("Country it passes through:", *lr_country_names, sep=" ")

# b) largest lake
df_lake = pd.DataFrame(list(iter_lake(root)), columns=[lake, area, country_code])

largest_lake = df_lake.sort_values(by=area, ascending=False).head(1)
ll_country_names = get_countries(largest_lake.iloc[0]['country_code'])
print("\nLargest Lake: %s" % largest_lake.iloc[0][lake])
print("Lake Area: %f" % largest_lake.iloc[0][area])
print("Country it belongs to:", *ll_country_names, sep=" ")

# c) airport at highest elevation
df_airport = pd.DataFrame(list(iter_airport(root)), columns=[airport, elevation, country_code])

airport_highest = df_airport.sort_values(by=elevation, ascending=False).head(1)
ah_country_names = get_countries(airport_highest.iloc[0]['country_code'])
print("\nAirport at Highest Elevation: %s" % airport_highest.iloc[0][airport])
print("Evelvation: %f" % airport_highest.iloc[0][elevation])
print("Country it belongs to:", *ah_country_names, sep=" ")




Longest River: Amazonas
River Length: 6448.000000
Country it passes through: Colombia Brazil Peru

Largest Lake: Caspian Sea
Lake Area: 386400.000000
Country it belongs to: Russia Azerbaijan Kazakhstan Iran Turkmenistan

Airport at Highest Elevation: El Alto Intl
Evelvation: 4063.000000
Country it belongs to: Bolivia
