# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [79]:
from xml.etree import ElementTree as ET
import pandas as pd
import numpy as np

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
# Parse the example XML file into an ElementTree for the cells that follow.
document_tree = ET.parse(source='./data/mondial_database_less.xml')

In [4]:
# Show the <name> text of every direct child of the document root
# (one line per child, in document order).
top_level = document_tree.getroot()
for country_node in top_level:
    name_node = country_node.find('name')
    print(name_node.text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [6]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print('* ' + element.find('name').text + ':')
    # Element.iter() visits matching descendants at any depth below the
    # country. It replaces getiterator(), which was removed in Python 3.9.
    city_names = [city.find('name').text for city in element.iter('city')]
    # join() yields the same comma-separated line as the old "+=" / [:-2]
    # approach, including an empty line for a country without cities.
    print(', '.join(city_names))

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [7]:
# Parse the full dataset used by the exercise.
document = ET.parse( './data/mondial_database.xml' )
# The exercise cells iterate over `root`, so bind the root element here;
# otherwise they fail with a NameError on a fresh kernel.
root = document.getroot()

In [129]:
# Exercise 1: the 10 countries with the lowest infant mortality.
# Map each country name to its infant mortality value (NaN when unavailable).
inf_mor_dict = {}

for country in root.iter('country'):
    name = country.find('name').text

    # findtext() returns None when the element is absent, so missing data is
    # detected explicitly instead of through a catch-all `except`.
    raw_value = country.findtext('infant_mortality')
    if raw_value is None:
        inf_mor = np.nan
    else:
        try:
            # NOTE(review): the source value is divided by 100 as in the
            # original analysis; confirm the unit of <infant_mortality>
            # (per 100 vs. per 1000 births) before interpreting magnitudes.
            # The ranking itself does not depend on the scale.
            inf_mor = float(raw_value)/100
        except ValueError:
            # Empty or non-numeric text is treated as missing data.
            inf_mor = np.nan

    inf_mor_dict[name] = inf_mor

inf_mor_df = pd.DataFrame.from_dict(inf_mor_dict, orient='index')
inf_mor_df.columns = ['infant_mortality_rate']
# Drop countries without a value, then list the ten smallest rates.
inf_mor_df.dropna(subset=['infant_mortality_rate']).sort_values(by='infant_mortality_rate').head(10)

Unnamed: 0,infant_mortality_rate
Monaco,0.0181
Japan,0.0213
Bermuda,0.0248
Norway,0.0248
Singapore,0.0253
Sweden,0.026
Czech Republic,0.0263
Hong Kong,0.0273
Macao,0.0313
Iceland,0.0315


In [130]:
# Exercise 2: the 10 cities with the largest population.
# Map each city name to the (year, population) pair of its most recent
# population figure.
# NOTE: the dict is keyed by city name, so cities that share a name
# overwrite each other.
pop_dict = {}

for city in root.iter('city'):
    name = city.find('name').text

    # Reset once per city (not once per <population> element) so the
    # comparison below really keeps the entry with the latest year, and so a
    # city without any population data gets (0, 0) instead of silently
    # inheriting the previous city's figures.
    year = 0
    population = 0
    for pop in city.findall('population'):
        pop_year = int(pop.get('year'))
        if pop_year > year:
            year = pop_year
            population = int(pop.text)

    pop_dict[name] = (year, population)

pop_df = pd.DataFrame.from_dict(pop_dict, orient='index')
pop_df.columns = ['year', 'population']
pop_df.sort_values(by='population',ascending=False).head(10)

Unnamed: 0,year,population
Shanghai,2010,22315474
Istanbul,2012,13710512
Mumbai,2011,12442373
Moskva,2013,11979529
Beijing,2010,11716620
São Paulo,2010,11152344
Tianjin,2010,11090314
Guangzhou,2010,11071424
Delhi,2011,11034555
Shenzhen,2010,10358381


In [139]:
# Exercise 3: the 10 ethnic groups with the largest overall populations.
# Map each ethnic group name to its estimated head count summed over all
# countries (latest country population * group percentage).
ethnic_dict = {}

for country in root.iter('country'):

    # get latest population for that country
    # The running maximum is reset once per country (not once per
    # <population> element); otherwise the last element would always win and
    # a country without population data would reuse the previous country's.
    year = 0
    population = 0
    for pop in country.findall('population'):
        pop_year = int(pop.get('year'))
        if pop_year > year:
            year = pop_year
            population = int(pop.text)

    # get ethnic percentages, calculate population, and add to dict
    for ethnic in country.findall('ethnicgroup'):
        ethnicgroup = ethnic.text
        ethnicpercent = float(ethnic.get('percentage'))/100

        # Truncate each country's contribution to whole people before
        # accumulating, starting from 0 for a group not seen before.
        ethnic_dict[ethnicgroup] = ethnic_dict.get(ethnicgroup, 0) + int(ethnicpercent * population)

# creating data frame and output
ethnic_df = pd.DataFrame.from_dict(ethnic_dict, orient='index')
ethnic_df.columns = ['population']
ethnic_df.sort_values('population',ascending=False).head(10)

Unnamed: 0,population
Han Chinese,1245058800
Indo-Aryan,871815583
European,494872201
African,318325104
Dravidian,302713744
Mestizo,157734349
Bengali,146776916
Russian,131856989
Japanese,126534212
Malay,121993548


In [161]:
# Exercise 4: collect the raw data for rivers, lakes and airports.
#   country_dict: car_code -> country name
#   river_dict:   river name   -> (length, [country codes])
#   lake_dict:    lake name    -> (area, [country codes])
#   airport_dict: airport name -> (elevation, country attribute as given)
country_dict = {}
river_dict = {}
lake_dict = {}
airport_dict = {}


def _parse_number(element, tag, cast):
    """Return cast(text) of the first `tag` child of `element`, or NaN.

    NaN is returned when the child element is missing or when its text is
    empty or cannot be converted by `cast`; any other error propagates.
    """
    text = element.findtext(tag)
    if text is None:
        return np.nan
    try:
        return cast(text)
    except ValueError:
        return np.nan


def _country_codes(element):
    """Return the whitespace-separated codes of the `country` attribute as a list."""
    # A missing attribute yields an empty list instead of an AttributeError.
    return (element.get('country') or '').split()


for country in root.iter('country'):
    countrycode = country.get('car_code')
    countryname = country.find('name').text
    country_dict[countrycode] = countryname

for river in root.iter('river'):
    rivername = river.find('name').text
    # NOTE(review): with an int cast, non-integer text such as "12.5" is
    # recorded as NaN, as before; confirm that is intended for lengths.
    length = _parse_number(river, 'length', int)
    river_dict[rivername] = (length, _country_codes(river))

for lake in root.iter('lake'):
    name = lake.find('name').text
    area = _parse_number(lake, 'area', float)
    lake_dict[name] = (area, _country_codes(lake))

for airport in root.iter('airport'):
    name = airport.find('name').text
    elevation = _parse_number(airport, 'elevation', int)
    # The airport's country attribute is stored unsplit, exactly as given.
    country = airport.get('country')
    airport_dict[name] = (elevation, country)

In [163]:
# Tabulate the rivers (index = river name) and show the longest one.
river_df = pd.DataFrame.from_dict(data=river_dict, orient='index')
river_df.columns = ['length', 'country']
# Descending sort puts the greatest length first; keep only that row.
river_df.sort_values(by='length', ascending=False).iloc[:1]

Unnamed: 0,length,country
Amazonas,6448.0,"[CO, BR, PE]"


In [164]:
# Tabulate the lakes (index = lake name) and show the one with the largest area.
lake_df = pd.DataFrame.from_dict(data=lake_dict, orient='index')
lake_df.columns = ['area', 'country']
# Descending sort puts the greatest area first; keep only that row.
lake_df.sort_values(by='area', ascending=False).iloc[:1]

Unnamed: 0,area,country
Caspian Sea,386400.0,"[R, AZ, KAZ, IR, TM]"


In [165]:
# Tabulate the airports (index = airport name) and show the highest one.
airport_df = pd.DataFrame.from_dict(data=airport_dict, orient='index')
airport_df.columns = ['elevation', 'country']
# Descending sort puts the greatest elevation first; keep only that row.
airport_df.sort_values(by='elevation', ascending=False).iloc[:1]

Unnamed: 0,elevation,country
El Alto Intl,4063.0,BOL
