# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [10]:
from xml.etree import ElementTree as ET
from pandas.io.json import json_normalize
import numpy as np
import pandas as pd

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [11]:
# Parse the example XML file into an ElementTree; the example cells below read from document_tree
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [12]:
# print names of all countries
# Assumes every child of the root carries a <name> child; if one does not,
# find() returns None and the .text access raises AttributeError.
for country in document_tree.getroot():
    # Single-argument print(...) behaves the same on Python 2.7 and Python 3.
    print(country.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [13]:
# print names of all countries and their cities
for country in document_tree.iterfind('country'):
    # Element.iter() replaces the deprecated getiterator() (removed in
    # Python 3.9). It walks all descendants, so cities nested deeper inside
    # the country element are listed as well.
    city_names = [city.find('name').text for city in country.iter('city')]
    # Build the whole line first and print it once; a country without any
    # city simply gets an empty list after the colon.
    print('* ' + country.find('name').text + ': ' + ', '.join(city_names))

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [14]:
# Re-bind document_tree to the exercise data file; the exercise cells below read from it
document_tree = ET.parse( './data/mondial_database.xml' )

In [17]:
# Create dataframe of all countries and their infant mortality rate, from xml file
mortality_rows = []
for country in document_tree.iterfind('country'):
    # None marks a country without an <infant_mortality> element. pandas
    # stores it as NaN, so no magic number can be mistaken for a real rate.
    rate = None
    # iter() replaces the deprecated getiterator(); if several matching
    # descendants exist, the last one in document order is used.
    for node in country.iter('infant_mortality'):
        rate = float(node.text)
    mortality_rows.append({'country': country.find('name').text,
                           'infant_mortality': rate})
# Naming the columns keeps the frame well-formed even if no country was found.
df = pd.DataFrame(mortality_rows, columns=['country', 'infant_mortality'])

# Remove those with missing infant mortality rate (original row labels are kept)
countries_infant_mortality = df.dropna(subset=['infant_mortality'])

# Question 1: 10 countries with the lowest infant mortality rates
countries_infant_mortality.sort_values(by="infant_mortality").head(10)

Unnamed: 0,country,infant_mortality
38,Monaco,1.81
98,Japan,2.13
117,Bermuda,2.48
36,Norway,2.48
106,Singapore,2.53
37,Sweden,2.6
10,Czech Republic,2.63
78,Hong Kong,2.73
79,Macao,3.13
44,Iceland,3.15


In [23]:
# Create dataframe of all cities (with their country) and their population
# for the LATEST recorded year, from xml file
city_rows = []
for country in document_tree.iterfind('country'):
    country_name = country.find('name').text
    # iter() replaces the deprecated getiterator() and also reaches cities
    # that are nested deeper inside the country element.
    for city in country.iter('city'):
        city_name = city.find('name').text
        figures = [(int(node.get('year')), int(node.text))
                   for node in city.iter('population')]
        if not figures:
            continue  # city without any population figure
        # Choose the latest year per <city> element, not per (country, name)
        # pair: two same-named cities in one country must not be conflated,
        # otherwise the one with older data silently disappears.
        latest_year = max(year for year, _ in figures)
        # All figures recorded for the latest year are kept.
        for year, population in figures:
            if year == latest_year:
                city_rows.append({'city': city_name,
                                  'country': country_name,
                                  'population': population,
                                  'year': year})

# Naming the columns fixes their order and keeps the frame well-formed when empty.
cities_population = pd.DataFrame(
    city_rows, columns=['city', 'country', 'population', 'year'])

# Question 2: 10 cities with the largest population
cities_population.sort_values('population', ascending=False).head(10)

Unnamed: 0,city,country,population,year
1250,Shanghai,China,22315474,2010
707,Istanbul,Turkey,13710512,2012
1420,Mumbai,India,12442373,2011
443,Moskva,Russia,11979529,2013
1249,Beijing,China,11716620,2010
2592,São Paulo,Brazil,11152344,2010
1251,Tianjin,China,11090314,2010
974,Guangzhou,China,11071424,2010
1466,Delhi,India,11034555,2011
977,Shenzhen,China,10358381,2010


In [21]:
# Create dataframes for: a) all countries with their LATEST population figure,
# b) all countries, ethnicgroups, and percentage,   from xml file
country_rows = []
ethnic_rows = []
for country in document_tree.iterfind('country'):
    country_name = country.find('name').text

    # Keep exactly one population figure per country: the one with the
    # greatest year (a later entry wins a tie). Several rows for the same
    # year would multiply that country's ethnic groups in the merge below
    # and inflate the totals.
    latest = None
    for node in country.findall('population'):
        year = int(node.get('year'))
        if latest is None or year >= latest['year']:
            latest = {'country': country_name,
                      'population': int(node.text),
                      'year': year}
    if latest is not None:
        country_rows.append(latest)

    # iter() replaces the deprecated getiterator()
    for group in country.iter('ethnicgroup'):
        ethnic_rows.append({'country': country_name,
                            'ethnicgroup': group.text,
                            'percentage': float(group.get('percentage'))})

# Country & population for the LATEST year
countries_population = pd.DataFrame(
    country_rows, columns=['country', 'population', 'year'])

# Ethnic groups by percentage of country
ethnic_percentage = pd.DataFrame(
    ethnic_rows, columns=['country', 'ethnicgroup', 'percentage'])

# Merge on the shared 'country' column and turn each percentage into a head count
df = countries_population.merge(ethnic_percentage)
df['ethnic_population'] = df.population * df.percentage / 100

# Only the group name and its head count are needed for the aggregation
ethnic_totals = df[['ethnicgroup', 'ethnic_population']]

# Question 3: 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
ethnic_totals.groupby('ethnicgroup').sum().sort_values('ethnic_population', ascending=False).head(10)

Unnamed: 0_level_0,ethnic_population
ethnicgroup,Unnamed: 1_level_1
Han Chinese,1245059000.0
Indo-Aryan,871815600.0
European,494872200.0
African,318325100.0
Dravidian,302713700.0
Mestizo,157734400.0
Bengali,146776900.0
Russian,131857000.0
Japanese,126534200.0
Malay,121993600.0


In [22]:
# Find, from xml file: a) the longest river, b) the largest lake and
# c) the airport at highest elevation, each with the countries it belongs to


def _record_holder(tree, tag, measure, country_by_code):
    """Return the name and countries of the <tag> element with the largest <measure>.

    tree            -- parsed ElementTree whose root holds the <tag> elements
    tag             -- element type to scan, e.g. 'river'
    measure         -- numeric child element to maximise, e.g. 'length'
    country_by_code -- dict mapping a car_code to the country name

    Returns a (name, list of country names) tuple. Elements whose <measure>
    child is missing or empty are skipped; country codes that are not in
    country_by_code are left out. Raises ValueError if no element has a value.
    """
    best_value = None
    best_element = None
    for element in tree.iterfind(tag):
        node = element.find(measure)
        # Test against None explicitly: an Element without children is falsy,
        # and the element itself (not just its text) may be absent.
        if node is None or node.text is None:
            continue
        value = float(node.text)
        if best_value is None or value > best_value:
            best_value = value
            best_element = element
    if best_element is None:
        raise ValueError('no <' + tag + '> element has a <' + measure + '> value')
    # The 'country' attribute holds one or more space-separated car_codes.
    codes = (best_element.get('country') or '').split()
    countries = [country_by_code[code] for code in codes if code in country_by_code]
    return best_element.find('name').text, countries


# countries and car_codes
country_by_code = {}
for country in document_tree.iterfind('country'):
    country_by_code[country.get('car_code')] = country.find('name').text

# longest river
river_name, river_countries = _record_holder(
    document_tree, 'river', 'length', country_by_code)

# biggest lake
lake_name, lake_countries = _record_holder(
    document_tree, 'lake', 'area', country_by_code)

# airport at highest elevation
airport_name, airport_countries = _record_holder(
    document_tree, 'airport', 'elevation', country_by_code)

# Question 4: name and country of a) longest river, b) largest lake and c) airport at highest elevation
print("The longest river is: " + river_name + ", located in countries: " + ', '.join(river_countries))
print("The largest lake is: " + lake_name + ", located in countries: " + ', '.join(lake_countries))
print("The airport at highest elevation is: " + airport_name + ", located in countries: " + ', '.join(airport_countries))

The longest river is: Amazonas, located in countries: Colombia, Brazil, Peru
The largest lake is: Caspian Sea, located in countries: Russia, Azerbaijan, Kazakhstan, Iran, Turkmenistan
The airport at highest elevation is: El Alto Intl, located in countries: Bolivia
