# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET
import pandas as pd

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [3]:
# print names of all countries
for child in document_tree.getroot():
    print (child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print ('* ' + element.find('name').text + ':'),
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print (capitals_string[:-2])

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [6]:
document = ET.parse( './data/mondial_database.xml' )

In [9]:
#Exercise 1: 10 countries with the lowest infant mortality rates

countries=[]
mortality=[]
dfs= pd.DataFrame()
for element in document.iterfind('country'):
        country=element.find('name').text
        countries.append(country)
        try:
            infant_mortality=element.find('infant_mortality').text
            mortality.append(float(infant_mortality))
            
        except:
            infant_mortality="NAN"
            mortality.append(float(infant_mortality))
            continue
df=pd.DataFrame({"Countries":countries, "Infant mortality":mortality})
df.sort_values(by=["Infant mortality"],ascending=True).head(10)
    

Unnamed: 0,Countries,Infant mortality
38,Monaco,1.81
98,Japan,2.13
117,Bermuda,2.48
36,Norway,2.48
106,Singapore,2.53
37,Sweden,2.6
10,Czech Republic,2.63
78,Hong Kong,2.73
79,Macao,3.13
44,Iceland,3.15


In [45]:
# Exercise2: 10 cities with the largest population

tree = ET.parse('./data/mondial_database.xml')
root = tree.getroot()

city_list=[]
year_list=[]
popu_list=[]

for element in root.iter('country'):
    for city in element.iter('city'):
        cities = city.find('name').text
        for popu in city.iter():
            if popu.tag == 'population':
                city_list.append(cities)
                year_list.append(int(popu.attrib.get('year')))
                popu_list.append(int(popu.text))
            
df = pd.DataFrame({'City':city_list,'Year':year_list,'Population':popu_list})     
NoDuplicates=df.sort_values(by=['Year','Population'],ascending=True).drop_duplicates(['City'],keep='last')
NoDuplicates.sort_values(by=['Population'],ascending=False).head(10).set_index(['City'])


Unnamed: 0_level_0,Population,Year
City,Unnamed: 1_level_1,Unnamed: 2_level_1
Shanghai,22315474,2010
Istanbul,13710512,2012
Mumbai,12442373,2011
Moskva,11979529,2013
Beijing,11716620,2010
São Paulo,11152344,2010
Tianjin,11090314,2010
Guangzhou,11071424,2010
Delhi,11034555,2011
Shenzhen,10358381,2010


In [None]:
# Exercise3: 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)








In [146]:
document = ET.parse( './data/mondial_database.xml' )
root = document.getroot()
root.tag
root.attrib
pop

for child in root.findall('./country'):
  #  print(child)
    name = child.find('name').text
    if child.find('infant_mortality') is not None:
        infant = child.find('infant_mortality').text
        str1 = [name]
        str2 = [infant]
        str3 = {name:infant}
        #str3 = str1 + str3
        #str2 = [infant]
        #str4 = str2 
        #newstr = {'country':str1,'Infant':str2}
        #  pop = pd.DataFrame(newstr)
        print(str3)
       
       
                  

{'Albania': '13.19'}
{'Greece': '4.78'}
{'Macedonia': '7.9'}
{'Serbia': '6.16'}
{'Andorra': '3.69'}
{'France': '3.31'}
{'Spain': '3.33'}
{'Austria': '4.16'}
{'Czech Republic': '2.63'}
{'Germany': '3.46'}
{'Hungary': '5.09'}
{'Italy': '3.31'}
{'Liechtenstein': '4.33'}
{'Slovakia': '5.35'}
{'Slovenia': '4.04'}
{'Switzerland': '3.73'}
{'Belarus': '3.64'}
{'Latvia': '7.91'}
{'Lithuania': '6'}
{'Poland': '6.19'}
{'Ukraine': '8.1'}
{'Russia': '7.08'}
{'Belgium': '4.18'}
{'Luxembourg': '4.28'}
{'Netherlands': '3.66'}
{'Bosnia and Herzegovina': '5.84'}
{'Croatia': '5.87'}
{'Bulgaria': '15.08'}
{'Romania': '10.16'}
{'Turkey': '21.43'}
{'Denmark': '4.1'}
{'Estonia': '6.7'}
{'Faroe Islands': '5.71'}
{'Finland': '3.36'}
{'Norway': '2.48'}
{'Sweden': '2.6'}
{'Monaco': '1.81'}
{'Gibraltar': '6.29'}
{'Guernsey': '3.47'}
{'Iceland': '3.15'}
{'Ireland': '3.74'}
{'San Marino': '4.52'}
{'Jersey': '3.86'}
{'Malta': '3.59'}
{'Isle of Man': '4.17'}
{'Moldova': '12.93'}
{'Portugal': '4.48'}
{'United Kingdom'

In [31]:
document = ET.parse( './data/mondial_database.xml' )
def remove_empty_elements(document):
  for element in document.xpath('//*[not(node())]'):
    element.getparent().remove(element)

In [62]:
for child in document.getroot():
    e = child.findtext('infant_mortality')
    f = child.findtext('name')
    if e is not None:
        print(sort(e))
      

NameError: name 'sort' is not defined