# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [3]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [4]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [5]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

#### 10 countries with the lowest infant mortality rates

In [65]:
#cElementTree process faster than ElementTree

try:
    import xml.etree.cElementTree as ET
    import pandas as pd
    import numpy as np
except ImportError:
    import xml.etree.ElementTree as ET
    import pandas as pd
    import numpy as np

In [73]:
document = ET.parse( './data/mondial_database.xml' )

In [136]:
#Create a pandas DataFrame to get top 10
country_name = ''
infant_mortality = ''
d_frame = pd.DataFrame(columns = ["country_name","infant_mortality"])
d_frame

Unnamed: 0,country_name,infant_mortality


In [137]:
#loop over each country to get name and infant_mortality rate
#and append to dataframe
for country in document.findall('country'):
    country_name = country.find('name').text
    country_infant_mortality = country.find('infant_mortality')
    if country_infant_mortality != None: 
        country_infant_mortality = float(country_infant_mortality.text)
        d_frame.loc[len(d_frame)] = [country_name, country_infant_mortality] 

#sort by asc and show top 10
d_frame.sort_values(by = 'infant_mortality').drop_duplicates().head(10)

Unnamed: 0,country_name,infant_mortality
36,Monaco,1.81
90,Japan,2.13
109,Bermuda,2.48
34,Norway,2.48
98,Singapore,2.53
35,Sweden,2.6
8,Czech Republic,2.63
72,Hong Kong,2.73
73,Macao,3.13
39,Iceland,3.15


#### 10 cities with the largest population

In [227]:
city_name = ''
population = ''
df_city = pd.DataFrame(columns = ['city_name','Population'])
df_city['Population'] = df_city['Population'] .astype(float)
df_city

Unnamed: 0,city_name,Population


In [232]:
for country in document.findall('country'):
    for city in countries.iter('city'):
        cityname = city.find('name').text
        year = int(0)
        for p in city.iterfind('population'):
            year = p.attrib['year']
            if p.attrib['year'] >= year:
                population = int(p.text)
        df_city.loc[len(df_city)] = [cityname, population] 

df_city.sort_values(by = 'Population', ascending = False).drop_duplicates().head(10)

Unnamed: 0,city_name,Population
0,Tirana,418495.0
2324,Durrës,113249.0
6327,Vlorë,79513.0
4264,Elbasan,78703.0
6919,Shkodër,77075.0
1235,Korçë,51152.0


#### 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [235]:
country_name = ''
population = 0
ethnic_population = 0
document = ET.parse( './data/mondial_database.xml' )

df_ethnic = pd.DataFrame(columns = ['Country','Ethnic group','Population'])
df_ethnic

Unnamed: 0,Country,Ethnic group,Population


In [236]:
for country in document.iterfind('country'):
    country_name = country.find('name').text
    year = int(0)
    population = int(0)
    ethnic_name = ''
    ethnicpopulation = ''
    
    for element in country.iterfind('population'):
        year = element.attrib['year']
        if element.attrib['year'] >= year:
            population = int(element.text)
    
    for ethnic in country.iter('ethnicgroup'):
        ethnic_name = ethnic.text
        ethnicpopulation = round(float(ethnic.attrib['percentage']) * 0.01 * int(population))

        
        if ethnic_name == None:
            ethnic_name = countryname
            ethnicpopulation = countrypopulation
        df_ethnic.loc[len(df_ethnic)] = [country_name,ethnic_name,ethnicpopulation]
df_ethnic.groupby('Ethnic group').sum().sort_values(by = 'Population', ascending = False).head(10)

    

Unnamed: 0_level_0,Population
Ethnic group,Unnamed: 1_level_1
Han Chinese,1245059000.0
Indo-Aryan,871815600.0
European,494872200.0
African,318325100.0
Dravidian,302713700.0
Mestizo,157734400.0
Bengali,146776900.0
Russian,131857000.0
Japanese,126534200.0
Malay,121993600.0


#### name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [272]:
import re
# Longest River
longest_river = ''
river_length = 0
document = ET.parse( './data/mondial_database.xml' )

df_river = pd.DataFrame(columns = ['River','Country','Length'])
df_river

Unnamed: 0,River,Country,Length


In [284]:
for river in document.iterfind('river'):
    river_name = ''
    length = 0
    river_country_code = ''
    river_country_name = ''

    #check if name has - or not. Remove the word 'river'
    if '-' in river.attrib['id']:
        river_name = river.attrib['id'].split('-')[1]
    else:
        river_name = river.attrib['id']
    
    if river.find('source') is not None:
        #get country code and map with country name 
        river_country_code = river.find('source').get('country').split(' ')[0]
        if river.find('located') is not None:
            river_country_name = river.find('located').get('province')
            match = re.match('(-\w+ -)(\w+ -) | (-\w+ -)', river_country_name)
            print(match)
            #river_country_name = river_country_name.split(' ')[0].split('-')[1]
        print(river_name, river_country_name)
        
        #if country code is 3 characters, try to find the province name
        #and extract the middle word
        #if len(river_country_code) == 3:
        #    if river.find('located') is not None:
        #        river_country_name = river.find('located').attrib['province']
        #        river_country_name = river_country_name.split(' ')[0].split('-')[1]
        #    print(river_name, '3 charac long',river_country_name)
        #else:
        #    river_country_name = document.find(".//country[@car_code='"+river_country_code+"']")
        #    print(river_name, river_country_code)
        #    if river_country_name is not None:
        #        river_country_name.find('name').text
        
    if river.find('length') is not None:
        length = float(river.find('length').text)
    
    df_river.loc[len(df_river)] = [river_name,river_country_name,length]
df_river

Thjorsa 
Joekulsa_a_Fjoellum 
None
Glomma lteil-AK-N lteil-OES-N lteil-HE-N lteil-ST-N
None
Lagen lteil-OP-N lteil-HE-N lteil-AK-N
None
Goetaaelv prov-Sweden-5
None
Klaraelv prov-Sweden-22
None
Umeaelv prov-Sweden-23
None
Dalaelv prov-Sweden-4 prov-Sweden-11 prov-Sweden-21 prov-Sweden-25
None
Vaesterdalaelv prov-Sweden-11
None
Oesterdalaelv prov-Sweden-11
None
Paatsjoki lteil-LAP-SF
None
Ounasjoki lteil-LAP-SF
None
Kemijoki lteil-LAP-SF
None
Oulujoki lteil-OUL-SF
None
Kymijoki lteil-MIK-SF lteil-KYM-SF lteil-HAE-SF
None
Kokemaeenjoki lteil-TUP-SF lteil-HAE-SF
None
Vuoksi lteil-KYM-SF lteil-KUO-SF
None
Themse prov-gb-6 prov-gb-7 prov-gb-8 prov-gb-9
None
Maas prov-Netherlands-7 prov-Netherlands-10 prov-Netherlands-12 prov-Netherlands-13
None
Loire prov-France-11 prov-France-25 prov-France-30 prov-France-89 prov-France-111
None
Garonne prov-France-5 prov-France-62 prov-France-77
None
Rhone prov-France-62 prov-France-111 prov-France-104
None
Saone prov-France-72 prov-France-25 prov-France-

Unnamed: 0,River,Country,Length
0,Thjorsa,,230.0
1,Joekulsa_a_Fjoellum,,206.0
2,Glomma,lteil-AK-N lteil-OES-N lteil-HE-N lteil-ST-N,604.0
3,Lagen,lteil-OP-N lteil-HE-N lteil-AK-N,322.0
4,Goetaaelv,prov-Sweden-5,93.0
5,Klaraelv,prov-Sweden-22,460.0
6,Umeaelv,prov-Sweden-23,470.0
7,Dalaelv,prov-Sweden-4 prov-Sweden-11 prov-Sweden-21 pr...,520.0
8,Vaesterdalaelv,prov-Sweden-11,320.0
9,Oesterdalaelv,prov-Sweden-11,241.0
