# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [21]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [22]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [23]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [24]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [25]:
def formatPrint(lst):
    '''this function helps print with proper format'''
    for tup in lst:
        if len(tup[0]) > 7:
            print tup[0] + str('\t') + str(tup[1])
        else:
            print tup[0] + str('\t') * 2 + str(tup[1])

In [26]:
#parse XML file into document
document = ET.parse( './data/mondial_database.xml' )

In [27]:
# 1) 10 countries with the lowest infant mortality rates

infant_mortality_lst = [] #list of tuples that will hold countries and its respective infant mortality (country, infant_mortality)

#this loop iterates through and extract each country and its infant mortality from document XML
for element in document.iterfind('country'):
    for subelement in element.iter('infant_mortality'):
        country = element.find('name').text
        mortality_rate = float(subelement.text)
        infant_mortality_lst.append( (country, mortality
                                      ++_rate) )

infant_mortality_lst.sort(key = lambda tup: tup[1]) #sorts the list by infant mortality rates (the 2nd entry of each tuple)
formatPrint(infant_mortality_lst[:10])

Monaco		1.81
Japan		2.13
Norway		2.48
Bermuda		2.48
Singapore	2.53
Sweden		2.6
Czech Republic	2.63
Hong Kong	2.73
Macao		3.13
Iceland		3.15


In [28]:
# 2) 10 cities with the largest population

pop_lst = [] #list of tuples holding city name and population (city, population)

#this loop iterates through and extracts each city and its latest population from document XML
for element in document.iter('city'):
    city = element.find('name').text
    if element.findall('population') == []:
        #No population found
        continue
    else:
        pop = int(element.findall('population')[-1].text)
        pop_lst.append( (city, pop))
        
pop_lst.sort(key = lambda tup: tup[1], reverse = True)
formatPrint(pop_lst[:10])

Shanghai	22315474
Istanbul	13710512
Mumbai		12442373
Moskva		11979529
Beijing		11716620
São Paulo	11152344
Tianjin		11090314
Guangzhou	11071424
Delhi		11034555
Shenzhen	10358381


In [29]:
#3) 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

ethnic_dict = {} #key is an ethnic group, value is this group's cumulative population over all countries
ethnic_lst = []  #list of tuples of (ethnic_group, cumulative world population)

for element in document.iterfind('country'):
    #country = element.find('name').text
    pop = int(element.findall('population')[-1].text)
    
    #need a check for whether ethnic group exists for each country
    if element.findall('ethnicgroup') == []:
        continue
    else:
        grp = element.findall('ethnicgroup')[0].text #the first ethnic group has the highest percentage of the population
        pct = float(element.findall('ethnicgroup')[0].get('percentage')) / 100.0
        if grp in ethnic_dict.keys():
            ethnic_dict[grp] += int(pop * pct)
        else:
            ethnic_dict[grp] = int(pop * pct)

for key, value in ethnic_dict.items():
    ethnic_lst.append( (key, value) )
    
#sort ethnic_lst so top 10 ethnic groups can be displayed
ethnic_lst.sort(key = lambda tup: tup[1], reverse = True)
formatPrint(ethnic_lst[:10])
        #print country, pop, grp, pct
    

Han Chinese	1245058800
European	441003283
Dravidian	302713744
African		198605031
Bengali		146776916
Mestizo		141972914
Japanese	126534212
Russian		114646210
Javanese	113456006
German		79192719


In [43]:
# 4) name and country of a) longest river, b) largest lake and c) airport at highest elevation
def getCarCodes(countryElement):
    car_codes = []
    for code in countryElement:
        car_codes.append(code.get('country'))
    return car_codes

def findCountryName(car_codes, doc):
    #input car_code(s) and an XML object, doc, returns the name of the country or countries
    car_codes_copy = car_codes[:]
    countries = []
    for code in car_codes:
        for element in doc.iterfind('country'):
            curr_code = element.get('car_code')
            if curr_code == None:
                continue
            else:
                if curr_code in car_codes_copy:
                    countries.append(element.find('name').text)
                    car_codes_copy.remove(curr_code)
                    if len(car_codes_copy) == 0:
                        return countries

longestRiver = None
longest = None

#finds the longest river
for element in document.iterfind('river'):
    currRiver = element.find('name').text
    if element.find('length') == None or element.find('located') == None:
        continue
    else:
        currLength = float(element.find('length').text) #cast string to a float type
    if currLength > longest:
        longest = currLength
        longestRiver = currRiver
        countryElement = element.findall('located')

print longestRiver, longest, findCountryName(getCarCodes(countryElement), document)
    

Amazonas 6448.0 ['Colombia', 'Brazil', 'Peru']


In [45]:
#finds largest lake
largestLake = None; largest = None
for element in document.iterfind('lake'):
    currLake = element.find('name').text
    if element.find('area') == None or element.find('located') == None:
        continue
    else:
        currArea = float(element.find('area').text) #cast string to a float type
    if currArea > largest:
        largest = currArea
        largestLake = currLake
        countryElement = element.findall('located')
        
print largestLake, largest, findCountryName(getCarCodes(countryElement), document)

Caspian Sea 386400.0 ['Russia', 'Iran', 'Turkmenistan', 'Kazakhstan']


In [50]:
#finds airport at highest elevation
highestAirport = None; highest = None
for element in document.iterfind('airport'):
    currAirport = element.find('name').text
    if element.find('elevation') == None or element.get('country') == None or element.find('elevation').text == None:
        continue
    else:
        currElevation = float(element.find('elevation').text) #cast string to a float type
    if currElevation > highest:
        highest = currElevation
        highestAirport = currAirport
        country = element.get('country')
        
print highestAirport, highest, country

El Alto Intl 4063.0 BOL
