# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [3]:
# print names of all countries
for child in document_tree.getroot():
    print(child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print('* ' + element.find('name').text + ':'),
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print(capitals_string[:-2])

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [5]:
# question 1

document = ET.parse( './data/mondial_database.xml' ) #load in data
data_dict = {} #create blank dictionary

for country in document.iterfind('country'): #for every country...  
    for pop in country.iterfind('population'): #find the population
        try: #if the population is there try...
            #append infant mortality as values to dictionary:
            data_dict[country.find('name').text] = [float(country.find('infant_mortality').text)] 
        except: #do nothing if it doesn't work
            pass
data_dict #test

{'Afghanistan': [117.23],
 'Albania': [13.19],
 'Algeria': [21.76],
 'American Samoa': [8.92],
 'Andorra': [3.69],
 'Angola': [79.99],
 'Anguilla': [3.4],
 'Antigua and Barbuda': [13.29],
 'Argentina': [9.96],
 'Armenia': [13.97],
 'Aruba': [11.74],
 'Australia': [4.43],
 'Austria': [4.16],
 'Azerbaijan': [26.67],
 'Bahamas': [12.5],
 'Bahrain': [9.68],
 'Bangladesh': [45.67],
 'Barbados': [10.93],
 'Belarus': [3.64],
 'Belgium': [4.18],
 'Belize': [20.31],
 'Benin': [57.09],
 'Bermuda': [2.48],
 'Bhutan': [37.89],
 'Bolivia': [38.61],
 'Bosnia and Herzegovina': [5.84],
 'Botswana': [9.38],
 'Brazil': [19.21],
 'British Virgin Islands': [13.45],
 'Brunei': [10.48],
 'Bulgaria': [15.08],
 'Burkina Faso': [76.8],
 'Burundi': [63.44],
 'Cambodia': [51.36],
 'Cameroon': [55.1],
 'Canada': [4.71],
 'Cape Verde': [24.28],
 'Cayman Islands': [6.21],
 'Central African Republic': [92.86],
 'Chad': [90.3],
 'Chile': [7.02],
 'China': [14.79],
 'Colombia': [15.02],
 'Comoros': [65.31],
 'Congo': 

In [6]:
import pandas as pd #import package

#convert dictionary to data frame and sort by lowest infant mortality:
df_answer1 = pd.DataFrame.from_dict(data_dict, orient = 'index').sort_values(by=0) 
df_answer1.head(10) # answer 1, lowest infant mortality

Unnamed: 0,0
Monaco,1.81
Japan,2.13
Bermuda,2.48
Norway,2.48
Singapore,2.53
Sweden,2.6
Czech Republic,2.63
Hong Kong,2.73
Macao,3.13
Iceland,3.15


In [7]:
# question 2
pop_dict = {}

for country in document.iterfind('country'):#for every country... 
    for city in country.iterfind('city'): #find every city...
        for pop in city.iterfind('population'): #and the population
            try: #if the city is there try...
            #append population as values to dictionary:
                pop_dict[city.find('name').text] = [int(pop.text)]
            except:
                pass
            
df_answer2 = pd.DataFrame.from_dict(pop_dict, orient = 'index').sort_values(by=0, ascending = False)
df_answer2.head(10)

Unnamed: 0,0
Seoul,9708483
Al Qahirah,8471859
Bangkok,7506700
Hong Kong,7055071
Ho Chi Minh,5968384
Singapore,5076700
Al Iskandariyah,4123869
New Taipei,3939305
Busan,3403135
Pyongyang,3255288


In [8]:
# question 3
data_dict2 = {} #create 2 new data dics, 1 for population, 1 for ethnic groups, country key for both
data_dict3 = {}

#populate dict 2
for country in document.iterfind('country'): #for every country...  
    for pop in country.iterfind('population'): #find the population...
        #if found try appending country (key) population (value) to dictionary
        data_dict2[country.find('name').text] = [int(pop.text)] 
           
data_dict2 #test

{'Afghanistan': [26023100],
 'Albania': [2800138],
 'Algeria': [37062820],
 'American Samoa': [55519],
 'Andorra': [78115],
 'Angola': [24383301],
 'Anguilla': [13037],
 'Antigua and Barbuda': [81799],
 'Argentina': [42669500],
 'Armenia': [3026879],
 'Aruba': [101484],
 'Australia': [23135281],
 'Austria': [8499759],
 'Azerbaijan': [9356500],
 'Bahamas': [353658],
 'Bahrain': [1234596],
 'Bangladesh': [149772364],
 'Barbados': [277821],
 'Belarus': [9460692],
 'Belgium': [11099554],
 'Belize': [312971],
 'Benin': [9983884],
 'Bermuda': [64237],
 'Bhutan': [733004],
 'Bolivia': [10027262],
 'Bosnia and Herzegovina': [3791622],
 'Botswana': [2038228],
 'Brazil': [202768562],
 'British Virgin Islands': [23161],
 'Brunei': [393372],
 'Bulgaria': [7284552],
 'Burkina Faso': [17322796],
 'Burundi': [8444784],
 'Cambodia': [14364931],
 'Cameroon': [19406100],
 'Canada': [35158304],
 'Cape Verde': [491875],
 'Cayman Islands': [55691],
 'Central African Republic': [4349921],
 'Ceuta': [82376],

In [9]:
#populate dict 3
for country in document.iterfind('country'):
    group_dict = {} #temporary dict for ethnic groups
    for group in country.iterfind('ethnicgroup'): #find every ethnic group in every country...
        #populate temporary dict with ethnic group (key) name and percent (value)
        group_dict[group.text] = float(group.attrib['percentage'])
        #popeulate dictionary 3 with all of the country's (key) and all of the temporary dict (value)
        data_dict3[country.find('name').text] = group_dict  
        
data_dict3 #test

{'Afghanistan': {'Hazara': 19.0, 'Pashtun': 38.0, 'Tajik': 25.0, 'Uzbek': 6.0},
 'Albania': {'Albanian': 95.0, 'Greek': 3.0},
 'Algeria': {'Arab-Berber': 99.0, 'European': 1.0},
 'American Samoa': {'Caucasian': 2.0, 'Samoan': 89.0, 'Tongan': 4.0},
 'Andorra': {'African': 5.0,
  'Andorran': 33.0,
  'French': 2.0,
  'Portuguese': 11.0,
  'Spanish': 43.0},
 'Angola': {'Bakongo': 13.0,
  'European': 1.0,
  'Kimbundu': 25.0,
  'Ovimbundu': 37.0},
 'Anguilla': {'Black': 90.1, 'Mulatto': 4.6, 'White': 3.7},
 'Argentina': {'European': 97.0},
 'Armenia': {'Armenian': 97.7, 'Russian': 0.5, 'Yezidi': 1.3},
 'Aruba': {'European/Caribbean Amerindian': 80.0},
 'Australia': {'Asian': 7.0, 'European': 92.0},
 'Austria': {'Austrian': 91.1,
  'Croat': 2.0,
  'German': 0.9,
  'Serbs': 2.0,
  'Slovene': 1.0,
  'Turkish': 1.6},
 'Azerbaijan': {'Armenian': 1.5,
  'Azeri': 90.6,
  'Dagestani': 2.2,
  'Russian': 1.8},
 'Bahrain': {'Arab': 10.0, 'Asian': 13.0, 'Bahraini': 63.0, 'Iranian': 8.0},
 'Bangladesh': 

In [10]:
df_1 = pd.DataFrame.from_dict(data_dict2, orient='index') #turn the dicts to dataframes
df_2 = pd.DataFrame.from_dict(data_dict3, orient='index')

df_2.fillna(0, inplace = True) # deal with missing values
df_2.head() #test

Unnamed: 0,Albanian,Greek,Macedonian,Turkish,Gypsy,Serb,Montenegrin,Hungarian,Roma,Bosniak,...,Lugbara,Bunyoro,Batobo,Sotho,Euro-African,Indo-Mauritian,Sino-Mauritian,Franco-Mauritian,African descent,Seychellois
Afghanistan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Albania,95.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Algeria,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
American Samoa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Andorra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
df_2['population'] = df_1[0] #combine data frames

pd.to_numeric(df_2['population']) #make sure population is numeric

for column in df_2.columns[:-1]: #for all ethnic group columns in dataframe convert number to percentage  
    df_2[column] = df_2[column]/100
    
for column in df_2.columns[:]: #for all ethnic group columns in dataframe convert percentage to actua amount of population
    df_2[column] = df_2[column] * df_2['population']
df_2 #test

Unnamed: 0,Albanian,Greek,Macedonian,Turkish,Gypsy,Serb,Montenegrin,Hungarian,Roma,Bosniak,...,Bunyoro,Batobo,Sotho,Euro-African,Indo-Mauritian,Sino-Mauritian,Franco-Mauritian,African descent,Seychellois,population
Afghanistan,0.0,0.00,0.0,0.000000e+00,0.0,0.0,0.0,0.000,0.000,0.000,...,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,677201733610000
Albania,2660131.1,84004.14,0.0,0.000000e+00,0.0,0.0,0.0,0.000,0.000,0.000,...,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7840772819044
Algeria,0.0,0.00,0.0,0.000000e+00,0.0,0.0,0.0,0.000,0.000,0.000,...,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1373652626352400
American Samoa,0.0,0.00,0.0,0.000000e+00,0.0,0.0,0.0,0.000,0.000,0.000,...,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3082359361
Andorra,0.0,0.00,0.0,0.000000e+00,0.0,0.0,0.0,0.000,0.000,0.000,...,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6101953225
Angola,0.0,0.00,0.0,0.000000e+00,0.0,0.0,0.0,0.000,0.000,0.000,...,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,594545367656601
Anguilla,0.0,0.00,0.0,0.000000e+00,0.0,0.0,0.0,0.000,0.000,0.000,...,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,169963369
Argentina,0.0,0.00,0.0,0.000000e+00,0.0,0.0,0.0,0.000,0.000,0.000,...,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1820686230250000
Armenia,0.0,0.00,0.0,0.000000e+00,0.0,0.0,0.0,0.000,0.000,0.000,...,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9161996480641
Aruba,0.0,0.00,0.0,0.000000e+00,0.0,0.0,0.0,0.000,0.000,0.000,...,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10299002256


In [12]:
ethnic_total_dict = {} #blank dictionary for ethnic group totals

for column in df_2.columns[:]: #populate dictionary with ethnic group name (key) and total poulation (value)
     ethnic_total_dict[column] = [df_2[column].sum()]  
        
ethic_pop_df = pd.DataFrame.from_dict(ethnic_total_dict, orient='index') #convert dictionary to dataframe

ethic_pop_df.sort_values(by=0, ascending=False).head(10) # answer 3, largest ethnic groups 

Unnamed: 0,0
Han Chinese,1245059000.0
Indo-Aryan,871815600.0
European,494872200.0
African,318325100.0
Dravidian,302713700.0
Mestizo,157734400.0
Bengali,146776900.0
Russian,131857000.0
Japanese,126534200.0
Malay,121993600.0


In [13]:
#question 4
river_dict = {} #blank dictionaries for each of the three categories
lake_dict = {}
airport_dict = {}

#populate river dictionary
for river in document.iterfind('river'): #for every river
        try: #find the name (key) and length (value) and add it to the dictioanry
            river_dict[river.find('name').text] = int(river.find('length').text)
        except:
            pass

#populate lake dictionary-repeat above
for lake in document.iterfind('lake'):
        try:
            lake_dict[lake.find('name').text] = int(lake.find('area').text)
        except:
            pass

#populate airport dictionary-repeat above
for airport in document.iterfind('airport'): 
        try:
            airport_dict[airport.find('name').text] = int(airport.find('elevation').text)
        except:
            pass

#convert dictionaries to date frames
df_river = pd.DataFrame.from_dict(river_dict, orient = 'index') 
df_lake = pd.DataFrame.from_dict(lake_dict, orient = 'index')
df_airport = pd.DataFrame.from_dict(airport_dict, orient = 'index')

#pick the longest river, largest lake, and highest airport
top_river = df_river.sort_values(by=0, ascending=False).head(1).index.tolist()
top_lake = df_lake.sort_values(by=0, ascending=False).head(1).index.tolist()
top_airport = df_airport.sort_values(by=0, ascending=False).head(1).index.tolist()

top_river #test

['Amazonas']

In [14]:
top_lake #test

['Caspian Sea']

In [15]:
top_airport #test

['El Alto Intl']

In [27]:
river_final = [] #blank list to fill with country names

#collect all of the country codes for countries that have the longest river
for river in document.iterfind('river'): #for every river...
        if river.find('name').text==top_river[0]: #check to see if it's the longest river
            river_answer = river.attrib['country'].split()  #if it is add the country code to the list

#translate the country codes into the country names
for river in document.iterfind('country'): #for every country...
    for country in river_answer: 
        if river.attrib['car_code'] == country: #check to see if the country code is on the list
            river_final.append(river.find('name').text) #if it is add the country name to the answer
            
river_final #answer 4a

['Colombia', 'Brazil', 'Peru']

In [28]:
lake_final = [] #repeat above for lakes

for lake in document.iterfind('lake'):
        if lake.find('name').text==top_lake[0]: 
            lake_answer = lake.attrib['country'].split() 
        
for lake in document.iterfind('country'):
    for country in lake_answer:
        if lake.attrib['car_code'] == country: 
            lake_final.append(lake.find('name').text)

lake_final

['Russia', 'Iran', 'Turkmenistan', 'Azerbaijan', 'Kazakhstan']

In [29]:
airport_final = [] #repeat above for lakes

for airport in document.iterfind('airport'):
        if airport.find('name').text==top_airport[0]: 
            airport_answer = airport.attrib['country'].split() 
        
for airport in document.iterfind('country'):
    for country in airport_answer:
        if airport.attrib['car_code'] == country: 
            airport_final.append(airport.find('name').text)

airport_final

['Bolivia']