 import libaries, load documents and get roots

In [1]:
from xml.etree import ElementTree as ET
import pandas as pd
import numpy as np

In [2]:
#get document, and the root of the string documents
document = ET.parse( './data/mondial_database.xml' )
root = document.getroot()

# Question1: 10 countries with the lowest infant mortality rates

In [3]:
# parse name of the country, infant_mortality rate,
# then update the mortality dictionary

mort = {}
for child in root.iter('country'):
    name = child.find('name').text
    for mortsub in child.iter('infant_mortality'):
        mortality = float(mortsub.text)
        mort.update({name:mortality})

In [4]:
# read dictinary mort as data frame
# sort df by mortality rate, and show lowest 10
mortality_df = pd.DataFrame.from_dict(mort, orient = "index")
mortality_df.columns = ["mortality"]
mortality_df.sort_values("mortality").head(10)

Unnamed: 0,mortality
Monaco,1.81
Japan,2.13
Bermuda,2.48
Norway,2.48
Singapore,2.53
Sweden,2.6
Czech Republic,2.63
Hong Kong,2.73
Macao,3.13
Iceland,3.15


# Question2 : 10 cities with the largest population

In [5]:
#parse, city name population
pop_dict = {}
for child in root.iter('city'):
    year0 = 2000
    for pop in child.getiterator('population'):
        year = int(pop.get('year'))
        city = child.findtext('name')
        if year >= year0:
            year0 = year
            population = int(pop.text)
            pop_dict.update({city: population})

In [6]:
# read dictionary as data frame
# sort by population, descending order
population_df = pd.DataFrame.from_dict(pop_dict, orient = "index")
population_df.columns = ["population"]
population_df.sort_values("population",ascending= False).head(10)

Unnamed: 0,population
Shanghai,22315474
Istanbul,13710512
Mumbai,12442373
Moskva,11979529
Beijing,11716620
São Paulo,11152344
Tianjin,11090314
Guangzhou,11071424
Delhi,11034555
Shenzhen,10358381


# Question 3: 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)


In [7]:
ethn_name = list()
ethn_population = list()

# extract lastest population
# get the population of each ethnicity in every country
# write the ethnicity group name and population into seperate list

for child in root.iter('country'):
    country = child.findtext('name')
    year0 = 1950
    for population in child.findall('population'):
        year = int(population.get('year'))
        if year > year0:
            year0 = year
            pop = int (population.text)
    for ethinicty in root.iter('ethnicgroup'):
        percent = float (ethinicty.get('percentage'))
        ethname = ethinicty.text
        ethpop = percent * pop / 100 
        ethn_name.append(ethname)
        ethn_population.append(ethpop)

In [8]:
#dataframe 
ethn_df = pd.DataFrame({'ethnicity':ethn_name ,'population':ethn_population})

#groupby ethnicity group and sum

ethn_df.groupby('ethnicity').sum().sort_values('population', ascending = False).head(10)

Unnamed: 0_level_0,population
ethnicity,Unnamed: 1_level_1
African,130273300000.0
European,67684540000.0
Mestizo,60704280000.0
Polynesian,46467680000.0
Arab,44501600000.0
Chinese,32970090000.0
Arab-Berber,20643780000.0
Black,17813190000.0
Malay,16892900000.0
Melanesian,16112050000.0


# Question4: name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [9]:
#find elements of "river"
print(root.find("river")[:])

[<Element 'name' at 0x119259e08>, <Element 'to' at 0x119259e58>, <Element 'area' at 0x119259ea8>, <Element 'length' at 0x119259ef8>, <Element 'source' at 0x119259f48>, <Element 'estuary' at 0x119260098>]


In [10]:
# 10 longest rivers
river_dict = {}
for child in root.iter('river'):
    name = child.findtext('name')
    for sub in child.findall('length'):
        length = float(sub.text)
        river_dict.update({name:length})
river_df = pd.DataFrame.from_dict(river_dict, orient = "index")
river_df.sort_values(0,ascending = False).head(10)

Unnamed: 0,0
Amazonas,6448.0
Jangtse,6380.0
Hwangho,4845.0
Lena,4400.0
Zaire,4374.0
Mekong,4350.0
Irtysch,4248.0
Niger,4184.0
Missouri,4130.0
Jenissej,4092.0


a) Find name and length of longest river

In [11]:
max_length = 0
for child in root.iter('river'):
    for sub in child.findall('length'):
        length = float(sub.text)
        if length > max_length:
            max_length = length
            name = child.findtext('name')
            country = child.get("country")
print(name, max_length, country)

Amazonas 6448.0 CO BR PE


In [12]:
# check the answer from element "city"
for child in root.iter('country'):
    country = child.findtext('name')
    for subchild in child.iter('city'):
        cityname = subchild.findtext('name')
        for city in subchild.findall('located_at'):
            river = city.get('river')
            if river == "river-Amazonas":
                print (country, cityname, river)
        

Colombia Leticia river-Amazonas
Brazil Macapá river-Amazonas
Brazil Manaus river-Amazonas
Peru Cusco river-Amazonas
Peru Iquitos river-Amazonas
Peru Pucallpa river-Amazonas


2) Name & area of Largest Lake

In [13]:
print(root.find("lake")[:])

[<Element 'name' at 0x119414368>, <Element 'located' at 0x1194143b8>, <Element 'to' at 0x119414408>, <Element 'area' at 0x119414458>, <Element 'latitude' at 0x1194144a8>, <Element 'longitude' at 0x1194144f8>, <Element 'elevation' at 0x119414548>, <Element 'depth' at 0x119414598>]


In [14]:
lake_dict={}
max_area = 0 
for child in root.iter('lake'):
    for sub in child.findall('area'):
        area = float(sub.text)
        if area> max_area:
            max_area = area
            name = child.findtext('name')
            country = child.get("country")
print(name, max_area,country)

Caspian Sea 386400.0 R AZ KAZ IR TM


R-Russia, KAZ-Kazakhsta, AZ-Azerbaijan, IR-Iran, TM-Turkmenistan,

c) airport at highest elevation
        

In [15]:
print(root.find("airport")[:])

[<Element 'name' at 0x11962f368>, <Element 'latitude' at 0x11962f3b8>, <Element 'longitude' at 0x11962f408>, <Element 'elevation' at 0x11962f458>, <Element 'gmtOffset' at 0x11962f4a8>]


In [33]:
airport_ls = []
elevation_ls = []
for child in root.iter('airport'):
    #for subchild in child.find("elevation"):
        #elevation = float(subchild.text)
        #if elevation > highest:
            #highest = elevation
    airport = child.findtext('name')
    elevation = child.findtext('elevation')
    airport_ls.append(airport)
    elevation_ls.append(elevation)

aiport_elevation = pd.DataFrame({'airport':airport_ls ,'elevation':elevation_ls})
aiport_elevation.sort_values("elevation", ascending = False).head(5)      

Unnamed: 0,airport,elevation
536,Mashhad,995
1009,A Coruna,99
915,Yakutsk,99
387,Guipavas,99
361,Kuopio,98


In [35]:
elevation_ls = list(map(float, elevation_ls))

ValueError: could not convert string to float: 