In [1]:
from xml.etree import ElementTree as ET
import pandas as pd
import numpy as np
import operator
import os

In [2]:
#Reading the file
document = ET.parse( 'data_wrangling_xml\data_wrangling_xml\data\mondial_database.xml' )
root=document.getroot()

In [3]:
#Excercise 1: countries with the lowest infant mortality rates

#defining a dictionary 
country_mortality_dict  = {}
country_mortality_dict['country']=[]
country_mortality_dict['infant_mortality']=[]

#Iterating and finding in country
for country in document.iterfind('country'):
    mortality = country.find('infant_mortality')
    
    if mortality is not None:
        country_mortality_dict['country'].append(country.find('name').text)
        country_mortality_dict['infant_mortality'].append(float(mortality.text))
        
#creating dataframe of the above dict
df = pd.DataFrame(country_mortality_dict, columns=['country', 'infant_mortality'])

#Sorting mortality rates
df.sort_values('infant_mortality').head(10)

Unnamed: 0,country,infant_mortality
36,Monaco,1.81
90,Japan,2.13
109,Bermuda,2.48
34,Norway,2.48
98,Singapore,2.53
35,Sweden,2.6
8,Czech Republic,2.63
72,Hong Kong,2.73
73,Macao,3.13
39,Iceland,3.15


In [4]:
#Excercise 2: 10 cities with the largest population

#defining a list
CityList = []

for city in document.findall('.//city'):
    name = city.find('name')
    name = name.text
    population = city.find('population[@year="2011"]') 
    
    if population is None:
        population = np.nan
    else:
        population = int(population.text)
        
#creating the list     
    CityList.append([name, population])
 
 #creating dataframe
df = pd.DataFrame(CityList, columns=['City_Name','Population_2011'])

#sorting
df.sort_values('Population_2011', ascending=False).head(10)

Unnamed: 0,City_Name,Population_2011
1527,Mumbai,12442373.0
1582,Delhi,11034555.0
1515,Bangalore,8443675.0
1000,London,8250205.0
1382,Tehran,8154051.0
1470,Dhaka,7423137.0
1591,Hyderabad,6731790.0
1505,Ahmadabad,5577940.0
3056,Luanda,5000000.0
1556,Chennai,4646732.0


In [5]:
#Excercise 3: 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

#defining a list
data=[]
for country in document.findall('country'):
    countryname=country.find('name').text
    totalpop=country.find('population[@year="2011"]')
    if totalpop is None:
        totalpop = np.nan
    else:
        totalpop = int(totalpop.text)
       
        
        for ethnicgrp in country.findall('ethnicgroup'):
            ethnicgrp1=ethnicgrp.text
            percent = float(ethnicgrp.attrib['percentage'])
            epop = int(totalpop * percent / 100)            
          
            
#creating the list            
            data.append([countryname,totalpop,ethnicgrp1,epop])
    
#creating dataframe
df=pd.DataFrame(data,columns=['countryname','totalpop','ethnicgrp1','ethnicpop'])

#sorting and assigning the value to a var
df11=df.groupby('ethnicgrp1').ethnicpop.sum().sort_values(ascending=False).reset_index().head(10)

#reading the var
df11

Unnamed: 0,ethnicgrp1,ethnicpop
0,Indo-Aryan,871815583
1,Dravidian,302713744
2,African,166391980
3,Bengali,146776916
4,German,74278483
5,English,52820300
6,Mediterranean Nordic,46815916
7,Persian,38326331
8,Polish,38018418
9,Mongol,36325649


In [6]:
#Excercise 4: name and country of a) longest river

countrycode_dict={}
for country in document.iterfind('country'):
    countrycode_dict[country.get('car_code')] = country.find('name').text
  
    
riverlist=[]
for rivername in document.iterfind('river'):
    for country in rivername.get('country').split():
        rivername1=rivername.find('name').text
        riverlength=rivername.find('length')
        if riverlength==None:
            riverlength=np.nan
        else:        
            riverlength=float(riverlength.text)
    
#creating the list
        riverlist.append([rivername1,riverlength,countrycode_dict[country]])

df = pd.DataFrame(riverlist, columns=['rivername','riverlength','country'])

#sorting and assigning the value to a var
df1=df.sort_values('riverlength',ascending=False).head(1)

#reading the var
df1

Unnamed: 0,rivername,riverlength,country
300,Amazonas,6448.0,Peru


In [7]:
#Excercise 4: name and country of b) largest lake
countrycode_dict={}
for country in document.iterfind('country'):
    countrycode_dict[country.get('car_code')] = country.find('name').text
  
    
lakelist=[]
for lakename in document.iterfind('lake'):
    for country in lakename.get('country').split():
        lakename1=lakename.find('name').text
        lakearea=lakename.find('area')
        if lakearea==None:
            lakearea=np.nan
        else:        
            lakearea=float(lakearea.text)
    
    
#creating the list
        lakelist.append([lakename1,lakearea,countrycode_dict[country]])
    
#creating DF
df = pd.DataFrame(lakelist, columns=['lakename','lakearea','country'])

#sorting and assigning the value to a var
df1=df.sort_values('lakearea',ascending=False).head(1)

#reading the var
df1

Unnamed: 0,lakename,lakearea,country
68,Caspian Sea,386400.0,Russia


In [8]:
#Excercise 4: name and country of c) airport at highest elevation

#defining a list & DF
airport_list=[]
airport_df = pd.DataFrame()

for ap in document.iterfind('airport'):
    apname=ap.find('name').text
    elv=ap.find('elevation').text  
    
     
    try:
        airport_list.append({'Name':apname, 'elevation':int(elv)})
                
    except:
         next
            
#creating DF
airport_df = pd.DataFrame(airport_list)

#sorting
airport_df.sort('elevation', ascending=False).head(1)



Unnamed: 0,Name,elevation
80,El Alto Intl,4063
