<p>Ben Trey<br>
Project: JSON Based Data Exercise<br>
Data Science Track<br>
2019/6/7<br>
</p>

In [58]:
from xml.etree import ElementTree as ET
import pandas as pd
import numpy as np

****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [62]:
# Parse the Mondial XML file once; the analysis cells below all query this tree.
MONDIAL_XML_PATH = './data/mondial_database.xml'
document_tree = ET.parse(MONDIAL_XML_PATH)

<b>1. 10 countries with the lowest infant mortality rates</b>

In [159]:
# Collect one entry per <country>: its name and its infant mortality rate.
# Countries without an <infant_mortality> element get NaN so both lists stay
# aligned row for row.
country_name = []
infant_mortality_rate = []

for element in document_tree.iterfind('country'):
    country_name.append(element.find('name').text)
    # Element.getiterator() was removed in Python 3.9; iter() is its replacement.
    # Only the first matching element is used.
    rate_node = next(element.iter('infant_mortality'), None)
    if rate_node is None:
        infant_mortality_rate.append(np.nan)
    else:
        infant_mortality_rate.append(float(rate_node.text))

# creating series from lists
country_name = pd.Series(country_name)
infant_mortality_rate = pd.Series(infant_mortality_rate)

# creating dataframe from series (built in one call rather than column by column)
df = pd.DataFrame({'country': country_name, 'rate': infant_mortality_rate})

# dropna() returns a new frame, so it has to be part of the displayed expression;
# calling it on its own line and discarding the result does nothing.
df.dropna(subset=['rate']).sort_values(by='rate').head(10)

Unnamed: 0,country,rate
38,Monaco,1.81
98,Japan,2.13
117,Bermuda,2.48
36,Norway,2.48
106,Singapore,2.53
37,Sweden,2.6
10,Czech Republic,2.63
78,Hong Kong,2.73
79,Macao,3.13
44,Iceland,3.15


<b>2. 10 cities with the largest population</b>

In [107]:
# Collect one entry per <city> found anywhere below a <country>: the city name
# and a population figure (NaN when the city has no usable <population>).
cities = []
populations = []

for element in document_tree.iterfind('country'):
    # Element.getiterator() was removed in Python 3.9; iter() is its replacement.
    for subelement in element.iter('city'):
        population_nodes = list(subelement.iter('population'))
        # The last <population> in document order is used
        # (presumably the most recent estimate -- confirm against the data file).
        text = population_nodes[-1].text if population_nodes else None
        cities.append(subelement.find('name').text)
        # Missing or empty values become NaN instead of a fake population of 0.
        populations.append(float(text) if text else np.nan)

# creating series from lists
cities = pd.Series(cities)
populations = pd.Series(populations)

# creating dataframe from series
city_people = pd.DataFrame({'city': cities, 'population': populations})

# dropna() returns a new frame, so it must be chained into the displayed result.
(
    city_people
    .dropna(subset=['population'])
    .sort_values(by='population', ascending=False)
    .head(10)
)
    

Unnamed: 0,city,population
1251,Shanghai,22315474.0
707,Istanbul,13710512.0
1421,Mumbai,12442373.0
443,Moskva,11979529.0
1250,Beijing,11716620.0
2594,São Paulo,11152344.0
1252,Tianjin,11090314.0
974,Guangzhou,11071424.0
1467,Delhi,11034555.0
977,Shenzhen,10358381.0


<b>3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)</b>

In [158]:
# Only population figures dated after this year are considered.
CUTOFF_YEAR = 2009

# ethnic group name -> estimated head count summed over all countries
populations = {}

# populating the dictionary
for element in document_tree.iterfind('country'):
    # Reference population for the country: the largest <population> value dated
    # after CUTOFF_YEAR. Countries with no such figure contribute 0.
    # NOTE(review): iter() walks every descendant, so <population> elements nested
    # below the country (e.g. under a <city>) also take part in this maximum --
    # confirm this is the intended "best/latest estimate".
    # Element.getiterator() was removed in Python 3.9; iter() is its replacement.
    max_population = 0
    for subelement in element.iter('population'):
        population = float(subelement.text)
        year = float(subelement.attrib['year'])
        if population > max_population and year > CUTOFF_YEAR:
            max_population = population

    # Each <ethnicgroup> carries its share of the country as a percentage.
    for subelement in element.iter('ethnicgroup'):
        group = subelement.text
        head_count = float(subelement.attrib['percentage']) * float(max_population) / 100
        populations[group] = populations.get(group, 0) + head_count

# creating series from the dictionary
groups = pd.Series(list(populations.keys()))
populations = pd.Series(list(populations.values()))

# creating dataframe from series
df = pd.DataFrame({'groups': groups, 'populations': populations})
df.sort_values(by='populations', ascending=False).head(10)

1245058800.0


Unnamed: 0,groups,populations
80,Han Chinese,1245059000.0
106,Indo-Aryan,871815600.0
128,European,494848300.0
16,African,318325100.0
105,Dravidian,302713700.0
150,Mestizo,157854000.0
98,Bengali,146776900.0
33,Russian,131897000.0
139,Japanese,127289000.0
110,Malay,121993600.0


<b>4. name and country of a) longest river, b) largest lake and c) airport at highest elevation</b>

In [163]:
def find_extreme(tree, tag, measure):
    """Find the top-level ``tag`` element with the largest ``measure`` value.

    Parameters
    ----------
    tree : parsed XML tree (anything offering ``iterfind``)
    tag : name of the elements to scan, e.g. ``'river'``
    measure : name of the child element holding the number to compare,
        e.g. ``'length'``

    Returns
    -------
    list
        ``[value, name, country_codes]`` for the winning element, where
        ``country_codes`` is the element's ``country`` attribute split on
        whitespace. ``[0, '', '']`` is returned when no element has a
        positive measurement.
    """
    best = [0, '', '']
    for element in tree.iterfind(tag):
        node = element.find(measure)
        # Test against None explicitly (Element truthiness depends on its child
        # count) and skip entries whose measurement is absent or empty.
        if node is None or node.text is None or not node.text.strip():
            continue
        value = float(node.text)
        # Strict comparison: on ties the first element encountered is kept.
        if value > best[0]:
            best = [value, element.find('name').text, element.attrib['country'].split()]
    return best


# each result is a list: [value, name, list of country codes]
longest_river = find_extreme(document_tree, 'river', 'length')
largest_lake = find_extreme(document_tree, 'lake', 'area')
highest_airport = find_extreme(document_tree, 'airport', 'elevation')

# (car_code, country name) pairs in document order, built once instead of
# re-scanning every <country> for each place
country_codes = []
for element in document_tree.iterfind('country'):
    code = element.attrib.get('car_code')
    if code is not None:
        country_codes.append((code, element.find('name').text))

# finding countries from country codes and printing results
action = ['Longest River', 'Largest Lake', 'Highest Airport']
places_of_interest = [longest_river, largest_lake, highest_airport]
for label, place in zip(action, places_of_interest):
    print(label)
    print(place)
    # countries are listed in the order they appear in the document
    countries = [name for code, name in country_codes if code in place[2]]
    print(countries)
    print('\n')


Longest River
[6448.0, 'Amazonas', ['CO', 'BR', 'PE']]
['Colombia', 'Brazil', 'Peru']


Largest Lake
[386400.0, 'Caspian Sea', ['R', 'AZ', 'KAZ', 'IR', 'TM']]
['Russia', 'Iran', 'Turkmenistan', 'Azerbaijan', 'Kazakhstan']


Highest Airport
[4063.0, 'El Alto Intl', ['BOL']]
['Bolivia']


