In [2]:
from xml.etree import ElementTree as ET

In [3]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [4]:
# print names of all countries
for child in document_tree.getroot():
    print(child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [5]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print( '* ' + element.find('name').text + ':'),
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print( capitals_string[:-2])

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


In [17]:
import pandas as pd
import numpy as np
import matplotlib as plt


In [193]:
# Trying to find 10 countries with the lowest infant mortality rates
country = ""           # Naming countries as country and starting empty
infant_mortality = ""   # Naming infant_mortality and starting empty
document = ET.parse( './data/mondial_database.xml' )   # Add a line with xml document link
df = pd.DataFrame(columns=['country','infant_mortality'])  # Naming pandas dataframe as df
df['infant_mortality'] = df['infant_mortality'].astype(float) # infant_mortality is float data type

for country in document.iterfind( 'country' ):    #After many tries, this command worked.
    for node in country.iter() :                  # I used node for both country and infant_mortality to print names.
        if node.tag == 'name':                    # Found 'name' from first example
             country = (node.text)                # country is text type
        
        if node.tag == 'infant_mortality':        # Used same command as above
            infant_mortality = float(node.text)   #infant_mortality is float type
    df.loc[len(df)] = [country,infant_mortality]  # Finally back to something normal
    country = ""                                  
        
df.sort_values(by = 'infant_mortality', ascending=True).head(10) #Working by adding ascending to command

Unnamed: 0,country,infant_mortality
38,Monaco,1.81
98,Naha,2.13
36,Vadsoe,2.48
117,Hamilton,2.48
106,Singapore,2.53
37,Västerås,2.6
10,Zlín,2.63
78,Hong Kong,2.73
79,Macao,3.13
44,Akureyri,3.15


In [204]:
# Attempt to get 10 most populated cities
cityname = ""
citypopulation = ""
document = ET.parse( './data/mondial_database.xml' )
df = pd.DataFrame(columns=['CityName','Population']) #create data frame to hold country name and its popuplation
df['Population'] = df['Population'].astype(float)

#loop through country element to find city name and its population
for country in document.iterfind( 'country' ):
    for city in country.iter('city'): #find all cities within each country element
        cityname = city.find('name').text
        year = int(0)
        for node in city.iterfind('population'): #find all population elements with each city
            year = node.attrib['year'] #there are multiple population elements with different 'year' attribute
            if node.attrib['year'] >= year: #store the population number of the latest year
                citypopulation = int(node.text)
        df.loc[len(df)] = [cityname,citypopulation] #add city name and its population to data frame
        cityname = ""
        
df.sort_values(by = 'Population', ascending=False).head(10) #Sort population in descending order

Unnamed: 0,CityName,Population
1341,Shanghai,22315474.0
771,Istanbul,13710512.0
1527,Mumbai,12442373.0
479,Moskva,11979529.0
1340,Beijing,11716620.0
2810,São Paulo,11152344.0
1342,Tianjin,11090314.0
1064,Guangzhou,11071424.0
1582,Delhi,11034555.0
1067,Shenzhen,10358381.0


In [84]:
# Attempt to find 10 largest ethnic groups across all country populations.
countryname = ""         # Start variables as empty or 0.
countrypopulation = 0
ethnipopulation = 0
document = ET.parse( './data/mondial_database.xml' )
df = pd.DataFrame(columns=['country','ethnicgroup','population'])  # Set up pandas df columns.
df['population'] = df['population'].astype(float)  # Main calculation variable

for country in document.iterfind( 'country' ):     # Start with finding countries
    countryname = country.find('name').text
    
    year = int(0)                                  # Find population. Note multiple years of data.
    countrypopulation = int(0)                     # Had to add a step to sort through years.
    for node in country.iterfind('population'): 
        year = node.attrib['year']
        if node.attrib['year'] >= year: 
            countrypopulation = int(node.text) 
    
    ethnicname = ""                                # Find ethnic groups
    for ethnic in country.iterfind('ethnicgroup'): 
        ethnicname = ethnic.text
        
        ethnicpopulation = int(0)                  # Last get ethnic group population numbers
        ethnicpopulation = (float(ethnic.attrib['percentage']) * 0.01 * int(countrypopulation))
        if ethnicname == "":
            ethnicname = countryname
            ethnicpopulation = countrypopulation
        
        df.loc[len(df)] = [countryname,ethnicname,ethnicpopulation]   # Add data to df columns
    countryname = ""

df.groupby('ethnicgroup').sum().sort_values(by = 'population', ascending=False).head(10)  # List head with first 10.

Unnamed: 0_level_0,population
ethnicgroup,Unnamed: 1_level_1
Han Chinese,1245059000.0
Indo-Aryan,871815600.0
European,494872200.0
African,318325100.0
Dravidian,302713700.0
Mestizo,157734400.0
Bengali,146776900.0
Russian,131857000.0
Japanese,126534200.0
Malay,121993600.0


In [167]:
longest_length = 0    # Find longest river name, length and country of origin.
river_length = 0
river = ""
length = 0
longest_river = ""
river_code = ""
longest_country = ""
document = ET.parse( './data/mondial_database.xml' )    # Copy from previous exercise and change variables.
df = pd.DataFrame(columns=['longest_river','longest_length','longest_country'])
df['longest_length'] = df['longest_length'].astype(float)

longest_length = 0
for river in document.iterfind('river'):
    if(river.find('length') is not None):
        river_length = float(river.find('length').text)
    if(river_length > longest_length):
        longest_length = river_length
        longest_river = river.find('name').text
        if(river.find('source') is not None):
            river_code = river.find('source').get('country')
        else:
            river_code = river.get('country')
longest_country = document.find(".//country[@car_code='"+river_code+"']")  # Had to investigate how to convert country codes to names.
longest_country = longest_country.find('name').text

df.loc[len(df)] = [longest_river, longest_length, longest_country]      # Add data to columns in df
longest_river = ""

df.sort_values(by = 'longest_length', ascending=False).head(1)


Unnamed: 0,longest_river,longest_length,longest_country
0,Amazonas,6448.0,Peru


In [166]:
largest_area = 0     # Copied from last exercise and changed variables.
lake = ""
area = 0
lake_area = 0
largest_lake = ""
largest_lake_country_code = ""
largest_lake_country = ""
country =""
document = ET.parse( './data/mondial_database.xml' )
df = pd.DataFrame(columns=['largest_lake','largest_area','country'])
df['largest_area'] = df['largest_area'].astype(float)

largest_area = 0
for lake in document.iterfind('lake'):
    if(lake.find('area') is not None):
        lake_area = float(lake.find('area').text)
    if(lake_area > largest_area):
        largest_area = lake_area
        largest_lake = lake.find('name').text
        if(lake.find('located') is not None):
            largest_lake_country_code = lake.find('located').get('country')
        else:
            largest_lake_country_code = lake.get('country')
largest_lake_country = document.find(".//country[@car_code='"+largest_lake_country_code+"']")
largest_lake_country = largest_lake_country.find('name').text


df.loc[len(df)] = [largest_lake,largest_area, largest_lake_country]
largest_area = 0

df.sort_values(by = 'largest_area', ascending=False).head(1)


Unnamed: 0,largest_lake,largest_area,country
0,Caspian Sea,386400.0,Russia


In [165]:
highest_airport = ""    # Copied from last exercise and change variables
highest_elevation = 0
highest_country = ""
airport = ""
elevation = 0
highest_code = ""
code = ""
country = ""
name = ""

document = ET.parse( './data/mondial_database.xml' )
df = pd.DataFrame(columns=['highest_airport','highest_elevation', 'highest_country'])
df['highest_elevation'] = df['highest_elevation'].astype(float)

highest_elevation = 0
for airport in document.iterfind('airport'):
    if(airport.find('elevation').text is not None):
        elevation = float(airport.find('elevation').text)
    
    if(elevation > highest_elevation):
        highest_elevation = elevation
        highest_airport = airport.find('name').text
        highest_code = airport.get('country')

highest_country = document.find(".//country[@car_code='"+highest_code+"']")
highest_country = highest_country.find('name').text

df.loc[len(df)] = [highest_airport, highest_elevation, highest_country]
highest_elevation = 0

df.sort_values(by = 'highest_elevation', ascending=False).head(1)


Unnamed: 0,highest_airport,highest_elevation,highest_country
0,El Alto Intl,4063.0,Bolivia
