In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as pyplot

In this notebook we are building a list of countries within each WHO defined region, using country lists provided by the WHO webside (by region). For the Africa region only, the data was provided in a table. In this case, we used the read_html pandas function. For all other regions, the tables were not presented in a Table format. In these cases, we saved the html source behind the page the tables were presented (URLs provided) using the "view source" command within Chrome. From here, we imported the html file as text, sliced for the rows that contained the Country data, and the split those rows in order have only the Country names.

# Africa

In [2]:
africa_raw = pd.read_html('http://www.afro.who.int/en/countries.html')[0][1:]

In [3]:
africa_raw

Unnamed: 0,0,1,2,3
1,Algeria (French),Cote d'Ivoire (French),Liberia,Senegal (French)
2,Angola (Portuguese),Democratic Republic of Congo (French),Madagascar (French),Seychelles
3,Benin (French),Equatorial Guinea (French),Malawi,Sierra Leone
4,Botswana,Ethiopia,Mali (French),South Africa
5,Burkina Faso (French),Eritrea,Mauritania (French),South Sudan
6,Burundi (French),Gabon (French),Mauritius,Swaziland
7,Cameroon (French),Gambia,Mozambique (Portuguese) (English),Togo (French)
8,Cabo Verde (Portuguese),Ghana,Namibia,Uganda
9,Central African Republic (French),Guinea (French),Niger (French),United Republic of Tanzania
10,Chad (French),Guinea-Bissau (Portuguese),Nigeria,Zambia


In [4]:
africa = pd.melt(africa_raw)
africa['Region'] = "Africa"
africa['Country'] = africa['value']
del africa['variable']
del africa['value']
africa

Unnamed: 0,Region,Country
0,Africa,Algeria (French)
1,Africa,Angola (Portuguese)
2,Africa,Benin (French)
3,Africa,Botswana
4,Africa,Burkina Faso (French)
5,Africa,Burundi (French)
6,Africa,Cameroon (French)
7,Africa,Cabo Verde (Portuguese)
8,Africa,Central African Republic (French)
9,Africa,Chad (French)


In [5]:
#remove NaN value and then drop the language references
africa = africa.drop([47])
africa['Country'] = africa['Country'].apply(lambda x: x.split('(')[0])
africa.head()

Unnamed: 0,Region,Country
0,Africa,Algeria
1,Africa,Angola
2,Africa,Benin
3,Africa,Botswana
4,Africa,Burkina Faso


# Europe

In [6]:
# captures this file by doing a view source of html page, and saving as an html file
#open the file
#original source page: http://www.euro.who.int/en/countries
euro_raw_open = open('Region-Country-Sources/Euro.html', 'r', encoding='ISO-8859-2')

In [7]:
#use readlines to return a list of lines
euro_raw = euro_raw_open.readlines()
len(euro_raw)
#determined specific lines of HTML by looking at viewsource
euro_raw = euro_raw[243:304]

In [8]:
# now we need to extract the Country from each list
# 1. only look at lines with a country in them - ie have a tag
# 2. split the line twice: first looks for anything after a '">' the sencond looks for anything before '</a>'
eu = [x.split('">')[1].split("</a>")[0] for x in euro_raw if '">' in x]
eu

['Albania',
 'Andorra',
 'Armenia',
 'Austria',
 'Azerbaijan',
 'Belarus',
 'Belgium',
 'Bosnia and Herzegovina',
 'Bulgaria',
 'Croatia',
 'Cyprus',
 'Czech Republic',
 'Denmark',
 'Estonia',
 'Finland',
 'France',
 'Georgia',
 'Germany',
 'Greece',
 'Hungary',
 'Iceland',
 'Ireland',
 'Israel',
 'Italy',
 'Kazakhstan',
 'Kyrgyzstan',
 'Latvia',
 'Lithuania',
 'Luxembourg',
 'Malta',
 'Monaco',
 'Montenegro',
 'Netherlands',
 'Norway',
 'Poland',
 'Portugal',
 'Republic of Moldova',
 'Romania',
 'Russian Federation',
 'San Marino',
 'Serbia',
 'Slovakia',
 'Slovenia',
 'Spain',
 'Sweden',
 'Switzerland',
 'Tajikistan',
 'The former Yugoslav Republic of Macedonia',
 'Turkey',
 'Turkmenistan',
 'Ukraine',
 'United Kingdom of Great Britain and Northern Ireland',
 'Uzbekistan']

In [9]:
europe = pd.DataFrame({'Region': 'Europe', 'Country': eu})
europe.head()

Unnamed: 0,Country,Region
0,Albania,Europe
1,Andorra,Europe
2,Armenia,Europe
3,Austria,Europe
4,Azerbaijan,Europe


# Region of Americas

In [10]:
# Same process as Europe - use viewsource
# captures this file by doing a view source of html page, and saving as an html file
#open the file
#original source page: http://www.paho.org/hq/index.php?option=com_wrapper&view=wrapper&Itemid=2005&lang=en
americas_raw_open = open('Region-Country-Sources/Americas.html', 'r', encoding='ISO-8859-2')

In [11]:
#use readlines to return a list of lines
americas_raw = americas_raw_open.readlines()
#determined specific lines of HTML by looking at viewsource
americas_raw = americas_raw[29:166]
len(americas_raw)

137

In [12]:
am = [x.split('alt="')[1].split('" style="')[0] for x in americas_raw if 'alt="' in x]
am

['Antigua and Barbuda',
 'Argentina',
 'Aruba',
 'Bahamas',
 'Barbados',
 'Belize',
 'Bermuda',
 'Bolivarian Republic of Venezuela',
 'Bolivia',
 'Brazil',
 'British Virgin Islands (UK)',
 'Canada',
 'Cayman Islands',
 'Chile',
 'Colombia',
 'Costa Rica',
 'Cuba',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'El Salvador',
 'French Guiana',
 'Grenada',
 'Guadeloupe',
 'Guatemala',
 'Guyana',
 'Haiti',
 'Honduras',
 'Jamaica',
 'Martinique',
 'Mexico',
 'Montserrat',
 'Netherland Antilles',
 'Nicaragua',
 'Panama',
 'Paraguay',
 'Peru',
 'Puerto Rico',
 'Saint Kitts and Nevis',
 'Saint Lucia',
 'Saint Vincent and the Grenadines',
 'Suriname',
 'Trinidad and Tobago',
 'Turks and Caicos',
 'United States of America',
 'Uruguay']

In [13]:
americas = pd.DataFrame({'Region': 'Americas', 'Country': am})
americas.head()

Unnamed: 0,Country,Region
0,Antigua and Barbuda,Americas
1,Argentina,Americas
2,Aruba,Americas
3,Bahamas,Americas
4,Barbados,Americas


# Southeast Asia

In [14]:
# again, information is not organized into a table on web page, so using viewsource as text file to read
# original source page: http://www.searo.who.int/countries/en/
seasia_raw_open = open('Region-Country-Sources/SEAsia.html', 'r', encoding='ISO-8859-2')

In [15]:
# use readlines to return a list of lines
seasia_raw = seasia_raw_open.readlines()
# determined specific lines of HTML by looking at viewsource
seasia_raw = seasia_raw[287:346]
seasia_raw

['<a href="/bangladesh" class="" >Bangladesh</a> \n',
 '\t\t\t\t</li>\n',
 '\t\t\t\t<li>\n',
 '<a href="/bhutan" class="" >Bhutan</a> \n',
 '\t\t\t\t</li>\n',
 '\t\t\t\t<li>\n',
 '<a href="/dprkorea" class="" >Democratic Peopleâ\x80\x99s Republic of Korea</a> \n',
 '\t\t\t\t</li>\n',
 '\t\t\t\t<li>\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\n',
 '\t<a  href="/entity/india/en/index.html">India</a>\n',
 '\t\t\t\t</li>\n',
 '\t\t\t\t<li>\n',
 '<a href="/indonesia" class="" >Indonesia</a> \n',
 '\t\t\t\t</li>\n',
 '\t\t\t\t<li>\n',
 '<a href="/maldives" class="" >Maldives</a> \n',
 '\t\t\t\t</li>\n',
 '\t</ul>\n',
 '\n',
 '</div>\n',
 '<!-- begin: box -->\n',
 '\n',
 '</div>\n',
 '<!-- end subcol_1-1_1 -->\n',
 '\n',
 '<!-- start subcol_1-1_2 -->\n',
 '<div class="subcol_1-1_2">\n',
 '\n',
 '<!-- begin: box -->\n',
 '<div class="box">\n',
 '      <ul class="list">\n',
 '\t\t\t\t<li>\n',
 '<a href="/myanmar" class="" >Myanmar</a> \n',
 

In [16]:
sea = []
for x in seasia_raw:
    if '</a>' in x:
        if 'class=""' in x:
            sea.append(x.split('class="" >')[1].split('</a>')[0])
        else:
            sea.append(x.split('html">')[1].split('</a>')[0])
sea

['Bangladesh',
 'Bhutan',
 'Democratic Peopleâ\x80\x99s Republic of Korea',
 'India',
 'Indonesia',
 'Maldives',
 'Myanmar',
 'Nepal',
 'Sri Lanka ',
 'Thailand',
 'Timor-Leste']

In [17]:
seasia = pd.DataFrame({'Region': 'Southeast Asia', 'Country': sea})
seasia

Unnamed: 0,Country,Region
0,Bangladesh,Southeast Asia
1,Bhutan,Southeast Asia
2,Democratic Peopleâs Republic of Korea,Southeast Asia
3,India,Southeast Asia
4,Indonesia,Southeast Asia
5,Maldives,Southeast Asia
6,Myanmar,Southeast Asia
7,Nepal,Southeast Asia
8,Sri Lanka,Southeast Asia
9,Thailand,Southeast Asia


# Eastern Mediteranean

In [18]:
# original source: http://www.emro.who.int/countries.html
eastern_med_raw_open = open('Region-Country-Sources/EasternMed.html', 'r', encoding='ISO-8859-2')

In [19]:
# use readlines to return a list of lines
emed_raw = eastern_med_raw_open.readlines()
# determined specific lines of HTML by looking at viewsource
emed_raw = emed_raw[183:223]
emed_raw

['<p><a href="/countries/afg/index.html">Afghanistan</a></p>\n',
 '<h3><a name="B"></a>B</h3>\n',
 '<p><a href="/countries/bah/index.html">Bahrain</a></p>\n',
 '<h3><a name="D"></a>D</h3>\n',
 '<p><a href="/countries/dji/index.html">Djibouti</a></p>\n',
 '<h3><a name="E"></a>E</h3>\n',
 '<p><a href="/countries/egy/index.html">Egypt</a></p>\n',
 '<h3><a name="I"></a>I</h3>\n',
 '<p><a href="/countries/irn/index.html">Iran, Islamic Republic of</a></p>\n',
 '<p><a href="/countries/irq/index.html">Iraq</a></p>\n',
 '<h3><a name="J"></a>J</h3>\n',
 '<p><a href="/countries/jor/index.html">Jordan</a></p>\n',
 '<h3><a name="K"></a>K</h3>\n',
 '<p><a href="/countries/kwt/index.html">Kuwait</a></p>\n',
 '<h3><a name="L"></a>L</h3>\n',
 '<p><a href="/countries/lbn/index.html">Lebanon</a></p>\n',
 '<p><a href="/countries/lby/index.html">Libya</a></p>\n',
 '<h3><a name="M"></a>M</h3>\n',
 '<p><a href="/fr/countries/mor/index.html">Morocco</a></p>\n',
 '</div>\n',
 '<div class="topicsright">\n',
 '<

In [20]:
em = []
for x in emed_raw:
    if '</a>' in x:
        if 'html">' in x:
            em.append(x.split('html">')[1].split('</a>')[0])
        elif '#ffffff;">' in x:
            em.append(x.split('fff;">')[1].split('</a>')[0])
em


['Afghanistan',
 'Bahrain',
 'Djibouti',
 'Egypt',
 'Iran, Islamic Republic of',
 'Iraq',
 'Jordan',
 'Kuwait',
 'Lebanon',
 'Libya',
 'Morocco',
 'Occupied Palestinian territory',
 'Oman',
 'Pakistan',
 'Palestine, see Occupied Palestinian territory',
 'Qatar',
 'Saudi Arabia',
 'Somalia',
 'Sudan',
 'Syrian Arab Republic',
 'Tunisia',
 'United Arab Emirates',
 'Yemen']

In [21]:
eastern_med = pd.DataFrame({'Region': 'Eastern Mediterranean', 'Country': em})
eastern_med

Unnamed: 0,Country,Region
0,Afghanistan,Eastern Mediterranean
1,Bahrain,Eastern Mediterranean
2,Djibouti,Eastern Mediterranean
3,Egypt,Eastern Mediterranean
4,"Iran, Islamic Republic of",Eastern Mediterranean
5,Iraq,Eastern Mediterranean
6,Jordan,Eastern Mediterranean
7,Kuwait,Eastern Mediterranean
8,Lebanon,Eastern Mediterranean
9,Libya,Eastern Mediterranean


# Western Pacific

In [22]:
# original source: http://www.wpro.who.int/countries/en/
pacific_raw_open = open('Region-Country-Sources/Pacific.html', 'r', encoding='ISO-8859-2')

In [23]:
# use readlines to return a list of lines
pacific_raw = pacific_raw_open.readlines()
# determined specific lines of HTML by looking at viewsource
pacific_raw= pacific_raw[327:1059]
pacific_raw

['\t<a  href="/countries/asm/en/index.html">American Samoa</a>\n',
 '\t\t\t\t</li>\n',
 '\t</ul>\n',
 '\n',
 '\t\n',
 '\t<div class="clear"> </div>\n',
 '</div>\n',
 '<!-- end: teaser -->\n',
 '\n',
 ' \n',
 '<!-- start: teaser -->\n',
 '<div class="teaser">\n',
 '\t\n',
 '   \n',
 '<div  class="image_left"  style="width:24px">\n',
 '<img src="/countries/flag_aus_sm.gif" alt="" width="24" height="16" />\n',
 '</div>\n',
 ' \n',
 '      <ul class="list">\n',
 '\t\t\t\t<li>\n',
 '\t<a  href="/countries/aus/en/index.html">Australia</a>\n',
 '\t\t\t\t</li>\n',
 '\t</ul>\n',
 '\n',
 '\t\n',
 '\t<div class="clear"> </div>\n',
 '</div>\n',
 '<!-- end: teaser -->\n',
 '\n',
 ' \n',
 '<!-- start: teaser -->\n',
 '<div class="teaser">\n',
 '\t\n',
 '   \n',
 '<div  class="image_left"  style="width:24px">\n',
 '<img src="/countries/flag_bru_sm.gif" alt="" width="24" height="16" />\n',
 '</div>\n',
 ' \n',
 '      <ul class="list">\n',
 '\t\t\t\t<li>\n',
 '\t<a  href="/countries/brn/en/index.html"

In [24]:
wp = [x.split('html">')[1].split('</a>')[0] for x in pacific_raw if '</a>' in x]
wp

['American Samoa',
 'Australia',
 'Brunei Darussalam',
 'Cambodia',
 'China',
 'Cook Islands',
 'Fiji',
 'French Polynesia (France)',
 'Guam (USA)',
 'Hong Kong (China)',
 'Japan',
 'Kiribati',
 "Lao People's Democratic Republic",
 'Macao (China)',
 'Malaysia',
 'Marshall Islands',
 'Micronesia, Federated States of',
 'Mongolia',
 'Nauru',
 'New Caledonia (France)',
 'New Zealand',
 'Niue',
 'Northern Mariana Islands, Commonwealth of the',
 'Palau',
 'Papua New Guinea',
 'Philippines',
 'Pitcairn Islands (UK)',
 'Republic of Korea',
 'Samoa',
 'Singapore',
 'Solomon Islands',
 'Tokelau* (New Zealand)',
 'Tonga',
 'Tuvalu',
 'Vanuatu',
 'Viet Nam',
 'Wallis and Futuna']

In [25]:
pacific = pd.DataFrame({'Region': 'Western Pacific', 'Country': wp})
pacific

Unnamed: 0,Country,Region
0,American Samoa,Western Pacific
1,Australia,Western Pacific
2,Brunei Darussalam,Western Pacific
3,Cambodia,Western Pacific
4,China,Western Pacific
5,Cook Islands,Western Pacific
6,Fiji,Western Pacific
7,French Polynesia (France),Western Pacific
8,Guam (USA),Western Pacific
9,Hong Kong (China),Western Pacific


In [27]:
# Merge all dataframes together

who_regions = [africa, europe, americas, seasia, eastern_med, pacific]

rc_map = pd.concat(who_regions)

rc_map



Unnamed: 0,Country,Region
0,Algeria,Africa
1,Angola,Africa
2,Benin,Africa
3,Botswana,Africa
4,Burkina Faso,Africa
5,Burundi,Africa
6,Cameroon,Africa
7,Cabo Verde,Africa
8,Central African Republic,Africa
9,Chad,Africa


In [28]:
rc_map['Region'].value_counts()


Europe                   53
Africa                   47
Americas                 46
Western Pacific          37
Eastern Mediterranean    23
Southeast Asia           11
Name: Region, dtype: int64

# Export file for use in main lab notebook

In [29]:
rc_map.to_csv('RegionCountryMap.csv')