# Install Modules

# Import Modules

In [1]:
import os
from io import StringIO

import pandas as pd
import requests

# Parameters

In [11]:
# Paths: Local Setup
# Data folders, all relative to the notebook's working directory
PATH_EXT, PATH_RAW, PATH_INT, PATH_PRO = (
    "../data/external/",
    "../data/raw/",
    "../data/interim/",
    "../data/processed/",
)
# Report folders
PATH_REP, PATH_FIGS = "../reports/", "../reports/figures/"

In [2]:
# TODO: Update section with paths

# Parsing

In [3]:
# Country slugs to be parsed: 235 candidates minus 1 exclusion = 234 entries.
# Micronesia is deliberately left out due to parsing issues (it would sit
# between Mexico and Moldova). One slug per line; the text block is split on
# whitespace, so the order below is the order of the resulting list.
country_list = """
    Afghanistan
    Albania
    Algeria
    American-Samoa
    Andorra
    Angola
    Anguilla
    Antigua-and-Barbuda
    Argentina
    Armenia
    Aruba
    Australia
    Austria
    Azerbaijan
    Bahamas
    Bahrain
    Bangladesh
    Barbados
    Belarus
    Belgium
    Belize
    Benin
    Bermuda
    Bhutan
    Bolivia
    Bosnia-and-Herzegovina
    Botswana
    Brazil
    British-Virgin-Islands
    Brunei-darussalam
    Bulgaria
    Burkina-Faso
    Burundi
    Cabo-Verde
    Cambodia
    Cameroon
    Canada
    Caribbean-Netherlands
    Cayman-Islands
    Central-African-Republic
    Chad
    Channel-Islands
    Chile
    China
    Colombia
    Comoros
    Congo
    Cook-Islands
    Costa-Rica
    Croatia
    Cuba
    Curacao
    Cyprus
    Czechia
    Cote-d-Ivoire
    Denmark
    Djibouti
    Dominica
    Dominican-Republic
    Democratic-Republic-of-the-Congo
    Ecuador
    Egypt
    El-Salvador
    Equatorial-Guinea
    Eritrea
    Estonia
    Ethiopia
    Faeroe-Islands
    Falkland-Islands-Malvinas
    Fiji
    Finland
    France
    French-Guiana
    French-Polynesia
    Gabon
    Gambia
    Georgia
    Germany
    Ghana
    Gibraltar
    Greece
    Greenland
    Grenada
    Guadeloupe
    Guam
    Guatemala
    Guinea
    Guinea-Bissau
    Guyana
    Haiti
    Holy-See
    Honduras
    China-Hong-Kong-SAR
    Hungary
    Iceland
    India
    Indonesia
    Iran
    Iraq
    Ireland
    Isle-of-Man
    Israel
    Italy
    Jamaica
    Japan
    Jordan
    Kazakhstan
    Kenya
    Kiribati
    Kuwait
    Kyrgyzstan
    Laos
    Latvia
    Lebanon
    Lesotho
    Liberia
    Libya
    Liechtenstein
    Lithuania
    Luxembourg
    China-Macao-SAR
    Madagascar
    Malawi
    Malaysia
    Maldives
    Mali
    Malta
    Marshall-Islands
    Martinique
    Mauritania
    Mauritius
    Mayotte
    Mexico
    Moldova
    Monaco
    Mongolia
    Montenegro
    Montserrat
    Morocco
    Mozambique
    Myanmar
    Namibia
    Nauru
    Nepal
    Netherlands
    New-Caledonia
    New-Zealand
    Nicaragua
    Niger
    Nigeria
    Niue
    North-Korea
    Northern-Mariana-Islands
    Norway
    Oman
    Pakistan
    Palau
    Panama
    Papua-New-Guinea
    Paraguay
    Peru
    Philippines
    Poland
    Portugal
    Puerto-Rico
    Qatar
    Romania
    Russia
    Rwanda
    Reunion
    Saint-Barthelemy
    Saint-Helena
    Saint-Kitts-and-Nevis
    Saint-Lucia
    Saint-Martin
    Saint-Pierre-and-Miquelon
    Samoa
    San-Marino
    Sao-Tome-and-Principe
    Saudi-Arabia
    Senegal
    Serbia
    Seychelles
    Sierra-Leone
    Singapore
    Sint-Maarten
    Slovakia
    Slovenia
    Solomon-Islands
    Somalia
    South-Africa
    South-Korea
    South-Sudan
    Spain
    Sri-Lanka
    Saint-Vincent-and-the-Grenadines
    State-of-Palestine
    Sudan
    Suriname
    Swaziland
    Sweden
    Switzerland
    Syria
    Taiwan
    Tajikistan
    Tanzania
    North-Macedonia
    Thailand
    Timor-Leste
    Togo
    Tokelau
    Tonga
    Trinidad-and-Tobago
    Tunisia
    Turkey
    Turkmenistan
    Turks-and-Caicos-Islands
    Tuvalu
    UK
    Uganda
    Ukraine
    United-Arab-Emirates
    US
    United-States-Virgin-Islands
    Uruguay
    Uzbekistan
    Vanuatu
    Venezuela
    Vietnam
    wallis-and-Futuna-Islands
    Western-Sahara
    Yemen
    Zambia
    Zimbabwe
""".split()


In [4]:
def get_html(country, timeout=30):
    """Download the Worldometers population page of one country.

    Args:
        country: Country slug used in the page URL (the scraping loop passes
            the lower-cased entry of ``country_list``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The HTML source of the page as a string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or when the
            timeout expires.
    """
    # Basic URL + country to be parsed
    url = f'https://www.worldometers.info/world-population/{country}-population/'

    # Define header to avoid 403 forbidden error
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

    # requests waits indefinitely by default; a timeout keeps one stalled
    # connection from hanging the whole scraping run
    response = requests.get(url, headers=headers, timeout=timeout)

    # Fail here on an HTTP error instead of handing an error page to the parser
    response.raise_for_status()

    return response.text

In [5]:
def create_table(country, html, important_features, table_index=1):
    """Extract the yearly population table of one country from its page HTML.

    Args:
        country: Country slug; its ``str.title()`` form is written into the
            "Country" column.
        html: HTML source of the country's population page.
        important_features: Column names to keep from the parsed table.
        table_index: Position of the wanted table among the tables whose text
            matches "Population". The default of 1 is the country population
            per year table.

    Returns:
        A DataFrame with a leading "Country" column followed by the
        ``important_features`` columns.

    Raises:
        ValueError: If pandas finds no table matching "Population".
        IndexError: If fewer than ``table_index + 1`` matching tables exist.
        KeyError: If a requested feature column is missing from the table.
    """
    matcher = "Population"

    # Find tables with the matcher. The markup is wrapped in StringIO because
    # passing literal HTML strings to read_html is deprecated in newer pandas.
    tables = pd.read_html(StringIO(html), match=matcher, header=0)

    try:
        table = tables[table_index]
    except IndexError:
        # Say which page and how many tables, rather than "list index out of range"
        raise IndexError(
            f"No table at position {table_index} for {country!r}: "
            f"only {len(tables)} table(s) match {matcher!r}"
        ) from None

    # Keeping only required features
    df = table[important_features]

    # Add country at the very beginning of the dataframe
    df.insert(0, "Country", country.title())

    return df


In [6]:
# Columns kept from every country's yearly population table
important_features = ["Year", "Population", "Yearly % Change", "Urban Population"]

# Empty accumulator frame: the feature columns plus a leading "Country" column
df_all = pd.DataFrame(columns=important_features)
df_all.insert(loc=0, column="Country", value=0)
df_all


Unnamed: 0,Country,Year,Population,Yearly % Change,Urban Population


In [7]:
# Download and parse every country page, then combine the tables in one step.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on each
# iteration. Collecting the pieces and concatenating once avoids both problems
# and makes this cell safe to re-run (rows are no longer appended twice).
frames = []
for country in country_list:
    print(country)  # progress indicator
    slug = country.lower()  # the page URL is built from the lower-cased name
    html = get_html(slug)  # get html source (code)
    frames.append(create_table(slug, html, important_features))  # extracting table information

# Each table keeps its own row index, as with the former append-based loop.
# An empty country list still yields an empty frame with the expected columns.
if frames:
    df_all = pd.concat(frames)
else:
    df_all = pd.DataFrame(columns=["Country", *important_features])


Afghanistan
Albania
Algeria
American-Samoa
Andorra
Angola
Anguilla
Antigua-and-Barbuda
Argentina
Armenia
Aruba
Australia
Austria
Azerbaijan
Bahamas
Bahrain
Bangladesh
Barbados
Belarus
Belgium
Belize
Benin
Bermuda
Bhutan
Bolivia
Bosnia-and-Herzegovina
Botswana
Brazil
British-Virgin-Islands
Brunei-darussalam
Bulgaria
Burkina-Faso
Burundi
Cabo-Verde
Cambodia
Cameroon
Canada
Caribbean-Netherlands
Cayman-Islands
Central-African-Republic
Chad
Channel-Islands
Chile
China
Colombia
Comoros
Congo
Cook-Islands
Costa-Rica
Croatia
Cuba
Curacao
Cyprus
Czechia
Cote-d-Ivoire
Denmark
Djibouti
Dominica
Dominican-Republic
Democratic-Republic-of-the-Congo
Ecuador
Egypt
El-Salvador
Equatorial-Guinea
Eritrea
Estonia
Ethiopia
Faeroe-Islands
Falkland-Islands-Malvinas
Fiji
Finland
France
French-Guiana
French-Polynesia
Gabon
Gambia
Georgia
Germany
Ghana
Gibraltar
Greece
Greenland
Grenada
Guadeloupe
Guam
Guatemala
Guinea
Guinea-Bissau
Guyana
Haiti
Holy-See
Honduras
China-Hong-Kong-SAR
Hungary
Iceland
India
Indon

In [8]:
# Show the combined table of all countries (pandas abbreviates long frames when displaying them)
df_all

Unnamed: 0,Country,Year,Population,Yearly % Change,Urban Population
0,Afghanistan,2020,38928346,2.33 %,9904337
1,Afghanistan,2019,38041754,2.34 %,9582625
2,Afghanistan,2018,37171921,2.41 %,9273302
3,Afghanistan,2017,36296113,2.58 %,8971472
4,Afghanistan,2016,35383032,2.82 %,8670939
...,...,...,...,...,...
13,Zimbabwe,1975,6293875,3.54 %,1215331
14,Zimbabwe,1970,5289303,3.42 %,898584
15,Zimbabwe,1965,4471177,3.43 %,644767
16,Zimbabwe,1960,3776681,3.28 %,472478


# Export dataframe

In [14]:
# Using pickle format to preserve formats
FILENAME = 'population_by_country_p2.pkl'

# On a fresh checkout the target folder may not exist yet
os.makedirs(PATH_EXT, exist_ok=True)
data_path = os.path.join(PATH_EXT, FILENAME)

# Pickle protocol 2 is the newest protocol Python 2 can read
# (presumably what the "_p2" suffix in the file name refers to; confirm)
pd.to_pickle(df_all, data_path, protocol=2)

In [15]:
# Plain-text copy of the same table
FILENAME = 'population_by_country.csv'

# On a fresh checkout the target folder may not exist yet
os.makedirs(PATH_EXT, exist_ok=True)
data_path = os.path.join(PATH_EXT, FILENAME)

# index=False: the row index is not written to the file
df_all.to_csv(data_path, index=False)