# Data Preprocessing

In [35]:
# Import packages
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np

Github: https://github.com/anetey-abbey/infoviz.git <br>
Notebook: https://anetey-abbey.github.io/infoviz/docs/notebook.html

## Load data

Import the country dataset to get the codes of the countries in the WDI dataset in order to differentiate from groups of countries, e.g. Europe.

In [36]:
# this dataset contains the meta data of the WDI dataset
df_countrydata = pd.read_csv('WDICountry.csv', sep=',') 

Import the WDI dataset and selected relevant indicators.

In [37]:
df = pd.read_csv('WDIData.csv', sep=',') 
df = df.drop('Unnamed: 66', axis=1)

# filter on period 1990 - 2020
years = list(map(str, range(1990, 2021)))
df = df[['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'] + years]

# filter to keep only relevant variables in dataset
selected_indicators = ['GDP growth (annual %)',
                       'Individuals using the Internet (% of population)',
                       'Foreign direct investment, net inflows (BoP, current US$)',
                       'GNI per capita, PPP (current international $)',
                       'School enrollment, secondary (% net)',
                       'Individuals using the Internet (% of population)',
                        "Educational attainment, at least Bachelor's or equivalent, population 25+, total (%) (cumulative)",
                       'Educational attainment, at least completed primary, population 25+ years, total (%) (cumulative)',
                       "Educational attainment, at least Master's or equivalent, population 25+, total (%) (cumulative)",
                       'Educational attainment, Doctoral or equivalent, population 25+, total (%) (cumulative)',
                       'Government expenditure on education, total (% of GDP)',
                       'Secondary education, duration (years)',
                       'Secondary education, pupils',
                       'Adjusted savings: education expenditure (% of GNI)',
                       'Adjusted savings: education expenditure (current US$)',
                       'Compulsory education, duration (years)',
                       'Secure Internet servers',
                       'Fixed telephone subscriptions',
                       'Access to electricity (% of population)',
                       'ICT goods exports (% of total goods exports)',
                       'Mobile cellular subscriptions',
                       'Mobile cellular subscriptions (per 100 people)',
                       'Educational attainment, at least completed upper secondary, population 25+, total (%) (cumulative)',
                       'Educational attainment, at least completed lower secondary, population 25+, total (%) (cumulative)',
                       'School enrollment, tertiary (% gross)',
                       'School enrollment, secondary (% gross)',
                       'School enrollment, primary (% gross)',
                       'Literacy rate, youth total (% of people ages 15-24)',
                       'Literacy rate, adult total (% of people ages 15 and above)', 
                       'ICT goods imports (% total goods imports)']
df = df[df['Indicator Name'].isin(selected_indicators)]

# Print the head
df.head(2)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1990,1991,1992,1993,1994,1995,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
3,Africa Eastern and Southern,AFE,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,29.089827,31.844384,31.79416,32.001027,33.87191,38.880173,40.261358,43.061877,44.27086,45.803485
36,Africa Eastern and Southern,AFE,Adjusted savings: education expenditure (% of ...,NY.ADJ.AEDU.GN.ZS,4.426717,4.882468,6.162041,6.071125,6.04343,6.990009,...,4.755062,4.878013,4.563364,4.510963,4.433471,4.3559,4.442274,4.692845,4.791304,4.71065


For effective data analysis we need the data in the wide format but with the variable names in the columns.

In [38]:
df_long = df.melt(id_vars=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'], 
                  var_name='Year', value_name='Value')

df_wide = df_long.pivot_table(index=['Year', 'Country Name', 'Country Code'], 
                              columns='Indicator Name', values='Value', aggfunc='first')

df_wide = df_wide.reset_index()
df_wide['Year'] = df_wide['Year'].astype(int)

df_wide

Indicator Name,Year,Country Name,Country Code,Access to electricity (% of population),Adjusted savings: education expenditure (% of GNI),Adjusted savings: education expenditure (current US$),"Compulsory education, duration (years)","Educational attainment, Doctoral or equivalent, population 25+, total (%) (cumulative)","Educational attainment, at least Bachelor's or equivalent, population 25+, total (%) (cumulative)","Educational attainment, at least Master's or equivalent, population 25+, total (%) (cumulative)",...,"Literacy rate, youth total (% of people ages 15-24)",Mobile cellular subscriptions,Mobile cellular subscriptions (per 100 people),"School enrollment, primary (% gross)","School enrollment, secondary (% gross)","School enrollment, secondary (% net)","School enrollment, tertiary (% gross)","Secondary education, duration (years)","Secondary education, pupils",Secure Internet servers
0,1990,Afghanistan,AFG,,1.953448,,,,,,...,,0.000000e+00,0.000000,29.041420,10.849960,,2.211410,6.0,182340.0,
1,1990,Africa Eastern and Southern,AFE,,4.426717,,,,,,...,,7.880000e+03,0.002587,,,,,6.0,,
2,1990,Africa Western and Central,AFW,,2.615379,,,,,,...,,0.000000e+00,0.000000,,,,,7.0,,
3,1990,Albania,ALB,100.000000,2.800000,5.674630e+07,,,,,...,,0.000000e+00,0.000000,99.451752,90.081741,,8.366360,8.0,475074.0,
4,1990,Algeria,DZA,,4.946261,2.957000e+09,,,,,...,,4.700000e+02,0.001825,92.593437,59.414188,,10.293140,6.0,2162469.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8206,2020,West Bank and Gaza,PSE,100.000000,,,10.0,0.26999,21.708691,1.94667,...,99.232353,4.274119e+06,83.783026,96.427193,91.027290,,43.097672,8.0,806276.0,2573.0
8207,2020,World,WLD,90.521569,3.872891,,10.0,,,,...,91.883980,8.265682e+09,106.152775,102.407066,76.750740,,40.244061,6.0,613156032.0,89189073.0
8208,2020,"Yemen, Rep.",YEM,73.757927,3.600000,,9.0,,,,...,,1.517800e+07,50.888548,,,,,6.0,,169.0
8209,2020,Zambia,ZMB,44.524475,3.563864,6.273000e+08,7.0,,,,...,,1.910421e+07,103.917835,,,,,5.0,,745.0


In [39]:
# join the continent data to the dataset
df_wide = pd.merge(df_wide, df_countrydata[['Country Code', 'Region']], on='Country Code', how='left')
df_wide

Unnamed: 0,Year,Country Name,Country Code,Access to electricity (% of population),Adjusted savings: education expenditure (% of GNI),Adjusted savings: education expenditure (current US$),"Compulsory education, duration (years)","Educational attainment, Doctoral or equivalent, population 25+, total (%) (cumulative)","Educational attainment, at least Bachelor's or equivalent, population 25+, total (%) (cumulative)","Educational attainment, at least Master's or equivalent, population 25+, total (%) (cumulative)",...,Mobile cellular subscriptions,Mobile cellular subscriptions (per 100 people),"School enrollment, primary (% gross)","School enrollment, secondary (% gross)","School enrollment, secondary (% net)","School enrollment, tertiary (% gross)","Secondary education, duration (years)","Secondary education, pupils",Secure Internet servers,Region
0,1990,Afghanistan,AFG,,1.953448,,,,,,...,0.000000e+00,0.000000,29.041420,10.849960,,2.211410,6.0,182340.0,,South Asia
1,1990,Africa Eastern and Southern,AFE,,4.426717,,,,,,...,7.880000e+03,0.002587,,,,,6.0,,,
2,1990,Africa Western and Central,AFW,,2.615379,,,,,,...,0.000000e+00,0.000000,,,,,7.0,,,
3,1990,Albania,ALB,100.000000,2.800000,5.674630e+07,,,,,...,0.000000e+00,0.000000,99.451752,90.081741,,8.366360,8.0,475074.0,,Europe & Central Asia
4,1990,Algeria,DZA,,4.946261,2.957000e+09,,,,,...,4.700000e+02,0.001825,92.593437,59.414188,,10.293140,6.0,2162469.0,,Middle East & North Africa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8206,2020,West Bank and Gaza,PSE,100.000000,,,10.0,0.26999,21.708691,1.94667,...,4.274119e+06,83.783026,96.427193,91.027290,,43.097672,8.0,806276.0,2573.0,Middle East & North Africa
8207,2020,World,WLD,90.521569,3.872891,,10.0,,,,...,8.265682e+09,106.152775,102.407066,76.750740,,40.244061,6.0,613156032.0,89189073.0,
8208,2020,"Yemen, Rep.",YEM,73.757927,3.600000,,9.0,,,,...,1.517800e+07,50.888548,,,,,6.0,,169.0,Middle East & North Africa
8209,2020,Zambia,ZMB,44.524475,3.563864,6.273000e+08,7.0,,,,...,1.910421e+07,103.917835,,,,,5.0,,745.0,Sub-Saharan Africa


In [40]:
# most of the time we need the data of all countries only thus we filter out all non-country data
df_countrydata = df_countrydata[df_countrydata['Income Group'].notna()]
country_codes = df_countrydata['Country Code'].unique()
df_all_countries = df_wide[df_wide['Country Code'].isin(country_codes)]
df_all_countries

Unnamed: 0,Year,Country Name,Country Code,Access to electricity (% of population),Adjusted savings: education expenditure (% of GNI),Adjusted savings: education expenditure (current US$),"Compulsory education, duration (years)","Educational attainment, Doctoral or equivalent, population 25+, total (%) (cumulative)","Educational attainment, at least Bachelor's or equivalent, population 25+, total (%) (cumulative)","Educational attainment, at least Master's or equivalent, population 25+, total (%) (cumulative)",...,Mobile cellular subscriptions,Mobile cellular subscriptions (per 100 people),"School enrollment, primary (% gross)","School enrollment, secondary (% gross)","School enrollment, secondary (% net)","School enrollment, tertiary (% gross)","Secondary education, duration (years)","Secondary education, pupils",Secure Internet servers,Region
0,1990,Afghanistan,AFG,,1.953448,,,,,,...,0.0,0.000000,29.041420,10.849960,,2.211410,6.0,182340.0,,South Asia
3,1990,Albania,ALB,100.000000,2.800000,5.674630e+07,,,,,...,0.0,0.000000,99.451752,90.081741,,8.366360,8.0,475074.0,,Europe & Central Asia
4,1990,Algeria,DZA,,4.946261,2.957000e+09,,,,,...,470.0,0.001825,92.593437,59.414188,,10.293140,6.0,2162469.0,,Middle East & North Africa
5,1990,American Samoa,ASM,,11.760709,,,,,,...,0.0,0.000000,98.892731,90.757858,,,4.0,3437.0,,East Asia & Pacific
6,1990,Andorra,AND,100.000000,3.300000,,,,,,...,0.0,0.000000,,,,,7.0,,,Europe & Central Asia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8205,2020,Virgin Islands (U.S.),VIR,100.000000,9.356177,,,,,,...,80000.0,76.610007,,,,,6.0,,69.0,Latin America & Caribbean
8206,2020,West Bank and Gaza,PSE,100.000000,,,10.0,0.26999,21.708691,1.94667,...,4274119.0,83.783026,96.427193,91.027290,,43.097672,8.0,806276.0,2573.0,Middle East & North Africa
8208,2020,"Yemen, Rep.",YEM,73.757927,3.600000,,9.0,,,,...,15178000.0,50.888548,,,,,6.0,,169.0,Middle East & North Africa
8209,2020,Zambia,ZMB,44.524475,3.563864,6.273000e+08,7.0,,,,...,19104208.0,103.917835,,,,,5.0,,745.0,Sub-Saharan Africa


In [41]:
# Replace 'nan' with np.nan
df_all_countries.replace('nan', np.nan, inplace=True)

# Convert columns to numeric data type
df_all_countries['Individuals using the Internet (% of population)'] = pd.to_numeric(df_all_countries['Individuals using the Internet (% of population)'], errors='coerce')

# Replace missing values with linearly interpolated values
df_all_countries = df_all_countries.interpolate(method='linear')
df_all_countries

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_all_countries.replace('nan', np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_all_countries['Individuals using the Internet (% of population)'] = pd.to_numeric(df_all_countries['Individuals using the Internet (% of population)'], errors='coerce')


Unnamed: 0,Year,Country Name,Country Code,Access to electricity (% of population),Adjusted savings: education expenditure (% of GNI),Adjusted savings: education expenditure (current US$),"Compulsory education, duration (years)","Educational attainment, Doctoral or equivalent, population 25+, total (%) (cumulative)","Educational attainment, at least Bachelor's or equivalent, population 25+, total (%) (cumulative)","Educational attainment, at least Master's or equivalent, population 25+, total (%) (cumulative)",...,Mobile cellular subscriptions,Mobile cellular subscriptions (per 100 people),"School enrollment, primary (% gross)","School enrollment, secondary (% gross)","School enrollment, secondary (% net)","School enrollment, tertiary (% gross)","Secondary education, duration (years)","Secondary education, pupils",Secure Internet servers,Region
0,1990,Afghanistan,AFG,,1.953448,,,,,,...,0.0,0.000000,29.041420,10.849960,,2.211410,6.0,1.823400e+05,,South Asia
3,1990,Albania,ALB,100.000000,2.800000,5.674630e+07,,,,,...,0.0,0.000000,99.451752,90.081741,,8.366360,8.0,4.750740e+05,,Europe & Central Asia
4,1990,Algeria,DZA,100.000000,4.946261,2.957000e+09,,,,,...,470.0,0.001825,92.593437,59.414188,,10.293140,6.0,2.162469e+06,,Middle East & North Africa
5,1990,American Samoa,ASM,100.000000,11.760709,2.118500e+09,,,,,...,0.0,0.000000,98.892731,90.757858,,7.046450,4.0,3.437000e+03,,East Asia & Pacific
6,1990,Andorra,AND,100.000000,3.300000,1.280000e+09,,,,,...,0.0,0.000000,85.966160,49.580654,,3.799760,7.0,8.729650e+04,,Europe & Central Asia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8205,2020,Virgin Islands (U.S.),VIR,100.000000,9.356177,8.991825e+09,10.0,0.573083,24.344624,3.978658,...,80000.0,76.610007,106.792183,77.899291,61.87253,36.303374,6.0,5.454473e+05,69.0,Latin America & Caribbean
8206,2020,West Bank and Gaza,PSE,100.000000,6.478089,6.203650e+09,10.0,0.269990,21.708691,1.946670,...,4274119.0,83.783026,96.427193,91.027290,61.87253,43.097672,8.0,8.062760e+05,2573.0,Middle East & North Africa
8208,2020,"Yemen, Rep.",YEM,73.757927,3.600000,3.415475e+09,9.0,0.269990,21.708691,1.946670,...,15178000.0,50.888548,96.719765,91.027290,61.87253,43.097672,6.0,8.062760e+05,169.0,Middle East & North Africa
8209,2020,Zambia,ZMB,44.524475,3.563864,6.273000e+08,7.0,0.269990,21.708691,1.946670,...,19104208.0,103.917835,97.012337,91.027290,61.87253,43.097672,5.0,8.062760e+05,745.0,Sub-Saharan Africa


In [42]:
# Replace 'nan' with np.nan
df_all_countries.replace('nan', np.nan, inplace=True)

# Convert columns to numeric data type
df_all_countries['School enrollment, tertiary (% gross)'] = pd.to_numeric(df_all_countries['School enrollment, tertiary (% gross)'], errors='coerce')
df_all_countries['School enrollment, secondary (% gross)'] = pd.to_numeric(df_all_countries['School enrollment, secondary (% gross)'], errors='coerce')

# Replace missing values with linearly interpolated values
df_all_countries_clean = df_all_countries.interpolate(method='linear')

In [43]:
# Access the "Region" column and get unique values
unique_regions = df_all_countries['Region'].unique()

# Print the unique values
for region in unique_regions:
    print(region)

South Asia
Europe & Central Asia
Middle East & North Africa
East Asia & Pacific
Sub-Saharan Africa
Latin America & Caribbean
North America


In [44]:
country_continent_dict = {'Afghanistan': 'Asia', 'Albania': 'Europe', 'Algeria': 'Middle East', 'American Samoa': 'Oceania', 'Andorra': 'Europe', 'Angola': 'Africa', 'Antigua and Barbuda': 'North America', 'Argentina': 'South America', 'Armenia': 'Asia', 'Aruba': 'North America', 'Australia': 'Oceania', 'Austria': 'Europe', 'Azerbaijan': 'Asia', 'Bahamas, The': 'North America', 'Bahrain': 'Middle East', 'Bangladesh': 'Asia', 'Barbados': 'North America', 'Belarus': 'Europe', 'Belgium': 'Europe', 'Belize': 'North America', 'Benin': 'Africa', 'Bermuda': 'North America', 'Bhutan': 'Asia', 'Bolivia': 'South America', 'Bosnia and Herzegovina': 'Europe', 'Botswana': 'Africa', 'Brazil': 'South America', 'British Virgin Islands': 'North America', 'Brunei Darussalam': 'Asia', 'Bulgaria': 'Europe', 'Burkina Faso': 'Africa', 'Burundi': 'Africa', 'Cabo Verde': 'Africa', 'Cambodia': 'Asia', 'Cameroon': 'Africa', 'Canada': 'North America', 'Cayman Islands': 'North America', 'Central African Republic': 'Africa', 'Chad': 'Africa', 'Channel Islands': 'Europe', 'Chile': 'South America', 'China': 'Asia', 'Colombia': 'South America', 'Comoros': 'Africa', 'Congo, Dem. Rep.': 'Africa', 'Congo, Rep.': 'Africa', 'Costa Rica': 'North America', "Cote d'Ivoire": 'Africa', 'Croatia': 'Europe', 'Cuba': 'North America', 'Curacao': 'North America', 'Cyprus': 'Asia', 'Czech Republic': 'Europe', 'Denmark': 'Europe', 'Djibouti': 'Africa', 'Dominica': 'North America', 'Dominican Republic': 'North America', 'Ecuador': 'South America', 'Egypt, Arab Rep.': 'Middle East', 'El Salvador': 'North America', 'Equatorial Guinea': 'Africa', 'Eritrea': 'Africa', 'Estonia': 'Europe', 'Eswatini': 'Africa', 'Ethiopia': 'Africa', 'Faroe Islands': 'Europe', 'Fiji': 'Oceania', 'Finland': 'Europe', 'France': 'Europe', 'French Polynesia': 'Oceania', 'Gabon': 'Africa', 'Gambia, The': 'Africa', 'Georgia': 'Asia', 'Germany': 'Europe', 'Ghana': 'Africa', 'Gibraltar': 'Europe', 'Greece': 'Europe', 'Greenland': 'North America', 'Grenada': 'North America', 'Guam': 'Oceania', 'Guatemala': 'North America', 'Guinea': 'Africa', 'Guinea-Bissau': 'Africa', 'Guyana': 'South America', 'Haiti': 'North America', 'Honduras': 'North America', 'Hong Kong SAR, China': 'Asia', 'Hungary': 'Europe', 'Iceland': 'Europe', 'India': 'Asia', 'Indonesia': 'Asia', 'Iran, Islamic Rep.': 'Middle East', 'Iraq': 'Middle East', 'Ireland': 'Europe', 'Isle of Man': 'Europe', 'Israel': 'Middle East', 'Italy': 'Europe', 'Jamaica': 'North America', 'Japan': 'Asia', 'Jordan': 'Middle East', 'Kazakhstan': 'Asia', 'Kenya': 'Africa', 'Kiribati': 'Oceania', "Korea, Dem. People's Rep.": 'Asia', 'Korea, Rep.': 'Asia', 'Kosovo': 'Europe', 'Kuwait': 'Middle East', 'Kyrgyz Republic': 'Asia', 'Lao PDR': 'Asia', 'Latvia': 'Europe', 'Lebanon': 'Middle East', 'Lesotho': 'Africa', 'Liberia': 'Africa', 'Libya': 'Middle East', 'Liechtenstein': 'Europe', 'Lithuania': 'Europe', 'Luxembourg': 'Europe', 'Macao SAR, China': 'Asia', 'Madagascar': 'Africa', 'Malawi': 'Africa', 'Malaysia': 'Asia', 'Maldives': 'Asia', 'Mali': 'Africa', 'Malta': 'Europe', 'Marshall Islands': 'Oceania', 'Mauritania': 'Africa', 'Mauritius': 'Africa', 'Mexico': 'North America', 'Micronesia, Fed. Sts.': 'Oceania', 'Moldova': 'Europe', 'Monaco': 'Europe', 'Mongolia': 'Asia', 'Montenegro': 'Europe', 'Morocco': 'Africa', 'Mozambique': 'Africa', 'Myanmar': 'Asia', 'Namibia': 'Africa', 'Nauru': 'Oceania', 'Nepal': 'Asia', 'Netherlands': 'Europe', 'New Caledonia': 'Oceania', 'New Zealand': 'Oceania', 'Nicaragua': 'North America', 'Niger': 'Africa', 'Nigeria': 'Africa', 'North Macedonia': 'Europe', 'Northern Mariana Islands': 'Oceania', 'Norway': 'Europe', 'Oman': 'Middle East', 'Pakistan': 'Asia', 'Palau': 'Oceania', 'Panama': 'North America', 'Papua New Guinea': 'Oceania', 'Paraguay': 'South America', 'Peru': 'South America', 'Philippines': 'Asia', 'Poland': 'Europe', 'Portugal': 'Europe', 'Puerto Rico': 'North America', 'Qatar': 'Middle East', 'Romania': 'Europe', 'Russian Federation': 'Europe', 'Rwanda': 'Africa', 'Samoa': 'Oceania', 'San Marino': 'Europe', 'Sao Tome and Principe': 'Africa', 'Saudi Arabia': 'Middle East', 'Senegal': 'Africa', 'Serbia': 'Europe', 'Seychelles': 'Africa', 'Sierra Leone': 'Africa', 'Singapore': 'Asia', 'Sint Maarten (Dutch part)': 'North America', 'Slovak Republic': 'Europe', 'Slovenia': 'Europe', 'Solomon Islands': 'Oceania', 'Somalia': 'Africa', 'South Africa': 'Africa', 'South Sudan': 'Africa', 'Spain': 'Europe', 'Sri Lanka': 'Asia', 'St. Kitts and Nevis': 'North America', 'St. Lucia': 'North America', 'St. Martin (French part)': 'North America', 'St. Vincent and the Grenadines': 'North America', 'Sudan': 'Africa', 'Suriname': 'South America', 'Sweden': 'Europe', 'Switzerland': 'Europe', 'Syrian Arab Republic': 'Middle East', 'Tajikistan': 'Asia', 'Tanzania': 'Africa', 'Thailand': 'Asia', 'Timor-Leste': 'Asia', 'Togo': 'Africa', 'Tonga': 'Oceania', 'Trinidad and Tobago': 'North America', 'Tunisia': 'Africa', 'Turkiye': 'Asia', 'Turkmenistan': 'Asia', 'Turks and Caicos Islands': 'North America', 'Tuvalu': 'Oceania', 'Uganda': 'Africa', 'Ukraine': 'Europe', 'United Arab Emirates': 'Middle East', 'United Kingdom': 'Europe', 'United States': 'North America', 'Uruguay': 'South America', 'Uzbekistan': 'Asia', 'Vanuatu': 'Oceania', 'Vietnam': 'Asia', 'Virgin Islands (U.S.)': 'North America', 'West Bank and Gaza': 'Asia', 'Yemen, Rep.': 'Middle East', 'Zambia': 'Africa', 'Zimbabwe': 'Africa'}

unique_values = set(country_continent_dict.values())
for value in unique_values:
    print(value)

North America
Middle East
Asia
Oceania
South America
Europe
Africa


In [45]:
def group_continents(country_continent_dict):
    new_dict = {}
    for country, continent in country_continent_dict.items():
        if continent in ['Africa', 'South America', 'Middle East']:
            new_continent = 'Africa & South America & Middle East'
        elif continent in ['Europe', 'North America']:
            new_continent = 'Europe & North America'
        elif continent in ['Asia', 'Oceania']:
            new_continent = 'Asia & Oceania'
        else:
            new_continent = continent
        new_dict[country] = new_continent
    return new_dict

digital_divide_regions = group_continents(country_continent_dict)
print(digital_divide_regions)

unique_values = set(digital_divide_regions.values())
for value in unique_values:
    print(value)

{'Afghanistan': 'Asia & Oceania', 'Albania': 'Europe & North America', 'Algeria': 'Africa & South America & Middle East', 'American Samoa': 'Asia & Oceania', 'Andorra': 'Europe & North America', 'Angola': 'Africa & South America & Middle East', 'Antigua and Barbuda': 'Europe & North America', 'Argentina': 'Africa & South America & Middle East', 'Armenia': 'Asia & Oceania', 'Aruba': 'Europe & North America', 'Australia': 'Asia & Oceania', 'Austria': 'Europe & North America', 'Azerbaijan': 'Asia & Oceania', 'Bahamas, The': 'Europe & North America', 'Bahrain': 'Africa & South America & Middle East', 'Bangladesh': 'Asia & Oceania', 'Barbados': 'Europe & North America', 'Belarus': 'Europe & North America', 'Belgium': 'Europe & North America', 'Belize': 'Europe & North America', 'Benin': 'Africa & South America & Middle East', 'Bermuda': 'Europe & North America', 'Bhutan': 'Asia & Oceania', 'Bolivia': 'Africa & South America & Middle East', 'Bosnia and Herzegovina': 'Europe & North America',

In [46]:
df_all_countries['Digital Divide Region'] = df_all_countries['Country Name'].map(digital_divide_regions)
df_all_countries

Unnamed: 0,Year,Country Name,Country Code,Access to electricity (% of population),Adjusted savings: education expenditure (% of GNI),Adjusted savings: education expenditure (current US$),"Compulsory education, duration (years)","Educational attainment, Doctoral or equivalent, population 25+, total (%) (cumulative)","Educational attainment, at least Bachelor's or equivalent, population 25+, total (%) (cumulative)","Educational attainment, at least Master's or equivalent, population 25+, total (%) (cumulative)",...,Mobile cellular subscriptions (per 100 people),"School enrollment, primary (% gross)","School enrollment, secondary (% gross)","School enrollment, secondary (% net)","School enrollment, tertiary (% gross)","Secondary education, duration (years)","Secondary education, pupils",Secure Internet servers,Region,Digital Divide Region
0,1990,Afghanistan,AFG,,1.953448,,,,,,...,0.000000,29.041420,10.849960,,2.211410,6.0,1.823400e+05,,South Asia,Asia & Oceania
3,1990,Albania,ALB,100.000000,2.800000,5.674630e+07,,,,,...,0.000000,99.451752,90.081741,,8.366360,8.0,4.750740e+05,,Europe & Central Asia,Europe & North America
4,1990,Algeria,DZA,100.000000,4.946261,2.957000e+09,,,,,...,0.001825,92.593437,59.414188,,10.293140,6.0,2.162469e+06,,Middle East & North Africa,Africa & South America & Middle East
5,1990,American Samoa,ASM,100.000000,11.760709,2.118500e+09,,,,,...,0.000000,98.892731,90.757858,,7.046450,4.0,3.437000e+03,,East Asia & Pacific,Asia & Oceania
6,1990,Andorra,AND,100.000000,3.300000,1.280000e+09,,,,,...,0.000000,85.966160,49.580654,,3.799760,7.0,8.729650e+04,,Europe & Central Asia,Europe & North America
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8205,2020,Virgin Islands (U.S.),VIR,100.000000,9.356177,8.991825e+09,10.0,0.573083,24.344624,3.978658,...,76.610007,106.792183,77.899291,61.87253,36.303374,6.0,5.454473e+05,69.0,Latin America & Caribbean,Europe & North America
8206,2020,West Bank and Gaza,PSE,100.000000,6.478089,6.203650e+09,10.0,0.269990,21.708691,1.946670,...,83.783026,96.427193,91.027290,61.87253,43.097672,8.0,8.062760e+05,2573.0,Middle East & North Africa,Asia & Oceania
8208,2020,"Yemen, Rep.",YEM,73.757927,3.600000,3.415475e+09,9.0,0.269990,21.708691,1.946670,...,50.888548,96.719765,91.027290,61.87253,43.097672,6.0,8.062760e+05,169.0,Middle East & North Africa,Africa & South America & Middle East
8209,2020,Zambia,ZMB,44.524475,3.563864,6.273000e+08,7.0,0.269990,21.708691,1.946670,...,103.917835,97.012337,91.027290,61.87253,43.097672,5.0,8.062760e+05,745.0,Sub-Saharan Africa,Africa & South America & Middle East


In [47]:
df_all_countries['Digital Divide Region'] = df_all_countries['Country Name'].map(digital_divide_regions)
df_all_countries

Unnamed: 0,Year,Country Name,Country Code,Access to electricity (% of population),Adjusted savings: education expenditure (% of GNI),Adjusted savings: education expenditure (current US$),"Compulsory education, duration (years)","Educational attainment, Doctoral or equivalent, population 25+, total (%) (cumulative)","Educational attainment, at least Bachelor's or equivalent, population 25+, total (%) (cumulative)","Educational attainment, at least Master's or equivalent, population 25+, total (%) (cumulative)",...,Mobile cellular subscriptions (per 100 people),"School enrollment, primary (% gross)","School enrollment, secondary (% gross)","School enrollment, secondary (% net)","School enrollment, tertiary (% gross)","Secondary education, duration (years)","Secondary education, pupils",Secure Internet servers,Region,Digital Divide Region
0,1990,Afghanistan,AFG,,1.953448,,,,,,...,0.000000,29.041420,10.849960,,2.211410,6.0,1.823400e+05,,South Asia,Asia & Oceania
3,1990,Albania,ALB,100.000000,2.800000,5.674630e+07,,,,,...,0.000000,99.451752,90.081741,,8.366360,8.0,4.750740e+05,,Europe & Central Asia,Europe & North America
4,1990,Algeria,DZA,100.000000,4.946261,2.957000e+09,,,,,...,0.001825,92.593437,59.414188,,10.293140,6.0,2.162469e+06,,Middle East & North Africa,Africa & South America & Middle East
5,1990,American Samoa,ASM,100.000000,11.760709,2.118500e+09,,,,,...,0.000000,98.892731,90.757858,,7.046450,4.0,3.437000e+03,,East Asia & Pacific,Asia & Oceania
6,1990,Andorra,AND,100.000000,3.300000,1.280000e+09,,,,,...,0.000000,85.966160,49.580654,,3.799760,7.0,8.729650e+04,,Europe & Central Asia,Europe & North America
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8205,2020,Virgin Islands (U.S.),VIR,100.000000,9.356177,8.991825e+09,10.0,0.573083,24.344624,3.978658,...,76.610007,106.792183,77.899291,61.87253,36.303374,6.0,5.454473e+05,69.0,Latin America & Caribbean,Europe & North America
8206,2020,West Bank and Gaza,PSE,100.000000,6.478089,6.203650e+09,10.0,0.269990,21.708691,1.946670,...,83.783026,96.427193,91.027290,61.87253,43.097672,8.0,8.062760e+05,2573.0,Middle East & North Africa,Asia & Oceania
8208,2020,"Yemen, Rep.",YEM,73.757927,3.600000,3.415475e+09,9.0,0.269990,21.708691,1.946670,...,50.888548,96.719765,91.027290,61.87253,43.097672,6.0,8.062760e+05,169.0,Middle East & North Africa,Africa & South America & Middle East
8209,2020,Zambia,ZMB,44.524475,3.563864,6.273000e+08,7.0,0.269990,21.708691,1.946670,...,103.917835,97.012337,91.027290,61.87253,43.097672,5.0,8.062760e+05,745.0,Sub-Saharan Africa,Africa & South America & Middle East


In [48]:
df_wide['Digital Divide Region'] = df_wide['Country Name'].map(digital_divide_regions)
df_wide

Unnamed: 0,Year,Country Name,Country Code,Access to electricity (% of population),Adjusted savings: education expenditure (% of GNI),Adjusted savings: education expenditure (current US$),"Compulsory education, duration (years)","Educational attainment, Doctoral or equivalent, population 25+, total (%) (cumulative)","Educational attainment, at least Bachelor's or equivalent, population 25+, total (%) (cumulative)","Educational attainment, at least Master's or equivalent, population 25+, total (%) (cumulative)",...,Mobile cellular subscriptions (per 100 people),"School enrollment, primary (% gross)","School enrollment, secondary (% gross)","School enrollment, secondary (% net)","School enrollment, tertiary (% gross)","Secondary education, duration (years)","Secondary education, pupils",Secure Internet servers,Region,Digital Divide Region
0,1990,Afghanistan,AFG,,1.953448,,,,,,...,0.000000,29.041420,10.849960,,2.211410,6.0,182340.0,,South Asia,Asia & Oceania
1,1990,Africa Eastern and Southern,AFE,,4.426717,,,,,,...,0.002587,,,,,6.0,,,,
2,1990,Africa Western and Central,AFW,,2.615379,,,,,,...,0.000000,,,,,7.0,,,,
3,1990,Albania,ALB,100.000000,2.800000,5.674630e+07,,,,,...,0.000000,99.451752,90.081741,,8.366360,8.0,475074.0,,Europe & Central Asia,Europe & North America
4,1990,Algeria,DZA,,4.946261,2.957000e+09,,,,,...,0.001825,92.593437,59.414188,,10.293140,6.0,2162469.0,,Middle East & North Africa,Africa & South America & Middle East
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8206,2020,West Bank and Gaza,PSE,100.000000,,,10.0,0.26999,21.708691,1.94667,...,83.783026,96.427193,91.027290,,43.097672,8.0,806276.0,2573.0,Middle East & North Africa,Asia & Oceania
8207,2020,World,WLD,90.521569,3.872891,,10.0,,,,...,106.152775,102.407066,76.750740,,40.244061,6.0,613156032.0,89189073.0,,
8208,2020,"Yemen, Rep.",YEM,73.757927,3.600000,,9.0,,,,...,50.888548,,,,,6.0,,169.0,Middle East & North Africa,Africa & South America & Middle East
8209,2020,Zambia,ZMB,44.524475,3.563864,6.273000e+08,7.0,,,,...,103.917835,,,,,5.0,,745.0,Sub-Saharan Africa,Africa & South America & Middle East


In [49]:
# Save as pickle file. This datatype preserves all relevant information of the dataframe.
df_wide.to_pickle("df_wide.pkl")
df_all_countries.to_pickle("df_all_countries.pkl")
df_all_countries_clean.to_pickle("df_all_countries_clean.pkl")