In [29]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

In [30]:
# Read the data
# Every indicator workbook is read the same way: skip the first three rows,
# then keep column 0 (Country Name) plus columns 24-64 (the yearly values that
# the previews below show running from 1980 to 2020).
# Paths use forward slashes: the previous 'dataset\API...' literals relied on
# "\A" being an unrecognised escape sequence and only resolved on Windows.
DATA_DIR = 'dataset'


def load_indicator(filename):
    """Load one indicator workbook from DATA_DIR as a wide country-by-year table.

    Parameters
    ----------
    filename : str
        Name of the .xls file inside DATA_DIR.

    Returns
    -------
    pandas.DataFrame
        One row per entry in the sheet, with 'Country Name' followed by the
        selected year columns.
    """
    return pd.read_excel('{}/{}'.format(DATA_DIR, filename), usecols=np.r_[0, 24:65], skiprows=3)


GDP_data = load_indicator('API_NY.GDP.MKTP.CD_DS2_en_excel_v2_4770502.xls')
literacy_rate_data = load_indicator('API_SE.ADT.LITR.ZS_DS2_en_excel_v2_4773710.xls')
mortality_rate_data = load_indicator('API_SP.DYN.IMRT.IN_DS2_en_excel_v2_4770604.xls')
population_data = load_indicator('API_SP.POP.TOTL_DS2_en_excel_v2_4770385.xls')
EVI_data = load_indicator('API_TX.VAL.MRCH.XD.WD_DS2_en_excel_v2_4774581.xls')

In [31]:
# Preview the first rows of the GDP table to check the 'Country Name' + year-column layout
GDP_data.head()

Unnamed: 0,Country Name,1980,1981,1982,1983,1984,1985,1986,1987,1988,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Aruba,,,,,,,405586600.0,487709500.0,596648000.0,...,2637989000.0,2615084000.0,2727933000.0,2791061000.0,2963128000.0,2983799000.0,3092179000.0,3202235000.0,3368970000.0,2610039000.0
1,Africa Eastern and Southern,170656100000.0,174388900000.0,167268100000.0,174919900000.0,160135700000.0,136298800000.0,152519900000.0,186146600000.0,204142000000.0,...,964180700000.0,972573400000.0,983472900000.0,1003768000000.0,924522800000.0,882721300000.0,1021119000000.0,1007240000000.0,1001017000000.0,927484500000.0
2,Afghanistan,3641723000.0,3478788000.0,,,,,,,,...,18190410000.0,20203570000.0,20564490000.0,20550580000.0,19998160000.0,18019560000.0,18896350000.0,18418850000.0,18904490000.0,20143440000.0
3,Africa Western and Central,112031300000.0,211003500000.0,187163700000.0,138115200000.0,114262700000.0,116507300000.0,107497500000.0,110321800000.0,108943500000.0,...,680456000000.0,736039900000.0,832216900000.0,892497900000.0,766958000000.0,690545400000.0,683748000000.0,766359700000.0,794719100000.0,784799700000.0
4,Angola,5930503000.0,5550483000.0,5550483000.0,5784342000.0,6131475000.0,7553560000.0,7072063000.0,8083872000.0,8769251000.0,...,109436600000.0,124998200000.0,133401600000.0,137244400000.0,87219300000.0,49840490000.0,68972770000.0,77792940000.0,69309110000.0,53619070000.0


In [32]:
# Preview the literacy-rate table; it has the same layout as the GDP table
literacy_rate_data.head()

Unnamed: 0,Country Name,1980,1981,1982,1983,1984,1985,1986,1987,1988,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Aruba,,,,,,,,,,...,,,,,,,,97.807419,,97.989998
1,Africa Eastern and Southern,,,,,,,,55.770672,56.088421,...,65.86454,67.391823,68.445107,68.827972,69.282494,70.059601,69.999451,70.42025,71.574051,71.889908
2,Afghanistan,,,,,,,,,,...,31.448851,,,,,,,,,
3,Africa Western and Central,,,,,,40.603149,40.878811,41.673779,42.446861,...,51.952728,52.603539,53.144989,54.186619,54.979179,55.564968,56.604252,59.61927,60.178661,60.234989
4,Angola,,,,,,,,,,...,,,,66.030113,,,,,,


In [33]:
# Get the list of country names
countries = GDP_data['Country Name'].unique().tolist()

# Column layout of the combined long-format table (one row per country and year)
long_columns = ['Country Name', 'Year', 'GDP', 'Literacy Rate', 'Mortality Rate', 'Population', 'Export Value Index']

# Year labels are taken from the GDP table's header (every column after 'Country Name')
years = GDP_data.columns[1:]


def _country_values(table, country):
    """Return the yearly values of the first row in `table` whose 'Country Name' is `country`."""
    return table.loc[table['Country Name'] == country].iloc[0, 1:].tolist()


# Build one small frame per country and concatenate once at the end.
# Concatenating inside the loop re-copies the growing frame on every iteration
# and starts from an empty frame, which pandas deprecates for dtype inference.
country_frames = []
for country in countries:
    # Extract the data for the country from each data set
    GDP = _country_values(GDP_data, country)
    literacy_rate = _country_values(literacy_rate_data, country)
    mortality_rate = _country_values(mortality_rate_data, country)
    population = _country_values(population_data, country)
    EVI = _country_values(EVI_data, country)

    # Combine the data into a list of (country, year, indicators...) tuples
    data = list(zip([country] * len(GDP), years, GDP, literacy_rate, mortality_rate, population, EVI))
    country_frames.append(pd.DataFrame(data, columns=long_columns))

# Rows stay grouped by country, in year order. Each per-country frame keeps its
# own 0..N-1 index, so the combined index repeats for every country.
df = pd.concat(country_frames) if country_frames else pd.DataFrame(columns=long_columns)

# Print the data for a specific country (here: United States)
print(df.loc[df['Country Name'] == 'United States'].head())

    Country Name  Year           GDP  Literacy Rate  Mortality Rate  \
0  United States  1980  2.857307e+12            NaN            12.6   
1  United States  1981  3.207041e+12            NaN            12.1   
2  United States  1982  3.343789e+12            NaN            11.7   
3  United States  1983  3.634038e+12            NaN            11.3   
4  United States  1984  4.037613e+12            NaN            10.9   

    Population  Export Value Index  
0  227225000.0           31.375503  
1  229466000.0           30.850243  
2  231664000.0           29.251602  
3  233792000.0           28.971391  
4  235825000.0           30.830603  


In [34]:
# Cast the year labels to integers so they can be compared numerically
df['Year'] = df['Year'].astype(int)

# 'GDP in Ten Years' column: the GDP values of the 1990-2020 rows, kept in
# df's row order and re-indexed from 0
model_GDP = df[['Year', 'GDP']]
row_GDP = model_GDP['Year'].between(1990, 2020)
model_data_GDP = (
    model_GDP.loc[row_GDP, ['GDP']]
    .rename(columns={'GDP': 'GDP in Ten Years'})
    .reset_index(drop=True)
)
model_data_GDP

Unnamed: 0,GDP in Ten Years
0,7.648045e+08
1,8.720670e+08
2,9.586592e+08
3,1.083240e+09
4,1.245810e+09
...,...
8241,2.054868e+10
8242,1.758489e+10
8243,3.415607e+10
8244,2.183223e+10


In [35]:

# Feature rows: every indicator except GDP, for the years 1980-2010
model_data = df.drop(columns=['GDP'])
row_index = (model_data['Year'] >= 1980) & (model_data['Year'] <= 2010)
columns = ['Country Name', 'Year', 'Literacy Rate', 'Population', 'Export Value Index', 'Mortality Rate']
model_data_range = model_data.loc[row_index, columns].reset_index(drop=True)

# Target: the GDP observed ten years after each feature row.
# Shifting the GDP table's Year back by 10 lets it be joined to the features on
# (Country Name, Year). A keyed join does not depend on both tables happening to
# share the same row order, which pairing the rows by position silently did.
gdp_ten_years_later = (
    df[['Country Name', 'Year', 'GDP']]
    .assign(Year=lambda frame: frame['Year'] - 10)
    .rename(columns={'GDP': 'GDP in Ten Years'})
)

# how='left' keeps every feature row in its existing order; validate raises if a
# (Country Name, Year) pair is duplicated on either side instead of fanning out rows.
train_data = model_data_range.merge(
    gdp_ten_years_later,
    on=['Country Name', 'Year'],
    how='left',
    validate='one_to_one',
)
train_data

Unnamed: 0,Country Name,Year,Literacy Rate,Population,Export Value Index,Mortality Rate,GDP in Ten Years
0,Aruba,1980,,62267.0,,,7.648045e+08
1,Aruba,1981,,62614.0,,,8.720670e+08
2,Aruba,1982,,63116.0,,,9.586592e+08
3,Aruba,1983,,63683.0,,,1.083240e+09
4,Aruba,1984,,64174.0,,,1.245810e+09
...,...,...,...,...,...,...,...
8241,Zimbabwe,2006,,12330490.0,103.896104,53.4,2.054868e+10
8242,Zimbabwe,2007,,12450568.0,124.675325,54.6,1.758489e+10
8243,Zimbabwe,2008,,12550347.0,114.285714,54.5,3.415607e+10
8244,Zimbabwe,2009,,12679810.0,117.864831,54.0,2.183223e+10


In [36]:
# Feature rows for 2001-2020, with the same feature columns as the training table
columns = ['Country Name', 'Year', 'Literacy Rate', 'Population', 'Export Value Index', 'Mortality Rate']
row_index = model_data['Year'].between(2001, 2020)
test_data = model_data.loc[row_index, columns].reset_index(drop=True)
test_data

Unnamed: 0,Country Name,Year,Literacy Rate,Population,Export Value Index,Mortality Rate
0,Aruba,2001,,90691.0,96.021747,
1,Aruba,2002,,91781.0,58.958162,
2,Aruba,2003,,92701.0,81.308807,
3,Aruba,2004,,93540.0,136.105004,
4,Aruba,2005,,94483.0,175.001625,
...,...,...,...,...,...,...
5315,Zimbabwe,2016,,14452704.0,173.230130,40.8
5316,Zimbabwe,2017,,14751101.0,180.807792,39.9
5317,Zimbabwe,2018,,15052184.0,210.771948,38.8
5318,Zimbabwe,2019,,15354608.0,221.766753,38.1


In [37]:
# Column dtypes and non-null counts of the training table
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8246 entries, 0 to 8245
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Country Name        8246 non-null   object 
 1   Year                8246 non-null   int32  
 2   Literacy Rate       1726 non-null   float64
 3   Population          8205 non-null   float64
 4   Export Value Index  5270 non-null   float64
 5   Mortality Rate      6923 non-null   float64
 6   GDP in Ten Years    7708 non-null   float64
dtypes: float64(5), int32(1), object(1)
memory usage: 418.9+ KB


In [38]:
# Number of missing values in each column
train_data.isna().sum()

Country Name             0
Year                     0
Literacy Rate         6520
Population              41
Export Value Index    2976
Mortality Rate        1323
GDP in Ten Years       538
dtype: int64

In [39]:
# Replace missing values in each numeric column with that column's mean.
# The result is assigned back to the frame rather than calling
# `train_data[col].fillna(..., inplace=True)`: that form modifies a temporary
# column object, warns on recent pandas, and does not update the frame under
# Copy-on-Write.
impute_columns = ['Literacy Rate', 'Export Value Index', 'GDP in Ten Years', 'Mortality Rate', 'Population']
# fillna with a Series of means fills each column with the mean that shares its label
train_data[impute_columns] = train_data[impute_columns].fillna(train_data[impute_columns].mean())
# NOTE(review): this also mean-fills the target ('GDP in Ten Years') and uses
# statistics from all rows before any train/validation split - confirm that
# this is intended rather than dropping rows with a missing target.
train_data.head()

Unnamed: 0,Country Name,Year,Literacy Rate,Population,Export Value Index,Mortality Rate,GDP in Ten Years
0,Aruba,1980,72.533757,62267.0,130.832542,44.664662,764804500.0
1,Aruba,1981,72.533757,62614.0,130.832542,44.664662,872067000.0
2,Aruba,1982,72.533757,63116.0,130.832542,44.664662,958659200.0
3,Aruba,1983,72.533757,63683.0,130.832542,44.664662,1083240000.0
4,Aruba,1984,72.533757,64174.0,130.832542,44.664662,1245810000.0


In [40]:
# Encoding Country Name
# Fit the encoder once, on the training labels, and reuse that mapping for the
# test table. Re-fitting on test_data would build a second, independent mapping
# that only matches the first when both tables hold exactly the same names.
# transform() raises ValueError on a label that was not seen during fit, so a
# mismatch surfaces immediately instead of producing inconsistent codes.
le = LabelEncoder()
le.fit(train_data['Country Name'])
train_data['Country Name'] = le.transform(train_data['Country Name'])
test_data['Country Name'] = le.transform(test_data['Country Name'])

In [41]:
# List the labels known to the encoder: position i is the label encoded as i.
# Reading classes_ avoids hard-coding how many labels there are.
list(le.classes_)

['Afghanistan',
 'Africa Eastern and Southern',
 'Africa Western and Central',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Arab World',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas, The',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'British Virgin Islands',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Caribbean small states',
 'Cayman Islands',
 'Central African Republic',
 'Central Europe and the Baltics',
 'Chad',
 'Channel Islands',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Congo, Dem. Rep.',
 'Congo, Rep.',
 'Costa Rica',
 "Cote d'Ivoire",
 'Croatia',
 'Cuba',
 'Curacao',
 'Cyprus',
 'Czechia',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Early-demographic dividend',
 'East

In [42]:
# Separate the regression target from the predictor columns
y = train_data['GDP in Ten Years']
X = train_data.drop(columns='GDP in Ten Years')

# Report the shape and container type of both pieces
print(
    "Dimension of X  = {}\nType of X  = {}\n\nDimension of y  = {}\nType of y  = {}".format(
        X.shape, type(X), y.shape, type(y)
    )
)

Dimension of X  = (8246, 6)
Type of X  = <class 'pandas.core.frame.DataFrame'>

Dimension of y  = (8246,)
Type of y  = <class 'pandas.core.series.Series'>


In [43]:
# dataMap = sns.heatmap(X.corr(), vmin= -1, vmax= 1, annot= True, cmap='BrBG')

In [44]:
# Hold out 20% of the rows for validation, stratified on the encoded country
# column so every country appears in both parts in the same proportion.
# (train_test_split is already imported in the notebook's import cell.)
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=298,
    stratify=X['Country Name'],
)
X_train, X_test, y_train, y_test = split_parts
X_train.shape, X_test.shape


((6596, 6), (1650, 6))

In [45]:
# # Create a training and a validation set
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=298)
# X_train.shape, X_test.shape

In [46]:
# Candidate regressors, keyed by the label shown in the results table
init_models = {
    'Linear Regression': LinearRegression(),
    'Random forest': RandomForestRegressor(random_state=64),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=79),
}

# For each candidate: fit it on the training split, then record the mean of its
# 5-fold cross-validation scores on that same split (the estimator's default
# score, which is R^2 for these regressors).
models_names = []
R2 = []
for key, model in init_models.items():
    model.fit(X_train, y_train)
    fold_scores = cross_val_score(model, X_train, y_train, cv=5)
    models_names.append(key)
    R2.append(np.mean(fold_scores))

# One row per model
models_scores = pd.DataFrame({'model name': models_names, 'R2 score': R2})
models_scores.head(7)

Unnamed: 0,model name,R2 score
0,Linear Regression,0.455494
1,Random forest,0.975512
2,Gradient Boosting Regressor,0.95716


In [None]:
# init_models = {
#     'Linear Regression': LinearRegression(),
#     'Random Forest': RandomForestRegressor(random_state=64),
#     'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=79)
# }

# # Fit each model and calculate R2 score on training and validation sets
# R2_train = []
# R2_test = []
# models_names = []
# for key, model in init_models.items():
#     model.fit(X_train, y_train)
#     models_names.append(key)
#     y_pred_train = model.predict(X_train)
#     y_pred_test = model.predict(X_test)
#     R2_train.append(r2_score(y_train, y_pred_train))
#     R2_test.append(r2_score(y_test, y_pred_test))

# # Create a dataframe with the R2 scores for each model
# models_scores = pd.DataFrame({'Model Name': models_names, 'R2 Train': R2_train, 'R2 Test': R2_test})
# models_scores.head()