In [97]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

In [98]:
# Read the data
GDP_data = pd.read_excel('dataset\API_NY.GDP.MKTP.CD_DS2_en_excel_v2_4770502.xls', usecols=np.r_[0, 24:65], skiprows=3)
literacy_rate_data = pd.read_excel('dataset\API_SE.ADT.LITR.ZS_DS2_en_excel_v2_4773710.xls', usecols=np.r_[0, 24:65], skiprows=3)
mortality_rate_data = pd.read_excel('dataset\API_SP.DYN.IMRT.IN_DS2_en_excel_v2_4770604.xls', usecols=np.r_[0, 24:65], skiprows=3)
population_data = pd.read_excel('dataset\API_SP.POP.TOTL_DS2_en_excel_v2_4770385.xls', usecols=np.r_[0, 24:65], skiprows=3)
EVI_data = pd.read_excel('dataset\API_TX.VAL.MRCH.XD.WD_DS2_en_excel_v2_4774581.xls', usecols=np.r_[0, 24:65], skiprows=3)

In [99]:
GDP_data.head()

Unnamed: 0,Country Name,1980,1981,1982,1983,1984,1985,1986,1987,1988,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Aruba,,,,,,,405586600.0,487709500.0,596648000.0,...,2637989000.0,2615084000.0,2727933000.0,2791061000.0,2963128000.0,2983799000.0,3092179000.0,3202235000.0,3368970000.0,2610039000.0
1,Africa Eastern and Southern,170656100000.0,174388900000.0,167268100000.0,174919900000.0,160135700000.0,136298800000.0,152519900000.0,186146600000.0,204142000000.0,...,964180700000.0,972573400000.0,983472900000.0,1003768000000.0,924522800000.0,882721300000.0,1021119000000.0,1007240000000.0,1001017000000.0,927484500000.0
2,Afghanistan,3641723000.0,3478788000.0,,,,,,,,...,18190410000.0,20203570000.0,20564490000.0,20550580000.0,19998160000.0,18019560000.0,18896350000.0,18418850000.0,18904490000.0,20143440000.0
3,Africa Western and Central,112031300000.0,211003500000.0,187163700000.0,138115200000.0,114262700000.0,116507300000.0,107497500000.0,110321800000.0,108943500000.0,...,680456000000.0,736039900000.0,832216900000.0,892497900000.0,766958000000.0,690545400000.0,683748000000.0,766359700000.0,794719100000.0,784799700000.0
4,Angola,5930503000.0,5550483000.0,5550483000.0,5784342000.0,6131475000.0,7553560000.0,7072063000.0,8083872000.0,8769251000.0,...,109436600000.0,124998200000.0,133401600000.0,137244400000.0,87219300000.0,49840490000.0,68972770000.0,77792940000.0,69309110000.0,53619070000.0


In [100]:
literacy_rate_data.head()

Unnamed: 0,Country Name,1980,1981,1982,1983,1984,1985,1986,1987,1988,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Aruba,,,,,,,,,,...,,,,,,,,97.807419,,97.989998
1,Africa Eastern and Southern,,,,,,,,55.770672,56.088421,...,65.86454,67.391823,68.445107,68.827972,69.282494,70.059601,69.999451,70.42025,71.574051,71.889908
2,Afghanistan,,,,,,,,,,...,31.448851,,,,,,,,,
3,Africa Western and Central,,,,,,40.603149,40.878811,41.673779,42.446861,...,51.952728,52.603539,53.144989,54.186619,54.979179,55.564968,56.604252,59.61927,60.178661,60.234989
4,Angola,,,,,,,,,,...,,,,66.030113,,,,,,


In [101]:
# Provided country
country_name = 'United States'
# Get the list of country names
countries = GDP_data['Country Name'].unique().tolist()

# Create a dataframe to store the data for all countries
df = pd.DataFrame(columns=['Country Name', 'Year', 'GDP', 'Literacy Rate', 'Mortality Rate', 'Population', 'Export Value Index'])
for country in countries:
    # Extract the data for the country from each data set
    GDP = GDP_data.loc[GDP_data['Country Name'] == country].iloc[0, 1:].tolist()
    literacy_rate = literacy_rate_data.loc[literacy_rate_data['Country Name'] == country].iloc[0, 1:].tolist()
    mortality_rate = mortality_rate_data.loc[mortality_rate_data['Country Name'] == country].iloc[0, 1:].tolist()
    population = population_data.loc[population_data['Country Name'] == country].iloc[0, 1:].tolist()
    EVI = EVI_data.loc[EVI_data['Country Name'] == country].iloc[0, 1:].tolist()
    
    # Combine the data into a list of tuples
    data = list(zip([country]*len(GDP), GDP_data.columns[1:], GDP, literacy_rate, mortality_rate, population, EVI))
    
    # Append the data to the dataframe
    # df = df.append(pd.DataFrame(data, columns=df.columns))
    df = pd.concat([df, pd.DataFrame(data, columns=df.columns)])

# Print the data for a specific country (e.g., Canada)
# print(df)
print(df.loc[df['Country Name'] == country_name].head())

    Country Name  Year           GDP  Literacy Rate  Mortality Rate  \
0  United States  1980  2.857307e+12            NaN            12.6   
1  United States  1981  3.207041e+12            NaN            12.1   
2  United States  1982  3.343789e+12            NaN            11.7   
3  United States  1983  3.634038e+12            NaN            11.3   
4  United States  1984  4.037613e+12            NaN            10.9   

    Population  Export Value Index  
0  227225000.0           31.375503  
1  229466000.0           30.850243  
2  231664000.0           29.251602  
3  233792000.0           28.971391  
4  235825000.0           30.830603  


In [102]:
model_data = df.loc[df['Country Name'] == country_name]
model_data.head()

Unnamed: 0,Country Name,Year,GDP,Literacy Rate,Mortality Rate,Population,Export Value Index
0,United States,1980,2857307000000.0,,12.6,227225000.0,31.375503
1,United States,1981,3207041000000.0,,12.1,229466000.0,30.850243
2,United States,1982,3343789000000.0,,11.7,231664000.0,29.251602
3,United States,1983,3634038000000.0,,11.3,233792000.0,28.971391
4,United States,1984,4037613000000.0,,10.9,235825000.0,30.830603


In [103]:
model_data['Year'] = model_data['Year'].astype(int)
model_GDP = model_data[['Year', 'GDP']]
row_GDP = (model_GDP['Year'] >= 1990) & (model_GDP['Year'] <= 2020)
model_data_GDP = model_GDP.loc[row_GDP, ['GDP']].reset_index(drop=True)
model_data_GDP = model_data_GDP.rename(columns={'GDP': 'GDP in Ten Years'})
model_data_GDP

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data['Year'] = model_data['Year'].astype(int)


Unnamed: 0,GDP in Ten Years
0,5963144000000.0
1,6158129000000.0
2,6520327000000.0
3,6858559000000.0
4,7287236000000.0
5,7639749000000.0
6,8073122000000.0
7,8577554000000.0
8,9062818000000.0
9,9631174000000.0


In [104]:

model_data = model_data.drop(columns=['GDP'])
row_index = (model_data['Year'] >= 1980) & (model_data['Year'] <= 2010)
columns = ['Year', 'Literacy Rate', 'Population', 'Export Value Index', 'Mortality Rate']
model_data_range = model_data.loc[row_index, columns]
model_data_range

train_data = pd.concat([model_data_range, model_data_GDP], axis=1)
train_data.head()

Unnamed: 0,Year,Literacy Rate,Population,Export Value Index,Mortality Rate,GDP in Ten Years
0,1980,,227225000.0,31.375503,12.6,5963144000000.0
1,1981,,229466000.0,30.850243,12.1,6158129000000.0
2,1982,,231664000.0,29.251602,11.7,6520327000000.0
3,1983,,233792000.0,28.971391,11.3,6858559000000.0
4,1984,,235825000.0,30.830603,10.9,7287236000000.0


In [105]:
row_index = (model_data['Year'] >= 2011) & (model_data['Year'] <= 2020)
columns = ['Year', 'Literacy Rate', 'Population', 'Export Value Index', 'Mortality Rate']
test_data = model_data.loc[row_index, columns].reset_index(drop=True)
test_data

Unnamed: 0,Year,Literacy Rate,Population,Export Value Index,Mortality Rate
0,2011,,311583481.0,189.598909,6.1
1,2012,,313877662.0,197.680959,6.0
2,2013,,316059947.0,202.01516,6.0
3,2014,,318386329.0,207.250875,5.9
4,2015,,320738994.0,192.164933,5.8
5,2016,,323071755.0,185.570699,5.7
6,2017,,325122128.0,197.753857,5.7
7,2018,,326838199.0,212.807775,5.6
8,2019,,328329953.0,210.144913,5.5
9,2020,,331501080.0,183.089582,5.4


In [106]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31 entries, 0 to 30
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Year                31 non-null     int32  
 1   Literacy Rate       0 non-null      float64
 2   Population          31 non-null     float64
 3   Export Value Index  31 non-null     float64
 4   Mortality Rate      31 non-null     float64
 5   GDP in Ten Years    31 non-null     float64
dtypes: float64(5), int32(1)
memory usage: 1.6 KB


In [107]:
train_data.isna().sum()

Year                   0
Literacy Rate         31
Population             0
Export Value Index     0
Mortality Rate         0
GDP in Ten Years       0
dtype: int64

In [108]:
# Fill the missing values in the training set
train_data['Literacy Rate'].fillna((train_data['Literacy Rate'].mean()), inplace=True)
train_data['Export Value Index'].fillna((train_data['Export Value Index'].mean()), inplace=True)
train_data['GDP in Ten Years'].fillna((train_data['GDP in Ten Years'].mean()), inplace=True)
train_data['Mortality Rate'].fillna((train_data['Mortality Rate'].mean()), inplace=True)
train_data['Population'].fillna((train_data['Population'].mean()), inplace=True)
train_data.dropna(axis=1, inplace=True)
train_data.head()

Unnamed: 0,Year,Population,Export Value Index,Mortality Rate,GDP in Ten Years
0,1980,227225000.0,31.375503,12.6,5963144000000.0
1,1981,229466000.0,30.850243,12.1,6158129000000.0
2,1982,231664000.0,29.251602,11.7,6520327000000.0
3,1983,233792000.0,28.971391,11.3,6858559000000.0
4,1984,235825000.0,30.830603,10.9,7287236000000.0


In [109]:
# Fill the missing values in the test set
test_data['Literacy Rate'].fillna((test_data['Literacy Rate'].mean()), inplace=True)
test_data['Export Value Index'].fillna((test_data['Export Value Index'].mean()), inplace=True)
test_data['Mortality Rate'].fillna((test_data['Mortality Rate'].mean()), inplace=True)
test_data['Population'].fillna((test_data['Population'].mean()), inplace=True)
test_data.dropna(axis=1, inplace=True)
test_data.head()

Unnamed: 0,Year,Population,Export Value Index,Mortality Rate
0,2011,311583481.0,189.598909,6.1
1,2012,313877662.0,197.680959,6.0
2,2013,316059947.0,202.01516,6.0
3,2014,318386329.0,207.250875,5.9
4,2015,320738994.0,192.164933,5.8


In [110]:
# Drop the target feature from the train data
X = train_data.drop('GDP in Ten Years', axis=1)
y = train_data['GDP in Ten Years']

# Shape and dimension
print("Dimension of X  = {}\nType of X  = {}\n\nDimension of y  = {}\nType of y  = {}".format(X.shape, type(X), y.shape, type(y)))

Dimension of X  = (31, 4)
Type of X  = <class 'pandas.core.frame.DataFrame'>

Dimension of y  = (31,)
Type of y  = <class 'pandas.core.series.Series'>


In [111]:
# dataMap = sns.heatmap(X.corr(), vmin= -1, vmax= 1, annot= True, cmap='BrBG')

In [112]:
# Create a training and a validation set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=298)
X_train.shape, X_test.shape

((24, 4), (7, 4))

In [113]:
# Train the machine learning model
init_models = { 'Linear Regression': LinearRegression(),
                'Random forest': RandomForestRegressor(random_state=64),
                'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=79),
               }
R2 = []
models_names = []
for i, (key,model) in enumerate(init_models.items()):
    model.fit(X_train, y_train)
    models_names.append(key)
    R2.append(np.mean(cross_val_score(model, X_train, y_train, cv=5)))
models_scores = pd.DataFrame({'Model Name': models_names, 'R2 Score': R2})
models_scores.head()

Unnamed: 0,Model Name,R2 Score
0,Linear Regression,0.993302
1,Random forest,0.983854
2,Gradient Boosting Regressor,0.972722


In [114]:
# Choose the best model with the highest R2 score
models_scores.sort_values('R2 Score', ascending=False, inplace=True)
best_model = models_scores.iloc[0]

print('Best Model:')
print(best_model)

Best Model:
Model Name    Linear Regression
R2 Score               0.993302
Name: 0, dtype: object


In [115]:
best_model_name = best_model['Model Name']
best_model = init_models[best_model_name]
best_model.fit(X_train, y_train)

In [116]:
R2_train = []
R2_test = []
y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_test)
R2_train.append(r2_score(y_train, y_pred_train))
R2_test.append(r2_score(y_test, y_pred_test))

# Create a dataframe with the R2 scores for each model
models_scores = pd.DataFrame({'Model Name': best_model,'R2 Train': R2_train, 'R2 Test': R2_test})
models_scores

Unnamed: 0,Model Name,R2 Train,R2 Test
0,LinearRegression(),0.996173,0.995332


In [117]:
y_pred_test

array([1.27460601e+13, 1.01300464e+13, 6.21395132e+12, 1.56352773e+13,
       1.97631146e+13, 9.10954273e+12, 1.06186012e+13])

In [118]:
prediction = best_model.predict(test_data)
prediction

array([2.22987229e+13, 2.28697616e+13, 2.33914144e+13, 2.39165057e+13,
       2.40967054e+13, 2.44202852e+13, 2.50650738e+13, 2.57073535e+13,
       2.60284962e+13, 2.60719893e+13])

In [119]:
# Create a conclusion dataframe and append 'Year' columns
conclusion = pd.DataFrame()
conclusion['Year'] = test_data['Year'] + 10
conclusion['GDP'] = prediction
conclusion

Unnamed: 0,Year,GDP
0,2021,22298720000000.0
1,2022,22869760000000.0
2,2023,23391410000000.0
3,2024,23916510000000.0
4,2025,24096710000000.0
5,2026,24420290000000.0
6,2027,25065070000000.0
7,2028,25707350000000.0
8,2029,26028500000000.0
9,2030,26071990000000.0


In [120]:
# Save model prediction as a csv file
conclusion.to_csv('prediction.csv', index=False)