In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from time import sleep
from tqdm import tqdm
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, scale
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LassoCV

In [2]:
# Load the cleaned training data. The CSV carries a leftover index column
# ('Unnamed: 0'), which is discarded as part of the same expression.
df = pd.read_csv('Cleaned_Training_Set.csv', header=0).drop(columns='Unnamed: 0')
df

Unnamed: 0,country,year,World bank region,"World bank, 4 income groups 2017",Adjusted savings: mineral depletion (current US$),Adjusted savings: energy depletion (current US$),Population (Total),Population growth (annual %),Population density (people per sq. km of land area),Rural population (% of total population),Urban population (% of total),Rural population,Urban population,Urban population growth (annual %),Urban population 5 years in the future
0,Afghanistan,1972,South Asia,Low income,0.000000e+00,2.233065e+07,11644377.0,2.794796,17.853176,88.0702,11.9298,10255226.0,1389151.0,6.634620,1855690
1,Afghanistan,1973,South Asia,Low income,0.000000e+00,2.654146e+07,11966352.0,2.727531,18.346829,87.6208,12.3792,10485013.0,1481339.0,6.425364,1942479
2,Afghanistan,1974,South Asia,Low income,0.000000e+00,6.606462e+07,12273589.0,2.535101,18.817885,87.1714,12.8286,10699059.0,1574530.0,6.101041,2014500
3,Afghanistan,1975,South Asia,Low income,0.000000e+00,6.882184e+07,12551790.0,2.241357,19.244423,86.7220,13.2780,10885163.0,1666627.0,5.684501,2066033
4,Afghanistan,1976,South Asia,Low income,0.000000e+00,6.463635e+07,12806810.0,2.011378,19.635420,86.2426,13.7574,11044926.0,1761884.0,5.558187,2064276
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6466,Zimbabwe,1998,Sub-Saharan Africa,Low income,3.574371e+06,2.922644e+06,12229500.0,1.513394,31.613028,67.0524,32.9476,8200173.0,4029327.0,2.750848,4438019
6467,Zimbabwe,1999,Sub-Saharan Africa,Low income,1.121581e+07,0.000000e+00,12384727.0,1.261295,32.014287,66.6472,33.3528,8254074.0,4130653.0,2.483614,4498365
6468,Zimbabwe,2000,Sub-Saharan Africa,Low income,2.259094e+07,1.635434e+06,12503652.0,0.955674,32.321706,66.2420,33.7580,8282669.0,4220983.0,2.163253,4558017
6469,Zimbabwe,2001,Sub-Saharan Africa,Low income,1.491629e+07,2.523939e+07,12586763.0,0.662494,32.536546,65.8216,34.1784,8284809.0,4301954.0,1.900130,4620705


In [3]:
# Every row in the preview above is "Low income", so pick a row (position 50) from a
# different category. It is re-checked after the mapping to confirm the ordinal income
# feature is encoded correctly.
df['World bank, 4 income groups 2017'].iloc[50]

'Upper middle income'

In order to use categorical features in our model, we will need to assign them a numerical value, which is done in the two cells below.

In [4]:
# Ordinal encoding for the income groups: the position in the list (poorest first)
# becomes the numeric code, giving Low=0 ... High=3.
mapper = {
    label: rank
    for rank, label in enumerate(
        ['Low income', 'Lower middle income', 'Upper middle income', 'High income']
    )
}
df['World bank, 4 income groups 2017'] = (
    df['World bank, 4 income groups 2017'].replace(to_replace=mapper)
)

In [5]:
# Display the frame again to confirm the income-group column now holds the ordinal codes
df

Unnamed: 0,country,year,World bank region,"World bank, 4 income groups 2017",Adjusted savings: mineral depletion (current US$),Adjusted savings: energy depletion (current US$),Population (Total),Population growth (annual %),Population density (people per sq. km of land area),Rural population (% of total population),Urban population (% of total),Rural population,Urban population,Urban population growth (annual %),Urban population 5 years in the future
0,Afghanistan,1972,South Asia,0,0.000000e+00,2.233065e+07,11644377.0,2.794796,17.853176,88.0702,11.9298,10255226.0,1389151.0,6.634620,1855690
1,Afghanistan,1973,South Asia,0,0.000000e+00,2.654146e+07,11966352.0,2.727531,18.346829,87.6208,12.3792,10485013.0,1481339.0,6.425364,1942479
2,Afghanistan,1974,South Asia,0,0.000000e+00,6.606462e+07,12273589.0,2.535101,18.817885,87.1714,12.8286,10699059.0,1574530.0,6.101041,2014500
3,Afghanistan,1975,South Asia,0,0.000000e+00,6.882184e+07,12551790.0,2.241357,19.244423,86.7220,13.2780,10885163.0,1666627.0,5.684501,2066033
4,Afghanistan,1976,South Asia,0,0.000000e+00,6.463635e+07,12806810.0,2.011378,19.635420,86.2426,13.7574,11044926.0,1761884.0,5.558187,2064276
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6466,Zimbabwe,1998,Sub-Saharan Africa,0,3.574371e+06,2.922644e+06,12229500.0,1.513394,31.613028,67.0524,32.9476,8200173.0,4029327.0,2.750848,4438019
6467,Zimbabwe,1999,Sub-Saharan Africa,0,1.121581e+07,0.000000e+00,12384727.0,1.261295,32.014287,66.6472,33.3528,8254074.0,4130653.0,2.483614,4498365
6468,Zimbabwe,2000,Sub-Saharan Africa,0,2.259094e+07,1.635434e+06,12503652.0,0.955674,32.321706,66.2420,33.7580,8282669.0,4220983.0,2.163253,4558017
6469,Zimbabwe,2001,Sub-Saharan Africa,0,1.491629e+07,2.523939e+07,12586763.0,0.662494,32.536546,65.8216,34.1784,8284809.0,4301954.0,1.900130,4620705


In [6]:
# Confirm that the 'Upper middle income' sample row (position 50) is now mapped to its code.
df['World bank, 4 income groups 2017'].iloc[50]

2

In [7]:
# One-hot encode the two nominal columns; every other column passes through unchanged.
df_dummies = pd.get_dummies(data=df, columns=['country', 'World bank region'])
df_dummies

Unnamed: 0,year,"World bank, 4 income groups 2017",Adjusted savings: mineral depletion (current US$),Adjusted savings: energy depletion (current US$),Population (Total),Population growth (annual %),Population density (people per sq. km of land area),Rural population (% of total population),Urban population (% of total),Rural population,...,"country_Yemen, Rep.",country_Zambia,country_Zimbabwe,World bank region_East Asia & Pacific,World bank region_Europe & Central Asia,World bank region_Latin America & Caribbean,World bank region_Middle East & North Africa,World bank region_North America,World bank region_South Asia,World bank region_Sub-Saharan Africa
0,1972,0,0.000000e+00,2.233065e+07,11644377.0,2.794796,17.853176,88.0702,11.9298,10255226.0,...,0,0,0,0,0,0,0,0,1,0
1,1973,0,0.000000e+00,2.654146e+07,11966352.0,2.727531,18.346829,87.6208,12.3792,10485013.0,...,0,0,0,0,0,0,0,0,1,0
2,1974,0,0.000000e+00,6.606462e+07,12273589.0,2.535101,18.817885,87.1714,12.8286,10699059.0,...,0,0,0,0,0,0,0,0,1,0
3,1975,0,0.000000e+00,6.882184e+07,12551790.0,2.241357,19.244423,86.7220,13.2780,10885163.0,...,0,0,0,0,0,0,0,0,1,0
4,1976,0,0.000000e+00,6.463635e+07,12806810.0,2.011378,19.635420,86.2426,13.7574,11044926.0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6466,1998,0,3.574371e+06,2.922644e+06,12229500.0,1.513394,31.613028,67.0524,32.9476,8200173.0,...,0,0,1,0,0,0,0,0,0,1
6467,1999,0,1.121581e+07,0.000000e+00,12384727.0,1.261295,32.014287,66.6472,33.3528,8254074.0,...,0,0,1,0,0,0,0,0,0,1
6468,2000,0,2.259094e+07,1.635434e+06,12503652.0,0.955674,32.321706,66.2420,33.7580,8282669.0,...,0,0,1,0,0,0,0,0,0,1
6469,2001,0,1.491629e+07,2.523939e+07,12586763.0,0.662494,32.536546,65.8216,34.1784,8284809.0,...,0,0,1,0,0,0,0,0,0,1


In [8]:
# Separate the target (y) from the predictors (X)
y = df_dummies['Urban population 5 years in the future']
X = df_dummies.drop(columns='Urban population 5 years in the future')

### Train-Test Split

In [9]:
# Seed passed to the train/test splits so they are reproducible
rand_state = 18

In [10]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=rand_state,
)

### Model 0: Basic Linear Regression--No Tuning

In [11]:
# Baseline: ordinary least squares on every predictor, with no scaling and no tuning.
# LinearRegression.score() is a method that returns R-squared for the data passed in:
# R-squared = 1 - (residual sum of squares / total sum of squares of the target)
reg0 = LinearRegression().fit(X_train, y_train)
y_pred_lin = reg0.predict(X_test)
reg0.score(X_test, y_test)

0.9993746420260124

### Scaling Data, Determining Parameters, and Setting Up Pipeline for Lasso

In [12]:
# Pipeline steps. Scaling comes first because feature magnitudes range from fractions
# (0 < x < 1) to values above 1,000,000,000 depending on the feature.
# With so many features (230), the working assumption is that not all of them matter and
# some matter more than others, so a Lasso regression is used to shrink the coefficients
# of "unimportant" features to 0.
steps = [
    ('scaler', StandardScaler()),
    ('lasso', Lasso(tol=0.01)),
]

In [13]:
# Chain the scaler and the Lasso estimator into a single pipeline
pipeline = Pipeline(steps=steps)

In [14]:
# Candidate values for the Lasso constant alpha: 8 log-spaced points from 1e-7 to 1e7.
# The 'lasso__' prefix routes the parameter to the pipeline step named 'lasso'.
param_grid = dict(lasso__alpha=np.logspace(start=-7, stop=7, num=8))

In [15]:
# Cross-validated grid search over alpha, ranked by negated RMSE (greater is better).
# `cv` is not passed, so scikit-learn's default fold scheme applies.
gscv = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
)

In [16]:
# Fitting the model to the training data: this runs the cross-validated search over every
# alpha listed in param_grid
gscv.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('lasso', Lasso(tol=0.01))]),
             param_grid={'lasso__alpha': array([1.e-07, 1.e-05, 1.e-03, 1.e-01, 1.e+01, 1.e+03, 1.e+05, 1.e+07])},
             scoring='neg_root_mean_squared_error')

In [17]:
# Using the fitted grid search to predict on the unseen test data
y_pred = gscv.predict(X_test)

In [18]:
# Report the alpha selected by the grid search and keep it in a variable for reference
print(gscv.best_params_)
chosen_alpha = gscv.best_params_['lasso__alpha']

{'lasso__alpha': 100000.0}


In [19]:
# Test-set score under the grid search's scorer ('neg_root_mean_squared_error'): this is the
# RMSE with its sign flipped, so the value is negative and closer to zero is better.
rmse_score = gscv.score(X_test, y_test) # negated RMSE on the held-out test set
rmse_score

-1750181.855973187

In [20]:
# Mean cross-validated score of the best_estimator, on the same negated-RMSE scale as the
# `scoring` argument given to GridSearchCV
gscv.best_score_

-2637290.9833209715

In [21]:
# Show the scorer object that the grid search built from its `scoring` argument
gscv.scorer_

make_scorer(mean_squared_error, greater_is_better=False, squared=False)

In [22]:
# Determining the best Lasso model: best_estimator_ is the full pipeline (scaler + Lasso)
# configured with the alpha selected by the grid search
my_model = gscv.best_estimator_
my_model

Pipeline(steps=[('scaler', StandardScaler()),
                ('lasso', Lasso(alpha=100000.0, tol=0.01))])

In [23]:
# Using the best model to predict on the data that was held out of the training set, to see
# how it behaves on unseen data. The result is only displayed, not stored.
my_model.predict(X_test)

array([ 1.01567154e+08,  5.88829976e+07,  3.38628512e+07, ...,
       -2.80803369e+05,  6.30175085e+06,  1.12235337e+07])

In [24]:
# Looking at Lasso coefficients: pull the fitted coefficient vector out of the pipeline's
# 'lasso' step. The 'scaler' step runs before it, so these coefficients apply to the
# standardized features rather than the raw column values.
lasso_coef = my_model.named_steps['lasso'].coef_
lasso_coef

array([-0.00000000e+00,  6.11134370e+04,  2.59377504e+05,  0.00000000e+00,
        3.27053033e+07,  0.00000000e+00, -0.00000000e+00, -3.84719380e+05,
        5.75779026e-10, -3.16607880e+06,  1.91551678e+07,  2.87253558e+04,
       -0.00000000e+00,  0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  7.12044853e+04,
        0.00000000e+00,  0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -9.15756760e+05,
        0.00000000e+00, -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -0.00000000e+00,  6.66815807e+04,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  5.84152168e+05,  0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00,  0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
       -9.80947632e+06,  

In [25]:
# Keep only the coefficients that Lasso did not shrink to zero
lasso_coef_num = lasso_coef[np.flatnonzero(lasso_coef)]
lasso_coef_num

array([ 6.11134370e+04,  2.59377504e+05,  3.27053033e+07, -3.84719380e+05,
        5.75779026e-10, -3.16607880e+06,  1.91551678e+07,  2.87253558e+04,
        7.12044853e+04, -9.15756760e+05,  6.66815807e+04,  5.84152168e+05,
       -9.80947632e+06, -1.23692843e+05, -1.83595269e+05, -4.70239209e+05,
       -9.07031627e+06, -9.79669302e+05,  2.83613576e+02, -2.64071885e+04,
        1.20511739e+05, -1.02362809e+05,  2.68202224e+04,  5.64663955e+04,
        1.75331332e+05, -2.96021997e+05, -3.01007067e+05, -6.36478731e+05,
       -8.19521579e+04, -1.74731283e+04, -1.72841656e+04,  5.43433325e+04,
       -1.27058252e+05, -3.55171610e+05, -7.21433213e+04, -3.01056328e+04,
        4.75083575e+05,  4.60927542e+04, -5.95153832e+05,  9.91983289e+04,
       -2.90255619e+05])

In [26]:
# Number of features that kept a non-zero Lasso coefficient
lasso_coef_num.size

41

In [27]:
# Column names of the features whose Lasso coefficient is non-zero (coefficient i belongs
# to column i of the training matrix)
lasso_coef_name = X_train.columns[np.flatnonzero(lasso_coef)]
lasso_coef_name

Index(['World bank, 4 income groups 2017',
       'Adjusted savings: mineral depletion (current US$)',
       'Population (Total)', 'Rural population (% of total population)',
       'Urban population (% of total)', 'Rural population', 'Urban population',
       'Urban population growth (annual %)', 'country_Argentina',
       'country_Bangladesh', 'country_Bhutan', 'country_Brazil',
       'country_China', 'country_Congo, Dem. Rep.', 'country_Egypt, Arab Rep.',
       'country_Ethiopia', 'country_India', 'country_Indonesia',
       'country_Iran, Islamic Rep.', 'country_Italy', 'country_Japan',
       'country_Kenya', 'country_Korea, Rep.', 'country_Maldives',
       'country_Mexico', 'country_Myanmar', 'country_Nigeria',
       'country_Pakistan', 'country_Philippines', 'country_Poland',
       'country_Romania', 'country_Saudi Arabia', 'country_Tanzania',
       'country_Thailand', 'country_Uganda', 'country_Ukraine',
       'country_United States', 'country_Venezuela, RB', 'country

In [28]:
# The selected names and the selected coefficients must pair up one-to-one
assert len(lasso_coef_name) == len(lasso_coef_num)

# Pair each selected column name with its Lasso coefficient
feature_coef_dict = dict(zip(lasso_coef_name, lasso_coef_num))
feature_coef_dict

{'World bank, 4 income groups 2017': 61113.43702069456,
 'Adjusted savings: mineral depletion (current US$)': 259377.50361377557,
 'Population (Total)': 32705303.313617587,
 'Rural population (% of total population)': -384719.37990910385,
 'Urban population (% of total)': 5.757790260373901e-10,
 'Rural population': -3166078.7962525734,
 'Urban population': 19155167.8151746,
 'Urban population growth (annual %)': 28725.355816635383,
 'country_Argentina': 71204.48525460706,
 'country_Bangladesh': -915756.7598416574,
 'country_Bhutan': 66681.58074782489,
 'country_Brazil': 584152.1675506077,
 'country_China': -9809476.318619,
 'country_Congo, Dem. Rep.': -123692.84282727749,
 'country_Egypt, Arab Rep.': -183595.26874985336,
 'country_Ethiopia': -470239.20930828166,
 'country_India': -9070316.274736281,
 'country_Indonesia': -979669.30216603,
 'country_Iran, Islamic Rep.': 283.6135758186534,
 'country_Italy': -26407.188457454682,
 'country_Japan': 120511.73914082162,
 'country_Kenya': -102

Above, we see that 8 features that weren't one-hot encoded were chosen. The model also selected two one-hot encoded regions and 31 one-hot encoded countries.

### Predicting the Population for 2008 using a Lasso model and data from 2003

In [29]:
# Load the 2003 data and discard its leftover index column ('Unnamed: 0')
df_2003 = pd.read_csv('data_2003.csv', header=0).drop(columns='Unnamed: 0')
df_2003

Unnamed: 0,country,year,World bank region,"World bank, 4 income groups 2017",Adjusted savings: mineral depletion (current US$),Adjusted savings: energy depletion (current US$),Population (Total),Population growth (annual %),Population density (people per sq. km of land area),Rural population (% of total population),Urban population (% of total),Rural population,Urban population,Urban population growth (annual %),Urban population 5 years in the future
0,Afghanistan,2003,South Asia,Low income,0.000000e+00,2.049705e+05,23116142.0,4.031248,35.441703,78.6376,21.3624,18177979.0,4938163.0,5.254873,720689.0
1,Albania,2003,Europe & Central Asia,Upper middle income,0.000000e+00,4.692363e+07,3033659.0,-0.583987,110.717482,55.2212,44.7788,1675223.0,1358436.0,1.703278,6178697.0
2,Algeria,2003,Middle East & North Africa,Upper middle income,1.792483e+07,1.460891e+10,33003442.0,1.312884,13.856862,35.6718,64.3282,11772922.0,21230520.0,3.164894,20277.0
3,American Samoa,2003,East Asia & Pacific,Upper middle income,0.000000e+00,0.000000e+00,59117.0,0.658489,295.585000,9.7238,90.2762,5748.0,53369.0,1.217826,22319.0
4,Andorra,2003,Europe & Central Asia,High income,0.000000e+00,0.000000e+00,75643.0,5.438529,160.942553,8.8650,91.1350,6706.0,68937.0,4.978505,1027593.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,Virgin Islands (U.S.),2003,Latin America & Caribbean,High income,0.000000e+00,0.000000e+00,108085.0,-0.113735,308.814286,6.4258,93.5742,6945.0,101140.0,0.225685,0.0
206,West Bank and Gaza,2003,Middle East & North Africa,Lower middle income,0.000000e+00,0.000000e+00,3154969.0,2.555248,524.081229,27.3758,72.6242,863698.0,2291271.0,2.856446,875444.0
207,"Yemen, Rep.",2003,Middle East & North Africa,Lower middle income,0.000000e+00,2.585782e+09,19081306.0,2.818175,36.140891,72.1316,27.8684,13763651.0,5317655.0,4.752199,1442678.0
208,Zambia,2003,Sub-Saharan Africa,Lower middle income,3.429587e+07,5.705385e+05,10894519.0,2.501029,14.655186,64.1120,35.8880,6984694.0,3909825.0,3.514839,1023611.0


In [30]:
# Apply the same ordinal income-group mapping that was used for the training data
df_2003['World bank, 4 income groups 2017'] = (
    df_2003['World bank, 4 income groups 2017'].replace(to_replace=mapper)
)

In [31]:
# Remove the target column from the 2003 frame: it is the quantity we will attempt to
# predict, so it must not be passed to the models as a feature.
# Rebinding the name (instead of inplace=True) together with errors='ignore' makes the cell
# idempotent: running it a second time no longer raises KeyError.
df_2003 = df_2003.drop(columns='Urban population 5 years in the future', errors='ignore')

In [32]:
# One-hot encode "country" and "World bank region" for the 2003 data, then force the result
# onto exactly the columns (and column order) of the training matrix X.
# get_dummies only creates columns for categories present in the frame it is given, so
# without the reindex a country or region missing from (or new in) the 2003 file would
# produce a feature matrix that no longer matches what the models were fitted on.
# Dummy columns absent from the 2003 data are filled with 0; columns unknown to X are dropped.
df_2003_X = (
    pd.get_dummies(df_2003, columns=['country', 'World bank region'])
    .reindex(columns=X.columns, fill_value=0)
)
df_2003_X

Unnamed: 0,year,"World bank, 4 income groups 2017",Adjusted savings: mineral depletion (current US$),Adjusted savings: energy depletion (current US$),Population (Total),Population growth (annual %),Population density (people per sq. km of land area),Rural population (% of total population),Urban population (% of total),Rural population,...,"country_Yemen, Rep.",country_Zambia,country_Zimbabwe,World bank region_East Asia & Pacific,World bank region_Europe & Central Asia,World bank region_Latin America & Caribbean,World bank region_Middle East & North Africa,World bank region_North America,World bank region_South Asia,World bank region_Sub-Saharan Africa
0,2003,0,0.000000e+00,2.049705e+05,23116142.0,4.031248,35.441703,78.6376,21.3624,18177979.0,...,0,0,0,0,0,0,0,0,1,0
1,2003,2,0.000000e+00,4.692363e+07,3033659.0,-0.583987,110.717482,55.2212,44.7788,1675223.0,...,0,0,0,0,1,0,0,0,0,0
2,2003,2,1.792483e+07,1.460891e+10,33003442.0,1.312884,13.856862,35.6718,64.3282,11772922.0,...,0,0,0,0,0,0,1,0,0,0
3,2003,2,0.000000e+00,0.000000e+00,59117.0,0.658489,295.585000,9.7238,90.2762,5748.0,...,0,0,0,1,0,0,0,0,0,0
4,2003,3,0.000000e+00,0.000000e+00,75643.0,5.438529,160.942553,8.8650,91.1350,6706.0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,2003,3,0.000000e+00,0.000000e+00,108085.0,-0.113735,308.814286,6.4258,93.5742,6945.0,...,0,0,0,0,0,1,0,0,0,0
206,2003,1,0.000000e+00,0.000000e+00,3154969.0,2.555248,524.081229,27.3758,72.6242,863698.0,...,0,0,0,0,0,0,1,0,0,0
207,2003,1,0.000000e+00,2.585782e+09,19081306.0,2.818175,36.140891,72.1316,27.8684,13763651.0,...,1,0,0,0,0,0,1,0,0,0
208,2003,1,3.429587e+07,5.705385e+05,10894519.0,2.501029,14.655186,64.1120,35.8880,6984694.0,...,0,1,0,0,0,0,0,0,0,1


In [33]:
# Predict the 2008 urban population from the 2003 rows with the tuned Lasso pipeline.
# Predictions are truncated to integers and then passed through an absolute value so that
# no negative counts remain.
# NOTE(review): the absolute value turns a negative prediction into a positive count of
# the same magnitude -- confirm this is preferred over clipping at zero.
pop_2008 = np.abs(my_model.predict(df_2003_X).astype(int))
pop_2008

array([  7342034,   1252175,  22897718,    267973,    395286,   9150224,
          358320,  34700451,   1867784,     80307,  20997582,   5666997,
         4716152,    682650,    949254,  46442945,     94187,   7198782,
         9822027,     40291,   3506274,    477315,    967089,   6250070,
         1722156,    931616, 159427507,    282063,   5666148,   4297525,
         1366530,     48963,   4308753,   9663248,  26454625,    742085,
         1318120,   3000816,    582845,  16841967, 556702361,  32089591,
          586290,  21672521,   1969144,   2844107,   9242341,   2540085,
         9064468,    684186,   7746551,   4694963,    538905,    151933,
         6557606,   9160492,  35755855,   4018829,    256075,    841152,
          932089,  19692222,    485280,     74792,   4420186,  49757416,
          165859,   1182672,    369117,   2250336,  62379798,  11403741,
         7212588,    190270,    306576,    447881,   6790973,   3815370,
           45067,    132802,   4550178,   3685068, 

In [34]:
# Dataframe of country and prediction.
# The labels are taken from df_2003 -- the frame whose rows were actually predicted -- so
# each prediction is paired with the country of the row that produced it. Labelling with
# the training frame's unique countries only works if both files happen to list the same
# countries in the same order.
country_pop_prediction = pd.DataFrame({
    'Country': df_2003['country'].to_numpy(),
    'Population in 2008': pop_2008,
})
country_pop_prediction

Unnamed: 0,Country,Population in 2008
0,Afghanistan,7342034
1,Albania,1252175
2,Algeria,22897718
3,American Samoa,267973
4,Andorra,395286
...,...,...
205,Virgin Islands (U.S.),666298
206,West Bank and Gaza,2276201
207,"Yemen, Rep.",8065133
208,Zambia,4978386


In [35]:
# Urban population for 2007: the "5 years in the future" target of each country's 2002 row
pop_2007 = df.loc[df['year'] == 2002, 'Urban population 5 years in the future']
pop_2007

30       5908765
61       1441361
92      24154838
123        53263
154        72597
          ...   
6364      101661
6377     2568098
6408     6367019
6439     4535925
6470     4684302
Name: Urban population 5 years in the future, Length: 210, dtype: int64

In [36]:
# Dataframe comparing urban population in 2007 and 2008 for each country.
# The 2007 values (from the training frame's 2002 rows) and the 2008 predictions (from the
# 2003 frame) come from different sources, so they are joined on the country name rather
# than paired by row position. validate='one_to_one' raises if a country appears twice on
# either side; countries missing from one side are left out of the comparison.
pop_2007_2008 = (
    pd.DataFrame({
        'Country': df.loc[pop_2007.index, 'country'].to_numpy(),
        'Population in 2007': pop_2007.to_numpy(),
    })
    .merge(
        pd.DataFrame({
            'Country': df_2003['country'].to_numpy(),
            'Population in 2008': pop_2008,
        }),
        on='Country',
        how='inner',
        validate='one_to_one',
    )
)
pop_2007_2008

Unnamed: 0,Country,Population in 2007,Population in 2008
0,Afghanistan,5908765,7342034
1,Albania,1441361,1252175
2,Algeria,24154838,22897718
3,American Samoa,53263,267973
4,Andorra,72597,395286
...,...,...,...
205,Virgin Islands (U.S.),101661,666298
206,West Bank and Gaza,2568098,2276201
207,"Yemen, Rep.",6367019,8065133
208,Zambia,4535925,4978386


In [37]:
# Spot-check a single row (position 6) of the 2007 vs 2008 comparison
pop_2007_2008.iloc[6]

Country               Antigua and Barbuda
Population in 2007                  25600
Population in 2008                 358320
Name: 6, dtype: object

### Using the Basic Linear Regression Model (the First Model) to Predict from 2003's data

In [38]:
# 2008 predictions from the untuned baseline regression (reg0), truncated to integers and
# made non-negative with an absolute value -- the same post-processing as the Lasso output.
linreg_2008_prediction = np.abs(reg0.predict(df_2003_X).astype(int))
linreg_2008_prediction

array([  5642071,   1259158,  24552559,    335758,     82052,   9325741,
            8362,  37739645,   1828945,     81970,  17469109,   5438722,
         4468114,     70290,    728106,  41124955,     43604,   7172955,
        10254383,     90973,   3428639,      6945,    194008,   6355459,
         1616689,    269130, 171207000,     79509,   5289230,   2884063,
          561078,    280660,   2682937,   9465684,  26588963,    131238,
         1447757,   2113078,     36229,  14398427, 619830069,  34687511,
           26396,  17855435,   2076415,   2616437,   8836188,   2366579,
         8802770,    506009,   7487645,   4510073,    300925,    159622,
         6428542,   9399518,  32730607,   3750403,     53393,    925663,
          825160,  12459804,    111502,     96040,   4345772,  52710375,
           84051,    570037,    297322,   2128853,  61323196,  10915984,
         6883267,    120838,     87920,    155005,   6326345,   3221737,
          231042,    170289,   4206258,   3412593, 

In [39]:
# Side-by-side comparison of the 2007 urban population with the 2008 predictions from the
# Lasso model and the basic linear regression.
# The 2007 values come from the training frame and the predictions from the 2003 frame, so
# the two are joined on the country name instead of being paired by row position.
# validate='one_to_one' raises if a country appears twice on either side; countries missing
# from one side are left out of the comparison.
pop_comparison = (
    pd.DataFrame({
        'Country': df.loc[pop_2007.index, 'country'].to_numpy(),
        'Population in 2007': pop_2007.to_numpy(),
    })
    .merge(
        pd.DataFrame({
            'Country': df_2003['country'].to_numpy(),
            'Population in 2008 (Lasso)': pop_2008,
            'Population in 2008 (Linear Regression)': linreg_2008_prediction,
        }),
        on='Country',
        how='inner',
        validate='one_to_one',
    )
)
pop_comparison

Unnamed: 0,Country,Population in 2007,Population in 2008 (Lasso),Population in 2008 (Linear Regression)
0,Afghanistan,5908765,7342034,5642071
1,Albania,1441361,1252175,1259158
2,Algeria,24154838,22897718,24552559
3,American Samoa,53263,267973,335758
4,Andorra,72597,395286,82052
...,...,...,...,...
205,Virgin Islands (U.S.),101661,666298,304225
206,West Bank and Gaza,2568098,2276201,2568081
207,"Yemen, Rep.",6367019,8065133,6054677
208,Zambia,4535925,4978386,4507988


---

### Scaling Selected Features for Optimized Linear Regression Model

In [40]:
# Narrowing down features using the feature selection results from Cell #28.
# Only raw (pre-dummy) columns can be chosen here, so 'country', 'year' and
# 'World bank region' are kept whole, alongside the numeric columns listed below and the
# target column.
opt_features_df = df.loc[:, [
    'country',
    'year',
    'World bank region',
    'World bank, 4 income groups 2017',
    'Adjusted savings: mineral depletion (current US$)',
    'Population (Total)',
    'Rural population (% of total population)',
    'Urban population (% of total)',
    'Rural population',
    'Urban population',
    'Urban population growth (annual %)',
    'Urban population 5 years in the future',
]]
opt_features_df

Unnamed: 0,country,year,World bank region,"World bank, 4 income groups 2017",Adjusted savings: mineral depletion (current US$),Population (Total),Rural population (% of total population),Urban population (% of total),Rural population,Urban population,Urban population growth (annual %),Urban population 5 years in the future
0,Afghanistan,1972,South Asia,0,0.000000e+00,11644377.0,88.0702,11.9298,10255226.0,1389151.0,6.634620,1855690
1,Afghanistan,1973,South Asia,0,0.000000e+00,11966352.0,87.6208,12.3792,10485013.0,1481339.0,6.425364,1942479
2,Afghanistan,1974,South Asia,0,0.000000e+00,12273589.0,87.1714,12.8286,10699059.0,1574530.0,6.101041,2014500
3,Afghanistan,1975,South Asia,0,0.000000e+00,12551790.0,86.7220,13.2780,10885163.0,1666627.0,5.684501,2066033
4,Afghanistan,1976,South Asia,0,0.000000e+00,12806810.0,86.2426,13.7574,11044926.0,1761884.0,5.558187,2064276
...,...,...,...,...,...,...,...,...,...,...,...,...
6466,Zimbabwe,1998,Sub-Saharan Africa,0,3.574371e+06,12229500.0,67.0524,32.9476,8200173.0,4029327.0,2.750848,4438019
6467,Zimbabwe,1999,Sub-Saharan Africa,0,1.121581e+07,12384727.0,66.6472,33.3528,8254074.0,4130653.0,2.483614,4498365
6468,Zimbabwe,2000,Sub-Saharan Africa,0,2.259094e+07,12503652.0,66.2420,33.7580,8282669.0,4220983.0,2.163253,4558017
6469,Zimbabwe,2001,Sub-Saharan Africa,0,1.491629e+07,12586763.0,65.8216,34.1784,8284809.0,4301954.0,1.900130,4620705


In [41]:
# One-hot encode the nominal columns of the narrowed-down frame
opt_feat_df_dummies = pd.get_dummies(data=opt_features_df, columns=['country', 'World bank region'])
opt_feat_df_dummies

Unnamed: 0,year,"World bank, 4 income groups 2017",Adjusted savings: mineral depletion (current US$),Population (Total),Rural population (% of total population),Urban population (% of total),Rural population,Urban population,Urban population growth (annual %),Urban population 5 years in the future,...,"country_Yemen, Rep.",country_Zambia,country_Zimbabwe,World bank region_East Asia & Pacific,World bank region_Europe & Central Asia,World bank region_Latin America & Caribbean,World bank region_Middle East & North Africa,World bank region_North America,World bank region_South Asia,World bank region_Sub-Saharan Africa
0,1972,0,0.000000e+00,11644377.0,88.0702,11.9298,10255226.0,1389151.0,6.634620,1855690,...,0,0,0,0,0,0,0,0,1,0
1,1973,0,0.000000e+00,11966352.0,87.6208,12.3792,10485013.0,1481339.0,6.425364,1942479,...,0,0,0,0,0,0,0,0,1,0
2,1974,0,0.000000e+00,12273589.0,87.1714,12.8286,10699059.0,1574530.0,6.101041,2014500,...,0,0,0,0,0,0,0,0,1,0
3,1975,0,0.000000e+00,12551790.0,86.7220,13.2780,10885163.0,1666627.0,5.684501,2066033,...,0,0,0,0,0,0,0,0,1,0
4,1976,0,0.000000e+00,12806810.0,86.2426,13.7574,11044926.0,1761884.0,5.558187,2064276,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6466,1998,0,3.574371e+06,12229500.0,67.0524,32.9476,8200173.0,4029327.0,2.750848,4438019,...,0,0,1,0,0,0,0,0,0,1
6467,1999,0,1.121581e+07,12384727.0,66.6472,33.3528,8254074.0,4130653.0,2.483614,4498365,...,0,0,1,0,0,0,0,0,0,1
6468,2000,0,2.259094e+07,12503652.0,66.2420,33.7580,8282669.0,4220983.0,2.163253,4558017,...,0,0,1,0,0,0,0,0,0,1
6469,2001,0,1.491629e+07,12586763.0,65.8216,34.1784,8284809.0,4301954.0,1.900130,4620705,...,0,0,1,0,0,0,0,0,0,1


In [42]:
# Separate the target (y_lr) from the reduced predictor set (X_lr)
y_lr = opt_feat_df_dummies['Urban population 5 years in the future']
X_lr = opt_feat_df_dummies.drop(columns='Urban population 5 years in the future')

In [43]:
# Train-test split for the reduced feature set, using the same test fraction and seed as
# the earlier split
X_lr_train, X_lr_test, y_lr_train, y_lr_test = train_test_split(
    X_lr,
    y_lr,
    test_size=0.2,
    random_state=rand_state,
)

In [44]:
# Standardize the reduced feature set and fit a linear regression, both inside one pipeline
lr_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('linear', LinearRegression()),
])

# fit() returns the pipeline itself, so `model` is another name for the fitted lr_pipeline
lr_pipeline.fit(X_lr_train, y_lr_train)
model = lr_pipeline

# Test-set predictions (absolute value taken) followed by the R-squared score on the test set
y_pred_lin = np.abs(lr_pipeline.predict(X_lr_test))
lr_pipeline.score(X_lr_test, y_lr_test)

0.9993456241264517

In [45]:
# The reduced model was fitted on X_lr, which leaves out some of the original features, so
# the 2003 feature matrix has to be cut down to match.
# df_2003_X_cols records which of the 2003 columns the reduced model uses.
df_2003_X_cols = df_2003_X.columns.intersection(X_lr.columns)
# Reindex against X_lr.columns instead of selecting the intersection: selecting would
# silently omit any training column that is absent from the 2003 data and would keep the
# 2003 column order. Reindexing guarantees exactly the columns, in exactly the order, that
# lr_pipeline was fitted on; any column absent from the 2003 data is filled with 0.
df_2003_X_lr = df_2003_X.reindex(columns=X_lr.columns, fill_value=0)
df_2003_X_lr

Unnamed: 0,year,"World bank, 4 income groups 2017",Adjusted savings: mineral depletion (current US$),Population (Total),Rural population (% of total population),Urban population (% of total),Rural population,Urban population,Urban population growth (annual %),country_Afghanistan,...,"country_Yemen, Rep.",country_Zambia,country_Zimbabwe,World bank region_East Asia & Pacific,World bank region_Europe & Central Asia,World bank region_Latin America & Caribbean,World bank region_Middle East & North Africa,World bank region_North America,World bank region_South Asia,World bank region_Sub-Saharan Africa
0,2003,0,0.000000e+00,23116142.0,78.6376,21.3624,18177979.0,4938163.0,5.254873,1,...,0,0,0,0,0,0,0,0,1,0
1,2003,2,0.000000e+00,3033659.0,55.2212,44.7788,1675223.0,1358436.0,1.703278,0,...,0,0,0,0,1,0,0,0,0,0
2,2003,2,1.792483e+07,33003442.0,35.6718,64.3282,11772922.0,21230520.0,3.164894,0,...,0,0,0,0,0,0,1,0,0,0
3,2003,2,0.000000e+00,59117.0,9.7238,90.2762,5748.0,53369.0,1.217826,0,...,0,0,0,1,0,0,0,0,0,0
4,2003,3,0.000000e+00,75643.0,8.8650,91.1350,6706.0,68937.0,4.978505,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,2003,3,0.000000e+00,108085.0,6.4258,93.5742,6945.0,101140.0,0.225685,0,...,0,0,0,0,0,1,0,0,0,0
206,2003,1,0.000000e+00,3154969.0,27.3758,72.6242,863698.0,2291271.0,2.856446,0,...,0,0,0,0,0,0,1,0,0,0
207,2003,1,0.000000e+00,19081306.0,72.1316,27.8684,13763651.0,5317655.0,4.752199,0,...,1,0,0,0,0,0,1,0,0,0
208,2003,1,3.429587e+07,10894519.0,64.1120,35.8880,6984694.0,3909825.0,3.514839,0,...,0,1,0,0,0,0,0,0,0,1


In [46]:
# 2008 predictions from the scaled, reduced-feature linear regression pipeline, truncated
# to integers and made non-negative with an absolute value
scaled_linreg_2008_prediction = np.abs(lr_pipeline.predict(df_2003_X_lr).astype(int))
scaled_linreg_2008_prediction

array([  5397049,   1151545,  25023033,    359878,    197177,   9151033,
           30150,  38119993,   1856057,     19001,  17039929,   5337657,
         4561465,      5574,    823865,  41114169,     10809,   7197241,
        10285625,    182841,   3195449,     29241,    337350,   6375993,
         1446457,    254521, 170854969,     32198,   5257785,   2728505,
          541241,    490950,   2744889,   9626169,  27161145,     50630,
         1172025,   1950265,      7622,  13892153, 619569721,  34667065,
          130502,  17801785,   2048569,   2529849,   8782393,   2265657,
         8665657,    598585,   7567929,   4672057,    345145,    427462,
         6425145,   9394745,  32920121,   3709497,     15814,    934457,
          897593,  12321337,    153030,     33337,   4231737,  52787769,
          131641,    631353,    131641,   2079289,  61262393,  10977849,
         6836793,    130502,    165318,    163270,   6240825,   3162681,
           33337,    174649,   4250169,   3385913, 

In [47]:
# Creating a dataframe to compare the 2007 urban population with the 2008 predictions of
# all three models (Lasso, basic linear regression, scaled reduced-feature regression).
# The 2007 values come from the training frame and the predictions from the 2003 frame, so
# the two are joined on the country name instead of being paired by row position.
# validate='one_to_one' raises if a country appears twice on either side; countries missing
# from one side are left out of the comparison.
pop_comparison_all = (
    pd.DataFrame({
        'Country': df.loc[pop_2007.index, 'country'].to_numpy(),
        'Population in 2007': pop_2007.to_numpy(),
    })
    .merge(
        pd.DataFrame({
            'Country': df_2003['country'].to_numpy(),
            'Population in 2008 (Lasso)': pop_2008,
            'Population in 2008 (Basic Linear Regression)': linreg_2008_prediction,
            'Population in 2008 (LR--Feature Selection & Scaling)': scaled_linreg_2008_prediction,
        }),
        on='Country',
        how='inner',
        validate='one_to_one',
    )
)
pop_comparison_all

Unnamed: 0,Country,Population in 2007,Population in 2008 (Lasso),Population in 2008 (Basic Linear Regression),Population in 2008 (LR--Feature Selection & Scaling)
0,Afghanistan,5908765,7342034,5642071,5397049
1,Albania,1441361,1252175,1259158,1151545
2,Algeria,24154838,22897718,24552559,25023033
3,American Samoa,53263,267973,335758,359878
4,Andorra,72597,395286,82052,197177
...,...,...,...,...,...
205,Virgin Islands (U.S.),101661,666298,304225,306630
206,West Bank and Gaza,2568098,2276201,2568081,2560569
207,"Yemen, Rep.",6367019,8065133,6054677,6246969
208,Zambia,4535925,4978386,4507988,4448825


### Notes:

- Because we are using linear models, nothing constrains the predictions to be non-negative, so some predicted values are negative. We work around this by taking the absolute value of each prediction.

- Since we trained and fit the models across all countries, predictions for some countries are more accurate than for others. One way to fix this would be to customize the model for each country. 

- Typically urban population increases over time. However, there are exceptions. For example, if something catastrophic happens, if there is a large number of people emigrating from the country, or if there is a societal shift in which citizens leave the city for more rural areas.

- If we want to work on optimizing the models, we could look into the percentage increase in urban populations from previous years to determine just how feasible our predictions are.

- Comparing the Lasso and basic Linear Regression predictions in the 10 rows displayed above, the Linear Regression prediction is typically closer to the 2007 urban population than the Lasso prediction.

- The first Linear Regression model was trained across all features while the second was trained on a slightly smaller set of features. The features were also scaled in the second model. It is also possible to continue looking for a better Linear Regression model by creating one with scaled features and the full dataset and creating another without scaling and with fewer features. These models would be created similarly to the second linear regression model above.

- It is also possible to use time series analysis to predict population. However, that is currently out of the scope of this project.