In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

In [2]:
# Read the country-level dataset from the project-relative data folder
# and preview five randomly chosen rows.
data = pd.read_csv(filepath_or_buffer='data/GDP_Country.csv')
data.sample(n=5)

Unnamed: 0,Country,Population,Area_sqm,Pop_Density_per sqm,Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),GDP ($ per capita),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
60,Egypt,78887007,1001450,78.8,0.24,-0.22,32.59,4000.0,57.7,131.8,2.87,0.48,96.65,1.0,22.94,5.23,0.149,0.357,0.493
129,Malta,400214,316,1266.5,62.28,2.07,3.89,17700.0,92.8,505.0,28.13,3.13,68.74,,10.22,8.1,0.03,0.23,0.74
4,Andorra,71201,468,152.1,0.0,6.6,4.05,19000.0,100.0,497.2,2.22,0.0,97.78,3.0,8.71,6.25,,,
42,China,1313973713,9596960,136.9,0.15,-0.4,24.18,5000.0,90.9,266.7,15.4,1.25,83.35,1.5,13.25,6.97,0.125,0.473,0.403
87,Guinea-Bissau,1442029,36120,39.9,0.97,-1.57,107.17,800.0,42.4,7.4,10.67,8.82,80.51,2.0,37.22,16.53,0.62,0.12,0.26


### Preprocessing

In [3]:
# Number of missing values in each column.
data.isna().sum()

Country                                0
Population                             0
Area_sqm                               0
Pop_Density_per sqm                    0
Coastline (coast/area ratio)           0
Net migration                          3
Infant mortality (per 1000 births)     3
GDP ($ per capita)                     1
Literacy (%)                          18
Phones (per 1000)                      4
Arable (%)                             2
Crops (%)                              2
Other (%)                              2
Climate                               22
Birthrate                              3
Deathrate                              4
Agriculture                           15
Industry                              16
Service                               15
dtype: int64

In [4]:
# Fraction of rows that are missing in each column
# (the mean of a boolean mask equals count / number of rows).
data.isnull().mean()

Country                               0.000000
Population                            0.000000
Area_sqm                              0.000000
Pop_Density_per sqm                   0.000000
Coastline (coast/area ratio)          0.000000
Net migration                         0.013216
Infant mortality (per 1000 births)    0.013216
GDP ($ per capita)                    0.004405
Literacy (%)                          0.079295
Phones (per 1000)                     0.017621
Arable (%)                            0.008811
Crops (%)                             0.008811
Other (%)                             0.008811
Climate                               0.096916
Birthrate                             0.013216
Deathrate                             0.017621
Agriculture                           0.066079
Industry                              0.070485
Service                               0.066079
dtype: float64

### Imputation

In [5]:
# KNNImputer cannot handle categorical data.
# Country is the only categorical column, so it is moved into the index
# rather than being encoded.
data1 = data.set_index(keys='Country')
data1.sample(n=2)

Unnamed: 0_level_0,Population,Area_sqm,Pop_Density_per sqm,Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),GDP ($ per capita),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Nicaragua,5570129,129494,43.0,0.7,-1.22,29.11,2300.0,67.5,39.7,15.94,1.94,82.12,2.0,24.51,4.45,0.165,0.275,0.56
Samoa,176908,2944,60.1,13.69,-11.7,27.71,5600.0,99.7,75.2,21.2,24.38,54.42,2.0,16.43,6.62,0.114,0.584,0.302


In [6]:
from sklearn.impute import KNNImputer

In [7]:
# Alternatively we can use KNNImputer from missingpy
# from missingpy import KNNImputer

In [8]:
# Fill the missing values with a default-configured KNNImputer.
# fit_transform returns a bare array, so the country index and the column
# labels of data1 are re-attached when rebuilding the DataFrame.
impu = KNNImputer()
data2 = pd.DataFrame(
    impu.fit_transform(data1),
    index=data1.index,
    columns=data1.columns,
)

In [9]:
# Spot-check two random rows of the imputed frame.
data2.sample(n=2)

Unnamed: 0_level_0,Population,Area_sqm,Pop_Density_per sqm,Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),GDP ($ per capita),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Guernsey,65409.0,78.0,838.6,64.1,3.84,4.71,20000.0,95.6,842.4,9.32,4.91,85.77,3.0,8.81,10.01,0.03,0.1,0.87
Burkina Faso,13902972.0,274200.0,50.7,0.0,0.0,97.57,1100.0,26.6,7.0,14.43,0.19,85.38,2.0,45.62,15.6,0.322,0.196,0.482


In [10]:
# Verify that imputation left no missing values in any column.
data2.isna().sum()

Population                            0
Area_sqm                              0
Pop_Density_per sqm                   0
Coastline (coast/area ratio)          0
Net migration                         0
Infant mortality (per 1000 births)    0
GDP ($ per capita)                    0
Literacy (%)                          0
Phones (per 1000)                     0
Arable (%)                            0
Crops (%)                             0
Other (%)                             0
Climate                               0
Birthrate                             0
Deathrate                             0
Agriculture                           0
Industry                              0
Service                               0
dtype: int64

### Base regression Model
Later we can do the outlier treatment to check if the model has improved

In [11]:
# Target: GDP per capita; predictors: every remaining column.
out = data2['GDP ($ per capita)']
# `axis` must be passed by keyword: the positional form drop(label, 1)
# was deprecated in pandas 1.3 and raises TypeError from pandas 2.0.
inp = data2.drop('GDP ($ per capita)', axis=1)

In [12]:
import statsmodels.api as sm

In [13]:
# Baseline OLS fit on the imputed (not yet outlier-treated) predictors.
# statsmodels does not add an intercept on its own, hence add_constant.
inpc = sm.add_constant(inp)
ols = sm.OLS(endog=out, exog=inpc)
ols_mod = ols.fit()
ols_mod.summary()

0,1,2,3
Dep. Variable:,GDP ($ per capita),R-squared:,0.754
Model:,OLS,Adj. R-squared:,0.734
Method:,Least Squares,F-statistic:,37.61
Date:,"Thu, 02 Sep 2021",Prob (F-statistic):,4.3199999999999995e-54
Time:,22:47:08,Log-Likelihood:,-2254.0
No. Observations:,227,AIC:,4544.0
Df Residuals:,209,BIC:,4606.0
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,6.946e+06,4.41e+06,1.576,0.117,-1.74e+06,1.56e+07
Population,-2.173e-06,3.51e-06,-0.619,0.537,-9.09e-06,4.75e-06
Area_sqm,7.452e-05,0.000,0.323,0.747,-0.000,0.001
Pop_Density_per sqm,-0.2639,0.230,-1.149,0.252,-0.717,0.189
Coastline (coast/area ratio),-1.1338,5.545,-0.204,0.838,-12.065,9.797
Net migration,416.8470,78.894,5.284,0.000,261.317,572.377
Infant mortality (per 1000 births),-37.9143,28.409,-1.335,0.183,-93.918,18.090
Literacy (%),-3.6942,30.549,-0.121,0.904,-63.918,56.530
Phones (per 1000),30.3023,2.688,11.274,0.000,25.003,35.601

0,1,2,3
Omnibus:,78.912,Durbin-Watson:,1.828
Prob(Omnibus):,0.0,Jarque-Bera (JB):,611.087
Skew:,1.115,Prob(JB):,2.01e-133
Kurtosis:,10.722,Cond. No.,1550000000000.0


### Interpretation

Date:	Thu, 02 Sep 2021	Prob (F-statistic):	4.32e-54

Since Prob (F-statistic) = 4.32e-54 is less than 0.05, the overall model is significant: H0 (all slope coefficients are zero) is rejected in favour of Ha.

Some individual features have p-values greater than 0.05. We will start with the feature that has the highest p-value, drop it, rebuild the model, and check the p-values again.

### Outlier Treatment

In [None]:
# data2[data2['Area_sqm']>data2['Area_sqm'].quantile(.75)]['Area_sqm']

In [None]:
# data2.fillna(0)
# data2.isnull().sum()

In [None]:
# for i in data2.columns:
#     q1 = data2[i].quantile(.25)
#     q3 = data2[i].quantile(.75)
#     iqr = q3 - q1
#     ub = q3 + 1.5 * iqr
#     lb = q1 - 1.5 * iqr
#     data2[i] = data2[i].replace(to_replace=data2[data2[i]>ub][i], value=data2[i].quantile(0.99))
#     data2[i] = data2[i].replace(to_replace=data2[data2[i]<lb][i], value=data2[i].quantile(0.01))

In [14]:
# Show dtypes and non-null counts of the imputed frame.
data2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 227 entries, Afghanistan  to Zimbabwe 
Data columns (total 18 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Population                          227 non-null    float64
 1   Area_sqm                            227 non-null    float64
 2   Pop_Density_per sqm                 227 non-null    float64
 3   Coastline (coast/area ratio)        227 non-null    float64
 4   Net migration                       227 non-null    float64
 5   Infant mortality (per 1000 births)  227 non-null    float64
 6   GDP ($ per capita)                  227 non-null    float64
 7   Literacy (%)                        227 non-null    float64
 8   Phones (per 1000)                   227 non-null    float64
 9   Arable (%)                          227 non-null    float64
 10  Crops (%)                           227 non-null    float64
 11  Other (%)                        

In [15]:
# Outlier treatment, one column at a time. Every column of data2 is
# visited, so the GDP target column is treated as well.
# A value counts as an outlier when it lies beyond the Tukey fences
# (Q1 - 1.5*IQR, Q3 + 1.5*IQR). High outliers are overwritten with the
# column's 99th percentile and low outliers with its 1st percentile, so a
# value just above the upper fence can end up larger than it started.
# data2 is modified in place.
for col in data2.columns:
    values = data2[col]
    lower_q, upper_q = values.quantile(.25), values.quantile(.75)
    spread = upper_q - lower_q
    upper_fence = upper_q + 1.5 * spread
    lower_fence = lower_q - 1.5 * spread
    # np.select uses the first matching condition (high before low);
    # values inside the fences fall through to `default` unchanged.
    data2[col] = np.select(
        [values > upper_fence, values < lower_fence],
        [values.quantile(.99), values.quantile(.01)],
        default=values,
    )


In [16]:
# Inspect five random rows after the outlier treatment.
data2.sample(n=5)

Unnamed: 0_level_0,Population,Area_sqm,Pop_Density_per sqm,Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),GDP ($ per capita),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Bolivia,8989046.0,9622460.4,8.2,0.0,-1.32,53.11,2400.0,87.2,71.9,2.67,0.19,97.14,1.5,23.3,7.53,0.128,0.352,0.52
Russia,284666400.0,9622460.4,8.4,0.22,1.02,15.39,8900.0,99.6,280.6,7.33,0.11,92.56,1.8,9.95,14.65,0.054,0.371,0.575
Dominica,68910.0,754.0,91.4,19.63,-13.907,14.15,5400.0,94.0,304.8,6.67,45.021,73.33,2.0,15.27,6.73,0.177,0.328,0.495
Angola,12127070.0,9622460.4,9.7,0.13,0.0,140.299,1900.0,32.57,7.8,2.41,0.24,97.35,1.8,45.11,27.5374,0.096,0.66392,0.246
Saint Kitts & Nevis,39129.0,261.0,149.9,285.7248,-13.907,14.49,8800.0,97.0,638.9,19.44,2.78,77.78,2.0,18.02,8.33,0.035,0.258,0.707


In [17]:
# Rebuild target and predictors from the outlier-treated frame.
out = data2['GDP ($ per capita)']
inp = data2.drop(columns='GDP ($ per capita)')

In [19]:
# Refit the same OLS specification on the outlier-treated data so the
# summary can be compared with the baseline model.
inpc = sm.add_constant(inp)
ols = sm.OLS(endog=out, exog=inpc)
ols_mod = ols.fit()
ols_mod.summary()

0,1,2,3
Dep. Variable:,GDP ($ per capita),R-squared:,0.801
Model:,OLS,Adj. R-squared:,0.785
Method:,Least Squares,F-statistic:,49.48
Date:,"Thu, 02 Sep 2021",Prob (F-statistic):,1.4e-63
Time:,22:48:48,Log-Likelihood:,-2222.8
No. Observations:,227,AIC:,4482.0
Df Residuals:,209,BIC:,4543.0
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,8642.6765,1.21e+04,0.715,0.475,-1.52e+04,3.25e+04
Population,1.698e-06,3.89e-06,0.437,0.663,-5.97e-06,9.36e-06
Area_sqm,-1.905e-05,0.000,-0.166,0.868,-0.000,0.000
Pop_Density_per sqm,0.0281,0.188,0.150,0.881,-0.343,0.399
Coastline (coast/area ratio),-15.1276,3.853,-3.926,0.000,-22.723,-7.532
Net migration,295.1782,45.856,6.437,0.000,204.779,385.577
Infant mortality (per 1000 births),-37.6590,26.997,-1.395,0.165,-90.881,15.563
Literacy (%),-0.4939,25.102,-0.020,0.984,-49.979,48.991
Phones (per 1000),32.2409,2.414,13.358,0.000,27.483,36.999

0,1,2,3
Omnibus:,9.555,Durbin-Watson:,1.855
Prob(Omnibus):,0.008,Jarque-Bera (JB):,15.716
Skew:,0.209,Prob(JB):,0.000387
Kurtosis:,4.22,Cond. No.,6210000000.0


### Scaling
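Bring the features onto a common scale: StandardScaler centres each feature at zero mean and rescales it to unit variance (it does not map values into a fixed range).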

In [20]:
from sklearn.preprocessing import StandardScaler

In [21]:
# Standardize the predictors. fit_transform returns a bare array, so the
# column labels and country index of `inp` are restored afterwards.
sc = StandardScaler()
inp_sc = pd.DataFrame(
    sc.fit_transform(inp),
    columns=inp.columns,
    index=inp.index,
)
inp_sc.head(n=5)

Unnamed: 0_level_0,Population,Area_sqm,Pop_Density_per sqm,Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Afghanistan,-0.115094,-0.220266,-0.347864,-0.47018,2.143829,3.079871,-2.537552,-1.041107,-0.132135,-0.502354,0.375437,-1.69457,2.210162,3.055247,1.612835,-0.263552,-1.201003
Albania,-0.414949,-0.41877,-0.306272,-0.458127,-1.993115,-0.39397,0.181254,-0.736564,0.539398,-0.218337,-0.449624,1.260999,-0.631504,-0.738826,0.583345,-0.645495,0.014199
Algeria,-0.094652,2.659029,-0.366433,-0.469797,-0.107041,-0.116716,-0.650571,-0.705662,-0.799921,-0.500325,0.932166,-1.69457,-0.448316,-0.842529,-0.327892,2.850162,-1.70174
American Samoa,-0.453407,-0.427929,-0.216248,2.263057,-1.993115,-0.752237,0.710596,0.10675,-0.291774,2.527233,-0.417649,-0.216785,0.031762,-1.070336,-0.199902,-0.476559,0.568673
Andorra,-0.45326,-0.427843,-0.29134,-0.47018,2.143829,-0.904902,0.861837,1.171305,-0.874869,-0.517231,1.010534,1.260999,-1.209041,-0.563721,-0.670128,-0.394294,0.913082


In [22]:
# Skewness of Population before and after scaling: the two values match,
# i.e. standardization shifts and rescales a feature but does not change
# the shape of its distribution.
inp['Population'].skew(), inp_sc['Population'].skew()

(2.2625680522702516, 2.262568052270252)

In [23]:
# OLS on the standardized predictors; the target is left unscaled.
inpc = sm.add_constant(inp_sc)
ols = sm.OLS(endog=out, exog=inpc)
ols_mod = ols.fit()
ols_mod.summary()

0,1,2,3
Dep. Variable:,GDP ($ per capita),R-squared:,0.801
Model:,OLS,Adj. R-squared:,0.785
Method:,Least Squares,F-statistic:,49.48
Date:,"Fri, 03 Sep 2021",Prob (F-statistic):,1.4e-63
Time:,00:02:08,Log-Likelihood:,-2222.8
No. Observations:,227,AIC:,4482.0
Df Residuals:,209,BIC:,4543.0
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,9620.3348,299.480,32.123,0.000,9029.946,1.02e+04
Population,155.5794,356.182,0.437,0.663,-546.590,857.749
Area_sqm,-59.3790,357.475,-0.166,0.868,-764.099,645.341
Pop_Density_per sqm,51.8321,346.271,0.150,0.881,-630.799,734.463
Coastline (coast/area ratio),-1581.3918,402.751,-3.926,0.000,-2375.366,-787.417
Net migration,2115.4650,328.636,6.437,0.000,1467.598,2763.331
Infant mortality (per 1000 births),-1287.6527,923.107,-1.395,0.165,-3107.447,532.142
Literacy (%),-9.7969,497.915,-0.020,0.984,-991.376,971.782
Phones (per 1000),7198.9293,538.933,13.358,0.000,6136.488,8261.371

0,1,2,3
Omnibus:,9.555,Durbin-Watson:,1.855
Prob(Omnibus):,0.008,Jarque-Bera (JB):,15.716
Skew:,0.209,Prob(JB):,0.000387
Kurtosis:,4.22,Cond. No.,17.5


In [24]:
# What stays the same across the two summaries is the model fit
# (R-squared, F-statistic, t-statistics and p-values); the coefficients
# and the Cond. No. do differ between the unscaled and scaled models.
print("There is not change before scaling and after scaling")

There is not change before scaling and after scaling


##### Interpretation

Before Standardization:

Kurtosis:	4.220	Cond. No.	6.21e+09

After Standardization:

Kurtosis:	4.220	Cond. No.	17.5

If the Condition Number is small (less than 100), the multicollinearity effect is very small. VIF gives an exact, per-feature measure, whereas the Cond. No. is only an overall indication.

Standardization reduced the Condition Number (6.21e+09 → 17.5), i.e., it reduces the multicollinearity problem that arises from features being on very different scales.

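The coefficients have changed, so their interpretation changes: each coefficient now describes the change in GDP for a one-standard-deviation change in the feature. The t-statistics and p-values are the same as before scaling.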



In [26]:
# Instead of StandardScaler just do the mean subtraction
inp1 = inp - inp.mean()
inpc = sm.add_constant(inp1)
ols = sm.OLS(out, inpc)
ols_mod = ols.fit()
ols_mod.summary()

0,1,2,3
Dep. Variable:,GDP ($ per capita),R-squared:,0.801
Model:,OLS,Adj. R-squared:,0.785
Method:,Least Squares,F-statistic:,49.48
Date:,"Fri, 03 Sep 2021",Prob (F-statistic):,1.4e-63
Time:,00:15:07,Log-Likelihood:,-2222.8
No. Observations:,227,AIC:,4482.0
Df Residuals:,209,BIC:,4543.0
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,9620.3348,299.480,32.123,0.000,9029.946,1.02e+04
Population,1.698e-06,3.89e-06,0.437,0.663,-5.97e-06,9.36e-06
Area_sqm,-1.905e-05,0.000,-0.166,0.868,-0.000,0.000
Pop_Density_per sqm,0.0281,0.188,0.150,0.881,-0.343,0.399
Coastline (coast/area ratio),-15.1276,3.853,-3.926,0.000,-22.723,-7.532
Net migration,295.1782,45.856,6.437,0.000,204.779,385.577
Infant mortality (per 1000 births),-37.6590,26.997,-1.395,0.165,-90.881,15.563
Literacy (%),-0.4939,25.102,-0.020,0.984,-49.979,48.991
Phones (per 1000),32.2409,2.414,13.358,0.000,27.483,36.999

0,1,2,3
Omnibus:,9.555,Durbin-Watson:,1.855
Prob(Omnibus):,0.008,Jarque-Bera (JB):,15.716
Skew:,0.209,Prob(JB):,0.000387
Kurtosis:,4.22,Cond. No.,4750000000.0


##### Interpretation
The feature coefficients remain the same as for the pre-scaled data; only the constant has changed.

The Cond. No. has gone down only slightly compared with the pre-scaled data (6.21e+09 → 4.75e+09).


### Linear Regression: Assumptions


Assumptions of Linear Regression:

	1. Multicollinearity:
		a. This is a check for the data
        
	2. Normality:
		a. This is a check for the model
        
	3. Linearity:
		a. This is a check for the model
        
	4. Autocorrelation:
		a. This is a check for the model
        
	5. Homoscedasticity:
		a. This is a check for the model
       



### Multicollinearity