In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

In [2]:
# Read the country-level GDP dataset from the local data folder.
data = pd.read_csv(filepath_or_buffer='data/GDP_Country.csv')
# Preview five randomly chosen rows to sanity-check the columns.
data.sample(n=5)

Unnamed: 0,Country,Population,Area_sqm,Pop_Density_per sqm,Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),GDP ($ per capita),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
137,Moldova,4466706,33843,132.0,0.0,-0.26,40.42,1800.0,99.1,208.1,55.3,10.79,33.91,,15.7,12.64,0.213,0.233,0.555
180,Senegal,11987121,196190,61.1,0.27,0.2,55.51,1600.0,40.2,22.2,12.78,0.21,87.01,2.0,32.78,9.42,0.172,0.209,0.619
84,Guatemala,12293545,108890,112.9,0.37,-1.67,35.93,4100.0,70.6,92.1,12.54,5.03,82.43,2.0,29.88,5.2,0.227,0.188,0.585
213,United Kingdom,60609153,244820,247.6,5.08,2.19,5.16,27700.0,99.0,543.5,23.46,0.21,76.33,3.0,10.71,10.13,0.005,0.237,0.758
60,Egypt,78887007,1001450,78.8,0.24,-0.22,32.59,4000.0,57.7,131.8,2.87,0.48,96.65,1.0,22.94,5.23,0.149,0.357,0.493


### Preprocessing

In [3]:
# Count the missing values in each column.
data.isna().sum()

Country                                0
Population                             0
Area_sqm                               0
Pop_Density_per sqm                    0
Coastline (coast/area ratio)           0
Net migration                          3
Infant mortality (per 1000 births)     3
GDP ($ per capita)                     1
Literacy (%)                          18
Phones (per 1000)                      4
Arable (%)                             2
Crops (%)                              2
Other (%)                              2
Climate                               22
Birthrate                              3
Deathrate                              4
Agriculture                           15
Industry                              16
Service                               15
dtype: int64

In [4]:
# Express the per-column missing counts as a fraction of all rows.
data.isna().sum().div(len(data))

Country                               0.000000
Population                            0.000000
Area_sqm                              0.000000
Pop_Density_per sqm                   0.000000
Coastline (coast/area ratio)          0.000000
Net migration                         0.013216
Infant mortality (per 1000 births)    0.013216
GDP ($ per capita)                    0.004405
Literacy (%)                          0.079295
Phones (per 1000)                     0.017621
Arable (%)                            0.008811
Crops (%)                             0.008811
Other (%)                             0.008811
Climate                               0.096916
Birthrate                             0.013216
Deathrate                             0.017621
Agriculture                           0.066079
Industry                              0.070485
Service                               0.066079
dtype: float64

### Imputation

In [5]:
# KNNImputer cannot handle categorical columns.
# 'Country' is the only categorical column, so it is moved into the index,
# leaving a purely numeric frame for imputation.
data1 = data.set_index(keys='Country')
data1.sample(n=2)

Unnamed: 0_level_0,Population,Area_sqm,Pop_Density_per sqm,Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),GDP ($ per capita),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Saudi Arabia,27019731,1960582,13.8,0.13,-2.71,13.24,11800.0,78.8,140.6,1.67,0.09,98.24,1.0,29.34,2.58,0.033,0.613,0.354
Luxembourg,474413,2586,183.5,0.0,8.97,4.81,55100.0,100.0,515.4,23.28,0.4,76.32,,11.94,8.41,0.01,0.13,0.86


In [6]:
from sklearn.impute import KNNImputer

In [7]:
# Alternatively we can use KNNImputer from missingpy
# from missingpy import KNNImputer

In [8]:
# Fill the missing values with a default-configured KNNImputer and wrap the
# resulting array back into a DataFrame that keeps data1's index and columns.
impu = KNNImputer()
data2 = pd.DataFrame(
    impu.fit_transform(data1),
    index=data1.index,
    columns=data1.columns,
)

In [9]:
# Spot-check two random rows of the imputed frame.
data2.sample(n=2)

Unnamed: 0_level_0,Population,Area_sqm,Pop_Density_per sqm,Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),GDP ($ per capita),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Guinea,9690222.0,245857.0,39.4,0.13,-3.06,90.37,2100.0,35.9,2.7,3.63,2.58,93.79,2.0,41.76,15.48,0.237,0.362,0.401
United Arab Emirates,2602713.0,82880.0,31.4,1.59,1.03,14.51,23200.0,77.9,475.3,0.6,2.25,97.15,1.0,18.96,4.4,0.04,0.585,0.375


In [10]:
# Verify that no missing values remain after imputation.
data2.isna().sum()

Population                            0
Area_sqm                              0
Pop_Density_per sqm                   0
Coastline (coast/area ratio)          0
Net migration                         0
Infant mortality (per 1000 births)    0
GDP ($ per capita)                    0
Literacy (%)                          0
Phones (per 1000)                     0
Arable (%)                            0
Crops (%)                             0
Other (%)                             0
Climate                               0
Birthrate                             0
Deathrate                             0
Agriculture                           0
Industry                              0
Service                               0
dtype: int64

### Base regression Model
Later we can do the outlier treatment to check if the model has improved

In [11]:
# Split the imputed frame into the regression target (GDP per capita) and the
# predictor columns.
out = data2['GDP ($ per capita)']
# Name the axis explicitly via `columns=`: passing the axis positionally
# (`drop(label, 1)`) is deprecated in pandas 1.3+ and fails on pandas 2.x.
inp = data2.drop(columns='GDP ($ per capita)')

In [12]:
import statsmodels.api as sm

In [13]:
# Baseline model: regress GDP per capita on every predictor plus an intercept.
inpc = sm.add_constant(inp)
ols = sm.OLS(endog=out, exog=inpc)
ols_mod = ols.fit()
ols_mod.summary()

0,1,2,3
Dep. Variable:,GDP ($ per capita),R-squared:,0.754
Model:,OLS,Adj. R-squared:,0.734
Method:,Least Squares,F-statistic:,37.61
Date:,"Fri, 03 Sep 2021",Prob (F-statistic):,4.3199999999999995e-54
Time:,07:50:03,Log-Likelihood:,-2254.0
No. Observations:,227,AIC:,4544.0
Df Residuals:,209,BIC:,4606.0
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,6.946e+06,4.41e+06,1.576,0.117,-1.74e+06,1.56e+07
Population,-2.173e-06,3.51e-06,-0.619,0.537,-9.09e-06,4.75e-06
Area_sqm,7.452e-05,0.000,0.323,0.747,-0.000,0.001
Pop_Density_per sqm,-0.2639,0.230,-1.149,0.252,-0.717,0.189
Coastline (coast/area ratio),-1.1338,5.545,-0.204,0.838,-12.065,9.797
Net migration,416.8470,78.894,5.284,0.000,261.317,572.377
Infant mortality (per 1000 births),-37.9143,28.409,-1.335,0.183,-93.918,18.090
Literacy (%),-3.6942,30.549,-0.121,0.904,-63.918,56.530
Phones (per 1000),30.3023,2.688,11.274,0.000,25.003,35.601

0,1,2,3
Omnibus:,78.912,Durbin-Watson:,1.828
Prob(Omnibus):,0.0,Jarque-Bera (JB):,611.087
Skew:,1.115,Prob(JB):,2.01e-133
Kurtosis:,10.722,Cond. No.,1550000000000.0


### Interpretation

Date:	Thu, 02 Sep 2021	Prob (F-statistic):	4.32e-54

Since Prob (F-statistic) = 4.32e-54 is less than 0.05, the overall model is significant: H0 is rejected in favour of Ha.

Several individual features have p-values greater than 0.05. We will start with the feature that has the highest p-value, drop it, rebuild the model, and check the p-values again.

### Outlier Treatment

In [14]:
# data2[data2['Area_sqm']>data2['Area_sqm'].quantile(.75)]['Area_sqm']

In [15]:
# data2.fillna(0)
# data2.isnull().sum()

In [16]:
# for i in data2.columns:
#     q1 = data2[i].quantile(.25)
#     q3 = data2[i].quantile(.75)
#     iqr = q3 - q1
#     ub = q3 + 1.5 * iqr
#     lb = q1 - 1.5 * iqr
#     data2[i] = data2[i].replace(to_replace=data2[data2[i]>ub][i], value=data2[i].quantile(0.99))
#     data2[i] = data2[i].replace(to_replace=data2[data2[i]<lb][i], value=data2[i].quantile(0.01))

In [17]:
# Show the dtype and non-null count of every column in the imputed frame.
data2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 227 entries, Afghanistan  to Zimbabwe 
Data columns (total 18 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Population                          227 non-null    float64
 1   Area_sqm                            227 non-null    float64
 2   Pop_Density_per sqm                 227 non-null    float64
 3   Coastline (coast/area ratio)        227 non-null    float64
 4   Net migration                       227 non-null    float64
 5   Infant mortality (per 1000 births)  227 non-null    float64
 6   GDP ($ per capita)                  227 non-null    float64
 7   Literacy (%)                        227 non-null    float64
 8   Phones (per 1000)                   227 non-null    float64
 9   Arable (%)                          227 non-null    float64
 10  Crops (%)                           227 non-null    float64
 11  Other (%)                        

In [18]:
# IQR-based outlier capping, applied to every column of data2 in place.
# Values above Q3 + 1.5*IQR are replaced by the column's 99th percentile and
# values below Q1 - 1.5*IQR by its 1st percentile; everything else is kept.
# NOTE(review): the loop runs over all of data2.columns, so the target column
# 'GDP ($ per capita)' is capped as well - confirm this is intended.
for column in data2.columns:
    series = data2[column]
    lower_q, upper_q = series.quantile(.25), series.quantile(.75)
    spread = upper_q - lower_q
    upper_fence = upper_q + 1.5 * spread
    lower_fence = lower_q - 1.5 * spread
    # The upper-fence test is listed first, so it takes precedence.
    data2[column] = np.select(
        [series > upper_fence, series < lower_fence],
        [series.quantile(.99), series.quantile(.01)],
        default=series,
    )


In [19]:
# Spot-check five random rows after outlier capping.
data2.sample(n=5)

Unnamed: 0_level_0,Population,Area_sqm,Pop_Density_per sqm,Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),GDP ($ per capita),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Jersey,91084.0,116.0,6449.31,285.7248,2.76,5.24,24800.0,90.0,811.3,0.0,0.0,100.0,3.0,9.3,9.28,0.05,0.02,0.93
Sao Tome & Principe,193413.0,1001.0,193.2,20.88,-2.72,43.11,1200.0,79.3,36.2,6.25,45.021,44.79,2.0,40.25,6.47,0.167,0.148,0.684
Libya,5900754.0,9622460.4,3.4,0.1,0.0,24.6,6400.0,82.6,127.1,1.03,0.19,98.78,2.0,26.49,3.48,0.076,0.499,0.425
Greece,10688058.0,131940.0,81.0,10.37,2.35,5.53,20000.0,97.5,589.7,21.1,8.78,70.12,3.0,9.68,10.24,0.054,0.213,0.733
Palau,20579.0,458.0,44.9,285.7248,2.85,14.84,9000.0,92.0,325.6,8.7,4.35,86.95,2.0,18.03,6.8,0.062,0.12,0.818


In [20]:
# Re-derive the target and predictors from the outlier-capped frame.
target_name = 'GDP ($ per capita)'
out = data2[target_name]
inp = data2.drop(columns=target_name)

In [21]:
# Refit the same OLS specification on the outlier-capped data so the summary
# can be compared with the baseline fit.
inpc = sm.add_constant(inp)
ols = sm.OLS(endog=out, exog=inpc)
ols_mod = ols.fit()
ols_mod.summary()

0,1,2,3
Dep. Variable:,GDP ($ per capita),R-squared:,0.801
Model:,OLS,Adj. R-squared:,0.785
Method:,Least Squares,F-statistic:,49.48
Date:,"Fri, 03 Sep 2021",Prob (F-statistic):,1.4e-63
Time:,07:50:03,Log-Likelihood:,-2222.8
No. Observations:,227,AIC:,4482.0
Df Residuals:,209,BIC:,4543.0
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,8642.6765,1.21e+04,0.715,0.475,-1.52e+04,3.25e+04
Population,1.698e-06,3.89e-06,0.437,0.663,-5.97e-06,9.36e-06
Area_sqm,-1.905e-05,0.000,-0.166,0.868,-0.000,0.000
Pop_Density_per sqm,0.0281,0.188,0.150,0.881,-0.343,0.399
Coastline (coast/area ratio),-15.1276,3.853,-3.926,0.000,-22.723,-7.532
Net migration,295.1782,45.856,6.437,0.000,204.779,385.577
Infant mortality (per 1000 births),-37.6590,26.997,-1.395,0.165,-90.881,15.563
Literacy (%),-0.4939,25.102,-0.020,0.984,-49.979,48.991
Phones (per 1000),32.2409,2.414,13.358,0.000,27.483,36.999

0,1,2,3
Omnibus:,9.555,Durbin-Watson:,1.855
Prob(Omnibus):,0.008,Jarque-Bera (JB):,15.716
Skew:,0.209,Prob(JB):,0.000387
Kurtosis:,4.22,Cond. No.,6210000000.0


### Scaling
Bring data into a defined range

In [22]:
from sklearn.preprocessing import StandardScaler

In [23]:
# Standardize every predictor and rebuild a DataFrame that keeps the original
# column names and country index.
sc = StandardScaler()
inp_sc = pd.DataFrame(
    sc.fit_transform(inp),
    columns=inp.columns,
    index=inp.index,
)
inp_sc.head(n=5)

Unnamed: 0_level_0,Population,Area_sqm,Pop_Density_per sqm,Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Afghanistan,-0.115094,-0.220266,-0.347864,-0.47018,2.143829,3.079871,-2.537552,-1.041107,-0.132135,-0.502354,0.375437,-1.69457,2.210162,3.055247,1.612835,-0.263552,-1.201003
Albania,-0.414949,-0.41877,-0.306272,-0.458127,-1.993115,-0.39397,0.181254,-0.736564,0.539398,-0.218337,-0.449624,1.260999,-0.631504,-0.738826,0.583345,-0.645495,0.014199
Algeria,-0.094652,2.659029,-0.366433,-0.469797,-0.107041,-0.116716,-0.650571,-0.705662,-0.799921,-0.500325,0.932166,-1.69457,-0.448316,-0.842529,-0.327892,2.850162,-1.70174
American Samoa,-0.453407,-0.427929,-0.216248,2.263057,-1.993115,-0.752237,0.710596,0.10675,-0.291774,2.527233,-0.417649,-0.216785,0.031762,-1.070336,-0.199902,-0.476559,0.568673
Andorra,-0.45326,-0.427843,-0.29134,-0.47018,2.143829,-0.904902,0.861837,1.171305,-0.874869,-0.517231,1.010534,1.260999,-1.209041,-0.563721,-0.670128,-0.394294,0.913082


In [24]:
# Compare the skewness of 'Population' before and after standardization:
# the two values match, i.e. scaling does not change the skew of the column.
inp['Population'].skew(), inp_sc["Population"].skew()

(2.2625680522702516, 2.262568052270252)

In [25]:
# Fit the OLS model again, this time on the standardized predictors.
inpc = sm.add_constant(inp_sc)
ols = sm.OLS(endog=out, exog=inpc)
ols_mod = ols.fit()
ols_mod.summary()

0,1,2,3
Dep. Variable:,GDP ($ per capita),R-squared:,0.801
Model:,OLS,Adj. R-squared:,0.785
Method:,Least Squares,F-statistic:,49.48
Date:,"Fri, 03 Sep 2021",Prob (F-statistic):,1.4e-63
Time:,07:50:03,Log-Likelihood:,-2222.8
No. Observations:,227,AIC:,4482.0
Df Residuals:,209,BIC:,4543.0
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,9620.3348,299.480,32.123,0.000,9029.946,1.02e+04
Population,155.5794,356.182,0.437,0.663,-546.590,857.749
Area_sqm,-59.3790,357.475,-0.166,0.868,-764.099,645.341
Pop_Density_per sqm,51.8321,346.271,0.150,0.881,-630.799,734.463
Coastline (coast/area ratio),-1581.3918,402.751,-3.926,0.000,-2375.366,-787.417
Net migration,2115.4650,328.636,6.437,0.000,1467.598,2763.331
Infant mortality (per 1000 births),-1287.6527,923.107,-1.395,0.165,-3107.447,532.142
Literacy (%),-9.7969,497.915,-0.020,0.984,-991.376,971.782
Phones (per 1000),7198.9293,538.933,13.358,0.000,6136.488,8261.371

0,1,2,3
Omnibus:,9.555,Durbin-Watson:,1.855
Prob(Omnibus):,0.008,Jarque-Bera (JB):,15.716
Skew:,0.209,Prob(JB):,0.000387
Kurtosis:,4.22,Cond. No.,17.5


In [26]:
# Reader note: R-squared, F-statistic, t-values and p-values in the summary
# above are the same as in the fit on the unscaled predictors.
print("There is not change before scaling and after scaling")

There is not change before scaling and after scaling


##### Interpretation

Before Standardization:

Kurtosis:	4.220	Cond. No.	6.21e+09

After Standardization:

Kurtosis:	4.220	Cond. No.	17.5

If the condition number is small (less than about 100), the multicollinearity effect is very small. VIF gives the exact, per-feature measure; the Cond. No. is only an overall indication.

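Standardization reduced the condition number. Much of the earlier value came from the features being on very different scales; whether multicollinearity between features remains is checked with VIF in the Multicollinearity section below.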

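The coefficients change after standardization, because each one now corresponds to a one-standard-deviation change in its feature. The t-statistics and p-values are the same as before scaling, so only the interpretation of the coefficient magnitudes changes.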



In [27]:
# Instead of StandardScaler just do the mean subtraction
inp1 = inp - inp.mean()
inpc = sm.add_constant(inp1)
ols = sm.OLS(out, inpc)
ols_mod = ols.fit()
ols_mod.summary()

0,1,2,3
Dep. Variable:,GDP ($ per capita),R-squared:,0.801
Model:,OLS,Adj. R-squared:,0.785
Method:,Least Squares,F-statistic:,49.48
Date:,"Fri, 03 Sep 2021",Prob (F-statistic):,1.4e-63
Time:,07:50:03,Log-Likelihood:,-2222.8
No. Observations:,227,AIC:,4482.0
Df Residuals:,209,BIC:,4543.0
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,9620.3348,299.480,32.123,0.000,9029.946,1.02e+04
Population,1.698e-06,3.89e-06,0.437,0.663,-5.97e-06,9.36e-06
Area_sqm,-1.905e-05,0.000,-0.166,0.868,-0.000,0.000
Pop_Density_per sqm,0.0281,0.188,0.150,0.881,-0.343,0.399
Coastline (coast/area ratio),-15.1276,3.853,-3.926,0.000,-22.723,-7.532
Net migration,295.1782,45.856,6.437,0.000,204.779,385.577
Infant mortality (per 1000 births),-37.6590,26.997,-1.395,0.165,-90.881,15.563
Literacy (%),-0.4939,25.102,-0.020,0.984,-49.979,48.991
Phones (per 1000),32.2409,2.414,13.358,0.000,27.483,36.999

0,1,2,3
Omnibus:,9.555,Durbin-Watson:,1.855
Prob(Omnibus):,0.008,Jarque-Bera (JB):,15.716
Skew:,0.209,Prob(JB):,0.000387
Kurtosis:,4.22,Cond. No.,4750000000.0


##### Interpretation
The coefficients of all features remain the same as for the pre-scaled data; only the constant has changed.

Cond. No. has gone down slightly compared with the pre-scaled data (6.21e+09 to 4.75e+09).


### Linear Regression: Assumptions


Assumptions of Linear Regression:

	1. Multicollinearity:
		a. This is a check for the data
        
	2. Normality:
		a. This is a check for the model
        
	3. Linearity:
		a. This is a check for the model
        
	4. Autocorrelation:
		a. This is a check for the model
        
	5. Homoscedasticity:
		a. This is a check for the model
       



### Multicollinearity

In [28]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [31]:
# variance_inflation_factor works on a NumPy array, so the mean-centred
# predictors are passed as inp1.values.
# The VIF of x1 is computed by regressing x1 on x2, x3, x4, x5; then x2 on
# x1, x3, x4, x5; and so on for every feature against all the others.
vif = pd.DataFrame({
    'VIF': [
        variance_inflation_factor(inp1.values, col_idx)
        for col_idx in range(len(inp1.columns))
    ],
    'feature': inp1.columns,
})
vif.sort_values(by='VIF', ascending=False)

Unnamed: 0,VIF,feature
16,25.532397,Service
14,20.026042,Agriculture
15,16.545912,Industry
10,14.145739,Other (%)
8,10.506242,Arable (%)
5,9.500994,Infant mortality (per 1000 births)
12,6.134302,Birthrate
9,3.830876,Crops (%)
7,3.238427,Phones (per 1000)
6,2.764233,Literacy (%)


##### Interpretation
For these features VIF > 5, we will drop one by one feature and then check for 
multicollinearity between other features

VIF	feature

16	25.532397	Service

14	20.026042	Agriculture

15	16.545912	Industry

10	14.145739	Other (%)

8	10.506242	Arable (%)

5	9.500994	Infant mortality (per 1000 births)

12	6.134302	Birthrate

Sometimes Service might be very critical for the explainability of the model, so we
can't drop Service. Let's check the correlation of all the features with respect to
the GDP data. Since Industry has the lowest correlation, let's try dropping Industry and checking again.

Agriculture                          -0.580424

Industry                             -0.036918

Service                               0.535850


In [32]:
# Correlation of every column with GDP per capita (one column of the full
# correlation matrix).
data2.corr().loc[:, 'GDP ($ per capita)']

Population                            0.017884
Area_sqm                             -0.060373
Pop_Density_per sqm                   0.238214
Coastline (coast/area ratio)          0.177843
Net migration                         0.403517
Infant mortality (per 1000 births)   -0.619008
GDP ($ per capita)                    1.000000
Literacy (%)                          0.513722
Phones (per 1000)                     0.847224
Arable (%)                            0.014793
Crops (%)                            -0.230128
Other (%)                             0.105637
Climate                               0.318543
Birthrate                            -0.661537
Deathrate                            -0.219528
Agriculture                          -0.580424
Industry                             -0.036918
Service                               0.535850
Name: GDP ($ per capita), dtype: float64

In [38]:
# Drop 'Industry' and recompute the VIF of each remaining feature.
inpv1 = inp1.drop(columns='Industry')

vif = pd.DataFrame({
    'VIF': [
        variance_inflation_factor(np.array(inpv1), col_idx)
        for col_idx in range(len(inpv1.columns))
    ],
    'feature': inpv1.columns,
})
vif.sort_values('VIF', ascending=False)

Unnamed: 0,VIF,feature
10,14.144188,Other (%)
8,10.504069,Arable (%)
5,9.500895,Infant mortality (per 1000 births)
12,6.127679,Birthrate
9,3.830596,Crops (%)
7,3.238427,Phones (per 1000)
6,2.74216,Literacy (%)
15,2.645795,Service
14,2.570655,Agriculture
13,2.5108,Deathrate


In [34]:
# Display inpv1 converted to a NumPy array, the form handed to
# variance_inflation_factor.
np.array(inpv1)

array([[-1.05459775e+07, -6.86583005e+05, -6.40668767e+02, ...,
         1.79714952e+01,  2.31862026e-01, -1.96674802e-01],
       [-3.80213195e+07, -1.30533501e+06, -5.64068767e+02, ...,
        -4.34590485e+00,  8.38620264e-02,  2.32519824e-03],
       [-8.67288349e+06,  8.28837739e+06, -6.74868767e+02, ...,
        -4.95590485e+00, -4.71379736e-02, -2.78674802e-01],
       ...,
       [-2.01467865e+07, -8.06113005e+05, -6.48068767e+02, ...,
        -1.26590485e+00, -1.31379736e-02, -1.83674802e-01],
       [-3.01009645e+07, -5.81469005e+05, -6.73368767e+02, ...,
         1.79714952e+01,  7.18620264e-02, -8.76748018e-02],
       [-2.93661695e+07, -9.43503005e+05, -6.57368767e+02, ...,
         1.79714952e+01,  3.08620264e-02,  2.32519824e-03]])

In [35]:
# Display the same matrix through the DataFrame's .values accessor, for
# comparison with np.array(inpv1).
inpv1.values

array([[-1.05459775e+07, -6.86583005e+05, -6.40668767e+02, ...,
         1.79714952e+01,  2.31862026e-01, -1.96674802e-01],
       [-3.80213195e+07, -1.30533501e+06, -5.64068767e+02, ...,
        -4.34590485e+00,  8.38620264e-02,  2.32519824e-03],
       [-8.67288349e+06,  8.28837739e+06, -6.74868767e+02, ...,
        -4.95590485e+00, -4.71379736e-02, -2.78674802e-01],
       ...,
       [-2.01467865e+07, -8.06113005e+05, -6.48068767e+02, ...,
        -1.26590485e+00, -1.31379736e-02, -1.83674802e-01],
       [-3.01009645e+07, -5.81469005e+05, -6.73368767e+02, ...,
         1.79714952e+01,  7.18620264e-02, -8.76748018e-02],
       [-2.93661695e+07, -9.43503005e+05, -6.57368767e+02, ...,
         1.79714952e+01,  3.08620264e-02,  2.32519824e-03]])

##### Interpretation

VIF	feature

10	14.144188	Other (%)

8	10.504069	Arable (%)

5	9.500895	Infant mortality (per 1000 births)

Since Arable has the lowest correlation with GDP among these, let's try dropping Arable and checking again.

In [39]:
# Additionally drop 'Arable (%)' and recompute the VIF of each remaining feature.
inpv2 = inpv1.drop(columns='Arable (%)')
vif = pd.DataFrame({
    'VIF': [
        variance_inflation_factor(np.array(inpv2), col_idx)
        for col_idx in range(len(inpv2.columns))
    ],
    'feature': inpv2.columns,
})
vif.sort_values(by='VIF', ascending=False)

Unnamed: 0,VIF,feature
5,9.482559,Infant mortality (per 1000 births)
11,6.004283,Birthrate
7,3.235938,Phones (per 1000)
6,2.705239,Literacy (%)
14,2.64576,Service
13,2.570643,Agriculture
12,2.505773,Deathrate
9,1.903902,Other (%)
8,1.885588,Crops (%)
3,1.731842,Coastline (coast/area ratio)


In [40]:
# Additionally drop 'Infant mortality (per 1000 births)' and recompute the VIF
# of each remaining feature.
inpv3 = inpv2.drop(columns='Infant mortality (per 1000 births)')
vif = pd.DataFrame({
    'VIF': [
        variance_inflation_factor(np.array(inpv3), col_idx)
        for col_idx in range(len(inpv3.columns))
    ],
    'feature': inpv3.columns,
})
vif.sort_values(by='VIF', ascending=False)

Unnamed: 0,VIF,feature
10,4.430107,Birthrate
6,3.169817,Phones (per 1000)
13,2.562855,Service
5,2.562702,Literacy (%)
12,2.359005,Agriculture
8,1.901241,Other (%)
7,1.8652,Crops (%)
3,1.723922,Coastline (coast/area ratio)
9,1.581554,Climate
11,1.455762,Deathrate


##### Interpretation
Now all the VIF values are less than 5.
The multicollinearity problem is addressed by dropping the columns
'Industry', 'Arable (%)' and 'Infant mortality (per 1000 births)' one by one.

In [41]:
# Build OLS model after multi-collinearity is addressed
inpc = sm.add_constant(inpv3)
ols = sm.OLS(endog=out, exog=inpc)
ols_mod = ols.fit()
ols_mod.summary()

0,1,2,3
Dep. Variable:,GDP ($ per capita),R-squared:,0.798
Model:,OLS,Adj. R-squared:,0.785
Method:,Least Squares,F-statistic:,59.78
Date:,"Fri, 03 Sep 2021",Prob (F-statistic):,1.54e-65
Time:,08:36:27,Log-Likelihood:,-2224.6
No. Observations:,227,AIC:,4479.0
Df Residuals:,212,BIC:,4530.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,9620.3348,299.643,32.106,0.000,9029.673,1.02e+04
Population,1.241e-06,3.87e-06,0.321,0.749,-6.38e-06,8.86e-06
Area_sqm,-2.107e-05,0.000,-0.185,0.853,-0.000,0.000
Pop_Density_per sqm,-0.0026,0.186,-0.014,0.989,-0.370,0.365
Coastline (coast/area ratio),-14.6493,3.764,-3.892,0.000,-22.068,-7.231
Net migration,291.5991,45.495,6.409,0.000,201.918,381.280
Literacy (%),10.2348,24.183,0.423,0.673,-37.434,57.904
Phones (per 1000),32.6685,2.389,13.673,0.000,27.959,37.378
Crops (%),12.0265,27.673,0.435,0.664,-42.524,66.577

0,1,2,3
Omnibus:,9.575,Durbin-Watson:,1.811
Prob(Omnibus):,0.008,Jarque-Bera (JB):,15.191
Skew:,0.227,Prob(JB):,0.000503
Kurtosis:,4.183,Cond. No.,1090000000.0


##### Interpretation
R**