In [15]:
# Import pandas and statsmodels
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Read the country-level dataset from the CSV file in the working directory
df = pd.read_csv('Prosperous_Countries_Data.csv')

# Response (dependent variable) for the regression
y = df['GDP per capita USD']

# Predictors (independent variables): start from every candidate column
X = df[[
    'Gross Savings Rate',
    'Corporate Tax and Contribution Burden',
    'Child Mortality Rate',
    'Young Labor Participation Rate',
    'Health Expenditure Rate',
    'Total Employment Rate',
    'Human Capital Index',
    'Income Tax Ratio',
]]

# Display each column's dtype to confirm the CSV was parsed as expected
df.dtypes



Row Labels                                object
Gross Savings Rate                       float64
Corporate Tax and Contribution Burden    float64
Child Mortality Rate                     float64
Young Labor Participation Rate           float64
Health Expenditure Rate                  float64
Total Employment Rate                    float64
Human Capital Index                      float64
Income Tax Ratio                         float64
GDP per capita USD                         int64
dtype: object

In [16]:
# OLS regression with statsmodels
# The summary table reports the overall fit (R-squared, F-statistic) together
# with per-coefficient statistics (coef, std err, t, p-value, confidence bounds).

# sm.OLS does not add an intercept by itself, so put a constant column
# into the design matrix explicitly.
X = sm.add_constant(X)

# Estimate the model by ordinary least squares
model = sm.OLS(endog=y, exog=X).fit()

# Build the summary table and print it
print_model = model.summary()
print(print_model)


                            OLS Regression Results                            
Dep. Variable:     GDP per capita USD   R-squared:                       0.592
Model:                            OLS   Adj. R-squared:                  0.570
Method:                 Least Squares   F-statistic:                     26.71
Date:                Fri, 09 Dec 2022   Prob (F-statistic):           3.42e-25
Time:                        03:41:26   Log-Likelihood:                -1703.1
No. Observations:                 156   AIC:                             3424.
Df Residuals:                     147   BIC:                             3452.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------

In [17]:
# Elimination step: rebuild X without the variable that had the highest
# p-value in the previous fit ('Gross Savings Rate' is no longer listed).
# sm.OLS does not add an intercept by itself, so the constant column is
# attached in the same expression.
X = sm.add_constant(df[[
    'Corporate Tax and Contribution Burden',
    'Child Mortality Rate',
    'Young Labor Participation Rate',
    'Health Expenditure Rate',
    'Total Employment Rate',
    'Human Capital Index',
    'Income Tax Ratio',
]])

# Re-estimate by ordinary least squares on the reduced predictor set
model = sm.OLS(endog=y, exog=X).fit()

# Build the summary table and print it
print_model = model.summary()
print(print_model)

## Repeat this cell, dropping one variable each time, until every remaining
## independent variable is significant at the 10% level (p-value below 0.10,
## i.e. 90% confidence)

                            OLS Regression Results                            
Dep. Variable:     GDP per capita USD   R-squared:                       0.588
Model:                            OLS   Adj. R-squared:                  0.569
Method:                 Least Squares   F-statistic:                     30.20
Date:                Fri, 09 Dec 2022   Prob (F-statistic):           1.25e-25
Time:                        03:43:30   Log-Likelihood:                -1703.9
No. Observations:                 156   AIC:                             3424.
Df Residuals:                     148   BIC:                             3448.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------

In [18]:
# Elimination step: rebuild X without the variable that had the highest
# p-value in the previous fit ('Corporate Tax and Contribution Burden' is
# no longer listed).
# sm.OLS does not add an intercept by itself, so the constant column is
# attached in the same expression.
X = sm.add_constant(df[[
    'Child Mortality Rate',
    'Young Labor Participation Rate',
    'Health Expenditure Rate',
    'Total Employment Rate',
    'Human Capital Index',
    'Income Tax Ratio',
]])

# Re-estimate by ordinary least squares on the reduced predictor set
model = sm.OLS(endog=y, exog=X).fit()

# Build the summary table and print it
print_model = model.summary()
print(print_model)


                            OLS Regression Results                            
Dep. Variable:     GDP per capita USD   R-squared:                       0.583
Model:                            OLS   Adj. R-squared:                  0.566
Method:                 Least Squares   F-statistic:                     34.71
Date:                Fri, 09 Dec 2022   Prob (F-statistic):           5.08e-26
Time:                        03:44:16   Log-Likelihood:                -1704.9
No. Observations:                 156   AIC:                             3424.
Df Residuals:                     149   BIC:                             3445.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
const       

In [19]:
# Elimination step: rebuild X without the variable that had the highest
# p-value in the previous fit ('Child Mortality Rate' is no longer listed).
# sm.OLS does not add an intercept by itself, so the constant column is
# attached in the same expression.
X = sm.add_constant(df[[
    'Young Labor Participation Rate',
    'Health Expenditure Rate',
    'Total Employment Rate',
    'Human Capital Index',
    'Income Tax Ratio',
]])

# Re-estimate by ordinary least squares on the reduced predictor set
model = sm.OLS(endog=y, exog=X).fit()

# Build the summary table and print it
print_model = model.summary()
print(print_model)


                            OLS Regression Results                            
Dep. Variable:     GDP per capita USD   R-squared:                       0.578
Model:                            OLS   Adj. R-squared:                  0.564
Method:                 Least Squares   F-statistic:                     41.17
Date:                Fri, 09 Dec 2022   Prob (F-statistic):           1.62e-26
Time:                        03:45:11   Log-Likelihood:                -1705.7
No. Observations:                 156   AIC:                             3423.
Df Residuals:                     150   BIC:                             3442.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
const       

In [20]:
# Elimination step: rebuild X without the variable that had the highest
# p-value in the previous fit ('Young Labor Participation Rate' is no
# longer listed).
# sm.OLS does not add an intercept by itself, so the constant column is
# attached in the same expression.
X = sm.add_constant(df[[
    'Health Expenditure Rate',
    'Total Employment Rate',
    'Human Capital Index',
    'Income Tax Ratio',
]])

# Re-estimate by ordinary least squares on the reduced predictor set
model = sm.OLS(endog=y, exog=X).fit()

# Build the summary table and print it
print_model = model.summary()
print(print_model)


                            OLS Regression Results                            
Dep. Variable:     GDP per capita USD   R-squared:                       0.570
Model:                            OLS   Adj. R-squared:                  0.558
Method:                 Least Squares   F-statistic:                     49.98
Date:                Fri, 09 Dec 2022   Prob (F-statistic):           9.88e-27
Time:                        03:46:13   Log-Likelihood:                -1707.4
No. Observations:                 156   AIC:                             3425.
Df Residuals:                     151   BIC:                             3440.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                   -7