# Importing libraries/datasets and descriptive statistics


The following code imports the required libraries and the dataset:

In [6]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Path to the combined dataset, relative to the notebook's working directory.
data_path = '../completed_datasets/all_indexes.csv'

# index_col=False stops pandas from using the first CSV column as the row index.
df = pd.read_csv(filepath_or_buffer=data_path, index_col=False)

ModuleNotFoundError: No module named 'statsmodels'

A short overview of some descriptive statistics of the dataset:

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of df.
df.describe()

Unnamed: 0,Year,Number of Undergraduate Students,GDP per Capita,Tertiary Education Enrollment,Tertiary Expenditure,GINI,HDI,Distance,Population,Students nomalised
count,569.0,569.0,551.0,427.0,174.0,261.0,539.0,569.0,563.0,563.0
mean,2016.525483,80.008787,31832.417721,56.70008,20.961876,34.560536,0.803343,5141.360281,66299280.0,0.001951
std,1.699151,206.783188,25110.858485,26.329726,7.564203,6.629539,0.113334,3906.205924,199718900.0,0.008344
min,2014.0,5.0,1281.506041,4.01016,0.0058,24.0,0.446,313.0,33706.0,1e-06
25%,2015.0,10.0,12545.317084,37.02314,16.079942,29.7,0.7485,1733.0,4041678.0,4e-05
50%,2017.0,20.0,26621.272229,60.33445,20.827395,33.6,0.815,4422.0,10546060.0,0.000191
75%,2018.0,65.0,47126.056149,74.265934,25.976608,38.1,0.895,8686.0,49941100.0,0.000499
max,2019.0,2685.0,155201.399605,148.530884,43.51374,57.1,0.957,18817.0,1397715000.0,0.0599


# Linear Regression on the individual factors vs. normalised students

The following code performs regression between the normalised students and all of the separate factors. The output is an R-squared value and p-value.

In [4]:
String = ['GDP per Capita', 'Tertiary Education Enrollment','Tertiary Expenditure', 'GINI', 'HDI', 'Distance']
Sample = "Students normalised"

# Read the dataset once; every factor below works on its own filtered copy,
# so the file is not re-read per iteration and no frame is mutated in place.
full_df = pd.read_csv(data_path, index_col=False)

for factor in String:
  # Keep only the rows where both the factor and the response are present.
  pair_df = full_df.dropna(subset=[factor, Sample])

  y = pair_df[Sample]
  # statsmodels' OLS does not fit an intercept on its own, so a constant
  # column has to be added to the design matrix explicitly.
  X = sm.add_constant(pair_df[factor])

  model = sm.OLS(y, X).fit()

  # Print out the statistics (bold header naming both sides of the regression)
  print('\033[1m' + factor + ' vs. ' + Sample + '\033[0m')
  print('R squared:')
  print(model.rsquared)

  # model.pvalues holds one p-value per column of X (constant and factor)
  print('p value:')
  print(model.pvalues)
  print()

NameError: name 'data_path' is not defined

# Multiple Regression vs. normalised students 

Multiple regression was performed on all the variables using the code below. As discussed in the methodology, variables with a VIF above 2 were iteratively dropped until all were below the threshold.

In [None]:
String = ["GINI", "GDP per Capita", "Tertiary Education Enrollment", "Tertiary Expenditure", "Distance", "HDI"]
Sample = "Students normalised"

# Reload the data and keep only the rows that are complete for every
# regressor and for the response variable.
df = (
    pd.read_csv(data_path, index_col=False)
    .dropna(subset=String)
    .dropna(subset=[Sample])
)

y = df[Sample]
# add_constant supplies the intercept column that OLS does not add by itself.
X = sm.add_constant(df[String])

# OLS takes the response first and the design matrix second.
model = sm.OLS(y, X).fit()
predictions = model.predict(X)  # in-sample fitted values

# Full regression report
print(model.summary())

# One VIF per regressor; design-matrix position 0 is skipped because it is
# the constant column rather than a regressor.
regressors = X.columns[1:]
vif_values = [variance_inflation_factor(X.values, pos) for pos in range(1, len(X.columns))]
pd.DataFrame({'variables': regressors, 'VIF': vif_values})

Iteratively dropping the variables left the following:

In [None]:
String = ["GINI", "GDP per Capita", "Tertiary Education Enrollment", "Tertiary Expenditure"]
Sample = "Students normalised"

# Reload the data and keep only the rows that are complete for the
# remaining regressors and for the response variable.
df = (
    pd.read_csv(data_path, index_col=False)
    .dropna(subset=String)
    .dropna(subset=[Sample])
)

y = df[Sample]
# add_constant supplies the intercept column that OLS does not add by itself.
X = sm.add_constant(df[String])

# OLS takes the response first and the design matrix second.
model = sm.OLS(y, X).fit()
predictions = model.predict(X)  # in-sample fitted values

# Full regression report
print(model.summary())

# One VIF per regressor; design-matrix position 0 is skipped because it is
# the constant column rather than a regressor.
regressors = X.columns[1:]
vif_values = [variance_inflation_factor(X.values, pos) for pos in range(1, len(X.columns))]
pd.DataFrame({'variables': regressors, 'VIF': vif_values})


                             OLS Regression Results                            
Dep. Variable:     Students normalised   R-squared:                       0.106
Model:                             OLS   Adj. R-squared:                  0.071
Method:                  Least Squares   F-statistic:                     3.002
Date:                 Thu, 30 Dec 2021   Prob (F-statistic):             0.0219
Time:                         12:00:44   Log-Likelihood:                 614.57
No. Observations:                  106   AIC:                            -1219.
Df Residuals:                      101   BIC:                            -1206.
Df Model:                            4                                         
Covariance Type:             nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
cons

Unnamed: 0,variables,VIF
0,GINI,1.305211
1,GDP per Capita,1.304224
2,Tertiary Education Enrollment,1.325961
3,Tertiary Expenditure,1.195562
