# Imports

In [1]:
#Pandas, Numpy importieren
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn as sl
import scipy.stats
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Importieren der Modelle aus verschiedenen Bibliotheken
from sklearn.linear_model import LinearRegression
from statsmodels.regression.linear_model import GLS
from linearmodels.panel import PooledOLS


# Splitting data into training and testing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold, KFold

from sklearn.model_selection import train_test_split

# No warnings about setting value on copy of slice
pd.options.mode.chained_assignment = None

# Display up to 60 columns of a dataframe
pd.set_option('display.max_columns', 600)

# Matplotlib visualization
%matplotlib inline
# Set default font size
plt.rcParams['font.size'] = 24

# Internal ipython tool for setting figure size
from IPython.core.pylabtools import figsize
#figsize(6, 6)

# Seaborn for visualization
sns.set(font_scale = 2)

# Import der Daten

In [2]:
pd.set_option('display.max_columns', 600)

# Fixed seed for the row shuffle below. Without it the row order changes on
# every "Restart & Run All", so any order-dependent step run on `data`
# would not be reproducible.
RANDOM_STATE = 42

# Read the data set and shuffle all rows (frac=1 keeps every row and only
# permutes the order; the original index labels are preserved).
data = (
    pd.read_csv('../data/math_imputed_2.csv')
    .sample(frac=1, random_state=RANDOM_STATE)
)

# Quick overview: columns, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 230 entries, 34 to 48
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   LOCATION           230 non-null    object 
 1   SUBJECT            230 non-null    object 
 2   TIME               230 non-null    float64
 3   PISA Math          230 non-null    float64
 4   GINI               230 non-null    float64
 5   STR_SRY            230 non-null    float64
 6   CPI                230 non-null    float64
 7   ALC_PC             230 non-null    float64
 8   INTERNET_PC        230 non-null    float64
 9   HOMICIDES          230 non-null    float64
 10  log(MIGRANTS)      230 non-null    float64
 11  log(GDP)           230 non-null    float64
 12  log(EDU_SPENDING)  230 non-null    float64
 13  log(PCT_EDU_TRY)   230 non-null    float64
dtypes: float64(12), object(2)
memory usage: 27.0+ KB


# Multiindizes hinzufügen für PooledOLS

In [3]:
# Capture both key columns as categoricals before they are moved into the
# index (set_index drops them from the columns).
location = pd.Categorical(data["LOCATION"])
time = pd.Categorical(data["TIME"])

# Use (LOCATION, TIME) as the MultiIndex for PooledOLS, then re-attach the
# keys as categorical columns so they can still be referenced by name
# (e.g. as cluster variables).
data = (
    data
    .set_index(['LOCATION', 'TIME'])
    .assign(TIME=time, LOCATION=location)
)
# print(data)

## Vergleich der verschiedenen Schätzmethoden für Cov(b) anhand der t-Werte

In [4]:
from collections import OrderedDict

import statsmodels.api as sm
from linearmodels.panel import PooledOLS, compare

# Regressors of the pooled model; a constant column is added for the intercept.
exog_vars = ['GINI', 'log(PCT_EDU_TRY)', 'ALC_PC', 'HOMICIDES']
exog = sm.add_constant(data[exog_vars])

mod = PooledOLS(data["PISA Math"], exog)

# Fit the same model once per covariance estimator. The estimator is the only
# thing that differs between these fits.
ols_errors = mod.fit()
# White standard errors
robust = mod.fit(cov_type='robust')
# Clustered by country, by year, and by both at once
clustered = mod.fit(cov_type='clustered', clusters=data.LOCATION)
clust_time = mod.fit(cov_type='clustered', clusters=data.TIME)
clust_entity_time = mod.fit(cov_type='clustered', cluster_entity=True, cluster_time=True)

# Side-by-side comparison table. The two-way clustered fit
# (clust_entity_time) is computed above but intentionally left out of it.
res = OrderedDict([
    ("Unadjusted", ols_errors),
    ('Robust', robust),
    ('Years clustered', clust_time),
    ('Countries clustered', clustered),
])
print(compare(res))

                                 Model Comparison                                 
                        Unadjusted      Robust Years clustered Countries clustered
----------------------------------------------------------------------------------
Dep. Variable            PISA Math   PISA Math       PISA Math           PISA Math
Estimator                PooledOLS   PooledOLS       PooledOLS           PooledOLS
No. Observations               230         230             230                 230
Cov. Est.               Unadjusted      Robust       Clustered           Clustered
R-squared                   0.7844      0.7844          0.7844              0.7844
R-Squared (Within)         -0.5583     -0.5583         -0.5583             -0.5583
R-Squared (Between)         0.8493      0.8493          0.8493              0.8493
R-Squared (Overall)         0.7844      0.7844          0.7844              0.7844
F-statistic                 204.65      204.65          204.65              204.65
P-va