# Datenanalyse und -aufbereitung

## Imports 

In [1]:
#Pandas, Numpy importieren
import pandas as pd
import numpy as np
import sklearn as sl
import matplotlib.pyplot as plt
import scipy.stats
import seaborn as sns

#SciKit importieren
from sklearn.linear_model import LinearRegression

# Raise the display limit so that DataFrames with up to 1000 rows are printed in full.
pd.set_option("display.max_rows", 1000)


## Einlesen der Daten

### PISA Math

In [2]:
# Load the raw PISA mathematics scores and keep only the columns used later.
# Rows for every SUBJECT value are kept at this point.
data_pisa_math = pd.read_csv("../data/PISA_Raw/Pisa_Math_Raw.csv").loc[
    :, ["LOCATION", "SUBJECT", "TIME", "Value"]
]
data_pisa_math.info()
data_pisa_math.head(6)

FileNotFoundError: [Errno 2] No such file or directory: '../data/PISA_Raw/Pisa_Math_Raw.csv'

### Pisa Read

In [None]:
# Load the raw PISA reading scores and keep only the columns used later.
# Rows for every SUBJECT value are kept at this point.
data_pisa_read = pd.read_csv("../data/PISA_Raw/Pisa_Read_Raw.csv").loc[
    :, ["LOCATION", "SUBJECT", "TIME", "Value"]
]
data_pisa_read.info()
data_pisa_read.head()

### PISA Science

In [None]:
# Load the raw PISA science scores and keep only the columns used later.
# Rows for every SUBJECT value are kept at this point.
data_pisa_science = pd.read_csv("../data/PISA_Raw/Pisa_Science_Raw.csv").loc[
    :, ["LOCATION", "SUBJECT", "TIME", "Value"]
]
data_pisa_science.info()
data_pisa_science.head()

### Gini-Koeffizient

In [None]:
# Load the income-inequality table and keep only the Gini rows.
data_gini = pd.read_csv("../data/income_inequality.csv")
data_gini = data_gini.loc[data_gini["SUBJECT"] == "GINI", ["LOCATION", "TIME", "Value"]]
data_gini = data_gini.rename(columns={"Value": "GINI"})

# Reshape to one row per country and one column per year (a time series per row).
gini_wide = data_gini.pivot_table(values="GINI", index="LOCATION", columns="TIME")
gini_wide.columns.name = None
data_gini_labels = gini_wide.reset_index()

# Interpolate linearly along each country's time series; limit_direction="both"
# also fills gaps at the start and at the end of a row.
data_gini_int = data_gini_labels.drop(columns="LOCATION").interpolate(
    method="linear", axis=1, limit_direction="both"
)
# Put the country code back in front of the year columns.
data_gini_int.insert(0, "LOCATION", data_gini_labels["LOCATION"])

# Back to long format: one row per (LOCATION, TIME).
data_gini_int = data_gini_int.melt(
    id_vars="LOCATION", var_name="TIME", value_name="GINI"
)
data_gini = data_gini_int
data_gini.info()
data_gini.head()

### Schüler-Lehrer-Betreuungsverhältnis

- https://databank.worldbank.org/reports.aspx?source=2&series=SE.SEC.ENRL.TC.ZS&country=#
- https://data.oecd.org/teachers/students-per-teaching-staff.htm
#### Ganzer Datensatz
<p> 
    Jahre: 2005 - 2018 <br>
    Subjects: EARLYCHILDEDU, PRY, SRY, TRY
</p>

In [None]:
# str = students-per-teacher ratio.
data_str = pd.read_csv("../data/students_per_teacher.csv")
data_str = data_str[["LOCATION", "SUBJECT", "TIME", "Value"]]

# One column per education level (SUBJECT), one row per (LOCATION, TIME).
data_str = data_str.pivot_table("Value", ["LOCATION", "TIME"], "SUBJECT")

# Turn the pivot table back into a plain DataFrame.
data_str.columns.name = None
data_str = data_str.reset_index()

# Keep only the SRY level and store it as STR_SRY. The column is selected by
# its SUBJECT code instead of renaming all pivot columns by position: a
# positional rename silently mislabels the data (or raises) as soon as the
# file contains a different set of SUBJECT values.
data_str = data_str[["LOCATION", "TIME", "SRY"]].rename(columns={"SRY": "STR_SRY"})

data_str.info()
data_str.head(5)

In [None]:
# Load the wide students-per-teacher table (one column per year); this
# overwrites any data_str built earlier in the notebook.
data_str = pd.read_csv("../data/str_worldbank.csv")

# ".." marks a missing value in this export.
data_str = data_str.replace({"..": np.nan})
# Convert the year columns to numeric dtype. The columns are assigned by label
# rather than through `.iloc[:, 4:26] = ...`: since pandas 2.0 an iloc
# assignment writes into the existing object-dtype columns, which leaves them
# non-numeric and makes the row-wise interpolation below fail.
year_cols = data_str.columns[4:26]
data_str[year_cols] = data_str[year_cols].apply(pd.to_numeric)

# Empty placeholder columns for 1990-1999 with one row per row of the source
# table (previously hard-coded to 10 rows), followed by the year columns 5..25.
data_str_int = pd.DataFrame(np.nan, index=data_str.index, columns=range(1990, 2000))
data_str_int = pd.concat([data_str_int, data_str.iloc[:, 5:26]], axis=1)
# 1990 is the only placeholder year that has data in the source table.
data_str_int[1990] = data_str["1990 [YR1990]"]

data_str = data_str.rename(columns={"Country Code": "LOCATION"})

# Interpolate linearly along each row; limit_direction="both" also fills gaps
# at the start and at the end of a row.
data_str_int = data_str_int.interpolate(method="linear", axis=1, limit_direction="both")

data_str = pd.concat([data_str[["LOCATION"]], data_str_int], axis=1)

# Long format: one row per (LOCATION, TIME).
data_str = data_str.melt(id_vars="LOCATION", var_name="TIME", value_name="STR_SRY")

# Unify the year labels: the placeholder columns are ints, the source columns
# are strings such as "1990 [YR1990]". Keep only the first four characters
# (the year) of every string label that is longer than that.
mask = data_str["TIME"].str.len() >= 5
data_str.loc[mask, "TIME"] = data_str.loc[mask, "TIME"].str[:4]
# Numeric TIME is required for merging with the other tables.
data_str["TIME"] = pd.to_numeric(data_str["TIME"])

data_str = data_str[data_str["TIME"] >= 1999]
data_str.info()
data_str.head()

### Migration
- Anscheinend sind nur alle 5 Jahre Daten verfügbar.

In [None]:
# Load the migrant-share table; only the columns 2000, 2005, 2010 and 2015
# are taken from it.
data_migration = pd.read_csv("../data/percentage of migrants.csv")
data_migration = data_migration[["Country Code", "2000", "2005", "2010", "2015"]].rename(
    columns={"Country Code": "LOCATION"}
)

# Build a year grid 2000-2018: the four observed years keep their values,
# every other year starts as NaN and is filled by linear interpolation
# along each row.
all_years = [str(year) for year in range(2000, 2019)]
data_migration_int = data_migration[["2000", "2005", "2010", "2015"]].reindex(
    columns=all_years
)
data_migration_int = data_migration_int.interpolate(method="linear", axis=1)

data_migration = pd.concat([data_migration[["LOCATION"]], data_migration_int], axis=1)

# Long format, restricted to every third year from 2000 to 2018.
selected_years = [str(year) for year in range(2000, 2019, 3)]
data_migration = data_migration.melt(
    id_vars="LOCATION",
    value_vars=selected_years,
    var_name="TIME",
    value_name="MIGRANTS",
)
# Numeric TIME is required for merging with the other tables.
data_migration["TIME"] = pd.to_numeric(data_migration["TIME"])

data_migration.info()
data_migration.tail()

### GDP per Capita
https://data.oecd.org/gdp/gross-domestic-product-gdp.htm

In [None]:
# Load GDP per capita, keep the key columns and the years from 1999 onwards,
# and store the measurement as GDP.
data_gdp = pd.read_csv('../data/gdp_pc.csv')[['LOCATION', "TIME", 'Value']]
data_gdp = data_gdp.loc[data_gdp["TIME"] >= 1999].rename(columns={'Value': 'GDP'})

# Reshape to one row per country and one column per year.
gdp_wide = data_gdp.pivot_table(values="GDP", index="LOCATION", columns="TIME")
gdp_wide.columns.name = None
data_gdp_labels = gdp_wide.reset_index()

# Interpolate linearly along each country's time series; limit_direction="both"
# also fills gaps at the start and at the end of a row.
data_gdp_int = data_gdp_labels.drop(columns="LOCATION").interpolate(
    method="linear", axis=1, limit_direction="both"
)
# Put the country code back in front of the year columns.
data_gdp_int.insert(0, "LOCATION", data_gdp_labels["LOCATION"])

# Back to long format: one row per (LOCATION, TIME).
data_gdp_int = data_gdp_int.melt(
    id_vars="LOCATION", var_name="TIME", value_name="GDP"
)

data_gdp = data_gdp_int

data_gdp.info()
data_gdp.head()

### Bildungsausgaben

In [None]:
# Load the education-spending table, keep the key columns and store the
# measurement as EDU_SPENDING.
data_edu_spending = pd.read_csv("../data/education_spending/EducationSpendingTertiär2000_2016.csv")
data_edu_spending = data_edu_spending[["LOCATION", "TIME", "Value"]]
data_edu_spending = data_edu_spending.rename(columns={'Value': 'EDU_SPENDING'})

# Reshape to one row per country and one column per year.
data_pivot = data_edu_spending.pivot_table("EDU_SPENDING", "LOCATION", "TIME")
data_pivot.columns.name = None
data_pivot = data_pivot.reset_index()

# Year grid 2000-2018 with string labels: the years listed below are taken
# from the pivot table, every other year starts as NaN.
observed_years = [2000, 2005] + list(range(2008, 2017))
data_edu_spending_int = data_pivot[observed_years].reindex(columns=range(2000, 2019))
data_edu_spending_int.columns = [str(year) for year in range(2000, 2019)]

# Estonia's value for 2000 is 0, which is unrealistic, so it is treated as
# missing. The row is located by country code instead of a hard-coded row
# position, so the fix keeps hitting Estonia if the set of countries changes.
data_edu_spending_int.loc[data_pivot["LOCATION"] == "EST", "2000"] = np.nan

data_edu_spending_labels = data_pivot

# Interpolate linearly along each row; limit_direction="both" also fills gaps
# at the start and at the end of a row.
data_edu_spending_int = data_edu_spending_int.interpolate(
    method="linear", axis=1, limit_direction="both"
)

# Re-attach the country code in front of the year columns.
data_edu_spending_int = pd.concat(
    [data_edu_spending_labels["LOCATION"], data_edu_spending_int], axis=1
)

# Long format: one row per (LOCATION, TIME).
data_edu_spending_int = data_edu_spending_int.melt(
    id_vars='LOCATION', var_name="TIME", value_name="EDU_SPENDING"
)
# Numeric TIME is required for merging with the other tables.
data_edu_spending_int["TIME"] = pd.to_numeric(data_edu_spending_int["TIME"])

data_edu_spending = data_edu_spending_int

data_edu_spending.info()
data_edu_spending.head()

Weltbankdaten
https://databank.worldbank.org/reports.aspx?source=2&series=SE.XPD.SECO.PC.ZS#

In [None]:
# Load the wide government-spending-per-student table (one column per year);
# this overwrites any data_edu_spending built earlier in the notebook.
data_edu_spending = pd.read_csv("../data/gov_spending_per_student_of_GDP.csv")

# ".." marks a missing value in this export.
data_edu_spending = data_edu_spending.replace({"..": np.nan})
# Convert the year columns (positions 28..53) to numeric dtype. The columns
# are assigned by label rather than through `.iloc[:, 28:54] = ...`: since
# pandas 2.0 an iloc assignment writes into the existing object-dtype columns,
# which leaves them non-numeric and makes the interpolation below fail.
year_cols = data_edu_spending.columns[28:54]
data_edu_spending[year_cols] = data_edu_spending[year_cols].apply(pd.to_numeric)

data_edu_spending_int = data_edu_spending[year_cols]

# Only the country code is needed besides the year columns.
data_edu_spending = data_edu_spending[["Country Code"]]
data_edu_spending = data_edu_spending.rename(columns={"Country Code": "LOCATION"})

# Interpolate linearly along each row; limit_direction="both" also fills gaps
# at the start and at the end of a row.
data_edu_spending_int = data_edu_spending_int.interpolate(
    method="linear", axis=1, limit_direction="both"
)

data_edu_spending = pd.concat([data_edu_spending[["LOCATION"]], data_edu_spending_int], axis=1)

# Long format: one row per (LOCATION, TIME).
data_edu_spending = data_edu_spending.melt(
    id_vars='LOCATION', var_name="TIME", value_name="EDU_SPENDING"
)

# Reduce every year label that is longer than four characters to its first
# four characters (the year), then convert TIME to a number so the table can
# be merged with the others.
mask = data_edu_spending["TIME"].str.len() >= 5
data_edu_spending.loc[mask, "TIME"] = data_edu_spending.loc[mask, "TIME"].str[:4]
data_edu_spending["TIME"] = pd.to_numeric(data_edu_spending["TIME"])

data_edu_spending = data_edu_spending[data_edu_spending["TIME"] >= 1999]
data_edu_spending.info()
data_edu_spending.head()

### Alkoholkonsum

In [None]:
# Load the raw alcohol-consumption table.
data_alc = pd.read_csv('../data/alcohol_consumption.csv')

In [None]:
# Keep the key columns (all years) and store the measurement as ALC_PC.
data_alc = data_alc.loc[:, ['LOCATION', 'TIME', 'Value']].rename(
    columns={'Value': 'ALC_PC'}
)

data_alc.head()

### Corruption Perceptions Index

In [None]:
# Corruption Perceptions Index: one CSV per year, for 2000 and then every
# third year up to 2018. Each file is tagged with its year in a TIME column
# and all files are stacked row-wise.
data_corruption = pd.read_csv('../data/cpi/cpi_2000_formatted.csv')
data_corruption["TIME"] = 2000
cpi_frames = [data_corruption]

for year in range(2003, 2019, 3):
    print('../data/cpi/cpi_',year,'_formatted.csv')
    data_corruption_next = pd.read_csv(f'../data/cpi/cpi_{year}_formatted.csv')
    data_corruption_next["TIME"] = year
    cpi_frames.append(data_corruption_next)

data_corruption = pd.concat(cpi_frames, axis=0)

data_corruption.info()
data_corruption.tail()

### Sozialleistungsquote

In [None]:
# Load the social-spending table and keep only the rows with
# SUBJECT == "PUBNET" and MEASURE == "PC_GDP" (public net spending as a
# percentage of GDP), for all years.
data_social = pd.read_csv("../data/public_net_social_spending.csv")

is_public_net = data_social["SUBJECT"] == "PUBNET"
is_pct_of_gdp = data_social["MEASURE"] == "PC_GDP"
data_social = data_social.loc[
    is_public_net & is_pct_of_gdp, ["LOCATION", "TIME", "Value"]
].rename(columns={"Value": "SOCIAL_EXP"})

data_social.info()
data_social.head()

### Internetzugang
https://data.oecd.org/ict/internet-access.htm

In [None]:
# Load the internet-access table, keep the key columns and store the
# measurement as INTERNET_PC.
data_internet = pd.read_csv("../data/internet_access.csv")

data_internet = data_internet[["LOCATION", "TIME", "Value"]]
data_internet = data_internet.rename(columns={"Value": "INTERNET_PC"})

# Reshape to one row per country and one column per year (a time series per row).
data_internet_int = data_internet.pivot_table("INTERNET_PC", "LOCATION", "TIME")

# Turn the pivot table back into a plain DataFrame.
data_internet_int.columns.name = None               # drop the "TIME" label of the column axis
data_internet_int = data_internet_int.reset_index()                # LOCATION becomes a regular column

# Row-less frame that only carries the column labels "2000".."2004"; it is
# concatenated below to add these years as empty (NaN) columns.
# NOTE(review): these labels are strings while the pivot's year labels come
# from the TIME column — presumably integers; TIME is converted to numeric
# after the melt below.
nan_cols = pd.DataFrame(columns=["2000", "2001", "2002", "2003", "2004"], dtype="float64")

# Keep a reference to the wide table that still contains LOCATION.
data_internet_labels = data_internet_int

# Interpolation works on the numeric year columns only, so LOCATION is dropped first.
data_internet_int = data_internet_int.drop(["LOCATION"], axis=1) 

# Put the empty 2000-2004 columns in front of the existing year columns.
data_internet_int = pd.concat([nan_cols, data_internet_int]
                            , ignore_index = None, axis=1)

# Interpolate linearly along each row; limit_direction="both" also fills gaps
# at the start and at the end of a row, which covers the added columns.
data_internet_int = data_internet_int.interpolate(method="linear", axis=1, limit_direction="both")
# Re-attach the country code in front of the year columns.
data_internet_int = pd.concat([data_internet_labels["LOCATION"], data_internet_int]
                            , ignore_index = False, axis=1)
# Back to long format: one row per (LOCATION, TIME).
data_internet_int = data_internet_int.melt(id_vars='LOCATION', 
                                     var_name="TIME", value_name="INTERNET_PC")

# Convert the year labels to numbers so TIME has a single numeric type.
data_internet_int.TIME = data_internet_int.TIME.apply(pd.to_numeric)
data_internet = data_internet_int

data_internet.info()
data_internet.tail()

### Anteil der 25-64 Jährigen mit tertiärer Bildung
https://data.oecd.org/eduatt/adult-education-level.htm

In [None]:
# Load the tertiary-education table, keep the key columns and store the
# measurement as PCT_EDU_TRY.
data_edu_try = pd.read_csv("../data/tertiary_edu.csv")[["LOCATION", "TIME", "Value"]]
data_edu_try = data_edu_try.rename(columns={"Value": "PCT_EDU_TRY"})

# Reshape to one row per country and one column per year.
edu_try_wide = data_edu_try.pivot_table(
    values="PCT_EDU_TRY", index="LOCATION", columns="TIME"
)
edu_try_wide.columns.name = None
data_edu_try_labels = edu_try_wide.reset_index()

# Interpolate linearly along each country's time series; limit_direction="both"
# also fills gaps at the start and at the end of a row.
data_edu_try_int = data_edu_try_labels.drop(columns="LOCATION").interpolate(
    method="linear", axis=1, limit_direction="both"
)
# Put the country code back in front of the year columns.
data_edu_try_int.insert(0, "LOCATION", data_edu_try_labels["LOCATION"])

# Back to long format: one row per (LOCATION, TIME).
data_edu_try_int = data_edu_try_int.melt(
    id_vars="LOCATION", var_name="TIME", value_name="PCT_EDU_TRY"
)
data_edu_try = data_edu_try_int

data_edu_try.info()
data_edu_try.tail()

### Patentanmeldungen
https://data.oecd.org/rd/triadic-patent-families.htm

In [None]:
# Load the patents table, keep the key columns and store the measurement as PATENTS.
data_patents = (
    pd.read_csv("../data/patents.csv")
    .loc[:, ["LOCATION", "TIME", "Value"]]
    .rename(columns={"Value": "PATENTS"})
)

data_patents.info()
data_patents.head()

### Housing overcrowding
https://data.oecd.org/inequality/housing-overcrowding.htm

In [None]:
# Load the housing-overcrowding table, keep the key columns and store the
# measurement as HOUSING.
data_housing = (
    pd.read_csv("../data/housing_overcrowding.csv")
    .loc[:, ["LOCATION", "TIME", "Value"]]
    .rename(columns={"Value": "HOUSING"})
)

data_housing.info()
data_housing.head()

### Poverty rate
https://data.oecd.org/inequality/poverty-rate.htm

In [None]:
# Load the poverty-rate table, keep the key columns and store the measurement
# as POVERTY.
data_poverty = (
    pd.read_csv("../data/poverty_rate.csv")
    .loc[:, ["LOCATION", "TIME", "Value"]]
    .rename(columns={"Value": "POVERTY"})
)

data_poverty.info()
data_poverty.head()

### Employment rate
https://data.oecd.org/emp/employment-rate.htm

In [None]:
# Load the employment-rate table, keep the key columns and store the
# measurement as EMPLOYMENT.
data_employment = (
    pd.read_csv("../data/employment_rate.csv")
    .loc[:, ["LOCATION", "TIME", "Value"]]
    .rename(columns={"Value": "EMPLOYMENT"})
)

data_employment.info()
data_employment.head()

### Early marriage: Daten nur für 2019
https://data.oecd.org/inequality/discriminatory-family-code.htm#indicator-chart

In [None]:
# Load the child-marriage table, keep the key columns and store the
# measurement as EARLY_MARRIAGE.
data_early_marriage = (
    pd.read_csv("../data/child_marriage.csv")
    .loc[:, ["LOCATION", "TIME", "Value"]]
    .rename(columns={"Value": "EARLY_MARRIAGE"})
)

data_early_marriage.info()
data_early_marriage.tail()

### teaching hours

In [None]:
# Load the teaching-hours table and keep only the rows with SUBJECT == "LOWSRY";
# the measurement is stored as TEACHINGHRS.
data_teaching_hrs = pd.read_csv("../data/teaching_hours.csv")
data_teaching_hrs = data_teaching_hrs.loc[
    data_teaching_hrs["SUBJECT"] == "LOWSRY", ["LOCATION", "TIME", "Value"]
].rename(columns={"Value": "TEACHINGHRS"})

data_teaching_hrs.info()
data_teaching_hrs.head()

### Gesundheitsausgaben
https://data.oecd.org/healthres/health-spending.htm

In [None]:
# Load the health-spending table from health_spending_PC.csv and keep only
# the rows with SUBJECT == "TOT"; the measurement is stored as
# HEALTH_SPENDING_PC.
data_health_spending_PC = pd.read_csv("../data/health_spending_PC.csv")
data_health_spending_PC = data_health_spending_PC.loc[
    data_health_spending_PC["SUBJECT"] == "TOT", ["LOCATION", "TIME", "Value"]
].rename(columns={"Value": "HEALTH_SPENDING_PC"})

data_health_spending_PC.info()
data_health_spending_PC.head()

In [None]:
# Load the health-spending table from health_spending_PC_GDP.csv and keep
# only the rows with SUBJECT == "TOT"; the measurement is stored as
# HEALTH_SPENDING_PCT_GDP.
data_health_spending_PCT_GDP = pd.read_csv("../data/health_spending_PC_GDP.csv")
data_health_spending_PCT_GDP = data_health_spending_PCT_GDP.loc[
    data_health_spending_PCT_GDP["SUBJECT"] == "TOT", ["LOCATION", "TIME", "Value"]
].rename(columns={"Value": "HEALTH_SPENDING_PCT_GDP"})

data_health_spending_PCT_GDP.info()
data_health_spending_PCT_GDP.head()

### Human Development Index
http://hdr.undp.org/en/indicators/137506#
https://ourworldindata.org/human-development-index (Hier)

In [None]:

# Load the Human Development Index table.
data_hdi = pd.read_csv("../data/hdi.csv")

# Keep country code, year and HDI value, rename them to the notebook's common
# column names and keep the years from 1999 onwards.
data_hdi = data_hdi[["Code", "Year", "Human Development Index (UNDP)"]]
data_hdi = data_hdi.rename(columns={"Code": "LOCATION", "Year": "TIME", "Human Development Index (UNDP)": "HDI"})
data_hdi = data_hdi[data_hdi["TIME"] >= 1999]

# Reshape to one row per country and one column per year (a time series per row).
data_hdi_p = data_hdi.pivot_table("HDI", "LOCATION", "TIME")
# Add an empty column for 2018; it is filled by the interpolation below.
# NOTE(review): the label is the string "2018" while the pivot's year labels
# come from the TIME column — presumably integers; TIME is converted to
# numeric after the melt below.
data_hdi_p["2018"] = np.nan

# Turn the pivot table back into a plain DataFrame.
data_hdi_p.columns.name = None               # drop the "TIME" label of the column axis
data_hdi_p = data_hdi_p.reset_index()                # LOCATION becomes a regular column

# Keep a reference to the wide table that still contains LOCATION.
data_hdi_labels = data_hdi_p

# Interpolate linearly along each row on the year columns only;
# limit_direction="both" also fills gaps at the start and at the end of a row.
data_hdi_p = data_hdi_p.drop(["LOCATION"], axis=1) 
data_hdi_p = data_hdi_p.interpolate(method="linear", axis=1, limit_direction="both")
# Re-attach the country code in front of the year columns.
data_hdi_p = pd.concat([data_hdi_labels["LOCATION"], data_hdi_p]
                            , ignore_index = False, axis=1)

# Back to long format: one row per (LOCATION, TIME).
data_hdi_p = data_hdi_p.melt(id_vars='LOCATION', 
                                     var_name="TIME", value_name="HDI")

# Convert the year labels to numbers; required for merging with the other tables.
data_hdi_p.TIME = data_hdi_p.TIME.apply(pd.to_numeric)

data_hdi = data_hdi_p

data_hdi.info()
data_hdi.tail()

### Homicide rate
- Worldbank

In [None]:
# Load the wide homicide-rate table (one column per year).
data_homicide = pd.read_csv("../data/homicide_rate.csv")

# The columns at positions 34..62 are used as the yearly values. Interpolate
# them linearly along each row; limit_direction="both" also fills gaps at the
# start and at the end of a row.
data_homicide_int = data_homicide.iloc[:, 34:63].interpolate(
    method="linear", axis=1, limit_direction="both"
)

# Country code next to the interpolated year columns.
data_homicide = data_homicide[["Country Code"]].rename(
    columns={"Country Code": "LOCATION"}
)
data_homicide = pd.concat([data_homicide, data_homicide_int], axis=1)

# Long format: one row per (LOCATION, TIME).
data_homicide = data_homicide.melt(
    id_vars="LOCATION", var_name="TIME", value_name="HOMICIDES"
)
# Numeric TIME is required for merging with the other tables.
data_homicide["TIME"] = pd.to_numeric(data_homicide["TIME"])

data_homicide.info()
data_homicide.head()

# Mergen aller Daten in eine große Tabelle (Outer Join)

In [None]:
# Combine the three PISA tables on (LOCATION, TIME, SUBJECT); each score
# column gets its final name before the join.
pisa_keys = ['LOCATION', 'TIME', 'SUBJECT']
data_all = data_pisa_math.rename(columns={"Value": "PISA Math"})
for pisa_frame, score_name in [
    (data_pisa_read, "PISA Read"),
    (data_pisa_science, "PISA Science"),
]:
    data_all = data_all.merge(
        pisa_frame.rename(columns={"Value": score_name}), how="outer", on=pisa_keys
    )

# Outer-join every regressor table on (LOCATION, TIME), in this order.
# data_social, both health-spending tables, data_patents, data_housing,
# data_poverty, data_teaching_hrs, data_employment and data_hdi are currently
# not merged.
regressor_frames = [
    data_gini,
    data_migration,
    data_str,
    data_gdp,
    data_edu_spending,
    data_corruption,
    data_alc,
    data_internet,
    data_edu_try,
    data_homicide,
]
for regressor_frame in regressor_frames:
    data_all = data_all.merge(regressor_frame, how="outer", on=['LOCATION', 'TIME'])

# Drop the OECD average (OAVG) as well as Macau (MAC) and Taiwan (TWN), for
# which only China's macro data is available, and keep only the rows with
# SUBJECT == "TOT".
excluded_locations = ["OAVG", "MAC", "TWN"]
keep_rows = ~data_all["LOCATION"].isin(excluded_locations) & (data_all["SUBJECT"] == "TOT")
data_all = data_all[keep_rows]

data_all.info()
data_all.tail()

In [None]:
# Print a summary of the merged table and write it, including its missing
# values, to CSV without the index column.
data_all.info()
data_all.to_csv('../data/all_nan.csv', index=False)

# KNN Imputation

In [None]:
def knn_impute(data, year, k):
    """Fill missing regressor values of one year by k-nearest-neighbour imputation.

    Only the rows with ``TIME == year`` are imputed; all other rows are passed
    through unchanged. The identifier columns (SUBJECT, LOCATION, TIME) and the
    three PISA score columns are left out of the imputation and re-attached
    unchanged afterwards, so the dependent variables neither influence the
    imputer nor get filled by it.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the columns SUBJECT, LOCATION, TIME, "PISA Math",
        "PISA Read" and "PISA Science"; every remaining column is passed to
        the scaler and the imputer.
    year
        Value of TIME whose rows are imputed.
    k : int
        Number of neighbours used by the imputer.

    Returns
    -------
    pandas.DataFrame
        The rows of all other years (with their original index) followed by
        the imputed rows of ``year`` (index reset to 0..n-1). If ``data`` has
        no rows for ``year``, only the other rows are returned.
    """
    label_cols = ["SUBJECT", "LOCATION", "TIME", "PISA Math", "PISA Read", "PISA Science"]

    is_year = data["TIME"] == year
    data_other = data[~is_year]
    data_year = data[is_year].reset_index(drop=True)

    # Nothing to impute for this year; scaling an empty table would raise.
    if data_year.empty:
        return data_other

    features = data_year.drop(columns=label_cols)

    # Columns without a single observed value in this year cannot be imputed.
    # KNNImputer drops them from its output by default, which would break the
    # reconstruction of the frame, so they are set aside and re-added as NaN.
    empty_cols = features.columns[features.isna().all()]
    usable = features.drop(columns=empty_cols)

    if usable.shape[1] > 0:
        from sklearn.impute import KNNImputer
        from sklearn.preprocessing import MinMaxScaler

        # Scale every column to [0, 1] so that all regressors contribute
        # equally to the neighbour distances.
        scaler = MinMaxScaler()
        scaled = scaler.fit_transform(usable)

        filled = KNNImputer(n_neighbors=k).fit_transform(scaled)

        # Transform the imputed values back to the original scale.
        usable = pd.DataFrame(scaler.inverse_transform(filled), columns=usable.columns)

    imputed = usable.reindex(columns=features.columns)

    # Re-attach the identifier and PISA columns that were left out above.
    data_imputed_year = pd.concat([data_year[label_cols], imputed], axis=1)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([data_other, data_imputed_year])

In [None]:
# Impute every third year from 2000 to 2018, one year at a time, with k = 2
# neighbours. Each pass feeds the result of the previous one back in.
years = list(range(2000, 2019))
imputation_years = years[::3]

data_all_imputed = data_all
for y in imputation_years:
    print("################################# Imputing ", y, "#################")
    data_all_imputed = knn_impute(data_all_imputed, y, 2)
    print("#################################", y, " imputed #################")

### Logarithmieren bei den passenden Regressoren
Logarithmiert wurden alle Regressoren bei denen durch diese Transformation
- die Korrelation (= der lineare Zusammenhang) deutlich erhöht werden konnte
- starke Heteroskedastie vermieden werden konnte
- eine stark gestauchte Verteilung in eine gleichmäßigere Verteilung überführt werden konnte

In [None]:
# Replace each of these regressors by its natural logarithm, in both the
# imputed and the non-imputed table: the column "X" is dropped and
# "log(X)" is appended instead. SOCIAL_EXP is currently not transformed.
for column in ["MIGRANTS", "GDP", "EDU_SPENDING", "PCT_EDU_TRY"]:
    log_column = f"log({column})"

    data_all_imputed[log_column] = np.log(data_all_imputed[column])
    data_all_imputed = data_all_imputed.drop(columns=[column])

    data_all[log_column] = np.log(data_all[column])
    data_all = data_all.drop(columns=[column])

# Gemeinsame Korrelationsanalyse

In [None]:
# Heatmap of the absolute pairwise correlations (rounded to two decimals)
# between the PISA scores and the regressors of the imputed table.
heatmap_columns = [
    "PISA Math",
    "PISA Read",
    "PISA Science",
    "GINI",
    "log(GDP)",
    "CPI",
    "log(PCT_EDU_TRY)",
    "log(MIGRANTS)",
    "log(EDU_SPENDING)",
    "ALC_PC",
    "HOMICIDES",
    "INTERNET_PC",
    "STR_SRY",
]
fig, ax = plt.subplots(figsize=(12, 12))
r_all = data_all_imputed[heatmap_columns].corr().round(decimals=2).abs()
sns.heatmap(r_all, cmap="Blues", robust=True, annot=True)

https://towardsdatascience.com/the-use-of-knn-for-missing-values-cf33d935c637
https://towardsdatascience.com/how-to-find-the-optimal-value-of-k-in-knn-35d936e554eb
Verschieden starke Korrelation abhängig von der Wahl von k:
- geringes k: anfällig für Ausreißer, da einzelne Nachbarn großen Einfluss haben
- hohes k: alle fehlenden Zellen werden mit sehr ähnlichen Werten gefüllt -> Gruppengrenzen verschwimmen
- für k=n entspricht kNN-Imputation der einfachen Imputation mit dem arithmetischen Mittel
<table>
    <tr>
        <th>Anzahl der Nachbarn k</th><th>Korrelation von PISA Math und GINI</th>
    </tr>
    <tr>
        <td>1</td><td>0,74</td>
    </tr>
    <tr>
        <td>2</td><td>0,76</td>
    </tr>
    <tr>
        <td>3</td><td>0,73</td>
    </tr>    
    <tr>
        <td>4</td><td>0,71</td>
    </tr>
    <tr>
        <td>5</td><td>0,70</td>
    </tr>
    <tr>
        <td>7</td><td>0,69</td>
    </tr>
    <tr>
        <td>9</td><td>0,68</td>
    </tr>
    <tr>
        <td>15</td><td>0,66</td>
    </tr>
    <tr>
        <td>30</td><td>0,62</td>
    </tr>
    <tr>
        <td>50</td><td>0,62</td>
    </tr>
    <tr>
        <td>100</td><td>0,62</td>
    </tr>
</table>

## Aufteilen in Math, Read, Science

In [None]:
# Math-only data set: remove the two other PISA targets from the imputed
# frame and keep only rows without any remaining missing value.
data_math = data_all_imputed.drop(columns=["PISA Read", "PISA Science"]).dropna()
data_math.info()
data_math.head()

In [None]:
# Reading-only data set: remove the two other PISA targets from the imputed
# frame and keep only rows without any remaining missing value.
data_read = data_all_imputed.drop(columns=["PISA Math", "PISA Science"]).dropna()
data_read.info()
data_read.head()

In [None]:
# Science-only data set: remove the two other PISA targets from the imputed
# frame and keep only rows without any remaining missing value.
data_science = data_all_imputed.drop(columns=["PISA Read", "PISA Math"]).dropna()
data_science.info()
data_science.head()

### log(GDP): Evaluation der kNN-Imputation: Vergleich mit mean-Imputation und gar keiner Imputation

In [None]:
# log(GDP) vs. PISA Math under three treatments of missing values.
# For each treatment the correlation coefficient is printed and a scatter
# plot with the fitted regression line is shown.
data = data_all.dropna()  # complete cases only (third panel)

samples = (
    # 1) imputed data set (data_math is derived from data_all_imputed)
    (data_math["log(GDP)"], data_math["PISA Math"]),
    # 2) mean imputation: every gap is replaced by the column mean
    (
        data_all["log(GDP)"].fillna(data_all["log(GDP)"].mean()),
        data_all["PISA Math"].fillna(data_all["PISA Math"].mean()),
    ),
    # 3) no imputation: rows without any missing value
    (data["log(GDP)"], data["PISA Math"]),
)

for x, y in samples:
    r_gini = np.corrcoef(x, y)
    print("r = " + str(r_gini[0, 1]))

    slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
    line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

    fig, ax = plt.subplots()
    ax.plot(x, y, linewidth=0, marker='x', label='Data points')
    ax.plot(x, intercept + slope * x, label=line)
    ax.set_xlabel('log(GDP)')
    ax.set_ylabel('PISA_MATH')
    ax.legend(facecolor='white')
    plt.show()

### STR_SRY: Evaluation der kNN-Imputation: Vergleich mit mean-Imputation und gar keiner Imputation

In [None]:
# STR_SRY vs. PISA Math under three treatments of missing values.
# For each treatment the correlation coefficient is printed and a scatter
# plot with the fitted regression line is shown.
data = data_all.dropna()  # complete cases only (third panel)

samples = (
    # 1) imputed data set (data_math is derived from data_all_imputed)
    (data_math["STR_SRY"], data_math["PISA Math"]),
    # 2) mean imputation: every gap is replaced by the column mean
    (
        data_all["STR_SRY"].fillna(data_all["STR_SRY"].mean()),
        data_all["PISA Math"].fillna(data_all["PISA Math"].mean()),
    ),
    # 3) no imputation: rows without any missing value
    (data["STR_SRY"], data["PISA Math"]),
)

for x, y in samples:
    r_gini = np.corrcoef(x, y)
    print("r = " + str(r_gini[0, 1]))

    slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
    line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

    fig, ax = plt.subplots()
    ax.plot(x, y, linewidth=0, marker='x', label='Data points')
    ax.plot(x, intercept + slope * x, label=line)
    ax.set_xlabel('STR_SRY')
    ax.set_ylabel('PISA_MATH')
    ax.legend(facecolor='white')
    plt.show()

### log(MIGRANTS): Evaluation der kNN-Imputation: Vergleich mit mean-Imputation und gar keiner Imputation
- heteroskedastische Züge im Scatter-Plot zu erkennen --> log

In [None]:
# log(MIGRANTS) vs. PISA Math under three treatments of missing values.
# For each treatment the correlation coefficient is printed and a scatter
# plot with the fitted regression line is shown.
data = data_all.dropna()  # complete cases only (third panel)

samples = (
    # 1) imputed data set (data_math is derived from data_all_imputed)
    (data_math["log(MIGRANTS)"], data_math["PISA Math"]),
    # 2) mean imputation: every gap is replaced by the column mean
    (
        data_all["log(MIGRANTS)"].fillna(data_all["log(MIGRANTS)"].mean()),
        data_all["PISA Math"].fillna(data_all["PISA Math"].mean()),
    ),
    # 3) no imputation: rows without any missing value
    (data["log(MIGRANTS)"], data["PISA Math"]),
)

for x, y in samples:
    r_gini = np.corrcoef(x, y)
    print("r = " + str(r_gini[0, 1]))

    slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
    line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

    fig, ax = plt.subplots()
    ax.plot(x, y, linewidth=0, marker='x', label='Data points')
    ax.plot(x, intercept + slope * x, label=line)
    ax.set_xlabel('log(MIGRANTS)')
    ax.set_ylabel('PISA_MATH')
    ax.legend(facecolor='white')
    plt.show()

### GINI: Evaluation der kNN-Imputation: Vergleich mit mean-Imputation und gar keiner Imputation

In [None]:
# GINI vs. PISA Math under three treatments of missing values.
# For each treatment the correlation coefficient is printed and a scatter
# plot with the fitted regression line is shown.

# --- mean imputation -------------------------------------------------------
# Work on a copy. `data_mean = data_all` would only create a second name for
# the same DataFrame, so filling GINI would also overwrite the NaNs inside
# data_all. The "no imputation" panel below (and every later
# `data_all.dropna()`) would then silently contain mean-imputed GINI values.
data_mean = data_all.copy()
data_mean["GINI"] = data_mean["GINI"].fillna(data_mean["GINI"].mean())
# Unlike GINI, the target is not filled: rows without a PISA Math score are
# removed instead.
data_mean = data_mean[pd.notnull(data_mean["PISA Math"])]

# --- no imputation: rows without any missing value -------------------------
data = data_all.dropna()

samples = (
    # 1) imputed data set (data_math is derived from data_all_imputed)
    (data_math["GINI"], data_math["PISA Math"]),
    # 2) mean-imputed GINI
    (data_mean["GINI"], data_mean["PISA Math"]),
    # 3) complete cases only
    (data["GINI"], data["PISA Math"]),
)

for x, y in samples:
    r_gini = np.corrcoef(x, y)
    print("r = " + str(r_gini[0, 1]))

    slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
    line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

    fig, ax = plt.subplots(figsize=(7, 5))
    ax.plot(x, y, linewidth=0, marker='x', label='Data points')
    ax.plot(x, intercept + slope * x, label=line)
    ax.set_xlabel('GINI')
    ax.set_ylabel('PISA_MATH')
    ax.legend(facecolor='white')
    plt.show()

###   log(EDU_SPENDING): Evaluation der kNN-Imputation: Vergleich mit mean-Imputation und gar keiner Imputation

In [None]:
# log(EDU_SPENDING) vs. PISA Math under three treatments of missing values.
# For each treatment the correlation coefficient is printed and a scatter
# plot with the fitted regression line is shown.
#
# All three panels use the log-transformed regressor. Previously the first
# panel back-transformed x with np.exp(), i.e. it showed raw EDU_SPENDING
# under a 'log(EDU_SPENDING)' axis label and was not comparable with the
# other two panels.
data = data_all.dropna()  # complete cases only (third panel)

samples = (
    # 1) imputed data set (data_math is derived from data_all_imputed)
    (data_math["log(EDU_SPENDING)"], data_math["PISA Math"]),
    # 2) mean imputation: every gap is replaced by the column mean
    (
        data_all["log(EDU_SPENDING)"].fillna(data_all["log(EDU_SPENDING)"].mean()),
        data_all["PISA Math"].fillna(data_all["PISA Math"].mean()),
    ),
    # 3) no imputation: rows without any missing value
    (data["log(EDU_SPENDING)"], data["PISA Math"]),
)

for x, y in samples:
    r_gini = np.corrcoef(x, y)
    print("r = " + str(r_gini[0, 1]))

    slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
    line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

    fig, ax = plt.subplots()
    ax.plot(x, y, linewidth=0, marker='x', label='Data points')
    ax.plot(x, intercept + slope * x, label=line)
    ax.set_xlabel('log(EDU_SPENDING)')
    ax.set_ylabel('PISA_MATH')
    ax.legend(facecolor='white')
    plt.show()

### Alcohol consumption per capita: Evaluation der kNN-Imputation: Vergleich mit mean-Imputation und gar keiner Imputation

In [None]:
# ALC_PC vs. PISA Math under three treatments of missing values.
# For each treatment the correlation coefficient is printed and a scatter
# plot with the fitted regression line is shown.
data = data_all.dropna()  # complete cases only (third panel)

samples = (
    # 1) imputed data set (data_math is derived from data_all_imputed)
    (data_math["ALC_PC"], data_math["PISA Math"]),
    # 2) mean imputation: every gap is replaced by the column mean
    (
        data_all["ALC_PC"].fillna(data_all["ALC_PC"].mean()),
        data_all["PISA Math"].fillna(data_all["PISA Math"].mean()),
    ),
    # 3) no imputation: rows without any missing value
    (data["ALC_PC"], data["PISA Math"]),
)

for x, y in samples:
    r_gini = np.corrcoef(x, y)
    print("r = " + str(r_gini[0, 1]))

    slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
    line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

    fig, ax = plt.subplots()
    ax.plot(x, y, linewidth=0, marker='x', label='Data points')
    ax.plot(x, intercept + slope * x, label=line)
    ax.set_xlabel('ALC_PC')
    ax.set_ylabel('PISA_MATH')
    ax.legend(facecolor='white')
    plt.show()

### Social_Expenses_PCT: Evaluation der kNN-Imputation: Vergleich mit mean-Imputation und gar keiner Imputation

In [None]:
# NOTE(review): this entire cell is commented out. It reads a "log(SOCIAL_EXP)"
# column whose creation is likewise commented out in the log-transform cell,
# so it cannot be re-enabled on its own.
# #-------------------- Imputation ----------------------
# x = data_math["log(SOCIAL_EXP)"]
# # x = np.log(x)
# # x = np.sqrt(x)
# y = data_math["PISA Math"]
# # y = np.log(y)
# # y = np.sqrt(y)

# #plt.plot(x, y, "o", color="blue")
# r_gini = np.corrcoef(x, y)
# print("r = " + str(r_gini[0, 1]))

# slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
# line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
# #line
# fig, ax = plt.subplots()
# ax.plot(x, y, linewidth=0, marker='x', label='Data points')
# ax.plot(x, intercept + slope * x, label=line)
# ax.set_xlabel('log(SOCIAL_EXP)')
# ax.set_ylabel('PISA_MATH')
# ax.legend(facecolor='white')
# plt.show()

# # --------------- Imputation mit mean ---------------------
# x = data_all["log(SOCIAL_EXP)"].fillna(data_all["log(SOCIAL_EXP)"].mean())
# # x = np.log(x)
# y = data_all["PISA Math"].fillna(data_all["PISA Math"].mean())
# # y = np.log(y)

# #plt.plot(x, y, "o", color="blue")
# r_gini = np.corrcoef(x, y)
# print("r = " + str(r_gini[0, 1]))

# slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
# line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
# #line
# fig, ax = plt.subplots()
# ax.plot(x, y, linewidth=0, marker='x', label='Data points')
# ax.plot(x, intercept + slope * x, label=line)
# ax.set_xlabel('SOCIAL_EXP')
# ax.set_ylabel('PISA_MATH')
# ax.legend(facecolor='white')
# plt.show()

# # --------------- No Imputation ---------------------
# data = data_all.dropna()
# x = data["log(SOCIAL_EXP)"]
# # x = np.log(x)
# y = data["PISA Math"]
# # y = np.log(y)
# #plt.plot(x, y, "o", color="blue")
# r_gini = np.corrcoef(x, y)
# print("r = " + str(r_gini[0, 1]))

# slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
# line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
# #line
# fig, ax = plt.subplots()
# ax.plot(x, y, linewidth=0, marker='x', label='Data points')
# ax.plot(x, intercept + slope * x, label=line)
# ax.set_xlabel('log(SOCIAL_EXP)')
# ax.set_ylabel('PISA_MATH')
# ax.legend(facecolor='white')
# plt.show()

### INTERNET_PC: Evaluation der kNN-Imputation: Vergleich mit mean-Imputation und gar keiner Imputation

In [None]:
# INTERNET_PC vs. PISA Math under three treatments of missing values.
# For each treatment the correlation coefficient is printed and a scatter
# plot with the fitted regression line is shown.
data = data_all.dropna()  # complete cases only (third panel)

samples = (
    # 1) imputed data set (data_math is derived from data_all_imputed)
    (data_math["INTERNET_PC"], data_math["PISA Math"]),
    # 2) mean imputation: every gap is replaced by the column mean
    (
        data_all["INTERNET_PC"].fillna(data_all["INTERNET_PC"].mean()),
        data_all["PISA Math"].fillna(data_all["PISA Math"].mean()),
    ),
    # 3) no imputation: rows without any missing value
    (data["INTERNET_PC"], data["PISA Math"]),
)

for x, y in samples:
    r_gini = np.corrcoef(x, y)
    print("r = " + str(r_gini[0, 1]))

    slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
    line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

    fig, ax = plt.subplots()
    ax.plot(x, y, linewidth=0, marker='x', label='Data points')
    ax.plot(x, intercept + slope * x, label=line)
    ax.set_xlabel('INTERNET_PC')
    ax.set_ylabel('PISA_MATH')
    ax.legend(facecolor='white')
    plt.show()

### PATENTS: Evaluation der kNN-Imputation: Vergleich mit mean-Imputation und gar keiner Imputation

In [None]:
# NOTE(review): this entire cell is commented out. "PATENTS" is also absent
# from the column list of the joint correlation heatmap, so the column is
# presumably no longer part of the data set -- confirm before re-enabling.
# #-------------------- Imputation ----------------------
# x = data_math["PATENTS"]
# x = np.log(x)
# # x = np.sqrt(x)
# y = data_math["PISA Math"]
# # y = np.log(y)
# # y = np.sqrt(y)

# #plt.plot(x, y, "o", color="blue")
# r_gini = np.corrcoef(x, y)
# print("r = " + str(r_gini[0, 1]))

# slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
# line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
# #line
# fig, ax = plt.subplots()
# ax.plot(x, y, linewidth=0, marker='x', label='Data points')
# ax.plot(x, intercept + slope * x, label=line)
# ax.set_xlabel('PATENTS')
# ax.set_ylabel('PISA_MATH')
# ax.legend(facecolor='white')
# plt.show()

# # --------------- Imputation mit mean ---------------------
# x = data_all["PATENTS"].fillna(data_all["PATENTS"].mean())
# # x = np.log(x)
# y = data_all["PISA Math"].fillna(data_all["PISA Math"].mean())
# # y = np.log(y)

# #plt.plot(x, y, "o", color="blue")
# r_gini = np.corrcoef(x, y)
# print("r = " + str(r_gini[0, 1]))

# slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
# line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
# #line
# fig, ax = plt.subplots()
# ax.plot(x, y, linewidth=0, marker='x', label='Data points')
# ax.plot(x, intercept + slope * x, label=line)
# ax.set_xlabel('PATENTS')
# ax.set_ylabel('PISA_MATH')
# ax.legend(facecolor='white')
# plt.show()

# # --------------- No Imputation ---------------------
# data = data_all.dropna()
# x = data["PATENTS"]
# # x = np.log(x)
# y = data["PISA Math"]
# # y = np.log(y)
# #plt.plot(x, y, "o", color="blue")
# r_gini = np.corrcoef(x, y)
# print("r = " + str(r_gini[0, 1]))

# slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
# line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
# #line
# fig, ax = plt.subplots()
# ax.plot(x, y, linewidth=0, marker='x', label='Data points')
# ax.plot(x, intercept + slope * x, label=line)
# ax.set_xlabel('PATENTS')
# ax.set_ylabel('PISA_MATH')
# ax.legend(facecolor='white')
# plt.show()

### HOUSING: Evaluation der kNN-Imputation: Vergleich mit mean-Imputation und gar keiner Imputation

In [None]:
# NOTE(review): this entire cell is commented out. "HOUSING" is also absent
# from the column list of the joint correlation heatmap, so the column is
# presumably no longer part of the data set -- confirm before re-enabling.
# #-------------------- Imputation ----------------------
# x = data_math["HOUSING"]
# # x = np.log(x)
# # x = np.sqrt(x)
# y = data_math["PISA Math"]
# # y = np.log(y)
# # y = np.sqrt(y)

# #plt.plot(x, y, "o", color="blue")
# r_gini = np.corrcoef(x, y)
# print("r = " + str(r_gini[0, 1]))

# slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
# line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
# #line
# fig, ax = plt.subplots()
# ax.plot(x, y, linewidth=0, marker='x', label='Data points')
# ax.plot(x, intercept + slope * x, label=line)
# ax.set_xlabel('HOUSING')
# ax.set_ylabel('PISA_MATH')
# ax.legend(facecolor='white')
# plt.show()

# # --------------- Imputation mit mean ---------------------
# x = data_all["HOUSING"].fillna(data_all["HOUSING"].mean())
# # x = np.log(x)
# y = data_all["PISA Math"].fillna(data_all["PISA Math"].mean())
# # y = np.log(y)

# #plt.plot(x, y, "o", color="blue")
# r_gini = np.corrcoef(x, y)
# print("r = " + str(r_gini[0, 1]))

# slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
# line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
# #line
# fig, ax = plt.subplots()
# ax.plot(x, y, linewidth=0, marker='x', label='Data points')
# ax.plot(x, intercept + slope * x, label=line)
# ax.set_xlabel('HOUSING')
# ax.set_ylabel('PISA_MATH')
# ax.legend(facecolor='white')
# plt.show()

# # --------------- No Imputation ---------------------
# data = data_all.dropna()
# x = data["HOUSING"]
# # x = np.log(x)
# y = data["PISA Math"]
# # y = np.log(y)
# #plt.plot(x, y, "o", color="blue")
# r_gini = np.corrcoef(x, y)
# print("r = " + str(r_gini[0, 1]))

# slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
# line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
# #line
# fig, ax = plt.subplots()
# ax.plot(x, y, linewidth=0, marker='x', label='Data points')
# ax.plot(x, intercept + slope * x, label=line)
# ax.set_xlabel('HOUSING')
# ax.set_ylabel('PISA_MATH')
# ax.legend(facecolor='white')
# plt.show()

### POVERTY: Evaluation der kNN-Imputation: Vergleich mit mean-Imputation und gar keiner Imputation

In [None]:
# NOTE(review): this entire cell is commented out. "POVERTY" is also absent
# from the column list of the joint correlation heatmap, so the column is
# presumably no longer part of the data set -- confirm before re-enabling.
# #-------------------- Imputation ----------------------
# x = data_math["POVERTY"]
# # x = np.log(x)
# # x = 1/x
# # x = np.log(x)
# # x = np.sqrt(x)
# y = data_math["PISA Math"]
# # y = np.log(y)
# # y = np.sqrt(y)

# #plt.plot(x, y, "o", color="blue")
# r_gini = np.corrcoef(x, y)
# print("r = " + str(r_gini[0, 1]))

# slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
# line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
# #line
# fig, ax = plt.subplots()
# ax.plot(x, y, linewidth=0, marker='x', label='Data points')
# ax.plot(x, intercept + slope * x, label=line)
# ax.set_xlabel('POVERTY')
# ax.set_ylabel('PISA_MATH')
# ax.legend(facecolor='white')
# plt.show()

# # --------------- Imputation mit mean ---------------------
# x = data_all["POVERTY"].fillna(data_all["POVERTY"].mean())
# # x = np.log(x)
# y = data_all["PISA Math"].fillna(data_all["PISA Math"].mean())
# # y = np.log(y)

# #plt.plot(x, y, "o", color="blue")
# r_gini = np.corrcoef(x, y)
# print("r = " + str(r_gini[0, 1]))

# slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
# line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
# #line
# fig, ax = plt.subplots()
# ax.plot(x, y, linewidth=0, marker='x', label='Data points')
# ax.plot(x, intercept + slope * x, label=line)
# ax.set_xlabel('POVERTY')
# ax.set_ylabel('PISA_MATH')
# ax.legend(facecolor='white')
# plt.show()

# # --------------- No Imputation ---------------------
# data = data_all.dropna()
# x = data["POVERTY"]
# # x = np.log(x)
# y = data["PISA Math"]
# # y = np.log(y)
# #plt.plot(x, y, "o", color="blue")
# r_gini = np.corrcoef(x, y)
# print("r = " + str(r_gini[0, 1]))

# slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
# line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
# #line
# fig, ax = plt.subplots()
# ax.plot(x, y, linewidth=0, marker='x', label='Data points')
# ax.plot(x, intercept + slope * x, label=line)
# ax.set_xlabel('POVERTY')
# ax.set_ylabel('PISA_MATH')
# ax.legend(facecolor='white')
# plt.show()

### Employment rate: Evaluation der kNN-Imputation: Vergleich mit mean-Imputation und gar keiner Imputation

In [None]:
# NOTE(review): this entire cell is commented out. "EMPLOYMENT" is also absent
# from the column list of the joint correlation heatmap, so the column is
# presumably no longer part of the data set -- confirm before re-enabling.
# #-------------------- Imputation ----------------------
# x = data_math["EMPLOYMENT"]
# # x = np.log(x)
# # x = 1/x
# # x = np.log(x)
# # x = np.sqrt(x)
# y = data_math["PISA Math"]
# # y = np.log(y)
# # y = np.sqrt(y)

# #plt.plot(x, y, "o", color="blue")
# r_gini = np.corrcoef(x, y)
# print("r = " + str(r_gini[0, 1]))

# slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
# line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
# #line
# fig, ax = plt.subplots()
# ax.plot(x, y, linewidth=0, marker='x', label='Data points')
# ax.plot(x, intercept + slope * x, label=line)
# ax.set_xlabel('EMPLOYMENT')
# ax.set_ylabel('PISA_MATH')
# ax.legend(facecolor='white')
# plt.show()

# # --------------- Imputation mit mean ---------------------
# x = data_all["EMPLOYMENT"].fillna(data_all["EMPLOYMENT"].mean())
# # x = np.log(x)
# y = data_all["PISA Math"].fillna(data_all["PISA Math"].mean())
# # y = np.log(y)

# #plt.plot(x, y, "o", color="blue")
# r_gini = np.corrcoef(x, y)
# print("r = " + str(r_gini[0, 1]))

# slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
# line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
# #line
# fig, ax = plt.subplots()
# ax.plot(x, y, linewidth=0, marker='x', label='Data points')
# ax.plot(x, intercept + slope * x, label=line)
# ax.set_xlabel('EMPLOYMENT')
# ax.set_ylabel('PISA_MATH')
# ax.legend(facecolor='white')
# plt.show()

# # --------------- No Imputation ---------------------
# data = data_all.dropna()
# x = data["EMPLOYMENT"]
# # x = np.log(x)
# y = data["PISA Math"]
# # y = np.log(y)
# #plt.plot(x, y, "o", color="blue")
# r_gini = np.corrcoef(x, y)
# print("r = " + str(r_gini[0, 1]))

# slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
# line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
# #line
# fig, ax = plt.subplots()
# ax.plot(x, y, linewidth=0, marker='x', label='Data points')
# ax.plot(x, intercept + slope * x, label=line)
# ax.set_xlabel('EMPLOYMENT')
# ax.set_ylabel('PISA_MATH')
# ax.legend(facecolor='white')
# plt.show()

### Teaching hours LOWSRY: Evaluation der kNN-Imputation: Vergleich mit mean-Imputation und gar keiner Imputation

In [None]:
# NOTE(review): this entire cell is commented out. "TEACHINGHRS" is also absent
# from the column list of the joint correlation heatmap, so the column is
# presumably no longer part of the data set -- confirm before re-enabling.
# #-------------------- Imputation ----------------------
# x = data_math["TEACHINGHRS"]
# # x=x**2
# # x = np.log(x)
# # x = 1/x
# # x = np.log(x)
# # x = np.sqrt(x)
# # x = np.exp(x)
# y = data_math["PISA Math"]
# # y = np.log(y)
# # y = np.sqrt(y)

# #plt.plot(x, y, "o", color="blue")
# r_gini = np.corrcoef(x, y)
# print("r = " + str(r_gini[0, 1]))

# slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
# line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
# #line
# fig, ax = plt.subplots()
# ax.plot(x, y, linewidth=0, marker='x', label='Data points')
# ax.plot(x, intercept + slope * x, label=line)
# ax.set_xlabel('TEACHINGHRS')
# ax.set_ylabel('PISA_MATH')
# ax.legend(facecolor='white')
# plt.show()

# # --------------- Imputation mit mean ---------------------
# x = data_all["TEACHINGHRS"].fillna(data_all["TEACHINGHRS"].mean())
# # x = np.log(x)
# y = data_all["PISA Math"].fillna(data_all["PISA Math"].mean())
# # y = np.log(y)

# #plt.plot(x, y, "o", color="blue")
# r_gini = np.corrcoef(x, y)
# print("r = " + str(r_gini[0, 1]))

# slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
# line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
# #line
# fig, ax = plt.subplots()
# ax.plot(x, y, linewidth=0, marker='x', label='Data points')
# ax.plot(x, intercept + slope * x, label=line)
# ax.set_xlabel('TEACHINGHRS')
# ax.set_ylabel('PISA_MATH')
# ax.legend(facecolor='white')
# plt.show()

# # --------------- No Imputation ---------------------
# data = data_all.dropna()
# x = data["TEACHINGHRS"]
# # x = np.log(x)
# y = data["PISA Math"]
# # y = np.log(y)
# #plt.plot(x, y, "o", color="blue")
# r_gini = np.corrcoef(x, y)
# print("r = " + str(r_gini[0, 1]))

# slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
# line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
# #line
# fig, ax = plt.subplots()
# ax.plot(x, y, linewidth=0, marker='x', label='Data points')
# ax.plot(x, intercept + slope * x, label=line)
# ax.set_xlabel('TEACHINGHRS')
# ax.set_ylabel('PISA_MATH')
# ax.legend(facecolor='white')
# plt.show()

### HDI: Evaluation der kNN-Imputation: Vergleich mit mean-Imputation und gar keiner Imputation

In [None]:
# NOTE(review): this entire cell is commented out. "HDI" is also absent from
# the column list of the joint correlation heatmap, so the column is
# presumably no longer part of the data set -- confirm before re-enabling.
# #-------------------- Imputation ----------------------
# x = data_math["HDI"]
# # x = np.log(x)
# # x = np.sqrt(x)
# y = data_math["PISA Math"]
# # y = np.log(y)
# # y = np.sqrt(y)

# #plt.plot(x, y, "o", color="blue")
# r_gini = np.corrcoef(x, y)
# print("r = " + str(r_gini[0, 1]))

# slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
# line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
# #line
# fig, ax = plt.subplots()
# ax.plot(x, y, linewidth=0, marker='x', label='Data points')
# ax.plot(x, intercept + slope * x, label=line)
# ax.set_xlabel('HDI')
# ax.set_ylabel('PISA_MATH')
# ax.legend(facecolor='white')
# plt.show()

# # --------------- Imputation mit mean ---------------------
# x = data_all["HDI"].fillna(data_all["HDI"].mean())
# # x = np.log(x)
# y = data_all["PISA Math"].fillna(data_all["PISA Math"].mean())
# # y = np.log(y)

# #plt.plot(x, y, "o", color="blue")
# r_gini = np.corrcoef(x, y)
# print("r = " + str(r_gini[0, 1]))

# slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
# line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
# #line
# fig, ax = plt.subplots()
# ax.plot(x, y, linewidth=0, marker='x', label='Data points')
# ax.plot(x, intercept + slope * x, label=line)
# ax.set_xlabel('HDI')
# ax.set_ylabel('PISA_MATH')
# ax.legend(facecolor='white')
# plt.show()

# # --------------- No Imputation ---------------------
# data = data_all.dropna()
# x = data["HDI"]
# # x = np.log(x)
# y = data["PISA Math"]
# # y = np.log(y)
# #plt.plot(x, y, "o", color="blue")
# r_gini = np.corrcoef(x, y)
# print("r = " + str(r_gini[0, 1]))

# slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
# line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
# #line
# fig, ax = plt.subplots()
# ax.plot(x, y, linewidth=0, marker='x', label='Data points')
# ax.plot(x, intercept + slope * x, label=line)
# ax.set_xlabel('HDI')
# ax.set_ylabel('PISA_MATH')
# ax.legend(facecolor='white')
# plt.show()

### HOMICIDES: Evaluation der kNN-Imputation: Vergleich mit mean-Imputation und gar keiner Imputation

In [None]:
# HOMICIDES vs. PISA Math under three treatments of missing values.
# For each treatment the correlation coefficient is printed and a scatter
# plot with the fitted regression line is shown.
#
# All three panels use the untransformed regressor. Previously the mean and
# the complete-case panel squared x (x = x**2) while the first panel did not,
# so the three r values and plots were not comparable and the 'HOMICIDES'
# axis label was wrong for two of them.
# NOTE(review): if the squared regressor was intended, apply it to all three
# samples and adjust the axis label accordingly.

# Rows with a homicide value of exactly 0 (printed for inspection).
print(data_math[data_math["HOMICIDES"] == 0])

data = data_all.dropna()  # complete cases only (third panel)

samples = (
    # 1) imputed data set (data_math is derived from data_all_imputed)
    (data_math["HOMICIDES"], data_math["PISA Math"]),
    # 2) mean imputation: every gap is replaced by the column mean
    (
        data_all["HOMICIDES"].fillna(data_all["HOMICIDES"].mean()),
        data_all["PISA Math"].fillna(data_all["PISA Math"].mean()),
    ),
    # 3) no imputation: rows without any missing value
    (data["HOMICIDES"], data["PISA Math"]),
)

for x, y in samples:
    r_gini = np.corrcoef(x, y)
    print("r = " + str(r_gini[0, 1]))

    slope, intercept, r, p, stderr = scipy.stats.linregress(x, y)
    line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

    fig, ax = plt.subplots()
    ax.plot(x, y, linewidth=0, marker='x', label='Data points')
    ax.plot(x, intercept + slope * x, label=line)
    ax.set_xlabel('HOMICIDES')
    ax.set_ylabel('PISA_MATH')
    ax.legend(facecolor='white')
    plt.show()

### log(PCT_EDU_TRY): Evaluation der kNN-Imputation: Vergleich mit mean-Imputation und gar keiner Imputation

In [None]:
# Side-by-side check of three missing-value strategies for the relation
# between log(PCT_EDU_TRY) and PISA Math:
#   1. data_math as it is (presumably the kNN-imputed frame, judging by the
#      section heading -- confirm against the cell that builds data_math),
#   2. data_all with gaps replaced by the column mean,
#   3. data_all reduced to rows without any missing value (no imputation).
# Each scenario prints its Pearson r and shows a scatter plot together with
# the fitted regression line. No extra transform is applied to x or y.
predictor = "log(PCT_EDU_TRY)"
outcome = "PISA Math"

# Rows of data_all that contain no missing value in any column.
complete_rows = data_all.dropna()

scenarios = (
    # imputed frame
    (data_math[predictor], data_math[outcome]),
    # mean imputation
    (
        data_all[predictor].fillna(data_all[predictor].mean()),
        data_all[outcome].fillna(data_all[outcome].mean()),
    ),
    # no imputation
    (complete_rows[predictor], complete_rows[outcome]),
)

for xs, ys in scenarios:
    # Off-diagonal entry of the 2x2 correlation matrix is Pearson's r.
    pearson = np.corrcoef(xs, ys)[0, 1]
    print("r = " + str(pearson))

    # Least-squares fit; only slope, intercept and r are used afterwards.
    slope, intercept, r, p_value, slope_stderr = scipy.stats.linregress(xs, ys)
    fit_label = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

    figure, axes = plt.subplots()
    # Markers only (linewidth=0) for the observations, a line for the fit.
    axes.plot(xs, ys, linewidth=0, marker='x', label='Data points')
    axes.plot(xs, intercept + slope * xs, label=fit_label)
    axes.set(xlabel='log(PCT_EDU_TRY)', ylabel='PISA_MATH')
    axes.legend(facecolor='white')
    plt.show()

## Export

In [None]:
# data_all_imputed_exp = data_all_imputed[["PISA Math","PISA Read","PISA Science","GINI","log(MIGRANTS)","STR_SRY","log(GDP)","log(EDU_SPENDING_SRY)","CPI", "ALC_PC", "log(SOCIAL_EXP)", "PATENTS", "HOUSING", "HDI"]]
# data_all_imputed_exp.to_csv('../data/all_imputed.csv', index=False)

In [None]:
# data_math_exp = data_math
# data_math_exp.to_csv('../data/math_imputed_2.csv', index=False)

In [None]:
# data_read_exp = data_read
# data_read_exp.to_csv('../data/read_imputed_2.csv', index=False)

In [None]:
# data_science_exp = data_science
# data_science_exp.to_csv('../data/science_imputed_2.csv', index=False)

In [None]:
# data_all_imputed_exp = data_all_imputed[["PISA Math","PISA Read","PISA Science","GINI","log(MIGRANTS)","STR_SRY","log(GDP)","log(EDU_SPENDING_SRY)","CPI", "ALC_PC", "log(SOCIAL_EXP)", "PATENTS", "HOUSING", "HDI"]]
# data_all_imputed_exp.to_csv('../data/all_imputed_2.csv', index=False)