##### Simulationen für MCAR mit den Top5 erklärenden Variablen (standardisiert)
In dem Notebook 'ols_prep' wird eine lineare Regression des Laptop-Preises auf alle Exogenen durchgeführt.
Anschließend werden alle Betas standardisiert und absteigend nach ihrer (absoluten) Größe sortiert. Daraus resultiert der Erklärungsgehalt der verschiedenen Exogenen. Wir nehmen die fünf Exogenen, die den größten Erklärungsgehalt haben und standardisieren sie. Sie dienen als Grundlage für die Simulationen in diesem Notebook. 
Es werden MCAR-Simulationen mit jeweils 1000 Wiederholungen pro Löschwahrscheinlichkeit (0,1 bis 0,9) durchgeführt, wobei jeweils nur die Top5 standardisierten Exogenen verwendet werden.
Es wird kein Jitter hinzugefügt.

In [26]:
import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KDTree
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

import statsmodels.api as sm
import statsmodels.formula.api as smf

# Seeded legacy NumPy random generator (seed 0), so that draws taken from
# `rng` are repeatable across kernel restarts.
rng = np.random.RandomState(seed=0)

In [34]:
# Load the raw data, drop the outlier row(s) and renumber the index.
df = (
    pd.read_csv('Laptop-Preise.csv', sep=';', decimal=',')
    .loc[lambda frame: frame.extern_Schnittstellen != 2300]  # remove outlier
    .reset_index(drop=True)
)

# Keep the price plus the five regressors with the largest explanatory power.
selected_columns = ['Preis', 'Akku_Kapazitaet', 'Arbeitsspeicher', 'Kerne', 'Mobilfunk_vorhanden', 'SSD']
df = df[selected_columns]

# Result containers: one column set for the KNN variants, one for OLS.
knn_result_columns = [
    'Mean KNN_1', 'MSE KNN_1', 'SE KNN_1', 'Alpha KNN_1',
    'Mean KNN_3', 'MSE KNN_3', 'SE KNN_3', 'Alpha KNN_3',
    'Mean KNN_5', 'MSE KNN_5', 'SE KNN_5', 'Alpha KNN_5',
]
ols_result_columns = ['Mean', 'MSE OLS', 'SE OLS', 'Alpha OLS']

imputed_simul_knn = pd.DataFrame(columns=knn_result_columns)
imputed_stats_knn = pd.DataFrame(columns=knn_result_columns, index=np.arange(0.1, 1, 0.1))

imputed_simul_ols = pd.DataFrame(columns=ols_result_columns)
imputed_stats_ols = pd.DataFrame(columns=ols_result_columns, index=np.arange(0.1, 1, 0.1))

no_imputation = pd.DataFrame(columns=['SE', 'Alpha'])

# Standardize the regressors (everything except the price).
df_noPrice = df.drop(columns='Preis')
col_names = df_noPrice.columns
scaler = StandardScaler().fit(df_noPrice.values)
df_noPrice = pd.DataFrame(scaler.transform(df_noPrice.values), columns=col_names)

# Standardized regressors with the unscaled price as the first column.
df_std = df_noPrice.copy()
df_std.insert(0, 'Preis', df['Preis'])

# True mean price of the complete (cleaned) data set.
true_mean = df['Preis'].mean()

In [35]:
def contains_mean(values):
    """Check whether the confidence interval around mean(values) covers true_mean.

    The interval is mean(values) +/- 1.95996 * SEM(values), i.e. a normal
    approximation at the two-sided 95 % level. Both bounds are exclusive.

    Parameters
    ----------
    values : sequence of numbers
        Sample (observed plus imputed prices) to build the interval from.

    Returns
    -------
    int
        1 if the module-level ``true_mean`` lies strictly inside the interval,
        otherwise 0.
    """
    center = np.mean(values)
    half_width = 1.95996 * stats.sem(values)
    lower_bound = center - half_width
    upper_bound = center + half_width
    return int(lower_bound < true_mean < upper_bound)

In [36]:
# Reference OLS fit on all rows of df: price regressed on the already
# standardized regressors in df_noPrice plus an intercept.
design_full = sm.add_constant(df_noPrice)
model = sm.OLS(df['Preis'], design_full).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  Preis   R-squared:                       0.789
Model:                            OLS   Adj. R-squared:                  0.788
Method:                 Least Squares   F-statistic:                     771.4
Date:                Sun, 14 Jan 2024   Prob (F-statistic):               0.00
Time:                        20:51:22   Log-Likelihood:                -7534.6
No. Observations:                1038   AIC:                         1.508e+04
Df Residuals:                    1032   BIC:                         1.511e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                1333.9633    

In [37]:
def del_ran(df_exog, labels, chance, random_state=None):
    """Split the data completely at random (MCAR) into deleted and kept rows.

    Every row is deleted independently with probability ``chance``.

    Parameters
    ----------
    df_exog : pandas.DataFrame
        Regressors; rows are selected with a boolean mask.
    labels : pandas.Series
        Target values, one per row of ``df_exog``.
    chance : float
        Deletion probability for each row.
    random_state : numpy.random.RandomState, optional
        Generator to draw from. When omitted, the notebook-level seeded
        ``rng`` is used so that a fresh "Restart & Run All" reproduces the
        same deletion pattern (the unseeded global ``np.random`` did not).

    Returns
    -------
    list
        ``[deleted_exog, deleted_labels, kept_exog, kept_labels]``.
    """
    generator = rng if random_state is None else random_state

    # One uniform draw per row decides whether that row is deleted.
    draws = generator.rand(df_exog.shape[0])
    delete_entries = draws < chance
    keep_entries = ~delete_entries

    return [df_exog[delete_entries], labels[delete_entries], df_exog[keep_entries], labels[keep_entries]]

In [38]:
def impute_ols(test_values, test_labels, train_values, train_labels):
    """Impute the deleted prices with an OLS fit and summarize the completed sample.

    An OLS model with intercept is fitted on the kept rows and used to predict
    the price of the deleted rows. The completed sample is the kept prices
    followed by the predictions.

    Parameters
    ----------
    test_values : pandas.DataFrame
        Regressors of the deleted rows.
    test_labels : pandas.Series
        True prices of the deleted rows (only used for the MSE).
    train_values : pandas.DataFrame
        Regressors of the kept rows.
    train_labels : pandas.Series
        Prices of the kept rows.

    Returns
    -------
    list
        ``[mean, mse, sem, covered]``: mean of the completed sample, mean
        squared error of the imputations, standard error of the mean of the
        completed sample, and the result of ``contains_mean`` on it.
    """
    # has_constant='add' on BOTH design matrices. With the default 'skip',
    # statsmodels adds no intercept when a regressor happens to be constant in
    # the training subsample, so the training and test designs would end up
    # with different numbers of columns and predict() would fail.
    train_design = sm.add_constant(train_values, has_constant='add')
    test_design = sm.add_constant(test_values, has_constant='add')

    model = sm.OLS(train_labels, train_design).fit()
    imputed_values = np.asarray(model.predict(exog=test_design), dtype=float)

    # Build the completed sample once and reuse it for every statistic.
    completed = list(train_labels) + imputed_values.tolist()
    mse = np.mean((imputed_values - np.asarray(test_labels)) ** 2)

    return [np.mean(completed), mse, stats.sem(completed), contains_mean(completed)]

In [39]:
def impute_knn(test_values, test_labels, train_values, train_labels):
    """Impute the deleted prices with 1-, 3- and 5-nearest-neighbour means.

    A KD-tree is built on the kept rows. Every deleted row is imputed with the
    mean price of its 1, 3 and 5 nearest kept rows. For each variant the
    completed sample is the kept prices followed by the imputations.

    Parameters
    ----------
    test_values : pandas.DataFrame
        Regressors of the deleted rows.
    test_labels : pandas.Series
        True prices of the deleted rows (only used for the MSE).
    train_values : pandas.DataFrame
        Regressors of the kept rows.
    train_labels : pandas.Series
        Prices of the kept rows.

    Returns
    -------
    list
        Twelve values, ``[mean, mse, sem, covered]`` for KNN_1, then KNN_3,
        then KNN_5: mean of the completed sample, mean squared error of the
        imputations, standard error of the mean of the completed sample, and
        the result of ``contains_mean`` on it.
    """
    train_y = train_labels.values
    test_y = test_labels.values
    query_points = test_values.values

    tree = KDTree(train_values.values, leaf_size=5)

    # Never request more neighbours than there are kept rows; with fewer than
    # five kept rows the larger variants average over all available rows.
    n_neighbors = min(5, len(train_y))

    if len(query_points) > 0:
        # One batched query for all deleted rows instead of one query per row.
        # Neighbours come back sorted by distance, so the first k columns are
        # the k nearest.
        _, neighbor_idx = tree.query(query_points, k=n_neighbors)
        neighbor_labels = train_y[neighbor_idx]
    else:
        # Nothing was deleted: there is nothing to impute.
        neighbor_labels = np.empty((0, n_neighbors))

    results = []
    for k in (1, 3, 5):
        imputed = neighbor_labels[:, :k].mean(axis=1)
        completed = np.concatenate([train_y, imputed])
        results.extend([
            np.mean(completed),
            np.mean((test_y - imputed) ** 2),
            stats.sem(completed),
            contains_mean(completed),
        ])

    return results

In [40]:
def simul_knn(n_simulations=1000):
    """Run the MCAR simulation for KNN imputation and store the averaged results.

    For every deletion probability in ``np.arange(0.1, 1, 0.1)`` the data are
    randomly split with ``del_ran`` and imputed with ``impute_knn``
    ``n_simulations`` times. The column-wise averages of the twelve returned
    statistics are written into the module-level ``imputed_stats_knn`` at the
    row of that deletion probability.

    Parameters
    ----------
    n_simulations : int, default 1000
        Number of random deletions per deletion probability.

    Returns
    -------
    pandas.DataFrame
        The updated module-level ``imputed_stats_knn``.
    """
    for c in np.arange(0.1, 1, 0.1):
        # Collect the runs in a plain list; growing a DataFrame row by row is
        # slow and relies on version-dependent `.at[i] = list` behaviour.
        runs = [
            impute_knn(*del_ran(df_exog=df_noPrice, labels=df['Preis'], chance=c))
            for _ in tqdm(range(n_simulations))
        ]

        # Average every statistic over the simulation runs.
        imputed_stats_knn.loc[c] = np.mean(np.asarray(runs, dtype=float), axis=0).tolist()

    return imputed_stats_knn

# Run the KNN simulation; simul_knn() writes its per-rate averages into the
# module-level imputed_stats_knn, which is then shown as the cell output.
simul_knn()
imputed_stats_knn

100%|██████████| 1000/1000 [00:16<00:00, 59.56it/s]
100%|██████████| 1000/1000 [00:29<00:00, 34.13it/s]
100%|██████████| 1000/1000 [00:44<00:00, 22.47it/s]
100%|██████████| 1000/1000 [00:54<00:00, 18.35it/s]
100%|██████████| 1000/1000 [01:06<00:00, 15.07it/s]
100%|██████████| 1000/1000 [01:15<00:00, 13.19it/s]
100%|██████████| 1000/1000 [01:39<00:00, 10.07it/s]
100%|██████████| 1000/1000 [02:01<00:00,  8.22it/s]
100%|██████████| 1000/1000 [02:04<00:00,  8.05it/s]


Unnamed: 0,Mean KNN_1,MSE KNN_1,SE KNN_1,Alpha KNN_1,Mean KNN_3,MSE KNN_3,SE KNN_3,Alpha KNN_3,Mean KNN_5,MSE KNN_5,SE KNN_5,Alpha KNN_5
0.1,1332.509491,117090.803906,23.32257,1.0,1332.183591,96082.901894,23.211186,1.0,1332.215043,95859.25769,23.192264,1.0
0.2,1330.588837,123137.199603,23.420955,1.0,1330.702743,97743.091991,23.163692,1.0,1330.625671,96665.628993,23.105777,1.0
0.3,1330.024886,128274.725436,23.524624,1.0,1330.786115,100775.431165,23.109721,1.0,1330.07776,98684.262631,23.001687,1.0
0.4,1328.307222,131763.667839,23.562603,1.0,1329.950261,102921.45693,22.978258,1.0,1328.8192,100669.8397,22.8124,1.0
0.5,1327.699698,135561.110067,23.591493,1.0,1330.301814,106020.567763,22.850429,1.0,1328.860077,103905.384791,22.624325,1.0
0.6,1327.393311,141813.206755,23.560842,0.993,1330.104413,110628.175483,22.616591,1.0,1329.234849,109117.674663,22.320035,0.997
0.7,1328.480148,150598.468371,23.490295,0.963,1330.589732,117265.863522,22.321304,0.979,1330.587239,116167.019251,21.973262,0.98
0.8,1327.563384,165126.960787,23.325625,0.89,1329.123637,129252.988126,21.929909,0.92,1326.242419,127243.852493,21.440953,0.904
0.9,1327.688775,192975.245816,23.040176,0.77,1320.755366,152017.091383,21.081045,0.713,1310.170112,152189.856292,20.027902,0.626


In [41]:
def simul_ols(n_simulations=1000):
    """Run the MCAR simulation for OLS imputation and store the averaged results.

    For every deletion probability in ``np.arange(0.1, 1, 0.1)`` the data are
    randomly split with ``del_ran`` and imputed with ``impute_ols``
    ``n_simulations`` times. The column-wise averages of the four returned
    statistics are written into the module-level ``imputed_stats_ols`` at the
    row of that deletion probability.

    Parameters
    ----------
    n_simulations : int, default 1000
        Number of random deletions per deletion probability.

    Returns
    -------
    pandas.DataFrame
        The updated module-level ``imputed_stats_ols``.
    """
    for c in np.arange(0.1, 1, 0.1):
        # Collect the runs in a plain list; growing a DataFrame row by row is
        # slow and relies on version-dependent `.at[i] = list` behaviour.
        runs = [
            impute_ols(*del_ran(df_exog=df_noPrice, labels=df['Preis'], chance=c))
            for _ in tqdm(range(n_simulations))
        ]

        # Average every statistic over the simulation runs.
        imputed_stats_ols.loc[c] = np.mean(np.asarray(runs, dtype=float), axis=0).tolist()

    return imputed_stats_ols

# Run the OLS simulation; simul_ols() writes its per-rate averages into the
# module-level imputed_stats_ols.
simul_ols()

100%|██████████| 1000/1000 [00:09<00:00, 100.76it/s]
100%|██████████| 1000/1000 [00:09<00:00, 106.23it/s]
100%|██████████| 1000/1000 [00:09<00:00, 102.75it/s]
100%|██████████| 1000/1000 [00:09<00:00, 105.84it/s]
100%|██████████| 1000/1000 [00:09<00:00, 103.21it/s]
100%|██████████| 1000/1000 [00:09<00:00, 104.33it/s]
100%|██████████| 1000/1000 [00:09<00:00, 107.52it/s]
100%|██████████| 1000/1000 [00:09<00:00, 109.66it/s]
100%|██████████| 1000/1000 [00:09<00:00, 108.78it/s]


In [43]:
# Put the OLS and KNN statistics side by side (one row per deletion
# probability), write them to disk and show the combined table.
combined_stats = pd.concat([imputed_stats_ols, imputed_stats_knn], axis=1)
combined_stats.to_csv('MCAR_Simulation_Top5')
combined_stats

Unnamed: 0,Mean,MSE OLS,SE OLS,Alpha OLS,Mean KNN_1,MSE KNN_1,SE KNN_1,Alpha KNN_1,Mean KNN_3,MSE KNN_3,SE KNN_3,Alpha KNN_3,Mean KNN_5,MSE KNN_5,SE KNN_5,Alpha KNN_5
0.1,1334.053835,119812.722218,22.987324,1.0,1332.509491,117090.803906,23.32257,1.0,1332.183591,96082.901894,23.211186,1.0,1332.215043,95859.25769,23.192264,1.0
0.2,1334.2411,120001.195628,22.73498,1.0,1330.588837,123137.199603,23.420955,1.0,1330.702743,97743.091991,23.163692,1.0,1330.625671,96665.628993,23.105777,1.0
0.3,1333.878254,120297.224349,22.48415,1.0,1330.024886,128274.725436,23.524624,1.0,1330.786115,100775.431165,23.109721,1.0,1330.07776,98684.262631,23.001687,1.0
0.4,1334.009971,120525.523117,22.221043,1.0,1328.307222,131763.667839,23.562603,1.0,1329.950261,102921.45693,22.978258,1.0,1328.8192,100669.8397,22.8124,1.0
0.5,1333.56204,120670.987309,21.956839,1.0,1327.699698,135561.110067,23.591493,1.0,1330.301814,106020.567763,22.850429,1.0,1328.860077,103905.384791,22.624325,1.0
0.6,1333.515916,121107.768563,21.713567,1.0,1327.393311,141813.206755,23.560842,0.993,1330.104413,110628.175483,22.616591,1.0,1329.234849,109117.674663,22.320035,0.997
0.7,1333.189266,122156.074219,21.446303,0.99,1328.480148,150598.468371,23.490295,0.963,1330.589732,117265.863522,22.321304,0.979,1330.587239,116167.019251,21.973262,0.98
0.8,1334.085345,123599.928668,21.222434,0.948,1327.563384,165126.960787,23.325625,0.89,1329.123637,129252.988126,21.929909,0.92,1326.242419,127243.852493,21.440953,0.904
0.9,1333.574503,128800.547929,21.020732,0.78,1327.688775,192975.245816,23.040176,0.77,1320.755366,152017.091383,21.081045,0.713,1310.170112,152189.856292,20.027902,0.626
