In [403]:
import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KDTree
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

import statsmodels.api as sm
import statsmodels.formula.api as smf

# Dedicated generator kept for code that wants an explicit RandomState.
rng = np.random.RandomState(0)
# The simulation code in this notebook draws from the legacy *global* NumPy
# generator (np.random.rand), which the dedicated `rng` above does not affect.
# Seed the global generator as well so a fresh Restart & Run All reproduces
# the same missingness patterns.
np.random.seed(0)

In [404]:
# Load the raw data and clean it
df = pd.read_csv('Laptop-Preise.csv', sep=';', decimal=',')
# Drop the outlier rows (extern_Schnittstellen == 2300) and renumber the index
df = df.loc[df['extern_Schnittstellen'] != 2300].reset_index(drop=True)

# Keep only the price and the five columns with the largest explanatory power
selected_columns = ['Preis', 'Akku_Kapazitaet', 'Arbeitsspeicher', 'Kerne', 'Mobilfunk_vorhanden', 'SSD']
df = df[selected_columns]

# Regressors only (price removed)
df_noPrice = df.drop('Preis', axis=1)

# Result containers: the *_simul_* frames are empty, the *_stats_* frames have
# one row per value in np.arange(0.1, 1, 0.1).
knn_result_columns = [
    f'{stat} KNN_{k}'
    for k in (1, 3, 5)
    for stat in ('Mean', 'MSE', 'SE', 'Alpha')
]
ols_result_columns = ['Mean', 'MSE OLS', 'SE OLS', 'Alpha OLS']

imputed_simul_knn = pd.DataFrame(columns=knn_result_columns)
imputed_stats_knn = pd.DataFrame(columns=knn_result_columns, index=np.arange(0.1, 1, 0.1))

imputed_simul_ols = pd.DataFrame(columns=ols_result_columns)
imputed_stats_ols = pd.DataFrame(columns=ols_result_columns, index=np.arange(0.1, 1, 0.1))

no_imputation = pd.DataFrame(columns=['SE', 'Alpha'])

# Standardise the regressors; the fitted scaler is kept in `scaler`
col_names = df_noPrice.columns
scaler = StandardScaler().fit(df_noPrice.values)
df_noPrice = pd.DataFrame(scaler.transform(df_noPrice.values), columns=col_names)

# Standardised regressors with the (unscaled) price as the first column
df_std = df_noPrice.copy()
df_std.insert(loc=0, column='Preis', value=df['Preis'])

# True mean price over the complete data
true_mean = df['Preis'].mean()

In [405]:
def contains_mean(values):
    """Return 1 if the module-level ``true_mean`` lies strictly inside the
    interval ``mean(values) +/- 1.95996 * sem(values)``, otherwise 0.

    1.95996 is approximately the 97.5% standard-normal quantile, so the
    interval is a two-sided 95% normal-approximation confidence interval.
    """
    centre = np.mean(values)
    half_width = 1.95996 * stats.sem(values)

    above_lower = true_mean > (centre - half_width)
    below_upper = true_mean < (centre + half_width)

    if above_lower and below_upper:
        return 1
    return 0

In [406]:
# Reference OLS fit on the complete data.
# df_noPrice is already standardised; the intercept column is added here.
design_full = sm.add_constant(df_noPrice)
model = sm.OLS(df['Preis'], design_full).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  Preis   R-squared:                       0.789
Model:                            OLS   Adj. R-squared:                  0.788
Method:                 Least Squares   F-statistic:                     771.4
Date:                Sun, 14 Jan 2024   Prob (F-statistic):               0.00
Time:                        21:35:25   Log-Likelihood:                -7534.6
No. Observations:                1038   AIC:                         1.508e+04
Df Residuals:                    1032   BIC:                         1.511e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                1333.9633    

In [407]:
def generate_mnar_datasets(df_exog, labels, chance_if_lower_than_median, chance_if_higher_than_median, random_state=None):
    """Split a data set into "deleted" and "kept" rows, with a deletion
    probability that depends on the label itself.

    Each row gets one uniform draw in [0, 1). A row whose label is <= the label
    median is deleted when its draw is below ``chance_if_lower_than_median``;
    a row whose label is > the median is deleted when its draw is below
    ``chance_if_higher_than_median``. Rows whose label compares as neither
    (e.g. NaN) are always kept.

    Parameters
    ----------
    df_exog : pandas.DataFrame
        Regressors. Must share its index with ``labels``; the two are joined
        column-wise with ``pd.concat(axis=1)``.
    labels : pandas.Series
        Target values that drive the missingness.
    chance_if_lower_than_median, chance_if_higher_than_median : float
        Deletion probabilities for the two halves.
    random_state : None, int, numpy.random.RandomState or numpy.random.Generator, optional
        Source of the uniform draws. ``None`` (default) uses the global
        ``np.random`` state, exactly as before; pass a seed or generator to get
        a reproducible split independent of global state.

    Returns
    -------
    list
        ``[df_exog_delete, labels_delete, df_exog_keep, labels_keep]``.
    """
    # Join regressors and labels so both are filtered with the same mask
    combined_df = pd.concat([df_exog, labels], axis=1)
    n_rows = len(combined_df)

    # One uniform draw per row
    if random_state is None:
        random_vars = np.random.rand(n_rows)
    elif isinstance(random_state, (int, np.integer)):
        random_vars = np.random.RandomState(random_state).rand(n_rows)
    elif hasattr(random_state, 'rand'):
        # legacy RandomState interface
        random_vars = random_state.rand(n_rows)
    else:
        # numpy.random.Generator interface
        random_vars = random_state.random(n_rows)

    median_label = labels.median()

    # The two halves are disjoint, so sharing a single draw per row is fine
    mask_delete_lower = (labels <= median_label) & (random_vars < chance_if_lower_than_median)
    mask_delete_higher = (labels > median_label) & (random_vars < chance_if_higher_than_median)
    mask_delete = mask_delete_lower | mask_delete_higher

    delete_entries = combined_df[mask_delete]
    keep_entries = combined_df[~mask_delete]

    # The label is the last column of the joined frame
    return [
        delete_entries.iloc[:, :-1],
        delete_entries.iloc[:, -1],
        keep_entries.iloc[:, :-1],
        keep_entries.iloc[:, -1],
    ]


In [408]:
def impute_ols(test_values, test_labels, train_values, train_labels):
    """Impute the deleted labels with an OLS fit on the kept rows and score it.

    Parameters
    ----------
    test_values, test_labels
        Regressors and true labels of the deleted rows.
    train_values, train_labels
        Regressors and labels of the kept rows; the model is fitted on these.

    Returns
    -------
    list
        ``[mean, mse, sem, alpha]`` where ``mean`` and ``sem`` are computed on
        the kept labels followed by the imputed ones, ``mse`` compares the
        imputed values with ``test_labels``, and ``alpha`` is the result of
        ``contains_mean`` on that completed sample.
    """
    # Use has_constant='add' for BOTH design matrices. With the default
    # ('skip'), add_constant silently adds no intercept when a training column
    # happens to be constant in the kept subset, and the prediction matrix
    # (built with 'add') would then have one column more than the fit.
    train_design = sm.add_constant(train_values, has_constant='add')
    test_design = sm.add_constant(test_values, has_constant='add')

    model = sm.OLS(train_labels, train_design).fit()
    imputed_values = model.predict(exog=test_design).tolist()

    # Completed sample: observed labels followed by the imputed ones
    completed = list(train_labels) + imputed_values

    # Positional comparison of imputed vs. true deleted labels
    mse = np.mean((np.asarray(imputed_values) - np.asarray(test_labels)) ** 2)

    return [np.mean(completed), mse, stats.sem(completed), contains_mean(completed)]

In [409]:
def impute_knn(test_values, test_labels, train_values, train_labels):
    """Impute the deleted labels with 1-, 3- and 5-nearest-neighbour means and
    score each variant.

    Parameters
    ----------
    test_values, test_labels
        Regressors (DataFrame) and true labels (Series) of the deleted rows.
    train_values, train_labels
        Regressors (DataFrame) and labels (Series) of the kept rows; neighbours
        are searched among these.

    Returns
    -------
    list
        Twelve values, four per k in (1, 3, 5), in the order
        ``[mean, mse, sem, alpha]``: mean and standard error of the completed
        sample (kept labels followed by imputed ones), MSE of the imputed
        values against ``test_labels``, and ``contains_mean`` on the completed
        sample.

    Notes
    -----
    If fewer than five kept rows exist, every variant uses at most the
    available number of neighbours instead of raising.
    """
    neighbour_counts = (1, 3, 5)

    observed = train_labels.values
    truth = test_labels.values
    queries = test_values.values

    tree = KDTree(train_values.values, leaf_size=5)

    # KDTree.query raises when k exceeds the number of indexed points
    k_max = min(max(neighbour_counts), len(observed))

    if len(queries):
        # One batched query instead of one call per row. Results are sorted by
        # distance (sort_results defaults to True), so the first k columns are
        # the k nearest neighbours.
        _, neighbour_idx = tree.query(queries, k=k_max)
        neighbour_labels = observed[neighbour_idx]
    else:
        # Nothing was deleted: no imputed values at all
        neighbour_labels = np.empty((0, k_max))

    results = []
    for k in neighbour_counts:
        imputed = neighbour_labels[:, :k].mean(axis=1)
        # Completed sample: observed labels followed by the imputed ones
        completed = np.concatenate([observed, imputed])
        results.extend([
            np.mean(completed),
            np.mean((truth - imputed) ** 2),
            stats.sem(completed),
            contains_mean(completed),
        ])

    return results

In [410]:
def simul_knn(n_iterations=1000, chance_if_lower_than_median=0.2):
    """Run the MNAR simulation for the KNN imputations.

    For every deletion chance ``c`` in ``np.arange(0.1, 1, 0.1)`` (applied to
    rows above the price median), ``n_iterations`` random splits are drawn with
    ``generate_mnar_datasets``, each is imputed with ``impute_knn``, and the
    column-wise average of the results is written to row ``c`` of the
    module-level ``imputed_stats_knn``.

    Parameters
    ----------
    n_iterations : int, default 1000
        Number of replications per deletion chance.
    chance_if_lower_than_median : float, default 0.2
        Deletion chance for rows at or below the price median.

    Returns
    -------
    None
        Results are stored in ``imputed_stats_knn``.
    """
    if n_iterations < 1:
        raise ValueError("n_iterations must be at least 1")

    for c in np.arange(0.1, 1, 0.1):
        # Collect plain result rows and average once, instead of growing a
        # DataFrame one row at a time.
        replications = []

        for _ in tqdm(range(n_iterations)):
            test_x, test_y, train_x, train_y = generate_mnar_datasets(
                df_exog=df_noPrice,
                labels=df['Preis'],
                chance_if_lower_than_median=chance_if_lower_than_median,
                chance_if_higher_than_median=c,
            )
            replications.append(impute_knn(test_x, test_y, train_x, train_y))

        # Average every statistic over the replications
        imputed_stats_knn.loc[c] = list(np.mean(replications, axis=0))

simul_knn()
imputed_stats_knn

100%|██████████| 1000/1000 [00:23<00:00, 42.69it/s]
100%|██████████| 1000/1000 [00:29<00:00, 33.66it/s]
100%|██████████| 1000/1000 [00:35<00:00, 28.49it/s]
100%|██████████| 1000/1000 [00:41<00:00, 24.07it/s]
100%|██████████| 1000/1000 [00:45<00:00, 22.17it/s]
100%|██████████| 1000/1000 [00:50<00:00, 19.87it/s]
100%|██████████| 1000/1000 [00:55<00:00, 18.04it/s]
100%|██████████| 1000/1000 [01:01<00:00, 16.18it/s]
100%|██████████| 1000/1000 [01:07<00:00, 14.91it/s]


Unnamed: 0,Mean KNN_1,MSE KNN_1,SE KNN_1,Alpha KNN_1,Mean KNN_3,MSE KNN_3,SE KNN_3,Alpha KNN_3,Mean KNN_5,MSE KNN_5,SE KNN_5,Alpha KNN_5
0.1,1334.235439,103085.394227,23.369947,1.0,1335.381156,83649.329199,23.206373,1.0,1335.596359,82385.803795,23.18152,1.0
0.2,1330.918431,124466.037707,23.431529,1.0,1330.884927,98942.97529,23.174285,1.0,1330.784536,97713.784468,23.115231,1.0
0.3,1326.795982,140280.470238,23.482752,1.0,1325.757685,109129.703488,23.134098,1.0,1324.734267,107887.764153,23.02002,1.0
0.4,1320.872956,154286.648531,23.530517,1.0,1318.568955,120130.260131,23.061211,1.0,1316.849052,118665.476391,22.887738,1.0
0.5,1312.720597,167197.062867,23.545837,0.986,1309.310368,129501.995965,22.936775,0.976,1306.848467,128825.771179,22.709082,0.962
0.6,1304.229326,182884.973438,23.562111,0.877,1298.373057,142453.069328,22.785367,0.784,1295.14194,142991.228704,22.488526,0.661
0.7,1289.116455,204079.180248,23.499543,0.543,1281.311984,160637.610193,22.512626,0.28,1277.528861,162043.607896,22.170607,0.193
0.8,1265.853703,233944.427045,23.289985,0.157,1256.164018,189297.026027,22.072907,0.046,1249.61428,189974.117871,21.619263,0.018
0.9,1223.495584,295466.640841,22.821077,0.021,1205.748059,248306.805716,21.016107,0.001,1186.784952,255544.013892,19.986165,0.0


In [411]:
def simul_ols(n_iterations=1000, chance_if_lower_than_median=0.2):
    """Run the MNAR simulation for the OLS imputation.

    For every deletion chance ``c`` in ``np.arange(0.1, 1, 0.1)`` (applied to
    rows above the price median), ``n_iterations`` random splits are drawn with
    ``generate_mnar_datasets``, each is imputed with ``impute_ols``, and the
    column-wise average of the results is written to row ``c`` of the
    module-level ``imputed_stats_ols``.

    Parameters
    ----------
    n_iterations : int, default 1000
        Number of replications per deletion chance.
    chance_if_lower_than_median : float, default 0.2
        Deletion chance for rows at or below the price median.

    Returns
    -------
    None
        Results are stored in ``imputed_stats_ols``.
    """
    if n_iterations < 1:
        raise ValueError("n_iterations must be at least 1")

    for c in np.arange(0.1, 1, 0.1):
        # Collect plain result rows and average once, instead of growing a
        # DataFrame one row at a time.
        replications = []

        for _ in tqdm(range(n_iterations)):
            test_x, test_y, train_x, train_y = generate_mnar_datasets(
                df_exog=df_noPrice,
                labels=df['Preis'],
                chance_if_lower_than_median=chance_if_lower_than_median,
                chance_if_higher_than_median=c,
            )
            replications.append(impute_ols(test_x, test_y, train_x, train_y))

        # Average every statistic over the replications
        imputed_stats_ols.loc[c] = list(np.mean(replications, axis=0))

simul_ols()

100%|██████████| 1000/1000 [00:10<00:00, 98.51it/s]
100%|██████████| 1000/1000 [00:09<00:00, 106.86it/s]
100%|██████████| 1000/1000 [00:08<00:00, 118.31it/s]
100%|██████████| 1000/1000 [00:09<00:00, 109.88it/s]
100%|██████████| 1000/1000 [00:08<00:00, 111.69it/s]
100%|██████████| 1000/1000 [00:09<00:00, 109.13it/s]
100%|██████████| 1000/1000 [00:09<00:00, 108.24it/s]
100%|██████████| 1000/1000 [00:08<00:00, 115.32it/s]
100%|██████████| 1000/1000 [00:09<00:00, 109.72it/s]


In [412]:
# Put the OLS and KNN summary tables side by side and write them to disk.
# NOTE(review): the file name mentions "0.1" for the lower-half chance while the
# simulation cells pass 0.2 -- confirm which one is intended.
combined_stats = pd.concat([imputed_stats_ols, imputed_stats_knn], axis=1)
combined_stats.to_csv('MNAR_Simulation_Top5_(0.1 und 0.1-0.9)')