In [434]:
import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KDTree
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

import statsmodels.api as sm
import statsmodels.formula.api as smf

# Random number generator with a fixed seed of 0.
rng = np.random.RandomState(0)

In [435]:
# Load the raw data and clean it: rows whose extern_Schnittstellen value is
# 2300 are treated as outliers and dropped, then the index is renumbered.
df = (
    pd.read_csv('Laptop-Preise.csv', sep=';', decimal=',')
    .loc[lambda raw: raw['extern_Schnittstellen'] != 2300]
    .reset_index(drop=True)
)

# Keep only the target (Preis) and the five columns with the largest explanatory power.
selected_columns = ['Preis', 'Akku_Kapazitaet', 'Arbeitsspeicher', 'Kerne', 'Mobilfunk_vorhanden', 'SSD']
df = df[selected_columns]

# Predictors only (still unscaled at this point).
df_noPrice = df.drop(columns='Preis')

# Result containers for the KNN imputation: one frame for the individual
# simulation runs and one for the per-probability aggregates (index 0.1 ... 0.9).
imputed_simul_knn = pd.DataFrame(columns=[
    'Mean KNN_1', 'MSE KNN_1', 'SE KNN_1', 'Alpha KNN_1',
    'Mean KNN_3', 'MSE KNN_3', 'SE KNN_3', 'Alpha KNN_3',
    'Mean KNN_5', 'MSE KNN_5', 'SE KNN_5', 'Alpha KNN_5',
])
imputed_stats_knn = pd.DataFrame(columns=imputed_simul_knn.columns, index=np.arange(0.1, 1, 0.1))

# Same pair of containers for the OLS imputation.
imputed_simul_ols = pd.DataFrame(columns=['Mean', 'MSE OLS', 'SE OLS', 'Alpha OLS'])
imputed_stats_ols = pd.DataFrame(columns=imputed_simul_ols.columns, index=np.arange(0.1, 1, 0.1))

no_imputation = pd.DataFrame(columns=['SE', 'Alpha'])

# Standardize the predictors; `scaler` keeps the fitted parameters.
col_names = df_noPrice.columns
scaler = StandardScaler()
df_noPrice = pd.DataFrame(scaler.fit_transform(df_noPrice.values), columns=col_names)

# Standardized predictors together with the price (the price itself is left unscaled).
df_std = df_noPrice.copy()
df_std.insert(0, 'Preis', df['Preis'])

# True mean of the price over the complete data set.
true_mean = df['Preis'].mean()

In [436]:
def contains_mean(values, reference_mean=None, z=1.95996):
    """Check whether a normal-approximation confidence interval covers a mean.

    The interval is ``mean(values) +/- z * sem(values)``, where ``sem`` is
    ``scipy.stats.sem``. Both bounds are exclusive.

    Parameters
    ----------
    values : sequence of numbers
        Sample from which the interval is built.
    reference_mean : float, optional
        Value that must lie inside the interval. When omitted, the
        module-level ``true_mean`` is used, which is the original behaviour.
    z : float, optional
        Multiplier applied to the standard error. The default 1.95996 is the
        two-sided 95 % quantile of the standard normal distribution.

    Returns
    -------
    int
        1 if ``reference_mean`` lies strictly inside the interval, else 0.
    """
    if reference_mean is None:
        reference_mean = true_mean

    # Compute centre and half-width once instead of once per bound.
    center = np.mean(values)
    half_width = z * stats.sem(values)

    return 1 if (reference_mean > center - half_width) and (reference_mean < center + half_width) else 0

In [437]:
# Baseline OLS regression of the price on all predictors, fitted on the
# complete data set. df_noPrice is already standardized, so only the
# intercept column has to be added here.
model = sm.OLS(endog=df['Preis'], exog=sm.add_constant(df_noPrice)).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  Preis   R-squared:                       0.789
Model:                            OLS   Adj. R-squared:                  0.788
Method:                 Least Squares   F-statistic:                     771.4
Date:                Sun, 14 Jan 2024   Prob (F-statistic):               0.00
Time:                        23:46:13   Log-Likelihood:                -7534.6
No. Observations:                1038   AIC:                         1.508e+04
Df Residuals:                    1032   BIC:                         1.511e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                1333.9633    

In [438]:
def generate_mnar_datasets(df_exog, labels, chance_if_lower_than_median, chance_if_higher_than_median, random_state=None):
    """Split a data set into "deleted" and "kept" rows, with deletion depending on the label.

    One uniform random number in [0, 1) is drawn per row. A row whose label is
    less than or equal to the label median is deleted when its draw is below
    ``chance_if_lower_than_median``; a row whose label is above the median is
    deleted when its draw is below ``chance_if_higher_than_median``.

    Parameters
    ----------
    df_exog : pandas.DataFrame
        Predictor columns.
    labels : pandas.Series
        Target values; must share the index of ``df_exog``.
    chance_if_lower_than_median : float
        Deletion probability for rows with ``label <= median``.
    chance_if_higher_than_median : float
        Deletion probability for rows with ``label > median``.
    random_state : numpy.random.RandomState or int, optional
        Source of the uniform draws. An int is used as seed for a new
        ``RandomState``. When omitted, the notebook's seeded module-level
        ``rng`` is used so that a fresh "Restart & Run All" reproduces the
        same simulation (previously the unseeded global ``np.random`` state
        was used and ``rng`` was never consulted).

    Returns
    -------
    list
        ``[df_exog_delete, labels_delete, df_exog_keep, labels_keep]``.
    """
    if random_state is None:
        random_state = rng
    elif isinstance(random_state, (int, np.integer)):
        random_state = np.random.RandomState(random_state)

    # Predictors and label side by side; the label ends up as the last column.
    combined_df = pd.concat([df_exog, labels], axis=1)

    # One uniform draw per row decides whether that row is deleted.
    random_vars = random_state.rand(len(combined_df))

    median_label = labels.median()

    # Rows at or below the median and rows above it use different deletion rates.
    mask_delete_lower = (labels <= median_label) & (random_vars < chance_if_lower_than_median)
    mask_delete_higher = (labels > median_label) & (random_vars < chance_if_higher_than_median)
    mask_delete = mask_delete_lower | mask_delete_higher

    delete_entries = combined_df[mask_delete]
    keep_entries = combined_df[~mask_delete]

    # Split each part back into predictors (all but last column) and label (last column).
    return [
        delete_entries.iloc[:, :-1],
        delete_entries.iloc[:, -1],
        keep_entries.iloc[:, :-1],
        keep_entries.iloc[:, -1],
    ]


In [439]:
def impute_ols(test_values, test_labels, train_values, train_labels):
    """Impute the deleted labels with an OLS regression fitted on the kept rows.

    Parameters
    ----------
    test_values : pandas.DataFrame
        Predictors of the rows whose label was deleted.
    test_labels : pandas.Series
        True labels of those rows (used only to score the imputation).
    train_values : pandas.DataFrame
        Predictors of the rows that kept their label.
    train_labels : pandas.Series
        Observed labels of those rows.

    Returns
    -------
    list
        ``[mean, mse, sem, covered]`` where ``mean`` and ``sem`` are computed
        on the completed sample (observed + imputed labels), ``mse`` is the
        mean squared error of the imputed values against ``test_labels`` and
        ``covered`` is the result of ``contains_mean`` on the completed sample.
    """
    # Build both design matrices with has_constant='add'. With the default
    # ('skip') the intercept is silently omitted from the training design
    # whenever a predictor happens to be constant among the kept rows, which
    # leaves it one column short of the prediction design.
    train_design = sm.add_constant(train_values, has_constant='add')
    test_design = sm.add_constant(test_values, has_constant='add')

    model = sm.OLS(train_labels, train_design).fit()
    imputed_values = model.predict(exog=test_design).tolist()

    # Observed labels followed by the imputed ones.
    completed = list(train_labels) + imputed_values
    mse = np.mean((np.asarray(imputed_values) - np.asarray(test_labels)) ** 2)

    return [np.mean(completed), mse, stats.sem(completed), contains_mean(completed)]

In [440]:
def impute_knn(test_values, test_labels, train_values, train_labels):
    """Impute the deleted labels with the mean label of the 1, 3 and 5 nearest neighbours.

    Neighbours are searched among the kept rows with a KD-tree built on
    ``train_values``.

    Parameters
    ----------
    test_values : pandas.DataFrame
        Predictors of the rows whose label was deleted.
    test_labels : pandas.Series
        True labels of those rows (used only to score the imputation).
    train_values : pandas.DataFrame
        Predictors of the rows that kept their label.
    train_labels : pandas.Series
        Observed labels of those rows.

    Returns
    -------
    list
        Twelve values, four per neighbour count in the order k=1, k=3, k=5:
        mean of the completed sample, MSE of the imputed values against
        ``test_labels``, standard error of the completed sample, and the
        result of ``contains_mean`` on the completed sample.
    """
    neighbour_counts = (1, 3, 5)
    max_k = max(neighbour_counts)

    observed = train_labels.values
    actual = test_labels.values
    queries = test_values.values

    tree = KDTree(train_values.values, leaf_size=5)

    if len(queries) > 0:
        # One batched query for all rows instead of one tree query per row.
        # Each result row lists the neighbours by increasing distance, so the
        # first k columns are the k nearest neighbours.
        _, neighbour_idx = tree.query(queries, k=max_k)
        neighbour_labels = observed[neighbour_idx]
    else:
        # Nothing was deleted: keep the shapes consistent without querying.
        neighbour_labels = np.empty((0, max_k))

    results = []
    for k in neighbour_counts:
        imputed = neighbour_labels[:, :k].mean(axis=1)
        # Observed labels followed by the imputed ones.
        completed = np.concatenate([observed, imputed])
        results.extend([
            np.mean(completed),
            np.mean((actual - imputed) ** 2),
            stats.sem(completed),
            contains_mean(completed),
        ])

    return results

In [441]:
def simul_knn(n_iterations=1000, chance_if_lower_than_median=0.2, chances_if_higher_than_median=None):
    """Run the KNN imputation simulation and store the averaged metrics.

    For every deletion probability ``c`` applied to rows above the label
    median, ``n_iterations`` data sets are generated with
    ``generate_mnar_datasets`` (from the module-level ``df_noPrice`` and
    ``df['Preis']``), imputed with ``impute_knn``, and the metrics are
    averaged over the iterations. The averages are written into row ``c`` of
    the module-level ``imputed_stats_knn``; nothing is returned.

    Parameters
    ----------
    n_iterations : int, optional
        Number of simulated data sets per probability (default 1000).
    chance_if_lower_than_median : float, optional
        Deletion probability for rows at or below the median (default 0.2).
    chances_if_higher_than_median : iterable of float, optional
        Deletion probabilities to iterate over for rows above the median.
        Defaults to ``np.arange(0.1, 1, 0.1)``, which matches the index of
        ``imputed_stats_knn``.

    Raises
    ------
    ValueError
        If ``n_iterations`` is smaller than 1.
    """
    if n_iterations < 1:
        raise ValueError("n_iterations must be at least 1")
    if chances_if_higher_than_median is None:
        chances_if_higher_than_median = np.arange(0.1, 1, 0.1)

    for c in chances_if_higher_than_median:
        # Collect the per-run metric lists in a plain list; growing a
        # DataFrame one row at a time is slow and unnecessary here.
        runs = []
        for _ in tqdm(range(n_iterations)):
            temp = generate_mnar_datasets(
                df_exog=df_noPrice,
                labels=df['Preis'],
                chance_if_lower_than_median=chance_if_lower_than_median,
                chance_if_higher_than_median=c,
            )
            # temp is [deleted predictors, deleted labels, kept predictors, kept labels].
            runs.append(impute_knn(temp[0], temp[1], temp[2], temp[3]))

        # Average every metric (one position in the result list) over all runs.
        imputed_stats_knn.loc[c] = [np.mean(metric_values) for metric_values in zip(*runs)]

# Run the KNN simulation (it writes its averaged metrics into imputed_stats_knn)
# and display the resulting table.
simul_knn()
imputed_stats_knn

100%|██████████| 1000/1000 [00:25<00:00, 39.16it/s]
100%|██████████| 1000/1000 [00:29<00:00, 33.63it/s]
100%|██████████| 1000/1000 [00:35<00:00, 28.24it/s]
100%|██████████| 1000/1000 [00:40<00:00, 24.60it/s]
100%|██████████| 1000/1000 [00:43<00:00, 22.93it/s]
100%|██████████| 1000/1000 [00:48<00:00, 20.77it/s]
100%|██████████| 1000/1000 [00:58<00:00, 17.17it/s]
100%|██████████| 1000/1000 [01:02<00:00, 15.91it/s]
100%|██████████| 1000/1000 [01:06<00:00, 15.08it/s]


Unnamed: 0,Mean KNN_1,MSE KNN_1,SE KNN_1,Alpha KNN_1,Mean KNN_3,MSE KNN_3,SE KNN_3,Alpha KNN_3,Mean KNN_5,MSE KNN_5,SE KNN_5,Alpha KNN_5
0.1,1333.932975,102577.57298,23.363597,1.0,1335.424878,83520.759167,23.199781,1.0,1335.638177,82645.356423,23.174207,1.0
0.2,1330.642759,122492.945284,23.41526,1.0,1330.959408,97167.439105,23.169986,1.0,1330.96404,96174.378083,23.115108,1.0
0.3,1326.909737,139825.875689,23.491399,1.0,1325.612958,108590.089588,23.131223,1.0,1324.58995,107319.666727,23.016125,1.0
0.4,1320.915793,154608.351171,23.52994,1.0,1318.684679,120080.526038,23.055166,1.0,1317.015319,117963.494579,22.883272,1.0
0.5,1313.225213,168177.507409,23.555377,0.989,1309.465017,130146.339364,22.941664,0.986,1306.864485,129017.716064,22.700496,0.961
0.6,1303.106922,183359.193794,23.546663,0.873,1298.206346,142914.973161,22.807158,0.776,1294.850118,143022.385095,22.501728,0.65
0.7,1288.56498,202774.064724,23.451426,0.517,1281.291612,159762.863451,22.491904,0.277,1277.57471,161433.397679,22.159878,0.192
0.8,1267.084573,234417.771584,23.332863,0.167,1257.362136,187927.840927,22.107052,0.042,1251.262516,188617.732721,21.681421,0.016
0.9,1224.271718,296803.604318,22.84362,0.025,1205.265647,248562.82074,20.985749,0.001,1186.148977,255091.779448,19.946189,0.0


In [442]:
def simul_ols(n_iterations=1000, chance_if_lower_than_median=0.2, chances_if_higher_than_median=None):
    """Run the OLS imputation simulation and store the averaged metrics.

    For every deletion probability ``c`` applied to rows above the label
    median, ``n_iterations`` data sets are generated with
    ``generate_mnar_datasets`` (from the module-level ``df_noPrice`` and
    ``df['Preis']``), imputed with ``impute_ols``, and the metrics are
    averaged over the iterations. The averages are written into row ``c`` of
    the module-level ``imputed_stats_ols``; nothing is returned.

    Parameters
    ----------
    n_iterations : int, optional
        Number of simulated data sets per probability (default 1000).
    chance_if_lower_than_median : float, optional
        Deletion probability for rows at or below the median (default 0.2).
    chances_if_higher_than_median : iterable of float, optional
        Deletion probabilities to iterate over for rows above the median.
        Defaults to ``np.arange(0.1, 1, 0.1)``, which matches the index of
        ``imputed_stats_ols``.

    Raises
    ------
    ValueError
        If ``n_iterations`` is smaller than 1.
    """
    if n_iterations < 1:
        raise ValueError("n_iterations must be at least 1")
    if chances_if_higher_than_median is None:
        chances_if_higher_than_median = np.arange(0.1, 1, 0.1)

    for c in chances_if_higher_than_median:
        # Collect the per-run metric lists in a plain list; growing a
        # DataFrame one row at a time is slow and unnecessary here.
        runs = []
        for _ in tqdm(range(n_iterations)):
            temp = generate_mnar_datasets(
                df_exog=df_noPrice,
                labels=df['Preis'],
                chance_if_lower_than_median=chance_if_lower_than_median,
                chance_if_higher_than_median=c,
            )
            # temp is [deleted predictors, deleted labels, kept predictors, kept labels].
            runs.append(impute_ols(temp[0], temp[1], temp[2], temp[3]))

        # Average every metric (one position in the result list) over all runs.
        imputed_stats_ols.loc[c] = [np.mean(metric_values) for metric_values in zip(*runs)]

# Run the OLS simulation; it writes its averaged metrics into imputed_stats_ols.
simul_ols()

100%|██████████| 1000/1000 [00:10<00:00, 96.24it/s]
100%|██████████| 1000/1000 [00:10<00:00, 92.85it/s]
100%|██████████| 1000/1000 [00:09<00:00, 102.90it/s]
100%|██████████| 1000/1000 [00:09<00:00, 104.81it/s]
100%|██████████| 1000/1000 [00:09<00:00, 106.67it/s]
100%|██████████| 1000/1000 [00:09<00:00, 106.60it/s]
100%|██████████| 1000/1000 [00:09<00:00, 106.02it/s]
100%|██████████| 1000/1000 [00:09<00:00, 108.12it/s]
100%|██████████| 1000/1000 [00:09<00:00, 108.89it/s]


In [444]:
pd.concat([imputed_stats_ols, imputed_stats_knn], axis=1).to_csv('MNAR_Simulation_Top5_(0.2 und 0.1-0.9)')
df_02 = pd.concat([imputed_stats_ols, imputed_stats_knn], axis=1)