In [21]:
import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KDTree
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

import statsmodels.api as sm
import statsmodels.formula.api as smf

# Fixed seed so the random deletion patterns of the simulation are repeatable.
SEED = 0
rng = np.random.RandomState(SEED)
# The dataset generator in this notebook draws from NumPy's global generator
# (np.random.rand) by default, not from `rng`, so the global state has to be
# seeded as well for a Restart & Run All to reproduce the same numbers.
np.random.seed(SEED)

In [22]:
# Load the laptop price table and drop the rows flagged as outliers
df = pd.read_csv('../Laptop-Preise.csv', sep=';', decimal=',')
is_outlier = df.extern_Schnittstellen == 2300
df = df[~is_outlier].reset_index(drop=True)

# Keep the price plus the five predictors with the largest explanatory power
selected_columns = ['Preis', 'Akku_Kapazitaet', 'Arbeitsspeicher', 'Kerne', 'Marke_Apple', 'Betriebssystem_Windows']
df = df[selected_columns]

# Predictors only (target column removed)
df_noPrice = df.drop(columns='Preis')

# Result containers: one row per simulation run (simul) and one row per
# deletion chance 0.1 ... 0.9 (stats)
knn_result_columns = [
    'Mean KNN_1', 'MSE KNN_1', 'SE KNN_1', 'Alpha KNN_1',
    'Mean KNN_3', 'MSE KNN_3', 'SE KNN_3', 'Alpha KNN_3',
    'Mean KNN_5', 'MSE KNN_5', 'SE KNN_5', 'Alpha KNN_5',
]
ols_result_columns = ['Mean', 'MSE OLS', 'SE OLS', 'Alpha OLS']

imputed_simul_knn = pd.DataFrame(columns=knn_result_columns)
imputed_stats_knn = pd.DataFrame(columns=knn_result_columns, index=np.arange(0.1, 1, 0.1))

imputed_simul_ols = pd.DataFrame(columns=ols_result_columns)
imputed_stats_ols = pd.DataFrame(columns=ols_result_columns, index=np.arange(0.1, 1, 0.1))

no_imputation = pd.DataFrame(columns=['SE', 'Alpha'])

# Standardise the predictors; the fitted scaler is kept for later use
col_names = df_noPrice.columns
scaler = StandardScaler().fit(df_noPrice.values)
df_noPrice = pd.DataFrame(scaler.transform(df_noPrice.values), columns=col_names)

# Standardised predictors with the unscaled price put back as first column
df_std = df_noPrice.copy()
df_std.insert(0, 'Preis', df['Preis'])

# True mean price of the complete data
true_mean = df['Preis'].mean()

In [23]:
def contains_mean(values, reference=None, z=1.95996):
    """Report whether a normal-approximation interval around the sample mean covers a reference value.

    The interval is ``mean(values) +/- z * sem(values)`` with open bounds.

    Parameters
    ----------
    values : sequence of numbers
        Sample the interval is built from.
    reference : float, optional
        Value that should lie inside the interval. Defaults to the
        module-level ``true_mean``.
    z : float, optional
        Multiplier applied to the standard error of the mean
        (default 1.95996).

    Returns
    -------
    int
        1 if ``reference`` lies strictly inside the interval, otherwise 0.
    """
    if reference is None:
        reference = true_mean

    # Compute centre and half width once instead of once per bound.
    centre = np.mean(values)
    half_width = z * stats.sem(values)

    return 1 if (centre - half_width) < reference < (centre + half_width) else 0

In [24]:
# Reference OLS fit on the complete data.
# df_noPrice is already standardised, so only the intercept column is added.
design_matrix = sm.add_constant(df_noPrice)
model = sm.OLS(endog=df['Preis'], exog=design_matrix).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  Preis   R-squared:                       0.765
Model:                            OLS   Adj. R-squared:                  0.764
Method:                 Least Squares   F-statistic:                     671.7
Date:                Sat, 23 Mar 2024   Prob (F-statistic):          1.96e-321
Time:                        22:46:20   Log-Likelihood:                -7590.4
No. Observations:                1038   AIC:                         1.519e+04
Df Residuals:                    1032   BIC:                         1.522e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                   1333

In [25]:
def generate_mnar_datasets(df_exog, labels, chance_if_lower_than_median, chance_if_higher_than_median, random_state=None):
    """Split the data into a "deleted" and a "kept" part, with a deletion chance that depends on the label.

    Each row gets one uniform draw from [0, 1). A row whose label is at or
    below the label median is deleted when its draw is below
    ``chance_if_lower_than_median``; a row above the median is deleted when
    its draw is below ``chance_if_higher_than_median``.

    Parameters
    ----------
    df_exog : pandas.DataFrame
        Explanatory variables; must share its index with ``labels``.
    labels : pandas.Series
        Target values that drive the deletion chance.
    chance_if_lower_than_median : float
        Deletion chance for rows with ``label <= median``.
    chance_if_higher_than_median : float
        Deletion chance for rows with ``label > median``.
    random_state : None, int or numpy random generator, optional
        ``None`` (default) draws from NumPy's global generator, exactly as
        before. An int seeds a fresh ``RandomState``; a ``RandomState`` or
        ``Generator`` instance is used directly.

    Returns
    -------
    list
        ``[df_exog_delete, labels_delete, df_exog_keep, labels_keep]``.
    """
    n_rows = len(labels)

    # One uniform draw per row, taken from the requested source of randomness.
    if random_state is None:
        random_vars = np.random.rand(n_rows)
    elif isinstance(random_state, (int, np.integer)):
        random_vars = np.random.RandomState(random_state).rand(n_rows)
    elif hasattr(random_state, 'random_sample'):
        random_vars = random_state.random_sample(n_rows)  # RandomState-like
    else:
        random_vars = random_state.random(n_rows)  # Generator-like

    median_label = labels.median()

    # The same draw is compared against the chance that applies to the row's half.
    mask_delete_lower = (labels <= median_label) & (random_vars < chance_if_lower_than_median)
    mask_delete_higher = (labels > median_label) & (random_vars < chance_if_higher_than_median)
    mask_delete = mask_delete_lower | mask_delete_higher
    mask_keep = ~mask_delete

    # Index both inputs with the masks directly; no need to glue them
    # together and slice them apart again.
    return [df_exog[mask_delete], labels[mask_delete], df_exog[mask_keep], labels[mask_keep]]


In [26]:
def impute_ols(test_values, test_labels, train_values, train_labels):
    """Impute the deleted labels with an OLS regression fitted on the kept rows.

    Parameters
    ----------
    test_values : pandas.DataFrame
        Explanatory variables of the rows whose label was deleted.
    test_labels : pandas.Series
        True labels of those rows (used only to score the imputation).
    train_values : pandas.DataFrame
        Explanatory variables of the kept rows.
    train_labels : pandas.Series
        Labels of the kept rows.

    Returns
    -------
    list
        ``[mean, mse, sem, covered]`` where ``mean`` and ``sem`` are computed
        on kept labels plus imputed values, ``mse`` is the mean squared
        imputation error and ``covered`` is the result of ``contains_mean``
        on the completed sample.
    """
    # Use has_constant='add' for BOTH design matrices. With the default
    # ('skip') the intercept is silently left out of the training design as
    # soon as one predictor is constant among the kept rows, while the
    # prediction design always gets one -> mismatching column counts.
    train_design = sm.add_constant(train_values, has_constant='add')
    test_design = sm.add_constant(test_values, has_constant='add')

    model = sm.OLS(train_labels, train_design).fit()
    imputed_values = model.predict(exog=test_design).tolist()

    # Kept labels followed by the imputed ones = the completed sample.
    completed = list(train_labels) + imputed_values

    # Compare positionally: predictions are in the row order of test_values.
    squared_errors = (np.asarray(imputed_values) - np.asarray(test_labels)) ** 2

    return [np.mean(completed), np.mean(squared_errors), stats.sem(completed), contains_mean(completed)]

In [27]:
def impute_knn(test_values, test_labels, train_values, train_labels):
    """Impute the deleted labels with the mean label of the 1, 3 and 5 nearest kept rows.

    Parameters
    ----------
    test_values : pandas.DataFrame
        Explanatory variables of the rows whose label was deleted.
    test_labels : pandas.Series
        True labels of those rows (used only to score the imputation).
    train_values : pandas.DataFrame
        Explanatory variables of the kept rows; the KD-tree is built on them.
    train_labels : pandas.Series
        Labels of the kept rows.

    Returns
    -------
    list
        Twelve values: ``[mean, mse, sem, covered]`` for k=1, then k=3, then
        k=5. ``mean``/``sem`` are computed on kept labels plus imputed values,
        ``mse`` is the mean squared imputation error and ``covered`` is the
        result of ``contains_mean`` on the completed sample.
    """
    neighbour_counts = (1, 3, 5)
    max_k = max(neighbour_counts)

    observed = train_labels.values
    actual = test_labels.values
    queries = test_values.values

    tree = KDTree(train_values.values, leaf_size=5)

    if len(queries):
        # One batched query instead of one tree lookup per row. The
        # neighbours come back ordered by distance, so the first k columns
        # are the k nearest rows.
        _, neighbour_idx = tree.query(queries, k=max_k)
        neighbour_labels = observed[neighbour_idx]  # shape (n_deleted, max_k)
    else:
        # Nothing was deleted: nothing to impute.
        neighbour_labels = np.empty((0, max_k))

    results = []
    for k in neighbour_counts:
        imputed = neighbour_labels[:, :k].mean(axis=1)
        # Kept labels followed by the imputed ones = the completed sample.
        completed = np.concatenate([observed, imputed])
        results.extend([
            np.mean(completed),
            np.mean((actual - imputed) ** 2),
            stats.sem(completed),
            contains_mean(completed),
        ])

    return results

In [28]:
def simul_knn(n_simulations=1000, chance_if_lower_than_median=0.3):
    """Run the KNN imputation simulation for every above-median deletion chance 0.1 ... 0.9.

    For each chance, ``n_simulations`` datasets are generated with
    ``generate_mnar_datasets``, imputed with ``impute_knn`` and the twelve
    returned statistics are averaged over the runs. The averages are written
    into the row ``c`` of the module-level ``imputed_stats_knn``.

    Parameters
    ----------
    n_simulations : int, optional
        Number of simulated datasets per deletion chance (default 1000).
    chance_if_lower_than_median : float, optional
        Deletion chance for rows at or below the median price (default 0.3).

    Returns
    -------
    None
    """
    if n_simulations < 1:
        raise ValueError("n_simulations must be at least 1")

    for c in np.arange(0.1, 1, 0.1):
        # Collect the per-run statistics in a plain list; growing a
        # DataFrame row by row is slow and leaves object-dtype columns.
        runs = []
        for _ in tqdm(range(n_simulations)):
            deleted_x, deleted_y, kept_x, kept_y = generate_mnar_datasets(
                df_exog=df_noPrice,
                labels=df['Preis'],
                chance_if_lower_than_median=chance_if_lower_than_median,
                chance_if_higher_than_median=c,
            )
            runs.append(impute_knn(deleted_x, deleted_y, kept_x, kept_y))

        # zip(*runs) transposes runs -> one tuple per statistic.
        imputed_stats_knn.loc[c] = [np.mean(column) for column in zip(*runs)]

# Run the KNN simulation; it fills the module-level imputed_stats_knn in place.
simul_knn()
# Show the aggregated table as the cell output.
imputed_stats_knn

100%|██████████| 1000/1000 [00:31<00:00, 32.21it/s]
100%|██████████| 1000/1000 [00:37<00:00, 26.78it/s]
100%|██████████| 1000/1000 [00:43<00:00, 23.09it/s]
100%|██████████| 1000/1000 [00:47<00:00, 21.03it/s]
100%|██████████| 1000/1000 [00:50<00:00, 19.76it/s]
100%|██████████| 1000/1000 [00:57<00:00, 17.53it/s]
100%|██████████| 1000/1000 [01:01<00:00, 16.25it/s]
100%|██████████| 1000/1000 [01:04<00:00, 15.51it/s]
100%|██████████| 1000/1000 [01:07<00:00, 14.85it/s]


Unnamed: 0,Mean KNN_1,MSE KNN_1,SE KNN_1,Alpha KNN_1,Mean KNN_3,MSE KNN_3,SE KNN_3,Alpha KNN_3,Mean KNN_5,MSE KNN_5,SE KNN_5,Alpha KNN_5
0.1,1332.596926,104615.450057,23.32174,1.0,1338.060645,77734.8442,23.09687,1.0,1336.499931,74831.404604,23.062209,1.0
0.2,1327.154823,130718.525414,23.351794,1.0,1332.472453,92449.218558,23.021181,1.0,1330.048476,90415.132414,22.962838,1.0
0.3,1320.541776,149416.128964,23.360195,0.999,1325.470116,102954.05322,22.920072,1.0,1322.231384,101768.154074,22.833679,1.0
0.4,1312.594671,164962.047449,23.374734,0.979,1317.406797,113106.040028,22.818256,1.0,1313.177318,113127.441748,22.697104,0.995
0.5,1304.878061,176986.239192,23.353876,0.876,1307.976557,123390.03712,22.690037,0.956,1302.732021,124163.124676,22.517765,0.885
0.6,1294.898358,188640.751219,23.295718,0.663,1295.785383,136005.984502,22.537208,0.67,1291.14605,137333.782327,22.348645,0.53
0.7,1281.674084,203417.498175,23.207349,0.368,1280.040397,153623.254357,22.338585,0.24,1276.593648,155977.353671,22.120745,0.178
0.8,1261.05696,225491.515152,23.052027,0.142,1258.648646,179247.59986,22.042409,0.06,1256.054111,181985.96465,21.662007,0.051
0.9,1220.970304,279736.17731,22.5556,0.024,1212.258857,239839.37159,20.785766,0.006,1194.396549,255332.810819,19.537717,0.001


In [1]:
def simul_ols(n_simulations=1000, chance_if_lower_than_median=0.2):
    """Run the OLS imputation simulation for every above-median deletion chance 0.1 ... 0.9.

    For each chance, ``n_simulations`` datasets are generated with
    ``generate_mnar_datasets``, imputed with ``impute_ols`` and the four
    returned statistics are averaged over the runs. The averages are written
    into the row ``c`` of the module-level ``imputed_stats_ols``.

    Parameters
    ----------
    n_simulations : int, optional
        Number of simulated datasets per deletion chance (default 1000).
    chance_if_lower_than_median : float, optional
        Deletion chance for rows at or below the median price (default 0.2).
        NOTE(review): the KNN simulation in this notebook uses 0.3 and the
        exported CSV name mentions 0.3 as well -- confirm which value is
        intended before comparing the two tables.

    Returns
    -------
    None
    """
    if n_simulations < 1:
        raise ValueError("n_simulations must be at least 1")

    for c in np.arange(0.1, 1, 0.1):
        # Collect the per-run statistics in a plain list; growing a
        # DataFrame row by row is slow and leaves object-dtype columns.
        runs = []
        for _ in tqdm(range(n_simulations)):
            deleted_x, deleted_y, kept_x, kept_y = generate_mnar_datasets(
                df_exog=df_noPrice,
                labels=df['Preis'],
                chance_if_lower_than_median=chance_if_lower_than_median,
                chance_if_higher_than_median=c,
            )
            runs.append(impute_ols(deleted_x, deleted_y, kept_x, kept_y))

        # zip(*runs) transposes runs -> one tuple per statistic.
        imputed_stats_ols.loc[c] = [np.mean(column) for column in zip(*runs)]

# Run the OLS simulation; it fills the module-level imputed_stats_ols in place.
# NOTE(review): the execution count In [1] and the NameError recorded for this
# cell suggest it was last run on a fresh kernel before the import cell --
# re-run the notebook top to bottom to refresh the output.
simul_ols()

NameError: name 'np' is not defined

In [30]:
# Put the OLS and KNN summary tables side by side, keep the result and export it.
# NOTE(review): the file name mentions 0.3 while the variable is called df_02
# and simul_ols defaults to a below-median deletion chance of 0.2 -- confirm.
df_02 = pd.concat([imputed_stats_ols, imputed_stats_knn], axis=1)
df_02.to_csv('MNAR_Simulation_Top5_(0.3 und 0.1-0.9) (neu)')