In [101]:
import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KDTree
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)

In [102]:
# Initialisierung von DataFrames und Data Cleaning
df = pd.read_csv('Laptop-Preise.csv', sep=';', decimal=',')
df = df[df.extern_Schnittstellen != 2300] # Ausreißer löschen
df = df.reset_index(drop=True) # Reset Index
df.drop(['Betriebssystem_OHNE', 'Betriebssystem_Mac', 'Marke_Dell'], axis=1) # Mac und Marke_Apple sind identische Merkmale
df_noPrice = df.drop('Preis', axis=1)
imputed_simul = pd.DataFrame(columns=['MSE KNN_1', 'MSE KNN_3', 'MSE KNN_5', 'SE KNN_1', 'SE KNN_3', 'SE KNN_5'])
imputed_stats = pd.DataFrame(columns=['MSE KNN_1', 'MSE KNN_3', 'MSE KNN_5', 'SE KNN_1', 'SE KNN_3', 'SE KNN_5'], index = np.arange(0.1, 1, 0.1))


# Skalierung (Standardisierung) von df_noPrice
col_names = df_noPrice.columns
scaler = StandardScaler().fit(df_noPrice.values)
df_noPrice = scaler.transform(df_noPrice.values)
df_noPrice = pd.DataFrame(df_noPrice, columns=col_names)

In [104]:
train_values, test_values, train_labels, test_labels = train_test_split(df_noPrice, df['Preis'], test_size=0.01)

def del_ran(df_exog, labels, chance):
    rand_array = np.random.rand(df_exog.shape[0])
    delete_entries = rand_array < chance
    keep_entries = rand_array >= chance
    
    return [df_exog[delete_entries], labels[delete_entries], df_exog[keep_entries], labels[keep_entries]]

temp = del_ran(df_exog = df_noPrice, labels = df['Preis'], chance = 0.01)

In [110]:
def impute(test_values, test_labels, train_values, train_labels):
    
    tree = KDTree(train_values.values, leaf_size=5)

    imputed_values_knn_1 = []
    imputed_values_knn_3 = []
    imputed_values_knn_5 = []

    for index, entry in enumerate(test_values.values):
 
        dist, ind = tree.query([entry], k=5)
        ind = ind[0]

        current_impute_knn_1 = np.mean(train_labels.values[ind][0])
        current_impute_knn_3 = np.mean(train_labels.values[ind][:3])
        current_impute_knn_5 = np.mean(train_labels.values[ind])

        imputed_values_knn_1.append(current_impute_knn_1)
        imputed_values_knn_3.append(current_impute_knn_3)
        imputed_values_knn_5.append(current_impute_knn_5)
       
        # print(train_labels.values[ind])
        # print(current_impute_knn_1//1, current_impute_knn_3//1, current_impute_knn_5//1)
        # print(test_labels.values[index])
        # print(train_values.values[ind])

    mse_knn_1 = np.mean((test_labels.values - imputed_values_knn_1)**2)
    mse_knn_3 = np.mean((test_labels.values - imputed_values_knn_3)**2)
    mse_knn_5 = np.mean((test_labels.values - imputed_values_knn_5)**2)

    sem_knn_1 = stats.sem(list(train_labels.values)+imputed_values_knn_1)
    sem_knn_3 = stats.sem(list(train_labels.values)+imputed_values_knn_3)
    sem_knn_5 = stats.sem(list(train_labels.values)+imputed_values_knn_5)


    return [mse_knn_1, mse_knn_3, mse_knn_5, sem_knn_1, sem_knn_3, sem_knn_5]

In [111]:
for c in np.arange(0.1, 1, 0.1):

    for i in tqdm(range(1000)):
        temp = del_ran(df_exog = df_noPrice, labels = df['Preis'], chance = c)
        imputed_simul.at[i] = impute(temp[0], temp[1], temp[2], temp[3])

    imputed_stats.loc[c] = imputed_simul.mean()
imputed_stats

    

 73%|███████▎  | 734/1000 [00:21<00:07, 34.43it/s]


KeyboardInterrupt: 

In [112]:
imputed_stats

Unnamed: 0,MSE KNN_1,MSE KNN_3,MSE KNN_5,SE KNN_1,SE KNN_3,SE KNN_5
0.1,92467.018956,76067.758291,82465.238661,23.179212,23.042157,22.972533
0.2,92912.915175,76557.164682,82957.033165,23.173237,23.020625,22.943791
0.3,93358.245088,77021.262064,83370.518097,23.166582,22.995876,22.912072
0.4,94065.074613,77683.747536,83937.716969,23.156088,22.965568,22.875515
0.5,94836.571889,78364.464835,84504.448797,23.145583,22.934242,22.837945
0.6,96050.88067,79384.664281,85344.74555,23.127963,22.893562,22.790514
0.7,97553.727845,80430.145101,86280.851401,23.109497,22.850868,22.73717
0.8,99798.167423,82097.638226,88104.874085,23.079573,22.786309,22.646509
0.9,104507.748069,86000.617661,92631.534363,23.051956,22.6632,22.44155
