In [167]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KDTree
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)

In [152]:
# Initialisierung von DataFrames und Data Cleaning
df = pd.read_csv('Laptop-Preise.csv', sep=';', decimal=',')
df = df[df.extern_Schnittstellen != 2300] # Ausreißer löschen
df = df.reset_index(drop=True) # Reset Index
df.drop(['Betriebssystem_OHNE', 'Betriebssystem_Mac', 'Marke_Dell'], axis=1) # Mac und Marke_Apple sind identische Merkmale
df_noPrice = df.drop('Preis', axis=1)
error_mse = pd.DataFrame(columns=['chance', 'LM', 'KNN_1', 'KNN_3', 'KNN_5'])
error_stdErr = pd.DataFrame(columns=['chance', 'LM', 'KNN_1', 'KNN_3', 'KNN_5'])

In [153]:
# Skalierung (Standardisierung) von df_noPrice
col_names = df_noPrice.columns
scaler = StandardScaler().fit(df_noPrice.values)
df_noPrice = scaler.transform(df_noPrice.values)
df_noPrice = pd.DataFrame(df_noPrice, columns=col_names)

In [154]:
# Inputs sind einfache DataFrames
# test_values: Exogene von Delete
# test_labels: Preis von Delete
# train_values: Exogene von Keep
# train_labels: Preis von Keep

train_values, test_values, train_labels, test_labels = train_test_split(df_noPrice, df['Preis'], test_size=0.01)

def del_ran(df_exog, labels, chance):
    rand_array = np.random.rand(df_exog.shape[0])
    delete_entries = rand_array < chance
    keep_entries = rand_array >= chance
    
    return [df_exog[delete_entries], labels[delete_entries], df_exog[keep_entries], labels[keep_entries]]

    # return {'keep' : df.iloc[keep_entries], 'delete' : df.iloc[delete_entries]}


# Umschreiben, dass es auch mit der aktuellen impute-Methode funktioniert
temp = del_ran(df_exog = df_noPrice, labels = df['Preis'], chance = 0.01)

In [168]:
def impute(test_values, test_labels, train_values, train_labels):
    
    tree = KDTree(train_values.values, leaf_size=5)

    imputed_values_knn_1 = []
    imputed_values_knn_3 = []
    imputed_values_knn_5 = []

    for index, entry in enumerate(test_values.values):
 
        dist, ind = tree.query([entry], k=5)
        ind = ind[0]

        current_impute_knn_1 = np.mean(train_labels.values[ind][0])
        current_impute_knn_3 = np.mean(train_labels.values[ind][:3])
        current_impute_knn_5 = np.mean(train_labels.values[ind])

        imputed_values_knn_1.append(current_impute_knn_1)
        imputed_values_knn_3.append(current_impute_knn_3)
        imputed_values_knn_5.append(current_impute_knn_5)
       
        # print(train_labels.values[ind])
        # print(current_impute_knn_1//1, current_impute_knn_3//1, current_impute_knn_5//1)
        # print(test_labels.values[index])
        # print(train_values.values[ind])

    mse_knn_1 = np.mean((test_labels.values - imputed_values_knn_1)**2)
    mse_knn_3 = np.mean((test_labels.values - imputed_values_knn_3)**2)
    mse_knn_5 = np.mean((test_labels.values - imputed_values_knn_5)**2)

    sem_knn_1 = stats.sem(list(train_labels.values)+imputed_values_knn_1)
    sem_knn_3 = stats.sem(list(train_labels.values)+imputed_values_knn_3)
    sem_knn_5 = stats.sem(list(train_labels.values)+imputed_values_knn_5)


    return mse_knn_1, mse_knn_3, mse_knn_5, sem_knn_1, sem_knn_3, sem_knn_5

In [169]:
impute(temp[0], temp[1], temp[2], temp[3])

(114200.0,
 124985.96206349207,
 157931.2892,
 23.227741243284537,
 23.22853609608134,
 23.220854878985932)

In [None]:
k = temp['keep']
d = temp['delete']
t = d.iloc[0].values.reshape(1, -1)

print(len(t))
print(k.shape)

tree = KDTree(temp['keep'], leaf_size=5)              
dist, ind = tree.query(t, k=3)


print(dist)
print(ind[0])
# [641 501 201]

pd.DataFrame(t)
pd.DataFrame(k.at(ind[0]))

# for t in d.iterrows():
#     dist, ind = tree.query(t, k=3)
#     print(ind, dist)