In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import root_mean_squared_error
#from .kb import get_kb_accuracy

kb_accuracy = pd.read_csv("scripts/KBA.csv")

techniques = kb_accuracy.technique_accuracy.unique()
stds_outliers = kb_accuracy.std_outliers.unique()
perc_outliers = kb_accuracy.percentage_outliers.unique()
objects = kb_accuracy.column_name.unique()
datasets = kb_accuracy.name.unique()

columns = ['name', 'column_name', 'technique_accuracy']

columns_X = ['n_tuples', 'uniqueness', 'min', 'max',
       'mean', 'median', 'std', 'skewness', 'kurtosis', 'mad', 'iqr', 'p_min',
       'p_max', 'k_min', 'k_max', 's_min', 's_max', 'entropy', 'density',
       'percentage_outliers', 'std_outliers']

columns_y = 'f1_technique'

def training_testing_accuracy(distance, k):
    with open("../results/prediction_accuracy.csv", "w") as f1:
        f1.write("dataset,technique,rmse\n")

        with open("../results/techniques_accuracy_evaluation.csv", "w") as f2:
            f2.write("dataset,technique,real,pred\n")

            for dataset in datasets:
                    for technique in techniques:

                        data = kb_accuracy.copy()

                        df = data[(data["technique_accuracy"] == technique)].copy()

                        train = df[df["name"] != dataset]
                        test = df[df["name"] == dataset]

                        X_train = train[columns_X]
                        y_train = train[columns_y]
                        X_test = test[columns_X]
                        y_test = test[columns_y]

                        X_train = StandardScaler().fit_transform(X_train)
                        X_train = np.nan_to_num(X_train)

                        X_test = StandardScaler().fit_transform(X_test)
                        X_test = np.nan_to_num(X_test)

                        knn = KNeighborsRegressor(n_neighbors=k, metric=distance)
                        knn.fit(X_train, y_train)

                        y_pred = knn.predict(X_test)
                        error = root_mean_squared_error(y_test, y_pred)
                        #print(dataset+": "+str(error))
                        f1.write(dataset + "," + technique + "," + str(error) + "\n")

                        y_test = y_test.reset_index(drop=True)
                        for i in range(0, len(y_test)):
                            f2.write(dataset + "_" + str(i) + "," + technique + "," + str(
                                y_test[i]) + "," + str(y_pred[i]) + "\n")

    data = pd.read_csv("../results/prediction_accuracy.csv")
    print("Done! Final RMSE for "+distance+" and "+str(k)+" neighbours: "+str(data.rmse.mean()))

In [4]:
data = kb_accuracy.copy()

df = data[(data["technique_accuracy"] == 'LOF')].copy()

train = df[df["name"] != 'acustic']
test = df[df["name"] == 'acustic']

X_train = train[columns_X]
y_train = train[columns_y]
X_test = test[columns_X]
y_test = test[columns_y]

In [12]:
X_test = X_test.reset_index(drop=True)

In [14]:
str(X_test.percentage_outliers[0])

'10.0'

In [3]:
for distance in ['cosine','euclidean','manhattan']:
    training_testing_accuracy(distance, 14)

Done! Final RMSE for cosine and 14 neighbours: 0.14714717587782894
Done! Final RMSE for euclidean and 14 neighbours: 0.13245590003879257
Done! Final RMSE for manhattan and 14 neighbours: 0.12108558310348627


In [11]:
for k in [6,8,10,12,14,16]:
    training_testing_accuracy('manhattan', k)

Done! Final RMSE for manhattan and 6 neighbours: 0.12416254930817079
Done! Final RMSE for manhattan and 8 neighbours: 0.12266648381592435
Done! Final RMSE for manhattan and 10 neighbours: 0.12195537084473615
Done! Final RMSE for manhattan and 12 neighbours: 0.12158076268150923
Done! Final RMSE for manhattan and 14 neighbours: 0.12108558310348627
Done! Final RMSE for manhattan and 16 neighbours: 0.12073042393857412


In [12]:
for k in [18,20]:
    training_testing_accuracy('manhattan', k)


Done! Final RMSE for manhattan and 18 neighbours: 0.1203893375344208
Done! Final RMSE for manhattan and 20 neighbours: 0.12015695290610198


In [13]:
for k in [28,30]:
    training_testing_accuracy('manhattan', k)


Done! Final RMSE for manhattan and 28 neighbours: 0.12001722180190022
Done! Final RMSE for manhattan and 30 neighbours: 0.11994525184874918


In [15]:
for k in [14,15,16,17,18,19,20,21,22,23,24,25,26,27]:
    training_testing_accuracy('manhattan', k)

Done! Final RMSE for manhattan and 14 neighbours: 0.12108558310348627
Done! Final RMSE for manhattan and 15 neighbours: 0.1210056840555883
Done! Final RMSE for manhattan and 16 neighbours: 0.12073042393857412
Done! Final RMSE for manhattan and 17 neighbours: 0.12058362748239063
Done! Final RMSE for manhattan and 18 neighbours: 0.1203893375344208
Done! Final RMSE for manhattan and 19 neighbours: 0.120196667658289
Done! Final RMSE for manhattan and 20 neighbours: 0.12015695290610198
Done! Final RMSE for manhattan and 21 neighbours: 0.11994598843640546
Done! Final RMSE for manhattan and 22 neighbours: 0.11989104473891417
Done! Final RMSE for manhattan and 23 neighbours: 0.1199140216555874
Done! Final RMSE for manhattan and 24 neighbours: 0.11999230477068422
Done! Final RMSE for manhattan and 25 neighbours: 0.11992795886910834
Done! Final RMSE for manhattan and 26 neighbours: 0.11994362480793262
Done! Final RMSE for manhattan and 27 neighbours: 0.11998643899678643


In [16]:
### best k = 22

In [17]:
for distance in ['cosine','euclidean','manhattan']:
    training_testing_accuracy(distance, 22)


Done! Final RMSE for cosine and 22 neighbours: 0.14575263577352585
Done! Final RMSE for euclidean and 22 neighbours: 0.13161654734822606
Done! Final RMSE for manhattan and 22 neighbours: 0.11989104473891417


In [None]:
### best distance is manhattan
