In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import root_mean_squared_error
#from .kb import get_kb_accuracy

kb_accuracy = pd.read_csv("scripts/KBA.csv")

techniques = kb_accuracy.technique_accuracy.unique()
stds_outliers = kb_accuracy.std_outliers.unique()
perc_outliers = kb_accuracy.percentage_outliers.unique()
objects = kb_accuracy.column_name.unique()
datasets = kb_accuracy.name.unique()

columns = ['name', 'column_name', 'technique_accuracy']

columns_X = ['n_tuples', 'uniqueness', 'min', 'max',
       'mean', 'median', 'std', 'skewness', 'kurtosis', 'mad', 'iqr', 'p_min',
       'p_max', 'k_min', 'k_max', 's_min', 's_max', 'entropy', 'density',
       'percentage_outliers']

columns_y = 'f1_technique'

def training_testing_accuracy(distance, k):
    with open("../results/prediction_accuracy.csv", "w") as f1:
        f1.write("dataset,technique,rmse\n")

        with open("../results/techniques_accuracy_evaluation.csv", "w") as f2:
            f2.write("dataset,technique,real,pred\n")

            for dataset in datasets:
                    for technique in techniques:

                        data = kb_accuracy.copy()

                        df = data[(data["technique_accuracy"] == technique)].copy()

                        train = df[df["name"] != dataset]
                        test = df[df["name"] == dataset]

                        X_train = train[columns_X]
                        y_train = train[columns_y]
                        X_test = test[columns_X]
                        y_test = test[columns_y]

                        X_train = StandardScaler().fit_transform(X_train)
                        X_train = np.nan_to_num(X_train)

                        X_test = StandardScaler().fit_transform(X_test)
                        X_test = np.nan_to_num(X_test)

                        knn = KNeighborsRegressor(n_neighbors=k, metric=distance)
                        knn.fit(X_train, y_train)

                        y_pred = knn.predict(X_test)
                        error = root_mean_squared_error(y_test, y_pred)
                        #print(dataset+": "+str(error))
                        f1.write(dataset + "," + technique + "," + str(error) + "\n")

                        y_test = y_test.reset_index(drop=True)
                        for i in range(0, len(y_test)):
                            f2.write(dataset + "_" + str(i) + "," + technique + "," + str(
                                y_test[i]) + "," + str(y_pred[i]) + "\n")

    data = pd.read_csv("../results/prediction_accuracy.csv")
    print("Done! Final RMSE for "+distance+" and "+str(k)+" neighbours: "+str(data.rmse.mean()))

In [3]:
data = kb_accuracy.copy()

df = data[(data["technique_accuracy"] == 'LOF')].copy()

train = df[df["name"] != 'acustic']
test = df[df["name"] == 'acustic']

X_train = train[columns_X]
y_train = train[columns_y]
X_test = test[columns_X]
y_test = test[columns_y]

In [6]:
for distance in ['cosine','euclidean','manhattan']:
    training_testing_accuracy(distance, 14)

Done! Final RMSE for cosine and 14 neighbours: 0.14920350905603516
Done! Final RMSE for euclidean and 14 neighbours: 0.1351689567731969
Done! Final RMSE for manhattan and 14 neighbours: 0.12465085047210077


In [7]:
for k in [6,8,10,12,14,16]:
    training_testing_accuracy('manhattan', k)

Done! Final RMSE for manhattan and 6 neighbours: 0.1275813954274061
Done! Final RMSE for manhattan and 8 neighbours: 0.12621089039962535
Done! Final RMSE for manhattan and 10 neighbours: 0.12568888589939403
Done! Final RMSE for manhattan and 12 neighbours: 0.1249615425674713
Done! Final RMSE for manhattan and 14 neighbours: 0.12465085047210077
Done! Final RMSE for manhattan and 16 neighbours: 0.12429040885390151


In [8]:
for k in [18,20]:
    training_testing_accuracy('manhattan', k)


Done! Final RMSE for manhattan and 18 neighbours: 0.1238987283386525
Done! Final RMSE for manhattan and 20 neighbours: 0.12372580203510632


In [9]:
for k in [28,30]:
    training_testing_accuracy('manhattan', k)


Done! Final RMSE for manhattan and 28 neighbours: 0.12320749205031138
Done! Final RMSE for manhattan and 30 neighbours: 0.12324871937424696


In [13]:
for k in [20,21,22,23,24,25,26,27,28,29,30]:
    training_testing_accuracy('manhattan', k)

Done! Final RMSE for manhattan and 20 neighbours: 0.12372580203510632
Done! Final RMSE for manhattan and 21 neighbours: 0.12351699801714457
Done! Final RMSE for manhattan and 22 neighbours: 0.12346645009007322
Done! Final RMSE for manhattan and 23 neighbours: 0.12337938143605477
Done! Final RMSE for manhattan and 24 neighbours: 0.1233607715464592
Done! Final RMSE for manhattan and 25 neighbours: 0.12329054801182958
Done! Final RMSE for manhattan and 26 neighbours: 0.12317420288933799
Done! Final RMSE for manhattan and 27 neighbours: 0.12318565796549229
Done! Final RMSE for manhattan and 28 neighbours: 0.12320749205031138
Done! Final RMSE for manhattan and 29 neighbours: 0.12325131105144181
Done! Final RMSE for manhattan and 30 neighbours: 0.12324871937424696


In [11]:
### best k = 27

In [14]:
for distance in ['cosine','euclidean','manhattan']:
    training_testing_accuracy(distance, 27)


Done! Final RMSE for cosine and 27 neighbours: 0.14783967818760962
Done! Final RMSE for euclidean and 27 neighbours: 0.13379002268866325
Done! Final RMSE for manhattan and 27 neighbours: 0.12318565796549229


In [None]:
### best distance is manhattan
