In [3]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import root_mean_squared_error
from scripts import dataset
import warnings
warnings.filterwarnings("ignore")

original_data = pd.read_json("../kb/KBR.json")

dimensions = original_data.dimension.unique()
models = original_data.model.unique()
datasets = original_data.name.unique()

datasets_fd = ["BachChoralHarmony", "bank", "cancer", "mushrooms", "soybean"]

data_impact = dataset.get_dataset()

def training_testing(distance, k):
    with open("../results/results_prediction_impact.csv", "w") as f:
        f.write("dataset,model,dimension,rmse\n")
        for dataset in datasets:
            for model in models:
                for dimension in dimensions:

                    data = data_impact.copy()

                    if dimension == "consistency" and (dataset in datasets_fd):

                        df = data[(data["model"] == model) & (data["dimension"] == dimension) & (
                                    (data["name"] == "BachChoralHarmony") | (data["name"] == "mushrooms") | (
                                        data["name"] == "bank") | (data["name"] == "cancer") | (
                                                data["name"] == "soybean"))].copy()

                        train = df[df["name"] != dataset]
                        test = df[df["name"] == dataset]

                        columns = df.columns
                        features = columns.drop(
                            ["name", "dimension", "model", "score", "impact", "p_correlated_features_0.5",
                             "p_correlated_features_0.6", "p_correlated_features_0.7", "p_correlated_features_0.8",
                             "p_correlated_features_0.9"])

                        X_train = train[features]
                        y_train = train["impact"]
                        X_test = test[features]
                        y_test = test["impact"]

                        X_train = StandardScaler().fit_transform(X_train)
                        X_train = np.nan_to_num(X_train)

                        X_test = StandardScaler().fit_transform(X_test)
                        X_test = np.nan_to_num(X_test)

                        knn = KNeighborsRegressor(n_neighbors=k, metric=distance)
                        knn.fit(X_train, y_train)
                        y_pred = knn.predict(X_test)
                        error = root_mean_squared_error(y_test, y_pred)
                        #print(dataset+": "+str(error))
                        f.write(dataset + "," + model + "," + dimension + "," + str(error) + "\n")

                    elif dimension != "consistency":

                        df = data[(data["model"] == model) & (data["dimension"] == dimension)].copy()

                        train = df[df["name"] != dataset]
                        test = df[df["name"] == dataset]

                        columns = df.columns
                        features = columns.drop(
                            ["name", "dimension", "model", "score", "impact", "p_correlated_features_0.5",
                             "p_correlated_features_0.6", "p_correlated_features_0.7", "p_correlated_features_0.8",
                             "p_correlated_features_0.9"])

                        X_train = train[features]
                        y_train = train["impact"]
                        X_test = test[features]
                        y_test = test["impact"]

                        X_train = StandardScaler().fit_transform(X_train)
                        X_train = np.nan_to_num(X_train)

                        X_test = StandardScaler().fit_transform(X_test)
                        X_test = np.nan_to_num(X_test)

                        knn = KNeighborsRegressor(n_neighbors=k, metric=distance)
                        knn.fit(X_train, y_train)
                        y_pred = knn.predict(X_test)
                        error = root_mean_squared_error(y_test, y_pred)
                        #print(dataset+": "+str(error))
                        f.write(dataset + "," + model + "," + dimension + "," + str(error) + "\n")

    data = pd.read_csv("../results/results_prediction_impact.csv")
    print("Done! Final RMSE for "+distance+" and "+str(k)+" neighbours: "+str(data.rmse.mean()))

In [4]:
for distance in ['cosine','euclidean','manhattan']:
    training_testing(distance, 14)

Done! Final RMSE for cosine and 14 neighbours: 0.14585447998377396
Done! Final RMSE for euclidean and 14 neighbours: 0.1367176584399224
Done! Final RMSE for manhattan and 14 neighbours: 0.1344474223217409


In [7]:
for k in [6,8,10,12,14,16]:
    training_testing('manhattan', k)

Done! Final RMSE for manhattan and 6 neighbours: 0.14455334071991155
Done! Final RMSE for manhattan and 8 neighbours: 0.1396082024193386
Done! Final RMSE for manhattan and 10 neighbours: 0.13692906640306862
Done! Final RMSE for manhattan and 12 neighbours: 0.13518459137687308
Done! Final RMSE for manhattan and 14 neighbours: 0.1344474223217409
Done! Final RMSE for manhattan and 16 neighbours: 0.13441317404519837


In [10]:
for k in [13,14,15,16,17,18,19,20]:
    training_testing('manhattan', k)


Done! Final RMSE for manhattan and 13 neighbours: 0.13472700213379546
Done! Final RMSE for manhattan and 14 neighbours: 0.1344474223217409
Done! Final RMSE for manhattan and 15 neighbours: 0.13464626295638926
Done! Final RMSE for manhattan and 16 neighbours: 0.13441317404519837
Done! Final RMSE for manhattan and 17 neighbours: 0.134041287912328
Done! Final RMSE for manhattan and 18 neighbours: 0.13381574555676198
Done! Final RMSE for manhattan and 19 neighbours: 0.13377086437265032
Done! Final RMSE for manhattan and 20 neighbours: 0.13361985273748317


In [None]:
### best k = 14

In [11]:
for distance in ['cosine','euclidean','manhattan']:
    training_testing(distance, 14)


Done! Final RMSE for cosine and 14 neighbours: 0.14585447998377396
Done! Final RMSE for euclidean and 14 neighbours: 0.1367176584399224
Done! Final RMSE for manhattan and 14 neighbours: 0.1344474223217409


In [None]:
### best distance = manhattan
