In [4]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import root_mean_squared_error
from scripts import kb
import warnings
warnings.filterwarnings("ignore")
kb_completeness = kb.get_kb_impact_completeness()

In [5]:
datasets = kb_completeness.name.unique()
objects = kb_completeness.column_name.unique()
ml_algorithms = kb_completeness.ml_algorithm.unique()

columns_X = ['n_tuples', 'constancy',
       'imbalance', 'uniqueness', 'unalikeability', 'entropy', 'density',
       'mean_char', 'std_char', 'skewness_char', 'kurtosis_char', 'min_char',
       'max_char', 'missing_perc']

techniques = ['impute_standard', 'impute_mode',
       'impute_random', 'impute_knn', 'impute_mice',
       'impute_logistic_regression', 'impute_random_forest', 'impute_kproto']

def training_testing_completeness(distance, k):
    with open("../results/prediction_completeness.csv", "w") as f1:
        f1.write("dataset,model,technique,rmse\n")

        for dataset in datasets:
            for model in ml_algorithms:
                for technique in techniques:

                    data = kb_completeness.copy()

                    df = data[(data["ml_algorithm"] == model)].copy()

                    train = df[df["name"] != dataset]
                    test = df[df["name"] == dataset]

                    X_train = train[columns_X]
                    y_train = train[technique]
                    X_test = test[columns_X]
                    y_test = test[technique]

                    X_test_not_scaled = X_test.reset_index(drop=True).copy()

                    X_train = StandardScaler().fit_transform(X_train)
                    X_train = np.nan_to_num(X_train)

                    X_test = StandardScaler().fit_transform(X_test)
                    X_test = np.nan_to_num(X_test)

                    knn = KNeighborsRegressor(n_neighbors=k, metric=distance)
                    knn.fit(X_train, y_train)

                    y_pred = knn.predict(X_test)
                    error = root_mean_squared_error(y_test, y_pred)
                    #print(dataset+": "+str(error))
                    f1.write(dataset + "," + model + "," + technique + "," + str(error) + "\n")

    data = pd.read_csv("../results/prediction_completeness.csv")
    print("Done! Final RMSE for "+distance+" and "+str(k)+" neighbours: "+str(data.rmse.mean()))

In [6]:
for distance in ['cosine','euclidean','manhattan']:
    training_testing_completeness(distance, 14)

Done! Final RMSE for cosine and 14 neighbours: 0.03750194407465105
Done! Final RMSE for euclidean and 14 neighbours: 0.03445399473219821
Done! Final RMSE for manhattan and 14 neighbours: 0.03428640153711452


In [7]:
for k in [16,17,18,19,20]:
    training_testing_completeness('manhattan', k)


Done! Final RMSE for manhattan and 16 neighbours: 0.033675432890968894
Done! Final RMSE for manhattan and 17 neighbours: 0.03335791362454722
Done! Final RMSE for manhattan and 18 neighbours: 0.03313336681800607
Done! Final RMSE for manhattan and 19 neighbours: 0.03293154628660894
Done! Final RMSE for manhattan and 20 neighbours: 0.03275508295163396


In [8]:
for k in [29,30,31,32,33]:
    training_testing_completeness('manhattan', k)


Done! Final RMSE for manhattan and 29 neighbours: 0.031288365862363346
Done! Final RMSE for manhattan and 30 neighbours: 0.03109480482901224
Done! Final RMSE for manhattan and 31 neighbours: 0.030957760168404215
Done! Final RMSE for manhattan and 32 neighbours: 0.030837217455562956
Done! Final RMSE for manhattan and 33 neighbours: 0.030697730728576837


In [9]:
for k in [30,31,32,33,34,35]:
    training_testing_completeness('manhattan', k)

Done! Final RMSE for manhattan and 30 neighbours: 0.03109480482901224
Done! Final RMSE for manhattan and 31 neighbours: 0.030957760168404215
Done! Final RMSE for manhattan and 32 neighbours: 0.030837217455562956
Done! Final RMSE for manhattan and 33 neighbours: 0.030697730728576837
Done! Final RMSE for manhattan and 34 neighbours: 0.030523142926943046
Done! Final RMSE for manhattan and 35 neighbours: 0.03040744150568995


In [12]:
for k in [35,36,37,38,39,40]:
    training_testing_completeness('manhattan', k)

Done! Final RMSE for manhattan and 35 neighbours: 0.03040744150568995
Done! Final RMSE for manhattan and 36 neighbours: 0.030344349599401868
Done! Final RMSE for manhattan and 37 neighbours: 0.030246419458858986
Done! Final RMSE for manhattan and 38 neighbours: 0.03013634326119047
Done! Final RMSE for manhattan and 39 neighbours: 0.030056916261208405
Done! Final RMSE for manhattan and 40 neighbours: 0.029943995529867345


In [13]:
### best k = 35

In [14]:
for distance in ['cosine','euclidean','manhattan']:
    training_testing_completeness(distance, 35)


Done! Final RMSE for cosine and 35 neighbours: 0.03241262117425579
Done! Final RMSE for euclidean and 35 neighbours: 0.0304920652536932
Done! Final RMSE for manhattan and 35 neighbours: 0.03040744150568995


In [None]:
### best distance is cosine
