In [7]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import root_mean_squared_error
from scripts import kb
from scripts import tt
import warnings
warnings.filterwarnings("ignore")

kb_completeness = kb.get_kb_completeness()

In [9]:
datasets = kb_completeness.name.unique()
objects = kb_completeness.column_name.unique()
ml_algorithms = kb_completeness.ml_algorithm.unique()

columns_X = ['n_tuples', 'constancy',
       'imbalance', 'uniqueness', 'unalikeability', 'entropy', 'density',
       'mean_char', 'std_char', 'skewness_char', 'kurtosis_char', 'min_char',
       'max_char', 'missing_perc']

techniques = ['impute_standard', 'impute_mode',
       'impute_random', 'impute_knn', 'impute_mice',
       'impute_logistic_regression', 'impute_random_forest', 'impute_kproto']

def training_testing_completeness(distance, k):
    with open("../results/prediction_completeness.csv", "w") as f1:
        f1.write("dataset,model,technique,rmse\n")

        for dataset in datasets:
            for model in ml_algorithms:
                for technique in techniques:

                    data = kb_completeness.copy()

                    df = data[(data["ml_algorithm"] == model)].copy()

                    train = df[df["name"] != dataset]
                    test = df[df["name"] == dataset]

                    X_train = train[columns_X]
                    y_train = train[technique]
                    X_test = test[columns_X]
                    y_test = test[technique]

                    X_test_not_scaled = X_test.reset_index(drop=True).copy()

                    X_train = StandardScaler().fit_transform(X_train)
                    X_train = np.nan_to_num(X_train)

                    X_test = StandardScaler().fit_transform(X_test)
                    X_test = np.nan_to_num(X_test)

                    knn = KNeighborsRegressor(n_neighbors=k, metric=distance)
                    knn.fit(X_train, y_train)

                    y_pred = knn.predict(X_test)
                    error = root_mean_squared_error(y_test, y_pred)
                    #print(dataset+": "+str(error))
                    f1.write(dataset + "," + model + "," + technique + "," + str(error) + "\n")

    data = pd.read_csv("../results/prediction_completeness.csv")
    print("Done! Final RMSE for "+distance+" and "+str(k)+" neighbours: "+str(data.rmse.mean()))

In [10]:
for distance in ['cosine','euclidean','manhattan']:
    training_testing_completeness(distance, 14)

Done! Final RMSE for cosine and 14 neighbours: 0.28223076261854035
Done! Final RMSE for euclidean and 14 neighbours: 0.2581437353366429
Done! Final RMSE for manhattan and 14 neighbours: 0.24912381885348828


In [12]:
for k in [6,8,10,12,14,16]:
    training_testing_completeness('manhattan', k)

Done! Final RMSE for manhattan and 6 neighbours: 0.2568508803924446
Done! Final RMSE for manhattan and 8 neighbours: 0.2547787010031717
Done! Final RMSE for manhattan and 10 neighbours: 0.2528791130492213
Done! Final RMSE for manhattan and 12 neighbours: 0.2492238051379258
Done! Final RMSE for manhattan and 14 neighbours: 0.24912381885348828
Done! Final RMSE for manhattan and 16 neighbours: 0.24693435530040247


In [14]:
for k in [16,17,18,19,20]:
    training_testing_completeness('manhattan', k)


Done! Final RMSE for manhattan and 16 neighbours: 0.24693435530040247
Done! Final RMSE for manhattan and 17 neighbours: 0.2465750529121183
Done! Final RMSE for manhattan and 18 neighbours: 0.2456570173479543
Done! Final RMSE for manhattan and 19 neighbours: 0.24474936752829052
Done! Final RMSE for manhattan and 20 neighbours: 0.24436143598909366


In [15]:
for k in [28,30]:
    training_testing_completeness('manhattan', k)


Done! Final RMSE for manhattan and 28 neighbours: 0.24031303491138986
Done! Final RMSE for manhattan and 30 neighbours: 0.23985426403517782


In [16]:
for k in [35,40]:
    training_testing_completeness('manhattan', k)


Done! Final RMSE for manhattan and 35 neighbours: 0.23753230560278488
Done! Final RMSE for manhattan and 40 neighbours: 0.23515027435497998


In [17]:
for k in [29,30,31,32,33]:
    training_testing_completeness('manhattan', k)


Done! Final RMSE for manhattan and 29 neighbours: 0.24005730201911626
Done! Final RMSE for manhattan and 30 neighbours: 0.23985426403517782
Done! Final RMSE for manhattan and 31 neighbours: 0.23944081485166535
Done! Final RMSE for manhattan and 32 neighbours: 0.23894062829128193
Done! Final RMSE for manhattan and 33 neighbours: 0.2384035931793092


In [18]:
for k in [25,26,27,28,29,30,31,32,33,34,35]:
    training_testing_completeness('manhattan', k)

Done! Final RMSE for manhattan and 25 neighbours: 0.2419999650106256
Done! Final RMSE for manhattan and 26 neighbours: 0.24119666278860027
Done! Final RMSE for manhattan and 27 neighbours: 0.24098328920500114
Done! Final RMSE for manhattan and 28 neighbours: 0.24031303491138986
Done! Final RMSE for manhattan and 29 neighbours: 0.24005730201911626
Done! Final RMSE for manhattan and 30 neighbours: 0.23985426403517782
Done! Final RMSE for manhattan and 31 neighbours: 0.23944081485166535
Done! Final RMSE for manhattan and 32 neighbours: 0.23894062829128193
Done! Final RMSE for manhattan and 33 neighbours: 0.2384035931793092
Done! Final RMSE for manhattan and 34 neighbours: 0.23791266516624193
Done! Final RMSE for manhattan and 35 neighbours: 0.23753230560278488


In [19]:
for k in [35,36,37,38,39,40]:
    training_testing_completeness('manhattan', k)

Done! Final RMSE for manhattan and 35 neighbours: 0.23753230560278488
Done! Final RMSE for manhattan and 36 neighbours: 0.2368785180316553
Done! Final RMSE for manhattan and 37 neighbours: 0.2364316367515493
Done! Final RMSE for manhattan and 38 neighbours: 0.2358245938286988
Done! Final RMSE for manhattan and 39 neighbours: 0.23546993857763654
Done! Final RMSE for manhattan and 40 neighbours: 0.23515027435497998


In [20]:
### best k = 35

In [21]:
for distance in ['cosine','euclidean','manhattan']:
    training_testing_completeness(distance, 35)


Done! Final RMSE for cosine and 35 neighbours: 0.269264002667985
Done! Final RMSE for euclidean and 35 neighbours: 0.23920020888147323
Done! Final RMSE for manhattan and 35 neighbours: 0.23753230560278488


In [22]:
### best distance is cosine
