In [3]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import root_mean_squared_error
from scripts import kb

kb_completeness = kb.get_kb_completeness()

datasets = kb_completeness.name.unique()
objects = kb_completeness.column_name.unique()
ml_algorithms = kb_completeness.ml_algorithm.unique()

columns_X = ['n_tuples', 'uniqueness', 'min',
       'max', 'mean', 'median', 'std', 'skewness', 'kurtosis', 'mad', 'iqr',
       'p_min', 'p_max', 'k_min', 'k_max', 's_min', 's_max', 'entropy',
       'density', 'missing_perc']

techniques = ['impute_standard', 'impute_mean',
       'impute_median', 'impute_random', 'impute_knn', 'impute_mice',
       'impute_linear_regression', 'impute_random_forest', 'impute_cmeans']

def training_testing_completeness(distance, k):
    with open("../results/prediction_completeness.csv", "w") as f1:
        f1.write("dataset,model,technique,rmse\n")

        for dataset in datasets:
            for model in ml_algorithms:
                for technique in techniques:

                    data = kb_completeness.copy()

                    df = data[(data["ml_algorithm"] == model)].copy()

                    train = df[df["name"] != dataset]
                    test = df[df["name"] == dataset]

                    X_train = train[columns_X]
                    y_train = train[technique]
                    X_test = test[columns_X]
                    y_test = test[technique]

                    X_test_not_scaled = X_test.reset_index(drop=True).copy()

                    X_train = StandardScaler().fit_transform(X_train)
                    X_train = np.nan_to_num(X_train)

                    X_test = StandardScaler().fit_transform(X_test)
                    X_test = np.nan_to_num(X_test)

                    knn = KNeighborsRegressor(n_neighbors=k, metric=distance)
                    knn.fit(X_train, y_train)

                    y_pred = knn.predict(X_test)
                    error = root_mean_squared_error(y_test, y_pred)
                    #print(dataset+": "+str(error))
                    f1.write(dataset + "," + model + "," + technique + "," + str(error) + "\n")

    data = pd.read_csv("../results/prediction_completeness.csv")
    print("Done! Final RMSE for "+distance+" and "+str(k)+" neighbours: "+str(data.rmse.mean()))

In [4]:
for distance in ['cosine','euclidean','manhattan']:
    training_testing_completeness(distance, 14)

Done! Final RMSE for cosine and 14 neighbours: 0.2234964096702528
Done! Final RMSE for euclidean and 14 neighbours: 0.23700040612407924
Done! Final RMSE for manhattan and 14 neighbours: 0.24779536855736914


In [5]:
for k in [6,8,10,12,14,16]:
    training_testing_completeness('cosine', k)

Done! Final RMSE for cosine and 6 neighbours: 0.2484942828092965
Done! Final RMSE for cosine and 8 neighbours: 0.24153365317859538
Done! Final RMSE for cosine and 10 neighbours: 0.23580131171634128
Done! Final RMSE for cosine and 12 neighbours: 0.22883179467046666
Done! Final RMSE for cosine and 14 neighbours: 0.2234964096702528
Done! Final RMSE for cosine and 16 neighbours: 0.22080637307679482


In [6]:
for k in [16,17,18,19,20]:
    training_testing_completeness('cosine', k)


Done! Final RMSE for cosine and 16 neighbours: 0.22080637307679482
Done! Final RMSE for cosine and 17 neighbours: 0.21919149831765278
Done! Final RMSE for cosine and 18 neighbours: 0.21847573598914824
Done! Final RMSE for cosine and 19 neighbours: 0.21750152149617577
Done! Final RMSE for cosine and 20 neighbours: 0.21660755006501076


In [7]:
for k in [28,30]:
    training_testing_completeness('cosine', k)


Done! Final RMSE for cosine and 28 neighbours: 0.21189808231174462
Done! Final RMSE for cosine and 30 neighbours: 0.21115120888260408


In [8]:
for k in [35,40]:
    training_testing_completeness('cosine', k)


Done! Final RMSE for cosine and 35 neighbours: 0.20886694731596211
Done! Final RMSE for cosine and 40 neighbours: 0.20734816020965618


In [9]:
for k in [29,30,31,32,33]:
    training_testing_completeness('cosine', k)


Done! Final RMSE for cosine and 29 neighbours: 0.21146622001390292
Done! Final RMSE for cosine and 30 neighbours: 0.21115120888260408
Done! Final RMSE for cosine and 31 neighbours: 0.21090803957425963
Done! Final RMSE for cosine and 32 neighbours: 0.21030037700146556
Done! Final RMSE for cosine and 33 neighbours: 0.20973499477256022


In [11]:
for k in [25,26,27,28,29,30,31,32,33,34,35]:
    training_testing_completeness('cosine', k)

Done! Final RMSE for cosine and 25 neighbours: 0.21315782868746064
Done! Final RMSE for cosine and 26 neighbours: 0.21301410500616663
Done! Final RMSE for cosine and 27 neighbours: 0.21241772876877643
Done! Final RMSE for cosine and 28 neighbours: 0.21189808231174462
Done! Final RMSE for cosine and 29 neighbours: 0.21146622001390292
Done! Final RMSE for cosine and 30 neighbours: 0.21115120888260408
Done! Final RMSE for cosine and 31 neighbours: 0.21090803957425963
Done! Final RMSE for cosine and 32 neighbours: 0.21030037700146556
Done! Final RMSE for cosine and 33 neighbours: 0.20973499477256022
Done! Final RMSE for cosine and 34 neighbours: 0.2092172169965556
Done! Final RMSE for cosine and 35 neighbours: 0.20886694731596211


In [12]:
for k in [35,36,37,38,39,40]:
    training_testing_completeness('cosine', k)

Done! Final RMSE for cosine and 35 neighbours: 0.20886694731596211
Done! Final RMSE for cosine and 36 neighbours: 0.20865878958147713
Done! Final RMSE for cosine and 37 neighbours: 0.20844695101277075
Done! Final RMSE for cosine and 38 neighbours: 0.2080538389141549
Done! Final RMSE for cosine and 39 neighbours: 0.20782331246574073
Done! Final RMSE for cosine and 40 neighbours: 0.20734816020965618


In [None]:
### best k = 35

In [13]:
for distance in ['cosine','euclidean','manhattan']:
    training_testing_completeness(distance, 35)


Done! Final RMSE for cosine and 35 neighbours: 0.20886694731596211
Done! Final RMSE for euclidean and 35 neighbours: 0.22718240853682345
Done! Final RMSE for manhattan and 35 neighbours: 0.23147498501620317


In [None]:
### best distance is cosine
