In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import root_mean_squared_error
from scripts import kb

kb_completeness = kb.get_kb_impact_completeness()

datasets = kb_completeness.name.unique()
objects = kb_completeness.column_name.unique()
ml_algorithms = kb_completeness.ml_algorithm.unique()

columns_X = ['n_tuples', 'uniqueness', 'min',
       'max', 'mean', 'median', 'std', 'skewness', 'kurtosis', 'mad', 'iqr',
       'p_min', 'p_max', 'k_min', 'k_max', 's_min', 's_max', 'entropy',
       'density', 'missing_perc']

techniques = ['impute_standard', 'impute_mean',
       'impute_median', 'impute_random', 'impute_knn', 'impute_mice',
       'impute_linear_regression', 'impute_random_forest', 'impute_cmeans']

def training_testing_completeness(distance, k):
    with open("../results/prediction_completeness.csv", "w") as f1:
        f1.write("dataset,model,technique,rmse\n")

        for dataset in datasets:
            for model in ml_algorithms:
                for technique in techniques:

                    data = kb_completeness.copy()

                    df = data[(data["ml_algorithm"] == model)].copy()

                    train = df[df["name"] != dataset]
                    test = df[df["name"] == dataset]

                    X_train = train[columns_X]
                    y_train = train[technique]
                    X_test = test[columns_X]
                    y_test = test[technique]

                    X_test_not_scaled = X_test.reset_index(drop=True).copy()

                    X_train = StandardScaler().fit_transform(X_train)
                    X_train = np.nan_to_num(X_train)

                    X_test = StandardScaler().fit_transform(X_test)
                    X_test = np.nan_to_num(X_test)

                    knn = KNeighborsRegressor(n_neighbors=k, metric=distance)
                    knn.fit(X_train, y_train)

                    y_pred = knn.predict(X_test)
                    error = root_mean_squared_error(y_test, y_pred)
                    #print(dataset+": "+str(error))
                    f1.write(dataset + "," + model + "," + technique + "," + str(error) + "\n")

    data = pd.read_csv("../results/prediction_completeness.csv")
    print("Done! Final RMSE for "+distance+" and "+str(k)+" neighbours: "+str(data.rmse.mean()))

In [2]:
for distance in ['cosine','euclidean','manhattan']:
    training_testing_completeness(distance, 14)

Done! Final RMSE for cosine and 14 neighbours: 0.03514512001250314
Done! Final RMSE for euclidean and 14 neighbours: 0.036440963282997646
Done! Final RMSE for manhattan and 14 neighbours: 0.03663092203344591


In [3]:
for k in [6,8,10,12,14,16]:
    training_testing_completeness('cosine', k)

Done! Final RMSE for cosine and 6 neighbours: 0.03991882602327664
Done! Final RMSE for cosine and 8 neighbours: 0.03823708062485948
Done! Final RMSE for cosine and 10 neighbours: 0.03698233010656723
Done! Final RMSE for cosine and 12 neighbours: 0.03605985745717254
Done! Final RMSE for cosine and 14 neighbours: 0.03514512001250314
Done! Final RMSE for cosine and 16 neighbours: 0.03452365572771227


In [4]:
for k in [16,17,18,19,20]:
    training_testing_completeness('cosine', k)


Done! Final RMSE for cosine and 16 neighbours: 0.03452365572771227
Done! Final RMSE for cosine and 17 neighbours: 0.0342498777287806
Done! Final RMSE for cosine and 18 neighbours: 0.03400443314221963
Done! Final RMSE for cosine and 19 neighbours: 0.03390497232970522
Done! Final RMSE for cosine and 20 neighbours: 0.033661152309438454


In [5]:
for k in [28,30]:
    training_testing_completeness('cosine', k)


Done! Final RMSE for cosine and 28 neighbours: 0.032351080539066414
Done! Final RMSE for cosine and 30 neighbours: 0.032118686903739174


In [6]:
for k in [35,40]:
    training_testing_completeness('cosine', k)


Done! Final RMSE for cosine and 35 neighbours: 0.0317248181517502
Done! Final RMSE for cosine and 40 neighbours: 0.03138646296677046


In [7]:
for k in [29,30,31,32,33]:
    training_testing_completeness('cosine', k)


Done! Final RMSE for cosine and 29 neighbours: 0.03219946845616403
Done! Final RMSE for cosine and 30 neighbours: 0.032118686903739174
Done! Final RMSE for cosine and 31 neighbours: 0.032051922731198126
Done! Final RMSE for cosine and 32 neighbours: 0.03195974552208088
Done! Final RMSE for cosine and 33 neighbours: 0.0318356806844158


In [8]:
for k in [30,31,32,33,34,35]:
    training_testing_completeness('cosine', k)

Done! Final RMSE for cosine and 30 neighbours: 0.032118686903739174
Done! Final RMSE for cosine and 31 neighbours: 0.032051922731198126
Done! Final RMSE for cosine and 32 neighbours: 0.03195974552208088
Done! Final RMSE for cosine and 33 neighbours: 0.0318356806844158
Done! Final RMSE for cosine and 34 neighbours: 0.03176476447924047
Done! Final RMSE for cosine and 35 neighbours: 0.0317248181517502


In [9]:
for k in [35,36,37,38,39,40]:
    training_testing_completeness('cosine', k)

Done! Final RMSE for cosine and 35 neighbours: 0.0317248181517502
Done! Final RMSE for cosine and 36 neighbours: 0.03166176343107265
Done! Final RMSE for cosine and 37 neighbours: 0.03159684311214556
Done! Final RMSE for cosine and 38 neighbours: 0.03152310878860519
Done! Final RMSE for cosine and 39 neighbours: 0.03145414605126418
Done! Final RMSE for cosine and 40 neighbours: 0.03138646296677046


In [10]:
### best k = 35

In [11]:
for distance in ['cosine','euclidean','manhattan']:
    training_testing_completeness(distance, 35)


Done! Final RMSE for cosine and 35 neighbours: 0.0317248181517502
Done! Final RMSE for euclidean and 35 neighbours: 0.0325425647202705
Done! Final RMSE for manhattan and 35 neighbours: 0.0330465804309964


In [None]:
### best distance is cosine
