In [1]:
import numpy as np
import pandas as pd  
import random as rnd
import math

from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import GridSearchCV

from IPython.display import display, HTML

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')

In [2]:
# paths with datasets

paths = [
    "./proj_o.csv"
]

for num in range(1,11) :
    name = "isomap_o_dim" + str(num) + ".csv"
    paths.append(name)

In [3]:
paths

['./proj_o.csv',
 'isomap_o_dim1.csv',
 'isomap_o_dim2.csv',
 'isomap_o_dim3.csv',
 'isomap_o_dim4.csv',
 'isomap_o_dim5.csv',
 'isomap_o_dim6.csv',
 'isomap_o_dim7.csv',
 'isomap_o_dim8.csv',
 'isomap_o_dim9.csv',
 'isomap_o_dim10.csv']

In [4]:
# load the datasets

datasets = [np.genfromtxt(path, delimiter=';') for path in paths]

In [5]:
#isomap_o = datasets[2]

In [6]:
PCA_o = datasets[0]

In [7]:
#datasets[2]

array([[-0.0538136 ,  0.33322081],
       [-0.18348114, -0.07652568],
       [-0.2844246 , -0.26008137],
       ...,
       [-0.30795715, -0.67117225],
       [-0.1270504 ,  0.33897461],
       [-0.19206462, -0.18906588]])

In [8]:
print(PCA_o.shape)

(14098, 20)


In [9]:
#isomap_o

array([[-0.0538136 ,  0.33322081],
       [-0.18348114, -0.07652568],
       [-0.2844246 , -0.26008137],
       ...,
       [-0.30795715, -0.67117225],
       [-0.1270504 ,  0.33897461],
       [-0.19206462, -0.18906588]])

In [10]:
PCA_o

array([[-0.15233025,  0.2125124 , -0.00612836, ...,  0.0028785 ,
        -0.00215792, -0.00415046],
       [-0.16197173, -0.08758274, -0.01822054, ..., -0.00951794,
        -0.00061307, -0.00421629],
       [-0.15970531, -0.22186946, -0.01824396, ...,  0.00350789,
        -0.00393516, -0.00741874],
       ...,
       [-0.07531961, -0.5441434 , -0.11227014, ..., -0.01949586,
        -0.00116283, -0.00619764],
       [-0.03387491,  0.18981409,  0.01463004, ...,  0.01733281,
        -0.00830419, -0.00217058],
       [-0.08965747, -0.14413832, -0.00702615, ...,  0.00309083,
         0.00562926,  0.0063224 ]])

In [11]:
size = PCA_o.shape[0]

In [12]:
size

14098

# Repeat the code below for each dimension

In [639]:
# Fragment below should be calculated for all the interested dimensions, i.e. from 1 to 10

indexes = np.arange(size)

In [640]:
indexes

array([    0,     1,     2, ..., 14095, 14096, 14097])

In [641]:
# ratio for the training and the testing

ratio = 0.8

In [642]:
# randomly choosing the training indexes

tr_indexes = np.random.choice(range(size), math.floor(size * ratio), replace=False)

In [643]:
tr_indexes

array([ 8671,  6374,  2919, ...,  6551,  5889, 11781])

In [644]:
tr_indexes.size

11278

In [645]:
# check that there are no duplicates

u, c = np.unique(tr_indexes, return_counts=True)
dup = u[c > 1]

In [646]:
dup.size

0

In [647]:
# all the other indexes are for testing

test_indexes = np.delete(indexes, tr_indexes)

In [648]:
test_indexes.size

2820

In [649]:
# check the sizes

test_indexes.size + tr_indexes.size - size

0

In [650]:
test_indexes

array([    2,     7,    22, ..., 14088, 14089, 14094])

In [651]:
# randomly shuffle testing indexes

np.random.shuffle(test_indexes)

In [652]:
test_indexes

array([ 903, 7806, 3485, ..., 2880,  669, 7552])

In [653]:
# Change the datasets index for every iteration

test_in = np.take(datasets[10], test_indexes, axis = 0)

In [654]:
test_in

array([[-0.18968371, -0.06854088, -0.00902227, ...,  0.03067484,
        -0.0076571 , -0.01255257],
       [-0.20216073, -0.13966318, -0.01149782, ...,  0.01578627,
        -0.02814504, -0.03040133],
       [-0.11532285,  0.13884351, -0.02629507, ...,  0.06583339,
        -0.06481115,  0.03358341],
       ...,
       [ 0.13255629,  0.60677708, -0.06653959, ..., -0.03929787,
        -0.00565445, -0.1297162 ],
       [-0.33705379, -0.35341762, -0.02237093, ..., -0.02418308,
         0.04552451, -0.0334964 ],
       [ 0.29474751, -0.19686945, -0.17898078, ...,  0.29412513,
         0.12157646, -0.00497117]])

In [655]:
test_in.shape

(2820, 10)

In [656]:
# this are the true values for the testing samples

test_true = np.take(PCA_o, test_indexes, axis = 0)

In [657]:
test_true

array([[-0.15466659, -0.08195035, -0.01693082, ..., -0.00045501,
        -0.00255765, -0.00481845],
       [-0.16399843, -0.12859017, -0.02020846, ..., -0.00452592,
         0.00046965, -0.00125023],
       [-0.08734526,  0.04231773,  0.00045266, ..., -0.00325491,
         0.00681675,  0.0076079 ],
       ...,
       [-0.14488306,  0.45150594, -0.00709505, ...,  0.002433  ,
        -0.00069504, -0.00133956],
       [-0.18149808, -0.30111925, -0.02622533, ..., -0.0130031 ,
        -0.00077018, -0.0052894 ],
       [ 0.35652767, -0.02235619, -0.07761212, ..., -0.04215056,
        -0.08435196,  0.01680226]])

In [658]:
test_true.shape

(2820, 20)

In [659]:
# Change the datasets index for every iteration

train_in = np.take(datasets[10], tr_indexes, axis = 0)

In [660]:
train_in.shape

(11278, 10)

In [661]:
# values for the training the samples - PCA projected values corresponding to Isomap ones

train_out = np.take(PCA_o, tr_indexes, axis = 0)

In [662]:
train_out.shape

(11278, 20)

# Fragment here is for testing

In [1335]:
# K-NN regressor

knn = KNeighborsRegressor(n_neighbors=5, weights='distance')

In [341]:
# Use code below is for 1 dim case only

train_in = train_in.reshape(-1,1)

In [342]:
# Use code below is for 1 dim case only

test_in = test_in.reshape(-1, 1)

In [343]:
# Use code below is for 1 dim case only

train_out.reshape(1, -1)

array([[ 0.06831993, -0.07757869, -0.02375929, ...,  0.0015896 ,
        -0.00365918, -0.00239234]])

In [598]:
# Use code below is for 1 dim case only

#train_in

array([[-0.02787141],
       [ 0.1540273 ],
       [-0.08613664],
       ...,
       [-0.05950193],
       [ 0.16421847],
       [ 0.12667289]])

In [605]:
# Use code below is for 1 dim case only

#test_in

array([[-0.04228756],
       [ 0.14010688],
       [-0.17423329],
       ...,
       [-0.48914115],
       [ 0.64539735],
       [-0.06462557]])

In [1336]:
# fit K-NN regressor for the training data

knn.fit(train_in, train_out)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='distance')

In [1337]:
# compute the prediction

test_out = knn.predict(test_in)

In [1338]:
knn.score(train_in, train_out)



1.0

In [1339]:
knn.score(test_in, test_true)



0.9704680661340206

In [1340]:
# Evaluate the mean average prediction error (mean of the differences divided by the initial values)

diff = test_out - test_true
abs_diff = np.apply_along_axis(np.linalg.norm, 1, diff, ord=2)
abs_true = np.apply_along_axis(np.linalg.norm, 1, test_true, ord=2)
mape = np.mean(abs_diff/abs_true)

In [1341]:
mape

0.14472588067510644

# Fragment ended

In [663]:
# K-NN regressor

knn = KNeighborsRegressor()

In [664]:
# grid parameters

parameters = {
    'n_neighbors' : [3, 4, 5, 6, 7, 8, 10, 15],
    'weights' : ('uniform', 'distance'),
    'leaf_size' : [30, 40, 50, 60, 100, 10, 20],
    'p' : [2, 3, 1],
}

In [665]:
# gridsearch cross validation

reg = GridSearchCV(knn, parameters)

In [666]:
# fitting the regression via cross-validation

reg.fit(train_in, train_out)















































































































































































































GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                           metric='minkowski',
                                           metric_params=None, n_jobs=None,
                                           n_neighbors=5, p=2,
                                           weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'leaf_size': [30, 40, 50, 60, 100, 10, 20],
                         'n_neighbors': [3, 4, 5, 6, 7, 8, 10, 15],
                         'p': [2, 3, 1], 'weights': ('uniform', 'distance')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [77]:
reg.cv_results_

{'mean_fit_time': array([0.00919871, 0.00867105, 0.00954204, 0.00920892, 0.0093224 ,
        0.00830336, 0.00979576, 0.00930929, 0.00917697, 0.00884318,
        0.00833826, 0.00856137, 0.00842767, 0.00847411, 0.00879507,
        0.00942488, 0.00915909, 0.00978904, 0.00839424, 0.00845814,
        0.00860658, 0.0093029 , 0.0093823 , 0.00945239, 0.00925303,
        0.0090888 , 0.00919027, 0.00941424, 0.00845599, 0.00827985,
        0.00844398, 0.00932345, 0.00949759, 0.00897727, 0.0086854 ,
        0.00855947, 0.00860662, 0.00857224, 0.0089397 , 0.0090694 ,
        0.00864172, 0.00906782, 0.00853834, 0.00835247, 0.00922718,
        0.00872622, 0.00841923, 0.00850043, 0.00737157, 0.00762639,
        0.00819511, 0.00916815, 0.00784283, 0.00748048, 0.00792146,
        0.01017227, 0.00851531, 0.00779929, 0.00814443, 0.00749102,
        0.00750003, 0.00758018, 0.00812798, 0.00814905, 0.00758753,
        0.0074223 , 0.00928392, 0.00766263, 0.0086153 , 0.00968046,
        0.0078433 , 0.00771999,

In [78]:
df = pd.DataFrame(reg.cv_results_)

In [79]:
display(df)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_leaf_size,param_n_neighbors,param_p,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.009199,0.000452,0.028787,0.003744,30,3,2,uniform,"{'leaf_size': 30, 'n_neighbors': 3, 'p': 2, 'w...",0.962121,0.961562,0.963736,0.959446,0.962789,0.961931,0.001438,99
1,0.008671,0.000470,0.028318,0.001865,30,3,2,distance,"{'leaf_size': 30, 'n_neighbors': 3, 'p': 2, 'w...",0.963916,0.963198,0.964869,0.961171,0.964415,0.963514,0.001296,15
2,0.009542,0.000807,0.280939,0.019067,30,3,3,uniform,"{'leaf_size': 30, 'n_neighbors': 3, 'p': 3, 'w...",0.961174,0.960531,0.962032,0.958558,0.963103,0.961080,0.001528,148
3,0.009209,0.000731,0.290408,0.019942,30,3,3,distance,"{'leaf_size': 30, 'n_neighbors': 3, 'p': 3, 'w...",0.963137,0.962365,0.963347,0.960141,0.964702,0.962738,0.001501,71
4,0.009322,0.000522,0.042043,0.001964,30,3,1,uniform,"{'leaf_size': 30, 'n_neighbors': 3, 'p': 1, 'w...",0.964388,0.961101,0.963796,0.958054,0.961030,0.961674,0.002268,120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,0.008359,0.000163,0.051243,0.001893,20,15,2,distance,"{'leaf_size': 20, 'n_neighbors': 15, 'p': 2, '...",0.958536,0.955132,0.958782,0.953495,0.960801,0.957349,0.002651,253
332,0.008759,0.000145,0.464264,0.006411,20,15,3,uniform,"{'leaf_size': 20, 'n_neighbors': 15, 'p': 3, '...",0.946010,0.946307,0.949023,0.943125,0.952902,0.947473,0.003295,323
333,0.008563,0.000149,0.464077,0.006179,20,15,3,distance,"{'leaf_size': 20, 'n_neighbors': 15, 'p': 3, '...",0.958122,0.954419,0.957790,0.952747,0.960437,0.956703,0.002757,267
334,0.008424,0.000231,0.068248,0.000897,20,15,1,uniform,"{'leaf_size': 20, 'n_neighbors': 15, 'p': 1, '...",0.946426,0.943675,0.948937,0.942733,0.951304,0.946615,0.003198,330


In [667]:
# best estimator parameters

reg.best_estimator_

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=4, p=2,
                    weights='distance')

In [668]:
reg.best_score_

0.9740370452839882

In [669]:
# the best grid parameters

reg.best_params_

{'leaf_size': 30, 'n_neighbors': 4, 'p': 2, 'weights': 'distance'}

In [670]:
# make the prediction

test_out = reg.predict(test_in)

In [671]:
# regression on the full dataset - this is the inverse mapping to the PCA data
# for dimension 1 use the first line, otherwise the second

#whole_out = reg.predict(datasets[1].reshape(-1,1))
whole_out = reg.predict(datasets[10])

In [672]:
whole_out.shape

(14098, 20)

In [673]:
# store reconstructed PCA values to csv files

np.savetxt("isomap_o_dim10_reconstructed_pca.csv", whole_out, delimiter=";")

## All the below fragment was a draught

In [1353]:
diff = test_out - test_true
abs_diff = np.apply_along_axis(np.linalg.norm, 1, diff, ord=2)
abs_true = np.apply_along_axis(np.linalg.norm, 1, test_true, ord=2)
#max_ = max(abs_true)
#min_ = min(abs_true)
mae = np.mean(abs_diff/abs_true)

In [1354]:
mae

0.1453009936899391

In [672]:
#max_

1.1278615245477308

In [673]:
#min_

0.08463281201350935

In [761]:
# Create the list and then comment this line
#mae_isomap = []

In [1238]:
# Add till all the dimensions are estimated
mae_isomap.append(mae)

In [1356]:
#mae_isomap

[0.5590120270598874,
 0.36197127567478454,
 0.26252150659639933,
 0.21638834747217764,
 0.17131891040044125,
 0.14965449334733436,
 0.1453009936899391,
 0.13813865586050603,
 0.13073217895700262,
 0.1290491839141481]

In [1355]:
mae_isomap[6] = mae

In [1357]:
x = np.linspace(1, 10, 10,endpoint=True)

In [1358]:
y = np.take(mae_isomap, range(0,10))

In [1]:
plt.plot(x, y, '-ok')

NameError: name 'plt' is not defined

In [103]:
#df.to_csv('grid_search_knn_2.csv', index=False, sep=';')