In [17]:
import math
import os
import random as rnd
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
from sklearn.manifold import Isomap, LocallyLinearEmbedding, TSNE
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

from IPython.display import display, HTML

%matplotlib inline
import matplotlib.pyplot as plt

from utils import project, cross_val_score_custom, create_clustering_pivot_table, clustering, plot_proj_clustering, MF2PCA2ORIG

In [10]:
# Directory that holds the dataset CSV files
root = './separate_datasets_data/'

# Keep only the CSV files: os.listdir also returns sub-directories and hidden
# entries (for example `.ipynb_checkpoints`), which the loading step cannot parse.
paths = [name for name in os.listdir(root) if name.endswith('.csv')]

In [11]:
paths

['ptb_proj_f.csv',
 'AGP_proj_o.csv',
 't2d_proj_f.csv',
 'ibd_proj_o.csv',
 't2d_proj_g.csv',
 'ptb_proj_g.csv',
 'ptb_proj_o.csv',
 'AGP_proj_f.csv',
 'AGP_proj_g.csv',
 'ibd_proj_f.csv',
 't2d_proj_o.csv',
 'ibd_proj_g.csv']

In [12]:
# Load every file listed in `paths` (';'-separated values) into a dict keyed by
# the file name without its extension.
# os.path.splitext strips only the final extension, so names that contain extra
# dots (e.g. "ibd.v2.csv") keep distinct keys instead of collapsing to the text
# before the first dot and silently overwriting each other.
datasets = {
    os.path.splitext(path)[0]: np.genfromtxt(os.path.join(root, path), delimiter=';')
    for path in paths
}

In [14]:
def mape(y, y_pred):
    """Mean relative L2 error between true and predicted samples.

    For every sample (row) the Euclidean norm of the residual ``y - y_pred``
    is divided by the Euclidean norm of the true row of ``y``; the mean of
    these per-sample ratios is returned.

    Parameters
    ----------
    y : array-like of shape (n_samples, n_outputs)
        True values.
    y_pred : array-like of shape (n_samples, n_outputs)
        Predicted values, broadcast-compatible with ``y``.

    Returns
    -------
    float
        Mean over the samples of ``||y_i - y_pred_i||_2 / ||y_i||_2``.

    Notes
    -----
    A row of ``y`` whose norm is zero leads to a division by zero, so the
    result is ``inf`` or ``nan`` in that case (numpy emits a RuntimeWarning).
    """
    y = np.asarray(y)
    y_pred = np.asarray(y_pred)
    # Row-wise norms computed in one vectorised call instead of a Python-level
    # loop over the rows via np.apply_along_axis.
    residual_norms = np.linalg.norm(y - y_pred, ord=2, axis=1)
    true_norms = np.linalg.norm(y, ord=2, axis=1)
    return np.mean(residual_norms / true_norms)

In [9]:
# Sweep over Isomap embedding dimensions for every loaded dataset and collect
# per-dataset results in `results`.
# NOTE(review): this cell is unfinished and cannot run as written -- the
# `cv_results = ` assignment in the inner loop has no right-hand side
# (SyntaxError), and the two MAE lists are never appended to nor stored in
# `results`. Presumably `cross_val_score_custom` / `MF2PCA2ORIG` imported from
# `utils` were meant to be called here -- TODO confirm and complete.
results = defaultdict(dict)
for dataset_name, data in datasets.items():
    # upper bound of the sweep: 10 or the number of columns, whichever is smaller
    MAX_DIM = min(10,data.shape[1])
    # NOTE(review): np.arange excludes its stop value, so this yields
    # 1 .. MAX_DIM-1 -- confirm that MAX_DIM itself is meant to be skipped.
    COMPONENTS_RANGE = np.arange(1,MAX_DIM)
    results[dataset_name]['range'] = COMPONENTS_RANGE
    MAE_mf2pca2orig_list = []
    MAE_mf2pca2orig_cv_list = []
    for n_components in COMPONENTS_RANGE:
        # Isomap embedding of the data with the current number of components
        dataset = Isomap(n_components=n_components).fit_transform(data)
        # TODO: incomplete statement -- the right-hand side is missing
        cv_results = 
        # bare names below are no-op expressions; nothing is appended yet
        MAE_mf2pca2orig_list
        MAE_mf2pca2orig_cv_list

In [11]:
# The fragment below should be evaluated for every dimension of interest, i.e. from 1 to 10.
# NOTE(review): `size`, `dataset` and `PCA_o` are taken from the kernel state;
# presumably `size` is the number of rows of `dataset` and `PCA_o` holds the PCA
# projections row-aligned with `dataset` -- confirm before running on a fresh kernel.

# Fixed seed so that the split, and therefore the scores computed from it, can be
# reproduced; a private RandomState leaves numpy's global random state untouched.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)

indexes = np.arange(size)
# ratio between the training part and the whole data
ratio = 0.8
# randomly choose the training indexes (sampling without replacement)
tr_indexes = split_rng.choice(size, math.floor(size * ratio), replace=False)
# check that there are no duplicates
assert np.unique(tr_indexes).size == tr_indexes.size
# all the other indexes are for testing; np.delete removes by position, which
# is the same as removing by value here because indexes[i] == i
test_indexes = np.delete(indexes, tr_indexes)
# check the sizes
assert test_indexes.size + tr_indexes.size == size
# randomly shuffle the testing indexes
split_rng.shuffle(test_indexes)
# inputs of the testing samples (rows of the embedded dataset)
test_in = np.take(dataset, test_indexes, axis=0)
# these are the true values for the testing samples
test_true = np.take(PCA_o, test_indexes, axis=0)
# inputs of the training samples (rows of the embedded dataset)
train_in = np.take(dataset, tr_indexes, axis=0)
# targets of the training samples - rows of PCA_o with the same indexes
train_out = np.take(PCA_o, tr_indexes, axis=0)

In [12]:
# Distance-weighted 5-nearest-neighbour regressor wrapped in MultiOutputRegressor
knn = MultiOutputRegressor(KNeighborsRegressor(weights='distance', n_neighbors=5))

# Bring both input arrays into the 2-D (n_samples, dim) layout; this matters for
# the 1-dimensional case, where the inputs would otherwise be flat vectors.
# NOTE(review): `dim` is not assigned in the cells shown here -- presumably the
# current embedding dimension; confirm.
train_in, test_in = train_in.reshape(-1, dim), test_in.reshape(-1, dim)

# Fit on the training split (kept as the last expression of the cell so that the
# fitted estimator is displayed)
knn.fit(train_in, train_out)

MultiOutputRegressor(estimator=KNeighborsRegressor(algorithm='auto',
                                                   leaf_size=30,
                                                   metric='minkowski',
                                                   metric_params=None,
                                                   n_jobs=None, n_neighbors=5,
                                                   p=2, weights='distance'),
                     n_jobs=None)

In [13]:
# predictions for the samples the regressor was fitted on (used for the training error)
train_out_pred = knn.predict(train_in)
# predictions for the held-out samples (used for the test error)
test_out = knn.predict(test_in)

In [14]:
def mape(y, y_pred):
    """Mean relative L2 error between true and predicted samples.

    For every sample (row) the Euclidean norm of the residual ``y - y_pred``
    is divided by the Euclidean norm of the true row of ``y``; the mean of
    these per-sample ratios is returned.

    Parameters
    ----------
    y : array-like of shape (n_samples, n_outputs)
        True values.
    y_pred : array-like of shape (n_samples, n_outputs)
        Predicted values, broadcast-compatible with ``y``.

    Returns
    -------
    float
        Mean over the samples of ``||y_i - y_pred_i||_2 / ||y_i||_2``.

    Notes
    -----
    A row of ``y`` whose norm is zero leads to a division by zero, so the
    result is ``inf`` or ``nan`` in that case (numpy emits a RuntimeWarning).
    """
    y = np.asarray(y)
    y_pred = np.asarray(y_pred)
    # Row-wise norms computed in one vectorised call instead of a Python-level
    # loop over the rows via np.apply_along_axis.
    residual_norms = np.linalg.norm(y - y_pred, ord=2, axis=1)
    true_norms = np.linalg.norm(y, ord=2, axis=1)
    return np.mean(residual_norms / true_norms)

In [15]:
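# Training error: mean over the training samples of ||residual|| / ||true value||.
# The value 0.0 is expected here: a K-NN regressor with weights='distance' reproduces the targets of its own training points.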
mape(train_out, train_out_pred)

0.0

In [16]:
# Test error: mean over the held-out samples of ||residual|| / ||true value||.
mape(test_true, test_out)

0.13066134830124734

# GridSearch

In [None]:
# K-NN regressor whose hyper-parameters are tuned by grid search
knn = KNeighborsRegressor()

# Grid parameters. `leaf_size` is deliberately not searched: per the scikit-learn
# documentation it only affects the speed and memory use of the tree-based
# neighbour search, not which neighbours are found, so sweeping its 7 values
# multiplied the number of fits by 7 without changing any score.
parameters = {
    'n_neighbors' : [3, 4, 5, 6, 7, 8, 10, 15],
    'weights' : ('uniform', 'distance'),
    'p' : [2, 3, 1],
}

# grid search with cross-validation
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# regressor's own default score, whereas the value reported below is `mape` --
# confirm that this mismatch is intended.
reg = GridSearchCV(knn, parameters)
# fit the regression via cross-validation
reg.fit(train_in, train_out)
# best estimator parameters (printed explicitly: a bare expression in the middle
# of a cell is never displayed)
print(reg.best_estimator_)
# make the prediction for the held-out samples
test_out = reg.predict(test_in)
print('MAPE', mape(test_true, test_out))
# Regression on the full dataset - this is the inverse mapping to the PCA data.
# The reshape leaves 2-D input unchanged and turns a flat 1-D embedding into a
# single-column matrix, so the same line works for every dimension.
whole_out = reg.predict(dataset.reshape(len(dataset), -1))
# store reconstructed PCA values to csv files
# np.savetxt(f"isomap_o_dim{dim}_reconstructed_pca.csv", whole_out, delimiter=";")