In [1]:
import numpy as np
import pandas as pd  
import random as rnd
import math
import os
import glob
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.base import BaseEstimator
from sklearn.metrics import make_scorer, r2_score
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.manifold import Isomap, LocallyLinearEmbedding, TSNE
from sklearn.decomposition import PCA
from IPython.display import display, HTML
from sklearn.model_selection import cross_val_score, cross_validate, KFold
from tqdm import tqdm_notebook
from collections import defaultdict
%matplotlib inline
import matplotlib.pyplot as plt

from utils import project

In [2]:
def cross_val_score_custom(est, X_mf, X_pca, X_orig, scoring, cv):
    """Cross-validate an estimator that is fit on one target but scored on another.

    For every fold, ``est`` is fit on ``(X_mf, X_pca)`` restricted to the
    training rows, asked to predict from the held-out rows of ``X_mf``, and the
    prediction is scored against the held-out rows of ``X_orig``.

    Parameters
    ----------
    est : object with ``fit(X, y)`` and ``predict(X)``
        Fit in place on every fold (it is not cloned), so after the call it
        holds the state of the last fold.
    X_mf, X_pca, X_orig : row-aligned arrays
        Must support indexing with an integer index array (``X[idx]``).
    scoring : callable
        Invoked as ``scoring(X_orig_test, prediction)`` once per fold.
    cv : int
        Number of folds for the shuffled ``KFold`` (fixed ``random_state=42``).

    Returns
    -------
    numpy.ndarray
        The values returned by ``scoring``, one entry per fold.
    """
    splitter = KFold(n_splits=cv, shuffle=True, random_state=42)
    fold_scores = []
    for fit_idx, eval_idx in splitter.split(X_mf):
        # Train on the (features -> PCA coordinates) pairs of this fold.
        est.fit(X_mf[fit_idx], X_pca[fit_idx])
        # Whatever `est.predict` returns is compared with the original-space rows.
        reconstructed = est.predict(X_mf[eval_idx])
        fold_scores.append(scoring(X_orig[eval_idx], reconstructed))
    return np.array(fold_scores)

In [3]:
class MF2PCA2ORIG(BaseEstimator):
    """Regressor whose predictions are mapped back through a PCA inverse transform.

    ``fit`` trains ``mo_regressor`` on ``(X, y)``; ``predict`` runs the
    regressor and passes its output to ``pca_module.inverse_transform``.

    Parameters
    ----------
    pca_module : object with ``inverse_transform``
        Used as-is; this class never fits it, so it must already be fitted.
    mo_regressor : object with ``fit(X, y)`` and ``predict(X)``
        Fit in place by ``fit``.

    Notes
    -----
    ``sklearn.base.clone`` rebuilds nested estimators from their parameters
    only, so a clone of this object would presumably carry an *unfitted* PCA;
    avoid utilities that clone the estimator (e.g. ``cross_val_score``).
    """

    def __init__(self, pca_module, mo_regressor):
        super().__init__()
        # BaseEstimator.get_params()/set_params()/clone()/repr read attributes
        # named exactly like the constructor arguments, so the PCA has to be
        # stored as `pca_module` (storing it only as `pca` makes them fail).
        self.pca_module = pca_module
        self.mo_regressor = mo_regressor

    @property
    def pca(self):
        """Backward-compatible alias for ``pca_module``."""
        return self.pca_module

    @pca.setter
    def pca(self, value):
        self.pca_module = value

    def fit(self, X, y):
        """Fit the wrapped regressor on ``(X, y)`` and return ``self``."""
        self.mo_regressor.fit(X, y)
        return self

    def predict(self, X):
        """Predict with the regressor, then apply the PCA inverse transform."""
        y_pred = self.mo_regressor.predict(X)
        return self.pca_module.inverse_transform(y_pred)

In [4]:
# Paths of every entry in the projections folder, bucketed by the
# underscore-separated tokens of each file name (text before the first dot).
root_pca = './separate_datasets_data'
paths = [os.path.join(root_pca, path) for path in os.listdir(root_pca)]


def _name_tokens(path):
    """Return the '_'-separated tokens of the file name, up to its first dot."""
    # os.path.basename honours the platform separator, unlike splitting on "/".
    return os.path.basename(path).split(".")[0].split('_')


# NOTE(review): `paths_pca` is "everything without an 'err' token", so it also
# contains the 'lle' and 'isomap' files selected below -- confirm that is intended.
paths_pca = [p for p in paths if 'err' not in _name_tokens(p)]
paths_mf_lle = [p for p in paths if 'lle' in _name_tokens(p)]
paths_mf_isomap = [p for p in paths if 'isomap' in _name_tokens(p)]

In [5]:
def mae_score(y, y_pred):
    """Relative L1 error of ``y_pred`` with respect to ``y``.

    Computes the L1 norm of every row of the residual ``y_pred - y`` and of
    ``y``, averages each set of norms over the rows, and returns the ratio
    ``mean(residual norms) / mean(reference norms)``.

    Both inputs must be 2-D (the norms are taken along ``axis=1``). Despite
    the name, the value is normalised by the magnitude of ``y``.
    """
    residual_norms = np.linalg.norm(y_pred - y, axis=1, ord=1)
    reference_norms = np.linalg.norm(y, axis=1, ord=1)
    return residual_norms.mean() / reference_norms.mean()

# Scorer objects for scikit-learn's cross-validation helpers.
# R^2: larger is better, reported as-is.
scorer = make_scorer(
    score_func=r2_score,
    greater_is_better=True,
)
# Relative L1 error: smaller is better, so scikit-learn sign-flips it --
# values produced through `scorer_mae` are the negated `mae_score`.
scorer_mae = make_scorer(
    score_func=mae_score,
    greater_is_better=False,
)

In [6]:
# Base learner: k-nearest-neighbours regression over 12 neighbours, each
# weighted by the inverse of its distance to the query point.
knn = KNeighborsRegressor(
    weights='distance',
    n_neighbors=12,
)
# Wrapper that fits one copy of the base learner per target column.
mo_knn = MultiOutputRegressor(estimator=knn)

In [None]:
# Result containers, keyed by the label "<folder>_proj_<tax>".
# The defaultdict lists receive one mean cross-validation score per Isomap
# dimensionality tried for that label.
res_mf2pca_r2 = defaultdict(list)
# NOTE: filled through `scorer_mae` (greater_is_better=False), so these values
# are the *negated* relative L1 error, unlike `res_mf2pca2orig_mae` below.
res_mf2pca_mae = defaultdict(list)

# Relative L1 error of the PCA round trip alone (project, then inverse).
res_inv_pca_mae = {}

res_mf2pca2orig_r2 = defaultdict(list)
res_mf2pca2orig_mae = defaultdict(list)


def _r2_and_mae(y_true, y_pred):
    """Score one fold with both metrics so each fold is fit/predicted once."""
    return r2_score(y_true, y_pred), mae_score(y_true, y_pred)


# Dataset folders are the three-character entries of the working directory
# (e.g. "AGP"). Skip plain files matching the pattern and sort so the
# processing order does not depend on the filesystem.
folders = sorted(path for path in glob.glob('???') if os.path.isdir(path))
for f in tqdm_notebook(folders):
    for t in ['o', 'f', 'g']:
        print(f'DATASET: {f}, TAX: {t}')
        label = f'{f}_proj_{t}'
        dataframe = pd.read_csv(f'{f}/pivot_{t}_normalized.csv', skipinitialspace=True, sep=';', engine='python')
        dataframe = dataframe.drop('Unnamed: 0', axis=1)
        data = dataframe.values
        data_projected, pca = project(data, plot=False)
        data_projected_inverse = pca.inverse_transform(data_projected)

        # mae_score(y, y_pred) normalises by its first argument, so the
        # original data (ground truth) goes first and the reconstruction second.
        res_inv_pca_mae[label] = mae_score(data, data_projected_inverse)
        est_composed = MF2PCA2ORIG(pca, mo_knn)

        # Embedding sizes to try: 2, 6, 10, ... below the PCA dimensionality.
        dims = np.arange(2, data_projected.shape[1], 4)

        for n in tqdm_notebook(dims):
            # NOTE(review): the embedding is fit on all rows before the CV
            # split, so held-out rows influence the features -- confirm intended.
            transformer = Isomap(n_components=n, n_neighbors=12)
            dataset_transformed = transformer.fit_transform(data_projected)

            # Embedding -> PCA coordinates. One multi-metric run replaces two
            # separate cross_val_score calls over the same deterministic folds.
            cv_results_mf2pca = cross_validate(
                mo_knn, dataset_transformed, data_projected,
                scoring={'r2': scorer, 'mae': scorer_mae}, cv=5, n_jobs=-1,
            )
            cv_results_r2_mf2pca = cv_results_mf2pca['test_r2']
            cv_results_mae_mf2pca = cv_results_mf2pca['test_mae']

            # Embedding -> PCA coordinates -> original space. The scoring
            # callable returns (r2, mae), giving an array of shape (folds, 2).
            cv_results_mf2orig = cross_val_score_custom(
                est_composed, dataset_transformed, data_projected, data,
                _r2_and_mae, cv=5,
            )
            cv_results_r2_mf2orig = cv_results_mf2orig[:, 0]
            cv_results_mae_mf2orig = cv_results_mf2orig[:, 1]

            res_mf2pca_r2[label].append(cv_results_r2_mf2pca.mean())
            res_mf2pca_mae[label].append(cv_results_mae_mf2pca.mean())

            res_mf2pca2orig_r2[label].append(cv_results_r2_mf2orig.mean())
            res_mf2pca2orig_mae[label].append(cv_results_mae_mf2orig.mean())


HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

DATASET: AGP, TAX: o


HBox(children=(IntProgress(value=0, max=4), HTML(value='')))


DATASET: AGP, TAX: f


HBox(children=(IntProgress(value=0, max=9), HTML(value='')))