In [None]:
import itertools
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn.datasets as skd
import warnings
from sklearn import preprocessing
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import (
    train_test_split, StratifiedShuffleSplit, 
    ShuffleSplit, KFold, StratifiedKFold,
)

import kditransform

In [None]:
def get_cahousing():
    """Fetch the California housing data via scikit-learn.

    Returns:
        A ``(X, y)`` tuple: the feature matrix and the regression target,
        exactly as provided by ``sklearn.datasets.fetch_california_housing``.
    """
    # return_X_y=True makes the loader hand back (data, target) directly.
    features, target = skd.fetch_california_housing(return_X_y=True)
    return features, target

def get_abalone(path="abalone/abalone.data"):
    """Load the Abalone data file as a feature matrix and a ring-count target.

    The file is read as a header-less CSV with nine columns. The categorical
    ``Sex`` column (codes ``'M'``, ``'F'``, ``'I'``) is expanded into three
    0/1 float indicator columns, which are placed before the seven numeric
    measurement columns.

    Args:
        path: Location of the header-less CSV file. Defaults to the relative
            path ``abalone/abalone.data`` (resolved against the current
            working directory).

    Returns:
        A ``(X, y)`` tuple where ``X`` is a 2-D array with columns
        ``SexM, SexF, SexI, Length, Diameter, Height, Whole weight,
        Shucked weight, Viscera weight, Shell weight`` and ``y`` is the 1-D
        array of ``Rings`` values.

    Raises:
        ValueError: If the file does not have exactly nine columns (raised by
            pandas when the column names are assigned).
    """
    measurement_cols = [
        'Length', 'Diameter', 'Height', 'Whole weight',
        'Shucked weight', 'Viscera weight', 'Shell weight',
    ]
    data = pd.read_csv(path, header=None)
    # Assigning names after reading (rather than via ``names=``) makes a file
    # with an unexpected number of columns fail loudly.
    data.columns = ['Sex'] + measurement_cols + ['Rings']

    # One indicator column per sex code, stored as float so X is all-float.
    sex_cols = []
    for code in ('M', 'F', 'I'):
        col = f'Sex{code}'
        data[col] = (data['Sex'] == code).astype(float)
        sex_cols.append(col)

    X = data[sex_cols + measurement_cols].values
    y = data['Rings'].values
    return X, y

In [None]:
# --- Experiment configuration ------------------------------------------------
n_random = 100          # number of outer train/test resamplings per dataset
n_alphas = 20           # number of KD-integral bandwidth factors on the grid
n_splits = 30           # inner shuffle-splits used to select the bandwidth factor
outer_test_size = 0.3   # fraction of each dataset held out as the test set
inner_test_size = 0.3   # fraction of the training set held out per inner split

# Log-spaced bandwidth-factor grid and outer-split seeds. Both are the same
# for every dataset, so they are built once here. `alphas` is also read by the
# plotting cell.
alphas = list(np.geomspace(0.1, 10, n_alphas))
random_states = list(range(n_random))

datasets = [
    ("cahousing", get_cahousing()),
    ("abalone", get_abalone()),
]
# (transformer class, display name); each class is instantiated with defaults.
methods = [
    (preprocessing.MinMaxScaler, 'min-max'),
    (preprocessing.QuantileTransformer, 'quantile'),
    (kditransform.KDITransformer, 'KD-integral'),
]
cv_name = 'KD-integral (bwf=CV)'
method_names = [mname for _, mname in methods] + [cv_name]

# test_dict[dataset][method]['linr'][rix]  -> test RMSE for outer split rix
# alpha_dict[dataset]['linr'][rix, aix]    -> test RMSE for outer split rix
#                                             with bandwidth factor alphas[aix]
test_dict = {
    dname: {mname: {'linr': np.zeros(n_random)} for mname in method_names}
    for dname, _ in datasets
}
alpha_dict = {
    dname: {'linr': np.zeros((n_random, n_alphas))}
    for dname, _ in datasets
}


def _rmse(model, X, y):
    """Return the root-mean-squared error of ``model.predict(X)`` against ``y``."""
    return np.sqrt(metrics.mean_squared_error(y, model.predict(X)))


for dname, (X_all, y_all) in datasets:
    print(dname, X_all.shape)
    for rix, rstate in enumerate(random_states):
        X_train, X_test, y_train, y_test = train_test_split(
            X_all, y_all, test_size=outer_test_size, random_state=rstate)

        # 1) Default-parameter preprocessors: fit on train, score on test.
        for preprocessor, pname in methods:
            # Warnings from the preprocessors are silenced, as in the original
            # cell (presumably to keep the output readable -- which warnings
            # are expected is not visible here).
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                prepper = preprocessor().fit(X_train)
                X_train_prepped = prepper.transform(X_train)
                X_test_prepped = prepper.transform(X_test)
            linrer = LinearRegression().fit(X_train_prepped, y_train)
            test_dict[dname][pname]['linr'][rix] = _rmse(linrer, X_test_prepped, y_test)

        # 2) Inner shuffle-split on the training set to choose the bandwidth factor.
        sss = ShuffleSplit(n_splits=n_splits, test_size=inner_test_size, random_state=rstate)
        rmse_allsplits_linr = np.zeros((n_splits, len(alphas)))
        for sssi, (trtr_index, trdev_index) in enumerate(sss.split(X_train, y_train)):
            X_trtr, y_trtr = X_train[trtr_index, :], y_train[trtr_index]
            X_trdev, y_trdev = X_train[trdev_index, :], y_train[trdev_index]
            for aix, alpha in enumerate(alphas):
                # Fit the transformer on the inner-training rows only, so the
                # dev rows used for selection stay unseen by the preprocessing
                # step (fitting on all of X_train would leak them).
                prepper = kditransform.KDITransformer(alpha=alpha).fit(X_trtr)
                linrer = LinearRegression().fit(prepper.transform(X_trtr), y_trtr)
                rmse_allsplits_linr[sssi, aix] = _rmse(
                    linrer, prepper.transform(X_trdev), y_trdev)
        rmse_splits_linr = np.mean(rmse_allsplits_linr, axis=0)
        best_aix = np.argmin(rmse_splits_linr)  # selected grid index for this outer split
        print(rix, best_aix)

        # 3) Test RMSE for every bandwidth factor (refit on the full training set).
        for aix, alpha in enumerate(alphas):
            prepper = kditransform.KDITransformer(alpha=alpha).fit(X_train)
            X_train_prepped = prepper.transform(X_train)
            X_test_prepped = prepper.transform(X_test)
            linrer = LinearRegression().fit(X_train_prepped, y_train)
            alpha_dict[dname]['linr'][rix, aix] = _rmse(linrer, X_test_prepped, y_test)

        # The CV-selected variant reuses the test score of the chosen grid point.
        test_dict[dname][cv_name]['linr'][rix] = alpha_dict[dname]['linr'][rix, best_aix]

In [None]:
# One figure per dataset. Each preprocessing method is drawn as a horizontal
# line (mean test rMSE with standard-error bars) spanning the bandwidth-factor
# range, and the per-bandwidth-factor KD-integral curve is overlaid.
# Reads `test_dict`, `alpha_dict`, `alphas` and `n_random` from the experiment cell.
for dname, per_method in test_dict.items():
    marker_cycle = itertools.cycle(('^', 's', '*', 'o', '.'))
    color_cycle = itertools.cycle(('red', 'blue', 'purple', 'magenta', 'green'))
    bwf_lo, bwf_hi = np.min(alphas), np.max(alphas)
    bwf_span = np.array([bwf_lo, bwf_hi])
    fig, ax = plt.subplots(figsize=(4, 3), dpi=150)

    for pname, scores in per_method.items():
        rmse_runs = scores['linr']
        mean_rmse = rmse_runs.mean()
        # Standard error of the mean across the outer resamplings.
        sem_rmse = rmse_runs.std(ddof=1) / np.sqrt(n_random)
        if pname == 'KD-integral':
            legend_label = f'{pname} (bwf=1)'
        else:
            legend_label = pname
        _, caps, _ = ax.errorbar(
            bwf_span, np.full(2, mean_rmse), yerr=np.full(2, sem_rmse),
            marker=next(marker_cycle), c=next(color_cycle),
            markersize=10, alpha=0.5, capsize=4,
            label=legend_label)
        for cap in caps:
            cap.set_markeredgewidth(2)

    # Per-bandwidth-factor curve: mean and standard error over the outer resamplings.
    curve_runs = alpha_dict[dname]['linr']
    curve_mean = curve_runs.mean(axis=0)
    curve_sem = curve_runs.std(axis=0, ddof=1) / np.sqrt(n_random)
    ax.errorbar(
        alphas, curve_mean, yerr=curve_sem,
        marker=next(marker_cycle), c=next(color_cycle),
        markersize=2, alpha=0.5, capsize=2,
        label='KD-integral')

    ax.set_xlabel('KD-integral bandwidth factor')
    ax.set_xscale('log')
    ax.set_ylabel('rMSE')
    ax.set_xlim(bwf_lo, bwf_hi)

    # Save once without a legend, then again with it. The title is added only
    # after both saves, so it shows in the notebook but not in the PDFs.
    fig.savefig(f'MSE-vs-bwf-{dname}-linr-nolegend.pdf', bbox_inches='tight')
    ax.legend(loc='lower center')
    fig.savefig(f'MSE-vs-bwf-{dname}-linr.pdf', bbox_inches='tight')
    ax.set_title(f'{dname} linr')