In [None]:
import itertools
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
from sklearn import preprocessing
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.model_selection import (
    train_test_split, StratifiedShuffleSplit, 
    ShuffleSplit, KFold, StratifiedKFold,
)

import kdquantile

In [None]:
def get_wine():
    """Load a two-feature slice of the wine data from ``wine_data.csv``.

    The file is read from the current working directory without a header
    row, and only its first three columns are loaded. Following the column
    naming used for this file, these are 'Class label', 'Alcohol' and
    'Malic acid'; the remaining columns ('Ash', 'Ash alcalinity',
    'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
    'Proanthocyanins', 'Color intensity', 'Hue',
    'OD280/OD315 of diluted wines', 'Proline') are not read.

    Returns
    -------
    X_wine : ndarray of shape (n_samples, 2)
        File columns 1 and 2.
    y_wine : ndarray of shape (n_samples,)
        File column 0.
    """
    # Public pandas entry point, matching how get_sonar reads its file.
    df = pd.read_csv('wine_data.csv', header=None, usecols=[0, 1, 2])
    values = df.values
    X_wine = values[:, 1:]
    y_wine = values[:, 0]
    return X_wine, y_wine

def get_iris():
    """Load seaborn's 'iris' dataset and split it by column position.

    Returns
    -------
    X_iris : ndarray
        The first four columns of the frame (used as features).
    y_iris : ndarray
        The fifth column of the frame (used as the class label).
    """
    # NOTE(review): the frame is converted as a whole, so the arrays are
    # presumably object-dtype if the label column is non-numeric - confirm.
    table = sns.load_dataset('iris').values
    X_iris, y_iris = table[:, 0:4], table[:, 4]
    return X_iris, y_iris

def get_penguins():
    """Load seaborn's 'penguins' dataset as four measurements plus species.

    Rows with a missing value in *any* column of the dataset are removed
    before the feature columns are selected.

    Returns
    -------
    X_penguins : ndarray of shape (n_samples, 4)
        'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g'.
    y_penguins : ndarray of shape (n_samples,)
        Values of the 'species' column.
    """
    feature_names = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
    complete_rows = sns.load_dataset('penguins').dropna()
    X_penguins = complete_rows[feature_names].values
    # Single-column frame -> (n, 1) array -> flattened to 1-D labels.
    y_penguins = complete_rows[['species']].values.flatten()
    return X_penguins, y_penguins

def get_hawks():
    """Load five measurements and the species label from ``Hawks.csv``.

    The file is read from the current working directory (downloaded from
    https://vincentarelbundock.github.io/Rdatasets/csv/Stat2Data/Hawks.csv).
    Only the feature columns and 'Species' are kept, and rows with a missing
    value in any of those six columns are dropped; gaps in other columns of
    the file do not remove a row.

    Returns
    -------
    X_hawks : ndarray of shape (n_samples, 5)
        'Wing', 'Weight', 'Culmen', 'Hallux', 'Tail'.
    y_hawks : ndarray of shape (n_samples,)
        Values of the 'Species' column.
    """
    columns = ['Wing', 'Weight', 'Culmen', 'Hallux', 'Tail']
    # Public pandas entry point, matching how get_sonar reads its file.
    df = pd.read_csv('Hawks.csv')
    df = df[columns + ['Species']].dropna()
    X_hawks = df[columns].values
    y_hawks = df[['Species']].values.flatten()
    return X_hawks, y_hawks

def get_sonar():
    """Load ``sonar.csv`` (no header row) as float features and integer labels.

    Every column except the last is treated as a feature; the last column is
    the class label.

    Returns
    -------
    X_full : ndarray of dtype float32
        All columns but the last.
    y_full : ndarray
        Last column, cast to str and encoded with ``LabelEncoder``.
    """
    raw = pd.read_csv("sonar.csv", header=None).values
    # Features: everything left of the final column, forced to float32.
    X_full = raw[:, :-1].astype('float32')
    # Labels: final column, stringified and then mapped to integer codes.
    encoder = preprocessing.LabelEncoder()
    y_full = encoder.fit_transform(raw[:, -1].astype('str'))
    return X_full, y_full

In [None]:
# --- Experiment configuration ------------------------------------------------
n_random = 2           # outer train/test resamplings (random_state = 0 .. n_random-1)
n_alphas = 10          # number of bandwidth factors on the grid
n_splits = 30          # inner ShuffleSplit repetitions used to select the bandwidth factor
outer_test_size = 0.3  # fraction of each dataset held out as the test set
inner_test_size = 0.3  # fraction of the training set held out as the dev set

# Bandwidth-factor grid (log-spaced). Defined once at cell level because it
# does not depend on the dataset and the plotting cell reads `alphas` too.
alphas = list(np.geomspace(0.1, 10, n_alphas))
random_states = list(range(n_random))

datasets = [
    ("wine", get_wine()),
    ("iris", get_iris()),
    ("penguins", get_penguins()),
    ("hawks", get_hawks()),
    ("sonar", get_sonar()),
]
methods = [
    (preprocessing.MinMaxScaler, 'min-max'),
    (preprocessing.QuantileTransformer, 'quantile'),
    (kdquantile.KDQuantileTransformer, 'KD-quantile'),
]
cv_name = 'KD-quantile (bwf=CV)'
method_names = [mname for _, mname in methods] + [cv_name]


def _score_classifiers(X_fit, y_fit, X_eval, y_eval):
    """Score the two downstream classifiers on already-preprocessed data.

    Parameters
    ----------
    X_fit, y_fit : preprocessed features / labels used to fit PCA and both classifiers.
    X_eval, y_eval : preprocessed features / labels the accuracies are computed on.

    Returns
    -------
    (acc_pc, acc_knn) : accuracy of GaussianNB on the first two principal
        components, and accuracy of a default KNeighborsClassifier on the
        full preprocessed features.
    """
    # PCA + GaussianNB
    pcaer = PCA(n_components=2).fit(X_fit)
    gnb_pc = GaussianNB().fit(pcaer.transform(X_fit), y_fit)
    acc_pc = metrics.accuracy_score(y_eval, gnb_pc.predict(pcaer.transform(X_eval)))
    # KNN
    knner = KNeighborsClassifier().fit(X_fit, y_fit)
    acc_knn = metrics.accuracy_score(y_eval, knner.predict(X_eval))
    return acc_pc, acc_knn


# Result containers.
#   test_dict[dataset][method]['pc' | 'knn']  -> test accuracy per outer resampling
#   alpha_dict[dataset]['pc' | 'knn']         -> test accuracy per (resampling, alpha)
test_dict = {}
alpha_dict = {}
for dname, _ in datasets:
    test_dict[dname] = {
        mname: {'pc': np.zeros(n_random), 'knn': np.zeros(n_random)}
        for mname in method_names
    }
    alpha_dict[dname] = {'pc': np.zeros((n_random, n_alphas)), 'knn': np.zeros((n_random, n_alphas))}

for dname, (X_all, y_all) in datasets:
    for rix, rstate in enumerate(random_states):
        X_train, X_test, y_train, y_test = train_test_split(
            X_all, y_all, test_size=outer_test_size, random_state=rstate)

        # 1) Baselines: every preprocessor with its default settings.
        for preprocessor, pname in methods:
            # Warnings raised while fitting/applying the preprocessors are silenced.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                prepper = preprocessor().fit(X_train)
                X_train_prepped = prepper.transform(X_train)
                X_test_prepped = prepper.transform(X_test)
            acc_test_pc, acc_test_knn = _score_classifiers(
                X_train_prepped, y_train, X_test_prepped, y_test)
            test_dict[dname][pname]['pc'][rix] = acc_test_pc
            test_dict[dname][pname]['knn'][rix] = acc_test_knn

        # 2) Select the bandwidth factor by repeated random splits of the training set.
        # NOTE(review): this is a plain (non-stratified) ShuffleSplit, so the
        # labels passed to split() do not influence the partition;
        # StratifiedShuffleSplit is imported but unused - confirm which is intended.
        sss = ShuffleSplit(n_splits=n_splits, test_size=inner_test_size, random_state=rstate)
        acc_allsplits_pc = np.zeros((n_splits, len(alphas)))
        acc_allsplits_knn = np.zeros((n_splits, len(alphas)))
        for sssi, (trtr_index, trdev_index) in enumerate(sss.split(X_train, y_train)):
            X_trtr = X_train[trtr_index, :]
            y_trtr = y_train[trtr_index]
            X_trdev = X_train[trdev_index, :]
            y_trdev = y_train[trdev_index]
            for aix, alpha in enumerate(alphas):
                # Fit the transformer (and, inside the helper, PCA) on the
                # inner-training rows only. Fitting them on all of X_train
                # would let the dev rows that are scored below shape the
                # preprocessing, which biases the dev accuracy and does not
                # mirror how the held-out test set is treated in step 3.
                prepper = kdquantile.KDQuantileTransformer(alpha=alpha).fit(X_trtr)
                X_trtr_prepped = prepper.transform(X_trtr)
                X_trdev_prepped = prepper.transform(X_trdev)
                acc_allsplits_pc[sssi, aix], acc_allsplits_knn[sssi, aix] = _score_classifiers(
                    X_trtr_prepped, y_trtr, X_trdev_prepped, y_trdev)
        acc_splits_pc = np.mean(acc_allsplits_pc, axis=0)
        acc_splits_knn = np.mean(acc_allsplits_knn, axis=0)
        # Index of the best mean dev accuracy (first index wins ties).
        best_aix_pc = np.argmax(acc_splits_pc)
        best_aix_knn = np.argmax(acc_splits_knn)
        print(best_aix_pc, best_aix_knn)

        # 3) Test accuracy for every bandwidth factor; the entry at the
        #    CV-selected index is also recorded as the 'bwf=CV' method.
        for aix, alpha in enumerate(alphas):
            prepper = kdquantile.KDQuantileTransformer(alpha=alpha).fit(X_train)
            X_train_prepped = prepper.transform(X_train)
            X_test_prepped = prepper.transform(X_test)
            acc_test_pc, acc_test_knn = _score_classifiers(
                X_train_prepped, y_train, X_test_prepped, y_test)
            alpha_dict[dname]['pc'][rix, aix] = acc_test_pc
            alpha_dict[dname]['knn'][rix, aix] = acc_test_knn
            if aix == best_aix_pc:
                test_dict[dname][cv_name]['pc'][rix] = acc_test_pc
            if aix == best_aix_knn:
                test_dict[dname][cv_name]['knn'][rix] = acc_test_knn

In [None]:
def _plot_accuracy_vs_bwf(dname, key, first_marker, file_tag, title_tag, save_nolegend):
    """Draw and save one accuracy-vs-bandwidth-factor figure.

    Each entry of ``test_dict[dname]`` is drawn as a horizontal line with
    standard-error bars spanning the bandwidth-factor range, and the per-alpha
    test accuracies from ``alpha_dict[dname]`` are drawn as a curve over
    ``alphas``.

    Parameters
    ----------
    dname : dataset name (key of ``test_dict`` / ``alpha_dict``).
    key : 'pc' or 'knn', selecting which classifier's accuracies to plot.
    first_marker : marker for the first method; the rest are 's', '*', 'o', '.'.
    file_tag : suffix used in the saved PDF file names.
    title_tag : suffix used in the on-screen title.
    save_nolegend : if True, also save a copy of the figure before the legend is added.
    """
    markers = itertools.cycle((first_marker, 's', '*', 'o', '.'))
    colors = itertools.cycle(('red', 'blue', 'purple', 'magenta', 'green'))
    # The two x positions between which the per-method horizontal lines are drawn.
    alph = np.array([np.min(alphas), np.max(alphas)])

    # Explicit figure/axes so nothing depends on pyplot's "current figure".
    fig, ax = plt.subplots(figsize=(4, 3), dpi=150)
    for pname in test_dict[dname]:
        scores = test_dict[dname][pname][key]
        tmean = scores.mean()
        # Standard error of the mean over the outer resamplings.
        tstd = np.std(scores, ddof=1) / np.sqrt(len(scores))
        (_, caps, _) = ax.errorbar(
            alph, tmean*np.ones(2), yerr=tstd*np.ones(2),
            marker=next(markers), c=next(colors), markersize=10, alpha=0.5, capsize=4,
            label=f'{pname} (bwf=1)' if pname == 'KD-quantile' else pname)
        for cap in caps:
            cap.set_markeredgewidth(2)

    curve = alpha_dict[dname][key]
    amean = curve.mean(axis=0)
    astd = np.std(curve, axis=0, ddof=1) / np.sqrt(curve.shape[0])
    ax.errorbar(
        alphas, amean, yerr=astd,
        marker=next(markers), c=next(colors), markersize=2, alpha=0.5, capsize=2,
        label='KD-quantile')

    ax.set_xlabel('KD-quantile bandwidth factor')
    ax.set_ylabel('Accuracy')
    ax.set_xscale('log')
    ax.set_xlim(np.min(alphas), np.max(alphas))
    if save_nolegend:
        fig.savefig(f'Accuracy-vs-bwf-{dname}-{file_tag}-nolegend.pdf', bbox_inches='tight')
    ax.legend(loc='lower center')
    fig.savefig(f'Accuracy-vs-bwf-{dname}-{file_tag}.pdf', bbox_inches='tight')
    # The title is set after saving, so it shows in the notebook but not in the PDFs.
    ax.set_title(f'{dname} {title_tag}')


for dname in list(test_dict.keys()):
    # PCA + GaussianNB figure (saved with and without legend), then the KNN figure.
    _plot_accuracy_vs_bwf(dname, 'pc', '^', 'pca', 'PCA', save_nolegend=True)
    _plot_accuracy_vs_bwf(dname, 'knn', 'd', 'knn', 'knn', save_nolegend=False)