# HDy and EDy with different configurations

## Preparation

In [69]:
import numpy as np
random_state = 42
np.random.seed(random_state)
import os, glob
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.5f' % x)

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from quantification.metrics import binary_kl_divergence, absolute_error
from quantification.dm import HDy, EDy

import seaborn as sns
import matplotlib.pyplot as plt

In [70]:
import warnings
from sklearn.exceptions import DataConversionWarning

# SettingWithCopyWarning has lived in different modules across pandas
# releases: try the public `pandas.errors` location first, then the legacy
# private one, and skip the filter altogether if neither provides it.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None

# Silence the warnings that would otherwise flood the experiment output.
warnings.simplefilter("ignore", DataConversionWarning)
if SettingWithCopyWarning is not None:
    warnings.simplefilter("ignore", SettingWithCopyWarning)

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.exceptions import NotFittedError
from sklearn.utils.extmath import softmax

class LinearRegressionWrapper(LogisticRegression):
    """LogisticRegression whose sigmoid is rescaled by a factor ``alpha``.

    For the one-vs-rest case the probabilities are computed as
    ``1 / (1 + exp(-alpha * decision_function(X)))`` instead of using the
    parent's own probability computation.

    Parameters
    ----------
    alpha : float, default=1.
        Multiplier applied to the decision function before the sigmoid.
    penalty, dual, tol, C, fit_intercept, intercept_scaling, class_weight,
    random_state, solver, max_iter, multi_class, verbose, warm_start, n_jobs
        Forwarded unchanged to ``LogisticRegression.__init__``.
    """

    def __init__(self, alpha=1., penalty='l2', dual=False, tol=1e-4, C=1.0,
                 fit_intercept=True, intercept_scaling=1, class_weight=None,
                 random_state=None, solver='liblinear', max_iter=100,
                 multi_class='ovr', verbose=0, warm_start=False, n_jobs=1):
        # Forward everything by keyword: positional forwarding silently depends
        # on the parent's parameter order and fails outright when the parent
        # declares its parameters keyword-only.
        super(LinearRegressionWrapper, self).__init__(
            penalty=penalty, dual=dual, tol=tol, C=C,
            fit_intercept=fit_intercept, intercept_scaling=intercept_scaling,
            class_weight=class_weight, random_state=random_state,
            solver=solver, max_iter=max_iter, multi_class=multi_class,
            verbose=verbose, warm_start=warm_start, n_jobs=n_jobs)
        self.alpha = alpha

    def _predict_proba(self, X):
        """Return one-vs-rest probabilities from the alpha-scaled sigmoid.

        For a 1-D decision function the result has two columns,
        ``[1 - p, p]``; otherwise each row of per-class sigmoid values is
        normalized to sum to one.
        """
        prob = self.decision_function(X)
        # The multiplication allocates a new array, so the in-place operations
        # below never touch the array returned by decision_function.
        prob = -1 * prob * self.alpha
        np.exp(prob, prob)          # prob <- exp(-alpha * f(x)), in place
        prob += 1
        np.reciprocal(prob, prob)   # prob <- 1 / (1 + exp(-alpha * f(x)))
        if prob.ndim == 1:
            return np.vstack([1 - prob, prob]).T
        else:
            # OvR normalization, like LibLinear's predict_probability
            prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
            return prob

    def predict_proba(self, X):
        """Return class probabilities for ``X``.

        Uses the alpha-scaled sigmoid when there is a single coefficient row
        or ``multi_class == "ovr"``; otherwise applies a softmax to the
        (unscaled) decision function.

        Raises
        ------
        NotFittedError
            If the estimator has no ``coef_`` attribute yet.
        """
        if not hasattr(self, "coef_"):
            raise NotFittedError("Call fit before prediction")
        calculate_ovr = self.coef_.shape[0] == 1 or self.multi_class == "ovr"
        if calculate_ovr:
            return self._predict_proba(X)
        else:
            return softmax(self.decision_function(X), copy=False)

In [40]:
datasets_dir = "datasets"
# glob yields files in an OS-dependent order; sorting keeps the dataset order
# (and therefore the row order of the results) stable between runs.
# Any path containing "k9" is left out of the experiments.
dataset_files = sorted(
    path for path in glob.glob(os.path.join(datasets_dir, "*.csv"))
    if "k9" not in path
)
# Dataset name = file name without directory and without the ".csv" extension.
dataset_names = [os.path.splitext(os.path.basename(path))[0] for path in dataset_files]
print("There are a total of {} datasets.".format(len(dataset_names)))

There are a total of 43 datasets.


In [66]:
# Schema of the results table: one row per (dataset, alpha, bins) evaluation.
columns = ['dataset', 'alpha', 'bins', 'truth', 'predictions', 'kld', 'mae']

# Start from an empty frame; the experiment cells add their rows to it.
errors_df = pd.DataFrame(columns=columns)

n_datasets = len(dataset_names)

## Utils

#### Standard scale data

In [42]:
from sklearn.preprocessing import StandardScaler

def normalize(X_train, X_test):
    """Standard-scale both splits using a scaler fitted on the training split.

    The scaler is fitted on ``X_train`` only and then applied to both
    ``X_train`` and ``X_test``.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled)``.
    """
    fitted_scaler = StandardScaler().fit(X_train)
    return fitted_scaler.transform(X_train), fitted_scaler.transform(X_test)


#### Load data

In [43]:
def load_data(dfile):
    """Load a header-less CSV dataset and return a scaled train/test split.

    Every column but the last is used as a feature; the last column is the
    integer class label. A ``-1`` label is relabeled as ``0``.

    Parameters
    ----------
    dfile : str
        Path of the CSV file to read.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. 70% of the rows go to the test
        split (``test_size=0.7``); the features are standard-scaled with
        ``normalize``.
    """
    df = pd.read_csv(dfile, header=None)
    X = df.iloc[:, :-1].values
    # Use the builtin `int`: the `np.int` alias is gone from current NumPy.
    y = df.iloc[:, -1].values.astype(int)
    # No-op when the labels contain no -1, so no membership check is needed.
    y[y == -1] = 0

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=random_state)

    X_train, X_test = normalize(X_train, X_test)

    return X_train, X_test, y_train, y_test

#### GridSearch parameters

In [44]:
# Candidate values for the estimator's C parameter: 10^-3 ... 10^2.
estimator_grid = {'C': [10 ** exponent for exponent in range(-3, 3)]}
# Extra keyword arguments handed to the quantifiers as `grid_params`.
grid_params = {'verbose': False}

## Experiments

In [62]:
# Evaluate HDy for every (alpha, bins) configuration on every dataset.
# Rows are collected in a plain list and turned into a DataFrame once at the
# end, instead of growing `errors_df` one row at a time.
hdy_records = []
for alpha in [0.2, 0.4, 0.6, 0.8, 1]:
    # NOTE(review): 132 breaks the doubling sequence 8, 16, 32, 64 - confirm it
    # is intended and not a typo for 128.
    for b in [8, 16, 32, 64, 132]:
        hdy = HDy(b=b, estimator_class=LinearRegressionWrapper(alpha=alpha, random_state=random_state), estimator_grid=estimator_grid, grid_params=grid_params)

        for dname, dfile in tqdm(zip(dataset_names, dataset_files), total=len(dataset_files)):

            X_train, X_test, y_train, y_test = load_data(dfile)

            hdy.fit(X_train, y_train)
            # Index 1 of the prediction is compared with the prevalence of the
            # second label found in y_test.
            prev_pred = hdy.predict(X_test)[1]

            # Fraction of test samples carrying the second (largest) label.
            prev_true = np.unique(y_test, return_counts=True)[1][1] / len(X_test)
            kld = binary_kl_divergence(prev_true, prev_pred)
            mae = absolute_error(prev_true, prev_pred)

            hdy_records.append([dname, alpha, b, prev_true, prev_pred, kld, mae])

hdy_results = pd.DataFrame(hdy_records, columns=columns)
if not hdy_results.empty:
    # Leave empty frames out of the concat so that an empty, object-typed
    # `errors_df` cannot influence the dtypes of the combined result.
    errors_df = pd.concat(
        [frame for frame in (errors_df, hdy_results) if not frame.empty],
        ignore_index=True,
    )

100%|██████████| 43/43 [01:04<00:00,  1.49s/it]
100%|██████████| 43/43 [01:06<00:00,  1.54s/it]
100%|██████████| 43/43 [01:03<00:00,  1.49s/it]
100%|██████████| 43/43 [01:08<00:00,  1.59s/it]
100%|██████████| 43/43 [01:02<00:00,  1.46s/it]
100%|██████████| 43/43 [00:53<00:00,  1.24s/it]
100%|██████████| 43/43 [01:01<00:00,  1.43s/it]
100%|██████████| 43/43 [00:55<00:00,  1.30s/it]
100%|██████████| 43/43 [01:01<00:00,  1.43s/it]
100%|██████████| 43/43 [01:03<00:00,  1.48s/it]
100%|██████████| 43/43 [00:52<00:00,  1.22s/it]
100%|██████████| 43/43 [00:56<00:00,  1.31s/it]
100%|██████████| 43/43 [01:04<00:00,  1.49s/it]
100%|██████████| 43/43 [01:06<00:00,  1.54s/it]
100%|██████████| 43/43 [01:02<00:00,  1.45s/it]
100%|██████████| 43/43 [01:03<00:00,  1.49s/it]
100%|██████████| 43/43 [01:01<00:00,  1.44s/it]
100%|██████████| 43/43 [01:06<00:00,  1.54s/it]
100%|██████████| 43/43 [00:58<00:00,  1.37s/it]
100%|██████████| 43/43 [00:59<00:00,  1.37s/it]
100%|██████████| 43/43 [01:01<00:00,  1.

In [71]:
# Evaluate EDy for every alpha on every dataset.
# EDy is built without a bin count, so its rows store NaN in the 'bins'
# column rather than whatever value `b` was left holding by another cell.
edy_records = []
for alpha in tqdm([0.2, 0.4, 0.6, 0.8, 1]):
    edy = EDy(estimator_class=LinearRegressionWrapper(alpha=alpha, random_state=random_state), estimator_grid=estimator_grid, grid_params=grid_params)

    for dname, dfile in zip(dataset_names, dataset_files):

        X_train, X_test, y_train, y_test = load_data(dfile)

        edy.fit(X_train, y_train)
        # Index 1 of the prediction is compared with the prevalence of the
        # second label found in y_test.
        prev_pred = edy.predict(X_test)[1]

        # Fraction of test samples carrying the second (largest) label.
        prev_true = np.unique(y_test, return_counts=True)[1][1] / len(X_test)
        kld = binary_kl_divergence(prev_true, prev_pred)
        mae = absolute_error(prev_true, prev_pred)

        edy_records.append([dname, alpha, np.nan, prev_true, prev_pred, kld, mae])

edy_results = pd.DataFrame(edy_records, columns=columns)
if not edy_results.empty:
    # Leave empty frames out of the concat so that an empty, object-typed
    # `errors_df` cannot influence the dtypes of the combined result.
    errors_df = pd.concat(
        [frame for frame in (errors_df, edy_results) if not frame.empty],
        ignore_index=True,
    )

100%|██████████| 5/5 [26:05<00:00, 313.02s/it]


In [72]:
# Mean and median of both error measures for each (alpha, bins) pair.
(
    errors_df
    .groupby(['alpha', 'bins'])[['kld', 'mae']]
    .agg(['mean', 'median'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,kld,kld,mae,mae
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,mean,median
alpha,bins,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0.2,8,0.01004,0.0019,0.03331,0.02118
0.2,16,0.01004,0.0019,0.03331,0.02118
0.2,32,0.01004,0.0019,0.03331,0.02118
0.2,64,0.01004,0.0019,0.03331,0.02118
0.2,132,0.01004,0.0019,0.03331,0.02118
0.4,8,0.00968,0.0018,0.03248,0.01935
0.4,16,0.00968,0.0018,0.03248,0.01935
0.4,32,0.00968,0.0018,0.03248,0.01935
0.4,64,0.00968,0.0018,0.03248,0.01935
0.4,132,0.00968,0.0018,0.03248,0.01935


In [73]:
# Mean and median of both error measures for each alpha, over all rows.
(
    errors_df
    .groupby(['alpha'])[['kld', 'mae']]
    .agg(['mean', 'median'])
)

Unnamed: 0_level_0,kld,kld,mae,mae
Unnamed: 0_level_1,mean,median,mean,median
alpha,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0.2,0.01004,0.0019,0.03331,0.02118
0.4,0.00968,0.0018,0.03248,0.01935
0.6,0.00941,0.00146,0.03164,0.0193
0.8,0.00919,0.0013,0.03087,0.01928
1.0,0.009,0.00129,0.0303,0.01687
