# Loading real data

In [1]:
import random
import numpy as np
import pandas as pd
import sklearn as sk
import sklearn.metrics
import scipy as sp

In [2]:
"""
Created on Tue Nov  6 10:06:52 2018

@author: yandexdataschool

Original Code found in:
https://github.com/yandexdataschool/roc_comparison

updated: Raul Sanchez-Vazquez
"""

import scipy.stats
from scipy import stats

# AUC comparison adapted from
# https://github.com/Netflix/vmaf/
def compute_midrank(x):
    """Computes midranks.
    Args:
       x - a 1D numpy array
    Returns:
       array of midranks
    """
    J = np.argsort(x)
    Z = x[J]
    N = len(x)
    T = np.zeros(N, dtype=np.float)
    i = 0
    while i < N:
        j = i
        while j < N and Z[j] == Z[i]:
            j += 1
        T[i:j] = 0.5*(i + j - 1)
        i = j
    T2 = np.empty(N, dtype=np.float)
    # Note(kazeevn) +1 is due to Python using 0-based indexing
    # instead of 1-based in the AUC formula in the paper
    T2[J] = T + 1
    return T2


def compute_midrank_weight(x, sample_weight):
    """Computes midranks.
    Args:
       x - a 1D numpy array
    Returns:
       array of midranks
    """
    J = np.argsort(x)
    Z = x[J]
    cumulative_weight = np.cumsum(sample_weight[J])
    N = len(x)
    T = np.zeros(N, dtype=np.float)
    i = 0
    while i < N:
        j = i
        while j < N and Z[j] == Z[i]:
            j += 1
        T[i:j] = cumulative_weight[i:j].mean()
        i = j
    T2 = np.empty(N, dtype=np.float)
    T2[J] = T
    return T2


def fastDeLong(predictions_sorted_transposed, label_1_count, sample_weight):
    if sample_weight is None:
        return fastDeLong_no_weights(predictions_sorted_transposed, label_1_count)
    else:
        return fastDeLong_weights(predictions_sorted_transposed, label_1_count, sample_weight)


def fastDeLong_weights(predictions_sorted_transposed, label_1_count, sample_weight):
    """
    The fast version of DeLong's method for computing the covariance of
    unadjusted AUC.
    Args:
       predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
          sorted such as the examples with label "1" are first
    Returns:
       (AUC value, DeLong covariance)
    Reference:
     @article{sun2014fast,
       title={Fast Implementation of DeLong's Algorithm for
              Comparing the Areas Under Correlated Receiver Oerating Characteristic Curves},
       author={Xu Sun and Weichao Xu},
       journal={IEEE Signal Processing Letters},
       volume={21},
       number={11},
       pages={1389--1393},
       year={2014},
       publisher={IEEE}
     }
    """
    # Short variables are named as they are in the paper
    m = label_1_count
    n = predictions_sorted_transposed.shape[1] - m
    positive_examples = predictions_sorted_transposed[:, :m]
    negative_examples = predictions_sorted_transposed[:, m:]
    k = predictions_sorted_transposed.shape[0]

    tx = np.empty([k, m], dtype=np.float)
    ty = np.empty([k, n], dtype=np.float)
    tz = np.empty([k, m + n], dtype=np.float)
    for r in range(k):
        tx[r, :] = compute_midrank_weight(positive_examples[r, :], sample_weight[:m])
        ty[r, :] = compute_midrank_weight(negative_examples[r, :], sample_weight[m:])
        tz[r, :] = compute_midrank_weight(predictions_sorted_transposed[r, :], sample_weight)
    total_positive_weights = sample_weight[:m].sum()
    total_negative_weights = sample_weight[m:].sum()
    pair_weights = np.dot(sample_weight[:m, np.newaxis], sample_weight[np.newaxis, m:])
    total_pair_weights = pair_weights.sum()
    aucs = (sample_weight[:m]*(tz[:, :m] - tx)).sum(axis=1) / total_pair_weights
    v01 = (tz[:, :m] - tx[:, :]) / total_negative_weights
    v10 = 1. - (tz[:, m:] - ty[:, :]) / total_positive_weights
    sx = np.cov(v01)
    sy = np.cov(v10)
    delongcov = sx / m + sy / n
    return aucs, delongcov


def fastDeLong_no_weights(predictions_sorted_transposed, label_1_count):
    """
    The fast version of DeLong's method for computing the covariance of
    unadjusted AUC.
    Args:
       predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
          sorted such as the examples with label "1" are first
    Returns:
       (AUC value, DeLong covariance)
    Reference:
     @article{sun2014fast,
       title={Fast Implementation of DeLong's Algorithm for
              Comparing the Areas Under Correlated Receiver Oerating
              Characteristic Curves},
       author={Xu Sun and Weichao Xu},
       journal={IEEE Signal Processing Letters},
       volume={21},
       number={11},
       pages={1389--1393},
       year={2014},
       publisher={IEEE}
     }
    """
    # Short variables are named as they are in the paper
    m = label_1_count
    n = predictions_sorted_transposed.shape[1] - m
    positive_examples = predictions_sorted_transposed[:, :m]
    negative_examples = predictions_sorted_transposed[:, m:]
    k = predictions_sorted_transposed.shape[0]

    tx = np.empty([k, m], dtype=np.float)
    ty = np.empty([k, n], dtype=np.float)
    tz = np.empty([k, m + n], dtype=np.float)
    for r in range(k):
        tx[r, :] = compute_midrank(positive_examples[r, :])
        ty[r, :] = compute_midrank(negative_examples[r, :])
        tz[r, :] = compute_midrank(predictions_sorted_transposed[r, :])
    aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n
    v01 = (tz[:, :m] - tx[:, :]) / n
    v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m
    sx = np.cov(v01)
    sy = np.cov(v10)
    delongcov = sx / m + sy / n
    return aucs, delongcov


def calc_pvalue(aucs, sigma):
    """Computes log(10) of p-values.
    Args:
       aucs: 1D array of AUCs
       sigma: AUC DeLong covariances
    Returns:
       log10(pvalue)
    """
    l = np.array([[1, -1]])
    z = np.abs(np.diff(aucs)) / np.sqrt(np.dot(np.dot(l, sigma), l.T))
    return np.log10(2) + scipy.stats.norm.logsf(z, loc=0, scale=1) / np.log(10)


def compute_ground_truth_statistics(ground_truth, sample_weight):
    assert np.array_equal(np.unique(ground_truth), [0, 1])
    order = (-ground_truth).argsort()
    label_1_count = int(ground_truth.sum())
    if sample_weight is None:
        ordered_sample_weight = None
    else:
        ordered_sample_weight = sample_weight[order]

    return order, label_1_count, ordered_sample_weight


def delong_roc_variance(ground_truth, predictions, sample_weight=None):
    """
    Computes ROC AUC variance for a single set of predictions
    Args:
       ground_truth: np.array of 0 and 1
       predictions: np.array of floats of the probability of being class 1
    """
    order, label_1_count, ordered_sample_weight = compute_ground_truth_statistics(
        ground_truth, sample_weight)
    predictions_sorted_transposed = predictions[np.newaxis, order]
    aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count, ordered_sample_weight)
    assert len(aucs) == 1, "There is a bug in the code, please forward this to the developers"
    return aucs[0], delongcov


In [3]:
column_names = ['A1','A2','A3','A4','A5','A6','A7','A8','A9','A10','A11','A12','A13','A14','A15']

australian = pd.read_csv(
#    "drive/My Drive/Discrétisation ICLR19/opendata/australian.dat",
    "~/Google Drive/Discrétisation ICLR19/opendata/australian.dat",
    sep="\s",
    header=None,
    names = column_names,
    na_values=[
        '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
        '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan', 'null',
        '.'
    ])

  if sys.path[0] == '':


In [4]:
australian.dropna(inplace=True)

In [5]:
australian.reset_index(inplace=True, drop=True)

# Establishing 1st benchmark: naïve logistic regression

## Logistic Regression

### Label Encoding

In [6]:
australian_label_encoders = []

australian_encoded = australian.copy()

for j in [
        'A4','A5','A6','A12'
]:
    temp = sk.preprocessing.LabelEncoder()
    temp.fit(australian[j].astype(str))
    australian_label_encoders.append(temp)
    australian_encoded[j] = temp.transform(australian[j].astype(str))

### One-hot encoding

In [7]:
australian_one_hot_encoder = sk.preprocessing.OneHotEncoder(categories='auto',sparse=False,handle_unknown="ignore")
australian_one_hot_encoder.fit(australian_encoded[[
    'A4','A5','A6','A12'
]])
australian_one_hot_encoded = australian_encoded.copy()
australian_one_hot_encoded.drop(
    ['A4','A5','A6','A12'],
    axis=1,
    inplace=True)
australian_one_hot_encoded = pd.concat(
    [
        australian_one_hot_encoded,
        pd.DataFrame(
            australian_one_hot_encoder.transform(australian_encoded[[
                'A4','A5','A6','A12'
            ]]),
            index=australian_one_hot_encoded.index)
    ],
    axis=1)

### Data split

In [8]:
import sklearn.model_selection

In [9]:
australian_features_train, australian_features_test, australian_perf_train, australian_perf_test = sk.model_selection.train_test_split(
    australian_one_hot_encoded.drop('A15', axis=1),
    australian_one_hot_encoded.A15,
    test_size=0.33,
    random_state=1)

In [10]:
australian_nn_features_train = australian_encoded.iloc[
    australian_features_train.index, :].drop(
        'A15', axis=1)
australian_nn_features_test = australian_encoded.iloc[australian_features_test.index, :].drop(
    'A15', axis=1)
australian_nn_perf_train = australian_encoded.iloc[
    australian_features_train.index, :].A15
australian_nn_perf_test = australian_encoded.iloc[australian_features_test.index, :].A15

### LR on train data

In [11]:
import sklearn.linear_model

In [12]:
australian_naive_LR = sk.linear_model.LogisticRegression(C=1e20, tol=1e-8, solver="newton-cg")
australian_naive_LR.fit(
    australian_features_train[australian_features_train.isna().sum(axis=1) == 0],
    australian_perf_train[australian_features_train.isna().sum(axis=1) == 0])

LogisticRegression(C=1e+20, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='newton-cg',
          tol=1e-08, verbose=0, warm_start=False)

### Application of learnt LR on test data

In [14]:
alpha = .95
y_pred = australian_naive_LR.predict_proba(australian_features_test)[:, 1]
y_true = australian_perf_test

auc, auc_cov = delong_roc_variance(
    y_true,
    y_pred)

auc_std = np.sqrt(auc_cov)
lower_upper_q = np.abs(np.array([0, 1]) - (1 - alpha) / 2)

ci = stats.norm.ppf(
    lower_upper_q,
    loc=auc,
    scale=auc_std)

ci[ci > 1] = 1

print('Gini:', 2*auc-1)
print('AUC COV:', auc_cov)
print('95% Gini CI:', 2*ci-1)

Gini: 0.7205893817728231
AUC COV: 0.0006986699403243063
95% Gini CI: [0.61697641 0.82420235]


# Establishing 2nd benchmark: MDLP disc + Chi2 grouping

## MDLP disc

In [18]:
from mdlp.discretization import MDLP
transformer_cont_australian = MDLP()

In [19]:
transformer_cont_australian.fit(
    australian_nn_features_train[[
        'A1','A2','A3','A7','A8','A9','A10','A11','A13','A14'
    ]], australian_nn_perf_train)

MDLP(continuous_features=None, min_depth=0, random_state=None, shuffle=True)

In [20]:
score_australian_MDLP_one_hot_encoder = sk.preprocessing.OneHotEncoder(categories='auto',sparse=False,handle_unknown="ignore")
score_australian_MDLP_one_hot_encoder.fit(
    transformer_cont_australian.transform(australian_nn_features_train[[
        'A1','A2','A3','A7','A8','A9','A10','A11','A13','A14'
    ]]))

OneHotEncoder(categorical_features=None, categories='auto',
       dtype=<class 'numpy.float64'>, handle_unknown='ignore',
       n_values=None, sparse=False)

## Grouping

In [21]:
def chi2_test(liste):
    try:
        return sp.stats.chi2_contingency(liste)[1]
    except Exception:
        return 1

In [23]:
australian_train_grouped = australian.iloc[australian_features_train.index, :].copy()
d = dict((x, []) for x in ['A4','A5','A6','A12'])

for var in ['A4','A5','A6','A12']:
    
    australian_train_grouped[var] = australian_train_grouped[var].astype(str)
    d[var] = [x for x in np.unique(australian_train_grouped[var])]
    p_value = 1

    while(p_value>0.05):
        if len(np.unique(australian_train_grouped[var]))>1:
            freq_table = australian_train_grouped.groupby([var,'A15']).size().reset_index()
            liste_paires_modalites = [[a,b] for a in np.unique(australian_train_grouped[var]) for b in np.delete(np.unique(australian_train_grouped[var]),np.where(np.unique(australian_train_grouped[var])==a))]
            liste_chi2 = [chi2_test([freq_table.iloc[np.in1d(freq_table[var],pair[0]),2],freq_table.iloc[np.in1d(freq_table[var],pair[1]),2]]) for pair in liste_paires_modalites]
            p_value = max(liste_chi2)
        else: break

        if (p_value>0.05 and len(np.unique(australian_train_grouped[var]))>1):
            australian_train_grouped[var].iloc[np.in1d(australian_train_grouped[var],liste_paires_modalites[np.argmax(np.equal(liste_chi2,p_value))])] = liste_paires_modalites[np.argmax(np.equal(liste_chi2,p_value))][0] + ' - ' + liste_paires_modalites[np.argmax(np.equal(liste_chi2,p_value))][1]
            d[var].remove(liste_paires_modalites[np.argmax(np.equal(liste_chi2,p_value))][0])
            d[var].remove(liste_paires_modalites[np.argmax(np.equal(liste_chi2,p_value))][1])
            d[var].append(str(liste_paires_modalites[np.argmax(np.equal(liste_chi2,p_value))][0] + ' - ' + liste_paires_modalites[np.argmax(np.equal(liste_chi2,p_value))][1]))
            print('Feature '+var+ ' - levels merged : '+str(liste_paires_modalites[np.argmax(np.equal(liste_chi2,p_value))]))
        else: break


Feature A4 - levels merged : ['1', '3']
Feature A5 - levels merged : ['1', '12']
Feature A5 - levels merged : ['5', '7']
Feature A5 - levels merged : ['10', '5 - 7']
Feature A5 - levels merged : ['2', '4']
Feature A5 - levels merged : ['2 - 4', '6']
Feature A5 - levels merged : ['11', '13']
Feature A5 - levels merged : ['10 - 5 - 7', '9']
Feature A5 - levels merged : ['1 - 12', '3']
Feature A5 - levels merged : ['11 - 13', '14']
Feature A5 - levels merged : ['8', '10 - 5 - 7 - 9']
Feature A5 - levels merged : ['1 - 12 - 3', '2 - 4 - 6']
Feature A6 - levels merged : ['1', '3']
Feature A6 - levels merged : ['2', '7']
Feature A6 - levels merged : ['2 - 7', '9']
Feature A6 - levels merged : ['2 - 7 - 9', '5']
Feature A6 - levels merged : ['4', '2 - 7 - 9 - 5']
Feature A12 - levels merged : ['1', '3']
Feature A12 - levels merged : ['1 - 3', '2']


## Test time

In [24]:
australian_train_mdlp = score_australian_MDLP_one_hot_encoder.transform(
    transformer_cont_australian.transform(australian_nn_features_train[[
        'A1','A2','A3','A7','A8','A9','A10','A11','A13','A14'
    ]]))

In [38]:
australian_train_grouped_label_encoders = []

australian_train_grouped_encoded = australian_train_grouped.copy()

for j in [
        'A4','A5','A6','A12'
]:
    temp = sk.preprocessing.LabelEncoder()
    temp.fit(australian_train_grouped_encoded[j].astype(str))
    australian_train_grouped_label_encoders.append(temp)
    australian_train_grouped_encoded[j] = temp.transform(australian_train_grouped_encoded[j].astype(str))

In [28]:
score_australian_CHI2_one_hot_encoder = sk.preprocessing.OneHotEncoder(categories='auto',sparse=False,handle_unknown="ignore")

score_australian_CHI2_one_hot_encoder.fit(
        australian_train_grouped_encoded[[
            'A4','A5','A6','A12'
        ]])

OneHotEncoder(categorical_features=None, categories='auto',
       dtype=<class 'numpy.float64'>, handle_unknown='ignore',
       n_values=None, sparse=False)

In [29]:
australian_train_chi2 = score_australian_CHI2_one_hot_encoder.transform(australian_train_grouped_encoded[[
            'A4','A5','A6','A12'
        ]])

In [30]:
australian_adhoc_train = np.concatenate((australian_train_chi2,australian_train_mdlp),axis=1)

In [32]:
australian_adhoc_LR = sk.linear_model.LogisticRegression(C=1e20, tol=1e-8, solver="newton-cg")
australian_adhoc_LR.fit(
    australian_adhoc_train,
    australian_train_grouped_encoded['A15'])

LogisticRegression(C=1e+20, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='newton-cg',
          tol=1e-08, verbose=0, warm_start=False)

In [35]:
australian_test_grouped = australian.iloc[australian_features_test.index, :].copy()

for var in ['A4','A5','A6','A12']:
    
    australian_test_grouped[var] = australian_test_grouped[var].astype(str)

    for x in d[var]:
        if x.find(' - ')>-1:
            liste_modalites = x.split(' - ')
            australian_test_grouped[var].iloc[np.in1d(australian_test_grouped[var],liste_modalites)] = x
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [39]:
australian_test_grouped_encoded = australian_test_grouped.copy()

for j in ['A4','A5','A6','A12']:
    indice = ['A4','A5','A6','A12'].index(j)
    australian_test_grouped_encoded[j] = australian_train_grouped_label_encoders[indice].transform(australian_test_grouped_encoded[j].astype(str))
    

In [40]:
australian_test_chi2 = score_australian_CHI2_one_hot_encoder.transform(australian_test_grouped_encoded[[
            'A4','A5','A6','A12'
        ]])

In [41]:
australian_test_mdlp = score_australian_MDLP_one_hot_encoder.transform(
    transformer_cont_australian.transform(australian_nn_features_test[[
        'A1','A2','A3','A7','A8','A9','A10','A11','A13','A14'
    ]]))

In [42]:
australian_adhoc_test = np.concatenate(
    (australian_test_chi2, australian_test_mdlp), axis=1)

In [43]:
AUC_LR = sk.metrics.roc_auc_score(
    australian_perf_test,
    australian_adhoc_LR.predict_proba(australian_adhoc_test)[:, 1])
2 * AUC_LR - 1

0.8408825134481952

In [44]:
alpha = .95
y_pred = australian_adhoc_LR.predict_proba(australian_adhoc_test)[:, 1]
y_true = australian.iloc[australian_features_test.index, :]['A15']

auc, auc_cov = delong_roc_variance(
    y_true,
    y_pred)

auc_std = np.sqrt(auc_cov)
lower_upper_q = np.abs(np.array([0, 1]) - (1 - alpha) / 2)

ci = stats.norm.ppf(
    lower_upper_q,
    loc=auc,
    scale=auc_std)

ci[ci > 1] = 1

print('Gini:', 2*auc-1)
print('AUC COV:', auc_cov)
print('95% Gini CI:', 2*ci-1)

Gini: 0.8408825134481952
AUC COV: 0.0003620886692589753
95% Gini CI: [0.76629166 0.91547336]
