# Loading real data

In [0]:
import random

import numpy as np
import pandas as pd
import scipy as sp
import sklearn as sk
import sklearn.metrics
# Imported explicitly: `sklearn.preprocessing` is used below and scikit-learn
# submodules are only guaranteed to be available once imported.
import sklearn.preprocessing

In [0]:
"""
Created on Tue Nov  6 10:06:52 2018

@author: yandexdataschool

Original Code found in:
https://github.com/yandexdataschool/roc_comparison

updated: Raul Sanchez-Vazquez
"""

import scipy.stats
from scipy import stats

# AUC comparison adapted from
# https://github.com/Netflix/vmaf/
def compute_midrank(x):
    """Computes midranks.

    Every run of tied values receives the mean of the 1-based ranks that
    the run occupies in sorted order.

    Args:
       x - a 1D numpy array
    Returns:
       float array of midranks, aligned with the original order of x
    """
    J = np.argsort(x)
    Z = x[J]
    N = len(x)
    # The `np.float` alias was removed from NumPy; the builtin `float`
    # selects the same float64 dtype on every NumPy version.
    T = np.zeros(N, dtype=float)
    i = 0
    while i < N:
        # [i, j) is the run of sorted positions sharing the value Z[i].
        j = i
        while j < N and Z[j] == Z[i]:
            j += 1
        T[i:j] = 0.5*(i + j - 1)
        i = j
    T2 = np.empty(N, dtype=float)
    # Note(kazeevn) +1 is due to Python using 0-based indexing
    # instead of 1-based in the AUC formula in the paper
    T2[J] = T + 1
    return T2


def compute_midrank_weight(x, sample_weight):
    """Computes weighted midranks.

    The rank of an element is the cumulative weight up to its position in
    sorted order; every run of tied values receives the mean of the
    cumulative weights over that run.

    Args:
       x - a 1D numpy array
       sample_weight - a 1D numpy array of weights aligned with x
    Returns:
       float array of weighted midranks, aligned with the original order of x
    """
    J = np.argsort(x)
    Z = x[J]
    cumulative_weight = np.cumsum(sample_weight[J])
    N = len(x)
    # The `np.float` alias was removed from NumPy; the builtin `float`
    # selects the same float64 dtype on every NumPy version.
    T = np.zeros(N, dtype=float)
    i = 0
    while i < N:
        # [i, j) is the run of sorted positions sharing the value Z[i].
        j = i
        while j < N and Z[j] == Z[i]:
            j += 1
        T[i:j] = cumulative_weight[i:j].mean()
        i = j
    T2 = np.empty(N, dtype=float)
    T2[J] = T
    return T2


def fastDeLong(predictions_sorted_transposed, label_1_count, sample_weight):
    """Dispatch to the weighted or unweighted fast DeLong implementation.

    Args:
       predictions_sorted_transposed: 2D array forwarded unchanged
       label_1_count: number of positive examples, forwarded unchanged
       sample_weight: per-example weights, or None for the unweighted variant
    Returns:
       whatever the selected implementation returns: (aucs, delongcov)
    """
    if sample_weight is not None:
        return fastDeLong_weights(
            predictions_sorted_transposed, label_1_count, sample_weight)
    return fastDeLong_no_weights(predictions_sorted_transposed, label_1_count)


def fastDeLong_weights(predictions_sorted_transposed, label_1_count, sample_weight):
    """
    The fast version of DeLong's method for computing the covariance of
    unadjusted AUC, with per-example weights.
    Args:
       predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
          sorted such as the examples with label "1" are first
       label_1_count: number of examples with label "1" (the leading columns)
       sample_weight: 1D numpy array of weights, ordered like the columns of
          predictions_sorted_transposed
    Returns:
       (AUC value, DeLong covariance)
    Reference:
     @article{sun2014fast,
       title={Fast Implementation of DeLong's Algorithm for
              Comparing the Areas Under Correlated Receiver Operating Characteristic Curves},
       author={Xu Sun and Weichao Xu},
       journal={IEEE Signal Processing Letters},
       volume={21},
       number={11},
       pages={1389--1393},
       year={2014},
       publisher={IEEE}
     }
    """
    # Short variables are named as they are in the paper
    m = label_1_count
    n = predictions_sorted_transposed.shape[1] - m
    positive_examples = predictions_sorted_transposed[:, :m]
    negative_examples = predictions_sorted_transposed[:, m:]
    k = predictions_sorted_transposed.shape[0]

    # The `np.float` alias was removed from NumPy; builtin `float` is float64.
    tx = np.empty([k, m], dtype=float)
    ty = np.empty([k, n], dtype=float)
    tz = np.empty([k, m + n], dtype=float)
    for r in range(k):
        # Weighted midranks within positives, within negatives, and overall.
        tx[r, :] = compute_midrank_weight(positive_examples[r, :], sample_weight[:m])
        ty[r, :] = compute_midrank_weight(negative_examples[r, :], sample_weight[m:])
        tz[r, :] = compute_midrank_weight(predictions_sorted_transposed[r, :], sample_weight)
    total_positive_weights = sample_weight[:m].sum()
    total_negative_weights = sample_weight[m:].sum()
    # Outer product: weight of every (positive, negative) pair.
    pair_weights = np.dot(sample_weight[:m, np.newaxis], sample_weight[np.newaxis, m:])
    total_pair_weights = pair_weights.sum()
    aucs = (sample_weight[:m]*(tz[:, :m] - tx)).sum(axis=1) / total_pair_weights
    v01 = (tz[:, :m] - tx[:, :]) / total_negative_weights
    v10 = 1. - (tz[:, m:] - ty[:, :]) / total_positive_weights
    sx = np.cov(v01)
    sy = np.cov(v10)
    delongcov = sx / m + sy / n
    return aucs, delongcov


def fastDeLong_no_weights(predictions_sorted_transposed, label_1_count):
    """
    The fast version of DeLong's method for computing the covariance of
    unadjusted AUC.
    Args:
       predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
          sorted such as the examples with label "1" are first
       label_1_count: number of examples with label "1" (the leading columns)
    Returns:
       (AUC value, DeLong covariance)
    Reference:
     @article{sun2014fast,
       title={Fast Implementation of DeLong's Algorithm for
              Comparing the Areas Under Correlated Receiver Operating
              Characteristic Curves},
       author={Xu Sun and Weichao Xu},
       journal={IEEE Signal Processing Letters},
       volume={21},
       number={11},
       pages={1389--1393},
       year={2014},
       publisher={IEEE}
     }
    """
    # Short variables are named as they are in the paper
    m = label_1_count
    n = predictions_sorted_transposed.shape[1] - m
    positive_examples = predictions_sorted_transposed[:, :m]
    negative_examples = predictions_sorted_transposed[:, m:]
    k = predictions_sorted_transposed.shape[0]

    # The `np.float` alias was removed from NumPy; builtin `float` is float64.
    tx = np.empty([k, m], dtype=float)
    ty = np.empty([k, n], dtype=float)
    tz = np.empty([k, m + n], dtype=float)
    for r in range(k):
        # Midranks within positives, within negatives, and over all examples.
        tx[r, :] = compute_midrank(positive_examples[r, :])
        ty[r, :] = compute_midrank(negative_examples[r, :])
        tz[r, :] = compute_midrank(predictions_sorted_transposed[r, :])
    aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n
    v01 = (tz[:, :m] - tx[:, :]) / n
    v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m
    sx = np.cov(v01)
    sy = np.cov(v10)
    delongcov = sx / m + sy / n
    return aucs, delongcov


def calc_pvalue(aucs, sigma):
    """Computes log10 of the two-sided p-value for the difference of two AUCs.

    Args:
       aucs: 1D array of AUCs
       sigma: AUC DeLong covariances
    Returns:
       log10(pvalue)
    """
    # Contrast row vector selecting "first AUC minus second AUC".
    contrast = np.array([[1, -1]])
    difference_variance = np.dot(np.dot(contrast, sigma), contrast.T)
    z = np.abs(np.diff(aucs)) / np.sqrt(difference_variance)
    # logsf is a natural log; dividing by ln(10) converts it to base 10,
    # and adding log10(2) makes the tail probability two-sided.
    natural_log_tail = scipy.stats.norm.logsf(z, loc=0, scale=1)
    return np.log10(2) + natural_log_tail / np.log(10)


def compute_ground_truth_statistics(ground_truth, sample_weight):
    """Derive the ordering that puts label-1 examples first.

    Args:
       ground_truth: array of labels; must contain exactly the values 0 and 1
       sample_weight: per-example weights, or None
    Returns:
       (order, label_1_count, ordered_sample_weight) where order sorts the
       examples so that label 1 comes first, label_1_count is the number of
       ones, and ordered_sample_weight is sample_weight in that order
       (None when no weights were given)
    """
    assert np.array_equal(np.unique(ground_truth), [0, 1])
    # Sorting the negated labels ascending places the ones before the zeros.
    positives_first = (-ground_truth).argsort()
    positive_total = int(ground_truth.sum())
    weights_in_order = (
        None if sample_weight is None else sample_weight[positives_first])
    return positives_first, positive_total, weights_in_order


def delong_roc_variance(ground_truth, predictions, sample_weight=None):
    """
    Computes ROC AUC and its DeLong variance for a single set of predictions
    Args:
       ground_truth: np.array of 0 and 1
       predictions: np.array of floats of the probability of being class 1
       sample_weight: optional per-example weights (None for unweighted)
    Returns:
       (AUC, DeLong covariance as returned by fastDeLong)
    """
    ground_truth_stats = compute_ground_truth_statistics(ground_truth, sample_weight)
    order, label_1_count, ordered_sample_weight = ground_truth_stats
    # Reorder so positives come first and add a leading axis: one classifier.
    single_classifier_predictions = predictions[np.newaxis, order]
    aucs, delongcov = fastDeLong(
        single_classifier_predictions, label_1_count, ordered_sample_weight)
    assert len(aucs) == 1, "There is a bug in the code, please forward this to the developers"
    return aucs[0], delongcov

In [3]:
# Mount Google Drive in the Colab runtime at /content/drive; the dataset read
# further down lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Column labels A1..A16, supplied because the file is read without a header row.
column_names = ['A%d' % position for position in range(1, 17)]

# Strings parsed as missing values. '?' is not in this list, so it is kept as
# a literal string that later cells compare against.
missing_value_markers = [
    '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
    '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan', 'null',
    '.'
]

credit = pd.read_csv(
    "/content/drive/My Drive/Discrétisation ICLR19/opendata/credit-screening.data",
    sep=",",
    header=None,
    names=column_names,
    na_values=missing_value_markers)

In [0]:
# Remove, in place, every row holding a value that was parsed as missing.
credit.dropna(inplace=True)
# Replace the target column A16 by its integer class codes.
credit.A16 = sklearn.preprocessing.LabelEncoder().fit_transform(credit.A16)


In [0]:
# Renumber the rows after dropna and discard the old index. Later cells pass
# index labels to .iloc, which only works when labels equal row positions.
credit.reset_index(inplace=True, drop=True)

# Establishing 1st benchmark: naïve logistic regression

## Logistic Regression

### Label Encoding

In [0]:
# One fitted LabelEncoder per categorical column, stored in column order.
credit_label_encoders = []

credit_encoded = credit.copy()

for j in ['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']:
    # Cast to str once so fit and transform see exactly the same values.
    column_as_text = credit[j].astype(str)
    temp = sk.preprocessing.LabelEncoder()
    temp.fit(column_as_text)
    credit_label_encoders.append(temp)
    credit_encoded[j] = temp.transform(column_as_text)

### One-hot encoding

In [0]:
# Label-encoded categorical columns that get replaced by indicator columns.
one_hot_source_columns = ['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

credit_one_hot_encoder = sk.preprocessing.OneHotEncoder(
    categories='auto', sparse=False, handle_unknown="ignore")
credit_one_hot_encoder.fit(credit_encoded[one_hot_source_columns])

# Indicator columns, carrying the same row index as the encoded frame so that
# the column-wise concatenation below aligns row by row.
one_hot_indicator_frame = pd.DataFrame(
    credit_one_hot_encoder.transform(credit_encoded[one_hot_source_columns]),
    index=credit_encoded.index)

# Remaining columns (continuous features and A16) followed by the indicators.
credit_one_hot_encoded = pd.concat(
    [credit_encoded.drop(one_hot_source_columns, axis=1), one_hot_indicator_frame],
    axis=1)

### Data split

In [0]:
import sklearn.model_selection

In [0]:
# Split into 67% train / 33% test with a fixed seed; features are every column
# except the target A16.
credit_features_train, credit_features_test, credit_perf_train, credit_perf_test = sk.model_selection.train_test_split(
    credit_one_hot_encoded.drop('A16', axis=1),
    credit_one_hot_encoded.A16,
    test_size=0.33,
    random_state=1)

In [0]:
# Reuse the rows chosen by the train/test split on the label-encoded frame.
# The index labels are passed to .iloc, i.e. they are used as row positions.
nn_train_rows = credit_encoded.iloc[credit_features_train.index, :]
nn_test_rows = credit_encoded.iloc[credit_features_test.index, :]

credit_nn_features_train = nn_train_rows.drop('A16', axis=1)
credit_nn_features_test = nn_test_rows.drop('A16', axis=1)
credit_nn_perf_train = nn_train_rows.A16
credit_nn_perf_test = nn_test_rows.A16

### LR on train data

In [0]:
import sklearn.linear_model

In [18]:
credit_naive_LR = sk.linear_model.LogisticRegression(C=1e20, tol=1e-8, solver="newton-cg")
# Keep only the training rows in which no feature equals the literal '?'.
naive_train_rows_kept = (credit_features_train == '?').sum(axis=1) == 0
credit_naive_LR.fit(
    credit_features_train[naive_train_rows_kept],
    credit_perf_train[naive_train_rows_kept])



LogisticRegression(C=1e+20, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='newton-cg',
          tol=1e-08, verbose=0, warm_start=False)

### Application of learnt LR on test data

In [19]:
# Gini = 2 * AUC - 1, on the test rows in which no feature equals '?'.
naive_test_rows_kept = (credit_features_test == '?').sum(axis=1) == 0
naive_test_scores = credit_naive_LR.predict_proba(
    credit_features_test[naive_test_rows_kept])[:, 1]
2 * sk.metrics.roc_auc_score(
    credit_perf_test[naive_test_rows_kept],
    naive_test_scores) - 1

0.8131016270551155

In [20]:
# DeLong confidence interval for the naive LR model on the test set.
alpha = .95  # confidence level of the interval (not the significance level)
# Test rows in which no feature equals the literal '?'.
naive_ci_rows_kept = (credit_features_test == '?').sum(axis=1) == 0
y_pred = credit_naive_LR.predict_proba(credit_features_test[naive_ci_rows_kept])[:, 1]
y_true = credit_perf_test[naive_ci_rows_kept]

auc, auc_cov = delong_roc_variance(
    y_true,
    y_pred)

auc_std = np.sqrt(auc_cov)
# Lower and upper quantile levels, e.g. [0.025, 0.975] for alpha = .95.
lower_upper_q = np.abs(np.array([0, 1]) - (1 - alpha) / 2)

ci = stats.norm.ppf(
    lower_upper_q,
    loc=auc,
    scale=auc_std)

# An AUC lies in [0, 1]: clamp both ends of the normal-approximation interval
# (previously only the upper end was clamped).
ci = np.clip(ci, 0, 1)

print('Gini:', 2*auc-1)
print('AUC COV:', auc_cov)
print('95% Gini CI:', 2*ci-1)

Gini: 0.8131016270551152
AUC COV: 0.0006014770611129498
95% Gini CI: [0.71696528 0.90923798]


# Establishing 2nd benchmark: MDLP disc + Chi2 grouping

## MDLP disc

In [23]:
# IPython shell escape: install the package that provides the
# `mdlp.discretization` module imported in the next cell.
!pip install mdlp-discretization

Collecting mdlp-discretization
  Downloading https://files.pythonhosted.org/packages/ab/6f/96722189bc15a9603c5b0f9ff223534683eae75130e8a67eac407ba7c6bd/mdlp_discretization-0.3.2-cp36-cp36m-manylinux1_x86_64.whl (189kB)
    100% |████████████████████████████████| 194kB 6.9MB/s 
Installing collected packages: mdlp-discretization
Successfully installed mdlp-discretization-0.3.2


In [0]:
from mdlp.discretization import MDLP
# Discretizer for the continuous columns, created with default settings;
# it is fitted on the training rows in the next cell.
transformer_cont_credit = MDLP()

In [25]:
# Fit the discretizer on the continuous columns of the training rows in which
# no feature equals '?'.
mdlp_fit_columns = ['A2', 'A3', 'A8', 'A11', 'A14', 'A15']
mdlp_fit_rows_kept = (credit_features_train == '?').sum(axis=1) == 0
transformer_cont_credit.fit(
    credit_nn_features_train[mdlp_fit_columns][mdlp_fit_rows_kept],
    credit_nn_perf_train[mdlp_fit_rows_kept])



MDLP(continuous_features=None, dtype=<class 'int'>, min_depth=0,
   precision=0.0001, random_state=None)

In [26]:
score_credit_MDLP_one_hot_encoder = sk.preprocessing.OneHotEncoder(
    categories='auto', sparse=False, handle_unknown="ignore")
# Learn the indicator columns from the discretized continuous features of the
# training rows in which no feature equals '?'.
mdlp_encoder_columns = ['A2', 'A3', 'A8', 'A11', 'A14', 'A15']
mdlp_encoder_rows_kept = (credit_features_train == '?').sum(axis=1) == 0
score_credit_MDLP_one_hot_encoder.fit(
    transformer_cont_credit.transform(
        credit_nn_features_train[mdlp_encoder_columns][mdlp_encoder_rows_kept]))

OneHotEncoder(categorical_features=None, categories='auto',
       dtype=<class 'numpy.float64'>, handle_unknown='ignore',
       n_values=None, sparse=False)

## Grouping

In [0]:
def chi2_test(liste):
    try:
        return sp.stats.chi2_contingency(liste)[1]
    except Exception:
        return 1

In [28]:
# Greedy chi-square grouping of categorical levels on the training rows.
# For each variable, repeatedly merge the pair of levels whose target
# distributions are the most similar (largest chi-square p-value) until the
# best p-value is no longer above 0.05 or a single level remains.
# `d` maps each variable to its final list of (possibly merged) levels; merged
# levels are named by joining the member levels with ' - '.
credit_train_grouped = credit.iloc[credit_features_train.index, :].copy()
d = dict((x, []) for x in ['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13'])

for var in ['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']:

    credit_train_grouped[var] = credit_train_grouped[var].astype(str)
    d[var] = [x for x in np.unique(credit_train_grouped[var])]
    p_value = 1

    while True:
        current_levels = np.unique(credit_train_grouped[var])
        if len(current_levels) <= 1:
            break

        # Column 2 of freq_table holds the row counts per (level, A16) pair.
        freq_table = credit_train_grouped.groupby([var, 'A16']).size().reset_index()
        # Every ordered pair of distinct levels.
        liste_paires_modalites = [
            [a, b] for a in current_levels for b in current_levels if b != a
        ]
        # p-value of the test comparing the target counts of both levels.
        liste_chi2 = [
            chi2_test([
                freq_table.iloc[np.isin(freq_table[var], pair[0]), 2],
                freq_table.iloc[np.isin(freq_table[var], pair[1]), 2],
            ])
            for pair in liste_paires_modalites
        ]
        p_value = max(liste_chi2)

        # Written as "not >" so that a NaN p-value also stops the loop.
        if not p_value > 0.05:
            break

        # First pair reaching the maximum p-value: the most similar levels.
        best_pair = liste_paires_modalites[np.argmax(np.equal(liste_chi2, p_value))]
        merged_level = best_pair[0] + ' - ' + best_pair[1]

        # Single .loc assignment instead of chained `df[var].iloc[...] = ...`,
        # which raises SettingWithCopyWarning and does not modify the frame
        # when pandas Copy-on-Write is active.
        rows_to_merge = np.isin(credit_train_grouped[var], best_pair)
        credit_train_grouped.loc[rows_to_merge, var] = merged_level

        d[var].remove(best_pair[0])
        d[var].remove(best_pair[1])
        d[var].append(str(merged_level))
        print('Feature '+var+ ' - levels merged : '+str(best_pair))


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Feature A1 - levels merged : ['?', 'a']
Feature A1 - levels merged : ['? - a', 'b']
Feature A4 - levels merged : ['?', 'l']
Feature A4 - levels merged : ['u', '? - l']
Feature A5 - levels merged : ['?', 'gg']
Feature A5 - levels merged : ['g', '? - gg']
Feature A6 - levels merged : ['?', 'e']
Feature A6 - levels merged : ['? - e', 'j']
Feature A6 - levels merged : ['? - e - j', 'r']
Feature A6 - levels merged : ['? - e - j - r', 'w']
Feature A6 - levels merged : ['ff', 'k']
Feature A6 - levels merged : ['d', 'ff - k']
Feature A6 - levels merged : ['aa', 'm']
Feature A6 - levels merged : ['d - ff - k', 'i']
Feature A6 - levels merged : ['aa - m', 'c']
Feature A6 - levels merged : ['cc', 'x']
Feature A6 - levels merged : ['? - e - j - r - w', 'aa - m - c']
Feature A6 - levels merged : ['cc - x', 'q']
Feature A7 - levels merged : ['?', 'dd']
Feature A7 - levels merged : ['? - dd', 'o']
Feature A7 - levels merged : ['? - dd - o', 'j']
Feature A7 - levels merged : ['? - dd - o - j', 'n']
Fe

## Test time

In [0]:
# Discretize, then one-hot encode, the continuous columns of the training rows
# in which no feature equals '?'.
train_mdlp_columns = ['A2', 'A3', 'A8', 'A11', 'A14', 'A15']
train_mdlp_rows_kept = (credit_features_train == '?').sum(axis=1) == 0
train_mdlp_discretized = transformer_cont_credit.transform(
    credit_nn_features_train[train_mdlp_columns][train_mdlp_rows_kept])
credit_train_mdlp = score_credit_MDLP_one_hot_encoder.transform(train_mdlp_discretized)

In [0]:
# One fitted LabelEncoder per grouped categorical column, in column order.
credit_train_grouped_label_encoders = []

credit_train_grouped_encoded = credit_train_grouped.copy()

for j in ['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']:
    # Cast to str once so fit and transform see exactly the same values.
    grouped_column_as_text = credit_train_grouped_encoded[j].astype(str)
    temp = sk.preprocessing.LabelEncoder()
    temp.fit(grouped_column_as_text)
    credit_train_grouped_label_encoders.append(temp)
    credit_train_grouped_encoded[j] = temp.transform(grouped_column_as_text)

In [31]:
score_credit_CHI2_one_hot_encoder = sk.preprocessing.OneHotEncoder(
    categories='auto', sparse=False, handle_unknown="ignore")

# Learn the indicator columns from the label-encoded grouped categoricals.
chi2_encoder_columns = ['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']
score_credit_CHI2_one_hot_encoder.fit(
    credit_train_grouped_encoded[chi2_encoder_columns])

OneHotEncoder(categorical_features=None, categories='auto',
       dtype=<class 'numpy.float64'>, handle_unknown='ignore',
       n_values=None, sparse=False)

In [0]:
# One-hot encode the grouped categoricals of the training rows in which no
# feature equals '?'.
train_chi2_columns = ['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']
train_chi2_rows_kept = (credit_features_train == '?').sum(axis=1) == 0
credit_train_chi2 = score_credit_CHI2_one_hot_encoder.transform(
    credit_train_grouped_encoded[train_chi2_columns][train_chi2_rows_kept])

In [0]:
# Training design matrix: grouped-categorical indicators followed by the
# discretized-continuous indicators, joined column-wise.
credit_adhoc_train = np.concatenate((credit_train_chi2,credit_train_mdlp),axis=1)

In [34]:
credit_adhoc_LR = sk.linear_model.LogisticRegression(C=1e20, tol=1e-8, solver="newton-cg")
# Target values restricted to the same rows as the design matrix: training
# rows in which no feature equals '?'.
adhoc_train_rows_kept = (credit_features_train == '?').sum(axis=1) == 0
adhoc_train_target = credit_train_grouped_encoded['A16'][adhoc_train_rows_kept]
credit_adhoc_LR.fit(credit_adhoc_train, adhoc_train_target)

LogisticRegression(C=1e+20, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='newton-cg',
          tol=1e-08, verbose=0, warm_start=False)

In [35]:
# Apply the level groupings learnt on the training rows (stored in `d`) to the
# test rows: every raw level that belongs to a merged level is replaced by the
# merged level's name.
credit_test_grouped = credit.iloc[credit_features_test.index, :].copy()

for var in ['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']:

    credit_test_grouped[var] = credit_test_grouped[var].astype(str)

    for x in d[var]:
        # Merged levels are named by joining their members with ' - '.
        if ' - ' in x:
            liste_modalites = x.split(' - ')
            # Single .loc assignment instead of chained
            # `df[var].iloc[...] = ...`, which raises SettingWithCopyWarning
            # and does not modify the frame when Copy-on-Write is active.
            rows_in_group = np.isin(credit_test_grouped[var], liste_modalites)
            credit_test_grouped.loc[rows_in_group, var] = x
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [0]:
credit_test_grouped_encoded = credit_test_grouped.copy()

# Encode each grouped test column with the encoder fitted on the matching
# training column; the encoders are stored in this same column order.
for indice, j in enumerate(['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']):
    train_fitted_encoder = credit_train_grouped_label_encoders[indice]
    credit_test_grouped_encoded[j] = train_fitted_encoder.transform(
        credit_test_grouped_encoded[j].astype(str))
    

In [0]:
# One-hot encode the grouped categoricals of all test rows with the encoder
# fitted on the training data; rows containing '?' are filtered out later,
# when the test design matrix is assembled.
credit_test_chi2 = score_credit_CHI2_one_hot_encoder.transform(credit_test_grouped_encoded[[
            'A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13'
        ]])

In [0]:
# Discretize, then one-hot encode, the continuous columns of the test rows in
# which no feature equals '?'.
test_mdlp_columns = ['A2', 'A3', 'A8', 'A11', 'A14', 'A15']
test_mdlp_rows_kept = (credit_features_test == '?').sum(axis=1) == 0
test_mdlp_discretized = transformer_cont_credit.transform(
    credit_nn_features_test[test_mdlp_columns][test_mdlp_rows_kept])
credit_test_mdlp = score_credit_MDLP_one_hot_encoder.transform(test_mdlp_discretized)

In [0]:
# Test design matrix: grouped-categorical indicators (restricted to the rows
# in which no feature equals '?') followed by the discretized-continuous
# indicators, joined column-wise.
adhoc_test_rows_kept = (credit_features_test == '?').sum(axis=1) == 0
adhoc_test_parts = (credit_test_chi2[adhoc_test_rows_kept], credit_test_mdlp)
credit_adhoc_test = np.concatenate(adhoc_test_parts, axis=1)


In [40]:
# Gini = 2 * AUC - 1 of the ad hoc model, on the test rows in which no feature
# equals '?'.
adhoc_gini_rows_kept = (credit_features_test == '?').sum(axis=1) == 0
adhoc_gini_scores = credit_adhoc_LR.predict_proba(credit_adhoc_test)[:, 1]
2 * sk.metrics.roc_auc_score(
    credit_test_grouped_encoded['A16'][adhoc_gini_rows_kept],
    adhoc_gini_scores) - 1

0.8792912513842746

In [41]:
# DeLong confidence interval for the ad hoc (MDLP + chi-square grouping) model
# on the test set.
alpha = .95  # confidence level of the interval (not the significance level)
y_pred = credit_adhoc_LR.predict_proba(credit_adhoc_test)[:, 1]
# Labels of the test rows in which no feature equals the literal '?'.
adhoc_ci_rows_kept = (credit_features_test == '?').sum(axis=1) == 0
y_true = credit_test_grouped_encoded['A16'][adhoc_ci_rows_kept]

auc, auc_cov = delong_roc_variance(
    y_true,
    y_pred)

auc_std = np.sqrt(auc_cov)
# Lower and upper quantile levels, e.g. [0.025, 0.975] for alpha = .95.
lower_upper_q = np.abs(np.array([0, 1]) - (1 - alpha) / 2)

ci = stats.norm.ppf(
    lower_upper_q,
    loc=auc,
    scale=auc_std)

# An AUC lies in [0, 1]: clamp both ends of the normal-approximation interval
# (previously only the upper end was clamped).
ci = np.clip(ci, 0, 1)

print('Gini:', 2*auc-1)
print('AUC COV:', auc_cov)
print('95% Gini CI:', 2*ci-1)

Gini: 0.8792912513842748
AUC COV: 0.0002828830376610758
95% Gini CI: [0.81336148 0.94522102]
