# Loading real data

In [1]:
import random
import numpy as np
import pandas as pd
import sklearn as sk
import sklearn.metrics
import scipy as sp

In [3]:
"""
Created on Tue Nov  6 10:06:52 2018

@author: yandexdataschool

Original Code found in:
https://github.com/yandexdataschool/roc_comparison

updated: Raul Sanchez-Vazquez
"""

import scipy.stats
from scipy import stats

# AUC comparison adapted from
# https://github.com/Netflix/vmaf/
def compute_midrank(x):
    """Computes midranks.
    Args:
       x - a 1D numpy array
    Returns:
       array of midranks
    """
    J = np.argsort(x)
    Z = x[J]
    N = len(x)
    T = np.zeros(N, dtype=np.float)
    i = 0
    while i < N:
        j = i
        while j < N and Z[j] == Z[i]:
            j += 1
        T[i:j] = 0.5*(i + j - 1)
        i = j
    T2 = np.empty(N, dtype=np.float)
    # Note(kazeevn) +1 is due to Python using 0-based indexing
    # instead of 1-based in the AUC formula in the paper
    T2[J] = T + 1
    return T2


def compute_midrank_weight(x, sample_weight):
    """Computes midranks.
    Args:
       x - a 1D numpy array
    Returns:
       array of midranks
    """
    J = np.argsort(x)
    Z = x[J]
    cumulative_weight = np.cumsum(sample_weight[J])
    N = len(x)
    T = np.zeros(N, dtype=np.float)
    i = 0
    while i < N:
        j = i
        while j < N and Z[j] == Z[i]:
            j += 1
        T[i:j] = cumulative_weight[i:j].mean()
        i = j
    T2 = np.empty(N, dtype=np.float)
    T2[J] = T
    return T2


def fastDeLong(predictions_sorted_transposed, label_1_count, sample_weight):
    if sample_weight is None:
        return fastDeLong_no_weights(predictions_sorted_transposed, label_1_count)
    else:
        return fastDeLong_weights(predictions_sorted_transposed, label_1_count, sample_weight)


def fastDeLong_weights(predictions_sorted_transposed, label_1_count, sample_weight):
    """
    The fast version of DeLong's method for computing the covariance of
    unadjusted AUC.
    Args:
       predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
          sorted such as the examples with label "1" are first
    Returns:
       (AUC value, DeLong covariance)
    Reference:
     @article{sun2014fast,
       title={Fast Implementation of DeLong's Algorithm for
              Comparing the Areas Under Correlated Receiver Oerating Characteristic Curves},
       author={Xu Sun and Weichao Xu},
       journal={IEEE Signal Processing Letters},
       volume={21},
       number={11},
       pages={1389--1393},
       year={2014},
       publisher={IEEE}
     }
    """
    # Short variables are named as they are in the paper
    m = label_1_count
    n = predictions_sorted_transposed.shape[1] - m
    positive_examples = predictions_sorted_transposed[:, :m]
    negative_examples = predictions_sorted_transposed[:, m:]
    k = predictions_sorted_transposed.shape[0]

    tx = np.empty([k, m], dtype=np.float)
    ty = np.empty([k, n], dtype=np.float)
    tz = np.empty([k, m + n], dtype=np.float)
    for r in range(k):
        tx[r, :] = compute_midrank_weight(positive_examples[r, :], sample_weight[:m])
        ty[r, :] = compute_midrank_weight(negative_examples[r, :], sample_weight[m:])
        tz[r, :] = compute_midrank_weight(predictions_sorted_transposed[r, :], sample_weight)
    total_positive_weights = sample_weight[:m].sum()
    total_negative_weights = sample_weight[m:].sum()
    pair_weights = np.dot(sample_weight[:m, np.newaxis], sample_weight[np.newaxis, m:])
    total_pair_weights = pair_weights.sum()
    aucs = (sample_weight[:m]*(tz[:, :m] - tx)).sum(axis=1) / total_pair_weights
    v01 = (tz[:, :m] - tx[:, :]) / total_negative_weights
    v10 = 1. - (tz[:, m:] - ty[:, :]) / total_positive_weights
    sx = np.cov(v01)
    sy = np.cov(v10)
    delongcov = sx / m + sy / n
    return aucs, delongcov


def fastDeLong_no_weights(predictions_sorted_transposed, label_1_count):
    """
    The fast version of DeLong's method for computing the covariance of
    unadjusted AUC.
    Args:
       predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
          sorted such as the examples with label "1" are first
    Returns:
       (AUC value, DeLong covariance)
    Reference:
     @article{sun2014fast,
       title={Fast Implementation of DeLong's Algorithm for
              Comparing the Areas Under Correlated Receiver Oerating
              Characteristic Curves},
       author={Xu Sun and Weichao Xu},
       journal={IEEE Signal Processing Letters},
       volume={21},
       number={11},
       pages={1389--1393},
       year={2014},
       publisher={IEEE}
     }
    """
    # Short variables are named as they are in the paper
    m = label_1_count
    n = predictions_sorted_transposed.shape[1] - m
    positive_examples = predictions_sorted_transposed[:, :m]
    negative_examples = predictions_sorted_transposed[:, m:]
    k = predictions_sorted_transposed.shape[0]

    tx = np.empty([k, m], dtype=np.float)
    ty = np.empty([k, n], dtype=np.float)
    tz = np.empty([k, m + n], dtype=np.float)
    for r in range(k):
        tx[r, :] = compute_midrank(positive_examples[r, :])
        ty[r, :] = compute_midrank(negative_examples[r, :])
        tz[r, :] = compute_midrank(predictions_sorted_transposed[r, :])
    aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n
    v01 = (tz[:, :m] - tx[:, :]) / n
    v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m
    sx = np.cov(v01)
    sy = np.cov(v10)
    delongcov = sx / m + sy / n
    return aucs, delongcov


def calc_pvalue(aucs, sigma):
    """Computes log(10) of p-values.
    Args:
       aucs: 1D array of AUCs
       sigma: AUC DeLong covariances
    Returns:
       log10(pvalue)
    """
    l = np.array([[1, -1]])
    z = np.abs(np.diff(aucs)) / np.sqrt(np.dot(np.dot(l, sigma), l.T))
    return np.log10(2) + scipy.stats.norm.logsf(z, loc=0, scale=1) / np.log(10)


def compute_ground_truth_statistics(ground_truth, sample_weight):
    assert np.array_equal(np.unique(ground_truth), [0, 1])
    order = (-ground_truth).argsort()
    label_1_count = int(ground_truth.sum())
    if sample_weight is None:
        ordered_sample_weight = None
    else:
        ordered_sample_weight = sample_weight[order]

    return order, label_1_count, ordered_sample_weight


def delong_roc_variance(ground_truth, predictions, sample_weight=None):
    """
    Computes ROC AUC variance for a single set of predictions
    Args:
       ground_truth: np.array of 0 and 1
       predictions: np.array of floats of the probability of being class 1
    """
    order, label_1_count, ordered_sample_weight = compute_ground_truth_statistics(
        ground_truth, sample_weight)
    predictions_sorted_transposed = predictions[np.newaxis, order]
    aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count, ordered_sample_weight)
    assert len(aucs) == 1, "There is a bug in the code, please forward this to the developers"
    return aucs[0], delongcov

In [4]:
column_names = [
    'timestamp', 'cylinder', 'customer', 'job', 'grain', 'ink', 'proof',
    'blade', 'cylinder1', 'paper', 'ink2', 'direct', 'solvent', 'type_cyl',
    'press', 'press2', 'unit', 'cylinder2', 'paper2', 'plating', 'proof2',
    'viscosity', 'caliper', 'ink3', 'humifity', 'roughness', 'blade2',
    'varnish', 'press3', 'ink4', 'solvent2', 'ESA', 'ESA2', 'wax', 'hardener',
    'roller', 'current', 'anode', 'chrome', 'band'
]

bands = pd.read_csv(
    "~/Google Drive/Discrétisation ICLR19/opendata/bands.data",
    sep=",",
    names=column_names,
    na_values=[
        '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
        '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan', 'null',
        '.', '?'
    ])

In [5]:
bands.drop(["timestamp"],axis=1,inplace=True)

In [6]:
for col in [    'cylinder',
                'customer',
                'grain',
                'ink',
                'proof',
                'blade',
                'cylinder1',
                'paper',
                'ink2',
                'direct',
                'solvent',
                'type_cyl',
                'press',
               'cylinder2',
               'paper2',
               'caliper',
               'band']:
    
    
    bands[col] = bands[col].str.upper()


In [7]:
bands[[
    'cylinder', 'customer', 'grain', 'ink', 'proof', 'blade', 'cylinder1',
    'paper', 'ink2', 'direct', 'solvent', 'type_cyl', 'press', 'cylinder2',
    'paper2', 'caliper'
]] = bands[[
    'cylinder', 'customer', 'grain', 'ink', 'proof', 'blade', 'cylinder1',
    'paper', 'ink2', 'direct', 'solvent', 'type_cyl', 'press', 'cylinder2',
    'paper2', 'caliper'
]].fillna(value="missing")

In [8]:
bands.dropna(inplace=True)

In [9]:
bands.band = sklearn.preprocessing.LabelEncoder().fit_transform(bands.band)

In [10]:
bands[[
    'press2', 'plating', 'proof2', 'viscosity', 'caliper', 'ink3', 'humifity',
    'roughness', 'blade2', 'varnish', 'press3', 'ink4', 'solvent2', 'ESA',
    'ESA2', 'wax', 'hardener', 'roller', 'current', 'anode', 'chrome'
]] = bands[[    'press2', 'plating', 'proof2', 'viscosity', 'caliper', 'ink3', 'humifity',
    'roughness', 'blade2', 'varnish', 'press3', 'ink4', 'solvent2', 'ESA',
    'ESA2', 'wax', 'hardener', 'roller', 'current', 'anode', 'chrome'
]].fillna(value=0.0)

In [11]:
bands.reset_index(inplace=True, drop=True)

# Establishing 1st benchmark: naïve logistic regression

## Logistic Regression

### Label Encoding

In [12]:
auto_label_encoders = []

bands_encoded = bands.copy()

for j in [
            'cylinder', 'customer', 'grain', 'ink', 'proof', 'blade', 'cylinder1',
    'paper', 'ink2', 'direct', 'solvent', 'type_cyl', 'press', 'cylinder2',
    'paper2', 'caliper'

]:
    temp = sk.preprocessing.LabelEncoder()
    temp.fit(bands[j].astype(str))
    auto_label_encoders.append(temp)
    bands_encoded[j] = temp.transform(bands[j].astype(str))

### One-hot encoding

In [13]:
bands_one_hot_encoder = sk.preprocessing.OneHotEncoder(categories='auto',sparse=False,handle_unknown="ignore")
bands_one_hot_encoder.fit(bands_encoded[[
        'cylinder', 'customer', 'grain', 'ink', 'proof', 'blade', 'cylinder1',
    'paper', 'ink2', 'direct', 'solvent', 'type_cyl', 'press', 'cylinder2',
    'paper2', 'caliper'

]])
bands_one_hot_encoded = bands_encoded.copy()
bands_one_hot_encoded.drop(
    [    'cylinder', 'customer', 'grain', 'ink', 'proof', 'blade', 'cylinder1',
    'paper', 'ink2', 'direct', 'solvent', 'type_cyl', 'press', 'cylinder2',
    'paper2', 'caliper'
],
    axis=1,
    inplace=True)
bands_one_hot_encoded = pd.concat(
    [
        bands_one_hot_encoded,
        pd.DataFrame(
            bands_one_hot_encoder.transform(bands_encoded[[
                    'cylinder', 'customer', 'grain', 'ink', 'proof', 'blade', 'cylinder1',
    'paper', 'ink2', 'direct', 'solvent', 'type_cyl', 'press', 'cylinder2',
    'paper2', 'caliper'

            ]]),
            index=bands_one_hot_encoded.index)
    ],
    axis=1)

### Data split

In [14]:
import sklearn.model_selection

In [15]:
bands_features_train, bands_features_test, bands_perf_train, bands_perf_test = sk.model_selection.train_test_split(
    bands_one_hot_encoded.drop('band', axis=1),
    bands_one_hot_encoded.band,
    test_size=0.33,
    random_state=1)

In [16]:
bands_nn_features_train = bands_encoded.iloc[
    bands_features_train.index, :].drop(
        'band', axis=1)
bands_nn_features_test = bands_encoded.iloc[bands_features_test.index, :].drop(
    'band', axis=1)
bands_nn_perf_train = bands_encoded.iloc[
    bands_features_train.index, :].band
bands_nn_perf_test = bands_encoded.iloc[bands_features_test.index, :].band

### LR on train data

In [17]:
import sklearn.linear_model

In [18]:
bands_naive_LR = sk.linear_model.LogisticRegression(C=1e20, tol=1e-8, solver="newton-cg")
bands_naive_LR.fit(
    bands_features_train[bands_features_train.isna().sum(axis=1) == 0],
    bands_perf_train[bands_features_train.isna().sum(axis=1) == 0])



LogisticRegression(C=1e+20, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='newton-cg',
          tol=1e-08, verbose=0, warm_start=False)

### Application of learnt LR on test data

In [19]:
2 * sk.metrics.roc_auc_score(
    bands_perf_test,
    bands_naive_LR.predict_proba(bands_features_test)[:, 1]) - 1

0.4833333333333334

In [20]:
alpha = .95
y_pred = bands_naive_LR.predict_proba(bands_features_test)[:, 1]
y_true = bands_perf_test

auc, auc_cov = delong_roc_variance(
    y_true,
    y_pred)

auc_std = np.sqrt(auc_cov)
lower_upper_q = np.abs(np.array([0, 1]) - (1 - alpha) / 2)

ci = stats.norm.ppf(
    lower_upper_q,
    loc=auc,
    scale=auc_std)

ci[ci > 1] = 1

print('Gini:', 2*auc-1)
print('AUC COV:', auc_cov)
print('95% Gini CI:', 2*ci-1)

Gini: 0.48333333333333317
AUC COV: 0.0020661063989521437
95% Gini CI: [0.30515519 0.66151148]


# Establishing 2nd benchmark: MDLP disc + Chi2 grouping

## MDLP disc

In [21]:
from mdlp.discretization import MDLP
transformer_cont_bands = MDLP()

In [22]:
transformer_cont_bands.fit(
    bands_nn_features_train[[
'press2', 'plating', 'proof2', 'viscosity', 'caliper', 'ink3', 'humifity',
    'roughness', 'blade2', 'varnish', 'press3', 'ink4', 'solvent2', 'ESA',
    'ESA2', 'wax', 'hardener', 'roller', 'current', 'anode', 'chrome'    ]], bands_nn_perf_train)

MDLP(continuous_features=None, min_depth=0, random_state=None, shuffle=True)

In [23]:
score_bands_MDLP_one_hot_encoder = sk.preprocessing.OneHotEncoder(categories='auto',sparse=False,handle_unknown="ignore")
score_bands_MDLP_one_hot_encoder.fit(
    transformer_cont_bands.transform(bands_nn_features_train[[
'press2', 'plating', 'proof2', 'viscosity', 'caliper', 'ink3', 'humifity',
    'roughness', 'blade2', 'varnish', 'press3', 'ink4', 'solvent2', 'ESA',
    'ESA2', 'wax', 'hardener', 'roller', 'current', 'anode', 'chrome'    ]]))

OneHotEncoder(categorical_features=None, categories='auto',
       dtype=<class 'numpy.float64'>, handle_unknown='ignore',
       n_values=None, sparse=False)

## Grouping

In [24]:
def chi2_test(liste):
    try:
        return sp.stats.chi2_contingency(liste)[1]
    except Exception:
        return 1

In [26]:
bands_train_grouped = bands.iloc[bands_features_train.index, :].copy()
d = dict((x, []) for x in [    'cylinder', 'customer', 'grain', 'ink', 'proof', 'blade', 'cylinder1',
    'paper', 'ink2', 'direct', 'solvent', 'type_cyl', 'press', 'cylinder2',
    'paper2', 'caliper'
])

for var in [    'cylinder', 'customer', 'grain', 'ink', 'proof', 'blade', 'cylinder1',
    'paper', 'ink2', 'direct', 'solvent', 'type_cyl', 'press', 'cylinder2',
    'paper2', 'caliper'
]:
    
    bands_train_grouped[var] = bands_train_grouped[var].astype(str)
    d[var] = [x for x in np.unique(bands_train_grouped[var])]
    p_value = 1

    while(p_value>0.05):
        if len(np.unique(bands_train_grouped[var]))>1:
            freq_table = bands_train_grouped.groupby([var,'band']).size().reset_index()
            liste_paires_modalites = [[a,b] for a in np.unique(bands_train_grouped[var]) for b in np.delete(np.unique(bands_train_grouped[var]),np.where(np.unique(bands_train_grouped[var])==a))]
            liste_chi2 = [chi2_test([freq_table.iloc[np.in1d(freq_table[var],pair[0]),2],freq_table.iloc[np.in1d(freq_table[var],pair[1]),2]]) for pair in liste_paires_modalites]
            p_value = max(liste_chi2)
        else: break

        if (p_value>0.05 and len(np.unique(bands_train_grouped[var]))>1):
            bands_train_grouped[var].iloc[np.in1d(bands_train_grouped[var],liste_paires_modalites[np.argmax(np.equal(liste_chi2,p_value))])] = liste_paires_modalites[np.argmax(np.equal(liste_chi2,p_value))][0] + ' - ' + liste_paires_modalites[np.argmax(np.equal(liste_chi2,p_value))][1]
            d[var].remove(liste_paires_modalites[np.argmax(np.equal(liste_chi2,p_value))][0])
            d[var].remove(liste_paires_modalites[np.argmax(np.equal(liste_chi2,p_value))][1])
            d[var].append(str(liste_paires_modalites[np.argmax(np.equal(liste_chi2,p_value))][0] + ' - ' + liste_paires_modalites[np.argmax(np.equal(liste_chi2,p_value))][1]))
            print('Feature '+var+ ' - levels merged : '+str(liste_paires_modalites[np.argmax(np.equal(liste_chi2,p_value))]))
        else: break


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Feature cylinder - levels merged : ['AA11', 'AA17']
Feature cylinder - levels merged : ['AA11 - AA17', 'E26']
Feature cylinder - levels merged : ['AA11 - AA17 - E26', 'E303']
Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303', 'E310']
Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310', 'E69']
Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69', 'E72']
Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72', 'E74']
Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74', 'E77']
Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77', 'E86']
Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86', 'E90']
Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86 - E90', 'F067']
Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72

Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86 - E90 - F067 - F121 - F131 - F146 - F159 - F169 - F25 - F264 - F331 - F372 - F383 - F571 - F601 - F374 - F672 - F685 - F72 - F76 - F98 - G150 - G3 - G416 - G462 - G467 - G48 - G496 - G5 - G55 - G60 - G604 - G608 - G634 - G640 - G69', 'G71']
Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86 - E90 - F067 - F121 - F131 - F146 - F159 - F169 - F25 - F264 - F331 - F372 - F383 - F571 - F601 - F374 - F672 - F685 - F72 - F76 - F98 - G150 - G3 - G416 - G462 - G467 - G48 - G496 - G5 - G55 - G60 - G604 - G608 - G634 - G640 - G69 - G71', 'G84']
Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86 - E90 - F067 - F121 - F131 - F146 - F159 - F169 - F25 - F264 - F331 - F372 - F383 - F571 - F601 - F374 - F672 - F685 - F72 - F76 - F98 - G150 - G3 - G416 - G462 - G467 - G48 - G496 - G5 - G55 - G60 - G604 - G608 - G

Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86 - E90 - F067 - F121 - F131 - F146 - F159 - F169 - F25 - F264 - F331 - F372 - F383 - F571 - F601 - F374 - F672 - F685 - F72 - F76 - F98 - G150 - G3 - G416 - G462 - G467 - G48 - G496 - G5 - G55 - G60 - G604 - G608 - G634 - G640 - G69 - G71 - G84 - G95 - G98 - I301 - I303 - I309 - I317 - F679 - G407 - G518 - I320 - I337 - I343 - I349 - I353 - I354 - I358 - J34 - J42 - J43', 'J582']
Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86 - E90 - F067 - F121 - F131 - F146 - F159 - F169 - F25 - F264 - F331 - F372 - F383 - F571 - F601 - F374 - F672 - F685 - F72 - F76 - F98 - G150 - G3 - G416 - G462 - G467 - G48 - G496 - G5 - G55 - G60 - G604 - G608 - G634 - G640 - G69 - G71 - G84 - G95 - G98 - I301 - I303 - I309 - I317 - F679 - G407 - G518 - I320 - I337 - I343 - I349 - I353 - I354 - I358 - J34 - J42 - J43 - J582', 'J6']
Feature cylinder - levels merged 

Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86 - E90 - F067 - F121 - F131 - F146 - F159 - F169 - F25 - F264 - F331 - F372 - F383 - F571 - F601 - F374 - F672 - F685 - F72 - F76 - F98 - G150 - G3 - G416 - G462 - G467 - G48 - G496 - G5 - G55 - G60 - G604 - G608 - G634 - G640 - G69 - G71 - G84 - G95 - G98 - I301 - I303 - I309 - I317 - F679 - G407 - G518 - I320 - I337 - I343 - I349 - I353 - I354 - I358 - J34 - J42 - J43 - J582 - J6 - J60 - J68 - M200 - M374 - M402 - M432 - M45 - M65 - O14 - O15 - O21 - O4 - O6 - O7', 'O8']
Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86 - E90 - F067 - F121 - F131 - F146 - F159 - F169 - F25 - F264 - F331 - F372 - F383 - F571 - F601 - F374 - F672 - F685 - F72 - F76 - F98 - G150 - G3 - G416 - G462 - G467 - G48 - G496 - G5 - G55 - G60 - G604 - G608 - G634 - G640 - G69 - G71 - G84 - G95 - G98 - I301 - I303 - I309 - I317 - F679 - G407 - G518 - I320 - I337 - I343

Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86 - E90 - F067 - F121 - F131 - F146 - F159 - F169 - F25 - F264 - F331 - F372 - F383 - F571 - F601 - F374 - F672 - F685 - F72 - F76 - F98 - G150 - G3 - G416 - G462 - G467 - G48 - G496 - G5 - G55 - G60 - G604 - G608 - G634 - G640 - G69 - G71 - G84 - G95 - G98 - I301 - I303 - I309 - I317 - F679 - G407 - G518 - I320 - I337 - I343 - I349 - I353 - I354 - I358 - J34 - J42 - J43 - J582 - J6 - J60 - J68 - M200 - M374 - M402 - M432 - M45 - M65 - O14 - O15 - O21 - O4 - O6 - O7 - O8 - R1 - R15 - R17 - R2 - R22 - R23 - R3 - R32 - R34 - R43 - R5 - R6 - R7', 'R9']
Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86 - E90 - F067 - F121 - F131 - F146 - F159 - F169 - F25 - F264 - F331 - F372 - F383 - F571 - F601 - F374 - F672 - F685 - F72 - F76 - F98 - G150 - G3 - G416 - G462 - G467 - G48 - G496 - G5 - G55 - G60 - G604 - G608 - G634 - G640 - G69 - G71 - G84 - G9

Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86 - E90 - F067 - F121 - F131 - F146 - F159 - F169 - F25 - F264 - F331 - F372 - F383 - F571 - F601 - F374 - F672 - F685 - F72 - F76 - F98 - G150 - G3 - G416 - G462 - G467 - G48 - G496 - G5 - G55 - G60 - G604 - G608 - G634 - G640 - G69 - G71 - G84 - G95 - G98 - I301 - I303 - I309 - I317 - F679 - G407 - G518 - I320 - I337 - I343 - I349 - I353 - I354 - I358 - J34 - J42 - J43 - J582 - J6 - J60 - J68 - M200 - M374 - M402 - M432 - M45 - M65 - O14 - O15 - O21 - O4 - O6 - O7 - O8 - R1 - R15 - R17 - R2 - R22 - R23 - R3 - R32 - R34 - R43 - R5 - R6 - R7 - R9 - T117 - T133 - T178 - T218 - T234 - T244 - T245 - T312 - T313 - T78 - T8', 'W357']
Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86 - E90 - F067 - F121 - F131 - F146 - F159 - F169 - F25 - F264 - F331 - F372 - F383 - F571 - F601 - F374 - F672 - F685 - F72 - F76 - F98 - G150 - G3 - G416 - G462 - G467

Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86 - E90 - F067 - F121 - F131 - F146 - F159 - F169 - F25 - F264 - F331 - F372 - F383 - F571 - F601 - F374 - F672 - F685 - F72 - F76 - F98 - G150 - G3 - G416 - G462 - G467 - G48 - G496 - G5 - G55 - G60 - G604 - G608 - G634 - G640 - G69 - G71 - G84 - G95 - G98 - I301 - I303 - I309 - I317 - F679 - G407 - G518 - I320 - I337 - I343 - I349 - I353 - I354 - I358 - J34 - J42 - J43 - J582 - J6 - J60 - J68 - M200 - M374 - M402 - M432 - M45 - M65 - O14 - O15 - O21 - O4 - O6 - O7 - O8 - R1 - R15 - R17 - R2 - R22 - R23 - R3 - R32 - R34 - R43 - R5 - R6 - R7 - R9 - T117 - T133 - T178 - T218 - T234 - T244 - T245 - T312 - T313 - T78 - T8 - W357 - W364 - W406 - W717 - W785 - W90 - W92 - X001 - X103 - X108 - X118', 'X12']
Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86 - E90 - F067 - F121 - F131 - F146 - F159 - F169 - F25 - F264 - F331 - F372 - F383 - F571 - F6

Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86 - E90 - F067 - F121 - F131 - F146 - F159 - F169 - F25 - F264 - F331 - F372 - F383 - F571 - F601 - F374 - F672 - F685 - F72 - F76 - F98 - G150 - G3 - G416 - G462 - G467 - G48 - G496 - G5 - G55 - G60 - G604 - G608 - G634 - G640 - G69 - G71 - G84 - G95 - G98 - I301 - I303 - I309 - I317 - F679 - G407 - G518 - I320 - I337 - I343 - I349 - I353 - I354 - I358 - J34 - J42 - J43 - J582 - J6 - J60 - J68 - M200 - M374 - M402 - M432 - M45 - M65 - O14 - O15 - O21 - O4 - O6 - O7 - O8 - R1 - R15 - R17 - R2 - R22 - R23 - R3 - R32 - R34 - R43 - R5 - R6 - R7 - R9 - T117 - T133 - T178 - T218 - T234 - T244 - T245 - T312 - T313 - T78 - T8 - W357 - W364 - W406 - W717 - W785 - W90 - W92 - X001 - X103 - X108 - X118 - X12 - X126 - X132 - X138 - X139 - X141 - X146 - X147 - X151 - X155', 'X163']
Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86 - E90 - F067 - F121 - F

Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86 - E90 - F067 - F121 - F131 - F146 - F159 - F169 - F25 - F264 - F331 - F372 - F383 - F571 - F601 - F374 - F672 - F685 - F72 - F76 - F98 - G150 - G3 - G416 - G462 - G467 - G48 - G496 - G5 - G55 - G60 - G604 - G608 - G634 - G640 - G69 - G71 - G84 - G95 - G98 - I301 - I303 - I309 - I317 - F679 - G407 - G518 - I320 - I337 - I343 - I349 - I353 - I354 - I358 - J34 - J42 - J43 - J582 - J6 - J60 - J68 - M200 - M374 - M402 - M432 - M45 - M65 - O14 - O15 - O21 - O4 - O6 - O7 - O8 - R1 - R15 - R17 - R2 - R22 - R23 - R3 - R32 - R34 - R43 - R5 - R6 - R7 - R9 - T117 - T133 - T178 - T218 - T234 - T244 - T245 - T312 - T313 - T78 - T8 - W357 - W364 - W406 - W717 - W785 - W90 - W92 - X001 - X103 - X108 - X118 - X12 - X126 - X132 - X138 - X139 - X141 - X146 - X147 - X151 - X155 - X163 - X18 - X185 - X195 - X197 - X199 - X201 - X203 - X220 - X243', 'X249']
Feature cylinder - levels merged : ['AA11 - AA17 - E2

Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86 - E90 - F067 - F121 - F131 - F146 - F159 - F169 - F25 - F264 - F331 - F372 - F383 - F571 - F601 - F374 - F672 - F685 - F72 - F76 - F98 - G150 - G3 - G416 - G462 - G467 - G48 - G496 - G5 - G55 - G60 - G604 - G608 - G634 - G640 - G69 - G71 - G84 - G95 - G98 - I301 - I303 - I309 - I317 - F679 - G407 - G518 - I320 - I337 - I343 - I349 - I353 - I354 - I358 - J34 - J42 - J43 - J582 - J6 - J60 - J68 - M200 - M374 - M402 - M432 - M45 - M65 - O14 - O15 - O21 - O4 - O6 - O7 - O8 - R1 - R15 - R17 - R2 - R22 - R23 - R3 - R32 - R34 - R43 - R5 - R6 - R7 - R9 - T117 - T133 - T178 - T218 - T234 - T244 - T245 - T312 - T313 - T78 - T8 - W357 - W364 - W406 - W717 - W785 - W90 - W92 - X001 - X103 - X108 - X118 - X12 - X126 - X132 - X138 - X139 - X141 - X146 - X147 - X151 - X155 - X163 - X18 - X185 - X195 - X197 - X199 - X201 - X203 - X220 - X243 - X249 - X25 - X251 - X253 - X264 - X271 - X273 - X281 - X291',

Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86 - E90 - F067 - F121 - F131 - F146 - F159 - F169 - F25 - F264 - F331 - F372 - F383 - F571 - F601 - F374 - F672 - F685 - F72 - F76 - F98 - G150 - G3 - G416 - G462 - G467 - G48 - G496 - G5 - G55 - G60 - G604 - G608 - G634 - G640 - G69 - G71 - G84 - G95 - G98 - I301 - I303 - I309 - I317 - F679 - G407 - G518 - I320 - I337 - I343 - I349 - I353 - I354 - I358 - J34 - J42 - J43 - J582 - J6 - J60 - J68 - M200 - M374 - M402 - M432 - M45 - M65 - O14 - O15 - O21 - O4 - O6 - O7 - O8 - R1 - R15 - R17 - R2 - R22 - R23 - R3 - R32 - R34 - R43 - R5 - R6 - R7 - R9 - T117 - T133 - T178 - T218 - T234 - T244 - T245 - T312 - T313 - T78 - T8 - W357 - W364 - W406 - W717 - W785 - W90 - W92 - X001 - X103 - X108 - X118 - X12 - X126 - X132 - X138 - X139 - X141 - X146 - X147 - X151 - X155 - X163 - X18 - X185 - X195 - X197 - X199 - X201 - X203 - X220 - X243 - X249 - X25 - X251 - X253 - X264 - X271 - X273 - X281 - X291 -

Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86 - E90 - F067 - F121 - F131 - F146 - F159 - F169 - F25 - F264 - F331 - F372 - F383 - F571 - F601 - F374 - F672 - F685 - F72 - F76 - F98 - G150 - G3 - G416 - G462 - G467 - G48 - G496 - G5 - G55 - G60 - G604 - G608 - G634 - G640 - G69 - G71 - G84 - G95 - G98 - I301 - I303 - I309 - I317 - F679 - G407 - G518 - I320 - I337 - I343 - I349 - I353 - I354 - I358 - J34 - J42 - J43 - J582 - J6 - J60 - J68 - M200 - M374 - M402 - M432 - M45 - M65 - O14 - O15 - O21 - O4 - O6 - O7 - O8 - R1 - R15 - R17 - R2 - R22 - R23 - R3 - R32 - R34 - R43 - R5 - R6 - R7 - R9 - T117 - T133 - T178 - T218 - T234 - T244 - T245 - T312 - T313 - T78 - T8 - W357 - W364 - W406 - W717 - W785 - W90 - W92 - X001 - X103 - X108 - X118 - X12 - X126 - X132 - X138 - X139 - X141 - X146 - X147 - X151 - X155 - X163 - X18 - X185 - X195 - X197 - X199 - X201 - X203 - X220 - X243 - X249 - X25 - X251 - X253 - X264 - X271 - X273 - X281 - X291 -

Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86 - E90 - F067 - F121 - F131 - F146 - F159 - F169 - F25 - F264 - F331 - F372 - F383 - F571 - F601 - F374 - F672 - F685 - F72 - F76 - F98 - G150 - G3 - G416 - G462 - G467 - G48 - G496 - G5 - G55 - G60 - G604 - G608 - G634 - G640 - G69 - G71 - G84 - G95 - G98 - I301 - I303 - I309 - I317 - F679 - G407 - G518 - I320 - I337 - I343 - I349 - I353 - I354 - I358 - J34 - J42 - J43 - J582 - J6 - J60 - J68 - M200 - M374 - M402 - M432 - M45 - M65 - O14 - O15 - O21 - O4 - O6 - O7 - O8 - R1 - R15 - R17 - R2 - R22 - R23 - R3 - R32 - R34 - R43 - R5 - R6 - R7 - R9 - T117 - T133 - T178 - T218 - T234 - T244 - T245 - T312 - T313 - T78 - T8 - W357 - W364 - W406 - W717 - W785 - W90 - W92 - X001 - X103 - X108 - X118 - X12 - X126 - X132 - X138 - X139 - X141 - X146 - X147 - X151 - X155 - X163 - X18 - X185 - X195 - X197 - X199 - X201 - X203 - X220 - X243 - X249 - X25 - X251 - X253 - X264 - X271 - X273 - X281 - X291 -

Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86 - E90 - F067 - F121 - F131 - F146 - F159 - F169 - F25 - F264 - F331 - F372 - F383 - F571 - F601 - F374 - F672 - F685 - F72 - F76 - F98 - G150 - G3 - G416 - G462 - G467 - G48 - G496 - G5 - G55 - G60 - G604 - G608 - G634 - G640 - G69 - G71 - G84 - G95 - G98 - I301 - I303 - I309 - I317 - F679 - G407 - G518 - I320 - I337 - I343 - I349 - I353 - I354 - I358 - J34 - J42 - J43 - J582 - J6 - J60 - J68 - M200 - M374 - M402 - M432 - M45 - M65 - O14 - O15 - O21 - O4 - O6 - O7 - O8 - R1 - R15 - R17 - R2 - R22 - R23 - R3 - R32 - R34 - R43 - R5 - R6 - R7 - R9 - T117 - T133 - T178 - T218 - T234 - T244 - T245 - T312 - T313 - T78 - T8 - W357 - W364 - W406 - W717 - W785 - W90 - W92 - X001 - X103 - X108 - X118 - X12 - X126 - X132 - X138 - X139 - X141 - X146 - X147 - X151 - X155 - X163 - X18 - X185 - X195 - X197 - X199 - X201 - X203 - X220 - X243 - X249 - X25 - X251 - X253 - X264 - X271 - X273 - X281 - X291 -

Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86 - E90 - F067 - F121 - F131 - F146 - F159 - F169 - F25 - F264 - F331 - F372 - F383 - F571 - F601 - F374 - F672 - F685 - F72 - F76 - F98 - G150 - G3 - G416 - G462 - G467 - G48 - G496 - G5 - G55 - G60 - G604 - G608 - G634 - G640 - G69 - G71 - G84 - G95 - G98 - I301 - I303 - I309 - I317 - F679 - G407 - G518 - I320 - I337 - I343 - I349 - I353 - I354 - I358 - J34 - J42 - J43 - J582 - J6 - J60 - J68 - M200 - M374 - M402 - M432 - M45 - M65 - O14 - O15 - O21 - O4 - O6 - O7 - O8 - R1 - R15 - R17 - R2 - R22 - R23 - R3 - R32 - R34 - R43 - R5 - R6 - R7 - R9 - T117 - T133 - T178 - T218 - T234 - T244 - T245 - T312 - T313 - T78 - T8 - W357 - W364 - W406 - W717 - W785 - W90 - W92 - X001 - X103 - X108 - X118 - X12 - X126 - X132 - X138 - X139 - X141 - X146 - X147 - X151 - X155 - X163 - X18 - X185 - X195 - X197 - X199 - X201 - X203 - X220 - X243 - X249 - X25 - X251 - X253 - X264 - X271 - X273 - X281 - X291 -

Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86 - E90 - F067 - F121 - F131 - F146 - F159 - F169 - F25 - F264 - F331 - F372 - F383 - F571 - F601 - F374 - F672 - F685 - F72 - F76 - F98 - G150 - G3 - G416 - G462 - G467 - G48 - G496 - G5 - G55 - G60 - G604 - G608 - G634 - G640 - G69 - G71 - G84 - G95 - G98 - I301 - I303 - I309 - I317 - F679 - G407 - G518 - I320 - I337 - I343 - I349 - I353 - I354 - I358 - J34 - J42 - J43 - J582 - J6 - J60 - J68 - M200 - M374 - M402 - M432 - M45 - M65 - O14 - O15 - O21 - O4 - O6 - O7 - O8 - R1 - R15 - R17 - R2 - R22 - R23 - R3 - R32 - R34 - R43 - R5 - R6 - R7 - R9 - T117 - T133 - T178 - T218 - T234 - T244 - T245 - T312 - T313 - T78 - T8 - W357 - W364 - W406 - W717 - W785 - W90 - W92 - X001 - X103 - X108 - X118 - X12 - X126 - X132 - X138 - X139 - X141 - X146 - X147 - X151 - X155 - X163 - X18 - X185 - X195 - X197 - X199 - X201 - X203 - X220 - X243 - X249 - X25 - X251 - X253 - X264 - X271 - X273 - X281 - X291 -

Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86 - E90 - F067 - F121 - F131 - F146 - F159 - F169 - F25 - F264 - F331 - F372 - F383 - F571 - F601 - F374 - F672 - F685 - F72 - F76 - F98 - G150 - G3 - G416 - G462 - G467 - G48 - G496 - G5 - G55 - G60 - G604 - G608 - G634 - G640 - G69 - G71 - G84 - G95 - G98 - I301 - I303 - I309 - I317 - F679 - G407 - G518 - I320 - I337 - I343 - I349 - I353 - I354 - I358 - J34 - J42 - J43 - J582 - J6 - J60 - J68 - M200 - M374 - M402 - M432 - M45 - M65 - O14 - O15 - O21 - O4 - O6 - O7 - O8 - R1 - R15 - R17 - R2 - R22 - R23 - R3 - R32 - R34 - R43 - R5 - R6 - R7 - R9 - T117 - T133 - T178 - T218 - T234 - T244 - T245 - T312 - T313 - T78 - T8 - W357 - W364 - W406 - W717 - W785 - W90 - W92 - X001 - X103 - X108 - X118 - X12 - X126 - X132 - X138 - X139 - X141 - X146 - X147 - X151 - X155 - X163 - X18 - X185 - X195 - X197 - X199 - X201 - X203 - X220 - X243 - X249 - X25 - X251 - X253 - X264 - X271 - X273 - X281 - X291 -

Feature cylinder - levels merged : ['AA11 - AA17 - E26 - E303 - E310 - E69 - E72 - E74 - E77 - E86 - E90 - F067 - F121 - F131 - F146 - F159 - F169 - F25 - F264 - F331 - F372 - F383 - F571 - F601 - F374 - F672 - F685 - F72 - F76 - F98 - G150 - G3 - G416 - G462 - G467 - G48 - G496 - G5 - G55 - G60 - G604 - G608 - G634 - G640 - G69 - G71 - G84 - G95 - G98 - I301 - I303 - I309 - I317 - F679 - G407 - G518 - I320 - I337 - I343 - I349 - I353 - I354 - I358 - J34 - J42 - J43 - J582 - J6 - J60 - J68 - M200 - M374 - M402 - M432 - M45 - M65 - O14 - O15 - O21 - O4 - O6 - O7 - O8 - R1 - R15 - R17 - R2 - R22 - R23 - R3 - R32 - R34 - R43 - R5 - R6 - R7 - R9 - T117 - T133 - T178 - T218 - T234 - T244 - T245 - T312 - T313 - T78 - T8 - W357 - W364 - W406 - W717 - W785 - W90 - W92 - X001 - X103 - X108 - X118 - X12 - X126 - X132 - X138 - X139 - X141 - X146 - X147 - X151 - X155 - X163 - X18 - X185 - X195 - X197 - X199 - X201 - X203 - X220 - X243 - X249 - X25 - X251 - X253 - X264 - X271 - X273 - X281 - X291 -

Feature customer - levels merged : ['AMES - BELKS - BRENDLS - BESTPROD - BURDINES - CASLIVING - CASUALLIVING - CENPURCH - CHILDWORLD - DUNNS - EXXON - GLOBAL - CHILDCRAFT - GURNEY - HANHOUSE - HANOVERHOUSE - HANOVERHSE - HANOVRHOUS - HANOVRHOUSE - HOMESHOPPING - JCP - JCPENNY - KIDSRUS', 'LAZARUS']
Feature customer - levels merged : ['AMES - BELKS - BRENDLS - BESTPROD - BURDINES - CASLIVING - CASUALLIVING - CENPURCH - CHILDWORLD - DUNNS - EXXON - GLOBAL - CHILDCRAFT - GURNEY - HANHOUSE - HANOVERHOUSE - HANOVERHSE - HANOVRHOUS - HANOVRHOUSE - HOMESHOPPING - JCP - JCPENNY - KIDSRUS - LAZARUS', 'MERVYNS']
Feature customer - levels merged : ['AMES - BELKS - BRENDLS - BESTPROD - BURDINES - CASLIVING - CASUALLIVING - CENPURCH - CHILDWORLD - DUNNS - EXXON - GLOBAL - CHILDCRAFT - GURNEY - HANHOUSE - HANOVERHOUSE - HANOVERHSE - HANOVRHOUS - HANOVRHOUSE - HOMESHOPPING - JCP - JCPENNY - KIDSRUS - LAZARUS - MERVYNS', 'NATLWILDLIFE']
Feature customer - levels merged : ['AMES - BELKS - BRENDLS - BES

## Test time

In [38]:
bands_train_mdlp = score_bands_MDLP_one_hot_encoder.transform(
    transformer_cont_bands.transform(bands_nn_features_train[[
'press2', 'plating', 'proof2', 'viscosity', 'caliper', 'ink3', 'humifity',
    'roughness', 'blade2', 'varnish', 'press3', 'ink4', 'solvent2', 'ESA',
    'ESA2', 'wax', 'hardener', 'roller', 'current', 'anode', 'chrome'    ]]))

In [30]:
bands_train_grouped_label_encoders = []

bands_train_grouped_encoded = bands_train_grouped.copy()

for j in [
            'cylinder', 'customer', 'grain', 'ink', 'proof', 'blade', 'cylinder1',
    'paper', 'ink2', 'direct', 'solvent', 'type_cyl', 'press', 'cylinder2',
    'paper2', 'caliper'

]:
    temp = sk.preprocessing.LabelEncoder()
    temp.fit(bands_train_grouped_encoded[j].astype(str))
    bands_train_grouped_label_encoders.append(temp)
    bands_train_grouped_encoded[j] = temp.transform(bands_train_grouped_encoded[j].astype(str))

In [33]:
score_bands_CHI2_one_hot_encoder = sk.preprocessing.OneHotEncoder(categories='auto',sparse=False,handle_unknown="ignore")

score_bands_CHI2_one_hot_encoder.fit(
        bands_train_grouped_encoded[[
                'cylinder', 'customer', 'grain', 'ink', 'proof', 'blade', 'cylinder1',
    'paper', 'ink2', 'direct', 'solvent', 'type_cyl', 'press', 'cylinder2',
    'paper2', 'caliper'

        ]])

OneHotEncoder(categorical_features=None, categories='auto',
       dtype=<class 'numpy.float64'>, handle_unknown='ignore',
       n_values=None, sparse=False)

In [37]:
bands_train_chi2 = score_bands_CHI2_one_hot_encoder.transform(bands_train_grouped_encoded.iloc[bands_features_train.index, :][[
                'cylinder', 'customer', 'grain', 'ink', 'proof', 'blade', 'cylinder1',
    'paper', 'ink2', 'direct', 'solvent', 'type_cyl', 'press', 'cylinder2',
    'paper2', 'caliper']])


In [39]:
bands_adhoc_train = np.concatenate((bands_train_chi2,bands_train_mdlp),axis=1)

In [40]:
bands_adhoc_LR = sk.linear_model.LogisticRegression(C=1e20, tol=1e-8, solver="newton-cg")
bands_adhoc_LR.fit(
    bands_adhoc_train,
    bands_train_grouped_encoded['band'])

LogisticRegression(C=1e+20, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='newton-cg',
          tol=1e-08, verbose=0, warm_start=False)

In [None]:
bands_test_grouped = bands.iloc[bands_features_test.index, :].copy()

for var in ['cylinder', 'customer', 'grain', 'ink', 'proof', 'blade', 'cylinder1',
    'paper', 'ink2', 'direct', 'solvent', 'type_cyl', 'press', 'cylinder2',
    'paper2', 'caliper']:
    
    bands_test_grouped[var] = bands_test_grouped[var].astype(str)

    for x in d[var]:
        if x.find(' - ')>-1:
            liste_modalites = x.split(' - ')
            bands_test_grouped[var].iloc[np.in1d(bands_test_grouped[var],liste_modalites)] = x
        

In [None]:
bands_test_grouped_encoded = bands_test_grouped.copy()

for j in ['cylinder', 'customer', 'grain', 'ink', 'proof', 'blade', 'cylinder1',
    'paper', 'ink2', 'direct', 'solvent', 'type_cyl', 'press', 'cylinder2',
    'paper2', 'caliper']:
    indice = ['cylinder', 'customer', 'grain', 'ink', 'proof', 'blade', 'cylinder1',
    'paper', 'ink2', 'direct', 'solvent', 'type_cyl', 'press', 'cylinder2',
    'paper2', 'caliper'].index(j)
    bands_test_grouped_encoded[j] = bands_train_grouped_label_encoders[indice].transform(bands_test_grouped_encoded[j].astype(str))
    

In [41]:
bands_test_chi2 = score_bands_CHI2_one_hot_encoder.transform(bands_grouped_encoded.iloc[bands_features_test.index, :][[
                'cylinder', 'customer', 'grain', 'ink', 'proof', 'blade', 'cylinder1',
    'paper', 'ink2', 'direct', 'solvent', 'type_cyl', 'press', 'cylinder2',
    'paper2', 'caliper'

        ]])

In [42]:
bands_test_mdlp = score_bands_MDLP_one_hot_encoder.transform(
    transformer_cont_bands.transform(bands_nn_features_test[[
'press2', 'plating', 'proof2', 'viscosity', 'caliper', 'ink3', 'humifity',
    'roughness', 'blade2', 'varnish', 'press3', 'ink4', 'solvent2', 'ESA',
    'ESA2', 'wax', 'hardener', 'roller', 'current', 'anode', 'chrome'    ]]))

In [43]:
bands_adhoc_test = np.concatenate(
    (bands_test_chi2, bands_test_mdlp), axis=1)

In [44]:
bands_adhoc_test.shape

(123, 48)

In [45]:
bands_adhoc_train.shape

(247, 48)

In [46]:
2 * sk.metrics.roc_auc_score(
    bands_grouped_encoded.iloc[bands_features_test.index, :]
              ['band'],
    bands_adhoc_LR.predict_proba(bands_adhoc_test)[:, 1]) - 1

0.4633333333333334

In [48]:
alpha = .95
y_pred = bands_adhoc_LR.predict_proba(bands_adhoc_test)[:, 1]
y_true = bands_perf_test

auc, auc_cov = delong_roc_variance(
    y_true,
    y_pred)

auc_std = np.sqrt(auc_cov)
lower_upper_q = np.abs(np.array([0, 1]) - (1 - alpha) / 2)

ci = stats.norm.ppf(
    lower_upper_q,
    loc=auc,
    scale=auc_std)

ci[ci > 1] = 1

print('Gini:', 2*auc-1)
print('AUC COV:', auc_cov)
print('95% Gini CI:', 2*ci-1)

Gini: 0.4633333333333336
AUC COV: 0.0020082882416991455
95% Gini CI: [0.28766596 0.63900071]
