This notebook covers feature-selection using multiple algorithms

**Game plan**

- Try with vs. without resampling methods (including with and w/o over-sampling)
- Classical feature-selection techniques: RFE, SFS, Chi2 (SelectKBest from sklearn)
- Novel feature-selection: boruta (Stephan), RelieF (Kononenko, I. 1994)
- Sensitivity-analysis based, vary each feature by a considerable range and observe relative changes in outputs, the features that produce the greatest deviation in the output considered most relevant (VARS and Sobol), this requires a fitted estimator

> The key idea in **ReliefF** is to evaluate the contribution of each feature to the
> class difference and intraclass similarity [15]. With a randomly selected data,
> the algorithm looks for the closest k hits (those with the same class label) and
> errors (those with a different class label). After that, it updates the quality of the
> contribution of the features with regard to the difference between the features values
> of the selected data and the closest ones

_Evaluating Selection Process_

- `evaluate_classifier` function
- Information-Gain per-features
- Peasrons, Feature-Assessment by Sliding Thresholds (FAST, FAIR)
- S2N (signal-to-noise correlation coefficient)


In [2]:
import pandas as pd
from pandas import DataFrame
import imblearn
from imblearn.over_sampling import ADASYN, SMOTE
from imblearn.under_sampling import InstanceHardnessThreshold, RandomUnderSampler
from imblearn.pipeline import Pipeline
import numpy as np
from pandas import DataFrame, MultiIndex
from sklearn import preprocessing
from tqdm import tqdm
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

np.logical_xnor = lambda a, b: np.logical_not(np.logical_xor(a, b))

pd.set_option("display.max_rows", None)

KEYS = ["UUID", "year_month"]
TARGET = "has_card"
ROOT = "dbfs:/mnt/zcae070001/2XG_Tables/CC/DM/"
ROOT_MISSING_REMOVE = ROOT + "high_missing_remove/"
ROOT_VAR_REMOVE = ROOT + "low_var_remove/"
ROOT_CHI_REMOVE = ROOT + "chi_remove/"


In [3]:
from sklearn.datasets import load_breast_cancer
from imblearn.datasets import make_imbalance
from sklearn.model_selection import train_test_split

# assume this is where I load data
X, y = load_breast_cancer(return_X_y=True)
X.shape, y.shape
y.sum(), len(y)


(357, 569)

In [4]:
from xgboost import XGBClassifier
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

clf = XGBClassifier()
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)




  from pandas import MultiIndex, Int64Index


# Functions

Two functions:
1. For evaluating a *SINGLE* classifier (more detail, DiscriminationThreshold, etc)
2. For *COMPARING* classifiers, less detail, side-by-side bar plots, etc

## Evaluation

In [5]:
def cumulative_gain_curve(y_true, y_score, pos_label=None):
    """This function generates the points necessary to plot the Cumulative Gain
    Note: This implementation is restricted to the binary classification task.
    Args:
        y_true (array-like, shape (n_samples)): True labels of the data.
        y_score (array-like, shape (n_samples)): Target scores, can either be
            probability estimates of the positive class, confidence values, or
            non-thresholded measure of decisions (as returned by
            decision_function on some classifiers).
        pos_label (int or str, default=None): Label considered as positive and
            others are considered negative
    Returns:
        percentages (numpy.ndarray): An array containing the X-axis values for
            plotting the Cumulative Gains chart.
        gains (numpy.ndarray): An array containing the Y-axis values for one
            curve of the Cumulative Gains chart.
    Raises:
        ValueError: If `y_true` is not composed of 2 classes. The Cumulative
            Gain Chart is only relevant in binary classification.
    """
    y_true, y_score = np.asarray(y_true), np.asarray(y_score)

    # ensure binary classification if pos_label is not specified
    classes = np.unique(y_true)
    if pos_label is None and not (
        np.array_equal(classes, [0, 1])
        or np.array_equal(classes, [-1, 1])
        or np.array_equal(classes, [0])
        or np.array_equal(classes, [-1])
        or np.array_equal(classes, [1])
    ):
        raise ValueError("Data is not binary and pos_label is not specified")
    elif pos_label is None:
        pos_label = 1.0

    # make y_true a boolean vector
    y_true = y_true == pos_label

    sorted_indices = np.argsort(y_score)[::-1]
    y_true = y_true[sorted_indices]
    gains = np.cumsum(y_true)

    percentages = np.arange(start=1, stop=len(y_true) + 1)

    gains = gains / float(np.sum(y_true))
    percentages = percentages / float(len(y_true))

    gains = np.insert(gains, 0, [0])
    percentages = np.insert(percentages, 0, [0])

    return percentages, gains

def aic(y, y_pred):
  resid = y - y_pred.ravel()
  sse = sum(resid ** 2)

  return 2 - 2*np.log(sse) 


def binned_residuals(y_true, y_pred_proba):
    import pandas as pd
    from pandas import DataFrame
    def get_upper(x):
        return x.split(',')[1][:-1]

    y_join = []
    for valid, pred_proba in zip(y_true, y_pred_proba):
        residual = float(valid) - float(pred_proba)
        y_join.append([valid, pred_proba, residual])

    y_join = np.array(sorted(y_join, key=lambda x: x[1]))

    y_pd = pd.DataFrame(y_join, columns=['Actual', 'Predicted', 'Residual'])

    n_bins = 50
    y_pd['bin'], bins = pd.cut(y_pd['Predicted'], bins=n_bins, retbins=True)
    grp = y_pd.groupby('bin').agg({'Residual': 'mean'}).reset_index()
    grp['bin'] = grp['bin'].astype('str').map(get_upper)

    return grp

def class_separation(y, y_hat_proba):
  bins = np.linspace(0, 1, 11)
  data = pd.DataFrame({'y': y, 'y_hat_proba': y_hat_proba})

  pos = data[data.y == 1]
  neg = data[data.y == 0]

  pos_hist, _ = np.histogram(pos.y_hat_proba, bins=bins)
  neg_hist, _ = np.histogram(neg.y_hat_proba, bins=bins)

  return (bins, pos_hist, neg_hist)

def gain_bar_chart(y, y_pred_proba):
    df = pd.DataFrame({
        'y_true': y,
        'y_pred_proba': y_pred_proba
    }).sort_values('y_pred_proba')

    df['bin'] = pd.cut(df.y_pred_proba, bins=np.linspace(0,1, 11)).astype(str)
    net_pos_cnt = np.sum(y)
    df = df.groupby('bin', as_index=False).sum()

    return df.bin.values, df.y_true/net_pos_cnt

def evaluate_classifier(y, y_hat, y_hat_proba):
  
  from sklearn import metrics
  from sklearn import calibration
  
  if len(y_hat_proba.shape) > 1:
    raise IndexError('Scores should be one dimensional')
    
    
  metrics_dict = {}
  tn, fp, fn, tp = metrics.confusion_matrix(y, y_hat).ravel()
  f1_score = metrics.f1_score(y, y_hat)
  fbeta_score = metrics.fbeta_score(y, y_hat, beta=2)
  specificity = tn / (tn+fp)
  recall = tp / (tp + fn)
  precision = tp / (tp+fp) 

  fpr, tpr, _ = metrics.roc_curve(y, y_hat_proba)
  pr, rec, _ = metrics.precision_recall_curve(y, y_hat_proba)
  aucpr = metrics.auc(rec, pr)
  prob_true, prob_pred = calibration.calibration_curve(y, y_hat_proba)
  binary_crossentropy = metrics.log_loss(y, y_hat_proba)
  residuals = binned_residuals(y, y_hat_proba)

  hist, bin_edges = np.histogram(y_pred_proba, bins=np.linspace(0, 1, 11))

  classes = np.unique(y)
  percentages, gains0 = cumulative_gain_curve(y, y_hat_proba,classes[0])
  percentages, gains1 = cumulative_gain_curve(y, y_hat_proba,classes[1])

  bins, pos_cnt = gain_bar_chart(y, y_hat_proba)

  spec, sens = specificity_sensitivity_curve(y, y_hat_proba)

  deviance = 2*metrics.log_loss(y, y_hat_proba, normalize=False)

  _aic = None 

  try:
    _aic = aic(y, y_hat_proba)  
  except Exception as e:
    print('no aic: ', e) 

  metrics_dict = {
    'tn': tn, 
    'fp': fp,
    'fn': fn, 
    'tp': tp, 
    'f1': f1_score, 
    'aic': _aic,
    'specificity': specificity,
    'precision': precision,
    'recall': recall,
    'aucpr': aucpr,
    'roc_curve': (fpr, tpr),
    'pr_curve': (pr, rec),
    'residuals': residuals, 
    'calibration': (prob_true, prob_pred), 
    'deviance': deviance,
    'binary_crossentropy': binary_crossentropy,
    'lift_curve': (percentages, gains0, gains1),
    'spec_sens': (spec, sens),
    'prob_hist': (bin_edges, hist),
    'fbeta': fbeta_score,
    'gain_bar': (bins, pos_cnt)
  }
  
    
  return metrics_dict

def specificity_sensitivity_curve(y, y_pred_proba):
  from copy import deepcopy
  from sklearn.metrics import confusion_matrix
  # sensitivity = TP/(TP + FN)
  # specificity = TP/(TP + FP)

  # assuming threshold is between 0 and 1
  sens, spec = [], []
  probs = np.linspace(0, 1, num=51) # range of thresholds
  for prob in probs:
      y_pred = deepcopy(y_pred_proba)
      y_pred[y_pred <= prob] = 0
      y_pred[y_pred > prob] = 1
      tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
      sens.append(tn/(tn+fp))
      spec.append(tp/(tp+fp))

  return spec, sens 




## Plotting

In [23]:
from plotly.subplots import make_subplots
from plotly import graph_objects as go


def discrimination_threshold(y, y_hat_proba, thresholds=None):
    from sklearn import metrics
    from copy import deepcopy
    precisions = []
    recalls = []
    f1s = []
    if not thresholds:
        thresholds = np.linspace(0, 1, 101)

    for t in thresholds:
        y_pred = deepcopy(y_hat_proba)
        y_pred[y_pred < t] = 0
        y_pred[y_pred >= t] = 1
        precisions.append(metrics.precision_score(y, y_pred))
        recalls.append(metrics.recall_score(y, y_pred))
        f1s.append(metrics.f1_score(y, y_pred))

    return {'precision': precisions, 'recall': recalls, 'f1': f1s, 'thresholds': thresholds}

def plot_metrics(title, y, y_hat, y_hat_proba):
    '''For SINGLE evaluation'''
    line_spec = None
    marker_spec = None
    annotations = [] 
    metrics_dict = evaluate_classifier(y, y_hat, y_hat_proba)

    fig = make_subplots(
        rows=3, cols=3, 
        specs = [
            [{'type': 'scatter'}, {'type': 'scatter'},{'type': 'scatter'},],
            [{'type': 'scatter'}, {'type': 'scatter'},{'type': 'scatter'},],
            [{'type': 'scatter'}, {'type': 'table'},{'type': 'scatter'},],
        ],
        subplot_titles=['ROC', 'PR', 'Probability Distribution', 'Residual', 'Calibration', 'Spec-Sens', 'Gains', 'Metrics', 'Discrimination Threshold'])



    #------------------------------PRECISION RECALL---------------------#
    pr, rec = metrics_dict['pr_curve']
    fig.add_trace(go.Scatter(
        x=[float(i) for i in rec],
        y=pr,
    ), row=1, col=2)
    fig['layout']['yaxis2']['title']='Precision'
    fig['layout']['xaxis2']['title']='Recall'

    fig.update_layout(
        yaxis2=dict(tickvals=[0.0, 0.25, 0.50, 0.75, 1.00]),
        xaxis2=dict(tickvals=[0.0, 0.25, 0.50, 0.75, 1.00]),
    )

    #-------------------------------FPR-vs-TPR--------------------------#

    fpr, tpr = metrics_dict['roc_curve']
    fig.add_trace(go.Scatter(
        x=fpr,
        y=tpr,
    ), row=1, col=1)
    fig['layout']['yaxis1']['title']='FPR'
    fig['layout']['xaxis1']['title']='TPR'


    #------------------------------CLASS SEPARATION---------------------#
    bins, pos_hist, neg_hist = class_separation(y, y_hat_proba)
    fig.add_trace(
        go.Bar(
            x=bins,
            y=list(np.round(pos_hist, 1)),
            marker=dict(color='red'),
            name='+ve'
        ),
        row=1, col=3
    )
    fig.add_trace(
        go.Bar(
            x=bins,
            y=list(np.round(neg_hist, 1)),
            marker=dict(color='blue'),
            name='-ve'
        ),
        row=1, col=3
    )
    fig['layout']['xaxis3']['title']='Mean Probability'
    fig['layout']['yaxis3']['title']='Count'


    #------------------------------BINNED RESIDUALS---------------------#
    residuals = metrics_dict['residuals']
    fig.add_trace(go.Scatter(
        x=[float(i) for i in residuals['bin']],
        y=[float(i) for i in residuals['Residual']],
        mode='markers'
      ), row=2, col=1)
    fig['layout']['xaxis4']['title']='Bin'
    fig['layout']['yaxis4']['title']='Residual'


    #--------------------------------CALIBRATION-------------------------#
    prob_true, prob_pred = metrics_dict['calibration']
    fig.add_trace(go.Scatter(
        x=prob_true,
        y=prob_pred,
        mode='markers'
    ), row=2, col=2)
    fig['layout']['xaxis5']['title']='Prob True'
    fig['layout']['yaxis5']['title']='Prob Pred'

    #---------------------------------SPEC-SENS---------------------------#
    spec, sens = metrics_dict['spec_sens']
    fig.add_trace(go.Scatter(
        x=np.linspace(0, 1, 51),
        y=spec,
        line=dict(color='red'),
        mode='markers+lines'
    ), row=2, col=3)

    fig.add_trace(go.Scatter(
        x=np.linspace(0, 1, 51),
        y=sens,
        line=dict(color='blue'),
        mode='markers+lines'
    ), row=2, col=3)

    #---------------------------------GAIN CHART---------------------------#
    percentages, gain0, gain1 = metrics_dict['lift_curve']
    bins, pos_cnt = metrics_dict['gain_bar']
    # fig.add_trace(
    #     go.Scatter(x=percentages, y=gain0,name='-ve',  line=dict(color='blue')),row=3, col=1
    # )
    # fig.add_trace(
    #     go.Scatter(x=percentages, y=gain1, name='+ve', line=dict(color='red')),row=3, col=1
    # )

    fig.add_trace(
        go.Bar(x=bins, y=pos_cnt), row=3, col=1
    )
    fig.add_trace(
        go.Scatter(x=bins, y=np.cumsum(pos_cnt)), row=3, col=1
    )


    #---------------------------------THRESHOLDS----------------------------#
    threshold_dict = discrimination_threshold(y, y_hat_proba)
    fig.add_trace(
       go.Scatter(
            x=threshold_dict['thresholds'],
            y=threshold_dict['precision'],
            line=dict(color='royalblue'),
            name='precision'
        ),
        row=3, col=3
    )
    fig.add_trace(
       go.Scatter(
            x=threshold_dict['thresholds'],
            y=threshold_dict['recall'],
            line=dict(color='green'),
            name='recall'
        ),
        row=3, col=3
    )
    fig.add_trace(
       go.Scatter(
            x=threshold_dict['thresholds'],
            y=threshold_dict['f1'],
            line=dict(color='firebrick'),
            name='f1'
        ),
        row=3, col=3
    )

    fig['layout']['yaxis8']['title']='Metric'
    fig['layout']['xaxis8']['title']='Threshold'
    
    #---------------------------------TABLE METRICS-------------------------#

    values = [
        ['fbeta', 'precision', 'recall', 'AIC', 'binary_crossentropy', 'F1', 'specificity', 'AUC PR'],
        [metrics_dict['fbeta'], metrics_dict['precision'],metrics_dict['recall'], metrics_dict['aic'], metrics_dict['binary_crossentropy'], metrics_dict['f1'], metrics_dict['specificity'], metrics_dict['aucpr']]
      ]

    fig.add_trace(go.Table(
        cells=dict(
          values=values,
          line_color='darkslategray',
        #   fill=dict(color=['paleturquoise', 'white']),
          align=['left', 'center'],
          font_size=12,
          height=25)
      ), row=3, col=2)

    fig.update_layout(
        title=title,
        template='plotly_dark',
        autosize=False,
        width=1200,
        height=1200,
        showlegend=False,
        yaxis2=dict(tickmode='array', tickvals=[0.0, 0.25, 0.50, 0.75, 1.00]),
        xaxis2=dict(tickmode='array', tickvals=[0.0, 0.25, 0.50, 0.75, 1.00]),
        yaxis1=dict(tickmode='array', tickvals=[0.0, 0.25, 0.50, 0.75, 1.00]),
        xaxis1=dict(tickmode='array', tickvals=[0.0, 0.25, 0.50, 0.75, 1.00]),
    )

    fig.show()


In [24]:
# clf = clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# y_pred_proba = clf.predict_proba(X_test)

plot_metrics('', y_test, y_pred, y_pred_proba[:, 1])


invalid value encountered in long_scalars


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.



In [None]:

def plot_compare(title, metrics_dict):
  '''For COMPARING things'''
  pass

In [148]:
  import colorcet as ct
  from plotly.subplots import make_subplots
  '''Plots curves and comparisons'''
  if metrics:
    # single
    metrics_dict = {title: metrics}
  
  # if metrics_dict is passed, assume compare
  if metrics_dict:
    line_ = {}
    marker_ = {}
    colors_ = {}
    for i, key in enumerate(metrics_dict.keys()):
      
      #np.random.choice(['#1f77b4','#ff7f0e','#2ca02c','#d62728','#9467bd','#8c564b','#e377c2','#7f7f7f','#bcbd22','#17becf'])
      colors_[key] = ct.glasbey[i]
      line_[key] = go.scatter.Line(color=ct.glasbey[i], dash=np.random.choice(['dash', 'dot']))
      marker_[key] = go.scatter.Marker(color=ct.glasbey[i], symbol=np.random.choice(['diamond', 'triangle-up', 'square']))
    
    fig = make_subplots(rows=2, cols=3, 
                            subplot_titles=['ROC', 'PR', None, 'Residual','Calibration', 'Specificity-Sensitivity'],
                           specs=[
                             [{"type": "scatter"},{"type": "scatter"}, {"type": "scatter"}],
                             [{"type": "scatter"}, {'type': 'scatter'}, {'type': 'scatter'}]])
    

    for model in metrics_dict.keys():
      # pr_curve
      _metrics =  metrics_dict[model]
      pr, rec = _metrics['pr_curve']
      fpr, tpr = _metrics['roc_curve']


      prob_true, prob_pred = _metrics['calibration']

      residuals = _metrics['residuals']
      percentages, gain0, gain1 = _metrics['lift_curve']

      spec, sens = _metrics['spec_sens']

      fig.add_trace(go.Scatter(
        x=[float(i) for i in rec],
        y=pr,
        name=model, 
        line=line_[model]
      ), row=1, col=2)

      fig.add_trace(go.Scatter(
        x=fpr,
        y=tpr,
        name=model, 
        line=line_[model]
      ), row=1, col=1)


      fig.add_trace(go.Scatter(
        x=[float(i) for i in residuals['bin']],
        y=[float(i) for i in residuals['Residual']],
        name=model,
        line=line_[model],
        marker=marker_[model],
        mode='markers'
      ), row=2, col=1)

      fig.add_trace(go.Scatter(
        x=prob_true,
        y=prob_pred,
        name=model,
        line=line_[model],
        marker=marker_[model],
        mode='markers+lines'
      ), row=2, col=2)

      fig.add_trace(go.Scatter(
        x=np.linspace(0, 1, 51),
        y=spec,
        name=model+'_specificity',
        line=dict(color='red'),#line_[model],
        # marker=marker_[model],
        mode='markers+lines'
      ), row=2, col=3)

      fig.add_trace(go.Scatter(
        x=np.linspace(0, 1, 51),
        y=sens,
        name=model+'_sensivity',
        line=dict(color='blue'),#line_[model],
        # marker=marker_[model],
        mode='markers+lines'
      ), row=2, col=3)
  
    fig.update_layout(
      title=title,
      autosize=False,
      width=1200,
      height=800
    )
    fig.show()
    
    
    fig = make_subplots(rows=2, cols=4, subplot_titles=['precision', 'recall', 'binary_crossentropy', 'AIC', 'f1', 'specificity', 'aucpr'])

    if len(metrics_dict.keys()) == 1:

      fig = go.Figure(
        go.Scatter(
          x=metrics_dict[key]['prob_hist'][0][:-1],
          y=metrics_dict[key]['prob_hist'][1],
          name='Prediction Distribution'
        )
      )
      
      fig.update_layout(
        autosize=False,
        width=600, height=600,
        xaxis = dict(title='Mean Probability'), 
        yaxis = dict(title='Count')
      )
      
      fig.show()
      
      # if only single, output as table
      values = [
        ['precision', 'recall', 'AIC', 'binary_crossentropy', 'F1', 'specificity', 'AUC PR'],
        [metrics_dict[key]['precision'],metrics_dict[key]['recall'], metrics_dict[key]['aic'], metrics_dict[key]['binary_crossentropy'], metrics_dict[key]['f1'], metrics_dict[key]['specificity'], metrics_dict[key]['aucpr']]
      ]

      fig = go.Figure(go.Table(
        cells=dict(
          values=values,
          line_color='darkslategray',
          fill=dict(color=['paleturquoise', 'white']),
          align=['left', 'center'],
          font_size=12,
          height=30)
      ))
      fig.update_layout(
        autosize=False,
        width=800,
        height=800
      )
      fig.show()
    else:
      for key in metrics_dict.keys():
        fig.add_trace(go.Bar(
          x=[key], 
          y=[metrics_dict[key]['precision']],
          name=key, 
          marker_color=colors_[key]
        ), row=1, col=1)
        fig.add_trace(go.Bar(
          x=[key], 
          y=[metrics_dict[key]['recall']],
          name=key, 
          marker_color=colors_[key]
        ), row=1, col=2)
        fig.add_trace(go.Bar(
          x=[key], 
          y=[metrics_dict[key]['aic']],
          name=key, 
          marker_color=colors_[key]
        ), row=1, col=4)
        fig.add_trace(go.Bar(
          x=[key], 
          y=[metrics_dict[key]['binary_crossentropy']],
          name=key, 
          marker_color=colors_[key]
        ), row=1, col=3)
        fig.add_trace(go.Bar(
          x=[key], 
          y=[metrics_dict[key]['f1']],
          name=key, 
          marker_color=colors_[key]
        ), row=2, col=1)
        fig.add_trace(go.Bar(
          x=[key], 
          y=[metrics_dict[key]['specificity']],
          name=key, 
          marker_color=colors_[key]
        ), row=2, col=2)
        fig.add_trace(go.Bar(
          x=[key], 
          y=[metrics_dict[key]['aucpr']],
          name=key, 
          marker_color=colors_[key]
        ), row=2, col=3)

      fig.update_layout(showlegend=False, autosize=False, width=1200, height=800)
      fig.show()

In [149]:
from xgboost import XGBClassifier

clf = XGBClassifier(eval_metric="logloss", use_label_encoder=False)

model = clf.fit(
    X_train,
    y_train,
)


In [150]:
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

spec, sens = specificity_sensitivity_curve(y_pred, y_pred_proba)



invalid value encountered in long_scalars



In [151]:
from plotly import graph_objects as go

y = y_test  # for testing
classes = np.unique(y)
percentages, gains1 = cumulative_gain_curve(y, y_pred_proba, classes[0])
percentages, gains2 = cumulative_gain_curve(y, y_pred_proba, classes[1])


fig = go.Figure(
    [
        go.Scatter(x=percentages, y=gains1, name=f"Class: {classes[0]}"),
        go.Scatter(x=percentages, y=gains2, name=f"Class: {classes[1]}"),
    ]
)
fig.update_layout(autosize=False, width=600, height=600)
fig.show()

metrics = evaluate_classifier(y_test, y_pred, y_pred_proba)



invalid value encountered in long_scalars



In [152]:
def plot_lift(y, y_pred_proba):
    import pandas as pd
    import numpy as np
    from pandas import DataFrame
    from plotly import graph_objects as go
    from sklearn.metrics import precision_score

    ranges = np.linspace(0, 1, 11)

    model_predictions = DataFrame({"target": y, "pred": y_pred_proba})
    model_predictions["decile"] = pd.cut(
        model_predictions["pred"], ranges, include_lowest=True, labels=False
    )
    rand_model_prec = precision_score(
        model_predictions["target"], model_predictions["target"] * 0 + 1
    )

    data = DataFrame(
        columns=["prop_cohort", "size", "true_target_perc", "lift", "acum_coverage"]
    )
    ticks_range = list(range(10))
    ticks_y = [None] * 10
    ticks_x = [None] * 10
    coverage = [None] * 10

    fig = go.Figure()

    bar_x = []
    bar_y = []
    for i in ticks_range:
        ticks_y[i] = "{:.0f}%".format((i + 1) * 10)
        ticks_x[i] = f"{ranges[9-i+1]} - {ranges[9-i]}"

        dec_df = model_predictions[model_predictions["decile"] == 9 - i]
        decile_prec = precision_score(dec_df["target"], dec_df["target"] * 0 + 1)
        lift = decile_prec / rand_model_prec

        if i == 0:
            coverage[i] = dec_df["target"].sum() / model_predictions["target"].sum()
        else:
            coverage[i] = (
                coverage[i - 1]
                + dec_df["target"].sum() / model_predictions["target"].sum()
            )

        bar_x.append(i)
        bar_y.append(decile_prec)
        data = data.append(
            {
                "prop_cohort": ticks_x[i],
                "size": dec_df.shape[0],
                "true_target_perc": decile_prec,
                "lift": lift,
                "acum_coverage": coverage[i],
            },
            ignore_index=True,
        )
        fig.add_annotation(
            x=i, y=decile_prec + 0.03, text="{:.2f}".format(lift), showarrow=False
        )

    fig.add_trace(go.Bar(x=bar_x, y=bar_y, name="Cohort true target %"))

    fig.add_trace(
        go.Scatter(
            x=ticks_range,
            y=coverage,
            line=dict(color="orange"),
            name="Cumulative Coverage %",
        )
    )

    fig.update_layout(
        yaxis=dict(
            title="True Target %",
            tickmode="array",
            tickvals=np.linspace(0, 1, 11),
            ticktext=[f"{i}%" for i in range(0, 101, 10)],
        ),
        xaxis=dict(
            title="Model Prediction Range",
            tickmode="array",
            tickvals=np.linspace(0, 9, 10),
            ticktext=[f"{i/10}-{(i-1)/10}" for i in range(10, 0, -1)],
        ),
        showlegend=True,
        autosize=False,
        width=1000,
        height=800,
    )

    fig.show()


plot_lift(y_test, y_pred_proba)



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


Precision is 

In [153]:
plot_metrics(title="", metrics=metrics)


In [None]:
def aic(y, y_pred, k):
    resid = y - y_pred.ravel()
    sse = sum(resid**2)

    return 2 * k - 2 * np.log(sse)


In [166]:
from sklearn.metrics import precision_recall_curve

prec, rec, thresh = precision_recall_curve(y_test, y_pred_proba[:, 1])

In [158]:
def plot_compare(title, metrics_dict=None):
  import colorcet as ct
  from plotly.subplots import make_subplots
  '''Plots curves and comparisons'''
  # if metrics_dict is passed, assume compare
  line_ = {}
  marker_ = {}
  colors_ = {}
  for i, key in enumerate(metrics_dict.keys()):
    colors_[key] = ct.glasbey[i]
    line_[key] = go.scatter.Line(color=ct.glasbey[i], dash=np.random.choice(['dash', 'dot']))
    marker_[key] = go.scatter.Marker(color=ct.glasbey[i], symbol=np.random.choice(['diamond', 'triangle-up', 'square']))
  
  fig = make_subplots(rows=3, cols=3, 
                          subplot_titles=['ROC', 'Residuals','Calibration','PR Curve', 'Precision', 'Recall', 'FBeta', 'AIC', 'AUC'],
                          specs=[
                            [{"type": "scatter"},{"type": "scatter"}, {"type": "scatter"}, ],
                            [{"type": "scatter"}, {'type': 'bar'}, {'type': 'bar'}],
                            [{"type": "bar"}, {'type': 'bar'}, {'type': 'bar'}]])
  

  for model in metrics_dict.keys():
    # pr_curve
    _metrics =  metrics_dict[model]
    pr, rec = _metrics['pr_curve']
    fpr, tpr = _metrics['roc_curve']



    prob_true, prob_pred = _metrics['calibration']

    residuals = _metrics['residuals']
    percentages, gain0, gain1 = _metrics['lift_curve']

    spec, sens = _metrics['spec_sens']

    # Curves
    #------------------------------ROC CURVE------------------------------#
    fig.add_trace(go.Scatter(
      y=tpr,
      x=fpr,
      name=model,
      line=line_[model]
    ), row=1, col=1)
    #------------------------------RESIDUALS------------------------------#
    fig.add_trace(go.Scatter(
      x=[float(i) for i in residuals['bin']],
      y=[float(i) for i in residuals['Residual']],
      name=model,
      line=line_[model],
      marker=marker_[model],
      mode='markers'
    ), row=1, col=2)
    #------------------------------CALIBRATION----------------------------#

    fig.add_trace(go.Scatter(
      x=prob_true,
      y=prob_pred,
      name=model,
      line=line_[model],
      marker=marker_[model],
      mode='markers+lines'
    ), row=1, col=3)

    #------------------------------PR-CURVE-------------------------------#
    fig.add_trace(go.Scatter(
      x=rec,
      y=pr,
      name=model,
      line=line_[model],
      marker=marker_[model],
      mode='markers+lines'
    ), row=2, col=1)


    # Bar Charts
    #------------------------------PRECISOIN------------------------------#
    fig.add_trace(go.Bar(
        x=[model], 
        y=[_metrics['precision']],
        name=model, 
        marker_color=colors_[model]
      ), row=2, col=2)
    #--------------------------------RECALL-------------------------------#
    fig.add_trace(go.Bar(
      x=[model], 
      y=[_metrics['recall']],
      name=model, 
      marker_color=colors_[model]
    ), row=2, col=3)
    #-------------------------------F BETA--------------------------------#
    fig.add_trace(go.Bar(
      x=[model],
      y=[_metrics['fbeta']],
      name=model,
      marker_color=colors_[model],
    ), row=3, col=1)
    #------------------------------- AIC ---------------------------------#
    fig.add_trace(go.Bar(
      x=[model],
      y=[_metrics['aic']],
      name=model,
      marker_color=colors_[model],
    ), row=3, col=2)
    #------------------------------- AUC ---------------------------------#
    fig.add_trace(go.Bar(
      x=[model],
      y=[_metrics['aucpr']],
      name=model,
      marker_color=colors_[model],
    ), row=3, col=3)  


    fig.update_layout(
      title=title,
      template='plotly_dark',
      autosize=False,
      width=1200,
      height=1200
    )
    fig.show()

In [159]:
metrics_dict = evaluate_classifier(y_test, y_pred, y_pred_proba[:, 1])


invalid value encountered in long_scalars



In [160]:
plot_compare('', {'model1': metrics_dict})

In [181]:
#Feature selection class to eliminate multicollinearity
class MultiCollinearityEliminator():
    
    #Class Constructor
    def __init__(self, df, target, threshold):
        self.df = df
        self.target = target
        self.threshold = threshold

    #Method to create and return the feature correlation matrix dataframe
    def createCorrMatrix(self, include_target = False):
        #Checking we should include the target in the correlation matrix
        if (include_target == False):
            df_temp = self.df.drop([self.target], axis =1)
            
            #Setting method to Pearson to prevent issues in case the default method for df.corr() gets changed
            #Setting min_period to 30 for the sample size to be statistically significant (normal) according to 
            #central limit theorem
            corrMatrix = df_temp.corr(method='pearson', min_periods=30).abs()
        #Target is included for creating the series of feature to target correlation - Please refer the notes under the 
        #print statement to understand why we create the series of feature to target correlation
        elif (include_target == True):
            corrMatrix = self.df.corr(method='pearson', min_periods=30).abs()
        return corrMatrix

    #Method to create and return the feature to target correlation matrix dataframe
    def createCorrMatrixWithTarget(self):
        #After obtaining the list of correlated features, this method will help to view which variables 
        #(in the list of correlated features) are least correlated with the target
        #This way, out the list of correlated features, we can ensure to elimate the feature that is 
        #least correlated with the target
        #This not only helps to sustain the predictive power of the model but also helps in reducing model complexity
        
        #Obtaining the correlation matrix of the dataframe (along with the target)
        corrMatrix = self.createCorrMatrix(include_target = True)                           
        #Creating the required dataframe, then dropping the target row 
        #and sorting by the value of correlation with target (in asceding order)
        corrWithTarget = pd.DataFrame(corrMatrix.loc[:,self.target]).drop([self.target], axis = 0).sort_values(by = self.target)                    
        print(corrWithTarget, '\n')
        return corrWithTarget

    #Method to create and return the list of correlated features
    def createCorrelatedFeaturesList(self):
        #Obtaining the correlation matrix of the dataframe (without the target)
        corrMatrix = self.createCorrMatrix(include_target = False)                          
        colCorr = []
        #Iterating through the columns of the correlation matrix dataframe
        for column in corrMatrix.columns:
            #Iterating through the values (row wise) of the correlation matrix dataframe
            for idx, row in corrMatrix.iterrows():                                            
                if(row[column]>self.threshold) and (row[column]<1):
                    #Adding the features that are not already in the list of correlated features
                    if (idx not in colCorr):
                        colCorr.append(idx)
                    if (column not in colCorr):
                        colCorr.append(column)
        print(colCorr, '\n')
        return colCorr

    #Method to eliminate the least important features from the list of correlated features
    def deleteFeatures(self, colCorr):
        #Obtaining the feature to target correlation matrix dataframe
        corrWithTarget = self.createCorrMatrixWithTarget()                                  
        for idx, row in corrWithTarget.iterrows():
            print(idx, '\n')
            if (idx in colCorr):
                self.df = self.df.drop(idx, axis =1)
                break
        return self.df

    #Method to run automatically eliminate multicollinearity
    def autoEliminateMulticollinearity(self):
        #Obtaining the list of correlated features
        colCorr = self.createCorrelatedFeaturesList()                                       
        while colCorr != []:
            #Obtaining the dataframe after deleting the feature (from the list of correlated features) 
            #that is least correlated with the taregt
            self.df = self.deleteFeatures(colCorr)
            #Obtaining the list of correlated features
            colCorr = self.createCorrelatedFeaturesList()                                     
        return self.df

In [22]:
# y_pred, y_pred_proba, y_test  
from xgboost import XGBClassifier
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

clf = XGBClassifier()
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)[:, 1]

# y_pred = np.expand_dims(y_pred, -1)
# y_pred_proba = np.expand_dims(y_pred_proba[:, 1], -1)
# y_test = np.expand_dims(y_test, -1)





In [43]:
def gain_bar_chart(y, y_pred_proba):
    df = pd.DataFrame({
        'y_true': y,
        'y_pred_proba': y_pred_proba
    }).sort_values('y_pred_proba')

    df['bin'] = pd.cut(df.y_pred_proba, bins=np.linspace(0,1, 11)).astype(str)
    net_pos_cnt = np.sum(y)
    df = df.groupby('bin', as_index=False).sum()

    return df.bin.values, df.y_true/net_pos_cnt

In [44]:
gain_bar_chart(y_test, y_pred_proba)

(array(['(0.0, 0.1]', '(0.1, 0.2]', '(0.5, 0.6]', '(0.6, 0.7]',
        '(0.7, 0.8]', '(0.8, 0.9]', '(0.9, 1.0]'], dtype=object),
 0    0.000000
 1    0.000000
 2    0.022222
 3    0.000000
 4    0.000000
 5    0.022222
 6    0.955556
 Name: y_true, dtype: float64)

In [31]:
df.groupby('bin', as_index=False).sum()

Unnamed: 0,bin,y_true,y_pred_proba
0,"(0.0, 0.1]",0,0.379376
1,"(0.1, 0.2]",0,0.15857
2,"(0.5, 0.6]",2,1.150462
3,"(0.6, 0.7]",0,0.60285
4,"(0.7, 0.8]",0,0.797633
5,"(0.8, 0.9]",2,2.672747
6,"(0.9, 1.0]",86,87.535446
