In [31]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from pandas import Series, DataFrame
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

# Fixed seed so the train/test split, the fitted model and every downstream
# metric or plot are identical after Restart Kernel -> Run All.
RANDOM_STATE = 42

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE)

# liblinear shuffles the data internally, so it is seeded as well.
clf = LogisticRegression(max_iter=1000, solver='liblinear',
                         random_state=RANDOM_STATE)
clf.fit(X_train, y_train)

# Hard class labels, and the predicted probability of class 1
# (second column of predict_proba) for the held-out rows.
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)[:, 1]

def binned_residuals(y_true, y_pred_proba, n_bins=50):
    """Plot the mean residual per bin of predicted probability.

    The residual of an observation is ``y_true - y_pred_proba``. Predictions
    are cut into ``n_bins`` equal-width intervals spanning the observed range
    of ``y_pred_proba`` and the residuals are averaged inside each interval.
    The averages are drawn as markers against a zero baseline.

    Parameters
    ----------
    y_true : 1-D array-like of numbers
        Observed outcomes (0/1 for a binary classifier).
    y_pred_proba : 1-D array-like of numbers
        Predicted probabilities, same length as ``y_true``.
    n_bins : int, default 50
        Number of equal-width bins.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred_proba`` do not have the same shape.
    """
    import numpy as np
    import pandas as pd

    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred_proba, dtype=float)
    if actual.shape != predicted.shape:
        # zip() used to truncate silently to the shorter input.
        raise ValueError(
            'y_true and y_pred_proba must have the same shape, got '
            f'{actual.shape} and {predicted.shape}')

    y_pd = pd.DataFrame({
        'Actual': actual,
        'Predicted': predicted,
        'Residual': actual - predicted,
    })

    y_pd['bin'] = pd.cut(y_pd['Predicted'], bins=n_bins)
    # observed=False keeps one row per bin (NaN mean for empty bins) no matter
    # which default the installed pandas version uses for categorical keys.
    grp = (
        y_pd.groupby('bin', observed=False)
        .agg({'Residual': 'mean'})
        .reset_index()
    )
    # Numeric right edge of each interval instead of a substring of its repr,
    # so both traces share a numeric x axis.
    grp['bin'] = [interval.right for interval in grp['bin']]

    fig = go.Figure()

    # Zero line across the whole probability range, 1.0 included.
    fig.add_trace(go.Scatter(
        x=[i / n_bins for i in range(n_bins + 1)],
        y=[0] * (n_bins + 1),
        mode='lines',
        name='Baseline'
    ))

    fig.add_trace(
        go.Scatter(
            x=grp['bin'],
            y=grp['Residual'],
            mode='markers',
            name='Logistic Model'
        )
    )

    fig.update_layout(
        autosize=False,
        width=800,
        height=800,
        xaxis_title='Predicted probability (bin upper edge)',
        yaxis_title='Mean residual (actual - predicted)'
    )
    fig.show()


In [32]:
# Binned residuals of the fitted model on the held-out split.
binned_residuals(y_true=y_test, y_pred_proba=y_pred_proba)

In [None]:
import plotly.io as pio

# "notebook_connected" loads plotly.js from a CDN instead of embedding it in
# the notebook file.
pio.renderers.default = "notebook_connected"

# `n_bins` and `grp` only ever existed as locals inside binned_residuals(), so
# this cell raised NameError on a fresh kernel. Rebuild them here from the
# held-out labels and predicted probabilities.
n_bins = 50
predicted = np.asarray(y_pred_proba, dtype=float)
residual_frame = pd.DataFrame({
    'Predicted': predicted,
    'Residual': np.asarray(y_test, dtype=float) - predicted,
})
residual_frame['bin'] = pd.cut(residual_frame['Predicted'], bins=n_bins)
# observed=False keeps one row per bin (NaN mean for empty bins).
grp = (
    residual_frame.groupby('bin', observed=False)
    .agg({'Residual': 'mean'})
    .reset_index()
)
# Numeric right edge of every interval, used as the x position of the marker.
grp['bin'] = [interval.right for interval in grp['bin']]

fig = go.Figure()

# Zero line across the whole probability range, 1.0 included.
fig.add_trace(go.Scatter(
    x=[i / n_bins for i in range(n_bins + 1)],
    y=[0] * (n_bins + 1),
    mode='lines',
    name='Baseline'
))

fig.add_trace(
    go.Scatter(
        x=grp['bin'],
        y=grp['Residual'],
        mode='markers',
        name='Logistic Model'
    )
)

fig.update_layout(
    autosize=False,
    width=800,
    height=800,
    xaxis_title='Predicted probability (bin upper edge)',
    yaxis_title='Mean residual (actual - predicted)'
)
fig.show()


In [1]:
from copy import deepcopy
from sklearn import metrics, calibration
# assessing model performance


def binned_residuals(y_true, y_pred_proba, n_bins=50):
    """Average the residuals of a probabilistic model per bin of prediction.

    The residual of an observation is ``y_true - y_pred_proba``. Predictions
    are cut into ``n_bins`` equal-width intervals spanning the observed range
    of ``y_pred_proba`` and the residuals are averaged inside each interval.

    NOTE(review): ``binned_residuals`` is defined a second time further down
    in this cell; Python keeps whichever definition runs last.

    Parameters
    ----------
    y_true : 1-D array-like of numbers
        Observed outcomes (0/1 for a binary classifier).
    y_pred_proba : 1-D array-like of numbers
        Predicted probabilities, same length as ``y_true``.
    n_bins : int, default 50
        Number of equal-width bins.

    Returns
    -------
    pandas.DataFrame
        One row per bin (``n_bins`` rows) with columns ``'bin'`` (float right
        edge of the interval) and ``'Residual'`` (mean residual, NaN when the
        bin holds no prediction).

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred_proba`` do not have the same shape.
    """
    import numpy as np
    import pandas as pd

    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred_proba, dtype=float)
    if actual.shape != predicted.shape:
        # zip() used to truncate silently to the shorter input.
        raise ValueError(
            'y_true and y_pred_proba must have the same shape, got '
            f'{actual.shape} and {predicted.shape}')

    y_pd = pd.DataFrame({
        'Actual': actual,
        'Predicted': predicted,
        'Residual': actual - predicted,
    })

    y_pd['bin'] = pd.cut(y_pd['Predicted'], bins=n_bins)
    # observed=False keeps one row per bin no matter which default the
    # installed pandas version uses for categorical group keys.
    grp = (
        y_pd.groupby('bin', observed=False)
        .agg({'Residual': 'mean'})
        .reset_index()
    )
    # Numeric right edge of each interval; the previous code sliced it out of
    # the interval's string repr, which produced strings with a leading space.
    grp['bin'] = [interval.right for interval in grp['bin']]

    return grp


def evaluate_classifier(y, y_hat, y_hat_proba, X_train=None, y_train=None):
  """Collect evaluation metrics for a binary classifier.

  Parameters
  ----------
  y : array-like
      True labels. The confusion-matrix unpacking below expects exactly two
      classes.
  y_hat : array-like
      Predicted hard labels; used for the confusion matrix and F1.
  y_hat_proba : 1-D array-like
      Predicted scores/probabilities; used for the ROC and PR curves, the
      calibration curve, log loss, deviance and binned residuals.
  X_train, y_train : optional
      When both are given, a binomial GLM is fitted on them with statsmodels
      and its AIC, BIC and McFadden pseudo-R^2 are added to the result.

  Returns
  -------
  dict
      Keys: 'tn', 'fp', 'fn', 'tp', 'f1', 'specificity', 'precision',
      'recall', 'roc_curve' (fpr, tpr), 'pr_curve' (precision, recall),
      'residuals', 'calibration' (prob_true, prob_pred), 'deviance',
      'binary_crossentropy', plus 'AIC', 'BIC', 'mcfadden_r2' when training
      data is supplied.

  Raises
  ------
  IndexError
      If ``y_hat_proba`` has more than one dimension.
  """
  import numpy as np
  from sklearn import calibration, metrics

  # asarray lets plain lists through the dimensionality check as well.
  y_hat_proba = np.asarray(y_hat_proba)
  if y_hat_proba.ndim > 1:
    raise IndexError('Scores should be one dimensional')

  tn, fp, fn, tp = metrics.confusion_matrix(y, y_hat).ravel()
  f1_score = metrics.f1_score(y, y_hat)
  specificity = tn / (tn + fp)
  recall = tp / (tp + fn)
  precision = tp / (tp + fp)

  # Threshold curves need the continuous scores; fed with hard labels they
  # collapse to a single operating point.
  fpr, tpr, _ = metrics.roc_curve(y, y_hat_proba)
  pr, rec, _ = metrics.precision_recall_curve(y, y_hat_proba)
  prob_true, prob_pred = calibration.calibration_curve(y, y_hat_proba)
  binary_crossentropy = metrics.log_loss(y, y_hat_proba)
  residuals = binned_residuals(y, y_hat_proba)

  # Twice the summed (un-normalised) negative log-likelihood.
  deviance = 2 * metrics.log_loss(y, y_hat_proba, normalize=False)

  metrics_dict = {
      'tn': tn,
      'fp': fp,
      'fn': fn,
      'tp': tp,
      'f1': f1_score,
      'specificity': specificity,
      'precision': precision,
      'recall': recall,
      'roc_curve': (fpr, tpr),
      'pr_curve': (pr, rec),
      'residuals': residuals,
      'calibration': (prob_true, prob_pred),
      'deviance': deviance,
      'binary_crossentropy': binary_crossentropy
  }

  if X_train is not None and y_train is not None:
    # `sm` was referenced without an import in this cell; import it lazily so
    # statsmodels is only required when the GLM statistics are requested.
    import statsmodels.api as sm

    # NOTE(review): GLM.fit chooses its optimiser through `method`; confirm
    # that `solver='bfgs'` has the intended effect.
    logit = sm.GLM(y_train, X_train, family=sm.families.Binomial()).fit(
        solver='bfgs', maxiter=512)

    metrics_dict.update({
        'AIC': logit.aic,
        'BIC': logit.bic,
        # McFadden pseudo-R^2: 1 - llf / llnull
        'mcfadden_r2': 1 - logit.llf / logit.llnull
    })

  return metrics_dict


def evaluate_predictions(y, y_hat, plot=False, y_hat_proba=None):
  '''Summarise binary predictions and optionally plot diagnostic panels.

  Every quantity is computed inside the function; it no longer depends on
  `fpr`, `tpr`, `pr`, `rec`, `residuals`, `prob_true`, `prob_pred`,
  `sensitivity` or `specificity` already existing as notebook globals.

  Parameters
  ----------
  y : array-like
      Actual labels. The confusion-matrix unpacking expects two classes.
  y_hat : array-like
      Predicted hard labels.
  plot : bool, default False
      When True, show a 2x3 plotly figure with the ROC curve, PR curve,
      residual histogram, confusion-matrix table and calibration curve.
  y_hat_proba : array-like, optional
      Predicted scores/probabilities. When given they drive the ROC, PR and
      calibration curves, the ROC AUC and the residuals; otherwise `y_hat`
      is used for those as well.

  Returns
  -------
  dict
      Keys: 'precision', 'recall', 'roc_auc', 'f1', 'cmatrix', 'sensitivity',
      'specificity', 'fpr', 'tpr', 'prob_true', 'prob_pred'.
  '''
  import numpy as np
  from sklearn import calibration, metrics

  actual = np.asarray(y)
  labels = np.asarray(y_hat)
  scores = labels if y_hat_proba is None else np.asarray(y_hat_proba)

  cmatrix = metrics.confusion_matrix(actual, labels)
  tn, fp, fn, tp = cmatrix.ravel()
  sensitivity = tp / (tp + fn)
  specificity = tn / (tn + fp)

  fpr, tpr, _ = metrics.roc_curve(actual, scores)
  pr, rec, _ = metrics.precision_recall_curve(actual, scores)
  prob_true, prob_pred = calibration.calibration_curve(actual, scores)
  # Raw per-observation residuals; cast first so boolean inputs subtract.
  residuals = actual.astype(float) - scores.astype(float)

  if plot:
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots

    fig = make_subplots(rows=2, cols=3,
                        subplot_titles=['ROC', 'PR', 'Residual',
                                        'Confusion Matrix', 'Calibration'],
                        specs=[
                            [{"type": "scatter"}, {"type": "scatter"},
                                {"type": "scatter"}],
                            [{"type": "table"}, {"type": "scatter"}, {'type': 'scatter'}]]
                        )
    fig.add_trace(go.Scatter(
        x=fpr,
        y=tpr,
        name='ROC'
    ), row=1, col=1)

    fig.add_trace(go.Scatter(
        x=rec,
        y=pr,
        name='PR Curve'
    ), row=1, col=2)

    # Histogram of the residuals, drawn at the left edge of every bucket.
    counts, edges = np.histogram(residuals, bins=100)

    fig.add_trace(go.Scatter(
        x=edges[:-1],
        y=counts,
        name='Residuals'
    ), row=1, col=3)

    fig.add_trace(go.Table(
        cells=dict(
            values=[[0, 1], cmatrix[:, 0], cmatrix[:, 1]]
        ),
        header=dict(values=['', '0', '1'])
    ), row=2, col=1)

    fig.add_trace(go.Scatter(
        x=prob_true,
        y=prob_pred,
        name='Calibration'
    ), row=2, col=2)

    fig.update_layout(
        autosize=False,
        width=1200,
        height=800,
        showlegend=False
    )
    fig.show()

  return {
      'precision': metrics.precision_score(actual, labels),
      'recall': metrics.recall_score(actual, labels),
      'roc_auc': metrics.roc_auc_score(actual, scores),
      'f1': metrics.f1_score(actual, labels),
      'cmatrix': cmatrix,
      'sensitivity': sensitivity,
      'specificity': specificity,
      'fpr': fpr,
      'tpr': tpr,
      'prob_true': prob_true,
      'prob_pred': prob_pred
  }


def binned_residuals(y_true, y_pred_proba, n_bins=50):
    """Average the residuals of a probabilistic model per bin of prediction.

    The residual of an observation is ``y_true - y_pred_proba``. Predictions
    are cut into ``n_bins`` equal-width intervals spanning the observed range
    of ``y_pred_proba`` and the residuals are averaged inside each interval.

    NOTE(review): this redefines ``binned_residuals`` from earlier in the same
    cell; being the later definition, it is the one Python keeps.

    Parameters
    ----------
    y_true : 1-D array-like of numbers
        Observed outcomes (0/1 for a binary classifier).
    y_pred_proba : 1-D array-like of numbers
        Predicted probabilities, same length as ``y_true``.
    n_bins : int, default 50
        Number of equal-width bins.

    Returns
    -------
    pandas.DataFrame
        One row per bin (``n_bins`` rows) with columns ``'bin'`` (float right
        edge of the interval) and ``'Residual'`` (mean residual, NaN when the
        bin holds no prediction).

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred_proba`` do not have the same shape.
    """
    import numpy as np
    import pandas as pd

    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred_proba, dtype=float)
    if actual.shape != predicted.shape:
        # zip() used to truncate silently to the shorter input.
        raise ValueError(
            'y_true and y_pred_proba must have the same shape, got '
            f'{actual.shape} and {predicted.shape}')

    y_pd = pd.DataFrame({
        'Actual': actual,
        'Predicted': predicted,
        'Residual': actual - predicted,
    })

    y_pd['bin'] = pd.cut(y_pd['Predicted'], bins=n_bins)
    # observed=False keeps one row per bin no matter which default the
    # installed pandas version uses for categorical group keys.
    grp = (
        y_pd.groupby('bin', observed=False)
        .agg({'Residual': 'mean'})
        .reset_index()
    )
    # Numeric right edge of each interval; the previous code sliced it out of
    # the interval's string repr, which produced strings with a leading space.
    grp['bin'] = [interval.right for interval in grp['bin']]

    return grp


In [3]:
import numpy as np
# Scratch mapping for trying out dict lookups: key 'a' holds the integers
# 1..9, key 'b' holds ten random draws from the letters 'a', 'b' and 'c'.
d = dict(
    a=list(range(1, 10)),
    b=np.random.choice(['a', 'b', 'c'], size=10),
)

In [6]:
# dict.get() looks up a single hashable key; a list is unhashable, so this
# call raises the TypeError recorded as this cell's output.
d.get(['a', 'b'])

TypeError: unhashable type: 'list'