In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the dataset used throughout the notebook. The later cells pass `df` to
# decision_curve with the column names 'readmitted', 'logit_pred',
# 'dtree_pred' and 'rm_pred'.
df = pd.read_csv("decision_curve_data.csv")

**Question 1:** What is the intervention associated with the readmission model?

**Question 2:** In Figure 1 of 'A simple, step-by-step guide to interpreting decision curve analysis', the extremes of the x-axis are labelled 'I'm worried about disease' and 'I'm worried about biopsy'. What do the two extremes of this scale represent in the readmission case?

In [None]:
def decision_curve(data, probabilities, y, labels, xlim=(0, 1)):
    """Plot the two reference strategies of a decision curve.

    This baseline version draws only the "treat all" and "treat none"
    curves; no per-model curve is computed here.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the outcome column named by ``y``.
    probabilities : list of str
        Accepted for signature compatibility; not used by this version.
    y : str
        Name of the outcome column in ``data``. Its mean is used as the
        event rate.
    labels : list of str
        Accepted for signature compatibility; not used by this version
        because no model curves are drawn.
    xlim : sequence of two numbers, default (0, 1)
        Left and right limits of the x-axis (threshold probability).

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module (``plt``), so the caller can call
        ``.show()`` on the returned object.
    """
    outcome = data.loc[:, y]
    event_rate = np.mean(outcome)

    # Net-benefit table: one row per threshold from 0.01 up to (excluding) 1.
    nb = pd.DataFrame(np.arange(0.01, 1, 0.01), columns=['threshold'])
    nb['treat_all'] = event_rate - (1 - event_rate) * nb.threshold / (1 - nb.threshold)
    nb['treat_none'] = 0

    # Highest net benefit over all strategy columns; used as the top of the
    # y-axis. Column-wise max followed by max of those gives one scalar.
    ymax = nb.loc[:, nb.columns != 'threshold'].max().max()

    plt.figure(figsize=(10, 6))
    plt.plot(nb.threshold, nb.treat_all)
    plt.plot(nb.threshold, nb.treat_none)
    plt.ylim(bottom=-0.01, top=ymax)
    plt.xlim(left=xlim[0], right=xlim[1])
    # Only two curves are drawn, so only two legend labels are supplied;
    # passing the model labels too would mismatch handles and labels.
    plt.legend(title='Predictors', labels=['Treat all', 'Treat none'])
    plt.xlabel('Decision probability threshold')
    plt.ylabel('Net benefit (reduction readmission)')

    return plt

# Call decision_curve on the loaded data, limiting the x-axis to 0-0.5.
probabilities = ['logit_pred', 'dtree_pred', 'rm_pred']  # prediction column names
labels = ['Logistic regression', 'Decision tree', 'Random Forest']  # display names, same order
y = 'readmitted'  # outcome column name
plt = decision_curve(
    data=df,
    probabilities=probabilities,
    y=y,
    labels=labels,
    xlim=[0, 0.5],
)
plt.show()

**Question 3:** What do the 'treat all' and 'treat none' strategies signify in the readmission case? 

**Question 4:** Fill in the formula for net benefit.

In [None]:
def decision_curve(data, probabilities, y, labels, xlim=(0, 1)):
    """Plot a decision curve (net benefit vs. threshold probability).

    Draws the "treat all" and "treat none" reference strategies plus one
    curve per prediction column listed in ``probabilities``.

    For a threshold ``t`` and a prediction column ``p``, rows with
    ``p >= t`` are counted as flagged, and

        net benefit = TP / N - (FP / N) * t / (1 - t)

    where TP / FP are the flagged rows whose outcome is true / false and
    N is the number of rows in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the outcome column and every prediction column.
    probabilities : list of str
        Names of the prediction columns in ``data``; each is compared
        against the thresholds.
    y : str
        Name of the outcome column in ``data``. Values are compared with
        ``True`` / ``False`` (so 1 / 0 also count).
    labels : list of str
        Legend labels for the prediction columns, in the same order as
        ``probabilities``.
    xlim : sequence of two numbers, default (0, 1)
        Left and right limits of the x-axis (threshold probability).

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module (``plt``), so the caller can call
        ``.show()`` on the returned object.
    """
    outcome = data.loc[:, y]
    event_rate = np.mean(outcome)
    n_obs = data.shape[0]

    # Net-benefit table: one row per threshold from 0.01 up to (excluding) 1.
    nb = pd.DataFrame(np.arange(0.01, 1, 0.01), columns=['threshold'])
    nb['treat_all'] = event_rate - (1 - event_rate) * nb.threshold / (1 - nb.threshold)
    nb['treat_none'] = 0

    # Explicit comparisons (rather than truthiness / negation) so that rows
    # whose outcome is neither true nor false count as neither TP nor FP.
    is_event = outcome == True
    is_non_event = outcome == False

    # Net benefit of each predictor at each threshold. The whole column is
    # built as a list of floats and assigned once, which avoids writing
    # floats cell-by-cell into an integer column.
    for m in probabilities:
        p = data.loc[:, m]
        net_benefit = []
        for t in nb.threshold:
            flagged = p >= t
            tp = np.sum(flagged & is_event)
            fp = np.sum(flagged & is_non_event)
            # When nothing is flagged tp == fp == 0, so the value is 0.
            net_benefit.append(tp / n_obs - fp / n_obs * t / (1 - t))
        nb[m] = net_benefit

    # Highest net benefit over all strategy/model columns; used as the top
    # of the y-axis.
    ymax = nb.loc[:, nb.columns != 'threshold'].max().max()

    plt.figure(figsize=(10, 6))
    plt.plot(nb.threshold, nb.treat_all)
    plt.plot(nb.threshold, nb.treat_none)
    for m in probabilities:
        plt.plot(nb.threshold, nb.loc[:, m])
    plt.ylim(bottom=-0.01, top=ymax)
    plt.xlim(left=xlim[0], right=xlim[1])
    plt.legend(title='Predictors', labels=['Treat all', 'Treat none'] + list(labels))
    plt.xlabel('Decision probability threshold')
    plt.ylabel('Net benefit (reduction readmission)')

    return plt

# Draw the decision curve for the three prediction columns, x-axis limited to 0-0.5.
probabilities = ['logit_pred', 'dtree_pred', 'rm_pred']  # prediction column names
labels = ['Logistic regression', 'Decision tree', 'Random Forest']  # legend labels, same order
y = 'readmitted'  # outcome column name
plt = decision_curve(
    data=df,
    probabilities=probabilities,
    y=y,
    labels=labels,
    xlim=[0, 0.5],
)
plt.show()

**Question 5:** What does net benefit represent? 

**Question 6:** Discuss the plot. Which model do you prefer based on the net benefit (NB) approach, and why?