In [1]:
#create function to plot dodge histogram and filled histogram for a variable
def histplots_by(x_var, df, bw=0, h_var='churn', fs=(12,8)):
    """Draw a 2x2 grid of histograms of ``x_var`` split by the column ``h_var``.

    Panels:
      * top-left: one histogram per ``h_var`` value on a shared axis
        (unfilled step outlines with default binning, dodged bars when a
        bin width is given)
      * bottom-left: the same split with ``multiple='fill'`` ("Ratio")
      * top-right: proportion histogram of the rows where ``h_var == False``
      * bottom-right: proportion histogram of the rows where ``h_var == True``

    The right-hand panels always use seaborn's default binning; ``bw`` only
    affects the left-hand panels.

    Args:
        x_var: Name of the column to plot on the x axis.
        df: DataFrame holding ``x_var`` and ``h_var``.
        bw: Bin width for the left-hand panels. ``0`` (the default) lets
            seaborn choose the bins. A non-zero width requires a numeric
            ``x_var`` because the bin range is computed from its min/max.
        h_var: Name of the column used for the hue split; expected to hold
            True/False values since the right-hand panels filter on them.
        fs: Figure size passed to ``plt.subplots``.

    Returns:
        The matplotlib figure holding all four subplots.
    """
    fig, ax = plt.subplots(2, 2, figsize=fs)

    if not bw:
        # No bin width requested: default binning, step outlines on top-left.
        binning = {}
        overlay_style = {'element': 'step', 'fill': False}
    else:
        # Bin width prescribed: span the integer-rounded data range and pad
        # each side by half a bin so the outermost bars are not clipped.
        low = math.floor(df[x_var].min())
        high = math.ceil(df[x_var].max())
        binning = {'binwidth': bw, 'binrange': [low - (bw / 2), high + (bw / 2)]}
        overlay_style = {'multiple': 'dodge'}

    # Titles/labels are applied in both binning modes so the figure reads the
    # same whichever branch produced it.
    sns.histplot(
        data=df, x=x_var, hue=h_var, ax=ax[0, 0], **overlay_style, **binning
    ).set(title=f'Distributions by {h_var}')
    sns.histplot(
        data=df, x=x_var, hue=h_var, multiple='fill', ax=ax[1, 0], **binning
    ).set(title='Ratio', ylabel='Percentage')

    # Explicit comparisons (rather than ``~``) keep the row selection correct
    # for non-boolean dtypes such as 0/1 integers or object columns.
    hue_name = str(h_var)
    negatives = df[df[h_var] == False]
    positives = df[df[h_var] == True]
    sns.histplot(
        data=negatives, x=x_var, stat='proportion', ax=ax[0, 1]
    ).set(title=f'No {hue_name} distribution')
    sns.histplot(
        data=positives, x=x_var, stat='proportion', color='darkorange', ax=ax[1, 1]
    ).set(title=f'{hue_name[:1].upper()}{hue_name[1:]} distribution')

    # Reserve a strip at the top of the figure for the overall title.
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    fig.suptitle(f"{x_var} distribution by {h_var}")
    return fig

In [2]:
#create function to label new categories based on bin range
def generate_labels(bins):
    """Build one text label per interval between consecutive bin edges.

    The first interval is labelled by its upper edge only (``"<=b1"``), the
    last interval by its lower edge only (``">bN"``), and every interval in
    between by both edges (``">a <=b"``). With exactly two edges the single
    interval gets the ``"<="`` form.

    Args:
        bins: Ordered sequence of bin edges.

    Returns:
        A list with ``len(bins) - 1`` labels (empty when there are fewer than
        two edges), suitable as ``labels`` for a cut over the same edges.
    """
    final_interval = len(bins) - 2
    labels = []
    for position, (lower, upper) in enumerate(zip(bins, bins[1:])):
        if position == 0:
            labels.append(f"<={upper}")
        elif position == final_interval:
            labels.append(f">{lower}")
        else:
            labels.append(f">{lower} <={upper}")
    return labels

In [3]:
#create a function to cut a column into categories based upon a list of ranges
#return a figure displaying the distribution of the new categories
def cut_data(initial_col, ranges, df):
    """Bin a numeric column into labelled categories and plot the result.

    A new categorical column named ``<initial_col>_range`` is added to ``df``
    in place, then its distribution is plotted with ``histplots_by``.

    Args:
        initial_col: Name of the column to bin.
        ranges: Ordered bin edges passed to ``pd.cut``; the category labels
            are generated from these same edges.
        df: DataFrame to read from and to add the new column to (mutated).

    Returns:
        The figure produced by ``histplots_by``, retitled
        ``"<initial_col> categorization"``.

    Note:
        ``pd.cut`` only assigns values lying inside the outermost edges;
        anything outside becomes NaN even though the first and last labels
        read as open-ended ("<=..." / ">..."). Choose outer edges that cover
        the data.
    """
    new_col = initial_col + '_range'
    # Labels must come from the same edges handed to pd.cut; the parameter is
    # ``ranges`` (the previous reference to ``bins`` was an undefined name).
    categories = generate_labels(ranges)
    df[new_col] = pd.cut(df[initial_col], bins=ranges, labels=categories)
    fig = histplots_by(new_col, df)
    # Replace the generic title set by histplots_by.
    fig.suptitle(initial_col + " categorization")
    return fig

In [4]:
def plot_lift_curves(y_true, y_scores):
    """Plot cumulative lift curves for the positive and the negative class.

    Samples are ranked by ``y_scores`` (descending) for the positive curve
    and by ``1 - y_scores`` (descending) for the negative curve. At each
    cut-off k, lift is the class rate among the top-k samples divided by the
    class rate over all samples, so a lift of 1 (dashed line) means the top-k
    contain the class at exactly its overall rate.

    Args:
        y_true: Array-like of binary labels; values are summed for the
            positive class and cast to bool for the negative class.
        y_scores: Array-like of positive-class scores, same length as
            ``y_true``. Because ``1 - score`` is used as the negative-class
            score, these are presumably probabilities in [0, 1].

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Convert to positional arrays: plain lists then support ``1 - scores``,
    # and pandas inputs carrying different indexes are paired by position
    # instead of being label-aligned (which would introduce NaNs).
    labels = np.asarray(y_true)
    scores = np.asarray(y_scores)

    df_pos = pd.DataFrame({'true': labels, 'score': scores})
    df_neg = pd.DataFrame({'true': labels, 'score': 1 - scores})

    # Rank each frame by its own class score, best first.
    df_pos = df_pos.sort_values(by='score', ascending=False)
    df_neg = df_neg.sort_values(by='score', ascending=False)

    # Running count of class members found and of samples inspected.
    df_pos['cum_true_pos'] = df_pos['true'].cumsum()
    df_pos['cum_total'] = np.arange(1, len(df_pos) + 1)
    is_negative = ~df_neg['true'].astype(bool)
    df_neg['cum_true_neg'] = is_negative.cumsum()
    df_neg['cum_total'] = np.arange(1, len(df_neg) + 1)

    total_positives = df_pos['true'].sum()
    total_negatives = is_negative.sum()

    # Lift = (class rate in the top-k) / (overall class rate).
    df_pos['pos_lift'] = (
        df_pos['cum_true_pos'] / df_pos['cum_total'] / (total_positives / len(df_pos))
    )
    df_neg['neg_lift'] = (
        df_neg['cum_true_neg'] / df_neg['cum_total'] / (total_negatives / len(df_neg))
    )

    fig, ax = plt.subplots(1, 2, figsize=(12, 4))

    panels = (
        (ax[0], df_pos, 'pos_lift', 'Positive Lift Curve', 'blue'),
        (ax[1], df_neg, 'neg_lift', 'Negative Lift Curve', 'green'),
    )
    for axis, frame, lift_col, title, color in panels:
        axis.plot(frame['cum_total'], frame[lift_col], label=title, color=color)
        axis.plot([0, len(frame)], [1, 1], 'r--', label='Baseline')
        axis.set(xlabel='Number of Samples', ylabel='Lift', title=title)
        axis.legend()

    plt.show()

In [17]:
#create class to generate key statistics from confusion matrix
class confusion_matrix_class:
    """Key statistics derived from a 2x2 confusion matrix.

    The matrix is read with rows as actual classes and columns as predicted
    classes. By default the class at index 0 is treated as the positive
    class (``tp = cm[0, 0]``); with ``inverse=True`` the matrix is reordered
    so that the class at index 1 becomes the positive class.

    NOTE(review): scikit-learn orders labels ascending, so for 0/1 labels the
    default treats label 0 as positive; pass ``inverse=True`` to treat label
    1 as positive.

    Attributes:
        cm: The matrix in the orientation used (reordered when ``inverse``).
        tp, fn, fp, tn: The four cell counts.
        act_pos, act_neg: Actual positive / negative totals.
        pred_pos, pred_neg: Predicted positive / negative totals.
        tot: Total number of samples.
        accuracy, precision, recall, F1, specificity: Derived ratios. A ratio
            whose denominator is zero is reported as 0.0 instead of NaN/inf.
    """

    def __init__(self, cm, inverse=False):
        """Compute all statistics from ``cm``.

        Args:
            cm: 2x2 array supporting ``cm[i, j]`` indexing (and fancy
                indexing when ``inverse`` is set), e.g. a NumPy array.
            inverse: When true, swap both axes so index 1 is the positive
                class.
        """
        matrix = cm
        if inverse:
            # Swap rows and columns so the positive class sits at index 0.
            order = [1, 0]
            matrix = cm[order, :][:, order]

        # After the optional swap the layout is always the same, so the cells
        # are read from one place.
        self.cm = matrix
        self.tp = matrix[0, 0]
        self.fn = matrix[0, 1]
        self.fp = matrix[1, 0]
        self.tn = matrix[1, 1]

        self.act_pos = self.tp + self.fn
        self.act_neg = self.tn + self.fp
        self.pred_pos = self.tp + self.fp
        self.pred_neg = self.tn + self.fn
        self.tot = self.act_pos + self.act_neg

        self.accuracy = self._ratio(self.tp + self.tn, self.tot)
        self.precision = self._ratio(self.tp, self.pred_pos)
        self.recall = self._ratio(self.tp, self.act_pos)
        self.F1 = self._ratio(
            2 * self.precision * self.recall, self.precision + self.recall
        )
        self.specificity = self._ratio(self.tn, self.act_neg)

    @staticmethod
    def _ratio(numerator, denominator):
        """Return numerator / denominator, or 0.0 when the denominator is 0."""
        return numerator / denominator if denominator else 0.0

In [16]:
#create function to produce consistent results for comparison across models
def create_results_record(label, cm, rounding=3):
    """Flatten a confusion-matrix summary into a dict for model comparison.

    Args:
        label: Identifier stored under the ``'label'`` key.
        cm: Object exposing ``accuracy``, ``precision``, ``recall``, ``F1``,
            the cell counts ``tp``/``fp``/``fn``/``tn``, the marginal totals
            ``pred_pos``/``pred_neg``/``act_pos``/``act_neg`` and ``tot``.
        rounding: Number of decimals the four ratio metrics are rounded to.

    Returns:
        A dict with the label, the rounded ratio metrics, the raw counts and
        the sample total under ``'total'``. Specificity is not included.
    """
    ratio_fields = ('accuracy', 'precision', 'recall', 'F1')
    count_fields = (
        'tp', 'fp', 'fn', 'tn',
        'pred_pos', 'pred_neg', 'act_pos', 'act_neg',
    )

    record = {'label': label}
    for field in ratio_fields:
        record[field] = round(getattr(cm, field), rounding)
    for field in count_fields:
        record[field] = getattr(cm, field)
    # The summary object calls this ``tot``; the record exposes it as 'total'.
    record['total'] = cm.tot
    return record