# Classification - Communicating Performance to Business 

## 1) Import & Prepare Data

In [None]:
import pandas as pd
# Load the churn dataset directly from the casbdai/datasets GitHub repository.
churn = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/casbdai/datasets/main/churn.csv"
)

### Check Structure of Data

In [None]:
# Print a structural summary of the raw churn data before any preprocessing.
churn.info()

### Separate Features and Labels

In [None]:
# Split the frame column-wise: "churn" is the label, every other column is a feature.
y = churn["churn"]  # Target variable
X = churn.drop(columns="churn")  # Features

### Dummy code pandas "objects"

In [None]:
# Dummy-code the non-numeric feature columns; drop_first=True leaves out the
# first level of each encoded column so the dummies are not redundant.
X = pd.get_dummies(data=X, drop_first=True)
X.head()

In [None]:
# Print a structural summary of the feature matrix after dummy coding.
X.info()

## 2) Create Test & Training Data


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows as test data; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)


*   **X:** Features to be split into testing and training data
*   **y:** Labels to be split into testing and training data
*   **test_size:** proportion of the dataset in the test data; usually ~ 30%
*   **random_state:** seed that makes results reproducible. Instances are assigned to the training and test data at random, so without a seed every run (and every computer) produces a different split. Providing a seed makes the results reproducible because the same seed always yields the same split.




## 3) Import, Initiate, and Train Models

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Decision tree with hand-picked hyperparameters: entropy as split criterion,
# at most 30 levels deep, and at least 50 training samples in every leaf.
tree_educatedguess = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=30,
    min_samples_leaf=50,
    random_state=12,
)
tree_educatedguess.fit(X=X_train, y=y_train)

In [None]:
# Random forest of 1000 trees. The seed matches the decision tree above so that
# re-running the notebook grows the same forest and yields the same scores.
forest = RandomForestClassifier(n_estimators=1000, random_state=12)
forest.fit(X_train, y_train)

In [None]:
# Gradient boosting with 1000 boosting stages and a learning rate of 0.5.
# The seed matches the decision tree above so that repeated runs are reproducible.
boost = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.5, random_state=12)
boost.fit(X_train, y_train)

# Evaluating Model Performance the Business Way

## Lift Curve

In [None]:
def plot_lift_curve(y_val, y_pred):
    """Calculate and plot the cumulative lift curve of a binary classifier.

    Instances are ranked by predicted score, highest first. For every top-x%
    slice of that ranking (x = 1%, 2%, ..., 100%) the lift is the share of
    true events inside the slice divided by the share of true events in the
    whole sample. A grey horizontal line at lift 1 is drawn as the baseline.

    Parameters
    ----------
    y_val : array-like
        True labels. They are summed, so events must be coded as 1 (or True)
        and non-events as 0 (or False).
    y_pred : array-like
        Predicted score per instance, in the same order as ``y_val``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in length, or ``y_val`` contains no
        true events (lift is undefined in that case).
    """
    from pandas import DataFrame
    from numpy import arange, asarray
    import matplotlib.pyplot as plt

    # Define an auxiliary dataframe to plot the curve. Labels and scores are
    # paired strictly by position: converting to arrays first prevents pandas
    # from aligning them on differing indexes (e.g. a shuffled test-set index
    # versus a fresh 0..n-1 index), which would silently introduce NaNs.
    aux_lift = DataFrame({"true": asarray(y_val), "predicted": asarray(y_pred)})
    aux_lift = aux_lift.sort_values("predicted", ascending=False)

    n_rows = len(aux_lift)
    if n_rows == 0:
        raise ValueError("y_val and y_pred must not be empty")

    # Calculate the total ratio of true events in the data
    ratio_true_events_total = aux_lift["true"].sum() / n_rows
    if ratio_true_events_total == 0:
        raise ValueError("y_val contains no true events, so lift is undefined")

    # Create the values that will go into the X axis of our plot:
    # a sequence from 0.01 to 1.00 in steps of 0.01
    percent = arange(1, 101)
    xval = percent / 100

    # Number of top-ranked rows in each slice: ceil(percent * n_rows / 100),
    # done in integer arithmetic because float products such as 0.14 * 100
    # evaluate to 14.000000000000002 and would round up one row too many.
    cutoffs = (percent * n_rows + 99) // 100

    # Lift per slice = (true events in slice / slice size) / overall event ratio
    cumulative_true = aux_lift["true"].to_numpy().cumsum()
    lift_values = cumulative_true[cutoffs - 1] / cutoffs / ratio_true_events_total

    # Build results dataframe
    lift = DataFrame({"Lift": lift_values, "ProportionSample": xval})

    # Create plot
    fig, ax = plt.subplots(figsize=(13, 5), dpi=300)
    ax.plot(lift["ProportionSample"], lift["Lift"], color='green', linewidth=3, label="Model")
    ax.plot([0, 1], [1, 1], color="grey", label="Baseline")
    ax.set_xlabel('\nProportion of sample', fontsize=13)
    ax.set_ylabel('Lift\n', fontsize=13)
    ax.set_title('Lift Curve\n', fontsize=15)
    ax.xaxis.set_tick_params(labelsize=11)
    ax.yaxis.set_tick_params(labelsize=11)
    ax.legend()
    plt.show()

## Expected Value of Models

Define value of business outcomes 

In [None]:
# TODO: replace None with the business value of each outcome before running
# the cells below (a bare "name =" without a value is a SyntaxError).
value_true_positive = None   # value of a correctly targeted (true positive) instance
value_false_positive = None  # value of a wrongly targeted (false positive) instance

Define Function for Scoring Model:

In [None]:
def calculate_expected_value_model(matrix, value_true_positive, value_false_positive):
    """Return the expected value per instance implied by a confusion matrix.

    Works only for 2x2 confusion matrices with the actual classes in the rows
    and the predicted classes in the columns, negative class first:

        [[TN, FP],
         [FN, TP]]

    True negatives and false negatives are valued at 0, so the result equals
    (TP * value_true_positive + FP * value_false_positive) / total, rounded
    to two decimals.

    Parameters
    ----------
    matrix : array-like
        Confusion matrix in the layout above (ndarray, nested list, DataFrame).
    value_true_positive : number
        Value of one true positive.
    value_false_positive : number
        Value of one false positive.

    Returns
    -------
    float
        Expected value, rounded to 2 decimals.

    Raises
    ------
    ValueError
        If the matrix contains no observations at all.
    """
    from numpy import asarray

    # Accept anything array-like, not only ndarrays that support matrix[1, :].
    matrix = asarray(matrix, dtype=float)

    total = matrix.sum()
    if total == 0:
        raise ValueError("confusion matrix contains no observations")

    actual_neg = matrix[0, :]
    actual_pos = matrix[1, :]

    # calculate prior probability of positive class
    p_prior_pos = actual_pos.sum() / total

    # calculate conditional probabilities of being predicted positive.
    # A class that does not occur has no defined rate; use 0 because its
    # prior is 0 as well, so it contributes nothing (instead of yielding NaN).
    n_neg = actual_neg.sum()
    n_pos = actual_pos.sum()
    p_false_positive = actual_neg[1] / n_neg if n_neg else 0.0
    p_true_positive = actual_pos[1] / n_pos if n_pos else 0.0

    # calculate expected values (TN and FN are valued at 0 and drop out)
    pos = p_prior_pos * (value_true_positive * p_true_positive)
    neg = (1 - p_prior_pos) * (value_false_positive * p_false_positive)
    return round(pos + neg, 2)

Get Expected Value for each contacted customer for random forest:

Get Expected Value for each contacted customer for decision tree:

## Get threshold probability

In [None]:
def calculate_targeting_threshold(value_true_positive, value_false_positive):
    """Return the break-even probability for targeting an instance.

    This is the probability p that solves

        p * value_true_positive + (1 - p) * value_false_positive = 0

    i.e. the point at which the expected value of targeting is exactly zero.

    Parameters
    ----------
    value_true_positive : number
        Value of one true positive.
    value_false_positive : number
        Value of one false positive.

    Returns
    -------
    float
        The break-even probability p.

    Raises
    ------
    ValueError
        If both values are equal, in which case the equation has no unique
        solution.
    """
    denominator = value_true_positive - value_false_positive
    if denominator == 0:
        raise ValueError(
            "value_true_positive and value_false_positive must differ; "
            "otherwise no unique threshold exists"
        )
    # The equation is linear, so it is solved in closed form:
    # p = -value_false_positive / (value_true_positive - value_false_positive).
    # Adding 0.0 turns a possible -0.0 into 0.0.
    return float(-value_false_positive / denominator) + 0.0