In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### Top Turn Down Reason (TTDR)
##### This code selects the top 4 turn-down reasons for an applicant.
The selection logic is as follows:
1. Divide each feature into N buckets (1 missing bucket + (N-1) non-missing buckets).
2. For each feature, take the bucket with the lowest actual target rate (bad rate) as the benchmark bucket (the safest bucket).
3. For each feature, calculate the distance between the applicant's bucket and the benchmark bucket, i.e. the target rate of the applicant's bucket minus the target rate of the benchmark bucket.
4. Rank the features by distance, from high to low.
5. Pick the top 4.

See the example below

<img src="PNG/TTDR.PNG">

In [2]:
def _nan_safe_min(rates):
    """Smallest non-NaN value in `rates`, or NaN when every entry is NaN."""
    # The builtin min() is order-dependent in the presence of NaN (a leading
    # NaN wins every comparison), so NaN entries are filtered out explicitly.
    valid = [rate for rate in rates if not pd.isnull(rate)]
    return min(valid) if valid else np.nan


def _continuous_bucket_rates(df, var, target_variable, bins):
    """Return (upper bucket edges, [missing rate, bucket 1 rate, ..., bucket `bins` rate]) for `var`."""
    is_missing = df[var].isnull()
    observed = df[~is_missing]

    # First entry is always the missing-value bucket (NaN when `var` has no missing rows).
    rates = [df.loc[is_missing, target_variable].mean()]

    # Upper edges of the percentile buckets, computed on non-missing values only.
    # 100.0 keeps the step a float even under integer-division semantics.
    edges = []
    if len(observed) > 0:
        edges = [np.percentile(observed[var], (100.0 / bins) * i) for i in range(1, bins + 1)]

    for i, upper in enumerate(edges):
        in_bucket = observed[var] <= upper
        if i > 0:
            in_bucket = in_bucket & (observed[var] > edges[i - 1])
        # An empty bucket (e.g. tied percentile edges) yields NaN here.
        rates.append(observed.loc[in_bucket, target_variable].mean())

    return edges, rates


def _rate_for_value(value, edges, rates):
    """Target rate of the bucket that `value` falls into; a missing value maps to the missing bucket."""
    if pd.isnull(value):
        return rates[0]
    if not edges:
        # The feature has no non-missing rows, so there is no bucket to place a real value in.
        return np.nan
    for i, upper in enumerate(edges):
        # Buckets are (previous edge, upper]; the first edge that is >= value identifies the bucket.
        if value <= upper:
            return rates[i + 1]
    # Value lies above the largest edge (the maximum seen in `df`): treat the top
    # bucket as open-ended, mirroring the open-ended bottom bucket, instead of
    # silently leaving the feature unscored.
    return rates[-1]


def TTDR(df, var_list, ind_list, target_variable, select_record, bins, top_n=4):
    """Rank features by how far the applicant's bucket is from the safest bucket.

    For every feature the rows of `df` are split into buckets, the mean of
    `target_variable` is computed per bucket, and the bucket with the lowest
    mean is taken as the benchmark. The distance of a feature is the mean of
    the applicant's bucket minus the benchmark mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw features plus the actual target column.
    var_list : list of str
        Continuous features; may contain missing values. Each is split into
        one missing-value bucket plus `bins` percentile buckets.
    ind_list : list of str
        Indicator features; buckets are value 0, value 1 and missing.
    target_variable : str
        Name of the binary 0/1 target column. Its mean within a bucket is that
        bucket's target (bad) rate, so 1 is taken to mark the bad outcome.
    select_record : pandas.Series
        The applicant record, indexed by feature name.
    bins : int
        Number of percentile buckets for the non-missing values of each
        continuous feature (the missing-value bucket comes on top of these).
    top_n : int, optional
        How many features to return; defaults to 4.

    Returns
    -------
    pandas.Series
        Named 'Distance', indexed by feature name, holding the `top_n`
        largest distances in descending order.

    Notes
    -----
    An indicator value other than 0, 1 or missing is not scored, so that
    feature is absent from the result.
    """
    TR_dic = {}         # feature -> target rate of every bucket
    select_target = {}  # feature -> target rate of the applicant's bucket

    # Step 1a: continuous features -> missing bucket + percentile buckets.
    for var in var_list:
        edges, rates = _continuous_bucket_rates(df, var, target_variable, bins)
        TR_dic[var] = rates
        select_target[var] = _rate_for_value(select_record[var], edges, rates)

    # Step 1b: indicator features. Each indicator is bucketed on its OWN values
    # and its OWN missingness, independent of any continuous feature.
    for var in ind_list:
        column = df[var]
        v0 = df.loc[column == 0, target_variable].mean()
        v1 = df.loc[column == 1, target_variable].mean()
        vm = df.loc[column.isnull(), target_variable].mean()
        TR_dic[var] = [v0, v1, vm]

        value = select_record[var]
        if pd.isnull(value):
            select_target[var] = vm
        elif value == 0:
            select_target[var] = v0
        elif value == 1:
            select_target[var] = v1

    # Step 2 + 3: benchmark = lowest non-NaN bucket rate of the feature;
    # distance = applicant's bucket rate - benchmark.
    names = list(select_target.keys())
    distances = [select_target[name] - _nan_safe_min(TR_dic[name]) for name in names]
    select_dist = pd.Series(distances, index=names, dtype=float, name='Distance')

    return select_dist.nlargest(top_n)

##### Example

In [3]:
# Toy data set: binary target Y, four numeric features (X1, X2, X4, X5)
# and one 0/1 indicator (X3). Every feature has at least one missing value.
d = {
    'Y':  [0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1],
    'X1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, None],
    'X2': [100, 90, 80, 70, 60, 50, 40, 30, 20, 10, 0, None],
    'X3': [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, None, None],
    'X4': [1, 2, 3, 5, 8, 13, 21, 34, 55, 89, None, 144],
    'X5': [5, 0, 4, 9, 3, 7, 6, 8, 1, 2, None, 10],
}
df = pd.DataFrame(d)
df

Unnamed: 0,X1,X2,X3,X4,X5,Y
0,1.0,100.0,0.0,1.0,5.0,0
1,2.0,90.0,0.0,2.0,0.0,1
2,3.0,80.0,0.0,3.0,4.0,0
3,4.0,70.0,1.0,5.0,9.0,1
4,5.0,60.0,0.0,8.0,3.0,0
5,6.0,50.0,1.0,13.0,7.0,1
6,7.0,40.0,0.0,21.0,6.0,0
7,8.0,30.0,0.0,34.0,8.0,0
8,9.0,20.0,0.0,55.0,1.0,1
9,10.0,10.0,0.0,89.0,2.0,0


In [4]:
# Numeric features go in var_list; the 0/1 flag X3 goes in ind_list.
var_list = ["X1", "X2", "X4", "X5"]
ind_list = ["X3"]

# Use the row labelled 10 as the applicant whose turn-down reasons we want.
select_record = df.loc[10]
select_record

X1    11.0
X2     0.0
X3     NaN
X4     NaN
X5     NaN
Y      1.0
Name: 10, dtype: float64

In [5]:
# Score the selected applicant against the toy data set, passing bins=5.
TTDR(
    df=df,
    var_list=var_list,
    ind_list=ind_list,
    target_variable='Y',
    select_record=select_record,
    bins=5,
)

X5    1.000000
X3    0.750000
X4    0.666667
X2    0.666667
Name: Distance, dtype: float64