## Model 7: IF +FS3 (kurtosis)

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the semicolon-separated scoring file; its first column becomes the index.
df = pd.read_csv(
    '../models/iforest_df_test_ckurtosis.csv',
    sep=';',
    index_col=0,
)

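A score close to 1 indicates an anomaly.  
Scores much smaller than 0.5 indicate normal observations.  
If all scores are close to 0.5, the sample does not seem to contain clearly distinct anomalies.  
Note: the `iforest_kurt_score` values in the table below are negative (about -0.50 to -0.46), so they are not on this 0–1 scale — TODO: confirm how the score was produced before applying the thresholds above.  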

prob_0 = probabilidade de estar satisfeito  
prob_1 = probabilidade de churn

In [3]:
def ks(data=None, target=None, prob=None, n_buckets=5):
    """Build a bucketed Kolmogorov-Smirnov (KS) table for a binary target.

    Rows are ranked by ``data[prob]`` (ties broken by order of appearance),
    split into ``n_buckets`` equal-sized rank buckets and listed from the
    highest-ranked bucket to the lowest. For every bucket the table reports
    the score range, the event / non-event counts, their share of the totals,
    the cumulative shares, and the KS value (cumulative event share minus
    cumulative non-event share, in percent).

    Adapted from https://www.listendata.com/2019/07/KS-Statistics-Python.html

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the target and score columns. It is NOT modified.
    target : str
        Name of the 0/1 event column; non-events are computed as ``1 - target``.
    prob : str
        Name of the score column used to rank the rows.
    n_buckets : int, default 5
        Number of rank buckets (5 = quintiles, 10 = deciles).

    Returns
    -------
    pandas.DataFrame
        Columns ``min_prob, max_prob, events, nonevents, event_rate,
        nonevent_rate, cum_eventrate, cum_noneventrate, KS``; the index runs
        from 1 and is named ``'Decile'`` (label kept for backward
        compatibility, whatever ``n_buckets`` is).

    Raises
    ------
    ValueError
        If ``data[target]`` contains no events or no non-events, because the
        rates would then be divisions by zero.

    Side effects
    ------------
    Sets the pandas option ``display.max_columns`` to 9 and prints the maximum
    KS (in red when ``colorama`` is installed, plain text otherwise).
    """
    def _as_pct(share):
        # Keep two significant digits of the fraction, then express as percent.
        return share.apply('{0:.2}'.format).astype(float) * 100

    # Work on a private, positionally-indexed frame so the caller's DataFrame
    # never receives helper columns.
    work = pd.DataFrame({
        'score': data[prob].to_numpy(),
        'event': data[target].to_numpy(),
    })
    work['nonevent'] = 1 - work['event']

    total_events = work['event'].sum()
    total_nonevents = work['nonevent'].sum()
    if total_events == 0 or total_nonevents == 0:
        raise ValueError(
            "ks() needs at least one event and one non-event in data[target]"
        )

    # Ranking with method='first' makes every rank unique, so qcut always
    # finds distinct bin edges even when many scores are tied.
    work['bucket'] = pd.qcut(
        work['score'].rank(method='first'), n_buckets, labels=False
    )

    by_bucket = work.groupby('bucket')
    kstable = pd.DataFrame({
        'min_prob': by_bucket['score'].min(),
        'max_prob': by_bucket['score'].max(),
        'events': by_bucket['event'].sum(),
        'nonevents': by_bucket['nonevent'].sum(),
    })
    # Highest-ranked bucket first. Ordering by bucket number (not by min_prob)
    # stays deterministic when several buckets share the same minimum score.
    kstable = kstable.sort_index(ascending=False).reset_index(drop=True)

    event_share = kstable['events'] / total_events
    nonevent_share = kstable['nonevents'] / total_nonevents
    cum_event_share = event_share.cumsum()
    cum_nonevent_share = nonevent_share.cumsum()

    kstable['event_rate'] = _as_pct(event_share)
    kstable['nonevent_rate'] = _as_pct(nonevent_share)
    kstable['cum_eventrate'] = _as_pct(cum_event_share)
    kstable['cum_noneventrate'] = _as_pct(cum_nonevent_share)
    # KS is computed from the unrounded cumulative shares.
    kstable['KS'] = np.round(cum_event_share - cum_nonevent_share, 3) * 100

    kstable.index = pd.RangeIndex(1, len(kstable) + 1, name='Decile')
    pd.set_option('display.max_columns', 9)

    # Display KS; colour is optional so the function also works without colorama.
    try:
        from colorama import Fore
        red, reset = Fore.RED, Fore.RESET
    except ImportError:
        red, reset = '', ''
    best_ks = kstable['KS'].max()
    best_bucket = kstable['KS'].idxmax()
    # Reset the colour afterwards so later output is not left red.
    print(red + "KS is " + str(best_ks) + "%" + " at decile " + str(best_bucket) + reset)
    return kstable

In [4]:
from scipy.stats import ks_2samp
# Label column and score column taken from the loaded frame.
y, yhat = df['TARGET'], df['iforest_kurt_score']
def ks_stat(y, yhat):
    """Return the two-sample KS statistic of ``yhat`` split by ``y``.

    The scores in ``yhat`` are divided into those where ``y == 1`` and those
    where ``y != 1``; the two groups are compared with ``scipy.stats.ks_2samp``
    and only the ``statistic`` field of the result is returned.
    """
    scores_positive = yhat[y == 1]
    scores_rest = yhat[y != 1]
    outcome = ks_2samp(scores_positive, scores_rest)
    return outcome.statistic
# Cell output: two-sample KS statistic of the score for TARGET == 1 vs. the rest.
ks_stat(y, yhat)

0.20256876144914338

In [5]:
# Bucketed KS table for the `iforest_kurt_score` column against TARGET;
# the bare name on the last line renders the table as the cell output.
results_quintil = ks(df, target='TARGET', prob='iforest_kurt_score')
results_quintil

[31mKS is 18.6% at decile 1


Unnamed: 0_level_0,min_prob,max_prob,events,nonevents,event_rate,nonevent_rate,cum_eventrate,cum_noneventrate,KS
Decile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,-0.4565,-0.4565,230,2811,38.0,19.0,38.0,19.0,18.6
2,-0.456658,-0.4565,73,2968,12.0,20.0,50.0,40.0,10.3
3,-0.457297,-0.456658,135,2905,22.0,20.0,72.0,59.0,12.7
4,-0.459133,-0.457297,70,2971,12.0,20.0,84.0,80.0,3.8
5,-0.499749,-0.459133,99,2942,16.0,20.0,100.0,100.0,0.0


In [6]:
from sklearn.metrics import roc_auc_score

# Compute the AUC once and derive Gini from it (Gini = 2 * AUC - 1),
# instead of scoring the same inputs twice.
roc_auc = roc_auc_score(df['TARGET'], df['iforest_kurt_score'])
print("Roc Auc: ", roc_auc)
print("Gini: ", 2 * roc_auc - 1)

Roc Auc:  0.6015788940856819
Gini:  0.20315778817136376


In [7]:
from scipy.stats import ks_2samp
# `y`, `yhat` and `ks_stat()` are already defined by the earlier KS cell (In [4]);
# reuse them rather than re-declaring identical copies that could drift apart.
print("KS: ", ks_stat(y, yhat))

KS:  0.20256876144914338


In [8]:
# Relabel the nine KS-table columns with the report headers (in place),
# then export the table as a ';'-separated CSV.
results_quintil.columns = [
    'min_prob',
    'max_prob',
    'Qtd de eventos (Target)',
    'Qtd de não-eventos (Target)',
    '% de eventos no Decil',
    '% de não-eventos no Decil',
    '% de evento acumulados',
    '% de não-evento acumulados',
    'KS',
]
results_quintil.to_csv(
    "results_model7_iforest_fskurtos.csv",
    sep=';',
)