### Ferramenta para segmentação de score

#### Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from pylab import rcParams
import pickle
import sys
import os

from ipywidgets import interactive, fixed, Button, ButtonStyle, FloatSlider, IntSlider
from IPython.display import display, clear_output, HTML

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

rcParams['figure.figsize'] = 14,14
rcParams['legend.fontsize'] = 10
rcParams['xtick.labelsize'] = 12
rcParams['ytick.labelsize'] = 12
rcParams['font.size'] = 12
rcParams['xtick.alignment'] = 'center'
pd.set_option('max_row', 500)

%matplotlib inline

#### Funções auxiliares

In [2]:
def calcula_roc(true, pred):
    roc = roc_auc_score(y_true=true, y_score=pred)
    return roc

def calcula_gini(true, pred):
    roc = roc_auc_score(y_true=true, y_score=pred)
    gini = (2*roc)-1
    return gini

def dist_gamma(x):
    dist=stats.gamma(0.8)
    return dist.cdf(x)

def check_variaveis(df, campo_cpf, campo_ref, campo_score, campo_target):
    erro = False
    if campo_ref not in df.columns:
        print (f'Erro! Campo {campo_ref} não encontrado!')
        erro = True
        
    if campo_ref not in df.columns:
        print (f'Erro! Campo {campo_ref} não encontrado!')
        erro = True
        
    if campo_ref not in df.columns:
        print (f'Erro! Campo {campo_ref} não encontrado!')
        erro = True
        
    if campo_ref not in df.columns:
        print (f'Erro! Campo {campo_ref} não encontrado!')
        erro = True
        
    if (df[[campo_cpf, campo_ref, campo_score, campo_target]].isnull().sum().any() != 0):
        print ('Erro! Campos missing encontrados.')
        erro = True
    
    if not erro:
        print ('Tudo ok!')

def quebras_iniciais_from_tree(df, df_desenv, campo_target, campo_score, min_samples_leaf=0.02, max_leaf_nodes=40):
    roc = calcula_roc(df[[campo_target]], df[[campo_score]])
    gini = calcula_gini(df[[campo_target]], df[[campo_score]])
    print (f'Roc: {roc}')
    print (f'Gini: {gini}')
    
    clf_dt = DecisionTreeClassifier(max_depth=None,
                                criterion='gini',
                                random_state=123,
                                min_samples_leaf=min_samples_leaf,
                                max_leaf_nodes=max_leaf_nodes)
    
    clf_dt.fit(df[[campo_score]], df[[campo_target]])
    df['folha'] = clf_dt.apply(df[[campo_score]])
    
    mins = df.groupby('folha')[campo_score].min().values
    maxs = df.groupby('folha')[campo_score].max().values
    
    print ('Folhas:', df['folha'].nunique())
    
    # transforma score em folhas
    final_vet = [0]
    vet = sorted(np.concatenate([mins, maxs]))
    for i in range(1, len(vet)-1, 2):
        final_vet.append(np.mean(vet[i:i+1]))
    final_vet.append(1)
    final_vet = sorted(final_vet)
    print (f'GHs: {len(final_vet)-1}')
    return final_vet
    

def grupos_from_quebras(qtds, quebras, score):
    gh=1
    position=1
    bins=[-np.inf]
    for i in range(len(qtds)):
        while (qtds[i] > 0):
            position+=1
            qtds[i]-=1
        if (position > (len(quebras)-2)):
            break
        bins.append(quebras[position])
    bins.append(np.inf)
    bins = sorted(list(set(bins)))
    labels = ['GH'+str(i+1).zfill(2) for i in range(len(bins)-1)]
    return bins, labels
    
def plot_estabilidade(df):
    plt.close()
    plt.figure(figsize=(8,8))
    for gh in sorted(list(df['GH'].unique())):
        dfg = df[df['GH'] == gh]
        dp = dfg.groupby('Safra')[['Resposta']].mean()
        dp['Bad12'] = dp['Resposta'].values.tolist()
        dp.drop('Resposta', axis=1, inplace=True)
        plt.plot(range(0, len(dp.index)), dp.values*100, label=gh)
    plt.xticks(range(0, len(dp.index)), [str(int(a)) for a in dp.index], rotation=60)
    plt.grid(False)
    plt.legend(loc='upper right')
    plt.show()
    
def show_infos(df_sample, df_sample_atual, bins, labels):
    dfr = df_sample[[campo_score, campo_target, campo_ref]].copy()
    
    dfr.columns = ['Score', 'Resposta', 'Safra']
    dfr['Volume'] = 1
    dfr['GH'] = pd.cut(dfr['Score'], bins, labels=labels)
    
    grp = dfr.groupby('GH').agg({'Resposta':'mean', 'Volume':lambda x: sum(x)/dfr.shape[0]})
    grp['Bad6'] = grp['Resposta']
    grp['Bad12'] = grp['Bad6'].values.tolist()
    grp = grp[['Bad6', 'Bad12', 'Volume']]
    
    plot_estabilidade(dfr)
    
    grp_show = (grp*100).round(2).astype(str) + '%'
    display(grp_show)
    
    
def sliders_init(NUM_MAX_GHS):
    path_model = 'saved_sliders.pkl'
    if os.path.exists(path_model):
        values = pickle.load(open(path_model, 'rb')) + [0]*NUM_MAX_GHS
        dic = {'GH'+str(i+1).zfill(2):IntSlider(min=0, max=40, step=1, value=j) for i,j in zip(range(NUM_MAX_GHS), values)}
    else:
        dic = {'GH'+str(i+1).zfill(2):IntSlider(min=0, max=40, step=1, value=0) for i in range(NUM_MAX_GHS)}
    return dic

def gera_GHs(df_sample, df_sample_atual, quebras, bins_in, labels_in, qtds_in, **dic):
    bins, labels = grupos_from_quebras(list(dic.values()), quebras, df_sample[campo_score])
    
    show_infos(df_sample, df_sample_atual, bins, labels)

    bins_in[0] = bins
    labels_in[0] = labels
    qtds_in[0] = list(dic.values())
    pickle.dump(list(dic.values()), open('saved_sliders.pkl', 'wb'))
    
def ajusteFino(df_sample, df_sample_atual, bins_in, labels_in, **dic):
    bins = [-np.inf] + list(dic.values()) + [np.inf]
    labels = labels_in[0]
    
    show_infos(df_sample, df_sample_atual, bins, labels)
    
    bins_in[0] = bins
    labels_in[0] = labels

#### Leitura dos dados

In [3]:
df = pd.read_excel('dados.xlsx')
print(f'Shape: {df.shape}')
df.head()

Shape: (24, 4)


Unnamed: 0,cpf,safra,score,target
0,1,202101,0.9,1
1,3,202101,0.91,1
2,4,202101,0.85,0
3,9,202101,0.4,0
4,4,202101,0.2,0


#### Parâmetros

In [4]:
NUM_MAX_GHS = 10

campo_cpf = 'cpf'
campo_ref = 'safra'
campo_score = 'score'
campo_target = 'target'

#### Check de variáveis

In [5]:
check_variaveis(df, campo_cpf, campo_ref, campo_score, campo_target)

Tudo ok!


#### Check de volumetria

In [6]:
df[campo_ref].value_counts(dropna=False)

202104    6
202103    6
202102    6
202101    6
Name: safra, dtype: int64

In [7]:
df[campo_ref].value_counts(dropna=False, normalize=True)

202104    0.25
202103    0.25
202102    0.25
202101    0.25
Name: safra, dtype: float64

#### Calcula Gini e aplica quebras

In [8]:
df_sample = df[[campo_score, campo_target, campo_ref]].copy()
df_sample_atual = df[[campo_score]].copy()

# define grupos iniciais
final_vet = quebras_iniciais_from_tree(df_sample, df_sample_atual, campo_target, campo_score, min_samples_leaf=0.02, max_leaf_nodes=40)

display(HTML("""<style>table {position: absolute; top:12%; left:60%;}"</style>"""))
display(HTML("""<style>widget-hslider{height:14px;}"</style>"""))

Roc: 0.9166666666666665
Gini: 0.833333333333333
Folhas: 4
GHs: 4


#### Gera Grupos

In [9]:
dic = sliders_init(NUM_MAX_GHS)

bins, labels, qtds= [0], [0], [0]

# sliders
interactive(gera_GHs,
            df_sample=fixed(df_sample),
            df_sample_atual=fixed(df_sample_atual),
            quebras=fixed(final_vet),
            bins_in=fixed(bins),
            labels_in=fixed(labels),
            qtds_in=fixed(qtds),
            **dic)

interactive(children=(IntSlider(value=1, description='GH01', max=40), IntSlider(value=0, description='GH02', m…

#### Ajuste Fino

In [15]:
quebras = {}
for i in range(1, len(bins[0])-1, 1):
    nmax = min(bins[0][i]*1.5, bins[0][i+1])
    nmin = max(0, bins[0][i-1])
    quebras['GH'+str(i).zfill(2)] = FloatSlider(values=bins[0][1], min=0, max=1, step=((nmax-nmin)/20))
    
interactive(ajusteFino,
            df_sample=fixed(df_sample),
            df_sample_atual=fixed(df_sample_atual),
            bins_in=fixed(bins),
            labels_in=fixed(labels),
            qtds_in=fixed(qtds),
            **quebras)

interactive(children=(FloatSlider(value=0.0, description='GH01', max=1.0, step=0.030000000000000006), Output()…

In [11]:
df_full = df.copy()
df_full['GH'] = pd.qcut(df_full[campo_score], bins[0], labels=labels[0])

  if idx % 1 == 0:


OverflowError: cannot convert float infinity to integer

In [None]:
df_full.groupby('GH')[campo_target].mean()

In [None]:
df_full.to_csv('base_out.csv', index=False, sep=';')

In [None]:
# resultado
bins[0]