In [4]:
# pip install CHAID

In [5]:
from CHAID import Tree
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
# Leitura dos dados
telefonia = pd.read_table("Telefonia_AD.txt")
telefonia.head()

Unnamed: 0,cod_cliente,Minutos_realizados_T0,Tempo_casa,Qtd_retencao_6meses,Qtd_prod,resposta
0,1,1.848,127,0,0,0
1,2,38.137,187,0,2,0
2,3,74.415,158,0,2,0
3,4,13.002,137,0,0,0
4,5,7.172,133,0,2,0


In [7]:
telefonia.dtypes

cod_cliente                int64
Minutos_realizados_T0    float64
Tempo_casa                 int64
Qtd_retencao_6meses        int64
Qtd_prod                   int64
resposta                   int64
dtype: object

In [8]:
telefonia.shape

(318463, 6)

In [9]:
telefonia.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cod_cliente,318463.0,159232.0,91932.493726,1.0,79616.5,159232.0,238847.5,318463.0
Minutos_realizados_T0,296340.0,130.239079,222.233789,0.022,21.681,56.408,136.45775,1474.066
Tempo_casa,318463.0,109.89254,58.811786,3.0,72.0,101.0,149.0,230.0
Qtd_retencao_6meses,318463.0,0.187202,0.585582,0.0,0.0,0.0,0.0,45.0
Qtd_prod,318463.0,0.729912,0.951895,0.0,0.0,1.0,1.0,30.0
resposta,318463.0,0.008685,0.09279,0.0,0.0,0.0,0.0,1.0


In [10]:
# Verificando quantidade de missings
telefonia.isnull().sum()

cod_cliente                  0
Minutos_realizados_T0    22123
Tempo_casa                   0
Qtd_retencao_6meses          0
Qtd_prod                     0
resposta                     0
dtype: int64

In [11]:
# Tratamento da variável Minutos_realizados_T0
telefonia.Minutos_realizados_T0 = telefonia.Minutos_realizados_T0.fillna(0)

In [12]:
# Como a variável explicativa também deve ser categórica, 
# vamos segmentar a Idade em quartil
telefonia['Minutos_realizados_T0_q'] = pd.qcut(telefonia.Minutos_realizados_T0, 4)
telefonia['Tempo_casa_q'] = pd.qcut(telefonia.Tempo_casa, 4)
telefonia['Qtd_retencao_6meses_q'] = pd.qcut(telefonia.Qtd_retencao_6meses, 4, duplicates='drop')
telefonia['Qtd_prod_q'] = pd.qcut(telefonia.Qtd_prod, 4, duplicates='drop')

In [13]:
# Tabela Bidimensional: covariável x resposta
minutos_tab = pd.crosstab(telefonia["Minutos_realizados_T0_q"],telefonia["resposta"],margins=True)
minutos_tab

resposta,0,1,All
Minutos_realizados_T0_q,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(-0.001, 16.082]",78480,1141,79621
"(16.082, 49.72]",79056,558,79614
"(49.72, 126.731]",79101,514,79615
"(126.731, 1474.066]",79060,553,79613
All,315697,2766,318463


In [14]:
tempo_casa_tab = pd.crosstab(telefonia["Tempo_casa_q"],telefonia["resposta"],margins=True)
tempo_casa_tab

resposta,0,1,All
Tempo_casa_q,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(2.999, 72.0]",82167,1505,83672
"(72.0, 101.0]",80505,604,81109
"(101.0, 149.0]",74097,397,74494
"(149.0, 230.0]",78928,260,79188
All,315697,2766,318463


In [15]:
qtd_retencao_tab = pd.crosstab(telefonia["Qtd_retencao_6meses_q"],telefonia["resposta"],margins=True)
qtd_retencao_tab

resposta,0,1,All
Qtd_retencao_6meses_q,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(-0.001, 45.0]",315697,2766,318463
All,315697,2766,318463


In [16]:
qtd_prod_tab = pd.crosstab(telefonia["Qtd_prod_q"],telefonia["resposta"],margins=True)
qtd_prod_tab

resposta,0,1,All
Qtd_prod_q,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(-0.001, 1.0]",269411,2702,272113
"(1.0, 30.0]",46286,64,46350
All,315697,2766,318463


## Modelo - Árvore de Decisão

In [17]:
# Transformando a variável resposta em categórica
telefonia['resposta_cat'] = telefonia.resposta.astype('category')

In [18]:
var_explicativas = telefonia[[
    'Minutos_realizados_T0_q',
    'Tempo_casa_q', 
    'Qtd_retencao_6meses_q',
    'Qtd_prod_q']]

var_resposta = telefonia['resposta_cat']

In [19]:
# Constrói o modelo de árvore
modelo = Tree.from_numpy(
    var_explicativas.to_numpy(), 
    var_resposta.to_numpy(), 
    split_titles=['Minutos_realizados_T0_q',
                  'Tempo_casa_q', 
                  'Qtd_retencao_6meses_q',
                  'Qtd_prod_q'], 
    min_child_node_size=2)

In [20]:
modelo.print_tree()

([], {0: 315697.0, 1: 2766.0}, (Tempo_casa_q, p=1.9579279056686072e-264, score=1221.0786987646634, groups=[[Interval(2.999, 72.0, closed='right')], [Interval(72.0, 101.0, closed='right')], [Interval(101.0, 149.0, closed='right')], [Interval(149.0, 230.0, closed='right')]]), dof=3))
|-- ([Interval(2.999, 72.0, closed='right')], {0: 82167.0, 1: 1505.0}, (Qtd_prod_q, p=8.690503159875556e-32, score=137.65040968906783, groups=[[Interval(-0.001, 1.0, closed='right')], [Interval(1.0, 30.0, closed='right')]]), dof=1))
|   |-- ([Interval(-0.001, 1.0, closed='right')], {0: 72676.0, 1: 1477.0}, <Invalid Chaid Split> - the max depth has been reached)
|   +-- ([Interval(1.0, 30.0, closed='right')], {0: 9491.0, 1: 28.0}, <Invalid Chaid Split> - the max depth has been reached)
|-- ([Interval(72.0, 101.0, closed='right')], {0: 80505.0, 1: 604.0}, (Minutos_realizados_T0_q, p=1.0250634750014723e-20, score=87.11277400461087, groups=[[Interval(-0.001, 16.082, closed='right')], [Interval(16.082, 49.72, clo

In [21]:
modelo.classification_rules()

[{'node': 2,
  'rules': [{'variable': 'Qtd_prod_q',
    'data': [Interval(-0.001, 1.0, closed='right')]},
   {'variable': 'Tempo_casa_q',
    'data': [Interval(2.999, 72.0, closed='right')]}]},
 {'node': 3,
  'rules': [{'variable': 'Qtd_prod_q',
    'data': [Interval(1.0, 30.0, closed='right')]},
   {'variable': 'Tempo_casa_q',
    'data': [Interval(2.999, 72.0, closed='right')]}]},
 {'node': 5,
  'rules': [{'variable': 'Minutos_realizados_T0_q',
    'data': [Interval(-0.001, 16.082, closed='right')]},
   {'variable': 'Tempo_casa_q',
    'data': [Interval(72.0, 101.0, closed='right')]}]},
 {'node': 6,
  'rules': [{'variable': 'Minutos_realizados_T0_q',
    'data': [Interval(16.082, 49.72, closed='right'),
     Interval(49.72, 126.731, closed='right'),
     Interval(126.731, 1474.066, closed='right')]},
   {'variable': 'Tempo_casa_q',
    'data': [Interval(72.0, 101.0, closed='right')]}]},
 {'node': 8,
  'rules': [{'variable': 'Minutos_realizados_T0_q',
    'data': [Interval(-0.001, 16.

In [22]:
# Salvando na base os nós
telefonia['node'] = modelo.node_predictions()

In [23]:
# Salvando a taxa de resposta por nó
probs = telefonia.groupby(['node']).agg({'resposta':'mean'}).reset_index()

In [24]:
# Marcando se o nó é propenso
probs['propenso'] = np.where(probs['resposta'] >= telefonia['resposta'].mean(), 1, 0)

In [25]:
probs = probs.rename(columns={"resposta":"prob"})
probs

Unnamed: 0,node,prob,propenso
0,2.0,0.019918,1
1,3.0,0.002941,0
2,5.0,0.011853,1
3,6.0,0.005645,0
4,8.0,0.009927,1
5,9.0,0.003893,0
6,11.0,0.008153,0
7,12.0,0.00202,0


In [26]:
telefonia = telefonia.merge(probs, how='left', on='node')

In [27]:
tabela_desempenho = pd.crosstab(telefonia['resposta'], telefonia['propenso'])

In [28]:
tabela_desempenho

propenso,0,1
resposta,Unnamed: 1_level_1,Unnamed: 2_level_1
0,202208,113489
1,834,1932


In [29]:
acuracia = (tabela_desempenho[0][0] + tabela_desempenho[1][1])/tabela_desempenho.sum().sum()
acuracia

0.6410163818088758

In [30]:
sensibilidade = (tabela_desempenho[1][1])/(tabela_desempenho[1][1] + tabela_desempenho[0][1])
sensibilidade

0.6984815618221258

In [31]:
especificidade = (tabela_desempenho[0][0])/(tabela_desempenho[0][0] + tabela_desempenho[1][0])
especificidade

0.6405128968599638