In [1]:
# importações
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree

In [2]:
bank = pd.read_csv('/home/amador/dados/bank-numeric.csv')
bank.head()

Unnamed: 0,age,balance,duration,campaign,previous,default_cat,housing_cat,loan_cat,recent_pdays,deposit_cat,...,marital_divorced,marital_married,marital_single,education_primary,education_secondary,education_tertiary,education_unknown,poutcome_failure,poutcome_success,poutcome_unknown
0,59,2343,1042,1,0,0,1,0,0.0001,1,...,0,1,0,0,1,0,0,0,0,1
1,56,45,1467,1,0,0,0,0,0.0001,1,...,0,1,0,0,1,0,0,0,0,1
2,41,1270,1389,1,0,0,1,0,0.0001,1,...,0,1,0,0,1,0,0,0,0,1
3,55,2476,579,1,0,0,1,0,0.0001,1,...,0,1,0,0,1,0,0,0,0,1
4,54,184,673,2,0,0,0,0,0.0001,1,...,0,1,0,0,0,1,0,0,0,1


In [3]:
# separando os dados
bank_data = bank.drop('deposit_cat', 1) # ou axis=1
bank_target = bank.deposit_cat

In [4]:
# dividindo em treino e teste em 70/30
X_train, X_test, y_train, y_test = train_test_split(bank_data,bank_target,test_size=0.3)

In [5]:
# função para treinar o modelo com parâmetro max_depth (controla profundidade, logo a complexidade da árvore)
def compara_modelos(maxdepth): # o parâmetro é o próprio valor de maxdepth
    if maxdepth == 0: # se maxdepth for igual a zero
        dt = tree.DecisionTreeClassifier(random_state=1)# a árvore vai crescer até todas as folha se tornem puras
    else:   
        dt = tree.DecisionTreeClassifier(random_state=1, max_depth=maxdepth) # se diferente de 0
    dt.fit(X_train, y_train) # vai ser treinado
    train_score = dt.score(X_train, y_train) # score do treino
    test_score = dt.score(X_test, y_test) # score do teste
    return train_score,test_score # retorno dos scores
# quanto mais destoantes os valores, mais inviesado está

In [6]:
compara_modelos(0) # totalmente enviesado

(1.0, 0.7369363989250522)

In [7]:
compara_modelos(2) # bom resultado, pois está parecido nos resultados

(0.7284013823115321, 0.719020603165124)

In [8]:
compara_modelos(10) # começando a sofrer overfitting

(0.864712658389863, 0.78501045088086)

In [9]:
print('{:10} {:20} {:20}'.format('depth', 'Training score','Testing score'))
print('{:10} {:20} {:20}'.format('-----', '--------------','-------------'))
print('{:1}         {} '.format(2,str(compara_modelos(2))))
print('{:1}         {} '.format(3,str(compara_modelos(3))))
print('{:1}         {} '.format(4,str(compara_modelos(4))))
print('{:1}         {} '.format(10,str(compara_modelos(10))))
print('{:1}         {} '.format(15,str(compara_modelos(15))))
print('{:1}         {} '.format('Full',str(compara_modelos(0))))

depth      Training score       Testing score       
-----      --------------       -------------       
2         (0.7284013823115321, 0.719020603165124) 
3         (0.7755023678484577, 0.7691848312929233) 
4         (0.7848457698707283, 0.7790385189608838) 
10         (0.864712658389863, 0.78501045088086) 
15         (0.9493152438243952, 0.7494774559570021) 
Full         (1.0, 0.7369363989250522) 


In [10]:
# Verificando as features mais importantes para o modelo de arvore de decisão treinado
# Treinando o modelo utilizando o valor de max_depth igual a 4
dt = tree.DecisionTreeClassifier(max_depth=4)
dt.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=4)

In [11]:
# array com nivel de importância
dt.feature_importances_

array([0.00236846, 0.        , 0.67773833, 0.        , 0.        ,
       0.        , 0.08645433, 0.        , 0.10749534, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.12594355,
       0.        ])

In [12]:
# Listando as features e sua importância para o modelo
fi = dt.feature_importances_
l = len(bank.columns)
for i in range(0,len(bank.columns)-1):
    print('{:.<20} {:3}'.format(bank.columns[i], fi[i]))
    
# pode ser um inicio para avaliar a seleção de features e exclusão daquelas que não influenciam
# mas utilizar outras abordagens também

age................. 0.00236845514551972
balance............. 0.0
duration............ 0.6777383260704934
campaign............ 0.0
previous............ 0.0
default_cat......... 0.0
housing_cat......... 0.08645432721450465
loan_cat............ 0.0
recent_pdays........ 0.10749534394049491
deposit_cat......... 0.0
job_blue-collar..... 0.0
job_entrepreneur.... 0.0
job_other........... 0.0
job_pink-collar..... 0.0
job_self-employed... 0.0
job_technician...... 0.0
job_white-collar.... 0.0
marital_divorced.... 0.0
marital_married..... 0.0
marital_single...... 0.0
education_primary... 0.0
education_secondary. 0.0
education_tertiary.. 0.0
education_unknown... 0.0
poutcome_failure.... 0.12594354762898713
poutcome_success.... 0.0
