In [1]:
import pandas as pd
# Load the credit dataset from the CSV file in the working directory.
base = pd.read_csv(filepath_or_buffer='credito.csv')
# Preview the first five rows.
base.head(n=5)

Unnamed: 0,clientid,income,age,loan,default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [5]:
# Mean age over every row (invalid, non-positive ages included).
print(base['age'].mean())
# Mean age restricted to rows with a positive age; reported as the replacement value.
print(f'Vamos utilizar este valor para substituir a idades faltantes: {base.loc[base["age"] > 0, "age"].mean()}')

40.80755937840458
Vamos utilizar este valor para substituir a idades faltantes: 40.92770044906149


In [6]:
# Preprocessing
# Negative ages are treated as invalid. Replace them with the mean of the
# positive ages, computed from the data rather than a hand-typed, truncated
# constant (the original used 40.92 for a mean of ~40.9277).
idades_invalidas = base['age'] < 0
base.loc[idades_invalidas, 'age'] = base.loc[base['age'] > 0, 'age'].mean()
# clientid is not used as a feature (the model uses income, age and loan).
# errors='ignore' keeps this cell safe to re-run once the column is already gone,
# whereas `del base['clientid']` raises KeyError on a second execution.
base = base.drop(columns='clientid', errors='ignore')

In [7]:
# Feature matrix: the first three columns (income, age and loan).
previsores = base.iloc[:, :3].to_numpy()
# Target vector: the fourth column (default, 0 or 1).
classe = base.iloc[:, 3].to_numpy()

In [8]:
from sklearn.impute import SimpleImputer
# Fill missing feature values with the mean of their column.
# NOTE(review): the imputer is fitted on the full feature matrix, before the
# train/test split performed in a later cell — confirm this is intended.
imputer = SimpleImputer(strategy='mean')
previsores[:, :3] = imputer.fit_transform(previsores[:, :3])

In [9]:
# Features live on very different scales (income vs. age); standardising them
# avoids scale-related weighting problems in the learning algorithm.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [10]:
# Display the feature matrix as it is before scaling is applied.
previsores

array([[6.61559251e+04, 5.90170151e+01, 8.10653213e+03],
       [3.44151540e+04, 4.81171531e+01, 6.56474502e+03],
       [5.73171701e+04, 6.31080495e+01, 8.02095330e+03],
       ...,
       [4.43114493e+04, 2.80171669e+01, 5.52278669e+03],
       [4.37560566e+04, 6.39717958e+01, 1.62272260e+03],
       [6.94365796e+04, 5.61526170e+01, 7.37883360e+03]])

In [11]:
# Learn per-column mean and standard deviation, then standardise the features.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed in a later cell — confirm this is intended.
scaler.fit(previsores)
previsores = scaler.transform(previsores)
previsores

array([[ 1.45393393,  1.36538093,  1.20281942],
       [-0.76217555,  0.5426602 ,  0.69642695],
       [ 0.83682073,  1.67417189,  1.17471147],
       ...,
       [-0.07122592, -0.97448519,  0.35420081],
       [-0.11000289,  1.73936739, -0.92675625],
       [ 1.682986  ,  1.14917639,  0.96381038]])

In [12]:
# Split the rows into training and test sets (25% held out for testing).
from sklearn.model_selection import train_test_split
(
    previsores_treinamento,
    previsores_teste,
    classe_treinamento,
    classe_teste,
) = train_test_split(previsores, classe, test_size=0.25, random_state=0)

In [13]:
# Train a random forest (an ensemble of decision trees) and evaluate it on the test split.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# n_estimators sets how many trees are grown; fit() returns the fitted estimator.
classificador = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
).fit(previsores_treinamento, classe_treinamento)

previsoes = classificador.predict(previsores_teste)

# Confusion matrix: rows are the true classes, columns the predicted classes.
matriz = confusion_matrix(classe_teste, previsoes)
# Despite the variable name, this is the accuracy (fraction of correct predictions).
precisao = accuracy_score(classe_teste, previsoes)
matriz

array([[432,   4],
       [ 12,  52]], dtype=int64)

In [14]:
# Accuracy score of the classifier's predictions on the test split.
precisao

0.968