# Cycle 02 - Supervised Learning - Classification

## 1.0 Imports

In [147]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

from sklearn import metrics as mt

## 1.1 Load Datase

In [172]:
df = pd.read_csv( '/Users/adriele/Documents/repos/fund_ml/dataset/train.csv')

In [173]:
df.loc[:, 'limite_adicional'].value_counts()

Negar       7995
Conceder    1505
Name: limite_adicional, dtype: int64

In [174]:
df.loc[:, 'limite_adicional'].value_counts( normalize=True )

Negar       0.841579
Conceder    0.158421
Name: limite_adicional, dtype: float64

## 2.0  Features Selection
                  

In [175]:
features = ['idade', 'saldo_atual', 'divida_atual', 'renda_anual', 'valor_em_investimentos',
                              'taxa_utilizacao_credito', 'num_emprestimos', 'num_contas_bancarias', 'num_cartoes_credito',
                              'dias_atraso_dt_venc', 'num_pgtos_atrasados', 'num_consultas_credito', 'taxa_juros']

label = ['limite_adicional']

## 3.0 Data Preparation for train

In [187]:
x_train = df.loc[:, features]
y_train = df.loc[:, label].values.ravel()


k = 21

# Treinamento do algoritmo KNN
knn_classifier = KNeighborsClassifier( n_neighbors = k )
knn_classifier.fit( x_train, y_train )

# Previsão das observações
y_pred = knn_classifier.predict( x_train )

# Comparação do Previsto com o Realizado
#df1 = df.loc[:, ['id_cliente', 'limite_adicional']]

df_results = df.copy()
df_results['classificacao'] = y_pred
df_results.sample( 10)

Unnamed: 0,id_cliente,idade,saldo_atual,divida_atual,renda_anual,valor_em_investimentos,taxa_utilizacao_credito,num_emprestimos,num_contas_bancarias,num_cartoes_credito,dias_atraso_dt_venc,num_pgtos_atrasados,num_consultas_credito,taxa_juros,investe_exterior,pessoa_polit_exp,limite_adicional,classificacao
6272,8047,57,422.697187,1063.18,42593.69303,96.372156,40.725937,1,6,3,27,10,1,11,Não,Não,Negar,Negar
5475,596,49,368.731345,939.2,29024.931285,55.447013,36.614841,3,5,3,25,14,4,8,Sim,Não,Negar,Negar
557,1202,42,266.090911,1332.45,11202.535825,41.734346,38.448629,2,7,4,30,12,6,18,Não,Não,Negar,Negar
3411,6147,31,361.887885,1406.53,72577.575,124.367443,40.122092,5,8,3,29,13,11,18,Não,Não,Negar,Negar
5305,6344,52,297.462809,1372.83,51221.1455,365.480357,40.277766,0,7,3,10,14,3,15,Não,Não,Negar,Negar
3875,10619,21,190.676829,2514.62,27461.60137,185.101395,25.764323,5,9,10,60,22,9,28,Não,Não,Negar,Negar
4778,6384,34,639.14663,675.51,66947.132,70.576037,37.008729,0,5,3,13,18,4,5,Não,Não,Negar,Negar
8782,9046,25,496.292058,1369.02,90537.63888,348.918938,29.297803,2,1,4,9,2,1,10,Não,Não,Negar,Negar
3036,6928,35,389.594288,2482.73,42827.1053,68.008761,35.56124,5,10,8,25,19,7,16,Não,Não,Negar,Negar
7610,4766,59,336.67671,21.07,40289.68412,57.668059,34.809239,4,4,4,30,8,4,7,Não,Não,Negar,Negar


In [188]:
df_results['acertos'] =  ( df_results.loc[:, ['id_cliente', 'limite_adicional', 'classificacao']].apply(
lambda x: 1 if x['limite_adicional'] == x['classificacao'] else 0, axis=1 ) )

# 4.0  Performance

## 4.1 Matriz de confusao

In [189]:
mt.confusion_matrix( y_train, y_pred )

array([[ 146, 1359],
       [ 102, 7893]])

In [190]:
print( 'Matriz de Confusao: {:.2f}'.format(( 369 + 7792 ) / (9500)* 100 ))

Matriz de Confusao: 85.91


## 4.2 Acuracia

In [191]:
df_results['acertos'].sum()

8039

In [192]:
df_results.shape

(9500, 19)

In [193]:
print( 'Acuracia: {:.2f}'.format(( 8161 / 9500 ) * 100 ))

Acuracia: 85.91


In [194]:
mt.accuracy_score( y_train, y_pred)

0.8462105263157895

- k3 = 88%
- k5 = 86%
- k7 = 85%
- k9 = 85% 
- k11 = 85%
- k13 = 84%
- k15 = 84%
- k17 = 84%
- k19 = 84%
- k21 = 84%


## 4.3 Precisao

In [184]:

mt.precision_score( y_train, y_pred, average="binary", pos_label='Conceder' )


0.6111111111111112

In [185]:
## 4.4 Recall

In [186]:

mt.recall_score( y_train, y_pred, average='binary', pos_label='Conceder' )

0.10963455149501661