# 1.0 Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# 2.0 KNN (K-Nearest Neighbors)

## 2.1 Data Load

In [2]:
# Read the training set from disk and preview its first rows
train_csv_path = '../datasets/train.csv'
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,id_cliente,idade,saldo_atual,divida_atual,renda_anual,valor_em_investimentos,taxa_utilizacao_credito,num_emprestimos,num_contas_bancarias,num_cartoes_credito,dias_atraso_dt_venc,num_pgtos_atrasados,num_consultas_credito,taxa_juros,investe_exterior,pessoa_polit_exp,limite_adicional
0,1767,21,278.172008,2577.05,24196.89636,104.306544,31.038763,6,5,7,21,14,9,15,Não,Não,Negar
1,11920,40,268.874152,2465.39,19227.37796,69.858778,36.917093,5,8,5,40,23,10,18,Não,Não,Negar
2,8910,36,446.643127,1055.29,42822.28223,134.201478,34.561714,0,3,6,26,13,3,15,Sim,Não,Negar
3,4964,58,321.141267,703.05,51786.826,297.350067,31.493561,0,3,7,12,7,2,1,Sim,Não,Negar
4,10100,35,428.716114,891.29,44626.85346,134.201478,28.028887,2,8,7,24,10,8,20,Sim,Não,Negar


In [3]:
# Report how many rows and columns were loaded
n_rows, n_cols = df.shape
print(f'The dataframe have {n_rows}  rows and {n_cols} columns')

The dataframe have 9500  rows and 17 columns


## 2.2 Implement Model

In [4]:
# Class balance: relative frequency of each value of the target column
target_share = df['limite_adicional'].value_counts(normalize=True)
target_share

Negar       0.841579
Conceder    0.158421
Name: limite_adicional, dtype: float64

In [5]:
# Predictor variables: the numeric customer attributes fed to the model
features = ['idade', 'saldo_atual', 'divida_atual', 'renda_anual', 'valor_em_investimentos',
'taxa_utilizacao_credito', 'num_emprestimos', 'num_contas_bancarias', 'num_cartoes_credito',
'dias_atraso_dt_venc', 'num_pgtos_atrasados', 'num_consultas_credito', 'taxa_juros']

# Response variable. Named explicitly instead of being read from `all_features`,
# which is not defined anywhere in the notebook and raised NameError on a fresh kernel.
label = 'limite_adicional'

# Training data: feature matrix and a flat 1-D array of target labels
x_train = df.loc[:, features]
y_train = df.loc[:, label].values.ravel()  # .values/.ravel() turn the column into a 1-D array

In [6]:
# Create the KNN classifier that votes among the 7 nearest neighbors
knn_classifier = KNeighborsClassifier(n_neighbors=7)

# Fit the classifier on the training data
knn_classifier.fit(x_train, y_train)

# Predict on the same rows used for fitting, so every metric computed from
# y_pred is a training-set metric, not an out-of-sample estimate
y_pred = knn_classifier.predict(x_train)

# Side-by-side comparison of actual vs. predicted labels.
# The explicit .copy() makes df1 independent of df, so adding columns never
# touches df or triggers a SettingWithCopyWarning.
df1 = df.loc[:, ['id_cliente', 'limite_adicional']].copy()
df1['predicted'] = y_pred
# Vectorized equality check: 1 when the prediction matches the actual label, else 0
df1['right'] = (df1['limite_adicional'] == df1['predicted']).astype(int)

df1

Unnamed: 0,id_cliente,limite_adicional,predicted,right
0,1767,Negar,Negar,1
1,11920,Negar,Negar,1
2,8910,Negar,Negar,1
3,4964,Negar,Negar,1
4,10100,Negar,Negar,1
...,...,...,...,...
9495,5155,Negar,Negar,1
9496,11977,Negar,Negar,1
9497,9278,Negar,Negar,1
9498,2525,Negar,Negar,1


## 2.3 Metrics

**<span class="mark">Remember that one class (Negar) accounts for about 84% of the rows, which means the data is imbalanced.</span>**

### 2.3.1 Confusion Matrix

In [7]:
# Confusion matrix: rows are the actual classes, columns are the predicted classes
conf_matrix = metrics.confusion_matrix(y_true=y_train, y_pred=y_pred)
conf_matrix

array([[ 369, 1136],
       [ 203, 7792]], dtype=int64)

369 Classified as **Conceder** that was **Conceder**  
1136 Classified as **Negar** that was **Conceder**  
203 Classified as **Conceder** that was **Negar**  
7792 Classified as **Negar** that was **Negar**  

### 2.3.2 Accuracy

In [10]:
# Accuracy computed by hand: share of rows whose prediction matched the actual label
manual_accuracy = round(df1['right'].mean() * 100, 2)
print(f'The algorithm got a {manual_accuracy}% accuracy')

The algorithm got a 85.91% accuracy


In [11]:
# Same accuracy, this time computed by scikit-learn
acc = metrics.accuracy_score(y_train, y_pred)
print(f'The algorithm got a {acc * 100:.2f}% accuracy')

The algorithm got a 85.91% accuracy


### 2.3.3 Precision

In [24]:
# Precision: of all rows predicted as the positive class ('Conceder'), the share that were right
prec = metrics.precision_score(y_train, y_pred, pos_label='Conceder')
print(f'The algorithm got a {prec * 100:.2f}% precision score')

The algorithm got a 64.51% precision score


### 2.3.4 Recall

In [25]:
# Recall: of all rows that actually are 'Conceder', the share the model predicted as 'Conceder'
recall = metrics.recall_score(y_train, y_pred, pos_label='Conceder')
print(f'The algorithm got a {recall * 100:.2f}% recall score')

The algorithm got a 24.52% recall score


The algorithm has good accuracy because of the Negar class. If our objective is to deny customers, the result looks good, although it can still improve. If, however, our objective is to provide a good experience by granting the additional limit (Conceder), we are far from a good result: recall for that class is only about 25%.

## 3.0 Exercises

### 3.1.1 Retraining the algorithm for k = [3, 5, 7, 9, 11, 13, 15, 17, 19, 21]

In [30]:
# Quick look at the first five rows before retraining
df.head(n=5)

Unnamed: 0,id_cliente,idade,saldo_atual,divida_atual,renda_anual,valor_em_investimentos,taxa_utilizacao_credito,num_emprestimos,num_contas_bancarias,num_cartoes_credito,dias_atraso_dt_venc,num_pgtos_atrasados,num_consultas_credito,taxa_juros,investe_exterior,pessoa_polit_exp,limite_adicional
0,1767,21,278.172008,2577.05,24196.89636,104.306544,31.038763,6,5,7,21,14,9,15,Não,Não,Negar
1,11920,40,268.874152,2465.39,19227.37796,69.858778,36.917093,5,8,5,40,23,10,18,Não,Não,Negar
2,8910,36,446.643127,1055.29,42822.28223,134.201478,34.561714,0,3,6,26,13,3,15,Sim,Não,Negar
3,4964,58,321.141267,703.05,51786.826,297.350067,31.493561,0,3,7,12,7,2,1,Sim,Não,Negar
4,10100,35,428.716114,891.29,44626.85346,134.201478,28.028887,2,8,7,24,10,8,20,Sim,Não,Negar


In [45]:
# Empty dataframe that will receive k and the metrics obtained for each k value
df2 = pd.DataFrame()

# Predictor variables: every non-object column except the customer identifier.
# `id_cliente` is an ID, not a customer attribute, and it is also left out of
# the hand-written feature list used for the first model.
features = [
    col for col in df.select_dtypes(exclude="object").columns.to_list()
    if col != "id_cliente"
]

# Response variable. The cell previously assigned `labels` but then read `label`,
# which only worked when `label` had leaked in from an earlier cell.
label = "limite_adicional"
labels = label  # kept as an alias in case another cell refers to `labels`

# Training data: feature matrix and array of target labels
x_train = df.loc[:, features]
y_train = df.loc[:, label].values
y_train

array(['Negar', 'Negar', 'Negar', ..., 'Negar', 'Negar', 'Negar'],
      dtype=object)

In [46]:
# k = 3: build the classifier and fit it on the training data in one step
# (fit returns the fitted estimator itself)
knn_classifier = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)

# Predict on the same rows the model was fitted on
y_pred = knn_classifier.predict(x_train)



array(['Negar', 'Negar', 'Negar', ..., 'Negar', 'Negar', 'Negar'],
      dtype=object)