**19/12/2021**
<br>
Bruno Mazzilli González

> # Modelo K-nearest neighbours

El modelo **K-Nearest Neighbours** es un modelo fácil de entender y que no suele requerir mucho tiempo de cómputo en comparación con otros modelos de aprendizaje automático. Para hacer una predicción, el algoritmo de K-Nearest Neighbours busca la representación en el dataset que más se 'parece', para de este modo predecir la variable objetivo asociada a esta representación.

### Librerías

In [1]:
import os
import pickle

import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns # plots
from matplotlib import pyplot as plt # plots
from sklearn import metrics
from sklearn.inspection import permutation_importance
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Importamos datos

In [2]:
# Training features and labels are used with the index stored on disk.
X_train = pd.read_parquet('data/x_train.parquet')
Y_train = pd.read_parquet('data/y_train.parquet')

# Test features and labels: the stored index is dropped and replaced by the
# default one, so both frames are re-indexed the same way.
X_test = pd.read_parquet('data/x_test.parquet').reset_index(drop=True)
Y_test = pd.read_parquet('data/y_test.parquet').reset_index(drop=True)

Para utilizar scikit-learn tenemos que convertir la variable objetivo del dataframe de Pandas a un array unidimensional de NumPy; lo hacemos con `np.ravel` al entrenar el modelo.

### Modelo K-nearest Neighbours

In [3]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
%%time

#k neighbours que viene por defecto
knn_model = KNeighborsClassifier(n_neighbors=5)

knn_model.fit(X_train,np.ravel(Y_train))

Y_pred_knn = knn_model.predict(X_test)

<br>

### Matriz de Confusión

In [5]:
def conf_matrix(Y_test, Y_pred):
    """Draw the raw-count confusion matrix as an annotated heatmap.

    Compares the predictions for the target variable against the true
    labels of the test set.

    Args:
        Y_test: True labels of the test set.
        Y_pred: Labels predicted by the model for the same samples.

    Returns:
        None. The heatmap is drawn through seaborn; nothing is returned.
    """
    counts = confusion_matrix(Y_test, Y_pred)
    ax = sns.heatmap(counts, annot=True, cmap='Blues', fmt='g')

    # Shift each y-limit by half a unit.
    # NOTE(review): presumably the workaround for matplotlib releases that
    # clipped the first and last heatmap rows -- confirm it is still needed.
    y_low, y_high = ax.get_ylim()
    ax.set_ylim(y_low + 0.5, y_high - 0.5)

    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    ax.set_title("Confusion matrix, without normalization")

In [None]:
# Raw-count confusion matrix heatmap for the KNN predictions on the test set.
conf_matrix(Y_test, Y_pred_knn)
# Text report produced by scikit-learn's classification_report for the same predictions.
print(classification_report(Y_test, Y_pred_knn))

### Matriz de Confusión Normalizada

In [None]:
def conf_matrix_norm(Y_test, Y_pred):
    """Draw the confusion matrix normalized over the true labels.

    The matrix is computed with ``normalize='true'`` and rendered as an
    annotated seaborn heatmap.

    Args:
        Y_test: True labels of the test set.
        Y_pred: Labels predicted by the model for the same samples.

    Returns:
        None. The heatmap is drawn through seaborn; nothing is returned.
    """
    rates = confusion_matrix(Y_test, Y_pred, normalize='true')
    heatmap_ax = sns.heatmap(rates, annot=True, cmap='Blues', fmt='g')

    # Shift each y-limit by half a unit.
    # NOTE(review): presumably the workaround for matplotlib releases that
    # clipped the first and last heatmap rows -- confirm it is still needed.
    lower, upper = heatmap_ax.get_ylim()
    heatmap_ax.set_ylim(lower + 0.5, upper - 0.5)

    heatmap_ax.set_xlabel('Predicted Label')
    heatmap_ax.set_ylabel('True Label')
    heatmap_ax.set_title("Confusion matrix, with normalization")

In [None]:
# Normalized confusion matrix heatmap for the KNN predictions on the test
# set. Y_pred_knn is the only prediction array this notebook computes.
conf_matrix_norm(Y_test, Y_pred_knn)
# Text report produced by scikit-learn's classification_report for the same predictions.
print(classification_report(Y_test, Y_pred_knn))

### Curva ROC:

In [None]:
# ROC curve of the fitted KNN classifier on the test set.
# RocCurveDisplay.from_estimator is the current scikit-learn entry point;
# the plot_roc_curve helper it replaced is kept as a fallback for older
# installations that do not provide it yet.
try:
    plot_roc = metrics.RocCurveDisplay.from_estimator
except AttributeError:
    plot_roc = metrics.plot_roc_curve
plot_roc(knn_model, X_test, Y_test)                           # ROC curve
plt.show()
print("KNN score train: ", knn_model.score(X_train, Y_train)) # score_train
print("KNN score: ", knn_model.score(X_test, Y_test))         # score_test

### Gain Curve:

In [None]:
# Class probabilities predicted by the KNN model for the test set.
predicted_probabilities = knn_model.predict_proba(X_test)
# Cumulative gain chart built from the true labels and those probabilities.
# NOTE(review): `skplt` is not imported anywhere in the visible notebook;
# presumably `import scikitplot as skplt` is required -- confirm.
skplt.metrics.plot_cumulative_gain(Y_test, predicted_probabilities);

### Feature importance

In [None]:
# KNeighborsClassifier exposes no `feature_importances_` attribute, so the
# ranking is estimated with permutation importance: each feature column of
# the test set is shuffled in turn and the resulting drop in the model's
# score is recorded.
# NOTE: this re-scores the model n_repeats times per feature, which can be
# slow for KNN on large test sets; lower n_repeats if needed.
perm_result = permutation_importance(
    knn_model, X_test, np.ravel(Y_test),
    n_repeats=5, random_state=0, n_jobs=-1,
)
# Signed means are kept on purpose: a negative value means shuffling the
# feature did not hurt the score.
coefs = perm_result.importances_mean
indices = np.argsort(coefs)[::-1]

# Show at most ten features, or fewer when the dataset has fewer columns.
top_n = min(10, len(indices))

plt.figure()
plt.title("Feature importances (KNN, permutation)")
plt.bar(range(top_n), coefs[indices[:top_n]],
       color="r", align="center")
plt.xticks(range(top_n), X_train.columns[indices[:top_n]], rotation=45, ha='right')
plt.subplots_adjust(bottom=0.3)

#### Guardamos el modelo

In [None]:
# Helper used to persist a trained model to disk.
def save_models(filename, model):
    """Serialize ``model`` with pickle and write it to ``filename``.

    Missing parent directories of ``filename`` are created first, so saving
    into a folder such as ``models/`` works on a fresh checkout.

    Args:
        filename: Destination path of the pickle file (anything ``open``
            accepts).
        model: Object to serialize.

    Raises:
        OSError: If the directory or the file cannot be created or written.
        pickle.PicklingError: If ``model`` cannot be pickled.
    """
    # open() also accepts an integer file descriptor, which has no directory.
    if not isinstance(filename, int):
        parent_dir = os.path.dirname(filename)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
    with open(filename, 'wb') as file:
        pickle.dump(model, file)

In [None]:
# Write the fitted KNN classifier to models/knn_model.pkl.
save_models('models/knn_model.pkl', knn_model)