In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from numpy import random
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
from numpy import matlib
import qgrid

In [2]:
X = np.genfromtxt('Data_PCA/X_features.csv',delimiter=',')
Y = np.genfromtxt('Data_PCA/Y_labels.csv',delimiter=',')

In [3]:
print('Tamaño X: ',X.shape)
print('Tamaño Y: ', Y.shape)

Tamaño X:  (14108, 150)
Tamaño Y:  (14108,)


In [4]:
def error_logistic(Y_pred, Y_true):
    error = 0
    for ye, y in zip(Y_pred, Y_true):
        if ye != y:
            error += 1
    error = float(error)/float(np.size(Y_pred))
    
    return error

In [5]:
def K_vecinos(vecinos):
    Folds = 4
    random.seed(19680801)
    EficienciaTrain = np.zeros(Folds)
    EficienciaVal = np.zeros(Folds)
    Error = np.zeros(Folds)
    skf = StratifiedKFold(n_splits=Folds)
    j = 0
        
    for train, test in skf.split(X, Y):
        Xtrain = X[train,:]
        Ytrain = Y[train]
        Xtest = X[test,:]
        Ytest = Y[test]
        
        #Se normalizan los datos
        media = np.mean(Xtrain)
        desvia = np.std(Xtrain)
        Xtrain = preprocessing.scale(Xtrain)
        Xtest = (Xtest - np.matlib.repmat(media, Xtest.shape[0], 1))/np.matlib.repmat(desvia, Xtest.shape[0], 1)
        
        
        modelo = KNeighborsClassifier(n_neighbors=vecinos)
        modelo.fit(Xtrain,Ytrain)
        
        #Validación con muestras de entrenamiento
        Ytrain_pred = modelo.predict(Xtrain)
        
        #Validación con las muestras de test    
        Yest = modelo.predict(Xtest)
        
        
        Error[j] = error_logistic(Y_pred=Yest,Y_true=Ytest)
        
        #Evaluamos las predicciones del modelo con los datos de test
        EficienciaTrain[j] = np.mean(Ytrain_pred == Ytrain)
        EficienciaVal[j] = np.mean(Yest == Ytest)
        j += 1
        
    print("Modelo entrenado con "+ str(vecinos)+ " vecinos")
    
    return np.mean(EficienciaVal), np.std(EficienciaVal),np.mean(Error),np.std(Error)    

In [6]:
import qgrid

randn = np.random.randn
df_types = pd.DataFrame({
    'Numero de vecinos' : pd.Series([1, 2, 3, 4, 5, 6, 7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39])})
df_types["Eficiencia en validacion"] = ""
df_types["Intervalo de confianza"] = ""
df_types["Error"] = ""
df_types["Std Error"] = ""
df_types.set_index(['Numero de vecinos'], inplace=True)


In [7]:
i = 0
for vecinos in df_types.index:   
    eficiencia_val, ic_val,error, std_error = K_vecinos(vecinos)
    df_types["Eficiencia en validacion"][vecinos]=str(eficiencia_val)
    df_types["Intervalo de confianza"][vecinos] = str(ic_val)
    df_types["Error"][vecinos] = str(error)
    df_types["Std Error"][vecinos] = str(std_error)
    i=i+1

Modelo entrenado con 1 vecinos
Modelo entrenado con 2 vecinos
Modelo entrenado con 3 vecinos
Modelo entrenado con 4 vecinos
Modelo entrenado con 5 vecinos
Modelo entrenado con 6 vecinos
Modelo entrenado con 7 vecinos
Modelo entrenado con 9 vecinos
Modelo entrenado con 11 vecinos
Modelo entrenado con 13 vecinos
Modelo entrenado con 15 vecinos
Modelo entrenado con 17 vecinos
Modelo entrenado con 19 vecinos
Modelo entrenado con 21 vecinos
Modelo entrenado con 23 vecinos
Modelo entrenado con 25 vecinos
Modelo entrenado con 27 vecinos
Modelo entrenado con 29 vecinos
Modelo entrenado con 31 vecinos
Modelo entrenado con 33 vecinos
Modelo entrenado con 35 vecinos
Modelo entrenado con 37 vecinos
Modelo entrenado con 39 vecinos


In [8]:
qgrid_widget = qgrid.show_grid(df_types, show_toolbar=False)
qgrid_widget
qgrid_widget.get_changed_df()

Unnamed: 0_level_0,Eficiencia en validacion,Intervalo de confianza,Error,Std Error
Numero de vecinos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.3240015778480044,0.0045563729214176,0.6759984221519957,0.0045563729214177
2,0.3427136027411531,0.0073770818455088,0.6572863972588469,0.0073770818455088
3,0.3564653503677893,0.0039527758333552,0.6435346496322105,0.0039527758333552
4,0.3590158884502941,0.005124071024259,0.6409841115497059,0.0051240710242591
5,0.3600090773688942,0.0035942401218493,0.6399909226311057,0.0035942401218493
6,0.3695074763692582,0.004769627156748,0.6304925236307417,0.004769627156748
7,0.3700742492118799,0.0040546723333502,0.6299257507881201,0.0040546723333502
9,0.3671691213827183,0.0058901547547093,0.6328308786172817,0.0058901547547093
11,0.3724843099463268,0.0047345820145869,0.6275156900536731,0.0047345820145868
13,0.3819112041678082,0.0046901529439903,0.6180887958321919,0.0046901529439904


In [9]:
df_types

Unnamed: 0_level_0,Eficiencia en validacion,Intervalo de confianza,Error,Std Error
Numero de vecinos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.3240015778480044,0.0045563729214176,0.6759984221519957,0.0045563729214177
2,0.3427136027411531,0.0073770818455088,0.6572863972588469,0.0073770818455088
3,0.3564653503677893,0.0039527758333552,0.6435346496322105,0.0039527758333552
4,0.3590158884502941,0.005124071024259,0.6409841115497059,0.0051240710242591
5,0.3600090773688942,0.0035942401218493,0.6399909226311057,0.0035942401218493
6,0.3695074763692582,0.004769627156748,0.6304925236307417,0.004769627156748
7,0.3700742492118799,0.0040546723333502,0.6299257507881201,0.0040546723333502
9,0.3671691213827183,0.0058901547547093,0.6328308786172817,0.0058901547547093
11,0.3724843099463268,0.0047345820145869,0.6275156900536731,0.0047345820145868
13,0.3819112041678082,0.0046901529439903,0.6180887958321919,0.0046901529439904
