In [None]:
#http://scikit-learn.org/stable/modules/neighbors.html
#http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
#https://kevinzakka.github.io/2016/07/13/k-nearest-neighbor/

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets

In [4]:
from sklearn.neighbors import KNeighborsClassifier

In [5]:
iris_aux = datasets.load_iris()
iris = pd.DataFrame(iris_aux.data, columns=iris_aux.feature_names)
iris['target'] = pd.Series(iris_aux.target)
iris['species'] = pd.Categorical.from_codes(iris_aux.target, iris_aux.target_names)
iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,species
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X = iris.iloc[:,:4]
X.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [8]:
y = iris.iloc[:,4]
y.head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int32

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [10]:
knn = KNeighborsClassifier(n_neighbors=3, weights='uniform', algorithm='auto', leaf_size=30, 
                           p=2, metric='minkowski', metric_params=None, n_jobs=1)
# p = 2 e metric = 'minkowski' = distancia euclidiana (?padronizada)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [11]:
pred = knn.predict(X_test)

In [12]:
# evaluate accuracy
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, pred))

0.98


### CROSS VALIDATION

In [13]:
from sklearn.model_selection import GridSearchCV

### Importanto métricas de validação de CV

In [14]:
from sklearn.metrics import make_scorer, f1_score, fbeta_score, roc_auc_score, auc, roc_curve, precision_score, recall_score, classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold, KFold, LeaveOneOut

In [15]:
metrics = [f1_score,  precision_score , recall_score,  accuracy_score] #roc_auc_score]

cv_kfold =KFold(n_splits=3, shuffle=False, random_state=None)


In [16]:
#GridSearchCV(estimator, param_grid, scoring=None, fit_params=None, n_jobs=1, 
#             iid=True, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', 
#             error_score='raise', return_train_score='warn')

clf = KNeighborsClassifier()
#parametros do KNN
#KNeighborsClassifier(n_neighbors=3, weights='uniform', algorithm='auto', leaf_size=30, 
#                           p=2, metric='minkowski', metric_params=None, n_jobs=1)
parameters = {
    'weights' : ['uniform', 'distance'],
    'n_neighbors': [1,2,3,4,5,6] ,
    'algorithm': ['auto']
}

model = GridSearchCV(estimator = clf, 
                     param_grid = parameters, scoring= 'accuracy', fit_params=None, n_jobs=1, iid=True, refit=True, 
                     cv=cv_kfold, verbose=0, pre_dispatch='2*n_jobs', error_score='raise', return_train_score= True)

#scoring: forma de avaliar os parametros, o melhor pro modelo. se não passar, usa o default do modelo.
# pode receber: http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
model

GridSearchCV(cv=KFold(n_splits=3, random_state=None, shuffle=False),
       error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'weights': ['uniform', 'distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6], 'algorithm': ['auto']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [17]:
# X_train.values : transforma DF em matriz
#grid_fit = model.fit(X_train.values,y_train.values)
knn_fit = model.fit(X_train,y_train)


In [18]:
# melhor socore:
knn_fit.best_score_

0.95

In [19]:
#melhores parametros
knn_fit.best_params_

{'algorithm': 'auto', 'n_neighbors': 1, 'weights': 'uniform'}

In [20]:
pd.DataFrame(knn_fit.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.00233,0.0004725673,0.000996,0.0008121578,auto,1,uniform,"{'algorithm': 'auto', 'n_neighbors': 1, 'weigh...",0.970588,0.909091,0.969697,0.95,0.028713,1,1.0,1.0,1.0,1.0,0.0
1,0.001332,0.0004705274,0.000999,3.371748e-07,auto,1,distance,"{'algorithm': 'auto', 'n_neighbors': 1, 'weigh...",0.970588,0.909091,0.969697,0.95,0.028713,1,1.0,1.0,1.0,1.0,0.0
2,0.001338,0.0004785635,0.00033,0.0004672119,auto,2,uniform,"{'algorithm': 'auto', 'n_neighbors': 2, 'weigh...",0.941176,0.939394,0.969697,0.95,0.013843,1,0.939394,0.985075,0.955224,0.959897,0.01894
3,0.001,4.052337e-07,0.000333,0.0004709208,auto,2,distance,"{'algorithm': 'auto', 'n_neighbors': 2, 'weigh...",0.970588,0.909091,0.969697,0.95,0.028713,1,1.0,1.0,1.0,1.0,0.0
4,0.000666,0.000471258,0.000333,0.0004710332,auto,3,uniform,"{'algorithm': 'auto', 'n_neighbors': 3, 'weigh...",0.970588,0.878788,0.969697,0.94,0.042961,7,0.924242,0.985075,0.955224,0.954847,0.024836
5,0.0,0.0,0.0,0.0,auto,3,distance,"{'algorithm': 'auto', 'n_neighbors': 3, 'weigh...",1.0,0.878788,0.969697,0.95,0.051493,1,1.0,1.0,1.0,1.0,0.0
6,0.0,0.0,0.005208,0.007365134,auto,4,uniform,"{'algorithm': 'auto', 'n_neighbors': 4, 'weigh...",0.941176,0.909091,0.939394,0.93,0.014692,10,0.939394,0.985075,0.955224,0.959897,0.01894
7,0.0,0.0,0.0,0.0,auto,4,distance,"{'algorithm': 'auto', 'n_neighbors': 4, 'weigh...",0.941176,0.878788,0.969697,0.93,0.037789,10,1.0,1.0,1.0,1.0,0.0
8,0.005214,0.00737435,0.0,0.0,auto,5,uniform,"{'algorithm': 'auto', 'n_neighbors': 5, 'weigh...",0.941176,0.909091,0.969697,0.94,0.024633,7,0.954545,0.970149,0.970149,0.964948,0.007356
9,0.0,0.0,0.0,0.0,auto,5,distance,"{'algorithm': 'auto', 'n_neighbors': 5, 'weigh...",0.970588,0.909091,0.969697,0.95,0.028713,1,1.0,1.0,1.0,1.0,0.0


In [21]:
kfold_predict = knn_fit.predict(X_test)
kfold_predict

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0,
       0, 1, 1, 2, 1, 2])

In [22]:
y_test.values

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0,
       0, 1, 2, 2, 1, 2])

In [23]:
pd.DataFrame(confusion_matrix(y_test, kfold_predict, labels = None, sample_weight=None))
#linhas é real e colunas é predito

Unnamed: 0,0,1,2
0,19,0,0
1,0,15,0
2,0,1,15


In [24]:
accuracy_score(y_test, kfold_predict)

0.98

In [25]:
from sklearn import metrics
print(metrics.classification_report(y_test, kfold_predict))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        19
          1       0.94      1.00      0.97        15
          2       1.00      0.94      0.97        16

avg / total       0.98      0.98      0.98        50



#### Leave one out

In [26]:
cv_loo = LeaveOneOut()

In [27]:
clf = KNeighborsClassifier()
#parametros do KNN
#KNeighborsClassifier(n_neighbors=3, weights='uniform', algorithm='auto', leaf_size=30, 
#                           p=2, metric='minkowski', metric_params=None, n_jobs=1)
parameters = {
    'weights' : ['uniform', 'distance'],
    'n_neighbors': [1,2,3,4,5,6] ,
    'algorithm': ['auto']
}

model_loo = GridSearchCV(estimator = clf, 
                     param_grid = parameters, scoring= 'accuracy', fit_params=None, n_jobs=1, iid=True, refit=True, 
                     cv=cv_loo, verbose=0, pre_dispatch='2*n_jobs', error_score='raise', return_train_score= True)

#scoring: forma de avaliar os parametros, o melhor pro modelo. se não passar, usa o default do modelo.
# pode receber: http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
model

GridSearchCV(cv=KFold(n_splits=3, random_state=None, shuffle=False),
       error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'weights': ['uniform', 'distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6], 'algorithm': ['auto']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [28]:
knn_fit_loo = model_loo.fit(X_train,y_train)

In [29]:
knn_fit_loo.best_params_

{'algorithm': 'auto', 'n_neighbors': 4, 'weights': 'distance'}

In [30]:
knn_fit_loo.best_score_

0.95

In [31]:
pd.DataFrame(knn_fit_loo.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,...,split92_train_score,split93_train_score,split94_train_score,split95_train_score,split96_train_score,split97_train_score,split98_train_score,split99_train_score,mean_train_score,std_train_score
0,0.001081,0.0034,0.000764,0.003056,auto,1,uniform,"{'algorithm': 'auto', 'n_neighbors': 1, 'weigh...",1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,0.002023,0.004828,0.000473,0.002196,auto,1,distance,"{'algorithm': 'auto', 'n_neighbors': 1, 'weigh...",1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,0.001101,0.003701,0.000726,0.003056,auto,2,uniform,"{'algorithm': 'auto', 'n_neighbors': 2, 'weigh...",1.0,1.0,...,0.969697,0.969697,0.969697,0.969697,0.979798,0.969697,0.969697,0.969697,0.969899,0.002466
3,0.00088,0.003404,0.000519,0.002669,auto,2,distance,"{'algorithm': 'auto', 'n_neighbors': 2, 'weigh...",1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,0.000896,0.003397,0.000539,0.002666,auto,3,uniform,"{'algorithm': 'auto', 'n_neighbors': 3, 'weigh...",1.0,1.0,...,0.959596,0.959596,0.959596,0.959596,0.969697,0.959596,0.959596,0.959596,0.959899,0.002655
5,0.00135,0.00422,0.000529,0.002665,auto,3,distance,"{'algorithm': 'auto', 'n_neighbors': 3, 'weigh...",1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
6,0.000825,0.003416,0.000655,0.00306,auto,4,uniform,"{'algorithm': 'auto', 'n_neighbors': 4, 'weigh...",1.0,1.0,...,0.949495,0.969697,0.949495,0.949495,0.959596,0.949495,0.949495,0.949495,0.950505,0.004165
7,0.000781,0.003405,0.000937,0.003711,auto,4,distance,"{'algorithm': 'auto', 'n_neighbors': 4, 'weigh...",1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
8,0.000832,0.003407,0.000957,0.003707,auto,5,uniform,"{'algorithm': 'auto', 'n_neighbors': 5, 'weigh...",1.0,1.0,...,0.959596,0.969697,0.959596,0.959596,0.969697,0.959596,0.959596,0.959596,0.960202,0.003446
9,0.000794,0.003066,0.000725,0.003056,auto,5,distance,"{'algorithm': 'auto', 'n_neighbors': 5, 'weigh...",1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [32]:
loo_predict = knn_fit_loo.predict(X_test)
loo_predict

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0,
       0, 1, 1, 2, 1, 2])

In [33]:
pd.DataFrame(confusion_matrix(y_test, loo_predict, labels=None, sample_weight=None))

Unnamed: 0,0,1,2
0,19,0,0
1,0,15,0
2,0,1,15


In [34]:
accuracy_score(y_test, loo_predict)

0.98

In [35]:
from sklearn import metrics
print(metrics.classification_report(y_test, loo_predict))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        19
          1       0.94      1.00      0.97        15
          2       1.00      0.94      0.97        16

avg / total       0.98      0.98      0.98        50

