# Bug Report
This notebook illustrates a bug in the `predict()` method of `neighbors.KNeighborsClassifier` in cuML v0.11.

In [8]:
import cudf, cuml, sklearn
import pandas as pd, numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from cuml.neighbors import NearestNeighbors as cuNearestNeighbors
from cuml.neighbors import KNeighborsClassifier as cuKNeighborsClassifier
from sklearn.neighbors import NearestNeighbors as skKNearestNeighbors
from sklearn.neighbors import KNeighborsClassifier as skKNeighborsClassifier
# Display the cuML and scikit-learn versions this report was produced with.
cuml.__version__, sklearn.__version__

('0.11.0', '0.22.1')

# Load MNIST Data
We use Kaggle's MNIST dataset available here:
https://www.kaggle.com/c/digit-recognizer/data

In [5]:
# Read the Kaggle digit-recognizer training CSV into a cuDF DataFrame
# and preview the first rows.
mnist_csv = '../../../Mnist/train.csv'
train = cudf.read_csv(mnist_csv)
train.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Column 0 holds the digit label; every remaining column is a pixel feature.
features = train.iloc[:, 1:]
labels = train.iloc[:, 0]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# cuML Model

In [28]:
# Build the cuML k-NN classifier with 10 neighbours, then fit it on the training split.
knn_kwargs = dict(n_neighbors=10)
model = cuKNeighborsClassifier(**knn_kwargs)
model.fit(X_train, y_train)

The following accuracy score is too low: kNN easily achieves 96% accuracy on MNIST.

In [11]:
%%time
# Score the fitted cuML classifier on the held-out test split (timed).
model.score(X_test,y_test)

CPU times: user 1.18 s, sys: 76.1 ms, total: 1.26 s
Wall time: 1.31 s


0.8855952620506287

The cause is that the `predict` method does not correctly choose the mode of the neighbors' labels. In the code below, we see that the prediction for `X_test.iloc[7]` should be digit 2, but `predict` incorrectly reports it as digit 8.

In [29]:
%%time
# Get both outputs from the same fitted model so they can be compared:
# `predict` gives the labels under investigation, `predict_proba` the per-class probabilities.
y_hat = model.predict(X_test)
y_hat_p = model.predict_proba(X_test)

CPU times: user 2.22 s, sys: 160 ms, total: 2.38 s
Wall time: 2.41 s


In [18]:
# First ten labels returned by `predict` (column 0 of `y_hat`, converted to an array).
y_hat[0].to_array()[:10]

array([8, 1, 9, 9, 8, 6, 2, 8, 7, 1], dtype=int32)

In [20]:
# Build the preview on a pandas copy rather than adding a column to `y_hat_p`.
# `y_hat_p` must keep only the class-probability columns, because the accuracy
# check later in the notebook takes idxmax over every column of `y_hat_p`;
# an extra integer label column would win that argmax and corrupt the result.
proba_preview = y_hat_p.to_pandas()
# Per-row column label with the highest probability, i.e. the class the
# probabilities themselves point to.
proba_preview['y_hat'] = proba_preview.idxmax(axis=1)
proba_preview[:10]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,y_hat
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,8
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,9
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,9
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,8
5,0.1,0.0,0.0,0.0,0.0,0.1,0.8,0.0,0.0,0.0,6
6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
7,0.0,0.0,0.7,0.0,0.0,0.0,0.0,0.2,0.1,0.0,2
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,7
9,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


If we use `predict_proba()` and find the `argmax` ourselves, we achieve 96% accuracy, as shown below.

In [31]:
# Accuracy computed by hand: take the highest-probability class for every test
# row and compare it with the true labels.
proba_argmax = y_hat_p.to_pandas().idxmax(axis=1)
n_correct = (proba_argmax == y_test.to_array()).sum()
n_correct / y_test.shape[0]

0.9616666666666667

Additionally, we can find the mode ourselves using `NearestNeighbors`.

In [33]:
%%time
model = cuNearestNeighbors(n_neighbors=10)
model.fit(X_train)
distances, indices = model.kneighbors(X_test)

CPU times: user 1.51 s, sys: 100 ms, total: 1.61 s
Wall time: 1.61 s


In [34]:
# For the first ten test rows, print the row number, the labels of its ten
# nearest training neighbours, and the mode of those labels.
for row in range(10):
    neighbor_labels = y_train[indices.iloc[row, :]].to_array()
    majority_label = stats.mode(neighbor_labels)[0]
    print(row, neighbor_labels, majority_label)

0 [8 8 8 8 8 8 8 8 8 8] [8]
1 [1 1 1 1 1 1 1 1 1 1] [1]
2 [9 9 9 9 9 9 9 9 9 9] [9]
3 [9 9 9 9 9 9 9 9 9 9] [9]
4 [8 8 8 8 8 8 8 8 8 8] [8]
5 [6 6 6 6 0 6 5 6 6 6] [6]
6 [2 2 2 2 2 2 2 2 2 2] [2]
7 [2 8 2 2 2 7 2 2 7 2] [2]
8 [7 7 7 7 7 7 7 7 7 7] [7]
9 [1 1 1 1 1 1 1 1 1 1] [1]


# Scikit-learn Model

In [37]:
# scikit-learn is fed pandas objects here, so convert the cuDF training split first.
X_train_pd = X_train.to_pandas()
y_train_pd = y_train.to_pandas()

# Same k as the cuML model; n_jobs=6 is passed for the CPU reference run.
model = skKNeighborsClassifier(n_neighbors=10, n_jobs=6)
model.fit(X_train_pd, y_train_pd)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=6, n_neighbors=10, p=2,
                     weights='uniform')

In [39]:
%%time
# Reference accuracy from scikit-learn on the same held-out split (converted to pandas).
model.score(X_test.to_pandas(),y_test.to_pandas())

CPU times: user 8min 29s, sys: 173 ms, total: 8min 29s
Wall time: 1min 26s


0.9616666666666667

In [40]:
# Only the first ten test rows are needed for the side-by-side comparison.
X_first10 = X_test.iloc[:10].to_pandas()
y_hat = model.predict(X_first10)
y_hat_p = model.predict_proba(X_first10)

CPU times: user 2.59 s, sys: 24.3 ms, total: 2.61 s
Wall time: 1.87 s


In [41]:
# scikit-learn's predicted labels for the first ten test rows, for comparison with cuML's.
y_hat[:10]

array([8, 1, 9, 9, 8, 6, 2, 2, 7, 1])

In [46]:
# Append the argmax class as an extra trailing column, bound to a new name.
# Reassigning `y_hat_p` to a widened copy of itself would add one more column
# each time this cell is re-run.
argmax_column = y_hat_p.argmax(axis=1).reshape((-1, 1))
proba_with_argmax = np.concatenate((y_hat_p, argmax_column), axis=1)
proba_with_argmax[:10]

array([[0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 1. , 0. , 8. ],
       [0. , 1. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 1. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 1. , 9. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 1. , 9. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 1. , 0. , 8. ],
       [0.1, 0. , 0. , 0. , 0. , 0.1, 0.8, 0. , 0. , 0. , 6. ],
       [0. , 0. , 1. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 2. ],
       [0. , 0. , 0.7, 0. , 0. , 0. , 0. , 0.2, 0.1, 0. , 2. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 1. , 0. , 0. , 7. ],
       [0. , 1. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 1. ]])