# 1NN против RandomForest

In [1]:
from sklearn import datasets
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import numpy as np

### Загрузим dataset

In [2]:
dataset = datasets.load_digits()

In [3]:
X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.25, shuffle=False)

## Сделаем через самопальный 1NN

In [4]:
def e2metric(x, y):
    return np.sqrt(np.dot((x-y), (x-y)))

In [5]:
e2metric(X_train[0], X_train[1])

59.556695677312391

In [6]:
class My1NN:
        
    def _get_neighbour_distances(self, x, all_x=None):
        if all_x is None:
            all_x = self.all_x
        f = lambda dot: e2metric(dot, x)
        distances = np.apply_along_axis(f, 1, all_x)
        return distances
    
    def _find_nearest_neighbour(self, x, all_x=None):
        if all_x is None:
            all_x = self.all_x
        distances = self._get_neighbour_distances(x, all_x)
        min_d_index = np.argmin(distances)
        return min_d_index, distances[min_d_index]
    
    def fit(self, X, y):
        self.all_x = X
        self.all_y = y
        
    def predict(self, X):
        f = lambda x: self._find_nearest_neighbour(x)[0]
        indexes = np.apply_along_axis(f, 1, X)
        return self.all_y[indexes]

In [7]:
classifiler = My1NN()

In [8]:
classifiler.fit(X_train, y_train)

In [9]:
%time predictions = classifiler.predict(X_test)

CPU times: user 7.91 s, sys: 20 ms, total: 7.93 s
Wall time: 7.91 s


In [10]:
print("Error percent in 1NN is: {}".
      format(np.argwhere(predictions != y_test).size / y_test.size))

Error percent in 1NN is: 0.03777777777777778


## Попробуем сделать через RandomForest

In [11]:
rf_classifiler = RandomForestClassifier(n_estimators=1000, n_jobs=4)

In [12]:
%time rf_classifiler.fit(X_train, y_train)

CPU times: user 4.42 s, sys: 208 ms, total: 4.62 s
Wall time: 2.45 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=4,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [13]:
rf_predictions = rf_classifiler.predict(X_test)

In [14]:
print("Error percent in RandomForest is: {}".
      format(np.argwhere(rf_predictions != y_test).size / y_test.size))

Error percent in RandomForest is: 0.06222222222222222


Ого, да самописный 1NN работает лучше, чем RandomForest. Ребята с Coursera говорят, что такое _иногда_ случается.