In [69]:
import pandas as pd
import numpy as np
import sklearn
import random
from scipy import stats as st

from sklearn.datasets import make_regression
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [70]:
X, y = make_classification(n_samples=10,n_classes=2, n_features=4, n_informative=2, random_state=42)
X = pd.DataFrame(X)
y = pd.Series(y)

X.columns = [f'col_{col}' for col in X.columns]
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.4)


In [71]:
X
for i in range(3):
    pass
X

Unnamed: 0,col_0,col_1,col_2,col_3
0,-0.925336,-1.140215,-0.838792,1.695858
1,-0.461711,-0.587231,-1.971718,2.03731
2,1.440444,1.777367,1.511576,-2.79776
3,1.547117,1.899693,0.834445,-2.399814
4,0.883943,1.068339,-0.970073,-0.26156
5,1.425444,1.727259,-1.185827,-0.712069
6,-0.580675,-0.720634,-0.960593,1.397206
7,-0.762815,-0.938205,-0.543048,1.284181
8,-1.597318,-1.962874,-0.992251,2.57794
9,-2.38936,-2.895397,1.976862,1.201904


In [72]:
X_test

Unnamed: 0,col_0,col_1,col_2,col_3
6,-0.580675,-0.720634,-0.960593,1.397206
4,0.883943,1.068339,-0.970073,-0.26156
5,1.425444,1.727259,-1.185827,-0.712069
9,-2.38936,-2.895397,1.976862,1.201904


In [73]:
class MyKNNClf_works:
    def __init__(self, k = 3):
        self.k = k
        self.train_size = None
        self.X = None 
        self.y = None
    def fit(self, X:pd.DataFrame, y:pd.Series):
        self.X = X
        self.y = y 
        self.train_size = X.shape
        return self.train_size
        
    def count_min_distans_for_k(self, x):
        distanses = [np.sqrt((np.sum((np.array(x) - self.X.iloc[i])**2))) for i in range(self.X.shape[0])]
        print('distanses', distanses)
        indexes = np.argsort(distanses)[:self.k]
        
        labels = y.iloc[indexes]

        return labels[:self.k]
    
    def predict(self, X_test:pd.DataFrame):
        pred = np.ones(X_test.shape[0], dtype=int)
        for i in range(X_test.shape[0]):

            labels = self.count_min_distans_for_k(X_test.iloc[i])
            predicted_lable = int(labels.mode()[0])

            pred[i] = predicted_lable

        return pred
    
    def predict_proba(self, X_test:pd.DataFrame):
        prabability = np.zeros(X_test.shape[0])
        for  i in range(X_test.shape[0]):
            true_labels = self.count_min_distans_for_k(X_test.iloc[i])
            prabability[i] = true_labels.sum()/self.k
            
        return prabability


In [75]:
class MyKNNClf_test:
    def __init__(self, k:int = 3, metric: str = 'euclidean'):
        self.k = k 
        self.train_size = None
        self.X = None
        self.y = None
        self.metric = metric
        self.metrics = {
            'euclidean': self.euclidean_distances, 
            'chebyshev': self.chebyshev_distances, 
            'manhattan':self.manhattan_distances, 
            'cosine': self.cosine_distances}

    def fit(self, X:pd.DataFrame, y:pd.Series):
        self.X_train = X.copy()
        self.y_train = y.copy()
        self.train_size = X.shape
        
    def euclidean_distances(self, row):
        distanses = np.sqrt(np.sum((self.X_train - row)**2, axis = 1))
        
        return distanses
    
    def chebyshev_distances(self, row):
        return np.max(np.abs(self.X_train.values - row.values))
    
    def manhattan_distances(self, row):
        return np.sum(np.abs(self.X_train.values - row.values))
    def cosine_distances(self, row):
        return np.sum(self.X_train.value - row.values)/(np.sqrt(np.sum((self.X_train.value)**2))*np.sqrt(np.sum((row.values)**2)))

    def _predict(self, row):
        predict_proba = self._predict_proba(row)
        if predict_proba >= 0.5:
            return 1
        else:
            return 0 
    def _predict_proba(self, row):
        distanses = self.metrics[self.metric](row)
        indexes = np.argsort(distanses)[:self.k]
        self.y_train = np.array(self.y_train)
        labels = self.y_train[indexes]
        proba = labels.mean()
        return proba
        
            
    def predict(self, X:pd.DataFrame):
        return [X.apply(self._predict, axis = 1)]
        
    def predict_proba(self, X:pd.DataFrame):
        return [X.apply(self._predict_proba, axis = 1)]
        
        

In [76]:
model_works = MyKNNClf_works()
model_works.fit(X_train,y_train);
out_proba = model_works.predict_proba(X_test)
print('out_proba', out_proba)
out = model_works.predict(X_test);
print(out.sum)
out


distanses [5.833891232833391, 5.388239509222738, 0.6315598275343339, 1.992952196624828, 0.517329599062704, 1.2099819946345562]
distanses [3.6610527942327153, 2.99321923836731, 3.46409056706314, 4.838198373018745, 3.0511797558967135, 3.2923843695732504]
distanses [3.410106385989372, 2.6409310782587303, 4.434762544913692, 5.797913570425221, 4.03625483823471, 4.134601448196524]
distanses [7.260573011865205, 7.264035880138701, 3.6600452157293355, 3.493714515360034, 3.5823132406536815, 5.033154872538991]
out_proba [1.         0.66666667 0.33333333 1.        ]
distanses [5.833891232833391, 5.388239509222738, 0.6315598275343339, 1.992952196624828, 0.517329599062704, 1.2099819946345562]
distanses [3.6610527942327153, 2.99321923836731, 3.46409056706314, 4.838198373018745, 3.0511797558967135, 3.2923843695732504]
distanses [3.410106385989372, 2.6409310782587303, 4.434762544913692, 5.797913570425221, 4.03625483823471, 4.134601448196524]
distanses [7.260573011865205, 7.264035880138701, 3.6600452157

array([1, 1, 0, 1])

In [77]:
model_test = MyKNNClf_test()
model_test.fit(X_train, y_train);
out = model_test.predict_proba(X_test);
print('out', out)
out = model_test.predict(X_test);
out


out [6    0.333333
4    0.666667
5    1.000000
9    0.333333
dtype: float64]


[6    0
 4    1
 5    1
 9    0
 dtype: int64]

In [78]:
class MyKNNClf:
    def __init__(self, k:int = 3, weight:str = 'uniform'):
        self.k = k
        self.train_size = None
        self.X_train = None
        self.y_train = None
        self.weight = weight 
        
    def fit(self, X:pd.DataFrame, y:pd.Series):
        self.X_train = X.values
        self.y_train = y.values
        self.train_size = X.shape
        
   
    def count_distance(self, x_test):
        distance = np.linalg.norm(self.X_train - x_test, axis = 1)
        return distance
                       
    def predict(self, X:pd.DataFrame):
        proba = self.predict_proba(X)
        return (proba >= 0.5).astype(int)
        
    def predict_proba(self, X:pd.DataFrame):
        X = X.values
        proba = []
        for x_test in X:
            distances = self.count_distance(x_test)
            labels_index = np.argsort(distances)[:self.k]
            labels = self.y_train[labels_index]
            print('labels',labels)
            proba.append(labels.mean())
        print('proba', proba)
        return np.array(proba)
        

In [79]:
model_test_1 = MyKNNClf()
model_test_1.fit(X_train, y_train);
out = model_test_1.predict_proba(X_test);
print('out', out)
out = model_test_1.predict(X_test);
out


labels [1 0 0]
labels [1 1 0]
labels [1 1 1]
labels [0 1 0]
proba [0.3333333333333333, 0.6666666666666666, 1.0, 0.3333333333333333]
out [0.33333333 0.66666667 1.         0.33333333]
labels [1 0 0]
labels [1 1 0]
labels [1 1 1]
labels [0 1 0]
proba [0.3333333333333333, 0.6666666666666666, 1.0, 0.3333333333333333]


array([0, 1, 1, 0])