# Метод k-ближайших соседей

#### Настройка среды

- Создаем изолированное окружение: `python -m venv venv`
- Активируем: (unix) `. venv/bin/activate` или (win) `. venv/Scripts/activate`
- Устанавливаем зависимости: `pip install -r practicum_8/requirements.txt`

In [1]:
import random
import warnings

import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Seed value used for the global RNGs below and as `random_state` for the
# train/test splits later in the notebook.
SEED = 42
# Fraction of each dataset held out as the test split.
TEST_SPLIT = 0.2

# Seed both global generators (NumPy's and the stdlib's) with the same value.
np.random.seed(SEED)
random.seed(SEED)

## Реализуем простейшую версию kNN для задач классификации

In [2]:
class KNNClassifier:
    """Stub for a minimal k-nearest-neighbours classifier.

    The class currently defines no attributes or methods; the implementation
    is still to be written.
    """

## Эксперименты

Протестируем реализованный нами `KNNClassifier` и его стандартную реализацию из scikit-learn `sklearn.neighbors.KNeighborsClassifier` на датасетах [WDBC](https://www.kaggle.com/datasets/mohaiminul101/wisconsin-diagnostic-breast-cancer-wdbc) и [Mushrooms](https://www.kaggle.com/datasets/uciml/mushroom-classification).

In [3]:
# Read the project configuration (dataset paths are looked up in it below).
# The encoding is passed explicitly: without it `open` falls back to the
# platform's locale encoding, which is not necessarily UTF-8 (e.g. on
# Windows) and could mis-decode non-ASCII values in the YAML file.
with open("../config.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

#### Подготовка WDBC

In [4]:
# Load the WDBC table from the CSV path stored in the config.
df_wdbc = pd.read_csv(cfg["classification"]["wdbc"])
target_col_wdbc = "diagnosis"

# Remove columns that must not be used as features
# ("Unnamed: 32" is presumably an empty artifact column of the CSV - verify).
df_wdbc = df_wdbc.drop(columns=["id", "Unnamed: 32"])

# Encode the target as integers: "B" -> 0, "M" -> 1.
# `map` + an explicit cast is used instead of `replace`, whose implicit
# object -> int downcasting is deprecated in recent pandas; without the
# downcast the target would stay `object` dtype. Any label other than
# "B"/"M" becomes NaN and makes the cast raise, instead of slipping
# through unencoded.
df_wdbc[target_col_wdbc] = (
    df_wdbc[target_col_wdbc].map({"B": 0, "M": 1}).astype("int64")
)

# Every remaining column except the target is a feature.
feature_cols_wdbc = [col for col in df_wdbc.columns if col != target_col_wdbc]

# The target is kept as a single-column DataFrame; features are the rest.
y_wdbc = df_wdbc[[target_col_wdbc]]
df_wdbc = df_wdbc[feature_cols_wdbc]

# Reproducible train/test split.
df_wdbc_train, df_wdbc_test, y_wdbc_train, y_wdbc_test = train_test_split(
    df_wdbc, y_wdbc, test_size=TEST_SPLIT, random_state=SEED
)

#### Подготовка Mushrooms

In [5]:
# Load the Mushrooms table from the CSV path stored in the config.
df_mushrooms = pd.read_csv(cfg["classification"]["mushrooms"])
target_col_mushrooms = "class"

# Every column except the target is a feature.
feature_cols_mushrooms = list(df_mushrooms.columns)
feature_cols_mushrooms.remove(target_col_mushrooms)

y_mushrooms = df_mushrooms[[target_col_mushrooms]]
df_mushrooms = df_mushrooms[feature_cols_mushrooms]

# Reproducible train/test split.
(
    df_mushrooms_train,
    df_mushrooms_test,
    y_mushrooms_train,
    y_mushrooms_test,
) = train_test_split(df_mushrooms, y_mushrooms, test_size=TEST_SPLIT, random_state=SEED)

# The feature encoder is fitted on the training split only and then applied
# unchanged to the test split.
count_encoder = ce.CountEncoder()
df_mushrooms_train = count_encoder.fit_transform(df_mushrooms_train)
df_mushrooms_test = count_encoder.transform(df_mushrooms_test)

# Same rule for the target: fit the label mapping on train, and only
# `transform` the test labels so both splits share one mapping (re-fitting on
# the test split could silently produce a different label -> integer
# assignment). `transform` raises ValueError if the test split contains a
# label that was not seen during fitting.
# The single-column targets are flattened to 1-D, which is the shape
# LabelEncoder expects, so no warning suppression is needed.
label_encoder = LabelEncoder()
y_mushrooms_train = label_encoder.fit_transform(y_mushrooms_train.values.ravel())
y_mushrooms_test = label_encoder.transform(y_mushrooms_test.values.ravel())

#### Визуализация границ разделения классификаторов

In [6]:
# TODO