In [366]:
import pandas as pd
import numpy as np

In [367]:
df = pd.read_csv("../wines_dataframe.csv", encoding="cp1252")
df.head()

Unnamed: 0,name,price,rating,country_0,country_1,country_2,region_0,region_1,region_2,region_3,...,grape_1,grape_2,grape_3,manufacturer_0,manufacturer_1,manufacturer_2,manufacturer_3,manufacturer_4,strength,volume
0,"Vino Tracer Riesling, Weinkellerei Hechtsheim,...",-0.229692,3,0,0,1,0,0,0,1,...,0,0,1,0,0,0,0,1,-1.133567,-0.121366
1,"Vino Lighea, Donnafugata, 2021",-0.180184,4,0,0,2,0,0,0,2,...,0,0,2,0,0,0,0,2,-0.695962,-0.121366
2,"Vino Chenin Blanc, David & Nadia, 2022",-0.155955,1,0,0,3,0,0,0,3,...,0,0,3,0,0,0,0,3,-0.695962,-0.121366
3,"Vino Pinot Noir Alpine Vineyard, Rhys Vineyard...",0.223636,5,0,1,0,0,0,1,0,...,0,1,0,0,0,0,1,0,-0.345878,-0.121366
4,"Vino Grain de Gris, Listel, 2022",-0.228602,2,0,1,1,0,0,1,1,...,0,1,1,0,0,0,1,1,-1.133567,-0.121366


### Можно заметить, что количество признаков уменьшилось

Это из-за того, что я изменил применение __One-Hot-Encoding__ на `BaseNEncoder`, ибо 723 признака слишком сильно влияли на перформанс

In [368]:
from abc import ABC, abstractmethod

In [369]:
class Kernel(ABC):

    @abstractmethod
    def compute(self, x: float):
        pass


class UniformKernel(Kernel):
    def compute(self, x: float):
        return 0.5 * (abs(x) < 1)


class GaussianKernel(Kernel):
    def compute(self, x: float):
        return np.exp(-0.5 * (abs(x) < 1))


class EpanechnikovKernel(Kernel):
    def compute(self, x: float):
        return 0.75 * (1 - x ** 2) * (abs(x) < 1)


class TriangularKernel(Kernel):
    def compute(self, x: float):
        return (1 - abs(x)) * (abs(x) < 1)


class CommonKernel(Kernel):
    def __init__(self, a: float, b: float):
        self.a = a
        self.b = b

    def compute(self, x: float):
        return (1 - abs(x) ** self.a) ** self.b * (abs(x) < 1)

In [370]:
columns = df.columns
for column in columns:
    try:
        df[column] = df[column].astype(float)
    except ValueError:
        pass

df.head()

Unnamed: 0,name,price,rating,country_0,country_1,country_2,region_0,region_1,region_2,region_3,...,grape_1,grape_2,grape_3,manufacturer_0,manufacturer_1,manufacturer_2,manufacturer_3,manufacturer_4,strength,volume
0,"Vino Tracer Riesling, Weinkellerei Hechtsheim,...",-0.229692,3.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-1.133567,-0.121366
1,"Vino Lighea, Donnafugata, 2021",-0.180184,4.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,-0.695962,-0.121366
2,"Vino Chenin Blanc, David & Nadia, 2022",-0.155955,1.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,3.0,-0.695962,-0.121366
3,"Vino Pinot Noir Alpine Vineyard, Rhys Vineyard...",0.223636,5.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.345878,-0.121366
4,"Vino Grain de Gris, Listel, 2022",-0.228602,2.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,-1.133567,-0.121366


## Перейдем к метрикам

In [371]:
def minkowski(X: list[float], Y: list[float], p: float = 2) -> float:
    sm = 0.0
    for (x, y) in zip(X, Y):
        sm += abs(x - y) ** p
    return sm ** (1 / p)


def cosine(X: list[float], Y: list[float]) -> float:
    mul = np.dot(X, Y)
    l1 = np.linalg.norm(X)
    l2 = np.linalg.norm(Y)
    return mul / (l1 * l2)


def chebyshev(X: list[float], Y: list[float]) -> float:
    return max(abs(x - y) for (x, y) in zip(X, Y))


def euclidian(X: list[float], Y: list[float]) -> float:
    return minkowski(X, Y, 2)


def manhattan(X: list[float], Y: list[float]) -> float:
    return minkowski(X, Y, 1)


metrics = [
    minkowski,
    cosine,
    euclidian,
    manhattan
]

## Теперь разобъем датасет на train и test

In [372]:
df.columns

Index(['name', 'price', 'rating', 'country_0', 'country_1', 'country_2',
       'region_0', 'region_1', 'region_2', 'region_3', 'sweetness_0',
       'sweetness_1', 'grape_0', 'grape_1', 'grape_2', 'grape_3',
       'manufacturer_0', 'manufacturer_1', 'manufacturer_2', 'manufacturer_3',
       'manufacturer_4', 'strength', 'volume'],
      dtype='object')

In [373]:
TARGET = "rating"

In [374]:
def train_test_split(dataframe: pd.DataFrame, test_size: float = 0.2) -> tuple:
    test = dataframe.sample(frac=test_size)
    train = dataframe.drop(test.index)
    return train, test

In [375]:
train, test = train_test_split(df.drop(["name"], axis=1))

In [376]:
train.head()

Unnamed: 0,price,rating,country_0,country_1,country_2,region_0,region_1,region_2,region_3,sweetness_0,...,grape_1,grape_2,grape_3,manufacturer_0,manufacturer_1,manufacturer_2,manufacturer_3,manufacturer_4,strength,volume
0,-0.229692,3.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-1.133567,-0.121366
1,-0.180184,4.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,-0.695962,-0.121366
2,-0.155955,1.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,3.0,-0.695962,-0.121366
3,0.223636,5.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.345878,-0.121366
4,-0.228602,2.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,-1.133567,-0.121366


In [377]:
test.head()

Unnamed: 0,price,rating,country_0,country_1,country_2,region_0,region_1,region_2,region_3,sweetness_0,...,grape_1,grape_2,grape_3,manufacturer_0,manufacturer_1,manufacturer_2,manufacturer_3,manufacturer_4,strength,volume
1175,-0.159993,5.0,0.0,1.0,1.0,0.0,0.0,1.0,2.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,-0.695962,-0.121366
2561,0.223636,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,3.0,3.0,0.0,0.616852,-0.121366
2443,0.021726,4.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,3.0,2.0,0.0,1.0,1.0,3.0,2.0,0.179247,-0.121366
8,-0.005195,5.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,-0.258357,-0.121366
840,-0.09269,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,...,0.0,2.0,1.0,0.0,0.0,2.0,0.0,3.0,-0.258357,-0.121366


### Кажется норм

In [378]:
from typing import Callable
from sklearn.neighbors import NearestNeighbors


class KNNClassifier:
    def __init__(self, k: int, radius: float, kernel: Kernel,
                 metric: Callable[[list[float], list[float]], float] = euclidian, mode: str = "fixed",
                 algorithm: str = "ball_tree"):
        """
        KNNClassifier class
        
        :param k: amount of neighbors to compute. While computing kernel distance, dist to k + 1st neighbor will be used
        :param radius: radius of a fixed window where neighbors are computed
        :param kernel: kernel function to compute weights
        :param metric: metric for distance computation
        :param mode: fixed window or non-fixed window
        :param algorithm: algorithm to use for neighbors finding (ball tree or K-D tree)
        """
        self.neighbors = k + 1
        self.radius = radius
        self.metric = metric
        self.kernel = kernel

        if mode not in ["fixed", "non-fixed"]:
            raise ValueError("Mode must be either fixed or non-fixed")
        self.mode = mode
        self.algorithm = algorithm if algorithm in ["ball_tree", "kd_tree"] else "auto"
        self.tree = None
        self.classes = None

    def __get_classes(self, distances: np.array, indices: np.array, divider: float) -> dict:
        classes = {}
        for ind, dist in zip(indices, distances):
            multiplier = self.kernel.compute(divider)
            target_class = self.classes.iloc[ind]
            if target_class not in classes:
                classes[target_class] = 0
            classes[target_class] += multiplier
        return classes

    def fit(self, X: pd.DataFrame, y: pd.Series):
        if self.algorithm == "kd_tree":
            self.tree = NearestNeighbors(n_neighbors=self.neighbors
                                         , radius=self.radius
                                         , algorithm=self.algorithm
                                         , metric=self.metric.__name__).fit(X)
        else:
            self.tree = NearestNeighbors(n_neighbors=self.neighbors
                                         , radius=self.radius
                                         , algorithm=self.algorithm
                                         , metric=self.metric).fit(X)
        self.classes = y

    def predict(self, point: list[float]) -> float:
        if self.tree is None:
            raise RuntimeError("KNNClassifier has not been fit yet.")

        if self.mode == "fixed":
            rng = self.tree.radius_neighbors(point, return_distance=True, sort_results=True)
            distances = np.asarray(rng[0][0])
            indices = np.asarray(rng[1][0])
            classes = self.__get_classes(distances, indices, self.radius)
        else:
            rng = self.tree.kneighbors(point, return_distance=True)
            distances = np.asarray(rng[0][0])
            indices = np.asarray(rng[1][0])
            classes = self.__get_classes(distances, indices, max(distances))

        mx = 0
        ans = 0
        for key in classes:
            if classes[key] > mx:
                mx = classes[key]
                ans = key
        return ans

### Осталось собственно попробовать что-нибудь предсказать

In [379]:
train.head()

Unnamed: 0,price,rating,country_0,country_1,country_2,region_0,region_1,region_2,region_3,sweetness_0,...,grape_1,grape_2,grape_3,manufacturer_0,manufacturer_1,manufacturer_2,manufacturer_3,manufacturer_4,strength,volume
0,-0.229692,3.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-1.133567,-0.121366
1,-0.180184,4.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,-0.695962,-0.121366
2,-0.155955,1.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,3.0,-0.695962,-0.121366
3,0.223636,5.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.345878,-0.121366
4,-0.228602,2.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,-1.133567,-0.121366


In [380]:
test.head()

Unnamed: 0,price,rating,country_0,country_1,country_2,region_0,region_1,region_2,region_3,sweetness_0,...,grape_1,grape_2,grape_3,manufacturer_0,manufacturer_1,manufacturer_2,manufacturer_3,manufacturer_4,strength,volume
1175,-0.159993,5.0,0.0,1.0,1.0,0.0,0.0,1.0,2.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,-0.695962,-0.121366
2561,0.223636,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,3.0,3.0,0.0,0.616852,-0.121366
2443,0.021726,4.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,3.0,2.0,0.0,1.0,1.0,3.0,2.0,0.179247,-0.121366
8,-0.005195,5.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,-0.258357,-0.121366
840,-0.09269,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,...,0.0,2.0,1.0,0.0,0.0,2.0,0.0,3.0,-0.258357,-0.121366


In [381]:
y_train = train[TARGET]
small_y = y_train[:3]
small_y

0    3.0
1    4.0
2    1.0
Name: rating, dtype: float64

In [382]:
train["rating"].iloc[:3]

0    3.0
1    4.0
2    1.0
Name: rating, dtype: float64

In [383]:
train.iloc[2220:2221]

Unnamed: 0,price,rating,country_0,country_1,country_2,region_0,region_1,region_2,region_3,sweetness_0,...,grape_1,grape_2,grape_3,manufacturer_0,manufacturer_1,manufacturer_2,manufacturer_3,manufacturer_4,strength,volume
2775,-0.059038,1.0,0.0,2.0,0.0,0.0,0.0,3.0,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,3.0,0.0,0.0,-0.258357,-0.121366


In [384]:
X_train = train.drop(TARGET, axis=1)
y_train = train[TARGET]
y_train.iloc[0]

3.0

In [385]:
knn = KNNClassifier(k=10, radius=10, kernel=GaussianKernel(), metric=euclidian, algorithm="ball_tree")
# knn.predict(train.iloc[:1])

In [386]:
knn.fit(X_train, y_train)
knn.predict(X_train.iloc[:1])

1.0

In [387]:
knn.predict(X_train.iloc[1:2])

1.0

### Ого, оно даже работает
### Теперь давайте нормально попредсказываем и посмотрим, насколько хорошо (или плохо) оно работает

In [389]:
from sklearn.metrics import f1_score

### Попробуем заняться подбором гиперпараметров

In [390]:
from typing import Optional


class GridResult:
    def __init__(self):
        self.best_micro_fscore = None
        self.best_macro_fscore = None
        
        self.best_micro_estimator = None
        self.best_macro_estimator = None
        
        self.best_micro_params = {}
        self.best_macro_params = {}
        
    def update_micro(self, estimator: KNNClassifier, score: float, params: dict):
        if self.best_micro_fscore is None or self.best_micro_fscore < score:
            self.best_micro_fscore = score
            self.best_micro_estimator = estimator
            self.best_micro_params = params
    
    def update_macro(self, estimator: KNNClassifier, score: float, params: dict):
        if self.best_macro_fscore is None or self.best_macro_fscore < score:
            self.best_macro_fscore = score
            self.best_macro_estimator = estimator
            self.best_macro_params = params


def grid_search(X: pd.DataFrame, y: pd.Series, params: dict) -> GridResult:
    """
    :param X: Data used to fit KNNClassifier
    :param y: Data target values used to fit KNNClassifier
    :param params: dictionary of parameters that will be used for grid search
                   Every entry must look like "param": list[possible_variations]
    :return: KNNClassifier object with best score
    """
    
    result = GridResult()

    def get_k() -> Optional[list[int]]:
        return params["k"] if "k" in params else None

    def get_radius() -> Optional[list[float]]:
        return params["radius"] if "radius" in params else None

    def get_metric() -> Optional[list[Callable[[list[float], list[float]], float]]]:
        return params["metric"] if "metric" in params else None

    def get_mode() -> Optional[list[str]]:
        return params["mode"] if "mode" in params else None

    def get_algorithm() -> Optional[list[str]]:
        return params["algorithm"] if "algorithm" in params else None

    def get_kernel() -> Optional[list[Kernel]]:
        return params["kernel"] if "kernel" in params else None

    k = get_k()
    radius = get_radius()
    metric = get_metric()
    mode = get_mode()
    algorithm = get_algorithm()
    kernel = get_kernel()

    for _mode in mode:
        for _k, _radius in zip(k, radius):
            for _metric in metric:
                for _algorithm in algorithm:
                    for _kernel in kernel:
                        knn = KNNClassifier(k=_k
                                            , radius=_radius
                                            , kernel=_kernel
                                            , metric=_metric
                                            , algorithm=_algorithm
                                            , mode=_mode)
                        knn.fit(X, y)
                        predictions = []
                        for idx in range(X.shape[0]):
                            point = X.iloc[idx:idx + 1]
                            predictions.append(knn.predict(point))
                        
                        micro_fscore = f1_score(y.tolist(), predictions, average="micro")
                        macro_fscore = f1_score(y.tolist(), predictions, average="macro")
                        
                        _params = {
                            "k": _k,
                            "radius": _radius,
                            "metric": _metric,
                            "mode": _mode,
                            "algorithm": _algorithm,
                            "kernel": _kernel
                        }
                        
                        result.update_micro(knn, micro_fscore, _params)
                        result.update_macro(knn, macro_fscore, _params)
                            
    return result

In [391]:
params = {
    "k": [3, 5, 10],
    "radius": [0.5, 2, 5, 10, 50],
    "metric": metrics,
    "mode": ["fixed", "non-fixed"],
    "algorithm": ["ball_tree"],
    "kernel": [GaussianKernel(), UniformKernel(), TriangularKernel(), EpanechnikovKernel()]
}

best = grid_search(X_train, y_train, params)
print(f"Best micro score - {best.best_micro_fscore}")
print(f"Best micro params - {best.best_micro_params}", sep='\n')
print("-------------------")
print(f"Best macro score - {best.best_macro_fscore}")
print(f"Best macro params - {best.best_macro_params}", sep='\n')


Best micro score - 0.9149031967582171
Best micro params - {'k': 3, 'radius': 0.5, 'metric': <function manhattan at 0x000002325F3C5CA0>, 'mode': 'fixed', 'algorithm': 'ball_tree', 'kernel': <__main__.GaussianKernel object at 0x000002325F514250>}
-------------------
Best macro score - 0.9115748152779966
Best macro params - {'k': 3, 'radius': 0.5, 'metric': <function manhattan at 0x000002325F3C5CA0>, 'mode': 'fixed', 'algorithm': 'ball_tree', 'kernel': <__main__.GaussianKernel object at 0x000002325F514250>}


#### Этот перебор, к сожалению, работает примерно за 50 минут

In [392]:
knn = best.best_macro_estimator
best_params = {
    'k': 3,
    "radius": 0.5,
    "metric": manhattan,
    "mode": "fixed",
    "algorithm": "ball_tree",
    "kernel": GaussianKernel()
}

### Выше можно увидеть, что получилось достичь довольно неплохого результата - `F-score` равный $\sim$ 0.9