In [None]:
import random
import yaml
import warnings
from typing import Literal

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy.typing import NDArray

from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import category_encoders as ce

# Fraction of every dataset that is held out as the test split.
TEST_SPLIT = 0.2
# Single seed reused for the global RNGs below and for train_test_split.
SEED = 42

# Seed the global generators of both Python's `random` and NumPy.
random.seed(SEED)
np.random.seed(SEED)

## Наивный байесовский классификатор

In [None]:
# Load the notebook configuration; later cells read dataset paths from
# cfg["classification"][...].
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# default text encoding.
with open("../config.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

#### Подготовка WDBC

In [None]:
def load_wdbc(use_standard_scaling=True):
    """Load the WDBC dataset and return a train/test split.

    Reads the CSV located at ``cfg["classification"]["wdbc"]``, drops the
    ``id`` and ``Unnamed: 32`` columns, encodes the ``diagnosis`` target as
    integers (``"B"`` -> 0, ``"M"`` -> 1) and splits the data using
    ``TEST_SPLIT`` and ``SEED``.

    Args:
        use_standard_scaling: When true, the features are standardized with a
            ``StandardScaler`` that is fitted on the training part only and
            then applied to the test part.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)`` of DataFrames; the two
        target frames contain the single ``diagnosis`` column.

    Raises:
        ValueError: If ``diagnosis`` contains a value other than ``"B"`` or
            ``"M"`` (the integer cast of the resulting NaN fails).
    """
    target_col_wdbc = "diagnosis"

    df_wdbc = pd.read_csv(cfg["classification"]["wdbc"])
    df_wdbc = df_wdbc.drop(["id", "Unnamed: 32"], axis=1)

    # Explicit map + cast instead of `replace`: `replace` on an object column
    # relies on a silent downcast to int that newer pandas deprecates, and an
    # unexpected label now fails loudly instead of leaking into the target.
    df_wdbc[target_col_wdbc] = (
        df_wdbc[target_col_wdbc].map({"B": 0, "M": 1}).astype("int64")
    )

    feature_cols_wdbc = [col for col in df_wdbc.columns if col != target_col_wdbc]

    # The target stays a one-column DataFrame (double brackets), as callers
    # flatten it themselves with `.values.reshape(-1)`.
    y_wdbc = df_wdbc[[target_col_wdbc]]
    X_wdbc = df_wdbc[feature_cols_wdbc]

    X_wdbc_train, X_wdbc_test, y_wdbc_train, y_wdbc_test = train_test_split(
        X_wdbc, y_wdbc, test_size=TEST_SPLIT, random_state=SEED
    )

    if use_standard_scaling:
        standard_scaler = StandardScaler()
        # Build fresh frames rather than assigning into the frames returned by
        # train_test_split: writing into those slices is chained assignment
        # and can raise SettingWithCopyWarning.
        X_wdbc_train = pd.DataFrame(
            standard_scaler.fit_transform(X_wdbc_train),
            columns=X_wdbc_train.columns,
            index=X_wdbc_train.index,
        )
        X_wdbc_test = pd.DataFrame(
            standard_scaler.transform(X_wdbc_test),
            columns=X_wdbc_test.columns,
            index=X_wdbc_test.index,
        )
    return X_wdbc_train, X_wdbc_test, y_wdbc_train, y_wdbc_test

In [None]:
# Default arguments: the features are standardized (use_standard_scaling=True).
X_wdbc_train, X_wdbc_test, y_wdbc_train, y_wdbc_test = load_wdbc()

#### Подготовка Mushrooms

In [None]:
def load_mushrooms(
    use_standard_scaling=True,
    encoding: Literal["frequency", "label"] = "frequency"
):
    """Load the Mushrooms dataset and return an encoded train/test split.

    Reads the CSV located at ``cfg["classification"]["mushrooms"]``, splits it
    using ``TEST_SPLIT`` and ``SEED``, encodes the features with an encoder
    fitted on the training part, optionally standardizes them, and
    label-encodes the ``class`` target.

    Args:
        use_standard_scaling: When true, the encoded features are standardized
            with a ``StandardScaler`` fitted on the training part only.
        encoding: Feature encoding to apply. ``"frequency"`` uses
            ``ce.CountEncoder``; ``"label"`` uses ``ce.OrdinalEncoder``.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)`` of DataFrames; the two
        target frames contain the single integer-encoded ``class`` column.

    Raises:
        NotImplementedError: If ``encoding`` is not one of the supported values.
        ValueError: If the test part contains a class label that is absent
            from the training part (raised by ``LabelEncoder.transform``).
    """
    # Pick the encoder first so an unsupported option fails before any I/O.
    if encoding == "frequency":
        feature_encoder = ce.CountEncoder()
    elif encoding == "label":
        feature_encoder = ce.OrdinalEncoder()
    else:
        raise NotImplementedError(f"Unsupported encoding: {encoding!r}")

    target_col_mushrooms = "class"
    df_mushrooms = pd.read_csv(cfg["classification"]["mushrooms"])

    feature_cols_mushrooms = [
        col for col in df_mushrooms.columns if col != target_col_mushrooms
    ]

    y_mushrooms = df_mushrooms[[target_col_mushrooms]]
    X_mushrooms = df_mushrooms[feature_cols_mushrooms]

    (
        X_mushrooms_train,
        X_mushrooms_test,
        y_mushrooms_train,
        y_mushrooms_test,
    ) = train_test_split(X_mushrooms, y_mushrooms, test_size=TEST_SPLIT, random_state=SEED)

    # Fit on the training part only, then reuse the fitted mapping on the test part.
    X_mushrooms_train = feature_encoder.fit_transform(X_mushrooms_train)
    X_mushrooms_test = feature_encoder.transform(X_mushrooms_test)

    if use_standard_scaling:
        standard_scaler = StandardScaler()
        X_mushrooms_train = pd.DataFrame(
            standard_scaler.fit_transform(X_mushrooms_train),
            columns=X_mushrooms_train.columns,
            index=X_mushrooms_train.index,
        )
        X_mushrooms_test = pd.DataFrame(
            standard_scaler.transform(X_mushrooms_test),
            columns=X_mushrooms_test.columns,
            index=X_mushrooms_test.index,
        )

    # Encode the target with ONE mapping learned from the training labels.
    # Re-fitting on the test labels could assign different integers to the
    # same class whenever the two parts do not contain the same label set.
    # A 1-D Series is passed and new frames are built, so neither the
    # column-vector warning nor a chained-assignment warning is triggered and
    # no warning suppression is needed.
    target_encoder = LabelEncoder()
    y_mushrooms_train = pd.DataFrame(
        {
            target_col_mushrooms: target_encoder.fit_transform(
                y_mushrooms_train[target_col_mushrooms]
            )
        },
        index=y_mushrooms_train.index,
    )
    y_mushrooms_test = pd.DataFrame(
        {
            target_col_mushrooms: target_encoder.transform(
                y_mushrooms_test[target_col_mushrooms]
            )
        },
        index=y_mushrooms_test.index,
    )
    return X_mushrooms_train, X_mushrooms_test, y_mushrooms_train, y_mushrooms_test

In [None]:
# Default arguments: "frequency" feature encoding followed by standard scaling.
X_mushrooms_train, X_mushrooms_test, y_mushrooms_train, y_mushrooms_test = load_mushrooms()

Обучим Гауссовский наивный байесовский классификатор на wdbc.

Обратите внимание на вероятности, полученные через `predict_proba`. Наивный Байес оказался сильно увереннее в себе, чем, скажем, логистическая регрессия, так что даже калибровка здесь не сможет нам помочь.

In [None]:
# Fit Gaussian naive Bayes on the WDBC training split; the one-column target
# frame is flattened to a 1-D array for scikit-learn.
nb_wdbc = GaussianNB()
nb_wdbc.fit(X=X_wdbc_train, y=y_wdbc_train.values.ravel())

# One row per test sample: predicted label, probability taken from column 1
# of predict_proba, and the true label.
df_wdbc_results = pd.DataFrame(
    dict(
        pred=nb_wdbc.predict(X_wdbc_test),
        pred_proba=nb_wdbc.predict_proba(X_wdbc_test)[:, 1],
        true=y_wdbc_test.values.ravel(),
    )
)
df_wdbc_results.head(50)

Точность наивного Байеса оказалась такой же, как у логистической регрессии, и выше, чем у KNN

In [None]:
# Score the fitted WDBC model on the held-out test split.
nb_wdbc.score(X=X_wdbc_test, y=y_wdbc_test)

Наконец, обучим модель для датасета с грибами. В данном случае точность получилась ниже, чем у KNN.

In [None]:
# Gaussian naive Bayes on the current mushrooms split; `fit` returns the
# estimator itself, so construction and fitting are chained.
nb_mushrooms = GaussianNB().fit(
    X=X_mushrooms_train,
    y=y_mushrooms_train.values.ravel(),
)
nb_mushrooms.score(X=X_mushrooms_test, y=y_mushrooms_test)

Мы помним, что этот датасет состоит полностью из категориальных переменных. Возможно, использование `CategoricalNB` улучшит ситуацию.

In [None]:
# Reload the mushrooms data with "label" feature encoding and no scaling.
# NOTE: this rebinds the X_/y_mushrooms_* variables and `nb_mushrooms`, so any
# cell that uses those names afterwards sees the label-encoded data and the
# categorical model.
(
    X_mushrooms_train,
    X_mushrooms_test,
    y_mushrooms_train,
    y_mushrooms_test,
) = load_mushrooms(use_standard_scaling=False, encoding="label")

nb_mushrooms = CategoricalNB().fit(
    X=X_mushrooms_train,
    y=y_mushrooms_train.values.ravel(),
)
nb_mushrooms.score(X=X_mushrooms_test, y=y_mushrooms_test)