# Импорты

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
from sklearn import svm
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import classification_report, roc_curve, r2_score, mean_squared_error

# Дерево решений

In [2]:
import numpy as np
import pandas as pd
from statistics import mode
from multiprocessing import Pool
from sklearn.base import BaseEstimator


def entropy(y):
    """Return the Shannon entropy (in bits) of the label distribution of ``y``.

    Class frequencies come from a single ``np.unique`` pass instead of
    re-masking ``y`` once per class, and NaN labels are counted as labels
    rather than turning the result into NaN.

    Args:
        y: 1-D array-like of labels (NumPy array, pandas Series or list).

    Returns:
        ``-sum(p * log2(p))`` over the observed classes; zero for empty or
        single-class input.
    """
    _, counts = np.unique(np.asarray(y), return_counts=True)
    p = counts / counts.sum()
    return -np.dot(p, np.log2(p))


def gini(y):
    """Return the Gini impurity of the label distribution of ``y``.

    Class frequencies come from a single ``np.unique`` pass instead of
    re-masking ``y`` once per class, so NaN labels are counted as labels
    instead of silently contributing a zero probability.

    Args:
        y: 1-D array-like of labels (NumPy array, pandas Series or list).

    Returns:
        ``1 - sum(p ** 2)`` over the observed classes.
    """
    _, counts = np.unique(np.asarray(y), return_counts=True)
    p = counts / counts.sum()
    return 1 - np.dot(p, p)


def variance(y):
    """Return the variance of ``y`` as computed by ``np.var`` with its defaults."""
    spread = np.var(y)
    return spread


def mad_median(y):
    """Return the mean absolute deviation of ``y`` from its median."""
    center = np.median(y)
    deviations = np.abs(y - center)
    return np.mean(deviations)


def regression_leaf(y):
    """Return the leaf prediction for regression: the mean of the targets ``y``."""
    leaf_value = np.mean(y)
    return leaf_value


def classification_leaf(y):
    """Return the leaf prediction for classification: the most common label in ``y``."""
    majority_label = mode(y)
    return majority_label


class Node:
    """A single node of a decision tree.

    Internal nodes carry a split (``feature_idx``, ``threshold``) and two
    children; leaves carry ``labels``.  The prediction code in this file treats
    a node whose ``labels`` is ``None`` as an internal node.
    """

    def __init__(self, feature_idx=0, threshold=0, labels=None, left=None, right=None):
        # Split description.
        self.feature_idx, self.threshold = feature_idx, threshold
        # Target values stored in the node (``None`` for internal nodes).
        self.labels = labels
        # Child nodes.
        self.left, self.right = left, right


class DecisionTree(BaseEstimator):
    """Binary decision tree for classification or regression on pandas data.

    Every internal node splits on ``feature < threshold``.  Candidate
    thresholds are five evenly spaced values between the minimum and maximum
    of the feature inside the node.  Leaves keep the target values that
    reached them; the prediction is computed from those values by the
    function selected with ``leaf_func``.

    Args:
        max_depth: A node at depth ``d`` (the root has depth 1) is split only
            while ``d < max_depth``.
        min_samples_split: Minimum number of samples a node needs to be split.
        min_samples_leaf: Minimum number of samples each child of a split
            must receive.
        criterion: Impurity measure: ``"entropy"``, ``"gini"``,
            ``"variance"`` or ``"mad_median"``.
        leaf_func: ``"classification_leaf"`` or ``"regression_leaf"``.

    Attributes:
        root: Root ``Node`` of the fitted tree (``None`` before ``fit``).
        current_depth: Depth of the deepest leaf created by the last ``fit``.

    Raises:
        KeyError: If ``criterion`` or ``leaf_func`` is not a known name.
    """

    def __init__(self, max_depth=100, min_samples_split=2, min_samples_leaf=1, criterion="entropy",
                 leaf_func="classification_leaf"):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.criterion = criterion
        self.leaf_func = leaf_func
        self._resolve_functions()
        self.root = None
        self.current_depth = 0

    def _resolve_functions(self):
        """Map the ``criterion`` / ``leaf_func`` names to the actual callables."""
        criteria_dict = {
            "variance": variance,
            "mad_median": mad_median,
            "gini": gini,
            "entropy": entropy
        }
        leaf_dict = {
            "regression_leaf": regression_leaf,
            "classification_leaf": classification_leaf
        }
        self._criterion_function = criteria_dict[self.criterion]
        self._leaf_value = leaf_dict[self.leaf_func]

    def _functional(self, x_train: pd.DataFrame, y: pd.Series, feature_idx: int, threshold):
        """Return the impurity decrease of splitting on ``feature < threshold``.

        Returns 0 when either side of the split would hold fewer than
        ``min_samples_leaf`` samples (and always when a side would be empty).
        """
        # A positional NumPy mask keeps the split independent of whether
        # ``x_train`` and ``y`` carry identical index labels.
        mask = (x_train.iloc[:, feature_idx] < threshold).to_numpy()
        n_obj = x_train.shape[0]
        n_left = int(mask.sum())
        n_right = n_obj - n_left
        min_leaf = max(1, self.min_samples_leaf)
        if n_left < min_leaf or n_right < min_leaf:
            return 0
        return (
            self._criterion_function(y)
            - (n_left / n_obj) * self._criterion_function(y.iloc[mask])
            - (n_right / n_obj) * self._criterion_function(y.iloc[~mask])
        )

    def _build_tree(self, x_train: pd.DataFrame, y: pd.Series, depth=1):
        """Recursively grow the subtree for the samples in ``x_train`` / ``y``."""
        n_samples, n_features = x_train.shape
        max_functional = 0
        best_feature_idx = None
        best_threshold = None
        best_mask = None

        may_split = (
            len(np.unique(y)) > 1
            and depth < self.max_depth
            and n_samples >= self.min_samples_split
        )
        if may_split:
            for feature_idx in range(n_features):
                column = x_train.iloc[:, feature_idx]
                threshold_values = np.linspace(np.min(column), np.max(column), 5)
                functional_values = [
                    self._functional(x_train, y, feature_idx, threshold) for threshold in threshold_values
                ]
                best_threshold_idx = np.nanargmax(functional_values)

                # Only a strictly positive improvement can replace the current best.
                if functional_values[best_threshold_idx] > max_functional:
                    max_functional = functional_values[best_threshold_idx]
                    best_threshold = threshold_values[best_threshold_idx]
                    best_feature_idx = feature_idx
                    best_mask = (column < best_threshold).to_numpy()

        if best_mask is None:
            # Leaf: remember how deep the tree actually grew.
            self.current_depth = max(self.current_depth, depth)
            return Node(labels=y)

        return Node(
            feature_idx=best_feature_idx,
            threshold=best_threshold,
            left=self._build_tree(x_train.iloc[best_mask], y.iloc[best_mask], depth + 1),
            right=self._build_tree(x_train.iloc[~best_mask], y.iloc[~best_mask], depth + 1),
        )

    def fit(self, x_train: pd.DataFrame, y: pd.Series):
        """Grow the tree on the training data and return ``self``.

        The ``max_depth`` hyperparameter is left untouched; the depth that was
        actually reached is available as ``current_depth``.
        """
        # Re-resolve so that ``set_params(criterion=...)`` takes effect.
        self._resolve_functions()
        self.current_depth = 0
        self.root = self._build_tree(x_train, y)
        return self

    def _predict_object(self, x: pd.Series):
        """Return the prediction for one sample given as a row Series."""
        node = self.root
        while node.labels is None:
            # ``feature_idx`` is a column position, so index positionally.
            if x.iloc[node.feature_idx] < node.threshold:
                node = node.left
            else:
                node = node.right
        return self._leaf_value(node.labels)

    def predict(self, x_test: pd.DataFrame) -> np.array:
        """Return an array with one prediction per row of ``x_test``."""
        return np.array([self._predict_object(x_test.iloc[i]) for i in range(x_test.shape[0])])


# Случайный лес

In [3]:
import numpy as np
import pandas as pd
from statistics import mode
from sklearn.base import BaseEstimator


def entropy(y):
    """Return the Shannon entropy (in bits) of the label distribution of ``y``.

    Class frequencies come from a single ``np.unique`` pass instead of
    re-masking ``y`` once per class, and NaN labels are counted as labels
    rather than turning the result into NaN.

    Args:
        y: 1-D array-like of labels (NumPy array, pandas Series or list).

    Returns:
        ``-sum(p * log2(p))`` over the observed classes; zero for empty or
        single-class input.
    """
    _, counts = np.unique(np.asarray(y), return_counts=True)
    p = counts / counts.sum()
    return -np.dot(p, np.log2(p))


def gini(y):
    """Return the Gini impurity of the label distribution of ``y``.

    Class frequencies come from a single ``np.unique`` pass instead of
    re-masking ``y`` once per class, so NaN labels are counted as labels
    instead of silently contributing a zero probability.

    Args:
        y: 1-D array-like of labels (NumPy array, pandas Series or list).

    Returns:
        ``1 - sum(p ** 2)`` over the observed classes.
    """
    _, counts = np.unique(np.asarray(y), return_counts=True)
    p = counts / counts.sum()
    return 1 - np.dot(p, p)


def variance(y):
    """Return the variance of ``y`` as computed by ``np.var`` with its defaults."""
    result = np.var(y)
    return result


def mad_median(y):
    """Return the mean absolute deviation of ``y`` from its median."""
    median_value = np.median(y)
    abs_deviation = np.abs(y - median_value)
    return np.mean(abs_deviation)


def regression_leaf(y):
    """Return the leaf prediction for regression: the mean of the targets ``y``."""
    average = np.mean(y)
    return average


def classification_leaf(y):
    """Return the leaf prediction for classification: the most common label in ``y``."""
    winner = mode(y)
    return winner


class Node:
    """A single node of a decision tree.

    Internal nodes carry a split (``feature_idx``, ``threshold``) and two
    children; leaves carry ``labels``.  The prediction code in this file treats
    a node whose ``labels`` is ``None`` as an internal node.
    """

    def __init__(self, feature_idx=0, threshold=0, labels=None, left=None, right=None):
        # Child nodes.
        self.left, self.right = left, right
        # Target values stored in the node (``None`` for internal nodes).
        self.labels = labels
        # Split description.
        self.feature_idx, self.threshold = feature_idx, threshold


class RandomForest(BaseEstimator):
    """Bagging ensemble of decision trees grown on bootstrap samples.

    ``fit`` draws ``N`` bootstrap samples (rows sampled with replacement) and
    grows one tree per sample.  ``predict`` aggregates the per-tree
    predictions with the same ``leaf_func`` that is used inside the leaves.

    NOTE(review): ``_get_subsample`` is provided but never called while the
    trees are grown, so every split considers all features.

    Args:
        max_depth: A node at depth ``d`` (the root has depth 1) is split only
            while ``d < max_depth``.
        min_samples_split: Minimum number of samples a node needs to be split.
        min_samples_leaf: Minimum number of samples each child of a split
            must receive.
        criterion: Impurity measure: ``"entropy"``, ``"gini"``,
            ``"variance"`` or ``"mad_median"``.
        leaf_func: ``"classification_leaf"`` or ``"regression_leaf"``.
        N: Number of trees.

    Attributes:
        bootstrap: ``(data, target)`` bootstrap pairs drawn by the last ``fit``.
        forest: Root ``Node`` of every tree grown by the last ``fit``.

    Raises:
        KeyError: If ``criterion`` or ``leaf_func`` is not a known name.
    """

    def __init__(self, max_depth=np.inf, min_samples_split=2, min_samples_leaf=1, criterion="gini",
                 leaf_func="classification_leaf", N=10):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.criterion = criterion
        self.leaf_func = leaf_func
        self.N = N
        self._resolve_functions()
        self.bootstrap = []
        self.forest = []

    def _resolve_functions(self):
        """Map the ``criterion`` / ``leaf_func`` names to the actual callables."""
        criterion_dict = {
            "variance": variance,
            "mad_median": mad_median,
            "gini": gini,
            "entropy": entropy
        }
        leaf_dict = {
            "regression_leaf": regression_leaf,
            "classification_leaf": classification_leaf
        }
        self._criterion_function = criterion_dict[self.criterion]
        self._leaf_value = leaf_dict[self.leaf_func]

    def _get_bootstrap(self, data: pd.DataFrame, target: pd.Series) -> None:
        """Append ``N`` bootstrap samples of ``(data, target)`` to ``self.bootstrap``."""
        n_samples = data.shape[0]
        for _ in range(self.N):
            sample_index = np.random.randint(0, n_samples, size=n_samples)
            self.bootstrap.append((data.iloc[sample_index], target.iloc[sample_index]))

    def _get_subsample(self, len_sample: int) -> np.ndarray:
        """Return random distinct indices out of ``range(len_sample)``.

        The subsample holds ``sqrt(len_sample)`` indices for classification and
        ``len_sample / 3`` for regression, but at least one when any exist.
        """
        # Compare the configured *name*; ``_leaf_value`` is the callable itself.
        if self.leaf_func == "classification_leaf":
            len_subsample = int(np.sqrt(len_sample))
        else:
            len_subsample = int(np.divide(len_sample, 3))
        len_subsample = min(len_sample, max(1, len_subsample))
        return np.random.choice(a=np.arange(len_sample), size=len_subsample, replace=False)

    def _functional(self, x_train: pd.DataFrame, y: pd.Series, feature_idx: int, threshold):
        """Return the impurity decrease of splitting on ``feature < threshold``.

        Returns 0 when either side of the split would hold fewer than
        ``min_samples_leaf`` samples (and always when a side would be empty).
        """
        # Positional NumPy mask: bootstrap samples contain repeated index labels.
        mask = (x_train.iloc[:, feature_idx] < threshold).to_numpy()
        n_obj = x_train.shape[0]
        n_left = int(mask.sum())
        n_right = n_obj - n_left
        min_leaf = max(1, self.min_samples_leaf)
        if n_left < min_leaf or n_right < min_leaf:
            return 0
        return (
            self._criterion_function(y)
            - (n_left / n_obj) * self._criterion_function(y.iloc[mask])
            - (n_right / n_obj) * self._criterion_function(y.iloc[~mask])
        )

    def _build_tree(self, x_train: pd.DataFrame, y: pd.Series, depth=1):
        """Recursively grow the subtree for the samples in ``x_train`` / ``y``."""
        n_samples, n_features = x_train.shape
        max_functional = 0
        best_feature_idx = None
        best_threshold = None
        best_mask = None

        may_split = (
            len(np.unique(y)) > 1
            and depth < self.max_depth
            and n_samples >= self.min_samples_split
        )
        if may_split:
            for feature_idx in range(n_features):
                column = x_train.iloc[:, feature_idx]
                threshold_values = np.linspace(np.min(column), np.max(column), 5)
                functional_values = [
                    self._functional(x_train, y, feature_idx, threshold) for threshold in threshold_values
                ]
                best_threshold_idx = np.nanargmax(functional_values)

                # Only a strictly positive improvement can replace the current best.
                if functional_values[best_threshold_idx] > max_functional:
                    max_functional = functional_values[best_threshold_idx]
                    best_threshold = threshold_values[best_threshold_idx]
                    best_feature_idx = feature_idx
                    best_mask = (column < best_threshold).to_numpy()

        if best_mask is None:
            return Node(labels=y)

        return Node(
            feature_idx=best_feature_idx,
            threshold=best_threshold,
            left=self._build_tree(x_train.iloc[best_mask], y.iloc[best_mask], depth + 1),
            right=self._build_tree(x_train.iloc[~best_mask], y.iloc[~best_mask], depth + 1),
        )

    def _fit_one_tree(self, x_train: pd.DataFrame, y: pd.Series) -> Node:
        """Grow a single tree and return its root node."""
        return self._build_tree(x_train, y)

    def fit(self, x_train: pd.DataFrame, y: pd.Series):
        """Grow the forest on bootstrap samples of the data and return ``self``.

        State left over from a previous ``fit`` is discarded first, so
        refitting does not accumulate trees.
        """
        self._resolve_functions()
        self.bootstrap = []
        self.forest = []
        self._get_bootstrap(x_train, y)
        for sample_obj, sample_y in self.bootstrap:
            self.forest.append(self._fit_one_tree(sample_obj, sample_y))
        return self

    def _predict_object(self, x: pd.Series, node: Node):
        """Return the prediction of the tree rooted at ``node`` for one sample."""
        while node.labels is None:
            # ``feature_idx`` is a column position, so index positionally.
            if x.iloc[node.feature_idx] < node.threshold:
                node = node.left
            else:
                node = node.right
        return self._leaf_value(node.labels)

    def predict(self, x_test: pd.DataFrame):
        """Return a list with one aggregated prediction per row of ``x_test``."""
        results = []
        for i in range(x_test.shape[0]):
            row = x_test.iloc[i]
            predictions_for_one = [self._predict_object(row, root) for root in self.forest]
            results.append(self._leaf_value(predictions_for_one))
        return results

# Градиентный бустинг

In [4]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.base import BaseEstimator

def entropy(y):
    """Return the Shannon entropy (in bits) of the label distribution of ``y``.

    Class frequencies come from a single ``np.unique`` pass instead of
    re-masking ``y`` once per class, and NaN labels are counted as labels
    rather than turning the result into NaN.

    Args:
        y: 1-D array-like of labels (NumPy array, pandas Series or list).

    Returns:
        ``-sum(p * log2(p))`` over the observed classes; zero for empty or
        single-class input.
    """
    _, counts = np.unique(np.asarray(y), return_counts=True)
    p = counts / counts.sum()
    return -np.dot(p, np.log2(p))


def gini(y):
    """Return the Gini impurity of the label distribution of ``y``.

    Class frequencies come from a single ``np.unique`` pass instead of
    re-masking ``y`` once per class, so NaN labels are counted as labels
    instead of silently contributing a zero probability.

    Args:
        y: 1-D array-like of labels (NumPy array, pandas Series or list).

    Returns:
        ``1 - sum(p ** 2)`` over the observed classes.
    """
    _, counts = np.unique(np.asarray(y), return_counts=True)
    p = counts / counts.sum()
    return 1 - np.dot(p, p)


def variance(y):
    """Return the variance of ``y`` as computed by ``np.var`` with its defaults."""
    value = np.var(y)
    return value


def mad_median(y):
    """Return the mean absolute deviation of ``y`` from its median."""
    offsets = y - np.median(y)
    return np.mean(np.abs(offsets))


def regression_leaf(y):
    """Return the leaf prediction for regression: the mean of the targets ``y``."""
    mean_target = np.mean(y)
    return mean_target


def classification_leaf(y):
    """Return the leaf prediction for classification: the most common label in ``y``.

    ``mode`` is imported locally because this cell does not import it itself
    and would otherwise depend on an earlier cell having done so.
    """
    from statistics import mode

    return mode(y)

class GradientBoosting(BaseEstimator):
    """Gradient boosting over ``DecisionTree`` base learners.

    The model starts from the mean of the training target and repeatedly fits
    a tree to the negative gradient of the loss at the current prediction,
    adding ``learning_rate`` times the tree output to the prediction.

    Args:
        n_estimators: Number of boosting rounds (trees).
        learning_rate: Shrinkage applied to every tree's contribution.
        max_depth: ``max_depth`` passed to every tree.
        min_samples_split: ``min_samples_split`` passed to every tree.
        criterion: ``criterion`` passed to every tree.
        leaf_func: ``leaf_func`` passed to every tree.
        random_state: Stored only; nothing in this class reads it.
        loss_name: ``"mse"`` or ``"rmsle"``.

    Attributes:
        trees_: Trees fitted by the last ``fit`` call, in boosting order.
        init_value_: Constant the prediction starts from (training target mean).

    Raises:
        ValueError: If ``loss_name`` is not one of the supported losses.
    """

    def __init__(self, n_estimators=10, learning_rate=0.01, max_depth=3, min_samples_split=5, criterion="entropy",
                 leaf_func="classification_leaf", random_state=17, loss_name="mse"):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.leaf_func = leaf_func
        self.random_state = random_state
        self.learning_rate = learning_rate
        self.loss_name = loss_name
        self.initialization = self._mean_initialization

        losses = {
            "mse": (self.mse, self.mse_grad),
            "rmsle": (self.rmsle, self.rmsle_grad),
        }
        if loss_name not in losses:
            raise ValueError(f"Unknown loss_name {loss_name!r}; expected one of {sorted(losses)}")
        self.objective, self.objective_grad = losses[loss_name]

        self.trees_ = []
        self.init_value_ = None

    @staticmethod
    def _mean_initialization(y):
        """Return an ``(n, 1)`` column filled with the mean of ``y``."""
        return np.mean(y) * np.ones([y.shape[0], 1])

    @staticmethod
    def mse(y, p):
        """Return the mean squared error between targets ``y`` and predictions ``p``."""
        # Flatten both so an (n,) target and an (n, 1) prediction do not broadcast to (n, n).
        return np.mean((np.asarray(y).reshape(-1) - np.asarray(p).reshape(-1)) ** 2)

    @staticmethod
    def mse_grad(y: np.array, p: np.array):
        """Return the gradient of ``mse`` with respect to ``p``, shaped like ``y``."""
        y = np.asarray(y)
        p = np.asarray(p).reshape(y.shape)
        return 2 * (p - y) / y.shape[0]

    @staticmethod
    def rmsle(y, p):
        """Return the root mean squared logarithmic error between ``y`` and ``p``."""
        y = np.asarray(y).reshape(-1, 1)
        p = np.asarray(p).reshape(-1, 1)
        return np.mean(np.log((p + 1) / (y + 1)) ** 2) ** 0.5

    def rmsle_grad(self, y, p):
        """Return the gradient of ``rmsle`` with respect to ``p`` as an ``(n, 1)`` column."""
        y = np.asarray(y).reshape(-1, 1)
        p = np.asarray(p).reshape(-1, 1)
        loss = self.rmsle(y, p)
        if loss == 0:
            # Perfect fit: the formula below would evaluate to 0 / 0.
            return np.zeros_like(p, dtype=float)
        return 1.0 / (y.shape[0] * (p + 1) * loss) * np.log((p + 1) / (y + 1))

    def fit(self, X: np.array, y: np.array):
        """Fit ``n_estimators`` trees to successive negative gradients and return ``self``.

        Args:
            X: Feature table; anything that is not a DataFrame is wrapped in one
                because the base tree indexes it with ``.iloc``.
            y: 1-D (or single-column) numeric target.
        """
        X = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        y = np.asarray(y, dtype=float).reshape(-1, 1)

        self.trees_ = []
        self.init_value_ = float(np.mean(y))
        prediction = self.initialization(y)

        for _ in tqdm(range(self.n_estimators)):
            resid = -np.asarray(self.objective_grad(y, prediction), dtype=float)

            tree = DecisionTree(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                criterion=self.criterion,
                leaf_func=self.leaf_func
            )
            # Give the residuals the index of X so label-based masks inside the tree line up.
            tree.fit(X, pd.Series(resid.ravel(), index=X.index))
            b = np.asarray(tree.predict(X), dtype=float).reshape([X.shape[0], 1])
            self.trees_.append(tree)
            prediction += self.learning_rate * b
        return self

    def predict(self, X):
        """Return the boosted prediction for every row of ``X`` as an ``(n, 1)`` array.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.init_value_ is None:
            raise RuntimeError("GradientBoosting.predict called before fit")
        X = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        # Start from the same constant that training started from.
        predictions = np.full([X.shape[0], 1], self.init_value_, dtype=float)
        for tree in self.trees_:
            predictions += self.learning_rate * np.asarray(tree.predict(X), dtype=float).reshape([X.shape[0], 1])
        return predictions

# Функции для отрисовки

In [5]:
def show_histplot(data: pd.DataFrame, feature_name: str):
    """Show a histogram with a KDE curve for ``data``.

    Bins are 0.1 wide; ``feature_name`` is used in the axis label and title.
    """
    sns.histplot(data, binwidth=0.1, kde=True)
    plt.title(f'Распределение {feature_name}')
    plt.ylabel('Частота')
    plt.xlabel(f'Значения {feature_name}')
    plt.show()


def get_boxplot(df_column, column_name):
    """Show a box plot of ``df_column`` (whiskers at 1.0 IQR, mean marker on).

    ``column_name`` is accepted but not used by the body.
    """
    frame = pd.DataFrame(df_column)
    frame.boxplot(sym='o', whis=1.0, showmeans=True)
    plt.show()

def get_heatmap(df: pd.DataFrame):
    """Draw a seaborn heatmap of the values in ``df``."""
    sns.heatmap(data=df)

def get_3d(param1: list[int], param2: list[int], result: list[int], name_param1: str, name_param2: str):
    """Show a green 3-D line of ``result`` over ``param1`` and ``param2`` in a new figure."""
    plt.figure()
    axes_3d = plt.axes(projection='3d')
    axes_3d.plot3D(param1, param2, result, 'green')
    axes_3d.set_title(f'Зависимость метрики R² от {name_param1} и {name_param2}')
    plt.show()

def get_2d(param1: list[int], result: list[int], name_param1: str):
    """Plot ``result`` against ``param1`` on the current axes and set the title."""
    plt.plot(param1, result)
    plt.title(f'Зависимость метрики R² от {name_param1}')


def output_roc_auc(y_test: pd.Series, preds: pd.Series):
    """Plot the ROC curve (positive label 1), save it to ``ROC.png`` and show it.

    Args:
        y_test: True labels.
        preds: Scores or predictions passed to ``roc_curve``.
    """
    sns.set(font_scale=1.5)
    sns.set_color_codes("muted")

    plt.figure(figsize=(10, 8))
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, preds, pos_label=1)
    plt.plot(false_pos_rate, true_pos_rate, lw=2, label='ROC curve ')
    # Diagonal reference line.
    plt.plot([0, 1], [0, 1])
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve')
    plt.savefig("ROC.png")
    plt.show()

def plot_variance(pca, width=8, dpi=100):
    """Plot per-component and cumulative explained variance of a fitted PCA.

    Args:
        pca: Fitted object exposing ``n_components_`` and
            ``explained_variance_ratio_``.
        width: Figure width passed to ``fig.set(figwidth=...)``.
        dpi: Figure resolution passed to ``fig.set(dpi=...)``.

    Returns:
        The two matplotlib axes: the bar chart of explained variance and the
        cumulative curve.
    """
    fig, axs = plt.subplots(1, 2)
    n = pca.n_components_
    grid = np.arange(1, n + 1)
    evr = pca.explained_variance_ratio_
    axs[0].bar(grid, evr)
    axs[0].set(
        xlabel="Component", title="% Explained Variance", ylim=(0.0, 1.0)
    )
    cv = np.cumsum(evr)
    # Prepend the origin so the cumulative curve starts at (0, 0).
    axs[1].plot(np.r_[0, grid], np.r_[0, cv], "o-")
    axs[1].set(
        xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0)
    )
    # Honour the caller's figure size and resolution instead of fixed 8 / 100.
    fig.set(figwidth=width, dpi=dpi)
    return axs

def get_bias_variance(model_param, bias_model, var_model,
                      avg_model, name_param, name_model):
    """Plot three curves (bias, variance, average) over the values of one hyperparameter.

    ``name_param`` goes into the x-axis label and ``name_model`` into the y-axis label.
    """
    plt.figure(figsize=(10, 5))
    plt.xlabel(f'Different Values of {name_param}')
    plt.ylabel(f'Tradeoff bias-variance {name_model}')
    curves = (
        (bias_model, 'r', "avg_bias"),
        (var_model, 'b', 'avg_var'),
        (avg_model, 'g', 'avg_tree'),
    )
    for values, color, label in curves:
        plt.plot(model_param, values, color=color, label=label)
    plt.legend(bbox_to_anchor=(1, 1), bbox_transform=plt.gcf().transFigure)
    plt.show()

# Функции для обучения/теста моделей

In [6]:
def train_tree(param, X_train, y_train, X_test, y_test, name_param):
    """Fit a ``DecisionTree`` with one hyperparameter overridden and score it.

    Args:
        param: Value of the hyperparameter (converted with ``int``).
        X_train, y_train: Training data.
        X_test, y_test: Evaluation data.
        name_param: Name of the ``DecisionTree`` constructor argument to set.

    Returns:
        Tuple ``(macro_f1, bias, variance, sse)`` where ``variance`` is the
        variance of the predictions, ``sse`` is the mean squared difference
        between the mean prediction and ``y_test``, and ``bias`` is
        ``sse - variance``.
    """
    # Pass the value under the name held in ``name_param``; assigning
    # ``model.name_param`` would create an unrelated attribute.
    new_model = DecisionTree(**{name_param: int(param)})
    new_model.fit(X_train, y_train)

    preds_model = new_model.predict(X_test)
    # True labels go first, predictions second.
    report = output_metrics_classification(label_encoder.inverse_transform(y_test),
                                           label_encoder.inverse_transform(preds_model))

    pred_variance = np.var(preds_model)
    sse = np.mean((np.mean(preds_model) - y_test) ** 2)
    bias = sse - pred_variance

    return report['macro avg']['f1-score'], bias, pred_variance, sse


def trainer_one_param(func_train, X_train, y_train, X_test, y_test, arr_param, name_param, *args):
    """Evaluate ``func_train`` for every value in ``arr_param`` in a process pool.

    ``func_train`` is called as
    ``func_train(param, X_train, y_train, X_test, y_test, name_param)`` and must
    return ``(f1_score, bias, variance, sse)``.  ``*args`` is accepted but unused.

    Returns:
        Four lists, in the order of ``arr_param``: F1 scores, biases,
        variances and squared errors.
    """
    jobs = [(param, X_train, y_train, X_test, y_test, name_param) for param in arr_param]
    f1_scores, biases, variances, errors = [], [], [], []

    with Pool() as pool:
        outcomes = pool.starmap(func_train, jobs)
        for outcome in tqdm(outcomes, total=len(arr_param), desc="перебираем параметры"):
            f1_score, bias, variance, sse = outcome
            f1_scores.append(f1_score)
            biases.append(bias)
            variances.append(variance)
            errors.append(sse)

    return f1_scores, biases, variances, errors

# Всякие полезные функции

In [7]:
# Evaluation function
def evaluation(model_name, y_test, y_pred_test, output=False):
    """Compute R² and RMSE on the test data, optionally print them, and return R².

    Args:
        model_name: Label used in the printed header.
        y_test: True target values.
        y_pred_test: Predicted target values.
        output: When true, print the metrics.
    """
    r2_test = r2_score(y_test, y_pred_test)
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

    if not output:
        return r2_test

    for line in (
        f'\n{model_name} Evaluation:',
        f'Test R²: {r2_test:.7f}',
        f'Test RMSE: {rmse_test:.5f}',
    ):
        print(line)
    return r2_test

def output_metrics_classification(y_test: pd.Series, preds: pd.Series):
    """Return ``classification_report(y_test, preds)`` as a nested dict."""
    return classification_report(y_test, preds, output_dict=True)

def show_dependencies(num_layers, train_results, test_results):
    """Plot test and train results against ``num_layers`` on one labelled figure."""
    plt.figure(figsize=(10, 6))
    series = (
        (test_results, 'Test Results', 'b'),
        (train_results, 'Train Results', 'g'),
    )
    for values, label, color in series:
        plt.plot(num_layers, values, marker='o', label=label, color=color)
    plt.title('Зависимость тестовых и тренировочных результатов от количества слоев')
    plt.xlabel('Количество слоев (num_layers)')
    plt.ylabel('Результаты')
    plt.xticks(num_layers)
    plt.grid(True)
    plt.legend()

# Импорт датасета

Пол/Возраст/Рост/Вес - это физические характеристики

family_history_with_overweight -  семейная история с избыточным весом

FAVC - Частое употребление высококалорийной пищи

FCVC - Частота употребления овощей

NCP - Количество основных приемов пищи

CAEC - Количество приемов пищи между приемами пищи

SMOKE - употребление табака

CH2O - Ежедневное потребление воды

SCC - Контроль потребления калорий

FAF - Частота физической активности

TUE - Время, в течение которого вы пользуетесь техническими устройствами

CALC - Потребление алкоголя

MTRANS - Использованный транспорт

In [8]:
# Load the training table from Google Drive and drop the `id` column.
df = pd.read_csv('/content/drive/MyDrive/Всякие датасеты/train_fat.csv')
df = df.drop('id', axis=1)
# Display the resulting frame as the cell output.
df

# Label Encoding

In [9]:
categorical_features = df.columns[df.dtypes=="object"].tolist()
numeric_features = df.columns[df.dtypes!="object"].tolist()

# Use one encoder per column: refitting a single shared encoder keeps only the
# mapping of whichever column happened to be processed last.
label_encoders = {}
for feature in categorical_features:
    label_encoders[feature] = LabelEncoder()
    df[feature] = label_encoders[feature].fit_transform(df[feature])

# `label_encoder` is used by later cells to decode target predictions, so bind
# it to the encoder of the target column explicitly.  If the target was not an
# object column this stays unfitted and fails loudly instead of decoding with
# another column's mapping.
label_encoder = label_encoders.get('NObeyesdad', LabelEncoder())

# NOTE(review): this rebinds `categorical_features` to the bool-typed columns
# of the encoded frame — confirm this is intended.
categorical_features = df.columns[df.dtypes=="bool"].tolist()
df

# EDA: Obesity Risk

## Проверяем дубликаты и NaN

In [10]:
# Report the total number of missing cells and the number of duplicated rows.
print(f"Number of missing value:{df.isna().sum().sum()}")
print(f"Number of duplicated value:{df.duplicated().sum()}")

In [11]:
# Drop every row that contains at least one missing value.
df = df.dropna(axis=0)

## Распределение признаков

In [12]:
# Plot the distribution of every column.  A plain loop is used because the
# calls are made only for their side effect; a list comprehension would build
# (and display) a throwaway list of None values.
for column in df.columns:
    show_histplot(df[column], column)

**Итог**

Таргет имеет гипергеометрическое распределение, то есть модели будет относительно сложно его повторить. Спасибо, что не рандомное.

## Выбросы

### Ящики с усами

In [13]:
# Draw a box plot for every column.  A plain loop is used because the calls
# are made only for their side effect; a list comprehension would build (and
# display) a throwaway list of None values.
for column in df.columns:
    get_boxplot(df[column], column)

### Смотрим на выбросы в процентах

In [14]:
def find_outliers(df):
    """Count IQR-rule outliers in every column of ``df`` and print their share.

    A value is an outlier when it lies more than 1.5 IQR below the first
    quartile or above the third quartile of its column.

    Returns:
        Dict mapping column name to ``(percentage_of_rows, outlier_count)``.
    """
    summary = {}
    for col in df.columns:
        values = df[col]
        q1, q3 = values.quantile(0.25), values.quantile(0.75)
        iqr = q3 - q1
        too_low = values < q1 - 1.5 * iqr
        too_high = values > q3 + 1.5 * iqr
        outliers_count = (too_low | too_high).sum()
        perc = outliers_count * 100.0 / len(df)
        summary[col] = (perc, outliers_count)
        print(f"Column {col} outliers = {perc:.2f}%")

    return summary

# Compute (and print) the outlier share of every column of the dataset.
outliers = find_outliers(df)

**Итог**

Можно заметить, что некоторые признаки имеют большой процент выбросов, но на самом деле это не выбросы, а статистика и ее нужно учитывать, поэтому я не буду ничего с ними делать.

 ## Тепловая карта

In [15]:
# Heatmap of the Spearman rank correlation between all columns; the colour
# scale is fixed to [-1, 1] and centred at 0.
sns.heatmap(df.corr(method='spearman'), vmin=-1, vmax=1, center= 0, cmap= 'coolwarm')

## Mutual Information

In [16]:
# Separate the target column from the feature matrix (df itself is not modified).
y = df['NObeyesdad']
X = df.drop(columns='NObeyesdad')

In [17]:
def make_mi_scores(X, y, discrete_features):
    """Return the mutual information of every column of ``X`` with ``y``.

    Scores come from ``mutual_info_regression`` and are returned as a Series
    named "MI Scores", indexed by column name and sorted in descending order.
    """
    raw_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    return pd.Series(raw_scores, name="MI Scores", index=X.columns).sort_values(ascending=False)

def plot_mi_scores(scores):
    """Draw a horizontal bar chart of ``scores``, smallest at the bottom."""
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")

# Treat integer-typed columns as discrete features for the MI estimator.
discrete_features = X.dtypes == int
mi_scores = make_mi_scores(X, y, discrete_features)
# NOTE(review): this slice is neither assigned nor the last expression of the
# cell, so it appears to have no visible effect.
mi_scores[::3]
plt.figure(dpi=100, figsize=(8, 5))
plot_mi_scores(mi_scores)

**Итог**

Предлагаю удалить SMOKE, FAVC, MTRANS, CALC, потому что они не дают никакой связи ни на тепловой карте Спирмена, ни на графике взаимной информации

In [18]:
# Remove the four features judged uninformative in the analysis above.
X = X.drop(['SMOKE', 'FAVC', 'MTRANS', 'CALC'], axis=1)

## Feature engineering

In [19]:
# Row-wise aggregates must be computed from the original features only.
# Re-reading `X.columns` on every line would include the aggregates added on
# the previous lines (e.g. 'std' would be computed over 'mean' as well).
# Excluding the aggregate names also keeps the cell safe to re-run.
aggregate_names = ('mean', 'std', 'max', 'median')
base_features = [column for column in X.columns if column not in aggregate_names]
X['mean'] = X[base_features].mean(axis=1)
X['std'] = X[base_features].std(axis=1)
X['max'] = X[base_features].max(axis=1)
X['median'] = X[base_features].median(axis=1)

### PCA

In [20]:
# Standardise every feature column (zero mean, unit standard deviation).
X_scaled = (X - X.mean(axis=0)) / X.std(axis=0)
# PCA with the default number of components, to inspect the variance profile.
pca = PCA()
X_pca = pca.fit_transform(X_scaled)
component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
X_pca = pd.DataFrame(X_pca, columns=component_names)

# Loadings: one row per original feature, one column per principal component.
new_pca_df = pd.DataFrame(
    pca.components_.T,
    columns=component_names,
    index=X.columns,
)
new_pca_df

In [21]:
# Show the explained and cumulative variance of the fitted PCA components.
plot_variance(pca)

**Замечание**

Давайте оставим только 10 компонент, так как они объясняют около 90% всей дисперсии.

In [22]:
# Standardise the features again and keep only the first 10 principal components.
X_scaled = (X - X.mean(axis=0)) / X.std(axis=0)
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_scaled)
component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
# NOTE(review): X_pca receives a fresh default index here, while `y` keeps the
# index of `df` — confirm nothing downstream relies on the labels matching.
X_pca = pd.DataFrame(X_pca, columns=component_names)

# Loadings of the 10 retained components, one row per original feature.
pca_df = pd.DataFrame(
    pca.components_.T,
    columns=component_names,
    index=X.columns,
)
X_pca

# Разделение на train/test

In [25]:
# 70/30 train/test split of the PCA features with a fixed seed.
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size=0.3, random_state=42)

In [26]:
# The same 70/30 split (same seed) applied to the non-PCA feature matrix.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Поговорим про метрики

***На какую метрику мне больше всего выгоднее смотреть?***

- Precision показывает долю истинно-положительных ответов среди всех объектов, которые модель отнесла к положительному классу. То есть Precision демонстрирует способность модели отличать этот класс от других.

- Recall показывает долю истинно-положительных ответов среди всех объектов, которые действительно принадлежат положительному классу. То есть Recall демонстрирует способность модели находить этот класс в целом.

- F-1 - баланс между этими двумя метриками, очев.


По-хорошему надо бы смотреть на все три метрики, но на F-1 и Recall в особенности. F-1 более информативна, а Recall здесь важнее: если модель пропустит человека с высокой степенью ожирения и его не проверят (низкий Recall), это будет намного хуже, чем если модель припишет высокую степень ожирения человеку, у которого её нет (низкая Precision).

Лучше проверить, чем недопроверить!

# Обучение/тест - дерево решений

## Моя реализация

### Подбираем параметр min_samples_split, не ограничивая высоту

In [27]:
# Sweep min_samples_split for the custom tree and record the macro F1 together
# with the bias/variance proxies computed on the test split.
arr_split = range(10, 30, 3)
results, arr_var, arr_sse, arr_bias = [], [], [], []
for split in arr_split:
    tree = DecisionTree(max_depth=100, min_samples_split=split, criterion="entropy", leaf_func="classification_leaf")
    tree.fit(X_train_pca, y_train_pca)
    preds_tree = tree.predict(X_test_pca)
    # True labels go first, predictions second.
    mean_f1 = output_metrics_classification(label_encoder.inverse_transform(y_test_pca), label_encoder.inverse_transform(preds_tree))
    results.append(mean_f1['macro avg']['f1-score'])

    # `pred_variance` (not `variance`) so the module-level `variance`
    # criterion function is not overwritten by a float.
    pred_variance = np.var(preds_tree)
    # Compare with the test targets, not with the whole dataset's `y`.
    sse = np.mean((np.mean(preds_tree) - y_test_pca) ** 2)
    bias = sse - pred_variance

    arr_var.append(pred_variance)
    arr_sse.append(sse)
    arr_bias.append(bias)

In [28]:
# Macro F1 as a function of min_samples_split.
get_2d(arr_split, results, 'min_samples_split')

In [None]:
# Bias/variance curves of the min_samples_split sweep; the axis label names
# the parameter that was actually varied.
get_bias_variance(arr_split, arr_bias, arr_var, arr_sse, "min_samples_split", "Decision Tree")

### Подбираем параметр min_samples_leaf, не ограничивая высоту

In [29]:
# Sweep min_samples_leaf for the custom tree and record the macro F1 together
# with the bias/variance proxies computed on the test split.
arr_leaf = range(7, 25, 3)
results, arr_var, arr_sse, arr_bias = [], [], [], []
for leaf in arr_leaf:
    tree = DecisionTree(max_depth=100, min_samples_leaf=leaf, criterion="entropy", leaf_func="classification_leaf")
    tree.fit(X_train_pca, y_train_pca)
    preds_tree = tree.predict(X_test_pca)
    # True labels go first, predictions second.
    mean_f1 = output_metrics_classification(label_encoder.inverse_transform(y_test_pca), label_encoder.inverse_transform(preds_tree))
    results.append(mean_f1['macro avg']['f1-score'])

    # `pred_variance` (not `variance`) so the module-level `variance`
    # criterion function is not overwritten by a float.
    pred_variance = np.var(preds_tree)
    # Compare with the test targets, not with the whole dataset's `y`.
    sse = np.mean((np.mean(preds_tree) - y_test_pca) ** 2)
    bias = sse - pred_variance

    arr_var.append(pred_variance)
    arr_sse.append(sse)
    arr_bias.append(bias)

In [30]:
# Macro F1 as a function of min_samples_leaf.
get_2d(arr_leaf, results, 'min_samples_leaf')

In [31]:
# Bias/variance curves of the min_samples_leaf sweep.
get_bias_variance(arr_leaf, arr_bias, arr_var, arr_sse, "min_samples_leaf", "Decision Tree")

### Переберем высоты и посмотрим, какая лучше

In [None]:
# Sweep max_depth for the custom tree with the split/leaf limits fixed and
# record the macro F1 on the test split.
arr_depth = range(10, 30, 2)
results = []
for depth in tqdm(arr_depth):
    tree = DecisionTree(max_depth=depth, min_samples_split=10, min_samples_leaf=7, criterion="entropy", leaf_func="classification_leaf")
    tree.fit(X_train_pca, y_train_pca)
    preds_tree = tree.predict(X_test_pca)
    # True labels go first, predictions second; otherwise per-class precision
    # and recall in the report are swapped.
    mean_f1 = output_metrics_classification(label_encoder.inverse_transform(y_test_pca), label_encoder.inverse_transform(preds_tree))
    results.append(mean_f1['macro avg']['f1-score'])

In [None]:
# Macro F1 as a function of max_depth (the parameter swept in arr_depth).
get_2d(arr_depth, results, 'max_depth')

**Итог:**
Давайте посмотрим на метрики последних двух классов: Overweight_Level_II и Overweight_Level_I.
- маленькая Precision у обоих классов означает, что моя модель дала много ложноположительных ответов. То есть она причисляет объекты с отрицательным таргетом к положительному классу.

- маленькая Recall у обоих классов означает, что моя модель дала много ложноотрицательных ответов. То есть она причисляет объекты с положительным таргетом к другим классам.

**Вывод такой:** модель не умеет ни отличать эти классы от других, ни находить их.

 Почему так произошло? Потому что, судя по support и распределению таргета, имеется дисбаланс классов, а дерево решений зависимо от баланса классов.

Скорее всего последние два класса путаются с Obesity_Type_I и Obesity_Type_III

## Sklearn реализация

### Подбираем параметр min_samples_split, не ограничивая высоту

In [None]:
# Sweep min_samples_split for sklearn's DecisionTreeClassifier and record the
# macro F1 on the test split.
arr_split = range(10, 30, 3)
best_f1 = -1
results = []
for split in arr_split:
    tree = DecisionTreeClassifier(max_depth=100, min_samples_split=split, criterion="entropy")
    tree.fit(X_train_pca, y_train_pca)
    preds_tree = tree.predict(X_test_pca)
    # True labels go first, predictions second; otherwise per-class precision
    # and recall in the report are swapped.
    mean_f1 = output_metrics_classification(label_encoder.inverse_transform(y_test_pca), label_encoder.inverse_transform(preds_tree))
    results.append(mean_f1['macro avg']['f1-score'])

In [None]:
# Macro F1 as a function of min_samples_split.
get_2d(arr_split, results, 'min_samples_split')

### Подбираем параметр min_samples_leaf, не ограничивая высоту

In [None]:
# Sweep min_samples_leaf for sklearn's DecisionTreeClassifier and record the
# macro F1 on the test split.
arr_leaf = range(7, 25, 3)
best_f1 = -1
results = []
for leaf in arr_leaf:
    tree = DecisionTreeClassifier(max_depth=100, min_samples_leaf=leaf, criterion="entropy")
    tree.fit(X_train_pca, y_train_pca)
    preds_tree = tree.predict(X_test_pca)
    # True labels go first, predictions second; otherwise per-class precision
    # and recall in the report are swapped.
    mean_f1 = output_metrics_classification(label_encoder.inverse_transform(y_test_pca), label_encoder.inverse_transform(preds_tree))
    results.append(mean_f1['macro avg']['f1-score'])

In [None]:
# Macro F1 as a function of min_samples_leaf.
get_2d(arr_leaf, results, 'min_samples_leaf')

### Переберем высоты и посмотрим, какая лучше

In [None]:
# Sweep max_depth for sklearn's DecisionTreeClassifier with the split/leaf
# limits fixed and record the macro F1 on the test split.
arr_depth = range(10, 30, 2)
results = []
for depth in tqdm(arr_depth):
    tree = DecisionTreeClassifier(max_depth=depth, min_samples_split=25, min_samples_leaf=11, criterion="entropy")
    tree.fit(X_train_pca, y_train_pca)
    preds_tree = tree.predict(X_test_pca)
    # True labels go first, predictions second; otherwise per-class precision
    # and recall in the report are swapped.
    mean_f1 = output_metrics_classification(label_encoder.inverse_transform(y_test_pca), label_encoder.inverse_transform(preds_tree))
    results.append(mean_f1['macro avg']['f1-score'])

In [None]:
# Macro F1 as a function of max_depth.
get_2d(arr_depth, results, 'max_depth')

# Случайный лес

## Моя реализация

### Подбор гиперпараметров

In [None]:
# Exhaustive search over min_samples_split, max_depth and the number of trees
# for the custom RandomForest, keeping the configuration with the best macro F1.
arr_split = range(10, 30, 3)
max_depths = range(7, 20, 2)
n_estimators = range(5, 15, 2)
best_f1 = -1
results = []
for split in tqdm(arr_split):
    for depth in max_depths:
        for estimators in n_estimators:
            tree = RandomForest(max_depth=depth, min_samples_split=split, min_samples_leaf=8, criterion="entropy", leaf_func="classification_leaf", N=estimators)
            tree.fit(X_train_pca, y_train_pca)
            preds_tree = tree.predict(X_test_pca)
            # True labels go first, predictions second.
            mean_f1 = output_metrics_classification(label_encoder.inverse_transform(y_test_pca), label_encoder.inverse_transform(preds_tree))
            score = mean_f1['macro avg']['f1-score']
            if best_f1 < score:
                # Store the scalar score; storing the whole report dict would
                # make the next `best_f1 < score` comparison raise TypeError.
                best_f1 = score
                best_depth = depth
                best_split = split
                best_n_estimators = estimators


### Построим зависимость от количества деревьев

In [None]:
# Sweep the number of trees of the custom RandomForest and record the macro F1
# on the test split.
n_estimators = range(5, 15, 2)
results = []
for estimators in tqdm(n_estimators):
    tree = RandomForest(max_depth=15, min_samples_split=10, min_samples_leaf=8, criterion="entropy", leaf_func="classification_leaf", N=estimators)
    tree.fit(X_train_pca, y_train_pca)
    preds_tree = tree.predict(X_test_pca)
    # True labels go first, predictions second; otherwise per-class precision
    # and recall in the report are swapped.
    mean_f1 = output_metrics_classification(label_encoder.inverse_transform(y_test_pca), label_encoder.inverse_transform(preds_tree))
    results.append(mean_f1['macro avg']['f1-score'])

In [None]:
# `results` was collected while sweeping `n_estimators`, so that range (and
# not `arr_split`, which has a different length) is the matching x-axis.
# NOTE(review): get_2d is defined elsewhere in the notebook; presumably it
# draws a 2-D plot and uses the string as the parameter label - confirm.
get_2d(n_estimators, results, 'n_estimators')

## Sklearn реализация

### Подбор гиперпараметров

In [None]:
# Exhaustive search over min_samples_split, max_depth and n_estimators for
# RandomForestClassifier. The combination with the highest macro-averaged F1
# on the test split ends up in best_f1 / best_depth / best_split /
# best_n_estimators.
arr_split = range(10, 30, 3)
max_depths = range(7, 20, 2)
n_estimators = range(5, 15, 2)
best_f1 = -1
results = []
for split in tqdm(arr_split):
    for depth in max_depths:
        for estimators in n_estimators:
            tree = RandomForestClassifier(
                max_depth=depth,
                min_samples_split=split,
                criterion="entropy",
                n_estimators=estimators,
            )
            tree.fit(X_train_pca, y_train_pca)
            preds_tree = tree.predict(X_test_pca)

            # Decode both label vectors before scoring.
            decoded_preds = label_encoder.inverse_transform(preds_tree)
            decoded_truth = label_encoder.inverse_transform(y_test_pca)
            mean_f1 = output_metrics_classification(decoded_preds, decoded_truth)

            macro_f1 = mean_f1['macro avg']['f1-score']
            if macro_f1 > best_f1:
                # New best configuration: remember its score and parameters.
                best_f1 = macro_f1
                best_depth = depth
                best_split = split
                best_n_estimators = estimators


In [None]:
# Report the best macro-F1 and the hyperparameters currently stored in the
# best_* variables by the grid-search cell.
print(f"best_f1 = {best_f1}, best_depth = {best_depth}, best_split = {best_split}, best_n_estimators = {best_n_estimators}")

### Смотрим на зависимость качества от количества деревьев на train/test выборках

In [None]:
# Train RandomForestClassifier with a growing number of trees and collect the
# macro-averaged F1 on both the train and the test split, to compare the two
# curves.
results_train = []
results_test = []
n_estimators = range(5, 20, 2)
for estimators in n_estimators:
    # Use the hyperparameters selected by the grid search. The bare `depth`
    # and `split` names only hold the last values the search loops visited.
    tree = RandomForestClassifier(max_depth=best_depth, min_samples_split=best_split, criterion="entropy", n_estimators=estimators)
    tree.fit(X_train_pca, y_train_pca)

    # Score on the held-out test split.
    preds_tree_test = tree.predict(X_test_pca)
    mean_f1_test = output_metrics_classification(label_encoder.inverse_transform(preds_tree_test), label_encoder.inverse_transform(y_test_pca))
    results_test.append(mean_f1_test['macro avg']['f1-score'])

    # Score on the data the model was fitted on.
    preds_tree_train = tree.predict(X_train_pca)
    mean_f1_train = output_metrics_classification(label_encoder.inverse_transform(preds_tree_train), label_encoder.inverse_transform(y_train_pca))
    results_train.append(mean_f1_train['macro avg']['f1-score'])

In [None]:
# Compare the train and test macro-F1 collected for each value of
# n_estimators.
# NOTE(review): show_dependencies is defined elsewhere in the notebook;
# presumably it plots both series against the first argument - confirm.
show_dependencies(n_estimators, results_train, results_test)

# Градиентный бустинг

## Моя реализация

In [None]:
# Train the gradient boosting model with a growing number of estimators and
# collect the macro-averaged F1 on both the train and the test split.
# NOTE(review): the keyword arguments (leaf_func, loss_name,
# criterion="variance") are not sklearn's, so GradientBoostingClassifier is
# presumably the notebook's own class shadowing the sklearn import - confirm.
results_train = []
results_test = []
n_estimators = range(5, 20, 2)
for estimators in tqdm(n_estimators):
    # Pass the swept value: a hard-coded n_estimators would train the same
    # model on every iteration and make the curve meaningless.
    tree = GradientBoostingClassifier(n_estimators=estimators, learning_rate=0.01, max_depth=3, min_samples_split=5, criterion="variance",
                                      leaf_func="regression_leaf", random_state=17, loss_name="mse")
    tree.fit(X_train_pca, y_train_pca)

    # Score on the held-out test split.
    preds_tree_test = tree.predict(X_test_pca)
    mean_f1_test = output_metrics_classification(label_encoder.inverse_transform(preds_tree_test), label_encoder.inverse_transform(y_test_pca))
    results_test.append(mean_f1_test['macro avg']['f1-score'])

    # Score on the data the model was fitted on.
    preds_tree_train = tree.predict(X_train_pca)
    mean_f1_train = output_metrics_classification(label_encoder.inverse_transform(preds_tree_train), label_encoder.inverse_transform(y_train_pca))
    results_train.append(mean_f1_train['macro avg']['f1-score'])

In [None]:
# Compare the train and test macro-F1 collected for each value of
# n_estimators.
# NOTE(review): show_dependencies is defined elsewhere in the notebook;
# presumably it plots both series against the first argument - confirm.
show_dependencies(n_estimators, results_train, results_test)

## Sklearn реализация

In [None]:
# Train GradientBoostingClassifier (criterion="squared_error") with a growing
# number of estimators and collect the macro-averaged F1 on both the train
# and the test split.
results_train = []
results_test = []
n_estimators = range(5, 20, 2)
for estimators in tqdm(n_estimators):
    tree = GradientBoostingClassifier(
        criterion="squared_error",
        n_estimators=estimators,
    )
    tree.fit(X_train_pca, y_train_pca)

    # Score on the held-out test split.
    preds_tree_test = tree.predict(X_test_pca)
    decoded_test_preds = label_encoder.inverse_transform(preds_tree_test)
    decoded_test_truth = label_encoder.inverse_transform(y_test_pca)
    mean_f1_test = output_metrics_classification(decoded_test_preds, decoded_test_truth)
    results_test.append(mean_f1_test['macro avg']['f1-score'])

    # Score on the data the model was fitted on.
    preds_tree_train = tree.predict(X_train_pca)
    decoded_train_preds = label_encoder.inverse_transform(preds_tree_train)
    decoded_train_truth = label_encoder.inverse_transform(y_train_pca)
    mean_f1_train = output_metrics_classification(decoded_train_preds, decoded_train_truth)
    results_train.append(mean_f1_train['macro avg']['f1-score'])

In [None]:
# Compare the train and test macro-F1 collected for each value of
# n_estimators.
# NOTE(review): show_dependencies is defined elsewhere in the notebook;
# presumably it plots both series against the first argument - confirm.
show_dependencies(n_estimators, results_train, results_test)