In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, confusion_matrix, roc_curve, auc, 
                             roc_auc_score, classification_report)
import warnings
warnings.filterwarnings('ignore')

In [None]:
plt.style.use('ggplot')
plt.rcparameters['figure.figsize'] = (12, 8)
plt.rcparameters['font.size'] = 12
sns.set_palette("husl")

In [3]:
# Загрузка датасета
df = pd.read_csv('diabetes.csv')
X = df.drop('Outcome', axis=1)
y = df['Outcome'] # целевая переменная: диабет присутствует или отсутствует
features = X.columns.tolist()

print(f'Размер датасета: {df.shape[0]} x {df.shape[1]}')
print(df.head())

Размер датасета: 768 x 9
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [4]:
# 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f'Train: {X_train.shape}')
print(f'Test: {X_test.shape}')

Train: (614, 8)
Test: (154, 8)


- - - - - - - - -
**Задание №1:**

Используя библиотеку Scikit-Learn, для полученной модели выполнить процесс подбора гиперпараметров, используя Random Search.

In [None]:
# Параметры
parameters_dist_random = {
    'n_estimators': [50, 100, 200, 300, 400, 500],
    'max_depth': [None, 3, 5, 7, 10, 15, 20, 30],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['auto', 'sqrt', 'log2', None],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
    'class_weight': [None, 'balanced', 'balanced_subsample'],
    'max_samples': [None, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'max_leaf_nodes': [None, 10, 20, 30, 40, 50],
    'min_impurity_decrease': [0.0, 0.1, 0.2, 0.3, 0.4]
}

# Выбран random forest, так как в работе 5.2 он показал себя лучше, чем xgboost
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42, n_jobs=-1),
    param_distributions=parameters_dist_random,
    n_iter=50,
    scoring='roc_auc',
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

start_time_random = time.time()
random_search.fit(X_train, y_train) # обучение
random_search_time = time.time() - start_time_random
best_parameters_random = random_search.best_params_
best_score_random = random_search.best_score_

# Лучшая модель по итогам random search
best_model_random = random_search.best_estimator_
y_predict_random = best_model_random.predict(X_test)
y_predict_proba_random = best_model_random.predict_proba(X_test)[:, 1]
roc_auc_random = roc_auc_score(y_test, y_predict_proba_random)
accuracy_random = accuracy_score(y_test, y_predict_random)
precision_random = precision_score(y_test, y_predict_random)
recall_random = recall_score(y_test, y_predict_random)
f1_random = f1_score(y_test, y_predict_random)

print(f"Лучшие параметры: {best_parameters_random}")
print(f"Время выполнения: {random_search_time:.4f} с")
print(f"ROC-AUC: {roc_auc_random:.4f}")
print(f'Accuracy: {accuracy_random:.4f}')
print(f'Precision: {precision_random:.4f}')
print(f'Recall: {recall_random:.4f}')
print(f'F1-Score: {f1_random:.4f}')


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Лучшие параметры: {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 8, 'min_impurity_decrease': 0.0, 'max_samples': None, 'max_leaf_nodes': 50, 'max_features': 'log2', 'max_depth': 7, 'criterion': 'entropy', 'class_weight': 'balanced', 'bootstrap': True}
Время выполнения: 14.5922 с
ROC-AUC: 0.8330
Accuracy: 0.7468
Precision: 0.6154
Recall: 0.7407
F1-Score: 0.6723


- - - - - - - - -
**Задание №2:**

Выполнить подбор гиперпараметров, для той же модели используя алгоритм TPE и библиотеку Hyperopt.

In [13]:
!pip install hyperopt

Collecting hyperopt
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting future (from hyperopt)
  Downloading future-1.0.0-py3-none-any.whl.metadata (4.0 kB)
Collecting py4j (from hyperopt)
  Downloading py4j-0.10.9.9-py2.py3-none-any.whl.metadata (1.3 kB)
Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   -------------------------- ------------- 1.0/1.6 MB 10.1 MB/s eta 0:00:01
   ---------------------------------------- 1.6/1.6 MB 9.4 MB/s eta 0:00:00
Downloading future-1.0.0-py3-none-any.whl (491 kB)
Downloading py4j-0.10.9.9-py2.py3-none-any.whl (203 kB)
Installing collected packages: py4j, future, hyperopt
Successfully installed future-1.0.0 hyperopt-0.2.7 py4j-0.10.9.9


In [14]:
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK

In [22]:
# Пространство поиска
space_hyperopt = {
    'n_estimators': hp.choice('n_estimators', [50, 100, 200, 300, 400, 500]),
    'max_depth': hp.choice('max_depth', [None, 3, 5, 7, 10, 15, 20, 30]),
    'min_samples_split': hp.choice('min_samples_split', [2, 5, 10, 15]),
    'min_samples_leaf': hp.choice('min_samples_leaf', [1, 2, 4, 8]),
    'max_features': hp.choice('max_features', ['sqrt', 'log2', None]),
    'criterion': hp.choice('criterion', ['gini', 'entropy']),
    'class_weight': hp.choice('class_weight', [None, 'balanced', 'balanced_subsample']),
    'max_samples': hp.choice('max_samples', [None, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
    'max_leaf_nodes': hp.choice('max_leaf_nodes', [None, 10, 20, 30, 40, 50]),
    'min_impurity_decrease': hp.choice('min_impurity_decrease', [0.0, 0.1, 0.2, 0.3, 0.4])
}

# Функция цели
def objective_function(parameters):    
    model = RandomForestClassifier(
        **parameters,
        random_state=42, n_jobs=-1
    )
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc', n_jobs=-1)
    return {'loss': -scores.mean(), 'status': STATUS_OK}

In [24]:
trials = Trials()
start_time_hyperopt = time.time()

model_hyperopt = fmin(
    fn=objective_function,
    space=space_hyperopt,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=np.random.default_rng(42)
)

hyperopt_time = time.time() - start_time_hyperopt

best_params_hyperopt = {
    'n_estimators': [50, 100, 200, 300, 400, 500][model_hyperopt['n_estimators']],
    'max_depth': [None, 3, 5, 7, 10, 15, 20, 30][model_hyperopt['max_depth']],
    'min_samples_split': [2, 5, 10, 15][model_hyperopt['min_samples_split']],
    'min_samples_leaf': [1, 2, 4, 8][model_hyperopt['min_samples_leaf']],
    'max_features': ['sqrt', 'log2', None][model_hyperopt['max_features']],
    'criterion': ['gini', 'entropy'][model_hyperopt['criterion']],
    'class_weight': [None, 'balanced', 'balanced_subsample'][model_hyperopt['class_weight']],
    'max_samples': [None, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0][model_hyperopt['max_samples']],
    'max_leaf_nodes': [None, 10, 20, 30, 40, 50][model_hyperopt['max_leaf_nodes']],
    'min_impurity_decrease': [0.0, 0.1, 0.2, 0.3, 0.4][model_hyperopt['min_impurity_decrease']]
}


print(f"Лучшие параметры: {best_params_hyperopt}")
print(f"Время выполнения: {hyperopt_time:.2f} с")

100%|██████████| 50/50 [00:37<00:00,  1.33trial/s, best loss: -0.8360160575858251]
Лучшие параметры: {'n_estimators': 300, 'max_depth': 30, 'min_samples_split': 5, 'min_samples_leaf': 8, 'max_features': 'sqrt', 'criterion': 'entropy', 'class_weight': None, 'max_samples': 1.0, 'max_leaf_nodes': 20, 'min_impurity_decrease': 0.0}
Время выполнения: 37.68 с


In [28]:
best_model_hyperopt = RandomForestClassifier(**best_params_hyperopt, random_state=42, n_jobs=-1)
best_model_hyperopt.fit(X_train, y_train) # обучение
y_predict_hyperopt = best_model_hyperopt.predict(X_test)
y_predict_proba_hyperopt = best_model_hyperopt.predict_proba(X_test)[:, 1]
roc_auc_hyperopt = roc_auc_score(y_test, y_predict_proba_hyperopt)
accuracy_hyperopt = accuracy_score(y_test, y_predict_hyperopt)
precision_hyperopt = precision_score(y_test, y_predict_hyperopt)
recall_hyperopt = recall_score(y_test, y_predict_hyperopt)
f1_hyperopt = f1_score(y_test, y_predict_hyperopt)

print(f"ROC-AUC: {roc_auc_hyperopt:.4f}")
print(f"Accuracy: {accuracy_hyperopt:.4f}")
print(f"Precision: {precision_hyperopt:.4f}")
print(f"Recall: {recall_hyperopt:.4f}")
print(f"F1-Score: {f1_hyperopt:.4f}")

ROC-AUC: 0.8267
Accuracy: 0.7532
Precision: 0.6818
Recall: 0.5556
F1-Score: 0.6122


- - - - - - -
**Задание №3:**

Проанализировать полученные результаты (как минимум сравнить значение параметров и время, необходимое для оптимизации).

**Вывод:**

Время оптимизации **Random Search** оказалось гораздо быстрее, чем Hyperopt. 

Согласно метрикам ROC-AUC и F1-Score, **Random Search** показал более хорошие результаты (остальные метрики показали более высокий результат после работы Hyperopt, но первые две более устойчивы, потому вывод сделан по ним.)