In [1]:
#импорт библиотек
import numpy as np #для матричных вычислений
import pandas as pd #для анализа и предобработки данных
import matplotlib.pyplot as plt #для визуализации
import seaborn as sns #для визуализации

from sklearn import linear_model #линейные моделиё
from sklearn import ensemble #ансамбли
from sklearn import metrics #метрики
from sklearn import model_selection 
from sklearn.model_selection import train_test_split #сплитование выборки

%matplotlib inline
plt.style.use('seaborn')

In [2]:
#считываем данные
data = pd.read_csv('data/_train_sem09 (1).csv')
data.head()

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [3]:
#осуществляем разбивку на тренировочную и тестовую выборки
y = data['Activity']
x = data.drop(columns='Activity')
X_train, X_test, y_train, y_test = model_selection.train_test_split(x, y, stratify=y, test_size=0.3, random_state=42)

## <center>Логистическая регрессия

### Стандартная модель с параметрами по умолчанию

In [10]:
# обучаем модель логистическое регрессии с параметрами по умолчанию
log_reg = linear_model.LogisticRegression(random_state=42, max_iter=2000)
log_reg.fit(X_train, y_train)
y_test_pred = log_reg.predict(X_test)
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')

f1_score на тестовом наборе: 0.78


### Оптимизация логистической регрессии с помощью GridSearchCV

In [13]:
#задаем сетку параметров
params_grid = [
    {'penalty': ['l2', 'none'] , 
    'solver': ['lbfgs', 'sag'], 
    'C': np.linspace(0.01, 1, 5)}, 
    
    {'penalty': ['l2', 'l1'] , 
    'solver': ['liblinear', 'saga'], 
    'C': np.linspace(0.01, 1, 5)}
]

#инициируем экземпляр GridSearchCV
grid_search = model_selection.GridSearchCV(
    estimator=log_reg,
    param_grid=params_grid,
    cv=5,
    n_jobs=-1
)

#обучаем модель и подбираем параметры
%time grid_search.fit(X_train, y_train)
y_test_pred = grid_search.predict(X_test)
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')

CPU times: total: 14.2 s
Wall time: 10min 42s
f1_score на тестовом наборе: 0.79


In [19]:
#выводим лучшие гиперпараметры модели
grid_search.best_params_

{'C': 0.2575, 'penalty': 'l1', 'solver': 'saga'}

### Оптимизация логистической регрессии с помощью RandomizedSearchCV

In [21]:
#задаем диапазон гиперпараметров
params_distrib = [
    {'penalty': ['l2', 'none'] , 
    'solver': ['lbfgs', 'sag', 'newton-cg', 'newton-cholesky'], 
    'C': np.linspace(0.01, 1, 10)}, 
    
    {'penalty': ['l2', 'l1'] , 
    'solver': ['liblinear', 'saga'], 
    'C': np.linspace(0.01, 1, 10)}
]

#инициируем экземпляр RandomizedSearchCV
random_search = model_selection.RandomizedSearchCV(
    estimator=log_reg,
    param_distributions=params_distrib,
    cv=5,
    n_jobs=-1,
    n_iter=50
)

%time random_search.fit(X_train, y_train)
y_test_pred = random_search.predict(X_test)
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')

CPU times: total: 3.25 s
Wall time: 7min 53s
f1_score на тестовом наборе: 0.79


In [68]:
# выведем гиперпараметры модели
print(f'Гиперпараметры модели - {random_search.best_params_}')

Гиперпараметры модели - {'solver': 'sag', 'penalty': 'l2', 'C': 0.12}


### Оптимизация логистической регрессии с помощью Hyperopt

In [63]:
import hyperopt # импотируем Hyperopt
from hyperopt import hp, fmin, tpe, Trials # и необходимые классы

#задаем диапазон гиперпараметров

space = hp.choice('variants', [
    {
        'solver': hp.choice('solver1', ['lbfgs', 'sag', 'newton-cholesky', 'newton-cg']),
        'C': hp.quniform('C1', 0.01, 1, 0.01),
        'penalty': hp.choice('penalty1', ['l2'])
    },
    {
        'solver': hp.choice('solver2', ['lbfgs', 'sag', 'newton-cholesky', 'newton-cg']),
        'C': hp.quniform('C2', 0.01, 1, 0.01),
        'penalty': hp.choice('penalty2', ['none'])
    },
    {
        'solver': hp.choice('solver3', ['liblinear', 'saga']),
        'C': hp.quniform('C3', 0.01, 1, 0.01),
        'penalty': hp.choice('penalty3', ['l2', 'l1'])
    }])

# space = hp.choice(
#     'variants': [
#         {
#             'solver': hp.choice('solver1', ['lbfgs', 'sag', 'newton-cholesky', 'newton-cg']),
#             'C': hp.quniform('C1', 0.01, 1, 0.01),
#             'penalty': hp.choice('penalty1', ['l2', 'none'])
#         }
#     ]
# )

random_state = 42
def hyperopt_lg(params, cv=5, X=X_train, y=y_train, random_state=random_state):
    print(params)
    params = {
        'solver': str(params['solver']),
        'penalty': str(params['penalty']),
        'C': round(float(params['C']), 2)
    }
    
    model = linear_model.LogisticRegression(**params, random_state=random_state, max_iter=2000)
    
    model.fit(X, y)
    
    score = model_selection.cross_val_score(
        estimator=model,
        X=X,
        y=y,
        scoring='f1',
        cv=5,
        n_jobs=-1
    ).mean()
    
    return -score

In [64]:
trials = Trials()

best_params = fmin(
    hyperopt_lg,
    space=space,
    algo=tpe.suggest,
    trials=trials,
    max_evals=50,
    rstate=np.random.default_rng(random_state)
)

print(f'Наилучшие значения гиперпараметров - {best_params}')

{'C': 0.62, 'penalty': 'l2', 'solver': 'newton-cg'}   
{'C': 0.86, 'penalty': 'l2', 'solver': 'liblinear'}                              
{'C': 0.08, 'penalty': 'l2', 'solver': 'lbfgs'}                                  
{'C': 0.22, 'penalty': 'none', 'solver': 'sag'}                                  
  6%|▌         | 3/50 [00:11<02:43,  3.47s/trial, best loss: -0.7909115547558131]






{'C': 0.11, 'penalty': 'l2', 'solver': 'newton-cholesky'}                        
{'C': 0.45, 'penalty': 'l2', 'solver': 'saga'}                                   
{'C': 0.09, 'penalty': 'l2', 'solver': 'saga'}                                   
{'C': 0.8300000000000001, 'penalty': 'none', 'solver': 'lbfgs'}                  
 14%|█▍        | 7/50 [03:06<19:49, 27.67s/trial, best loss: -0.7909115547558131]





{'C': 0.77, 'penalty': 'l2', 'solver': 'newton-cholesky'}                        
{'C': 0.44, 'penalty': 'l2', 'solver': 'liblinear'}                              
{'C': 0.05, 'penalty': 'l2', 'solver': 'lbfgs'}                                   
{'C': 0.11, 'penalty': 'l2', 'solver': 'saga'}                                    
{'C': 0.73, 'penalty': 'none', 'solver': 'sag'}                                   
 24%|██▍       | 12/50 [03:49<07:50, 12.37s/trial, best loss: -0.7914103753551599]






{'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}                                
{'C': 0.91, 'penalty': 'none', 'solver': 'lbfgs'}                                 
 28%|██▊       | 14/50 [06:00<20:10, 33.63s/trial, best loss: -0.7914103753551599]





{'C': 0.99, 'penalty': 'l2', 'solver': 'newton-cg'}                               
{'C': 0.8200000000000001, 'penalty': 'l1', 'solver': 'saga'}                      
 32%|███▏      | 16/50 [06:24<12:45, 22.51s/trial, best loss: -0.7914103753551599]




{'C': 0.11, 'penalty': 'none', 'solver': 'sag'}                                   
 34%|███▍      | 17/50 [09:41<41:05, 74.71s/trial, best loss: -0.7914103753551599]






{'C': 0.14, 'penalty': 'l2', 'solver': 'liblinear'}                               
{'C': 0.72, 'penalty': 'l2', 'solver': 'sag'}                                     
{'C': 0.04, 'penalty': 'l2', 'solver': 'lbfgs'}                                   
{'C': 0.3, 'penalty': 'l2', 'solver': 'lbfgs'}                                    
{'C': 0.3, 'penalty': 'l2', 'solver': 'lbfgs'}                                    
{'C': 0.26, 'penalty': 'l2', 'solver': 'lbfgs'}                                   
{'C': 0.02, 'penalty': 'l2', 'solver': 'lbfgs'}                                   
{'C': 0.44, 'penalty': 'l2', 'solver': 'sag'}                                     
{'C': 0.02, 'penalty': 'l2', 'solver': 'lbfgs'}                                   
{'C': 0.18, 'penalty': 'l2', 'solver': 'lbfgs'}                                   
{'C': 0.43, 'penalty': 'l2', 'solver': 'lbfgs'}                                   
{'C': 0.19, 'penalty': 'l2', 'solver': 'lbfgs'}                                   
{'C'







{'C': 0.56, 'penalty': 'l2', 'solver': 'lbfgs'}                                   
{'C': 0.14, 'penalty': 'l2', 'solver': 'lbfgs'}                                   
{'C': 0.43, 'penalty': 'none', 'solver': 'newton-cholesky'}                       
 72%|███████▏  | 36/50 [14:38<03:04, 13.15s/trial, best loss: -0.7914103753551599]



Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=9.57158e-23): result may not be accurate.



{'C': 0.23, 'penalty': 'l2', 'solver': 'lbfgs'}                                   
{'C': 0.07, 'penalty': 'l2', 'solver': 'newton-cholesky'}                         
{'C': 0.35000000000000003, 'penalty': 'l2', 'solver': 'newton-cholesky'}          
{'C': 0.6900000000000001, 'penalty': 'l1', 'solver': 'liblinear'}                 
{'C': 0.04, 'penalty': 'none', 'solver': 'newton-cg'}                             
 82%|████████▏ | 41/50 [15:06<00:51,  5.73s/trial, best loss: -0.7919953322184076]







{'C': 0.12, 'penalty': 'l2', 'solver': 'newton-cholesky'}                         
{'C': 0.63, 'penalty': 'l1', 'solver': 'saga'}                                    
 86%|████████▌ | 43/50 [16:08<01:53, 16.25s/trial, best loss: -0.7919953322184076]




{'C': 0.29, 'penalty': 'none', 'solver': 'newton-cholesky'}                       
 88%|████████▊ | 44/50 [19:22<06:57, 69.52s/trial, best loss: -0.7919953322184076]



Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=9.57158e-23): result may not be accurate.



{'C': 0.5, 'penalty': 'l2', 'solver': 'newton-cholesky'}                          
{'C': 0.99, 'penalty': 'l1', 'solver': 'saga'}                                    
 92%|█████████▏| 46/50 [19:40<02:33, 38.27s/trial, best loss: -0.7919953322184076]




{'C': 0.88, 'penalty': 'l2', 'solver': 'newton-cholesky'}                         
{'C': 0.68, 'penalty': 'none', 'solver': 'lbfgs'}                                 
 96%|█████████▌| 48/50 [23:00<02:02, 61.07s/trial, best loss: -0.7919953322184076]





{'C': 0.32, 'penalty': 'l1', 'solver': 'liblinear'}                               
100%|██████████| 50/50 [23:16<00:00, 27.93s/trial, best loss: -0.7919953322184076]
Наилучшие значения гиперпараметров - {'C1': 0.07, 'penalty1': 0, 'solver1': 2, 'variants': 0}


In [67]:
params_opt = hyperopt.space_eval(space, best_params)

log_reg = linear_model.LogisticRegression(**params_opt, random_state=random_state, max_iter=2000)
log_reg.fit(X_train, y_train)

y_test_pred = log_reg.predict(X_test)
print(f'Гиперпараметры модели - {params_opt}')
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')

Гиперпараметры модели - {'C': 0.07, 'penalty': 'l2', 'solver': 'newton-cholesky'}
f1_score на тестовом наборе: 0.79


### Оптимизация логистической регрессии с помощью Optuna

In [22]:
import optuna

random_state = 42
def optuna_log_reg(trial):
    # var1 = {
    #     'solver': trial.suggest_categorical('solver', ['lbfgs', 'sag', 'newton-cholesky', 'newton-cg']),
    #     'C': trial.suggest_float('C', 0.01, 1),
    #     'penalty': trial.suggest_categorical('penalty', ['l2'])
    # }
    # var2 = {
    #     'solver': trial.suggest_categorical('solver', ['lbfgs', 'sag', 'newton-cholesky', 'newton-cg']),
    #     'C': trial.suggest_float('C', 0.01, 1),
    #     'penalty': trial.suggest_categorical('penalty', ['none'])
    # }
    # var3 = {
    #     'solver': trial.suggest_categorical('solver', ['liblinear', 'saga']),
    #     'C': trial.suggest_float('C', 0.01, 1),
    #     'penalty': trial.suggest_categorical('penalty', ['l2', 'l1'])
    # }
    
    solver = trial.suggest_categorical('solver', ['lbfgs', 'sag', 'newton-cholesky', 'newton-cg'])
    penalty = trial.suggest_categorical('penalty', ['l2', 'none'])
    C = trial.suggest_float('C', 0.01, 1)
    #variants = trial.suggest_categorical('variants', [var1, var2, var3])
    
    model = linear_model.LogisticRegression(solver=solver, 
                                            random_state=random_state, 
                                            max_iter=2000,
                                            C=C,
                                            penalty=penalty)
    
    model.fit(X_train, y_train)
    score = model_selection.cross_val_score(
        estimator=model,
        X=X_train,
        y=y_train,
        scoring='f1',
        cv=5,
        n_jobs=-1
    ).mean()
    
    return score
    

In [23]:

study = optuna.create_study(study_name="LogisticRegression", direction="maximize")
%time study.optimize(optuna_log_reg, n_trials=50)

[32m[I 2023-05-16 18:43:10,013][0m A new study created in memory with name: LogisticRegression[0m
[32m[I 2023-05-16 18:45:15,576][0m Trial 0 finished with value: 0.742379894257529 and parameters: {'solver': 'sag', 'penalty': 'none', 'C': 0.8480378439637759}. Best is trial 0 with value: 0.742379894257529.[0m
[32m[I 2023-05-16 18:45:24,564][0m Trial 1 finished with value: 0.7728364550081914 and parameters: {'solver': 'newton-cg', 'penalty': 'l2', 'C': 0.6885638749108446}. Best is trial 1 with value: 0.7728364550081914.[0m
[32m[I 2023-05-16 18:45:29,110][0m Trial 2 finished with value: 0.7802724328192985 and parameters: {'solver': 'newton-cholesky', 'penalty': 'l2', 'C': 0.24962337935194004}. Best is trial 2 with value: 0.7802724328192985.[0m
[32m[I 2023-05-16 18:45:33,740][0m Trial 3 finished with value: 0.7757655074784271 and parameters: {'solver': 'newton-cholesky', 'penalty': 'l2', 'C': 0.48642865338057645}. Best is trial 2 with value: 0.7802724328192985.[0m
[32m[I 202

CPU times: total: 2min 43s
Wall time: 17min 22s


In [24]:
print("Наилучшие значения гиперпараметров {}".format(study.best_params))
print("f1_score на обучающем наборе: {:.2f}".format(study.best_value))

Наилучшие значения гиперпараметров {'solver': 'newton-cholesky', 'penalty': 'l2', 'C': 0.06304413681525046}
f1_score на обучающем наборе: 0.79


In [25]:
log_reg = linear_model.LogisticRegression(**study.best_params, 
                                        random_state=random_state, 
                                        max_iter=2000,
                                        )
    
log_reg.fit(X_train, y_train)
y_test_pred = log_reg.predict(X_test)
print(f'Гиперпараметры модели - {study.best_params}')
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')

Гиперпараметры модели - {'solver': 'newton-cholesky', 'penalty': 'l2', 'C': 0.06304413681525046}
f1_score на тестовом наборе: 0.79


### <center> Случайный лес

### Стандартная модель с параметрами по умолчанию

In [27]:
# обучаем модель случайного леса с параметрами по умолчанию
model_rf = ensemble.RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)
y_test_pred = model_rf.predict(X_test)
print(f'f1_score на тренировочном наборе: {metrics.f1_score(y_train, model_rf.predict(X_train)):.2f}')
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')

f1_score на тренировочном наборе: 1.00
f1_score на тестовом наборе: 0.79


### Оптимизация логистической регрессии с помощью GridSearchCV

In [33]:
#задаем сетку параметров
params_grid = {
    'criterion': ['entropy','gini'], 
    'max_depth': np.arange(10, 300, 60), 
    'min_samples_leaf': np.arange(3, 30, 6)
    }
    
#инициируем экземпляр GridSearchCV
grid_search = model_selection.GridSearchCV(
    estimator=model_rf,
    param_grid=params_grid,
    cv=5,
    n_jobs=-1
)

#обучаем модель и подбираем параметры
%time grid_search.fit(X_train, y_train)
y_test_pred = grid_search.predict(X_test)
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')

CPU times: total: 1.09 s
Wall time: 45.9 s
f1_score на тестовом наборе: 0.80


In [34]:
#выводим лучшие гиперпараметры модели
grid_search.best_params_

{'criterion': 'entropy', 'max_depth': 70, 'min_samples_leaf': 3}

### Оптимизация модели случайного леса с помощью RandomizedSearchCV

In [36]:
#задаем диапазон гиперпараметров
params_distrib = {
    'criterion': ['entropy','gini'], 
    'max_depth': np.arange(10, 300, 50), 
    'min_samples_leaf': np.arange(3, 30, 1)
    }

#инициируем экземпляр RandomizedSearchCV
random_search = model_selection.RandomizedSearchCV(
    estimator=model_rf,
    param_distributions=params_distrib,
    cv=5,
    n_jobs=-1,
    n_iter=50
)

%time random_search.fit(X_train, y_train)
y_test_pred = random_search.predict(X_test)
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')

CPU times: total: 891 ms
Wall time: 39.3 s
f1_score на тестовом наборе: 0.79


In [37]:
# выведем гиперпараметры модели
print(f'Гиперпараметры модели - {random_search.best_params_}')

Гиперпараметры модели - {'min_samples_leaf': 3, 'max_depth': 110, 'criterion': 'gini'}


### Оптимизация модели случайного леса с помощью Hyperopt

In [38]:
import hyperopt # импотируем Hyperopt
from hyperopt import hp, fmin, tpe, Trials # и необходимые классы

#задаем диапазон гиперпараметров

space = {
        'criterion': hp.choice('criterion', ['gini', 'entropy']),
        'max_depth': hp.quniform('max_depth', 10, 300, 1),
        'min_samples_leaf': hp.quniform('min_samples_leaf', 3, 10, 1)
    }
    
random_state = 42
def hyperopt_rf(params, cv=5, X=X_train, y=y_train, random_state=random_state):
    print(params)
    params = {
        'criterion': str(params['criterion']),
        'max_depth': int(params['max_depth']),
        'min_samples_leaf': int(params['min_samples_leaf'])
    }
    
    model = ensemble.RandomForestClassifier(**params, random_state=random_state)
    
    model.fit(X, y)
    
    score = model_selection.cross_val_score(
        estimator=model,
        X=X,
        y=y,
        scoring='f1',
        cv=5,
        n_jobs=-1
    ).mean()
    
    return -score

In [39]:
trials = Trials()

best_params = fmin(
    hyperopt_rf,
    space=space,
    algo=tpe.suggest,
    trials=trials,
    max_evals=50,
    rstate=np.random.default_rng(random_state)
)

print(f'Наилучшие значения гиперпараметров - {best_params}')

{'criterion': 'gini', 'max_depth': 190.0, 'min_samples_leaf': 9.0}
{'criterion': 'entropy', 'max_depth': 186.0, 'min_samples_leaf': 5.0}            
{'criterion': 'gini', 'max_depth': 31.0, 'min_samples_leaf': 4.0}                
{'criterion': 'entropy', 'max_depth': 72.0, 'min_samples_leaf': 6.0}             
{'criterion': 'gini', 'max_depth': 38.0, 'min_samples_leaf': 7.0}                
{'criterion': 'entropy', 'max_depth': 57.0, 'min_samples_leaf': 8.0}             
{'criterion': 'gini', 'max_depth': 299.0, 'min_samples_leaf': 8.0}               
{'criterion': 'gini', 'max_depth': 251.0, 'min_samples_leaf': 3.0}               
{'criterion': 'gini', 'max_depth': 234.0, 'min_samples_leaf': 8.0}               
{'criterion': 'entropy', 'max_depth': 149.0, 'min_samples_leaf': 5.0}            
{'criterion': 'entropy', 'max_depth': 22.0, 'min_samples_leaf': 3.0}              
{'criterion': 'entropy', 'max_depth': 186.0, 'min_samples_leaf': 8.0}             
{'criterion': 'gini', 'max_de

In [42]:
params_opt = hyperopt.space_eval(space, best_params)
max_depth = int(params_opt['max_depth'])
min_samples_leaf = int(params_opt['min_samples_leaf'])

model_rf = ensemble.RandomForestClassifier(max_depth=max_depth, 
                                           random_state=random_state,
                                           min_samples_leaf=min_samples_leaf,
                                           criterion=params_opt['criterion'])
model_rf.fit(X_train, y_train)

y_test_pred = model_rf.predict(X_test)
print(f'Гиперпараметры модели - {params_opt}')
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')

Гиперпараметры модели - {'criterion': 'entropy', 'max_depth': 72.0, 'min_samples_leaf': 6.0}
f1_score на тестовом наборе: 0.79


### Оптимизация модели случайного леса с помощью Optuna

In [45]:
import optuna

random_state = 42
def optuna_random_forest(trial):
    
    criterion = trial.suggest_categorical('criterion', ['entropy', 'gini'])
    max_depth = trial.suggest_int('max_depth', 10, 300)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 10)
        
    model = ensemble.RandomForestClassifier(max_depth=max_depth, 
                                            random_state=random_state, 
                                            criterion=criterion,
                                            min_samples_leaf=min_samples_leaf)
    
    model.fit(X_train, y_train)
    score = model_selection.cross_val_score(
        estimator=model,
        X=X_train,
        y=y_train,
        scoring='f1',
        cv=5,
        n_jobs=-1
    ).mean()
    
    return score
    

In [46]:
study = optuna.create_study(study_name="RandomForestClassifier", direction="maximize")
%time study.optimize(optuna_random_forest, n_trials=50)

[32m[I 2023-05-16 20:05:35,216][0m A new study created in memory with name: RandomForestClassifier[0m
[32m[I 2023-05-16 20:05:39,280][0m Trial 0 finished with value: 0.8158904004335706 and parameters: {'criterion': 'entropy', 'max_depth': 13, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.8158904004335706.[0m
[32m[I 2023-05-16 20:05:42,381][0m Trial 1 finished with value: 0.8091485172524348 and parameters: {'criterion': 'gini', 'max_depth': 80, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.8158904004335706.[0m
[32m[I 2023-05-16 20:05:44,795][0m Trial 2 finished with value: 0.8052070176293349 and parameters: {'criterion': 'entropy', 'max_depth': 43, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.8158904004335706.[0m
[32m[I 2023-05-16 20:05:47,033][0m Trial 3 finished with value: 0.805291739257487 and parameters: {'criterion': 'gini', 'max_depth': 129, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.8158904004335706.[0m
[32m[I 2023-05-16 20:0

CPU times: total: 15 s
Wall time: 2min 19s


In [47]:
print("Наилучшие значения гиперпараметров {}".format(study.best_params))
print("f1_score на обучающем наборе: {:.2f}".format(study.best_value))

Наилучшие значения гиперпараметров {'criterion': 'entropy', 'max_depth': 235, 'min_samples_leaf': 6}
f1_score на обучающем наборе: 0.82


In [48]:
model_rf = ensemble.RandomForestClassifier(**study.best_params, 
                                        random_state=random_state)
    
model_rf.fit(X_train, y_train)
y_test_pred = model_rf.predict(X_test)
print(f'Гиперпараметры модели - {study.best_params}')
print(f'f1_score на тестовом наборе: {metrics.f1_score(y_test, y_test_pred):.2f}')

Гиперпараметры модели - {'criterion': 'entropy', 'max_depth': 235, 'min_samples_leaf': 6}
f1_score на тестовом наборе: 0.79
