In [None]:
!pip install optuna

In [None]:
import zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shap
import lightgbm as lgb
import optuna

from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
from sklearn.metrics import f1_score

# --- Распаковка и загрузка данных ---
with zipfile.ZipFile('playground-series-s5e7.zip', 'r') as zip_ref:
    zip_ref.extractall('./')
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# --- Feature Engineering ---
for df in [df_train, df_test]:
    df['alone_ratio'] = df['Time_spent_Alone'] / (
        df['Time_spent_Alone'] + df['Social_event_attendance'] + df['Going_outside'] + 1)
    df['social_ratio'] = df['Social_event_attendance'] / \
        (df['Going_outside'] + 1)
    df['is_high_poster'] = (df['Post_frequency'] > 5).astype(int)

# --- Кодировка таргета ---
target = 'Personality'
label_encoder = LabelEncoder()
df_train['target'] = label_encoder.fit_transform(df_train[target])

# --- Признаки ---
num_features = ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside',
                'Friends_circle_size', 'Post_frequency', 'alone_ratio', 'social_ratio']
cat_features = ['Stage_fear', 'Drained_after_socializing', 'is_high_poster']

X = df_train[num_features + cat_features]
y = df_train['target']
X_test = df_test[num_features + cat_features]

# --- Препроцессинг ---
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('target_encoder', TargetEncoder())
])
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
preprocessor = ColumnTransformer([
    ('num', num_pipe, num_features),
    ('cat', cat_pipe, cat_features)
])

# --- Оптимизация через Optuna ---


def objective(trial):
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state': 42,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'lambda_l1': trial.suggest_float('lambda_l1', 0, 5),
        'lambda_l2': trial.suggest_float('lambda_l2', 0, 5),
    }

    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    f1_scores = []

    for train_idx, valid_idx in skf.split(X, y):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

        X_train_prep = preprocessor.fit_transform(X_train, y_train)
        X_valid_prep = preprocessor.transform(X_valid)

        train_data = lgb.Dataset(X_train_prep, label=y_train)
        valid_data = lgb.Dataset(X_valid_prep, label=y_valid)

        model = lgb.train(
            params,
            train_data,
            valid_sets=[valid_data],
            num_boost_round=1000,
            callbacks=[lgb.early_stopping(
                stopping_rounds=50), lgb.log_evaluation(100)]
        )

        preds = (model.predict(X_valid_prep,
                 num_iteration=model.best_iteration) > 0.5).astype(int)
        f1 = f1_score(y_valid, preds)
        f1_scores.append(f1)

    return np.mean(f1_scores)


print("🚀 Запуск Optuna для подбора гиперпараметров...")
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)
print(f"✅ Лучшие параметры: {study.best_params}")
print(f"✅ Лучший F1 (CV): {study.best_value:.4f}")

# --- Финальное обучение с подобранными параметрами ---
params = study.best_params
params.update({
    'objective': 'binary',
    'metric': 'binary_logloss',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'random_state': 42
})

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
probas = np.zeros(len(X_test))
thresholds = np.linspace(0.3, 0.7, 50)
best_thresholds = []
val_f1_scores = []

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    print(f"Fold {fold + 1}")
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

    X_train_prep = preprocessor.fit_transform(X_train, y_train)
    X_valid_prep = preprocessor.transform(X_valid)
    X_test_prep = preprocessor.transform(X_test)

    train_data = lgb.Dataset(X_train_prep, label=y_train)
    valid_data = lgb.Dataset(X_valid_prep, label=y_valid)

    model = lgb.train(
        params,
        train_data,
        valid_sets=[valid_data],
        num_boost_round=1000,
        callbacks=[lgb.early_stopping(50)]
    )

    val_pred_proba = model.predict(
        X_valid_prep, num_iteration=model.best_iteration)

    best_f1, best_thr = 0, 0.5
    for thr in thresholds:
        preds = (val_pred_proba > thr).astype(int)
        f1 = f1_score(y_valid, preds)
        if f1 > best_f1:
            best_f1 = f1
            best_thr = thr

    print(f"  Лучший порог: {best_thr:.3f}, F1 = {best_f1:.4f}")

    best_thresholds.append(best_thr)
    val_f1_scores.append(best_f1)

    probas += model.predict(X_test_prep,
                            num_iteration=model.best_iteration) / skf.n_splits

final_thr = np.mean(best_thresholds)
print(f"\n📈 Средний F1: {np.mean(val_f1_scores):.4f}")
print(f"📌 Используем финальный порог: {final_thr:.3f}")

# --- Финальное предсказание ---
preds_final = (probas > final_thr).astype(int)
df_test['Personality'] = label_encoder.inverse_transform(preds_final)
df_test[['id', 'Personality']].to_csv('submission.csv', index=False)
print("✅ submission.csv создан и готов к загрузке.")

# --- SHAP анализ ---
explainer = shap.TreeExplainer(model)
X_train_full_prep = preprocessor.transform(X)
shap_values = explainer.shap_values(X_train_full_prep)

if isinstance(shap_values, list):
    shap_values_to_plot = shap_values[1]
else:
    shap_values_to_plot = shap_values

shap.summary_plot(
    shap_values_to_plot,
    X_train_full_prep,
    feature_names=num_features + cat_features,
    plot_type='bar'
)

# --- График важности признаков LightGBM ---
plt.figure(figsize=(10, 6))
lgb.plot_importance(model, max_num_features=20, importance_type='gain')
plt.title('Важность признаков LightGBM')
plt.show()

In [None]:
# --- Сравнение LGBM и RandomForest по ROC и метрикам ---
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


def objective_rf(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    max_depth = trial.suggest_int('max_depth', 4, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
    max_features = trial.suggest_categorical(
        'max_features', ['sqrt', 'log2', None])

    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )

    # 5-fold CV по F1 (weighted для баланса классов)
    scores = cross_val_score(
        rf,
        X_full_prep,  # уже предобработанные признаки
        y,
        cv=5,
        scoring='f1'
    )
    return scores.mean()

# Предобработка и разбиение для честного сравнения
X_full_prep = preprocessor.fit_transform(X, y)
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
    X_full_prep, y, test_size=0.2, stratify=y, random_state=42
)

# --- RandomForest ---
best_rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=8,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1
)
best_rf_model.fit(X_train_split, y_train_split)

# --- LGBM (для ROC сравнения) ---
# Переобучаем LGBM на том же split
model_lgb = lgb.LGBMClassifier(**params, n_estimators=500)
model_lgb.fit(X_train_split, y_train_split)

# --- ROC и метрики ---
y_train_proba_lgb = model_lgb.predict_proba(X_train_split)[:, 1]
y_test_proba_lgb = model_lgb.predict_proba(X_test_split)[:, 1]
y_test_pred_lgb = (y_test_proba_lgb > best_thr).astype(int)

y_train_proba_rf = best_rf_model.predict_proba(X_train_split)[:, 1]
y_test_proba_rf = best_rf_model.predict_proba(X_test_split)[:, 1]
y_test_pred_rf = (y_test_proba_rf > 0.5).astype(int)

# --- ROC-кривые ---
fpr_lgb, tpr_lgb, _ = roc_curve(y_test_split, y_test_proba_lgb)
auc_lgb = roc_auc_score(y_test_split, y_test_proba_lgb)

fpr_rf, tpr_rf, _ = roc_curve(y_test_split, y_test_proba_rf)
auc_rf = roc_auc_score(y_test_split, y_test_proba_rf)

plt.figure(figsize=(7, 7))
plt.plot(fpr_lgb, tpr_lgb, label=f'LGBM (AUC = {auc_lgb:.3f})')
plt.plot(fpr_rf, tpr_rf, label=f'RandomForest (AUC = {auc_rf:.3f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Сравнение ROC-кривых: LGBM vs RandomForest')
plt.legend()
plt.grid(True)
plt.show()

# --- Метрики ---
acc_lgb = accuracy_score(y_test_split, y_test_pred_lgb)
f1_lgb = f1_score(y_test_split, y_test_pred_lgb)

acc_rf = accuracy_score(y_test_split, y_test_pred_rf)
f1_rf = f1_score(y_test_split, y_test_pred_rf)

print(
    f"✅ LGBM: AUC = {auc_lgb:.4f}, Accuracy = {acc_lgb:.4f}, F1 = {f1_lgb:.4f}")
print(f"✅ RF  : AUC = {auc_rf:.4f}, Accuracy = {acc_rf:.4f}, F1 = {f1_rf:.4f}")

In [None]:
import zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shap
import lightgbm as lgb
import optuna

from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
from sklearn.metrics import f1_score, roc_auc_score, roc_curve, accuracy_score
from sklearn.ensemble import RandomForestClassifier

# --- Распаковка и загрузка данных ---
with zipfile.ZipFile('playground-series-s5e7.zip', 'r') as zip_ref:
    zip_ref.extractall('./')
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# --- Feature Engineering ---
for df in [df_train, df_test]:
    df['alone_ratio'] = df['Time_spent_Alone'] / (
        df['Time_spent_Alone'] + df['Social_event_attendance'] + df['Going_outside'] + 1)
    df['social_ratio'] = df['Social_event_attendance'] / \
        (df['Going_outside'] + 1)
    df['is_high_poster'] = (df['Post_frequency'] > 5).astype(int)

# --- Кодировка таргета ---
target = 'Personality'
label_encoder = LabelEncoder()
df_train['target'] = label_encoder.fit_transform(df_train[target])

# --- Признаки ---
num_features = ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside',
                'Friends_circle_size', 'Post_frequency', 'alone_ratio', 'social_ratio']
cat_features = ['Stage_fear', 'Drained_after_socializing', 'is_high_poster']

X = df_train[num_features + cat_features]
y = df_train['target']
X_test = df_test[num_features + cat_features]

# --- Препроцессинг ---
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('target_encoder', TargetEncoder())
])
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
preprocessor = ColumnTransformer([
    ('num', num_pipe, num_features),
    ('cat', cat_pipe, cat_features)
])

# --- Optuna для LGBM ---


def objective(trial):
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state': 42,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'lambda_l1': trial.suggest_float('lambda_l1', 0, 5),
        'lambda_l2': trial.suggest_float('lambda_l2', 0, 5),
    }

    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = []

    for train_idx, valid_idx in skf.split(X, y):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]
        X_train_prep = preprocessor.fit_transform(X_train, y_train)
        X_valid_prep = preprocessor.transform(X_valid)

        train_data = lgb.Dataset(X_train_prep, label=y_train)
        valid_data = lgb.Dataset(X_valid_prep, label=y_valid)

        model = lgb.train(params, train_data, valid_sets=[valid_data], num_boost_round=1000,
                          callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)])

        preds = (model.predict(X_valid_prep,
                 num_iteration=model.best_iteration) > 0.5).astype(int)
        scores.append(f1_score(y_valid, preds))

    return np.mean(scores)


print("🚀 Optuna LGBM запуск...")
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)
print(f"✅ LGBM лучшие параметры: {study.best_params}")
print(f"✅ LGBM лучший F1: {study.best_value:.4f}")

params = study.best_params
params.update({'objective': 'binary', 'metric': 'binary_logloss',
              'boosting_type': 'gbdt', 'random_state': 42})

# --- Optuna для RandomForest ---


def objective_rf(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    max_depth = trial.suggest_int('max_depth', 4, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
    max_features = trial.suggest_categorical(
        'max_features', ['sqrt', 'log2', None])

    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )
    X_full_prep = preprocessor.fit_transform(X, y)
    return cross_val_score(rf, X_full_prep, y, cv=3, scoring='f1').mean()


print("\n🚀 Optuna RandomForest запуск...")
study_rf = optuna.create_study(direction='maximize')
study_rf.optimize(objective_rf, n_trials=30)
print(f"✅ RF лучшие параметры: {study_rf.best_params}")
print(f"✅ RF лучший F1: {study_rf.best_value:.4f}")

# --- Финальное сравнение ---
X_full_prep = preprocessor.fit_transform(X, y)
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
    X_full_prep, y, test_size=0.2, stratify=y, random_state=42)

best_rf_model = RandomForestClassifier(
    **study_rf.best_params, class_weight='balanced', random_state=42, n_jobs=-1)
best_rf_model.fit(X_train_split, y_train_split)

model_lgb = lgb.LGBMClassifier(**params, n_estimators=500)
model_lgb.fit(X_train_split, y_train_split)

# --- Метрики ---
y_test_proba_lgb = model_lgb.predict_proba(X_test_split)[:, 1]
y_test_proba_rf = best_rf_model.predict_proba(X_test_split)[:, 1]

fpr_lgb, tpr_lgb, _ = roc_curve(y_test_split, y_test_proba_lgb)
fpr_rf, tpr_rf, _ = roc_curve(y_test_split, y_test_proba_rf)
auc_lgb = roc_auc_score(y_test_split, y_test_proba_lgb)
auc_rf = roc_auc_score(y_test_split, y_test_proba_rf)

plt.figure(figsize=(7, 7))
plt.plot(fpr_lgb, tpr_lgb, label=f'LGBM (AUC = {auc_lgb:.3f})')
plt.plot(fpr_rf, tpr_rf, label=f'RF (AUC = {auc_rf:.3f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC-кривая: LGBM vs RandomForest')
plt.legend()
plt.grid(True)
plt.show()

# --- Итоговые метрики ---
y_test_pred_lgb = (y_test_proba_lgb > 0.5).astype(int)
y_test_pred_rf = (y_test_proba_rf > 0.5).astype(int)

print(f"✅ LGBM: AUC = {auc_lgb:.4f}, Accuracy = {accuracy_score(y_test_split, y_test_pred_lgb):.4f}, F1 = {f1_score(y_test_split, y_test_pred_lgb):.4f}")
print(f"✅ RF  : AUC = {auc_rf:.4f}, Accuracy = {accuracy_score(y_test_split, y_test_pred_rf):.4f}, F1 = {f1_score(y_test_split, y_test_pred_rf):.4f}")

# SHAP анализ важности признаков
explainer = shap.TreeExplainer(model)
X_train_full_prep = preprocessor.transform(X)

shap_values = explainer.shap_values(X_train_full_prep)

# Обработка формата shap_values
if isinstance(shap_values, list):
    shap_values_to_plot = shap_values[1]  # Для положительного класса
else:
    shap_values_to_plot = shap_values

print(f"SHAP shape: {shap_values_to_plot.shape}")

# Построение графика
shap.summary_plot(
    shap_values_to_plot,
    X_train_full_prep,
    feature_names=num_features + cat_features,
    plot_type='bar'
)

In [None]:
import zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shap
import lightgbm as lgb
import optuna

from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
from sklearn.metrics import f1_score, roc_auc_score, roc_curve, accuracy_score
from sklearn.ensemble import RandomForestClassifier

# --- 1. Распаковка и загрузка данных ---
with zipfile.ZipFile('playground-series-s5e7.zip', 'r') as zip_ref:
    zip_ref.extractall('./')
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# --- 2. Feature Engineering ---
for df in [df_train, df_test]:
    df['alone_ratio'] = df['Time_spent_Alone'] / (
        df['Time_spent_Alone'] + df['Social_event_attendance'] +
        df['Going_outside'] + 1
    )
    df['social_ratio'] = df['Social_event_attendance'] / \
        (df['Going_outside'] + 1)
    df['is_high_poster'] = (df['Post_frequency'] > 5).astype(int)

# --- 3. Кодировка таргета ---
target_col = 'Personality'
label_encoder = LabelEncoder()
df_train['target'] = label_encoder.fit_transform(df_train[target_col])

# --- 4. Определение признаков ---
num_features = [
    'Time_spent_Alone',
    'Social_event_attendance',
    'Going_outside',
    'Friends_circle_size',
    'Post_frequency',
    'alone_ratio',
    'social_ratio'
]

cat_features = [
    'Stage_fear',
    'Drained_after_socializing',
    'is_high_poster'
]

X = df_train[num_features + cat_features]
y = df_train['target']
X_test = df_test[num_features + cat_features]

# --- 5. Предобработка признаков ---
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # добавил smoothing для регуляризации
    ('target_encoder', TargetEncoder(smoothing=0.3))
])

num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('num', num_pipe, num_features),
    ('cat', cat_pipe, cat_features)
])

# --- 6. Optuna: оптимизация гиперпараметров LightGBM ---


def objective_lgb(trial):
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state': 42,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'lambda_l1': trial.suggest_float('lambda_l1', 0, 5),
        'lambda_l2': trial.suggest_float('lambda_l2', 0, 5),
    }

    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    f1_scores = []

    for train_idx, valid_idx in skf.split(X, y):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

        # Фитим препроцессор только на обучающей выборке
        X_train_prep = preprocessor.fit_transform(X_train, y_train)
        X_valid_prep = preprocessor.transform(X_valid)

        train_data = lgb.Dataset(X_train_prep, label=y_train)
        valid_data = lgb.Dataset(X_valid_prep, label=y_valid)

        model = lgb.train(
            params,
            train_data,
            valid_sets=[valid_data],
            num_boost_round=1000,
            callbacks=[lgb.early_stopping(50)]
        )

        preds = (model.predict(X_valid_prep,
                 num_iteration=model.best_iteration) > 0.5).astype(int)
        f1_scores.append(f1_score(y_valid, preds))

    return np.mean(f1_scores)


print("🚀 Запуск оптимизации LightGBM...")
study_lgb = optuna.create_study(
    direction='maximize', study_name='lgbm_f1_optimization')
study_lgb.optimize(objective_lgb, n_trials=30)
print(f"✅ Лучшие параметры LightGBM: {study_lgb.best_params}")
print(f"✅ Лучший F1 LightGBM: {study_lgb.best_value:.4f}")

lgb_params = study_lgb.best_params.copy()
lgb_params.update({'objective': 'binary', 'metric': 'binary_logloss',
                  'boosting_type': 'gbdt', 'random_state': 42})

# --- 7. Optuna: оптимизация гиперпараметров RandomForest ---


def objective_rf(trial):
    rf_params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 4, 20),
        'verbosity': -1,
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'class_weight': 'balanced',
        'random_state': 42,
        'n_jobs': -1
    }
    X_full_prep = preprocessor.fit_transform(X, y)
    rf = RandomForestClassifier(**rf_params)
    return cross_val_score(rf, X_full_prep, y, cv=3, scoring='f1').mean()


print("\n🚀 Запуск оптимизации RandomForest...")
study_rf = optuna.create_study(
    direction='maximize', study_name='rf_f1_optimization')
study_rf.optimize(objective_rf, n_trials=30)
print(f"✅ Лучшие параметры RandomForest: {study_rf.best_params}")
print(f"✅ Лучший F1 RandomForest: {study_rf.best_value:.4f}")

# --- 8. Финальное обучение и сравнение моделей ---
X_full_prep = preprocessor.fit_transform(X, y)
X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(
    X_full_prep, y, test_size=0.2, stratify=y, random_state=42
)

# Обучаем финальную RF модель
best_rf_params = study_rf.best_params.copy()
best_rf_model = RandomForestClassifier(
    **best_rf_params, class_weight='balanced', random_state=42, n_jobs=-1)
best_rf_model.fit(X_train_split, y_train_split)

# Обучаем финальную LGBM модель (sklearn API)
model_lgb = lgb.LGBMClassifier(**lgb_params, n_estimators=500)
model_lgb.fit(X_train_split, y_train_split)

# --- 9. Оценка моделей на тестовом сплите ---
y_pred_proba_lgb = model_lgb.predict_proba(X_valid_split)[:, 1]
y_pred_proba_rf = best_rf_model.predict_proba(X_valid_split)[:, 1]

fpr_lgb, tpr_lgb, _ = roc_curve(y_valid_split, y_pred_proba_lgb)
fpr_rf, tpr_rf, _ = roc_curve(y_valid_split, y_pred_proba_rf)

auc_lgb = roc_auc_score(y_valid_split, y_pred_proba_lgb)
auc_rf = roc_auc_score(y_valid_split, y_pred_proba_rf)

plt.figure(figsize=(7, 7))
plt.plot(fpr_lgb, tpr_lgb, label=f'LGBM (AUC = {auc_lgb:.3f})')
plt.plot(fpr_rf, tpr_rf, label=f'RandomForest (AUC = {auc_rf:.3f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC-кривая: LGBM vs RandomForest')
plt.legend()
plt.grid(True)
plt.show()

# --- 10. Итоговые метрики ---
y_pred_lgb = (y_pred_proba_lgb > 0.5).astype(int)
y_pred_rf = (y_pred_proba_rf > 0.5).astype(int)

print(f"✅ LGBM: AUC = {auc_lgb:.4f}, Accuracy = {accuracy_score(y_valid_split, y_pred_lgb):.4f}, F1 = {f1_score(y_valid_split, y_pred_lgb):.4f}")
print(f"✅ RF  : AUC = {auc_rf:.4f}, Accuracy = {accuracy_score(y_valid_split, y_pred_rf):.4f}, F1 = {f1_score(y_valid_split, y_pred_rf):.4f}")

# --- 11. SHAP-анализ (LightGBM sklearn API) ---
explainer = shap.Explainer(model_lgb)
shap_values = explainer(X_full_prep)

X_prep_df = pd.DataFrame(X_full_prep, columns=num_features + cat_features)

shap.summary_plot(shap_values, features=X_prep_df, plot_type='bar')

In [1]:
import zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shap
import lightgbm as lgb
import optuna

from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
from sklearn.metrics import f1_score, roc_auc_score, roc_curve, accuracy_score
from sklearn.ensemble import RandomForestClassifier

# --- 1. Распаковка и загрузка данных ---
with zipfile.ZipFile('playground-series-s5e7.zip', 'r') as zip_ref:
    zip_ref.extractall('./')
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# --- 2. Feature Engineering ---
for df in [df_train, df_test]:
    df['alone_ratio'] = df['Time_spent_Alone'] / (
        df['Time_spent_Alone'] + df['Social_event_attendance'] +
        df['Going_outside'] + 1
    )
    df['social_ratio'] = df['Social_event_attendance'] / \
        (df['Going_outside'] + 1)
    df['is_high_poster'] = (df['Post_frequency'] > 5).astype(int)

# --- 3. Кодировка таргета ---
target_col = 'Personality'
label_encoder = LabelEncoder()
df_train['target'] = label_encoder.fit_transform(df_train[target_col])

# --- 4. Определение признаков ---
num_features = [
    'Time_spent_Alone',
    'Social_event_attendance',
    'Going_outside',
    'Friends_circle_size',
    'Post_frequency',
    'alone_ratio',
    'social_ratio'
]

cat_features = [
    'Stage_fear',
    'Drained_after_socializing',
    'is_high_poster'
]

X = df_train[num_features + cat_features]
y = df_train['target']
X_test = df_test[num_features + cat_features]

# --- 5. Предобработка признаков ---
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # добавил smoothing для регуляризации
    ('target_encoder', TargetEncoder(smoothing=0.3))
])

num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('num', num_pipe, num_features),
    ('cat', cat_pipe, cat_features)
])

# --- 6. Optuna: оптимизация гиперпараметров LightGBM ---


def objective_lgb(trial):
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state': 42,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'lambda_l1': trial.suggest_float('lambda_l1', 0, 5),
        'lambda_l2': trial.suggest_float('lambda_l2', 0, 5),
    }

    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    f1_scores = []

    for train_idx, valid_idx in skf.split(X, y):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

        # Фитим препроцессор только на обучающей выборке
        X_train_prep = preprocessor.fit_transform(X_train, y_train)
        X_valid_prep = preprocessor.transform(X_valid)

        train_data = lgb.Dataset(X_train_prep, label=y_train)
        valid_data = lgb.Dataset(X_valid_prep, label=y_valid)

        model = lgb.train(
            params,
            train_data,
            valid_sets=[valid_data],
            num_boost_round=1000,
            callbacks=[lgb.early_stopping(50)]
        )

        preds = (model.predict(X_valid_prep,
                 num_iteration=model.best_iteration) > 0.5).astype(int)
        f1_scores.append(f1_score(y_valid, preds))

    return np.mean(f1_scores)


print("🚀 Запуск оптимизации LightGBM...")
study_lgb = optuna.create_study(
    direction='maximize', study_name='lgbm_f1_optimization')
study_lgb.optimize(objective_lgb, n_trials=30)
print(f"✅ Лучшие параметры LightGBM: {study_lgb.best_params}")
print(f"✅ Лучший F1 LightGBM: {study_lgb.best_value:.4f}")

lgb_params = study_lgb.best_params.copy()
lgb_params.update({'objective': 'binary', 'metric': 'binary_logloss',
                  'boosting_type': 'gbdt', 'random_state': 42})

# --- 7. Optuna: оптимизация гиперпараметров RandomForest ---


def objective_rf(trial):
    rf_params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 4, 20),
        'verbosity': -1,
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'class_weight': 'balanced',
        'random_state': 42,
        'n_jobs': -1
    }
    X_full_prep = preprocessor.fit_transform(X, y)
    rf = RandomForestClassifier(**rf_params)
    return cross_val_score(rf, X_full_prep, y, cv=3, scoring='f1').mean()


print("\n🚀 Запуск оптимизации RandomForest...")
study_rf = optuna.create_study(
    direction='maximize', study_name='rf_f1_optimization')
study_rf.optimize(objective_rf, n_trials=30)
print(f"✅ Лучшие параметры RandomForest: {study_rf.best_params}")
print(f"✅ Лучший F1 RandomForest: {study_rf.best_value:.4f}")

# --- 8. Финальное обучение и сравнение моделей ---
X_full_prep = preprocessor.fit_transform(X, y)
X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(
    X_full_prep, y, test_size=0.2, stratify=y, random_state=42
)

# Обучаем финальную RF модель
best_rf_params = study_rf.best_params.copy()
best_rf_model = RandomForestClassifier(
    **best_rf_params, class_weight='balanced', random_state=42, n_jobs=-1)
best_rf_model.fit(X_train_split, y_train_split)

# Обучаем финальную LGBM модель (sklearn API)
model_lgb = lgb.LGBMClassifier(**lgb_params, n_estimators=500)
model_lgb.fit(X_train_split, y_train_split)

# --- 9. Оценка моделей на тестовом сплите ---
y_pred_proba_lgb = model_lgb.predict_proba(X_valid_split)[:, 1]
y_pred_proba_rf = best_rf_model.predict_proba(X_valid_split)[:, 1]

fpr_lgb, tpr_lgb, _ = roc_curve(y_valid_split, y_pred_proba_lgb)
fpr_rf, tpr_rf, _ = roc_curve(y_valid_split, y_pred_proba_rf)

auc_lgb = roc_auc_score(y_valid_split, y_pred_proba_lgb)
auc_rf = roc_auc_score(y_valid_split, y_pred_proba_rf)

plt.figure(figsize=(7, 7))
plt.plot(fpr_lgb, tpr_lgb, label=f'LGBM (AUC = {auc_lgb:.3f})')
plt.plot(fpr_rf, tpr_rf, label=f'RandomForest (AUC = {auc_rf:.3f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC-кривая: LGBM vs RandomForest')
plt.legend()
plt.grid(True)
plt.show()

# --- 10. Итоговые метрики ---
y_pred_lgb = (y_pred_proba_lgb > 0.5).astype(int)
y_pred_rf = (y_pred_proba_rf > 0.5).astype(int)

print(f"✅ LGBM: AUC = {auc_lgb:.4f}, Accuracy = {accuracy_score(y_valid_split, y_pred_lgb):.4f}, F1 = {f1_score(y_valid_split, y_pred_lgb):.4f}")
print(f"✅ RF  : AUC = {auc_rf:.4f}, Accuracy = {accuracy_score(y_valid_split, y_pred_rf):.4f}, F1 = {f1_score(y_valid_split, y_pred_rf):.4f}")

# --- 11. SHAP-анализ (LightGBM sklearn API) ---
explainer = shap.Explainer(model_lgb)
shap_values = explainer(X_full_prep)

X_prep_df = pd.DataFrame(X_full_prep, columns=num_features + cat_features)

shap.summary_plot(shap_values, features=X_prep_df, plot_type='bar')

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-07-18 17:54:32,386] A new study created in memory with name: lgbm_f1_optimization


🚀 Запуск оптимизации LightGBM...
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[372]	valid_0's binary_logloss: 0.129921
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[259]	valid_0's binary_logloss: 0.136232
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:54:35,750] Trial 0 finished with value: 0.9397796186308222 and parameters: {'learning_rate': 0.018841865000372528, 'num_leaves': 23, 'max_depth': 5, 'feature_fraction': 0.7750323703275586, 'bagging_fraction': 0.9924925409455443, 'bagging_freq': 6, 'lambda_l1': 2.0563234574334324, 'lambda_l2': 3.083987652774656}. Best is trial 0 with value: 0.9397796186308222.


Early stopping, best iteration is:
[297]	valid_0's binary_logloss: 0.123731
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[264]	valid_0's binary_logloss: 0.12991
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[209]	valid_0's binary_logloss: 0.136285
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:54:38,637] Trial 1 finished with value: 0.9398055867362812 and parameters: {'learning_rate': 0.022313064374720925, 'num_leaves': 40, 'max_depth': 6, 'feature_fraction': 0.8034408855886714, 'bagging_fraction': 0.6906781318872681, 'bagging_freq': 8, 'lambda_l1': 2.8197729343208455, 'lambda_l2': 0.04867075601451609}. Best is trial 1 with value: 0.9398055867362812.


Early stopping, best iteration is:
[240]	valid_0's binary_logloss: 0.123872
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[106]	valid_0's binary_logloss: 0.130127
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[91]	valid_0's binary_logloss: 0.136157
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:54:41,538] Trial 2 finished with value: 0.9400015502970849 and parameters: {'learning_rate': 0.04738689953834103, 'num_leaves': 59, 'max_depth': 11, 'feature_fraction': 0.7827921070714139, 'bagging_fraction': 0.7184263069173575, 'bagging_freq': 2, 'lambda_l1': 1.687155922734322, 'lambda_l2': 2.9907316114929565}. Best is trial 2 with value: 0.9400015502970849.


Early stopping, best iteration is:
[114]	valid_0's binary_logloss: 0.123957
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[506]	valid_0's binary_logloss: 0.129902
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[350]	valid_0's binary_logloss: 0.136034
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:54:48,038] Trial 3 finished with value: 0.9400847878867973 and parameters: {'learning_rate': 0.01328080219710985, 'num_leaves': 43, 'max_depth': 9, 'feature_fraction': 0.8061860060227325, 'bagging_fraction': 0.9324465477722028, 'bagging_freq': 9, 'lambda_l1': 3.2175804811276527, 'lambda_l2': 0.01153837188966167}. Best is trial 3 with value: 0.9400847878867973.


Early stopping, best iteration is:
[430]	valid_0's binary_logloss: 0.123621
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[484]	valid_0's binary_logloss: 0.130134
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[389]	valid_0's binary_logloss: 0.136025
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:54:55,022] Trial 4 finished with value: 0.9397926392823216 and parameters: {'learning_rate': 0.012318185690021294, 'num_leaves': 59, 'max_depth': 9, 'feature_fraction': 0.8924856896976399, 'bagging_fraction': 0.6333279263420571, 'bagging_freq': 4, 'lambda_l1': 1.9333657948132794, 'lambda_l2': 2.317597854682396}. Best is trial 3 with value: 0.9400847878867973.


Early stopping, best iteration is:
[404]	valid_0's binary_logloss: 0.124404
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[86]	valid_0's binary_logloss: 0.129432
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[64]	valid_0's binary_logloss: 0.136177
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:54:56,306] Trial 5 finished with value: 0.940162765852571 and parameters: {'learning_rate': 0.07195360877725514, 'num_leaves': 53, 'max_depth': 5, 'feature_fraction': 0.6180898310541443, 'bagging_fraction': 0.6500889875446622, 'bagging_freq': 9, 'lambda_l1': 0.7565431971208131, 'lambda_l2': 2.6996946330536256}. Best is trial 5 with value: 0.940162765852571.


Early stopping, best iteration is:
[72]	valid_0's binary_logloss: 0.12363
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[305]	valid_0's binary_logloss: 0.129768
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[178]	valid_0's binary_logloss: 0.135709
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:54:59,246] Trial 6 finished with value: 0.9400395293381245 and parameters: {'learning_rate': 0.03324536631025935, 'num_leaves': 85, 'max_depth': 10, 'feature_fraction': 0.6058317804682748, 'bagging_fraction': 0.6762289506699961, 'bagging_freq': 5, 'lambda_l1': 3.6972787144915316, 'lambda_l2': 4.904359159950713}. Best is trial 5 with value: 0.940162765852571.


Early stopping, best iteration is:
[202]	valid_0's binary_logloss: 0.123434
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[301]	valid_0's binary_logloss: 0.129609
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[209]	valid_0's binary_logloss: 0.136155
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:55:01,179] Trial 7 finished with value: 0.9399035378973767 and parameters: {'learning_rate': 0.02452877671317121, 'num_leaves': 31, 'max_depth': 4, 'feature_fraction': 0.9098477184940965, 'bagging_fraction': 0.6576940213882952, 'bagging_freq': 4, 'lambda_l1': 0.008106643526042956, 'lambda_l2': 3.9130118024892067}. Best is trial 5 with value: 0.940162765852571.


Early stopping, best iteration is:
[220]	valid_0's binary_logloss: 0.124266
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[419]	valid_0's binary_logloss: 0.129721
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[308]	valid_0's binary_logloss: 0.135975
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:55:05,309] Trial 8 finished with value: 0.9402097535034143 and parameters: {'learning_rate': 0.017072665400123806, 'num_leaves': 32, 'max_depth': 6, 'feature_fraction': 0.6631582600581258, 'bagging_fraction': 0.9289615633367633, 'bagging_freq': 1, 'lambda_l1': 3.311947707959175, 'lambda_l2': 1.4569912147350954}. Best is trial 8 with value: 0.9402097535034143.


Early stopping, best iteration is:
[382]	valid_0's binary_logloss: 0.123759
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[208]	valid_0's binary_logloss: 0.130529
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[160]	valid_0's binary_logloss: 0.137285
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:55:08,664] Trial 9 finished with value: 0.9396679909303662 and parameters: {'learning_rate': 0.025812743503543185, 'num_leaves': 34, 'max_depth': 8, 'feature_fraction': 0.9005358763379769, 'bagging_fraction': 0.7941820667231352, 'bagging_freq': 1, 'lambda_l1': 0.6626961361230727, 'lambda_l2': 0.148365748413834}. Best is trial 8 with value: 0.9402097535034143.


Early stopping, best iteration is:
[171]	valid_0's binary_logloss: 0.124938
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[987]	valid_0's binary_logloss: 0.129173
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[574]	valid_0's binary_logloss: 0.136293
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:55:12,252] Trial 10 finished with value: 0.9401123796654245 and parameters: {'learning_rate': 0.010676632008486, 'num_leaves': 82, 'max_depth': 3, 'feature_fraction': 0.6892308769450365, 'bagging_fraction': 0.8634953591924165, 'bagging_freq': 2, 'lambda_l1': 4.21216873747538, 'lambda_l2': 1.2254750290952314}. Best is trial 8 with value: 0.9402097535034143.


Early stopping, best iteration is:
[682]	valid_0's binary_logloss: 0.12373
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[56]	valid_0's binary_logloss: 0.130496
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[44]	valid_0's binary_logloss: 0.136732
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:55:14,342] Trial 11 finished with value: 0.9401255758130823 and parameters: {'learning_rate': 0.09367238786745828, 'num_leaves': 51, 'max_depth': 7, 'feature_fraction': 0.6010670992038158, 'bagging_fraction': 0.7740520702661302, 'bagging_freq': 10, 'lambda_l1': 0.9776056428351367, 'lambda_l2': 1.6938569473425635}. Best is trial 8 with value: 0.9402097535034143.


Early stopping, best iteration is:
[65]	valid_0's binary_logloss: 0.12423
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[80]	valid_0's binary_logloss: 0.129475
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[65]	valid_0's binary_logloss: 0.13612
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:55:16,046] Trial 12 finished with value: 0.9400388299380538 and parameters: {'learning_rate': 0.09115042762013328, 'num_leaves': 77, 'max_depth': 6, 'feature_fraction': 0.679874819777838, 'bagging_fraction': 0.8716591604893789, 'bagging_freq': 7, 'lambda_l1': 4.857353886136769, 'lambda_l2': 1.2639099467223907}. Best is trial 8 with value: 0.9402097535034143.


Early stopping, best iteration is:
[63]	valid_0's binary_logloss: 0.123839
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[155]	valid_0's binary_logloss: 0.12976
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[86]	valid_0's binary_logloss: 0.136264
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:55:18,272] Trial 13 finished with value: 0.9400009726156547 and parameters: {'learning_rate': 0.060446503109065346, 'num_leaves': 100, 'max_depth': 5, 'feature_fraction': 0.6802358620413486, 'bagging_fraction': 0.886329928511003, 'bagging_freq': 8, 'lambda_l1': 2.7176200787068634, 'lambda_l2': 2.243162384562819}. Best is trial 8 with value: 0.9402097535034143.


Early stopping, best iteration is:
[117]	valid_0's binary_logloss: 0.124066
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[246]	valid_0's binary_logloss: 0.12955
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[159]	valid_0's binary_logloss: 0.135995
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:55:19,573] Trial 14 finished with value: 0.9400143672657162 and parameters: {'learning_rate': 0.04458805276184861, 'num_leaves': 50, 'max_depth': 3, 'feature_fraction': 0.9931152336687703, 'bagging_fraction': 0.6033480959011922, 'bagging_freq': 3, 'lambda_l1': 3.7121925936685503, 'lambda_l2': 3.5884527856865813}. Best is trial 8 with value: 0.9402097535034143.


Early stopping, best iteration is:
[177]	valid_0's binary_logloss: 0.123998
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[341]	valid_0's binary_logloss: 0.129717
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[287]	valid_0's binary_logloss: 0.135817
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:55:23,827] Trial 15 finished with value: 0.9400264114156555 and parameters: {'learning_rate': 0.0161736250859594, 'num_leaves': 23, 'max_depth': 7, 'feature_fraction': 0.6482907631930636, 'bagging_fraction': 0.7410434999347775, 'bagging_freq': 10, 'lambda_l1': 1.080621785487548, 'lambda_l2': 0.8690934093705721}. Best is trial 8 with value: 0.9402097535034143.


Early stopping, best iteration is:
[314]	valid_0's binary_logloss: 0.12375
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[133]	valid_0's binary_logloss: 0.130956
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[112]	valid_0's binary_logloss: 0.136921
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:55:28,004] Trial 16 finished with value: 0.9397919884440418 and parameters: {'learning_rate': 0.03579565238363738, 'num_leaves': 66, 'max_depth': 12, 'feature_fraction': 0.7372464062738422, 'bagging_fraction': 0.9903374330029214, 'bagging_freq': 6, 'lambda_l1': 0.05851695423930259, 'lambda_l2': 1.859736187579111}. Best is trial 8 with value: 0.9402097535034143.


Early stopping, best iteration is:
[118]	valid_0's binary_logloss: 0.125425
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[110]	valid_0's binary_logloss: 0.129405
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[77]	valid_0's binary_logloss: 0.135804
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:55:29,359] Trial 17 finished with value: 0.9399035378973767 and parameters: {'learning_rate': 0.06527049233814453, 'num_leaves': 67, 'max_depth': 5, 'feature_fraction': 0.7193015427226195, 'bagging_fraction': 0.8245566581171991, 'bagging_freq': 1, 'lambda_l1': 2.3383899940443884, 'lambda_l2': 2.7962409914444706}. Best is trial 8 with value: 0.9402097535034143.


Early stopping, best iteration is:
[93]	valid_0's binary_logloss: 0.123793
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[128]	valid_0's binary_logloss: 0.129439
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[73]	valid_0's binary_logloss: 0.135741
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:55:31,289] Trial 18 finished with value: 0.9401243625767509 and parameters: {'learning_rate': 0.06970872784893033, 'num_leaves': 45, 'max_depth': 6, 'feature_fraction': 0.6429581341319968, 'bagging_fraction': 0.9327104669223769, 'bagging_freq': 7, 'lambda_l1': 1.5278969984463453, 'lambda_l2': 3.967177663775179}. Best is trial 8 with value: 0.9402097535034143.


Early stopping, best iteration is:
[82]	valid_0's binary_logloss: 0.123658
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[593]	valid_0's binary_logloss: 0.129148
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[300]	valid_0's binary_logloss: 0.135732
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:55:34,301] Trial 19 finished with value: 0.9403338309212268 and parameters: {'learning_rate': 0.018207728716790647, 'num_leaves': 31, 'max_depth': 4, 'feature_fraction': 0.6376410214445216, 'bagging_fraction': 0.7503112244306667, 'bagging_freq': 5, 'lambda_l1': 3.3954378558850227, 'lambda_l2': 0.8472293826237782}. Best is trial 19 with value: 0.9403338309212268.


Early stopping, best iteration is:
[357]	valid_0's binary_logloss: 0.123573
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[581]	valid_0's binary_logloss: 0.129511
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[333]	valid_0's binary_logloss: 0.135964
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:55:37,154] Trial 20 finished with value: 0.9401123796654245 and parameters: {'learning_rate': 0.01674896257919611, 'num_leaves': 32, 'max_depth': 4, 'feature_fraction': 0.7367049001286108, 'bagging_fraction': 0.8370245839085179, 'bagging_freq': 4, 'lambda_l1': 4.921663558506509, 'lambda_l2': 0.5971033832099351}. Best is trial 19 with value: 0.9403338309212268.


Early stopping, best iteration is:
[353]	valid_0's binary_logloss: 0.123644
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[521]	valid_0's binary_logloss: 0.129213
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[298]	valid_0's binary_logloss: 0.135782
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:55:40,259] Trial 21 finished with value: 0.9403332006939591 and parameters: {'learning_rate': 0.019516981746073744, 'num_leaves': 22, 'max_depth': 4, 'feature_fraction': 0.6360600736175013, 'bagging_fraction': 0.7597982413253331, 'bagging_freq': 5, 'lambda_l1': 3.2761049866255387, 'lambda_l2': 1.5963016393809977}. Best is trial 19 with value: 0.9403338309212268.


Early stopping, best iteration is:
[309]	valid_0's binary_logloss: 0.123588
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[420]	valid_0's binary_logloss: 0.129274
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[300]	valid_0's binary_logloss: 0.135726
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:55:42,962] Trial 22 finished with value: 0.9399416393582665 and parameters: {'learning_rate': 0.02010499804864341, 'num_leaves': 20, 'max_depth': 4, 'feature_fraction': 0.6437612872802227, 'bagging_fraction': 0.7527633293283293, 'bagging_freq': 5, 'lambda_l1': 3.3222309560096703, 'lambda_l2': 1.6024334530068025}. Best is trial 19 with value: 0.9403338309212268.


Early stopping, best iteration is:
[344]	valid_0's binary_logloss: 0.123624
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[870]	valid_0's binary_logloss: 0.129295
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[416]	valid_0's binary_logloss: 0.136203
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:55:46,069] Trial 23 finished with value: 0.9401251274521779 and parameters: {'learning_rate': 0.014570187044105223, 'num_leaves': 29, 'max_depth': 3, 'feature_fraction': 0.7005388385322993, 'bagging_fraction': 0.7084384376963857, 'bagging_freq': 3, 'lambda_l1': 4.09196133196879, 'lambda_l2': 0.7587400701801811}. Best is trial 19 with value: 0.9403338309212268.


Early stopping, best iteration is:
[520]	valid_0's binary_logloss: 0.123949
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[363]	valid_0's binary_logloss: 0.129319
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[232]	valid_0's binary_logloss: 0.135892
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:55:48,441] Trial 24 finished with value: 0.94011187123071 and parameters: {'learning_rate': 0.0270697952024369, 'num_leaves': 39, 'max_depth': 4, 'feature_fraction': 0.6520046733692066, 'bagging_fraction': 0.8111841461965248, 'bagging_freq': 3, 'lambda_l1': 3.1613134733187946, 'lambda_l2': 1.1862706802921492}. Best is trial 19 with value: 0.9403338309212268.


Early stopping, best iteration is:
[224]	valid_0's binary_logloss: 0.123539
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[725]	valid_0's binary_logloss: 0.12939
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[525]	valid_0's binary_logloss: 0.135892
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:55:57,616] Trial 25 finished with value: 0.9402223749764591 and parameters: {'learning_rate': 0.010158191098048901, 'num_leaves': 28, 'max_depth': 6, 'feature_fraction': 0.7545663502979559, 'bagging_fraction': 0.768955662265677, 'bagging_freq': 5, 'lambda_l1': 4.287616289746362, 'lambda_l2': 1.9573412749551617}. Best is trial 19 with value: 0.9403338309212268.


Early stopping, best iteration is:
[625]	valid_0's binary_logloss: 0.12377
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[725]	valid_0's binary_logloss: 0.129456
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[525]	valid_0's binary_logloss: 0.135974
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:56:10,226] Trial 26 finished with value: 0.9397926392823216 and parameters: {'learning_rate': 0.010024602699516048, 'num_leaves': 26, 'max_depth': 7, 'feature_fraction': 0.8700178750648823, 'bagging_fraction': 0.7719473887113398, 'bagging_freq': 5, 'lambda_l1': 4.424804995640062, 'lambda_l2': 1.993766390256305}. Best is trial 19 with value: 0.9403338309212268.


Early stopping, best iteration is:
[625]	valid_0's binary_logloss: 0.123877
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[714]	valid_0's binary_logloss: 0.129314
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[471]	valid_0's binary_logloss: 0.135836
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:56:17,242] Trial 27 finished with value: 0.9399035378973767 and parameters: {'learning_rate': 0.012826705071944736, 'num_leaves': 20, 'max_depth': 4, 'feature_fraction': 0.8477639794463531, 'bagging_fraction': 0.7347456276048089, 'bagging_freq': 6, 'lambda_l1': 3.6707921390172427, 'lambda_l2': 0.421247023137131}. Best is trial 19 with value: 0.9403338309212268.


Early stopping, best iteration is:
[528]	valid_0's binary_logloss: 0.123607
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[429]	valid_0's binary_logloss: 0.129581
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[267]	valid_0's binary_logloss: 0.136168
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:56:20,283] Trial 28 finished with value: 0.9397926392823216 and parameters: {'learning_rate': 0.029387789282351206, 'num_leaves': 38, 'max_depth': 3, 'feature_fraction': 0.7585675776826317, 'bagging_fraction': 0.7824914068392186, 'bagging_freq': 7, 'lambda_l1': 4.465155506052218, 'lambda_l2': 0.9913036826528949}. Best is trial 19 with value: 0.9403338309212268.


Early stopping, best iteration is:
[246]	valid_0's binary_logloss: 0.12367
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[419]	valid_0's binary_logloss: 0.129564
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[312]	valid_0's binary_logloss: 0.136268
Training until validation scores don't improve for 50 rounds


[I 2025-07-18 17:56:27,205] Trial 29 finished with value: 0.9396679909303662 and parameters: {'learning_rate': 0.019824586483995436, 'num_leaves': 25, 'max_depth': 5, 'feature_fraction': 0.9518681417960154, 'bagging_fraction': 0.7518733890623551, 'bagging_freq': 6, 'lambda_l1': 3.9440612167972415, 'lambda_l2': 2.028774519152736}. Best is trial 19 with value: 0.9403338309212268.
[I 2025-07-18 17:56:27,208] A new study created in memory with name: rf_f1_optimization
[W 2025-07-18 17:56:27,292] Trial 0 failed with parameters: {'n_estimators': 160, 'max_depth': 6, 'min_samples_split': 6, 'min_samples_leaf': 4, 'max_features': 'log2'} because of the following error: TypeError("RandomForestClassifier.__init__() got an unexpected keyword argument 'verbosity'").
Traceback (most recent call last):
  File "/Users/antontravkin/Sites/python_rtk/.venv/lib/python3.12/site-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^


Early stopping, best iteration is:
[330]	valid_0's binary_logloss: 0.123703
✅ Лучшие параметры LightGBM: {'learning_rate': 0.018207728716790647, 'num_leaves': 31, 'max_depth': 4, 'feature_fraction': 0.6376410214445216, 'bagging_fraction': 0.7503112244306667, 'bagging_freq': 5, 'lambda_l1': 3.3954378558850227, 'lambda_l2': 0.8472293826237782}
✅ Лучший F1 LightGBM: 0.9403

🚀 Запуск оптимизации RandomForest...


TypeError: RandomForestClassifier.__init__() got an unexpected keyword argument 'verbosity'