In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [2]:
import os

# Project root. Defaults to the original author's location; set the
# GMSC_FOLDER_PATH environment variable to run the notebook on another machine.
FOLDER_PATH = os.environ.get('GMSC_FOLDER_PATH', '/Users/artemzmailov/Desktop/GiveMeSomeCredit/')
# Later cells build paths as `FOLDER_PATH + 'data/...'`, so a trailing separator is required.
if not FOLDER_PATH.endswith('/'):
    FOLDER_PATH += '/'

# The first CSV column is used as the index in all three files; the label file
# is squeezed so that a single-column frame becomes a Series.
dataset_train = pd.read_csv(FOLDER_PATH + 'data/train_full_scaled_pipeline_v1.csv', index_col = 0)
dataset_test = pd.read_csv(FOLDER_PATH + 'data/Kaggle_test_full_scaled_pipeline_v1.csv', index_col = 0)
train_label = pd.read_csv(FOLDER_PATH + 'data/train_label_pipeline_v1.csv', index_col = 0).squeeze()


## Principal Component Analysis

In [3]:
from sklearn.decomposition import PCA

# pca = PCA(n_components = 10)
# pca_train = pca.fit(dataset_train)
# fig = px.bar(np.sort(pca_train.explained_variance_ratio_)[::-1])
# fig.show()

# for i in range(8, 1, -1):
#     print(i)
#     pca = PCA(n_components = i)
#     pca_train = pca.fit(dataset_train)
#     cumsum = np.cumsum(np.sort(pca_train.explained_variance_ratio_)[::-1])
#     fig = px.bar(cumsum)
#     fig.show()

## Data Split

In [4]:
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from sklearn.model_selection import train_test_split

# Stratified 80/20 hold-out split. Later cells (SVC, LinearSVC, the RandomForest
# cross_val_score, the "New EDA" XGBoost, LightGBM and CatBoost baselines) read
# X_train / Y_train / X_test / Y_test, so the split has to run for a fresh-kernel
# "Restart & Run All" to succeed.
X_train, X_test, Y_train, Y_test = train_test_split(
    dataset_train,
    train_label,
    stratify = train_label,
    random_state = 42,
    shuffle = True,
    test_size = 0.2)


# Shared cross-validation splitters (seeded so folds are reproducible).
repeated_kfold = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 2, random_state = 42)
kfold = StratifiedKFold(n_splits = 5,shuffle = True, random_state = 42)


In [18]:
# print(Y_train.sum() / X_train.shape[0], Y_test.sum() / X_test.shape[0])

In [38]:
# Class-imbalance ratio used as scale_pos_weight: (#negatives) / (#positives).
# Counting by label avoids relying on value_counts() ordering, which silently
# assumes that the positive class (label 1) is the minority class.
n_positive = int((train_label == 1).sum())
n_negative = int(len(train_label) - n_positive)
weights_scale = n_negative / n_positive
weights_scale

## Linear Models

## BASELINE

In [7]:
%%time
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from sklearn.model_selection import train_test_split

repeated_kfold = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 2, random_state = 42)
kfold = StratifiedKFold(n_splits = 5,shuffle = True, random_state = 42)

from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score

# Baseline: class-balanced logistic regression, C chosen by repeated stratified CV on ROC AUC.
log_reg = LogisticRegressionCV(cv = repeated_kfold,
                                class_weight = 'balanced',
                                random_state = 42,
                                scoring = 'roc_auc',
                                n_jobs = -1).fit(dataset_train, train_label)

# Per the scikit-learn docs, scores_[1] is a (n_folds, n_Cs) grid for class 1.
# Average over folds first, then take the best column: that is the CV AUC of the
# selected C. A plain .mean() over the whole grid would mix in every rejected C.
mean_auc_per_c = log_reg.scores_[1].mean(axis = 0)

print(f"Лучший C: {log_reg.C_}")
print(f"CV scores для каждого C: {log_reg.scores_}")
print(f"Средний AUC на CV: {mean_auc_per_c.max():.4f}")

In [5]:
# Grand mean of the CV score grid of the first class key (averaged over every fold and every C).
next(iter(log_reg.scores_.values())).mean()

In [15]:
import joblib
# Serialize the fitted logistic-regression model to the models folder.
baseline_model_path = FOLDER_PATH + 'models/log_reg.pkl'
joblib.dump(log_reg, baseline_model_path)


In [17]:
%%time
from sklearn.linear_model import LogisticRegressionCV
import numpy as np

log_reg = LogisticRegressionCV(
    cv=5,                          # 5-fold cross-validation
    class_weight='balanced',       # re-weight classes
    random_state=42,
    scoring='roc_auc',             # model selection metric: AUC
    n_jobs=-1,                     # use every CPU core
    Cs=np.logspace(-3, 3, 20),     # 20 values of C from 0.001 to 1000
    penalty='l2',                  # L2 regularisation
    solver='lbfgs',
    max_iter=2000,
    tol=1e-4,
    verbose=1                      # show progress
).fit(dataset_train, train_label)

# Per the scikit-learn docs, scores_[1] is a (n_folds, n_Cs) grid. Report the
# statistics of the column belonging to the selected C: the mean over folds is its
# CV AUC and the std over folds is its fold-to-fold spread. The previous version
# averaged over all C values and took the std across the C grid instead.
cv_auc_grid = log_reg.scores_[1]
best_c_idx = int(np.argmax(cv_auc_grid.mean(axis=0)))
best_c_fold_auc = cv_auc_grid[:, best_c_idx]

print("=" * 60)
print("ЛОГИСТИЧЕСКАЯ РЕГРЕССИЯ С CV")
print("=" * 60)
print(f"Лучший C: {log_reg.C_[0]:.4f}")
print(f"Средний AUC на CV: {best_c_fold_auc.mean():.4f}")
print(f"Стандартное отклонение: {best_c_fold_auc.std():.4f}")

In [20]:
from sklearn.linear_model import LogisticRegressionCV
import numpy as np

# Refined grid around the previously found optimum C = 483.29
log_reg_final = LogisticRegressionCV(
    cv=5,
    class_weight='balanced',
    random_state=42,
    scoring='roc_auc',
    n_jobs=-1,
    Cs=[100, 200, 300, 400, 450, 475, 483, 490, 500, 550, 600, 700, 800, 900, 1000],  # targeted search
    penalty='l1',
    solver='liblinear',
    max_iter=3000,
    tol=1e-4,
    verbose=1,
    refit=True
).fit(dataset_train, train_label)

# Per the scikit-learn docs, scores_[1] is a (n_folds, n_Cs) grid. Report mean/std
# over folds for the selected C only; the previous version averaged over the whole
# grid and measured the spread across C values rather than across folds.
final_auc_grid = log_reg_final.scores_[1]
final_best_c_idx = int(np.argmax(final_auc_grid.mean(axis=0)))
final_best_c_fold_auc = final_auc_grid[:, final_best_c_idx]

print("=" * 60)
print("ФИНАЛЬНАЯ ЛОГИСТИЧЕСКАЯ РЕГРЕССИЯ (L1) - УТОЧНЕНИЕ C")
print("=" * 60)
print(f"Лучший C: {log_reg_final.C_[0]:.4f}")
print(f"Средний AUC на CV: {final_best_c_fold_auc.mean():.4f}")
print(f"Стандартное отклонение: {final_best_c_fold_auc.std():.4f}")
# Number of features with a non-zero coefficient (L1 can zero some out).
print(f"Отобрано признаков: {np.sum(log_reg_final.coef_[0] != 0)}/{len(log_reg_final.coef_[0])}")

In [26]:
from sklearn.linear_model import LogisticRegression
# Final L1 logistic regression with a fixed C, fitted on the full training set.
# n_jobs is intentionally not passed: with solver='liblinear' scikit-learn
# ignores it and only emits a warning.
log_reg_final = LogisticRegression(
    class_weight='balanced',
    random_state=42,
    C = 550,
    penalty='l1',
    solver='liblinear',
    max_iter=3000,
    tol=1e-4,
).fit(dataset_train, train_label)

import joblib
joblib.dump(log_reg_final, FOLDER_PATH + 'models/log_reg_best.pkl')


# Submission file: positive-class probability per test row, keyed by the test index.
result = pd.DataFrame({'Id': dataset_test.index, 'Probability': log_reg_final.predict_proba(dataset_test)[:,1]})
result.to_csv(FOLDER_PATH +'data/result_log_reg_pipeline_v1.csv', index = False)

In [14]:
# Baseline submission from whichever model `log_reg` currently refers to
# (NOTE(review): `log_reg` is assigned by two different cells above).
baseline_test_proba = log_reg.predict_proba(dataset_test)[:, 1]
result = pd.DataFrame({'Id': dataset_test.index, 'Probability': baseline_test_proba})
result.to_csv(FOLDER_PATH +'data/result_baseline_pipeline_v1.csv', index = False)

## SVC

In [24]:
%%time
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
# Compare the four SVC kernels (other hyper-parameters left at their defaults)
# by cross-validated ROC AUC. X_train / Y_train are expected to come from the
# hold-out split cell.
svc = SVC(class_weight = 'balanced', random_state = 42)
params = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
grid_search_cv = GridSearchCV(
    estimator = svc,
    param_grid = params,
    scoring = 'roc_auc',
    cv = kfold,
    n_jobs = -1,
    verbose = 2,
)
grid_search_cv = grid_search_cv.fit(X_train, Y_train)

In [20]:
# RBF-kernel SVC with C = 1.
rbf_svc_kwargs = dict(
    random_state = 42,
    class_weight = 'balanced',
    verbose = 2,
    C = 1,
    kernel = 'rbf',
    degree = 3,
)
svc = SVC(**rbf_svc_kwargs).fit(X_train, Y_train)
# NOTE(review): cross_val_score clones the estimator and refits it on folds of
# X_test / Y_test, so the fit on X_train above does not influence CV_score —
# confirm this is the intended evaluation.
CV_score = cross_val_score(svc,
                           X_test,
                           Y_test,
                           scoring = 'roc_auc',
                           cv = kfold,
                           n_jobs = -1)
CV_score.mean()

In [26]:
# Tabulate the per-kernel cross-validation results of the SVC grid search.
pd.DataFrame.from_dict(grid_search_cv.cv_results_)

In [28]:
# Display the SVC refitted with the best-scoring kernel.
grid_search_cv.best_estimator_

## Линейный SVC


In [47]:
%%time
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Grid search over tolerance and C for a class-balanced linear SVM
# (L2 penalty, squared hinge loss), scored by ROC AUC.
linear_svc = LinearSVC(
    penalty = 'l2',
    loss = 'squared_hinge',
    class_weight = 'balanced',
    max_iter = 10000,
    random_state = 42,
)
params = {
    'tol': [1e-4, 1e-3, 1e-2, 1e-1],
    'C': [0.01, 0.1, 1, 10, 100],
}
linear_search_cv = GridSearchCV(
    estimator = linear_svc,
    param_grid = params,
    scoring = 'roc_auc',
    cv = kfold,
    n_jobs = -1,
    verbose = 2,
)
linear_search_cv = linear_search_cv.fit(X_train, Y_train)

In [48]:
# Display the LinearSVC refitted with the best (tol, C) combination.
linear_search_cv.best_estimator_

In [51]:
best_linear_svc = linear_search_cv.best_estimator_
# LinearSVC exposes no predict_proba, and predict() would write hard 0/1 labels
# into a column that is ranked by AUC. Use the signed margin instead and squash it
# with the logistic function: the mapping is monotone (ranking unchanged) and the
# values stay inside (0, 1).
linear_svc_margin = best_linear_svc.decision_function(dataset_test)
linear_svc_score = 1.0 / (1.0 + np.exp(-linear_svc_margin))
# Id is the test index as-is, consistent with the other submission cells of this
# notebook (the former `+ 1` shifted every Id by one).
result_linear_svc = pd.DataFrame({'Id': dataset_test.index, 'Probability': linear_svc_score})
result_linear_svc.to_csv(FOLDER_PATH +'data/result_linear_svc.csv', index = False)

In [46]:
# `linear_search_cv_2` is not defined anywhere in this notebook (leftover from a
# removed cell); inspect the search object that actually exists.
linear_search_cv.best_estimator_

## SVC RBF

In [21]:
from sklearn.metrics import roc_auc_score
# RBF SVC with C = 10: fit on the training part, score the hold-out part by the
# ROC AUC of its decision-function values.
svc_rbf = SVC(kernel = 'rbf', C = 10, class_weight = 'balanced', random_state = 42)
svc_rbf = svc_rbf.fit(X_train, Y_train)
y_score = svc_rbf.decision_function(X_test)
roc_auc_rbf = roc_auc_score(Y_test, y_score)
roc_auc_rbf

In [22]:
# The SVC was fitted without probability estimates, and predict() would write hard
# 0/1 labels into a column that is ranked by AUC. Use the decision-function margin
# squashed by the logistic function (monotone, values in (0, 1)).
svc_rbf_margin = svc_rbf.decision_function(dataset_test)
svc_rbf_score = 1.0 / (1.0 + np.exp(-svc_rbf_margin))
# Id is the test index as-is, consistent with the other submission cells of this
# notebook (the former `+ 1` shifted every Id by one).
result_rbf = pd.DataFrame({'Id': dataset_test.index, 'Probability': svc_rbf_score})
result_rbf.to_csv(FOLDER_PATH +'data/result_rbf.csv', index = False)

## Decision Tree

In [15]:
%%time
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
# Exhaustive grid search for a class-balanced decision tree, scored by ROC AUC.
dtc = DecisionTreeClassifier(class_weight = 'balanced', random_state = 42)
dtc_params = {
    'max_depth': [3, 5, 7,8, 10,15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4,6,8,10],
    # 'log_loss' is documented by scikit-learn as the same Shannon information gain
    # criterion as 'entropy'; listing both only repeated a third of the grid.
    'criterion': ['gini', 'entropy'],
    'min_impurity_decrease': [0.0, 0.001, 0.005, 0.01]}
dtc_search_cv = GridSearchCV(estimator = dtc,
                              param_grid = dtc_params,
                              cv = kfold,
                              scoring = 'roc_auc',
                              n_jobs = -1,
                              verbose = 2).fit(dataset_train, train_label)
          

In [17]:
dtc_score = pd.DataFrame(dtc_search_cv.cv_results_)
# Keep candidates with mean CV AUC above 0.85, best first. Filtering with a mask
# built from an already sorted Series does not sort the result: pandas realigns
# the mask to the frame's index, so the rows came back in grid order.
dtc_best_params = (
    dtc_score.loc[dtc_score['mean_test_score'] > 0.85]
    .sort_values('mean_test_score', ascending = False)['params']
)
for row in dtc_best_params.items():
    print(row)

In [16]:
# Best mean cross-validated ROC AUC found by the decision-tree grid search.
dtc_search_cv.best_score_

In [11]:
# Decision tree refitted with the best parameter combination.
dtc_search_cv.best_estimator_

In [12]:
#dtc_score.iloc[dtc_score['mean_test_score'].sort_values(ascending = False).index].head(50)

In [19]:
from sklearn.metrics import roc_auc_score
# ROC AUC of the best tree on dataset_train, i.e. on the same data the grid
# search was fitted on, so this is a training-set figure and not a validation one.
best_dtc = dtc_search_cv.best_estimator_
dtc_train_proba = best_dtc.predict_proba(dataset_train)
dtc_y_proba = dtc_train_proba[:, 1]
dtc_roc_auc = roc_auc_score(train_label, dtc_y_proba)
dtc_roc_auc

In [22]:
# Decision-tree submission: positive-class probability per test row.
dtc_test_proba = best_dtc.predict_proba(dataset_test)[:, 1]
result_dtc = pd.DataFrame({'Id': dataset_test.index, 'Probability': dtc_test_proba})
result_dtc.to_csv(FOLDER_PATH +'data/result_dtc_pipeline_v1.csv', index = False)

## Random Forest

In [23]:
%%time
from sklearn.ensemble import RandomForestClassifier
# Grid search for a class-balanced random forest (100 trees), 3-fold stratified CV on ROC AUC.
rfc = RandomForestClassifier(n_estimators = 100, n_jobs = -1, random_state = 42, class_weight = 'balanced', criterion = 'gini')
rfc_params = {
    'max_depth': [8, 10, 15, None],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 4],
    # 'log_loss' is documented by scikit-learn as the same Shannon information gain
    # criterion as 'entropy'; listing both only repeated a third of the grid.
    'criterion': ['gini', 'entropy']}
kfold_rfc = StratifiedKFold(n_splits = 3,shuffle = True, random_state = 42)
rfc_search_cv = GridSearchCV(estimator = rfc,
                              param_grid = rfc_params,
                              cv = kfold_rfc,
                              scoring = 'roc_auc',
                              n_jobs = -1,
                              verbose = 2).fit(dataset_train, train_label)

In [24]:
# Full table of random-forest grid-search results.
pd.DataFrame(rfc_search_cv.cv_results_)

In [25]:
# Best mean cross-validated ROC AUC of the random-forest grid search.
rfc_search_cv.best_score_

In [141]:
rfc_score = pd.DataFrame(rfc_search_cv.cv_results_)
# Candidates with mean CV AUC above 0.855, best first. A mask built from a sorted
# Series is realigned to the frame's index by pandas, so it filtered but never
# sorted the rows; filter first, then sort explicitly.
rfc_best_params = (
    rfc_score.loc[rfc_score['mean_test_score'] > 0.855,
                  ['params', 'mean_test_score', 'rank_test_score']]
    .sort_values('mean_test_score', ascending = False)
)
for row in rfc_best_params.iterrows():
    # row is an (index, Series) pair; print every field of the Series.
    for idx, val in row[1].items():
        print(f'{idx}: {val}')

In [115]:
# Mean CV scores of all random-forest candidates, highest first.
rfc_score['mean_test_score'].sort_values(ascending = False)

In [27]:
# Best mean cross-validated ROC AUC of the random-forest grid search (repeated display).
rfc_search_cv.best_score_

In [172]:
%%time
# Top-5 candidate from the grid search: mean_test_score: 0.8597021546202702
# Re-created here with 200 trees and cross-validated on X_train / Y_train.
rfc_top5_kwargs = dict(
    n_estimators = 200,
    criterion = 'gini',
    max_depth = 8,
    min_samples_split = 10,
    min_samples_leaf = 4,
    class_weight = 'balanced',
    random_state = 42,
    n_jobs = -1,
)
rfc_best_estimator = RandomForestClassifier(**rfc_top5_kwargs)

rfc_cv_score = cross_val_score(
    rfc_best_estimator,
    X_train,
    Y_train,
    scoring = 'roc_auc',
    cv = kfold_rfc,
    n_jobs = -1,
    verbose = 2,
)

In [174]:
# Mean ROC AUC over the 3 folds for the hand-picked random forest.
rfc_cv_score.mean()

In [28]:
# Fit the hand-picked random forest on the full training set.
# NOTE(review): a later cell rebinds `rfc_best` to rfc_search_cv.best_estimator_.
rfc_best = rfc_best_estimator.fit(dataset_train, train_label)

In [29]:
# Despite the variable name, these scores are computed on dataset_train (the data
# the search estimator was fitted on), so the AUC is a training-set figure.
rfc_search_best = rfc_search_cv.best_estimator_
rfc_test_score = rfc_search_best.predict_proba(dataset_train)[:, 1]
rfc_roc_auc = roc_auc_score(train_label, rfc_test_score)
rfc_roc_auc

In [58]:
# From here on `rfc_best` is the grid-search winner (used by the ensemble cells below).
rfc_best = rfc_search_cv.best_estimator_

In [33]:
# Random-forest submission from the grid-search winner.
rfc_submission_proba = rfc_search_cv.best_estimator_.predict_proba(dataset_test)[:, 1]
result_rfc = pd.DataFrame({'Id': dataset_test.index, 'Probability': rfc_submission_proba})
result_rfc.to_csv(FOLDER_PATH +'data/result_rfc_pipeline_v1.csv', index = False)
result_rfc

## XGBoost

In [34]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
# XGBoost with its main hyper-parameters spelled out explicitly, fitted on the full training set.
xgb_explicit_kwargs = {
    'verbosity': 2,
    'n_estimators': 100,
    'learning_rate': 0.3,
    'min_split_loss': 0,
    'max_depth': 6,
    'min_child_weight': 1,
    'reg_lambda': 1,
    'reg_alpha': 0,
    'scale_pos_weight': 1,
    'random_state': 42,
}
xgb = XGBClassifier(**xgb_explicit_kwargs).fit(dataset_train, train_label)

In [35]:
# Baseline XGBoost (library defaults, fixed seed). The AUC below is computed on
# dataset_train, i.e. on the training data itself.
xgb_baseline = XGBClassifier(random_state = 42)
xgb_baseline.fit(dataset_train, train_label)
xgb_test_score = xgb_baseline.predict_proba(dataset_train)[:, 1]
xgb_roc_auc = roc_auc_score(train_label, xgb_test_score)
xgb_roc_auc

In [36]:
# Submission for the baseline XGBoost model.
xgb_baseline_proba = xgb_baseline.predict_proba(dataset_test)[:, 1]
result_xgb_baseline = pd.DataFrame({'Id': dataset_test.index, 'Probability': xgb_baseline_proba})
result_xgb_baseline.to_csv(FOLDER_PATH +'data/result_xgb_baseline_pipeline_v1.csv', index = False)

In [40]:
from sklearn.model_selection import RandomizedSearchCV

# Ratio of the most frequent class count to the second most frequent one, used as
# scale_pos_weight (presumably negatives / positives — verify the class balance).
weights_scale = train_label.value_counts().iloc[0] / train_label.value_counts().iloc[1]

xgb = XGBClassifier(random_state = 42, scale_pos_weight = weights_scale, n_estimators = 100)
kfold_xgb = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 42)
xgb_params = {
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2, 0.3, 0.5],
    'max_depth': [3, 4, 5, 6, 7],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.001, 0.01, 0.1, 0.5, 1],
    'reg_lambda': [0.5, 1, 1.5, 2],
    'min_child_weight': [1, 3, 5, 7],
    'min_split_loss': [0, 0.001,0.01,0.1]
}
# random_state fixes which 300 candidates are sampled; without it every run
# explores a different subset and the reported best score is not reproducible.
rand_search_cv = RandomizedSearchCV(estimator = xgb,
                                   param_distributions = xgb_params,
                                   n_iter = 300,
                                   scoring = 'roc_auc',
                                   n_jobs = -1,
                                   cv = kfold_xgb,
                                   random_state = 42,
                                   verbose = 2).fit(dataset_train,train_label)

In [41]:
# Ranked view of the random-search results (best rank first).
xgb_search_res = pd.DataFrame(rand_search_cv.cv_results_).loc[:,['params','mean_test_score', 'rank_test_score']].sort_values(by = ['rank_test_score'])
for idx,row in xgb_search_res.iterrows():
    # Access fields by label: integer keys on a label-indexed Series are treated
    # as positions only through a deprecated pandas fallback.
    print(row['mean_test_score'],row['rank_test_score'])
    for i,val in row['params'].items():
        print(f'{i}: {val}')
    print('-'*30)

In [25]:
# Display the ranked random-search table.
xgb_search_res

In [42]:
# Best mean cross-validated ROC AUC of the XGBoost random search.
rand_search_cv.best_score_

In [43]:
# XGBoost model refitted with the best sampled parameters.
rand_search_cv.best_estimator_

In [44]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
# xgb_best_estimator = XGBClassifier(n_estimators = 100,
#                         random_state = 42,
#                         scale_pos_weight = weights_scale,
#                         colsample_bytree = 0.7,
#                         learning_rate = 0.1,
#                         max_depth = 4,
#                         min_child_weight = 5,
#                         reg_alpha = 1,
#                         reg_lambda = 1,
#                         subsample = 0.9,
#                         min_split_loss = 0.001)

# Re-check the random-search winner with a separate 3-fold cross-validation on the full training set.
xgb_best_estimator = rand_search_cv.best_estimator_

xgb_cv_score = cross_val_score(
    xgb_best_estimator,
    dataset_train,
    train_label,
    scoring = 'roc_auc',
    cv = kfold_xgb,
    n_jobs = -1,
    verbose = 2,
)

In [45]:
# Mean ROC AUC over the 3 folds for the tuned XGBoost model.
xgb_cv_score.mean()

In [46]:
# Refit the tuned model on all training rows; the AUC below is measured on that
# same training data (variable names kept for later cells).
xgb_best = xgb_best_estimator.fit(dataset_train, train_label)
xgb_best_train_proba = xgb_best.predict_proba(dataset_train)
xgb_best_test_score = xgb_best_train_proba[:, 1]
xgb_best_roc_auc = roc_auc_score(train_label, xgb_best_test_score)
xgb_best_roc_auc

In [47]:
# Submission for the tuned XGBoost model.
xgb_best_submission_proba = xgb_best.predict_proba(dataset_test)[:, 1]
result_xgb_best = pd.DataFrame({'Id': dataset_test.index, 'Probability': xgb_best_submission_proba})
result_xgb_best.to_csv(FOLDER_PATH +'data/result_xgb_best_local_pipeline_v1.csv', index = False)

## New EDA

In [8]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Hand-picked XGBoost configuration fitted on the training part of the hold-out split.
xgb_new_eda_best_model = XGBClassifier(n_estimators = 100,
                                        random_state = 42,
                                        scale_pos_weight = weights_scale,
                                        colsample_bytree = 0.7,
                                        learning_rate = 0.1,
                                        max_depth = 4,
                                        min_child_weight = 5,
                                        reg_alpha = 1,
                                        reg_lambda = 1,
                                        subsample = 0.9).fit(X_train, Y_train)

# Hold-out ROC AUC.
xgb_test_score = xgb_new_eda_best_model.predict_proba(X_test)[:,1]
xgb_roc_auc = roc_auc_score(Y_test, xgb_test_score)
print(xgb_roc_auc)
# Submit positive-class probabilities: predict() would write hard 0/1 labels into
# the 'Probability' column, throwing away the ranking that AUC measures. Id is the
# test index as-is, consistent with the other submission cells of this notebook
# (the former `+ 1` shifted every Id by one).
res_xgb_new_eda_best_model = pd.DataFrame({'Id': dataset_test.index, 'Probability': xgb_new_eda_best_model.predict_proba(dataset_test)[:,1]})
res_xgb_new_eda_best_model.to_csv(FOLDER_PATH +'data/res_new_eda_xgb.csv', index = False)

## LightGBM

In [29]:
#LGBM Baseline
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
# LightGBM baseline: defaults plus class re-weighting, 3-fold stratified CV on ROC AUC.
kfold_lgbm = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 42)
lgbm_baseline_estimator = LGBMClassifier(scale_pos_weight = weights_scale, random_state = 42)
lgbm_cv_score = cross_val_score(
    lgbm_baseline_estimator,
    X_train,
    Y_train,
    scoring = 'roc_auc',
    cv = kfold_lgbm,
    n_jobs = -1,
    verbose = 2,
)

In [30]:
# Mean ROC AUC over the 3 folds for the LightGBM baseline.
lgbm_cv_score.mean()

LGBMClassifier(colsample_bytree=0.7162219560799004,
               learning_rate=0.003721300336530525, max_depth=5,
               n_estimators=2520, random_state=0, reg_lambda=7.017000573168227,
               subsample=0.6716479399150603)

In [31]:
# Same baseline configuration fitted on the training part and scored on the hold-out part.
lgbm_best = LGBMClassifier(random_state=42, scale_pos_weight = weights_scale)
lgbm_best = lgbm_best.fit(X_train, Y_train)
lgbm_best_test_score = lgbm_best.predict_proba(X_test)[:, 1]
lgbm_best_roc_auc = roc_auc_score(Y_test, lgbm_best_test_score)
lgbm_best_roc_auc

## CatBoost

In [13]:
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score
# CatBoost baseline with automatic class balancing, 3-fold stratified CV on ROC AUC.
kfold_catbst = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 42)
catbst_baseline_estimator = CatBoostClassifier(
    random_seed=42,
    auto_class_weights='Balanced',
    verbose=True,
)
catbst_cv_score = cross_val_score(
    catbst_baseline_estimator,
    X_train,
    Y_train,
    scoring = 'roc_auc',
    cv = kfold_catbst,
    n_jobs = -1,
    verbose = 2,
)

In [15]:
# Mean ROC AUC over the 3 folds for the CatBoost baseline.
catbst_cv_score.mean()

## Kaggle XGBoost model

In [48]:
import json
import pandas as pd

# Load the best XGBoost parameters saved as JSON and list them.
best_params_path = FOLDER_PATH +'kaggle_xgb/best_xgb_params.json'
with open(best_params_path, 'r') as f:
    best_params = json.load(f)

print("ЛУЧШИЕ ПАРАМЕТРЫ:")
print("=" * 40)
for key in best_params:
    value = best_params[key]
    print(f"{key}: {value}")

In [52]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
import joblib

# NOTE: rebinds X_train / y_train to the FULL training set as float32 / raw arrays.
X_train = dataset_train.values.astype(np.float32)
y_train = train_label.values
final_params = best_params.copy()

# Settings that matter most for the final model
final_params.update({
    'random_state': 42,
    'n_estimators': 5000,
    'early_stopping_rounds': 100,
    'verbosity': 1,
    'n_jobs': -1,
})

print("\nПАРАМЕТРЫ ФИНАЛЬНОЙ МОДЕЛИ:")
print("=" * 40)
for key, value in final_params.items():
    print(f"{key:25}: {value}")

# 4. Build and train the model
print("\nОБУЧЕНИЕ ФИНАЛЬНОЙ МОДЕЛИ...")
print("-" * 40)

# Stage 1 - find the number of trees. Early stopping must watch rows the model is
# NOT trained on: monitoring the training set itself (as before) measures only
# memorisation, so the 100-round patience said nothing about generalisation.
X_fit, X_stop, y_fit, y_stop = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)
probe_model = XGBClassifier(**final_params)
probe_model.fit(
    X_fit,
    y_fit,
    eval_set=[(X_stop, y_stop)],
    verbose=100  # log progress every 100 trees
)
n_trees = probe_model.best_iteration + 1  # best_iteration is 0-based
print(f"Early stopping selected n_estimators = {n_trees}")

# Stage 2 - refit on all rows with the tree count fixed and early stopping disabled.
refit_params = {k: v for k, v in final_params.items() if k != 'early_stopping_rounds'}
refit_params['n_estimators'] = n_trees
final_model = XGBClassifier(**refit_params)
final_model.fit(X_train, y_train)

print("Финальная модель обучена!")

# 5. Save the final model
# joblib.dump(final_model, 'final_xgb_model_gpu.pkl')
# print("✓ Финальная модель сохранена в final_xgb_model_gpu.pkl")

In [53]:
# Submission for the XGBoost model trained with the Kaggle-tuned parameters.
xgb_kaggle_proba = final_model.predict_proba(dataset_test)[:, 1]
res_xgb_kaggle = pd.DataFrame({'Id': dataset_test.index, 'Probability': xgb_kaggle_proba})
res_xgb_kaggle.to_csv(FOLDER_PATH +'data/xgb_kaggle_pipeline_v1.csv', index = False)

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBClassifier, cv
import json
import joblib
from sklearn.model_selection import train_test_split

# ============================================================================
# 1. DIAGNOSTICS: inspect the learning curve
# ============================================================================

banner = "=" * 60
print(banner)
print("ДИАГНОСТИКА ОПТИМАЛЬНОГО n_estimators")
print(banner)

# Load the best parameters found on Kaggle
with open(FOLDER_PATH +'kaggle_xgb/best_xgb_params.json', 'r') as f:
    best_params = json.load(f)

# Data preparation: work on raw arrays when pandas objects are supplied
X = dataset_train.values if isinstance(dataset_train, pd.DataFrame) else dataset_train
y = train_label.values if isinstance(train_label, pd.Series) else train_label

# Stratified train/validation split (80/20)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train size: {X_train.shape}, Validation size: {X_val.shape}")

# Adapt the parameters for CPU: drop the device entry, map gpu_hist -> hist
params = best_params.copy()
params.pop('device', None)
if params.get('tree_method') == 'gpu_hist':
    params['tree_method'] = 'hist'

# Base parameters
params.update({
    'random_state': 42,
    'n_jobs': -1,
    'eval_metric': 'auc',
    'verbosity': 0,
})

# ============================================================================
# 2. Try different n_estimators with early stopping
# ============================================================================

print("\n" + "=" * 60)
print("ТЕСТИРУЕМ РАЗНОЕ КОЛИЧЕСТВО ДЕРЕВЬЕВ")
print("=" * 60)

n_estimators_options = [100, 250, 500, 750, 1000, 1500, 2000, 3000]
results = []

for n_est in n_estimators_options:
    model = XGBClassifier(**params, n_estimators=n_est, early_stopping_rounds=50)

    print(f"\nТестируем n_estimators = {n_est}...")

    # Train while monitoring the validation split
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    # Read the metric history
    if hasattr(model, 'evals_result_'):
        auc_history = model.evals_result_['validation_0']['auc']
        best_iteration = model.best_iteration if hasattr(model, 'best_iteration') else n_est
        # Report the AUC of the best round. The last history entry belongs to the
        # final trained round, which after early stopping is up to 50 rounds past
        # the best one and therefore understates the achieved validation AUC.
        val_auc = auc_history[min(best_iteration, len(auc_history) - 1)]
    else:
        # No history available: compute the AUC manually
        from sklearn.metrics import roc_auc_score
        y_pred = model.predict_proba(X_val)[:, 1]
        val_auc = roc_auc_score(y_val, y_pred)
        best_iteration = n_est

    results.append({
        'n_estimators': n_est,
        'best_iteration': best_iteration,
        'val_auc': val_auc,
        'actual_used': min(best_iteration, n_est)
    })

    print(f"  Best iteration: {best_iteration}")
    print(f"  Validation AUC: {val_auc:.6f}")

# Collect the results in a DataFrame
results_df = pd.DataFrame(results)
print("\n" + "=" * 60)
print("РЕЗУЛЬТАТЫ:")
print("=" * 60)
print(results_df.to_string())

# ============================================================================
# 3. Visualise the results
# ============================================================================

fig, (ax_auc, ax_stop) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: validation AUC for each n_estimators setting
ax_auc.plot(results_df['n_estimators'], results_df['val_auc'], 'bo-', linewidth=2, markersize=8)
ax_auc.set_xlabel('n_estimators')
ax_auc.set_ylabel('Validation AUC')
ax_auc.set_title('AUC vs n_estimators')
ax_auc.grid(True, alpha=0.3)

# Right panel: round selected by early stopping for each setting
ax_stop.plot(results_df['n_estimators'], results_df['best_iteration'], 'ro-', linewidth=2, markersize=8)
ax_stop.set_xlabel('n_estimators')
ax_stop.set_ylabel('Best iteration (early stopping)')
ax_stop.set_title('Early stopping point')
ax_stop.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('n_estimators_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

# ============================================================================
# 4. Find the optimal number of trees
# ============================================================================

print("\n" + "=" * 60)
print("АНАЛИЗ ОПТИМАЛЬНОГО КОЛИЧЕСТВА")
print("=" * 60)

# Row with the highest validation AUC
best_row = results_df.loc[results_df['val_auc'].idxmax()]
print(f"Лучший результат:")
print(f"  n_estimators: {best_row['n_estimators']}")
print(f"  Best iteration: {best_row['best_iteration']}")
print(f"  Validation AUC: {best_row['val_auc']:.6f}")

# Walk consecutive settings and stop at the first one whose AUC gain over the
# previous setting is below 0.0005.
auc_improvement = []
for i in range(1, len(results_df)):
    prev_row = results_df.iloc[i-1]
    curr_row = results_df.iloc[i]
    improvement = curr_row['val_auc'] - prev_row['val_auc']
    auc_improvement.append(improvement)

    if not improvement < 0.0005:
        continue

    optimal_n = prev_row['n_estimators']
    print(f"\nОптимальная точка (улучшение < 0.05%):")
    print(f"  n_estimators: {optimal_n}")
    print(f"  AUC: {prev_row['val_auc']:.6f}")
    print(f"  Следующее увеличение дает: {improvement:.6f}")
    break

# ============================================================================
# 5. Cross-validation for a more precise estimate
# ============================================================================

print("\n" + "=" * 60)
print("КРОСС-ВАЛИДАЦИЯ С ЛУЧШИМИ ПАРАМЕТРАМИ")
print("=" * 60)

# Tree budget: best early-stopped iteration plus a 20% margin
# (this overrides any optimal_n set by the analysis step).
optimal_n = int(best_row['best_iteration'] * 1.2)

# Parameters handed to xgboost's native cv
cv_params = params.copy()
cv_params.update({'n_estimators': optimal_n})

print(f"Проводим кросс-валидацию с n_estimators = {optimal_n}...")

# Native cv needs a DMatrix (note: this import rebinds the name `xgb` to the module)
import xgboost as xgb
dmatrix = xgb.DMatrix(X, label=y)

# Run 5-fold stratified cross-validation with early stopping
cv_settings = dict(
    num_boost_round=optimal_n,
    nfold=5,
    stratified=True,
    early_stopping_rounds=50,
    seed=42,
    verbose_eval=False,
)
cv_results = xgb.cv(cv_params, dmatrix, **cv_settings)

# The last row of cv_results holds the metrics of the final kept round.
last_cv_round = cv_results.iloc[-1]
print(f"Лучшее количество итераций по CV: {len(cv_results)}")
print(f"Лучший AUC (train): {last_cv_round['train-auc-mean']:.6f}")
print(f"Лучший AUC (test):  {last_cv_round['test-auc-mean']:.6f}")

# ============================================================================
# 6. FINAL MODEL WITH THE SELECTED PARAMETERS
# ============================================================================

print("\n" + "=" * 60)
print("ОБУЧЕНИЕ ФИНАЛЬНОЙ МОДЕЛИ")
print("=" * 60)

# Final tree count, capped at 2000 to keep CPU training time reasonable
final_n_estimators = min(optimal_n, 2000)

print(f"Используем n_estimators = {final_n_estimators}")

# The tree count is already fixed by the diagnostics above, so early stopping is
# NOT enabled here: the only data available is the training set itself, and
# stopping on the training metric says nothing about generalisation.
final_params = params.copy()
final_params.update({
    'n_estimators': final_n_estimators,
    'verbosity': 1,
})

# Train on all data
print("Обучаем финальную модель на всех данных...")
final_model = XGBClassifier(**final_params)

# eval_set is kept purely as a progress log (training AUC every 100 trees).
final_model.fit(
    X, y,
    eval_set=[(X, y)],
    verbose=100
)





In [6]:
# Alias the most recently trained final XGBoost model.
xgb_model_best = final_model

In [7]:
import joblib
# Serialize the final XGBoost model to the models folder.
joblib.dump(xgb_model_best,FOLDER_PATH + 'models/xgb_model_best.pkl')

In [8]:
# Submission for the final XGBoost model.
xgb_model_best_proba = xgb_model_best.predict_proba(dataset_test)[:, 1]
res_xgb_kaggle_best = pd.DataFrame({'Id': dataset_test.index, 'Probability': xgb_model_best_proba})
res_xgb_kaggle_best.to_csv(FOLDER_PATH +'data/xgb_kaggle_best_pipeline_v1.csv', index = False)

In [65]:
import numpy as np
from sklearn.metrics import roc_auc_score

# Predictions of the three models on the training rows.
# NOTE(review): weights are chosen by training-set AUC of models that were fitted
# on (a superset of) these rows, so the selection is optimistic.
xgb_train_pred = final_model.predict_proba(X_train)[:, 1]
rf_train_pred = rfc_best.predict_proba(X_train)[:, 1]
lr_train_pred = log_reg.predict_proba(X_train)[:, 1]

print("Train AUC моделей:")
print(f"XGB: {roc_auc_score(y_train, xgb_train_pred):.6f}")
print(f"RF:  {roc_auc_score(y_train, rf_train_pred):.6f}")
print(f"LR:  {roc_auc_score(y_train, lr_train_pred):.6f}")

# Weight search
best_score = 0
best_weights = None

# Try a coarse grid of (XGB, RF) weights; LR receives the remainder
for xgb_w in [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]:
    for rf_w in [0.1, 0.2, 0.3, 0.4]:
        # Round away binary floating-point noise: 1 - 0.8 - 0.2 evaluates to about
        # -5.6e-17, which previously made the valid (0.8, 0.2, 0.0) combination
        # look negative and get skipped.
        lr_w = round(1 - xgb_w - rf_w, 10)
        if lr_w < 0: continue
        lr_w = max(0.0, lr_w)  # normalise a possible -0.0

        ensemble = xgb_w*xgb_train_pred + rf_w*rf_train_pred + lr_w*lr_train_pred
        score = roc_auc_score(y_train, ensemble)

        if score > best_score:
            best_score = score
            best_weights = (xgb_w, rf_w, lr_w)

print(f"\nЛучшие веса: XGB={best_weights[0]:.2f}, RF={best_weights[1]:.2f}, LR={best_weights[2]:.2f}")
print(f"Train AUC: {best_score:.6f}")


In [69]:
import numpy as np
import pandas as pd

# Weights found by the search above
xgb_weight = 0.7
rf_weight = 0.3

# Predictions from the already trained models
xgb_test_pred = final_model.predict_proba(dataset_test)[:, 1]
rf_test_pred = rfc_best.predict_proba(dataset_test)[:, 1]

# Weighted blend
ensemble_pred = xgb_weight * xgb_test_pred + rf_weight * rf_test_pred

# Write the submission
ensemble_submission_path = FOLDER_PATH +'data/res_ensebmle_xgb_rf_pipeline_v1.csv'
res_ensebmle_xgb_rf = pd.DataFrame({'Id': dataset_test.index, 'Probability': ensemble_pred})
res_ensebmle_xgb_rf.to_csv(ensemble_submission_path, index = False)

# Report the path that was actually written (the message used to name a
# different file, ensemble_xgb07_rf03.csv, that this cell never creates).
print(f"Сабмит создан: {ensemble_submission_path}")
print(f"Веса: XGB={xgb_weight}, RF={rf_weight}")
print(f"Min={ensemble_pred.min():.4f}, Max={ensemble_pred.max():.4f}, Mean={ensemble_pred.mean():.4f}")

In [85]:
# 1. LightGBM (often better than XGBoost)

import lightgbm as lgb

lgb_settings = dict(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42,
    n_jobs=-1,
    verbose=-1,
)
lgb_model = lgb.LGBMClassifier(**lgb_settings)

print("Обучаем LightGBM...")
lgb_model.fit(dataset_train, train_label)

# # 2. LightGBM submission
# lgb_preds = lgb_model.predict_proba(dataset_test)[:, 1]
# lgbm = pd.DataFrame({'Id': dataset_test.index, 'Probability': lgb_preds})
# lgbm.to_csv(FOLDER_PATH +'data/res_lgbm_pipeline_v1.csv', index = False)

# 3. Averaging over all models
print("\nСоздаем усредненный сабмит всех моделей...")

# Test-set probabilities of every model.
# NOTE(review): `cat_model` is only assigned in a cell further down the notebook,
# so this cell depends on out-of-order execution.
xgb_preds = final_model.predict_proba(dataset_test)[:, 1]
rf_preds = rfc_best.predict_proba(dataset_test)[:, 1]
lr_preds = log_reg.predict_proba(dataset_test)[:, 1]
lgb_preds = lgb_model.predict_proba(dataset_test)[:, 1]
cat_preds = cat_model.predict_proba(dataset_test)[:, 1]
# Plain average (computed but not written to any file in this cell)
avg_preds = (xgb_preds + rf_preds + lr_preds + lgb_preds + cat_preds) / 5

# Weighted average: XGB gets xgb_weight, the other four models share the rest
# equally. Weights 0.4-0.7 were tried earlier and are no longer generated.
for xgb_weight in (0.75, 0.8, 0.9, 0.95):
    other_weight = (1 - xgb_weight) / 4
    weighted_preds = (
        xgb_weight * xgb_preds +
        other_weight * rf_preds +
        other_weight * lr_preds +
        other_weight * lgb_preds +
        other_weight * cat_preds
    )

    weighted_submission = pd.DataFrame({'Id': dataset_test.index,'Probability': weighted_preds})
    weighted_submission.to_csv(FOLDER_PATH +f'data/weighted_ensemble_cat_xgb{xgb_weight}.csv', index=False)
    print(f"Создал weighted_ensemble_cat_xgb{xgb_weight}.csv")


print("\n✓ Все сабмиты созданы. Загружай на Kaggle и сравнивай!")

In [86]:
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
import numpy as np

# Base LightGBM estimator for the search
lgb_model = lgb.LGBMClassifier(
    n_estimators=1000,
    random_state=42,
    n_jobs=-1,
    verbose=-1,
)

# Candidate values for the random search
lgb_params = {
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'num_leaves': [15, 31, 63, 127],
    'max_depth': [-1, 5, 7, 9, 12],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1, 0.5, 1, 3],
    'reg_lambda': [0, 0.1, 0.5, 1, 3],
    'min_child_samples': [5, 10, 20, 30],
}

# Quick search: 50 sampled candidates, 3-fold stratified CV, ROC AUC.
# The search itself runs sequentially (n_jobs=1); the estimator uses n_jobs=-1.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
lgb_search_settings = dict(
    n_iter=50,
    scoring='roc_auc',
    cv=cv,
    n_jobs=1,
    random_state=42,
    verbose=1,
)
lgb_search = RandomizedSearchCV(lgb_model, lgb_params, **lgb_search_settings)

print("Ищем параметры для LightGBM...")
lgb_search.fit(dataset_train, train_label)

print(f"Лучший LightGBM: {lgb_search.best_score_:.6f}")

# # Create the submission
# lgb_best = lgb_search.best_estimator_
# lgb_preds = lgb_best.predict_proba(dataset_test)[:, 1]
# pd.DataFrame({'Id': range(len(lgb_preds)), 'Probability': lgb_preds})\
#   .to_csv('lightgbm_tuned.csv', index=False)

In [87]:
# Best LightGBM estimator found by the random search. The string literal that
# follows records the parameters of one such run; being the cell's last
# expression, that string (not this estimator) is what the cell displays.
lgb_search.best_estimator_
'''
Лучший LightGBM: 0.863677

boosting_type 	'gbdt'
num_leaves 	31
max_depth 	5
learning_rate 	0.01
n_estimators 	1000
subsample_for_bin 	200000
objective 	None
class_weight 	None
min_split_gain 	0.0
min_child_weight 	0.001
min_child_samples 	10
subsample 	0.8
subsample_freq 	0
colsample_bytree 	0.6
reg_alpha 	3
reg_lambda 	3
random_state 	42
n_jobs 	-1
importance_type 	'split'
verbose 	-1

'''

In [79]:
from catboost import CatBoostClassifier

# Plain CatBoost model with a handful of fixed hyper-parameters
cat_basic_settings = dict(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=3,
    random_seed=42,
    verbose=100,      # log progress every 100 iterations
    task_type='CPU',  # or 'GPU' when one is available
)
cat_model = CatBoostClassifier(**cat_basic_settings)

cat_model.fit(dataset_train, train_label)

# Positive-class probabilities for the test set, written as a submission file
cat_preds = cat_model.predict_proba(dataset_test)[:, 1]
cat_basic_submission = pd.DataFrame({'Id': dataset_test.index, 'Probability': cat_preds})
cat_basic_submission.to_csv(FOLDER_PATH + 'data/catboost_basic.csv', index=False)

In [None]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
import numpy as np

# Number of sampled candidates; used both by the search and by the log message.
CAT_SEARCH_N_ITER = 50  # can be raised to 100 if time permits

# Base CatBoost estimator for the search
# NOTE(review): early_stopping_rounds needs an eval_set, which RandomizedSearchCV
# does not pass to fit(); it is presumably inert here — confirm.
cat_model = CatBoostClassifier(
    random_seed=42,
    verbose=0,  # silent inside RandomizedSearchCV
    task_type='CPU',  # 'GPU' when one is available
    eval_metric='AUC',
    early_stopping_rounds=50
)

# Candidate values for CatBoost.
# 'bagging_temperature' was removed from the grid: CatBoost accepts it only with
# the Bayesian bootstrap, while 'subsample' is rejected by the Bayesian bootstrap.
# RandomizedSearchCV samples every key for every candidate, so having both in the
# grid made each candidate request two mutually exclusive bootstrap options.
cat_params = {
    'iterations': [1000, 1500, 2000],
    'learning_rate': [0.01, 0.03, 0.05, 0.07, 0.1, 0.15],
    'depth': [4, 5, 6, 7, 8, 9],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'border_count': [32, 64, 128, 256],  # counterpart of max_bin
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bylevel': [0.6, 0.7, 0.8, 0.9, 1.0],
    'min_data_in_leaf': [1, 3, 5, 7, 10],
    'random_strength': [0, 0.5, 1, 2],  # randomness added when scoring splits
    'fold_len_multiplier': [1.5, 2, 2.5],  # fold length growth coefficient (must be > 1)
}

# Cross-validation
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# RandomizedSearch for CatBoost
cat_search = RandomizedSearchCV(
    cat_model,
    cat_params,
    n_iter=CAT_SEARCH_N_ITER,
    scoring='roc_auc',
    cv=cv,
    n_jobs=1,  # CatBoost parallelises internally
    random_state=42,
    verbose=3
)

print("Запускаем поиск параметров для CatBoost...")
print(f"Будет проверено {CAT_SEARCH_N_ITER} комбинаций")

cat_search.fit(dataset_train, train_label)

print(f"\nЛучший CatBoost: {cat_search.best_score_:.6f}")
print("Лучшие параметры:")
for param, value in cat_search.best_params_.items():
    print(f"  {param}: {value}")

# Train the final model with the best parameters
print("\nОбучаем финальный CatBoost на всех данных...")
cat_best = CatBoostClassifier(
    **cat_search.best_params_,
    random_seed=42,
    verbose=100,  # show progress
    task_type='CPU'
)

cat_best.fit(dataset_train, train_label)

# # Создаем сабмит CatBoost
# cat_preds = cat_best.predict_proba(dataset_test)[:, 1]
# pd.DataFrame({
#     'Id': dataset_test.index,
#     'Probability': cat_preds
# }).to_csv(FOLDER_PATH + 'catboost_tuned.csv', index=False)

# print("\n✓ CatBoost с настройкой создан: catboost_tuned.csv")