## Исследование таргета

In [2]:
# Импорт данных
import pandas as pd
train_target = pd.read_csv("train_target.csv", index_col="id")
train_target.head()

Unnamed: 0_level_0,flag
id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0


In [3]:
# Подсчет кол-ва уникальных значений
train_target["flag"].value_counts()

0    2893558
1     106442
Name: flag, dtype: int64

In [4]:
# Проверка на пустые значения
train_target["flag"].isnull().sum()

0

## Эксперименты с данными

In [3]:
# Импорт данных
df = pd.read_parquet("train_data")

In [112]:
df.head()

Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,...,enc_paym_21,enc_paym_22,enc_paym_23,enc_paym_24,enc_loans_account_holder_type,enc_loans_credit_status,enc_loans_credit_type,enc_loans_account_cur,pclose_flag,fclose_flag
0,0,1,18,9,2,3,16,10,11,3,...,3,3,3,4,1,3,4,1,0,0
1,0,2,18,9,14,14,12,12,0,3,...,0,0,0,4,1,3,4,1,0,0
2,0,3,18,9,4,8,1,11,11,0,...,0,0,0,4,1,2,3,1,1,1
3,0,4,4,1,9,12,16,7,12,2,...,3,3,3,4,1,3,1,1,0,0
4,0,5,5,12,15,2,11,12,10,2,...,3,3,3,4,1,3,4,1,0,0


In [113]:
# Проверка на пустые значения
df.iloc[:, :60].isnull().sum()

id                               0
rn                               0
pre_since_opened                 0
pre_since_confirmed              0
pre_pterm                        0
pre_fterm                        0
pre_till_pclose                  0
pre_till_fclose                  0
pre_loans_credit_limit           0
pre_loans_next_pay_summ          0
pre_loans_outstanding            0
pre_loans_total_overdue          0
pre_loans_max_overdue_sum        0
pre_loans_credit_cost_rate       0
pre_loans5                       0
pre_loans530                     0
pre_loans3060                    0
pre_loans6090                    0
pre_loans90                      0
is_zero_loans5                   0
is_zero_loans530                 0
is_zero_loans3060                0
is_zero_loans6090                0
is_zero_loans90                  0
pre_util                         0
pre_over2limit                   0
pre_maxover2limit                0
is_zero_util                     0
is_zero_over2limit  

In [114]:
# Проверка на дубликаты
df.duplicated().sum()

0

### Эксперимент 1
Здесь мы опираемся на здравый смысл и преобразуем признаки, исходя из собственных соображений.

In [115]:
#Эти фичи представляют число просрочек в разных интервалах. 
#Вместо использования отдельных признаков, можно объединить их в один признак, который представляет общее количество просрочек.
df["total_overdue_count"] = df["pre_loans5"] + df["pre_loans530"] + df["pre_loans3060"] + df["pre_loans6090"] + df["pre_loans90"]
df.drop(["pre_loans5", "pre_loans530", "pre_loans3060", "pre_loans6090", "pre_loans90"], axis=1, inplace=True)

In [116]:
#Можно объединить их в один признак, который указывает, есть ли просрочки в любом из интервалов.
df["has_overdue_flag"] = 1 - (df["is_zero_loans5"] & df["is_zero_loans530"] & df["is_zero_loans3060"] & df["is_zero_loans6090"] & df["is_zero_loans90"])
df.drop(["is_zero_loans5", "is_zero_loans530", "is_zero_loans3060", "is_zero_loans6090", "is_zero_loans90"], axis=1, inplace=True)

In [117]:
#Можно объединить в один признак, который указывает на отсутствие любой задолженности.
df["has_no_debt_flag"] = df["is_zero_util"] & df["is_zero_over2limit"] & df["is_zero_maxover2limit"]
df.drop(["is_zero_util", "is_zero_over2limit", "is_zero_maxover2limit"], axis=1, inplace=True)

In [118]:
#Можно создать новые признаки, которые отражают разницу между плановыми и фактическими сроками.
df["term_difference"] = df["pre_fterm"] - df["pre_pterm"]
df["close_difference"] = df["pre_till_fclose"] - df["pre_till_pclose"]
df.drop(["pre_fterm", "pre_pterm", "pre_till_fclose", "pre_till_pclose"], axis=1, inplace=True)

In [119]:
df.columns

Index(['id', 'rn', 'pre_since_opened', 'pre_since_confirmed',
       'pre_loans_credit_limit', 'pre_loans_next_pay_summ',
       'pre_loans_outstanding', 'pre_loans_total_overdue',
       'pre_loans_max_overdue_sum', 'pre_loans_credit_cost_rate', 'pre_util',
       'pre_over2limit', 'pre_maxover2limit', 'enc_paym_0', 'enc_paym_1',
       'enc_paym_2', 'enc_paym_3', 'enc_paym_4', 'enc_paym_5', 'enc_paym_6',
       'enc_paym_7', 'enc_paym_8', 'enc_paym_9', 'enc_paym_10', 'enc_paym_11',
       'enc_paym_12', 'enc_paym_13', 'enc_paym_14', 'enc_paym_15',
       'enc_paym_16', 'enc_paym_17', 'enc_paym_18', 'enc_paym_19',
       'enc_paym_20', 'enc_paym_21', 'enc_paym_22', 'enc_paym_23',
       'enc_paym_24', 'enc_loans_account_holder_type',
       'enc_loans_credit_status', 'enc_loans_credit_type',
       'enc_loans_account_cur', 'pclose_flag', 'fclose_flag',
       'total_overdue_count', 'has_overdue_flag', 'has_no_debt_flag',
       'term_difference', 'close_difference'],
      dtype='obje

In [121]:
# 'enc_paym_11', 'enc_paym_20', 'enc_paym_24' имеют диапазон от 1 до 4, остальные от 0 до 3, 
# поэтому для удобства подсчёта перевожу их в один диапазон
value_mapping = {
    1: 0,
    2: 1,
    3: 2,
    4: 3
}

columns_to_transform = ['enc_paym_11', 'enc_paym_20', 'enc_paym_24']

for column in columns_to_transform:
    df[column] = df[column].replace(value_mapping)

In [122]:
enc_paym_columns = [f'enc_paym_{i}' for i in range(25)] 

In [123]:
# Подсчитывают кол-во статусов во всех enc_paym_N
import numpy as np
df[f'status_0'] = np.sum(df[enc_paym_columns].values == 0, axis=1)

In [130]:
df[f'status_1'] = np.sum(df[enc_paym_columns].values == 1, axis=1)

In [131]:
df[f'status_2'] = np.sum(df[enc_paym_columns].values == 2, axis=1)

In [132]:
df[f'status_3'] = np.sum(df[enc_paym_columns].values == 3, axis=1)

In [133]:
df.drop(enc_paym_columns, axis=1, inplace=True)

In [134]:
df.head()

Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_loans_next_pay_summ,pre_loans_credit_cost_rate,pre_util,pre_over2limit,pre_maxover2limit,enc_loans_account_holder_type,...,fclose_flag,total_overdue_count,has_overdue_flag,has_no_debt_flag,term_difference,close_difference,status_0,status_1,status_2,status_3
0,0,1,18,9,3,11,16,2,17,1,...,0,39,0,1,1,-6,2,0,0,23
1,0,2,18,9,3,11,16,2,17,1,...,0,39,0,1,0,0,24,0,0,1
2,0,3,18,9,0,8,15,2,17,1,...,1,39,0,0,4,10,24,0,0,1
3,0,4,4,1,2,4,16,2,17,1,...,0,39,1,1,3,-9,11,1,0,13
4,0,5,5,12,2,4,16,2,17,1,...,0,39,0,1,-13,1,7,0,0,18


In [135]:
df.columns

Index(['id', 'rn', 'pre_since_opened', 'pre_since_confirmed',
       'pre_loans_next_pay_summ', 'pre_loans_credit_cost_rate', 'pre_util',
       'pre_over2limit', 'pre_maxover2limit', 'enc_loans_account_holder_type',
       'enc_loans_credit_status', 'enc_loans_credit_type',
       'enc_loans_account_cur', 'pclose_flag', 'fclose_flag',
       'total_overdue_count', 'has_overdue_flag', 'has_no_debt_flag',
       'term_difference', 'close_difference', 'status_0', 'status_1',
       'status_2', 'status_3'],
      dtype='object')

In [164]:
# Определяю как аггрегировать
aggregations = {
    'rn': 'max', 
    'pre_since_opened': 'mean',
    'pre_since_confirmed': 'median', 
    'pre_loans_next_pay_summ': 'median',
    'pre_loans_credit_cost_rate': 'median',
    'pre_util': 'median',
    'pre_over2limit': 'median',
    'pre_maxover2limit': 'median',
    'enc_loans_account_holder_type': 'median',
    'enc_loans_credit_status': 'median',
    'enc_loans_credit_type': 'median',
    'enc_loans_account_cur': 'median',
    'pclose_flag': 'median',
    'fclose_flag': 'median',
    'total_overdue_count': 'median',
    'has_overdue_flag': 'mean',
    'has_no_debt_flag': 'mean',
    'term_difference': 'mean',
    'close_difference': 'mean',
    'status_0': 'median',
    'status_1': 'median',
    'status_2': 'median',
    'status_3': 'median'
}

In [165]:
# Группирую с аггрегацией
grouped_df = df.groupby('id').agg(aggregations).reset_index()

In [167]:
# Чекпоинт данных
grouped_df.to_csv("grouped_features.csv")

In [168]:
grouped_df

Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_loans_next_pay_summ,pre_loans_credit_cost_rate,pre_util,pre_over2limit,pre_maxover2limit,enc_loans_account_holder_type,...,fclose_flag,total_overdue_count,has_overdue_flag,has_no_debt_flag,term_difference,close_difference,status_0,status_1,status_2,status_3
0,0,10,8.100000,9.0,3.0,9.5,16.0,2.0,17.0,1.0,...,0.0,39.0,0.100000,0.600000,0.400000,-0.700000,9.5,0.0,0.0,15.0
1,1,14,11.428571,7.0,2.0,4.0,16.0,2.0,17.0,1.0,...,0.0,39.0,0.571429,0.714286,1.285714,-3.071429,8.5,0.0,0.0,15.5
2,2,3,8.333333,9.0,1.0,4.0,6.0,2.0,17.0,1.0,...,1.0,39.0,0.333333,0.333333,-1.000000,4.000000,10.0,0.0,0.0,15.0
3,3,15,7.000000,9.0,2.0,4.0,16.0,2.0,17.0,1.0,...,0.0,39.0,0.000000,0.533333,0.200000,2.933333,14.0,0.0,0.0,11.0
4,4,1,12.000000,9.0,1.0,0.0,16.0,2.0,17.0,1.0,...,1.0,39.0,0.000000,1.000000,4.000000,10.000000,0.0,0.0,0.0,25.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999995,2999995,11,8.818182,9.0,2.0,4.0,16.0,2.0,17.0,1.0,...,0.0,39.0,0.727273,0.545455,0.000000,0.000000,13.0,2.0,0.0,12.0
2999996,2999996,13,8.000000,11.0,2.0,2.0,16.0,2.0,17.0,1.0,...,0.0,39.0,0.076923,0.692308,-0.538462,0.538462,12.0,0.0,0.0,13.0
2999997,2999997,10,8.100000,7.0,2.0,5.0,16.0,2.0,17.0,1.0,...,0.0,39.0,0.400000,0.500000,-0.700000,0.900000,10.0,0.0,0.0,14.0
2999998,2999998,5,11.600000,9.0,2.0,4.0,9.0,2.0,17.0,1.0,...,0.0,39.0,0.200000,0.200000,1.600000,1.800000,4.0,0.0,0.0,21.0


In [17]:
grouped_df = pd.read_csv("grouped_features.csv", index_col="Unnamed: 0")

In [18]:
grouped_df.head()

Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_loans_next_pay_summ,pre_loans_credit_cost_rate,pre_util,pre_over2limit,pre_maxover2limit,enc_loans_account_holder_type,...,fclose_flag,total_overdue_count,has_overdue_flag,has_no_debt_flag,term_difference,close_difference,status_0,status_1,status_2,status_3
0,0,10,8.1,9.0,3.0,9.5,16.0,2.0,17.0,1.0,...,0.0,39.0,0.1,0.6,0.4,-0.7,9.5,0.0,0.0,15.0
1,1,14,11.428571,7.0,2.0,4.0,16.0,2.0,17.0,1.0,...,0.0,39.0,0.571429,0.714286,1.285714,-3.071429,8.5,0.0,0.0,15.5
2,2,3,8.333333,9.0,1.0,4.0,6.0,2.0,17.0,1.0,...,1.0,39.0,0.333333,0.333333,-1.0,4.0,10.0,0.0,0.0,15.0
3,3,15,7.0,9.0,2.0,4.0,16.0,2.0,17.0,1.0,...,0.0,39.0,0.0,0.533333,0.2,2.933333,14.0,0.0,0.0,11.0
4,4,1,12.0,9.0,1.0,0.0,16.0,2.0,17.0,1.0,...,1.0,39.0,0.0,1.0,4.0,10.0,0.0,0.0,0.0,25.0


In [19]:
# объединение с таргетом
final_df = grouped_df.merge(train_target, how="left", on="id")

In [20]:
final_df.head()

Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_loans_next_pay_summ,pre_loans_credit_cost_rate,pre_util,pre_over2limit,pre_maxover2limit,enc_loans_account_holder_type,...,total_overdue_count,has_overdue_flag,has_no_debt_flag,term_difference,close_difference,status_0,status_1,status_2,status_3,flag
0,0,10,8.1,9.0,3.0,9.5,16.0,2.0,17.0,1.0,...,39.0,0.1,0.6,0.4,-0.7,9.5,0.0,0.0,15.0,0
1,1,14,11.428571,7.0,2.0,4.0,16.0,2.0,17.0,1.0,...,39.0,0.571429,0.714286,1.285714,-3.071429,8.5,0.0,0.0,15.5,0
2,2,3,8.333333,9.0,1.0,4.0,6.0,2.0,17.0,1.0,...,39.0,0.333333,0.333333,-1.0,4.0,10.0,0.0,0.0,15.0,0
3,3,15,7.0,9.0,2.0,4.0,16.0,2.0,17.0,1.0,...,39.0,0.0,0.533333,0.2,2.933333,14.0,0.0,0.0,11.0,0
4,4,1,12.0,9.0,1.0,0.0,16.0,2.0,17.0,1.0,...,39.0,0.0,1.0,4.0,10.0,0.0,0.0,0.0,25.0,0


In [21]:
# Смотрим на все корелляции выше 0.6. Не очень сильная корелляция, поэтому оставим эти признаки.
correlation_matrix = final_df.corr()
correlation_pairs = correlation_matrix.unstack().sort_values(ascending=False)
correlation_pairs = correlation_pairs[correlation_pairs < 1]
high_correlation = correlation_pairs[correlation_pairs > 0.6]
high_correlation

has_no_debt_flag  pre_util            0.627119
pre_util          has_no_debt_flag    0.627119
dtype: float64

In [22]:
# Разделяем значения на X, y, train/test и стандартизируем их.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X = final_df.drop(columns=['id', 'flag'])
y = final_df['flag']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [23]:
# Тестируем на CatBoostClassifier, подбирая параметры с помощью optuna
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
import optuna
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight('balanced', classes=[0, 1], y=y_train)
class_weights_dict = {0: class_weights[0], 1: class_weights[1]}

def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 10),
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'random_seed': 42,
        'verbose': 0,
        'class_weights': class_weights_dict
    }
    
    model = CatBoostClassifier(**params)
    model.fit(X_train_scaled, y_train, eval_set=(X_test_scaled, y_test), early_stopping_rounds=50, verbose=False)
    
    preds = model.predict_proba(X_test_scaled)[:, 1]
    auc = roc_auc_score(y_test, preds)
    
    return auc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

print("Лучшие гиперпараметры:", study.best_params)
print("Лучший ROC-AUC:", study.best_value)

best_params = study.best_params
final_model = CatBoostClassifier(**best_params)
final_model.fit(X_train_scaled, y_train)

final_preds = final_model.predict_proba(X_test_scaled)[:, 1]
final_auc = roc_auc_score(y_test, final_preds)
print("Финальный ROC-AUC:", final_auc)

[I 2024-09-18 09:06:15,096] A new study created in memory with name: no-name-1b97861d-d1d5-4d70-b4b6-5537589aac67
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
[I 2024-09-18 09:08:25,908] Trial 0 finished with value: 0.6891391701556918 and parameters: {'iterations': 658, 'learning_rate': 0.001487526733448147, 'depth': 10, 'l2_leaf_reg': 1}. Best is trial 0 with value: 0.6891391701556918.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
[I 2024-09-18 09:08:50,106] Trial 1 finished with value: 0.6828546756345869 and parameters: {'iterations': 921, 'learning_rate': 6.155447542948876e-05, 'depth': 9, 'l2_leaf_reg': 9}. Best is trial 0 with value: 0.6891391701556918.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
[I 2024-09-18 09:09:08,545] Trial 2 finished with value: 0.6822818359179362 and parameters: {'iterations': 631, 'learning_rate': 5.455209764961053e-05, 'depth': 9, 'l2_leaf_reg': 5}. Best is trial 0 wit

Лучшие гиперпараметры: {'iterations': 698, 'learning_rate': 0.040668444700415675, 'depth': 6, 'l2_leaf_reg': 1}
Лучший ROC-AUC: 0.7024164211632169
0:	learn: 0.6325000	total: 83.5ms	remaining: 58.2s
1:	learn: 0.5768237	total: 167ms	remaining: 58.2s
2:	learn: 0.5286970	total: 247ms	remaining: 57.2s
3:	learn: 0.4869436	total: 316ms	remaining: 54.8s
4:	learn: 0.4498116	total: 385ms	remaining: 53.4s
5:	learn: 0.4159709	total: 461ms	remaining: 53.1s
6:	learn: 0.3867466	total: 532ms	remaining: 52.5s
7:	learn: 0.3602850	total: 610ms	remaining: 52.6s
8:	learn: 0.3369039	total: 684ms	remaining: 52.4s
9:	learn: 0.3162234	total: 762ms	remaining: 52.4s
10:	learn: 0.2982364	total: 835ms	remaining: 52.1s
11:	learn: 0.2824142	total: 916ms	remaining: 52.4s
12:	learn: 0.2686754	total: 980ms	remaining: 51.7s
13:	learn: 0.2558643	total: 1.06s	remaining: 51.8s
14:	learn: 0.2446729	total: 1.13s	remaining: 51.4s
15:	learn: 0.2343439	total: 1.21s	remaining: 51.6s
16:	learn: 0.2253417	total: 1.29s	remaining: 5

159:	learn: 0.1442905	total: 12.6s	remaining: 42.4s
160:	learn: 0.1442830	total: 12.7s	remaining: 42.3s
161:	learn: 0.1442751	total: 12.8s	remaining: 42.3s
162:	learn: 0.1442685	total: 12.9s	remaining: 42.2s
163:	learn: 0.1442629	total: 12.9s	remaining: 42.1s
164:	learn: 0.1442567	total: 13s	remaining: 42s
165:	learn: 0.1442509	total: 13.1s	remaining: 42s
166:	learn: 0.1442451	total: 13.2s	remaining: 41.9s
167:	learn: 0.1442390	total: 13.2s	remaining: 41.8s
168:	learn: 0.1442306	total: 13.3s	remaining: 41.7s
169:	learn: 0.1442253	total: 13.4s	remaining: 41.6s
170:	learn: 0.1442191	total: 13.5s	remaining: 41.5s
171:	learn: 0.1442113	total: 13.6s	remaining: 41.4s
172:	learn: 0.1442019	total: 13.6s	remaining: 41.4s
173:	learn: 0.1441950	total: 13.7s	remaining: 41.3s
174:	learn: 0.1441912	total: 13.8s	remaining: 41.2s
175:	learn: 0.1441855	total: 13.9s	remaining: 41.1s
176:	learn: 0.1441817	total: 13.9s	remaining: 41s
177:	learn: 0.1441755	total: 14s	remaining: 40.9s
178:	learn: 0.1441689	

318:	learn: 0.1436318	total: 24.9s	remaining: 29.5s
319:	learn: 0.1436292	total: 24.9s	remaining: 29.4s
320:	learn: 0.1436270	total: 25s	remaining: 29.4s
321:	learn: 0.1436215	total: 25.1s	remaining: 29.3s
322:	learn: 0.1436198	total: 25.2s	remaining: 29.2s
323:	learn: 0.1436179	total: 25.2s	remaining: 29.1s
324:	learn: 0.1436136	total: 25.3s	remaining: 29.1s
325:	learn: 0.1436114	total: 25.4s	remaining: 29s
326:	learn: 0.1436081	total: 25.5s	remaining: 28.9s
327:	learn: 0.1436052	total: 25.6s	remaining: 28.8s
328:	learn: 0.1436028	total: 25.6s	remaining: 28.7s
329:	learn: 0.1435993	total: 25.7s	remaining: 28.7s
330:	learn: 0.1435969	total: 25.8s	remaining: 28.6s
331:	learn: 0.1435934	total: 25.9s	remaining: 28.5s
332:	learn: 0.1435905	total: 25.9s	remaining: 28.4s
333:	learn: 0.1435884	total: 26s	remaining: 28.4s
334:	learn: 0.1435853	total: 26.1s	remaining: 28.3s
335:	learn: 0.1435817	total: 26.2s	remaining: 28.2s
336:	learn: 0.1435764	total: 26.3s	remaining: 28.1s
337:	learn: 0.1435

477:	learn: 0.1432344	total: 37.1s	remaining: 17.1s
478:	learn: 0.1432329	total: 37.2s	remaining: 17s
479:	learn: 0.1432294	total: 37.2s	remaining: 16.9s
480:	learn: 0.1432279	total: 37.3s	remaining: 16.8s
481:	learn: 0.1432264	total: 37.4s	remaining: 16.8s
482:	learn: 0.1432247	total: 37.5s	remaining: 16.7s
483:	learn: 0.1432228	total: 37.5s	remaining: 16.6s
484:	learn: 0.1432214	total: 37.6s	remaining: 16.5s
485:	learn: 0.1432195	total: 37.7s	remaining: 16.4s
486:	learn: 0.1432167	total: 37.8s	remaining: 16.4s
487:	learn: 0.1432136	total: 37.8s	remaining: 16.3s
488:	learn: 0.1432124	total: 37.9s	remaining: 16.2s
489:	learn: 0.1432099	total: 38s	remaining: 16.1s
490:	learn: 0.1432082	total: 38.1s	remaining: 16.1s
491:	learn: 0.1432048	total: 38.2s	remaining: 16s
492:	learn: 0.1432034	total: 38.3s	remaining: 15.9s
493:	learn: 0.1432021	total: 38.3s	remaining: 15.8s
494:	learn: 0.1432006	total: 38.4s	remaining: 15.8s
495:	learn: 0.1431988	total: 38.5s	remaining: 15.7s
496:	learn: 0.1431

637:	learn: 0.1429375	total: 49.2s	remaining: 4.63s
638:	learn: 0.1429368	total: 49.3s	remaining: 4.55s
639:	learn: 0.1429353	total: 49.4s	remaining: 4.47s
640:	learn: 0.1429345	total: 49.4s	remaining: 4.39s
641:	learn: 0.1429335	total: 49.5s	remaining: 4.32s
642:	learn: 0.1429321	total: 49.6s	remaining: 4.24s
643:	learn: 0.1429300	total: 49.7s	remaining: 4.16s
644:	learn: 0.1429279	total: 49.7s	remaining: 4.09s
645:	learn: 0.1429261	total: 49.8s	remaining: 4.01s
646:	learn: 0.1429238	total: 49.9s	remaining: 3.93s
647:	learn: 0.1429215	total: 50s	remaining: 3.86s
648:	learn: 0.1429199	total: 50s	remaining: 3.78s
649:	learn: 0.1429188	total: 50.1s	remaining: 3.7s
650:	learn: 0.1429164	total: 50.2s	remaining: 3.62s
651:	learn: 0.1429146	total: 50.3s	remaining: 3.55s
652:	learn: 0.1429127	total: 50.4s	remaining: 3.47s
653:	learn: 0.1429114	total: 50.4s	remaining: 3.39s
654:	learn: 0.1429098	total: 50.5s	remaining: 3.31s
655:	learn: 0.1429078	total: 50.6s	remaining: 3.24s
656:	learn: 0.142

Итог 0.7006790899890878

### Эксперимент 2
Здесь мы попробуем не преобразовывать признаки и аггрегировать их как есть.

In [5]:
# Импортируем данные
df = pd.read_parquet("train_data")

In [8]:
df.columns

Index(['id', 'rn', 'pre_since_opened', 'pre_since_confirmed', 'pre_pterm',
       'pre_fterm', 'pre_till_pclose', 'pre_till_fclose',
       'pre_loans_credit_limit', 'pre_loans_next_pay_summ',
       'pre_loans_outstanding', 'pre_loans_total_overdue',
       'pre_loans_max_overdue_sum', 'pre_loans_credit_cost_rate', 'pre_loans5',
       'pre_loans530', 'pre_loans3060', 'pre_loans6090', 'pre_loans90',
       'is_zero_loans5', 'is_zero_loans530', 'is_zero_loans3060',
       'is_zero_loans6090', 'is_zero_loans90', 'pre_util', 'pre_over2limit',
       'pre_maxover2limit', 'is_zero_util', 'is_zero_over2limit',
       'is_zero_maxover2limit', 'enc_paym_0', 'enc_paym_1', 'enc_paym_2',
       'enc_paym_3', 'enc_paym_4', 'enc_paym_5', 'enc_paym_6', 'enc_paym_7',
       'enc_paym_8', 'enc_paym_9', 'enc_paym_10', 'enc_paym_11', 'enc_paym_12',
       'enc_paym_13', 'enc_paym_14', 'enc_paym_15', 'enc_paym_16',
       'enc_paym_17', 'enc_paym_18', 'enc_paym_19', 'enc_paym_20',
       'enc_paym_21', 

In [9]:
# Определяем аггрегации и группируем
aggregations = {
    'rn': 'max', 
    'pre_since_opened': 'median',
    'pre_since_confirmed': 'median', 
    'pre_pterm': 'median',
    'pre_fterm': 'median',
    'pre_till_pclose': 'median',
    'pre_till_fclose': 'median',
    'pre_loans_credit_limit': 'median',
    'pre_loans_next_pay_summ': 'median',
    'pre_loans_outstanding': 'median',
    'pre_loans_total_overdue': 'median',
    'pre_loans_max_overdue_sum': 'median',
    'pre_loans_credit_cost_rate': 'median',
    'pre_loans5': 'median',
    'pre_loans530': 'median',
    'pre_loans3060': 'median',
    'pre_loans6090': 'median',
    'pre_loans90': 'median',
    'is_zero_loans5': 'mean',
    'is_zero_loans530': 'mean',
    'is_zero_loans3060': 'mean',
    'is_zero_loans6090': 'mean',
    'is_zero_loans90': 'mean',
    'pre_util': 'median',
    'pre_over2limit': 'median',
    'pre_maxover2limit': 'median',
    'is_zero_util': 'mean',
    'is_zero_over2limit': 'mean',
    'is_zero_maxover2limit': 'mean',
    'enc_loans_account_holder_type': 'median',
    'enc_loans_credit_status': 'median',
    'enc_loans_account_cur': 'median',
    'enc_loans_credit_type': 'median',
    'pclose_flag': 'median',
    'fclose_flag': 'median',
    'enc_paym_0': 'median',
    'enc_paym_1': 'median',
    'enc_paym_2': 'median',
    'enc_paym_3': 'median',
    'enc_paym_4': 'median',
    'enc_paym_5': 'median',
    'enc_paym_6': 'median',
    'enc_paym_7': 'median',
    'enc_paym_8': 'median',
    'enc_paym_9': 'median',
    'enc_paym_10': 'median',
    'enc_paym_11': 'median',
    'enc_paym_12': 'median',
    'enc_paym_13': 'median',
    'enc_paym_14': 'median',
    'enc_paym_15': 'median',
    'enc_paym_16': 'median',
    'enc_paym_17': 'median',
    'enc_paym_18': 'median',
    'enc_paym_19': 'median',
    'enc_paym_20': 'median',
    'enc_paym_21': 'median',
    'enc_paym_22': 'median',
    'enc_paym_23': 'median',
    'enc_paym_24': 'median'
}
grouped_df = df.groupby('id').agg(aggregations).reset_index()

In [10]:
# Соединяем с таргетом
final_df = grouped_df.merge(train_target, how="left", on="id")

In [11]:
final_df.corr()

Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,...,enc_paym_16,enc_paym_17,enc_paym_18,enc_paym_19,enc_paym_20,enc_paym_21,enc_paym_22,enc_paym_23,enc_paym_24,flag
id,1.000000,0.074279,0.041197,0.012538,0.012521,0.008800,0.014166,-0.003865,-0.012971,-0.022741,...,-0.010049,-0.005368,-0.001678,0.003597,0.007220,0.010184,0.012385,0.013781,-0.010858,-0.000037
rn,0.074279,1.000000,0.022856,-0.011241,0.027872,-0.016063,0.073501,-0.055806,0.016918,-0.200790,...,0.158870,0.164487,0.170377,0.174816,0.176665,0.178037,0.179105,0.180341,0.158081,-0.014138
pre_since_opened,0.041197,0.022856,1.000000,-0.015967,0.000199,0.018226,-0.032863,-0.054148,-0.006013,0.005154,...,-0.035592,-0.044443,-0.046777,-0.043951,-0.039876,-0.036881,-0.037089,-0.040171,-0.038730,0.011584
pre_since_confirmed,0.012538,-0.011241,-0.015967,1.000000,-0.025293,0.016467,0.021289,-0.071005,-0.010780,0.006327,...,-0.026685,-0.024848,-0.022593,-0.021504,-0.019905,-0.018877,-0.017182,-0.015935,-0.006572,-0.006152
pre_pterm,0.012521,0.027872,0.000199,-0.025293,1.000000,0.083474,0.275438,-0.071958,0.016123,-0.058705,...,0.081110,0.084519,0.088255,0.092226,0.093451,0.094822,0.095662,0.096543,0.046033,-0.007316
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
enc_paym_21,0.010184,0.178037,-0.036881,-0.018877,0.094822,-0.100247,0.002081,-0.017802,0.039268,-0.030261,...,0.744141,0.790044,0.841635,0.893177,0.943114,1.000000,0.942422,0.890901,0.668388,0.023949
enc_paym_22,0.012385,0.179105,-0.037089,-0.017182,0.095662,-0.093724,0.002365,-0.018339,0.037563,-0.030415,...,0.705873,0.749616,0.798934,0.847878,0.894455,0.942422,1.000000,0.939453,0.705367,0.022085
enc_paym_23,0.013781,0.180341,-0.040171,-0.015935,0.096543,-0.086521,0.002936,-0.018804,0.036420,-0.031091,...,0.666947,0.708543,0.755604,0.802060,0.846045,0.890901,0.939453,1.000000,0.747841,0.020732
enc_paym_24,-0.010858,0.158081,-0.038730,-0.006572,0.046033,-0.041839,-0.061665,-0.006396,0.023948,0.015017,...,0.496182,0.527952,0.564167,0.599843,0.633579,0.668388,0.705367,0.747841,1.000000,0.011932


In [14]:
# Смотрим кореляции
correlation_matrix = final_df.corr()
correlation_pairs = correlation_matrix.unstack().sort_values(ascending=False)
correlation_pairs = correlation_pairs[abs(correlation_pairs) < 1]
high_correlation = correlation_pairs[abs(correlation_pairs) > 0.6]
high_correlation

enc_paym_21        enc_paym_20          0.943114
enc_paym_20        enc_paym_21          0.943114
enc_paym_22        enc_paym_21          0.942422
enc_paym_21        enc_paym_22          0.942422
enc_paym_20        enc_paym_19          0.940890
                                          ...   
enc_paym_12        enc_paym_18          0.609364
enc_paym_13        enc_paym_21          0.601151
enc_paym_21        enc_paym_13          0.601151
pre_over2limit     pre_maxover2limit   -0.631950
pre_maxover2limit  pre_over2limit      -0.631950
Length: 194, dtype: float64

In [16]:
correlation_matrix = final_df.corr()
correlation_pairs = correlation_matrix.unstack().sort_values(ascending=False)
correlation_pairs = correlation_pairs[abs(correlation_pairs) < 1]
high_correlation = correlation_pairs[abs(correlation_pairs) < 0.001]
high_correlation

pre_fterm                pre_till_fclose            0.000979
pre_till_fclose          pre_fterm                  0.000979
pre_till_pclose          enc_paym_17                0.000970
enc_paym_17              pre_till_pclose            0.000970
pre_fterm                pre_over2limit             0.000908
                                                      ...   
enc_paym_8               pre_loans6090             -0.000920
pre_loans3060            pre_loans_next_pay_summ   -0.000953
pre_loans_next_pay_summ  pre_loans3060             -0.000953
is_zero_over2limit       enc_paym_11               -0.000987
enc_paym_11              is_zero_over2limit        -0.000987
Length: 224, dtype: float64

In [19]:
# Разделяем значения на X, y, train/test и стандартизируем их.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X = final_df.drop(columns=['id', 'flag'])
y = final_df['flag']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [22]:
# Тестируем на CatBoostClassifier, подбирая параметры с помощью optuna
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
import optuna
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight('balanced', classes=[0, 1], y=y_train)
class_weights_dict = {0: class_weights[0], 1: class_weights[1]}

def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 10),
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'random_seed': 42,
        'verbose': 0,
        'class_weights': class_weights_dict,
        'task_type': 'GPU'
    }
    
    model = CatBoostClassifier(**params)
    model.fit(X_train_scaled, y_train, eval_set=(X_test_scaled, y_test), early_stopping_rounds=50, verbose=False)
    
    preds = model.predict_proba(X_test_scaled)[:, 1]
    auc = roc_auc_score(y_test, preds)
    
    return auc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

print("Лучшие гиперпараметры:", study.best_params)
print("Лучший ROC-AUC:", study.best_value)

best_params = study.best_params
final_model = CatBoostClassifier(**best_params)
final_model.fit(X_train_scaled, y_train)

final_preds = final_model.predict_proba(X_test_scaled)[:, 1]
final_auc = roc_auc_score(y_test, final_preds)
print("Финальный ROC-AUC:", final_auc)

[I 2024-09-18 09:36:23,008] A new study created in memory with name: no-name-fb66f60f-4550-4aa8-8562-646804c8bbe6
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-09-18 09:36:37,182] Trial 0 finished with value: 0.7123013943772616 and parameters: {'iterations': 885, 'learning_rate': 0.014473890036877704, 'depth': 8, 'l2_leaf_reg': 3}. Best is trial 0 with value: 0.7123013943772616.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-09-18 09:36:38,471] Trial 1 finished with value: 0.6586982355258011 and parameters: {'iterations': 824, 'learning_rate': 0.00024202791798047022, 'depth': 4, 'l2_leaf_reg': 10}. Best is trial 0 with value: 0.7123013943772616.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
Default metric period is 5 because AUC is/are not implemented

Лучшие гиперпараметры: {'iterations': 724, 'learning_rate': 0.07818926354409358, 'depth': 7, 'l2_leaf_reg': 3}
Лучший ROC-AUC: 0.7144303873103413
0:	learn: 0.5794777	total: 68.7ms	remaining: 49.7s
1:	learn: 0.4897011	total: 146ms	remaining: 52.6s
2:	learn: 0.4186237	total: 206ms	remaining: 49.5s
3:	learn: 0.3631414	total: 282ms	remaining: 50.7s
4:	learn: 0.3197851	total: 339ms	remaining: 48.7s
5:	learn: 0.2859382	total: 405ms	remaining: 48.5s
6:	learn: 0.2593207	total: 463ms	remaining: 47.5s
7:	learn: 0.2373601	total: 531ms	remaining: 47.5s
8:	learn: 0.2202377	total: 604ms	remaining: 48s
9:	learn: 0.2065585	total: 670ms	remaining: 47.8s
10:	learn: 0.1956329	total: 741ms	remaining: 48s
11:	learn: 0.1868164	total: 821ms	remaining: 48.7s
12:	learn: 0.1801330	total: 881ms	remaining: 48.2s
13:	learn: 0.1743282	total: 956ms	remaining: 48.5s
14:	learn: 0.1693701	total: 1.03s	remaining: 48.6s
15:	learn: 0.1654155	total: 1.09s	remaining: 48.3s
16:	learn: 0.1621179	total: 1.16s	remaining: 48.2s


159:	learn: 0.1424903	total: 11.4s	remaining: 40s
160:	learn: 0.1424834	total: 11.4s	remaining: 40s
161:	learn: 0.1424760	total: 11.5s	remaining: 39.9s
162:	learn: 0.1424696	total: 11.6s	remaining: 39.8s
163:	learn: 0.1424589	total: 11.6s	remaining: 39.7s
164:	learn: 0.1424520	total: 11.7s	remaining: 39.7s
165:	learn: 0.1424463	total: 11.8s	remaining: 39.6s
166:	learn: 0.1424368	total: 11.8s	remaining: 39.5s
167:	learn: 0.1424302	total: 11.9s	remaining: 39.4s
168:	learn: 0.1424197	total: 12s	remaining: 39.3s
169:	learn: 0.1424131	total: 12s	remaining: 39.3s
170:	learn: 0.1424080	total: 12.1s	remaining: 39.2s
171:	learn: 0.1424014	total: 12.2s	remaining: 39.1s
172:	learn: 0.1423957	total: 12.3s	remaining: 39s
173:	learn: 0.1423907	total: 12.3s	remaining: 39s
174:	learn: 0.1423830	total: 12.4s	remaining: 38.9s
175:	learn: 0.1423703	total: 12.5s	remaining: 38.8s
176:	learn: 0.1423633	total: 12.5s	remaining: 38.8s
177:	learn: 0.1423590	total: 12.6s	remaining: 38.7s
178:	learn: 0.1423467	to

320:	learn: 0.1413218	total: 22.7s	remaining: 28.5s
321:	learn: 0.1413159	total: 22.8s	remaining: 28.4s
322:	learn: 0.1413122	total: 22.8s	remaining: 28.4s
323:	learn: 0.1413051	total: 22.9s	remaining: 28.3s
324:	learn: 0.1413020	total: 23s	remaining: 28.2s
325:	learn: 0.1412951	total: 23s	remaining: 28.1s
326:	learn: 0.1412910	total: 23.1s	remaining: 28s
327:	learn: 0.1412843	total: 23.2s	remaining: 28s
328:	learn: 0.1412782	total: 23.2s	remaining: 27.9s
329:	learn: 0.1412725	total: 23.3s	remaining: 27.8s
330:	learn: 0.1412668	total: 23.4s	remaining: 27.8s
331:	learn: 0.1412602	total: 23.5s	remaining: 27.7s
332:	learn: 0.1412535	total: 23.5s	remaining: 27.6s
333:	learn: 0.1412461	total: 23.6s	remaining: 27.6s
334:	learn: 0.1412426	total: 23.7s	remaining: 27.5s
335:	learn: 0.1412371	total: 23.7s	remaining: 27.4s
336:	learn: 0.1412298	total: 23.8s	remaining: 27.4s
337:	learn: 0.1412250	total: 23.9s	remaining: 27.3s
338:	learn: 0.1412196	total: 24s	remaining: 27.2s
339:	learn: 0.1412138	

480:	learn: 0.1404795	total: 33.9s	remaining: 17.1s
481:	learn: 0.1404752	total: 33.9s	remaining: 17s
482:	learn: 0.1404714	total: 34s	remaining: 17s
483:	learn: 0.1404657	total: 34.1s	remaining: 16.9s
484:	learn: 0.1404600	total: 34.1s	remaining: 16.8s
485:	learn: 0.1404554	total: 34.2s	remaining: 16.8s
486:	learn: 0.1404508	total: 34.3s	remaining: 16.7s
487:	learn: 0.1404464	total: 34.4s	remaining: 16.6s
488:	learn: 0.1404398	total: 34.4s	remaining: 16.5s
489:	learn: 0.1404339	total: 34.5s	remaining: 16.5s
490:	learn: 0.1404298	total: 34.6s	remaining: 16.4s
491:	learn: 0.1404273	total: 34.6s	remaining: 16.3s
492:	learn: 0.1404210	total: 34.7s	remaining: 16.3s
493:	learn: 0.1404150	total: 34.8s	remaining: 16.2s
494:	learn: 0.1404084	total: 34.8s	remaining: 16.1s
495:	learn: 0.1404031	total: 34.9s	remaining: 16s
496:	learn: 0.1403965	total: 35s	remaining: 16s
497:	learn: 0.1403902	total: 35s	remaining: 15.9s
498:	learn: 0.1403867	total: 35.1s	remaining: 15.8s
499:	learn: 0.1403823	tota

640:	learn: 0.1397723	total: 45s	remaining: 5.83s
641:	learn: 0.1397693	total: 45.1s	remaining: 5.76s
642:	learn: 0.1397659	total: 45.2s	remaining: 5.69s
643:	learn: 0.1397625	total: 45.2s	remaining: 5.62s
644:	learn: 0.1397563	total: 45.3s	remaining: 5.55s
645:	learn: 0.1397505	total: 45.4s	remaining: 5.48s
646:	learn: 0.1397443	total: 45.5s	remaining: 5.41s
647:	learn: 0.1397407	total: 45.5s	remaining: 5.34s
648:	learn: 0.1397351	total: 45.6s	remaining: 5.27s
649:	learn: 0.1397309	total: 45.7s	remaining: 5.2s
650:	learn: 0.1397274	total: 45.8s	remaining: 5.13s
651:	learn: 0.1397257	total: 45.8s	remaining: 5.06s
652:	learn: 0.1397223	total: 45.9s	remaining: 4.99s
653:	learn: 0.1397187	total: 46s	remaining: 4.92s
654:	learn: 0.1397155	total: 46s	remaining: 4.85s
655:	learn: 0.1397125	total: 46.1s	remaining: 4.78s
656:	learn: 0.1397083	total: 46.2s	remaining: 4.71s
657:	learn: 0.1397033	total: 46.2s	remaining: 4.64s
658:	learn: 0.1397001	total: 46.3s	remaining: 4.57s
659:	learn: 0.13969

Итог: 0.7138754863850251

### Эксперимент 3
Здесь мы пробуем исопльзовать PCA, как замену преобразования (понижения размерности) enc_paym_N

In [10]:
# имопрт данных
import pandas as pd
df = pd.read_parquet("train_data")
train_target = pd.read_csv("train_target.csv", index_col="id")

In [11]:
# определние правил аггрегации и группировка данных
aggregations = {
    'rn': 'max', 
    'pre_since_opened': 'median',
    'pre_since_confirmed': 'median', 
    'pre_pterm': 'median',
    'pre_fterm': 'median',
    'pre_till_pclose': 'median',
    'pre_till_fclose': 'median',
    'pre_loans_credit_limit': 'median',
    'pre_loans_next_pay_summ': 'median',
    'pre_loans_outstanding': 'median',
    'pre_loans_total_overdue': 'median',
    'pre_loans_max_overdue_sum': 'median',
    'pre_loans_credit_cost_rate': 'median',
    'pre_loans5': 'median',
    'pre_loans530': 'median',
    'pre_loans3060': 'median',
    'pre_loans6090': 'median',
    'pre_loans90': 'median',
    'is_zero_loans5': 'mean',
    'is_zero_loans530': 'mean',
    'is_zero_loans3060': 'mean',
    'is_zero_loans6090': 'mean',
    'is_zero_loans90': 'mean',
    'pre_util': 'median',
    'pre_over2limit': 'median',
    'pre_maxover2limit': 'median',
    'is_zero_util': 'mean',
    'is_zero_over2limit': 'mean',
    'is_zero_maxover2limit': 'mean',
    'enc_loans_account_holder_type': 'median',
    'enc_loans_credit_status': 'median',
    'enc_loans_account_cur': 'median',
    'enc_loans_credit_type': 'median',
    'pclose_flag': 'median',
    'fclose_flag': 'median',
    'enc_paym_0': 'median',
    'enc_paym_1': 'median',
    'enc_paym_2': 'median',
    'enc_paym_3': 'median',
    'enc_paym_4': 'median',
    'enc_paym_5': 'median',
    'enc_paym_6': 'median',
    'enc_paym_7': 'median',
    'enc_paym_8': 'median',
    'enc_paym_9': 'median',
    'enc_paym_10': 'median',
    'enc_paym_11': 'median',
    'enc_paym_12': 'median',
    'enc_paym_13': 'median',
    'enc_paym_14': 'median',
    'enc_paym_15': 'median',
    'enc_paym_16': 'median',
    'enc_paym_17': 'median',
    'enc_paym_18': 'median',
    'enc_paym_19': 'median',
    'enc_paym_20': 'median',
    'enc_paym_21': 'median',
    'enc_paym_22': 'median',
    'enc_paym_23': 'median',
    'enc_paym_24': 'median'
}
grouped_df = df.groupby('id').agg(aggregations).reset_index()

In [12]:
# Использвоание PCA для преобразования enc_paym_N
from sklearn.decomposition import PCA
enc_paym_columns = [f'enc_paym_{i}' for i in range(25)] 
X = grouped_df[enc_paym_columns]
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
df_pca = pd.DataFrame(X_pca, columns=['PCA1', 'PCA2'])
df_pca['id'] = grouped_df['id']
grouped_df.drop(enc_paym_columns, axis=1, inplace=True)
pca_df = grouped_df.merge(df_pca, how="left", on="id")

In [13]:
# Объединение с таргетом
final_df = pca_df.merge(train_target, how="left", on="id")

In [14]:
final_df.head()

Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,...,is_zero_maxover2limit,enc_loans_account_holder_type,enc_loans_credit_status,enc_loans_account_cur,enc_loans_credit_type,pclose_flag,fclose_flag,PCA1,PCA2,flag
0,0,10,5.0,9.0,6.5,8.0,12.0,11.0,11.0,3.0,...,0.9,1.0,3.0,1.0,4.0,0.0,0.0,-0.634182,-1.863417,0
1,1,14,12.5,7.0,7.0,8.0,9.0,6.5,6.0,2.0,...,0.785714,1.0,3.0,1.0,4.0,0.0,0.0,-1.044334,-0.780536,0
2,2,3,12.0,9.0,4.0,8.0,1.0,11.0,1.0,1.0,...,0.666667,1.0,2.0,1.0,3.0,1.0,1.0,-0.692204,-2.391195,0
3,3,15,6.0,9.0,4.0,8.0,3.0,10.0,11.0,2.0,...,0.933333,1.0,3.0,1.0,4.0,0.0,0.0,2.78074,-2.181642,0
4,4,1,12.0,9.0,4.0,8.0,1.0,11.0,12.0,1.0,...,1.0,1.0,2.0,1.0,3.0,1.0,1.0,-5.161593,3.915705,0


In [15]:
final_df.corr()

Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,...,is_zero_maxover2limit,enc_loans_account_holder_type,enc_loans_credit_status,enc_loans_account_cur,enc_loans_credit_type,pclose_flag,fclose_flag,PCA1,PCA2,flag
id,1.0,0.074279,0.041197,0.012538,0.012521,0.0088,0.014166,-0.003865,-0.012971,-0.022741,...,0.034301,0.006625,0.002197,-0.005245,0.090867,-0.038292,-0.030114,0.04263053,-0.0426113,-3.7e-05
rn,0.074279,1.0,0.022856,-0.011241,0.027872,-0.016063,0.073501,-0.055806,0.016918,-0.20079,...,-0.000908,-0.068007,0.479661,-0.016342,0.311111,-0.291479,-0.370548,-0.06294235,-0.2610057,-0.014138
pre_since_opened,0.041197,0.022856,1.0,-0.015967,0.000199,0.018226,-0.032863,-0.054148,-0.006013,0.005154,...,0.009194,-0.005022,0.031123,0.015189,0.000534,-0.009273,-0.016724,-0.06050729,0.17063,0.011584
pre_since_confirmed,0.012538,-0.011241,-0.015967,1.0,-0.025293,0.016467,0.021289,-0.071005,-0.01078,0.006327,...,-0.013476,-0.003496,0.007781,-0.002123,-0.040799,0.032918,0.002133,0.03943669,-0.01439267,-0.006152
pre_pterm,0.012521,0.027872,0.000199,-0.025293,1.0,0.083474,0.275438,-0.071958,0.016123,-0.058705,...,0.094911,-0.044209,0.105707,0.001329,0.145605,-0.3079,-0.192805,0.01351234,-0.1984303,-0.007316
pre_fterm,0.0088,-0.016063,0.018226,0.016467,0.083474,1.0,0.076098,0.000979,-0.002708,0.010614,...,-0.049685,-0.019735,0.036652,0.007494,0.022163,-0.011496,-0.067113,0.2262062,-0.03234167,-0.013389
pre_till_pclose,0.014166,0.073501,-0.032863,0.021289,0.275438,0.076098,1.0,-0.067357,0.034717,-0.082038,...,0.136103,-0.066251,0.086069,-0.000285,0.157355,-0.431777,-0.184868,0.01966996,-0.03078653,0.003658
pre_till_fclose,-0.003865,-0.055806,-0.054148,-0.071005,-0.071958,0.000979,-0.067357,1.0,0.019196,0.021807,...,-0.00147,0.011508,-0.211587,-0.013421,-0.017223,0.148067,0.311906,0.001150663,0.03346705,0.002761
pre_loans_credit_limit,-0.012971,0.016918,-0.006013,-0.01078,0.016123,-0.002708,0.034717,0.019196,1.0,-0.028141,...,-0.009959,-0.070457,-0.006416,-0.020803,0.127687,0.021679,0.003282,-0.0591998,0.007783502,0.019402
pre_loans_next_pay_summ,-0.022741,-0.20079,0.005154,0.006327,-0.058705,0.010614,-0.082038,0.021807,-0.028141,1.0,...,-0.000142,0.038693,-0.197996,0.004989,-0.093174,0.185576,0.125628,-0.01496049,0.08337432,0.007076


In [16]:
# Разделяем значения на X, y, train/test и стандартизируем их.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X = final_df.drop(columns=['id', 'flag'])
y = final_df['flag']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [17]:
# Тестируем на CatBoostClassifier, подбирая параметры с помощью optuna
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
import optuna
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight('balanced', classes=[0, 1], y=y_train)
class_weights_dict = {0: class_weights[0], 1: class_weights[1]}

def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 10),
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'random_seed': 42,
        'verbose': 0,
        'class_weights': class_weights_dict
    }
    
    model = CatBoostClassifier(**params)
    model.fit(X_train_scaled, y_train, eval_set=(X_test_scaled, y_test), early_stopping_rounds=50, verbose=False)
    
    preds = model.predict_proba(X_test_scaled)[:, 1]
    auc = roc_auc_score(y_test, preds)
    
    return auc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

print("Лучшие гиперпараметры:", study.best_params)
print("Лучший ROC-AUC:", study.best_value)

best_params = study.best_params
final_model = CatBoostClassifier(**best_params)
final_model.fit(X_train_scaled, y_train)

final_preds = final_model.predict_proba(X_test_scaled)[:, 1]
final_auc = roc_auc_score(y_test, final_preds)
print("Финальный ROC-AUC:", final_auc)

[I 2024-09-18 09:53:19,023] A new study created in memory with name: no-name-35c22191-d15f-401c-a619-bdd32fcc7b96
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
[I 2024-09-18 09:53:37,743] Trial 0 finished with value: 0.7071741731094856 and parameters: {'iterations': 260, 'learning_rate': 0.06452240316537888, 'depth': 5, 'l2_leaf_reg': 3}. Best is trial 0 with value: 0.7071741731094856.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
[I 2024-09-18 09:53:58,657] Trial 1 finished with value: 0.7114086148959831 and parameters: {'iterations': 228, 'learning_rate': 0.0903721612429925, 'depth': 8, 'l2_leaf_reg': 8}. Best is trial 1 with value: 0.7114086148959831.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
[I 2024-09-18 09:54:42,560] Trial 2 finished with value: 0.6840059974413742 and parameters: {'iterations': 517, 'learning_rate': 0.0014975460210306984, 'depth': 7, 'l2_leaf_reg': 8}. Best is trial 1 with val

Лучшие гиперпараметры: {'iterations': 228, 'learning_rate': 0.0903721612429925, 'depth': 8, 'l2_leaf_reg': 8}
Лучший ROC-AUC: 0.7114086148959831
0:	learn: 0.5630849	total: 67.9ms	remaining: 15.4s
1:	learn: 0.4653685	total: 128ms	remaining: 14.4s
2:	learn: 0.3913049	total: 174ms	remaining: 13.1s
3:	learn: 0.3350902	total: 237ms	remaining: 13.2s
4:	learn: 0.2923074	total: 304ms	remaining: 13.5s
5:	learn: 0.2603685	total: 366ms	remaining: 13.5s
6:	learn: 0.2356590	total: 426ms	remaining: 13.4s
7:	learn: 0.2168282	total: 486ms	remaining: 13.4s
8:	learn: 0.2017804	total: 545ms	remaining: 13.3s
9:	learn: 0.1901500	total: 605ms	remaining: 13.2s
10:	learn: 0.1814874	total: 673ms	remaining: 13.3s
11:	learn: 0.1749174	total: 728ms	remaining: 13.1s
12:	learn: 0.1693317	total: 796ms	remaining: 13.2s
13:	learn: 0.1646386	total: 866ms	remaining: 13.2s
14:	learn: 0.1610986	total: 929ms	remaining: 13.2s
15:	learn: 0.1580892	total: 996ms	remaining: 13.2s
16:	learn: 0.1557089	total: 1.06s	remaining: 13.

159:	learn: 0.1419153	total: 10.5s	remaining: 4.46s
160:	learn: 0.1419067	total: 10.6s	remaining: 4.39s
161:	learn: 0.1418963	total: 10.6s	remaining: 4.33s
162:	learn: 0.1418873	total: 10.7s	remaining: 4.26s
163:	learn: 0.1418793	total: 10.7s	remaining: 4.19s
164:	learn: 0.1418696	total: 10.8s	remaining: 4.13s
165:	learn: 0.1418606	total: 10.9s	remaining: 4.06s
166:	learn: 0.1418481	total: 10.9s	remaining: 4s
167:	learn: 0.1418389	total: 11s	remaining: 3.93s
168:	learn: 0.1418300	total: 11.1s	remaining: 3.87s
169:	learn: 0.1418159	total: 11.1s	remaining: 3.8s
170:	learn: 0.1418091	total: 11.2s	remaining: 3.73s
171:	learn: 0.1418022	total: 11.3s	remaining: 3.67s
172:	learn: 0.1417945	total: 11.3s	remaining: 3.6s
173:	learn: 0.1417789	total: 11.4s	remaining: 3.54s
174:	learn: 0.1417754	total: 11.5s	remaining: 3.47s
175:	learn: 0.1417660	total: 11.5s	remaining: 3.4s
176:	learn: 0.1417593	total: 11.6s	remaining: 3.34s
177:	learn: 0.1417493	total: 11.6s	remaining: 3.27s
178:	learn: 0.141733

Итог: 0.709926128840979

### 4 эксперимент
Объединяем 1 и 3 эксперимент, добавляя выделение важных признаков в PCA.

In [36]:
# Импорт данных
import pandas as pd
df = pd.read_parquet("train_data")
train_target = pd.read_csv("train_target.csv", index_col="id")

In [37]:
# Объединение с таргетом
merged_df = df.merge(train_target, on="id", how="left")

In [38]:
merged_df.shape

(26162717, 62)

In [39]:
merged_df["total_overdue_count"] = merged_df["pre_loans5"] + merged_df["pre_loans530"] + merged_df["pre_loans3060"] + merged_df["pre_loans6090"] + merged_df["pre_loans90"]
merged_df.drop(["pre_loans5", "pre_loans530", "pre_loans3060", "pre_loans6090", "pre_loans90"], axis=1, inplace=True)
merged_df["has_overdue_flag"] = 1 - (merged_df["is_zero_loans5"] & merged_df["is_zero_loans530"] & merged_df["is_zero_loans3060"] & merged_df["is_zero_loans6090"] & merged_df["is_zero_loans90"])
merged_df.drop(["is_zero_loans5", "is_zero_loans530", "is_zero_loans3060", "is_zero_loans6090", "is_zero_loans90"], axis=1, inplace=True)
merged_df["has_no_debt_flag"] = merged_df["is_zero_util"] & merged_df["is_zero_over2limit"] & merged_df["is_zero_maxover2limit"]
merged_df.drop(["is_zero_util", "is_zero_over2limit", "is_zero_maxover2limit"], axis=1, inplace=True)
merged_df["term_difference"] = merged_df["pre_pterm"] - merged_df["pre_fterm"]
merged_df["close_difference"] = merged_df["pre_till_pclose"] - merged_df["pre_till_fclose"]
merged_df.drop(["pre_fterm", "pre_pterm", "pre_till_fclose", "pre_till_pclose"], axis=1, inplace=True)
merged_df.drop(["pre_over2limit", "pre_loans_total_overdue"], axis=1, inplace=True)
merged_df["pre_since_conf_opened"] = merged_df['pre_since_confirmed'] - merged_df['pre_since_opened']
merged_df.drop(['pre_since_opened','pre_since_confirmed'],axis=1,inplace=True)

In [40]:
merged_df.shape

(26162717, 48)

In [6]:
# Получение важных признаков из enc_paym_N
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

enc_paym_columns = [f'enc_paym_{i}' for i in range(25)]
X = merged_df[enc_paym_columns]
y = merged_df['flag']

model = RandomForestClassifier()
model.fit(X, y)

selector = SelectFromModel(model, prefit=True)
X_important = selector.transform(X)

selected_features = selector.get_support(indices=True)
important_features = X.columns[selected_features]

print("Важные признаки:", important_features)



Важные признаки: Index(['enc_paym_4', 'enc_paym_5', 'enc_paym_6', 'enc_paym_7', 'enc_paym_8',
       'enc_paym_9', 'enc_paym_10', 'enc_paym_11', 'enc_paym_12',
       'enc_paym_13', 'enc_paym_14', 'enc_paym_15'],
      dtype='object')


In [41]:
# Использование PCA на важных признаках
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
enc_paym_columns = [f'enc_paym_{i}' for i in range(25)] 
X = merged_df[important_features]
X_scaled = scaler.fit_transform(X)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
df_pca = pd.DataFrame(X_pca, columns=['PCA1', 'PCA2'])
merged_df.drop(enc_paym_columns, axis=1, inplace=True)
merged_df[["enc_paym1", 'enc_paym2']] = df_pca

In [45]:
merged_df.shape

(26162717, 24)

In [46]:
merged_df.columns

Index(['id', 'rn', 'pre_loans_credit_limit', 'pre_loans_next_pay_summ',
       'pre_loans_outstanding', 'pre_loans_max_overdue_sum',
       'pre_loans_credit_cost_rate', 'pre_util', 'pre_maxover2limit',
       'enc_loans_account_holder_type', 'enc_loans_credit_status',
       'enc_loans_credit_type', 'enc_loans_account_cur', 'pclose_flag',
       'fclose_flag', 'flag', 'total_overdue_count', 'has_overdue_flag',
       'has_no_debt_flag', 'term_difference', 'close_difference', 'enc_paym1',
       'enc_paym2', 'pre_since_conf_opened'],
      dtype='object')

In [54]:
df.memory_usage(index=True).sum() + train_target.memory_usage(index=True).sum() - merged_df.memory_usage(index=True).sum()

7582862628

In [3]:
df.head()

Unnamed: 0,id,rn,pre_loans_credit_limit,pre_loans_next_pay_summ,pre_loans_outstanding,pre_loans_max_overdue_sum,pre_loans_credit_cost_rate,pre_util,pre_maxover2limit,enc_loans_account_holder_type,...,fclose_flag,flag,total_overdue_count,has_overdue_flag,has_no_debt_flag,term_difference,close_difference,enc_paym1,enc_paym2,pre_since_conf_opened
0,0,1,11,3,3,2,11,16,17,1,...,0,0,39,0,1,-1,6,-3.628027,1.222879,-9
1,0,2,0,3,3,2,11,16,17,1,...,0,0,39,0,1,0,0,3.679743,0.889779,-9
2,0,3,11,0,5,2,8,15,17,1,...,1,0,39,0,0,-4,-10,3.679743,0.889779,-9
3,0,4,12,2,3,2,4,16,17,1,...,0,0,39,1,1,-3,9,1.223487,-1.915711,-3
4,0,5,10,2,3,2,4,16,17,1,...,0,0,39,0,1,13,-1,-1.913414,-1.354639,7


In [4]:
df.columns

Index(['id', 'rn', 'pre_loans_credit_limit', 'pre_loans_next_pay_summ',
       'pre_loans_outstanding', 'pre_loans_max_overdue_sum',
       'pre_loans_credit_cost_rate', 'pre_util', 'pre_maxover2limit',
       'enc_loans_account_holder_type', 'enc_loans_credit_status',
       'enc_loans_credit_type', 'enc_loans_account_cur', 'pclose_flag',
       'fclose_flag', 'flag', 'total_overdue_count', 'has_overdue_flag',
       'has_no_debt_flag', 'term_difference', 'close_difference', 'enc_paym1',
       'enc_paym2', 'pre_since_conf_opened'],
      dtype='object')

In [8]:
# Определение правил аггрегации
aggregations = {
    'rn': 'count', 
    'pre_loans_credit_limit': 'min',
    'pre_loans_next_pay_summ': 'sum',
    'pre_loans_outstanding': 'mean',
    'pre_loans_max_overdue_sum': 'max',
    'pre_loans_credit_cost_rate': 'max',
    'pre_util': 'mean',
    'pre_maxover2limit': 'mean',
    'enc_loans_account_holder_type': 'median',
    'enc_loans_credit_status': 'median',
    'enc_loans_credit_type': 'median',
    'enc_loans_account_cur': 'median',
    'pclose_flag': 'max',
    'fclose_flag': 'max',
    'flag': 'max',
    'total_overdue_count': 'sum',
    'has_overdue_flag': 'max',
    'has_no_debt_flag': 'max',
    'term_difference': 'median',
    'close_difference': 'median',
    'enc_paym1': 'mean',
    'enc_paym2': 'mean',
    'pre_since_conf_opened': 'median',
}

In [9]:
# Группировка данных
final_df = df.groupby('id').agg(aggregations).reset_index()

In [10]:
# Разделяем значения на X, y, train/test и стандартизируем их.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X = final_df.drop(columns=['id', 'flag'])
y = final_df['flag']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Тестируем на CatBoostClassifier, подбирая параметры с помощью optuna
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
import optuna
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight('balanced', classes=[0, 1], y=y_train)
class_weights_dict = {0: class_weights[0], 1: class_weights[1]}

def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 10),
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'random_seed': 42,
        'verbose': 0,
        'class_weights': class_weights_dict
    }
    
    model = CatBoostClassifier(**params)
    model.fit(X_train_scaled, y_train, eval_set=(X_test_scaled, y_test), early_stopping_rounds=50, verbose=False)
    
    preds = model.predict_proba(X_test_scaled)[:, 1]
    auc = roc_auc_score(y_test, preds)
    
    return auc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

print("Лучшие гиперпараметры:", study.best_params)
print("Лучший ROC-AUC:", study.best_value)

best_params = study.best_params
final_model = CatBoostClassifier(**best_params)
final_model.fit(X_train_scaled, y_train)

final_preds = final_model.predict_proba(X_test_scaled)[:, 1]
final_auc = roc_auc_score(y_test, final_preds)
print("Финальный ROC-AUC:", final_auc)

[I 2024-09-20 16:48:14,742] A new study created in memory with name: no-name-b42a8e15-3031-4f0e-b626-dca3489758c1
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
[I 2024-09-20 16:48:27,981] Trial 0 finished with value: 0.6692362052862839 and parameters: {'iterations': 354, 'learning_rate': 3.2022699519404837e-05, 'depth': 10, 'l2_leaf_reg': 2}. Best is trial 0 with value: 0.6692362052862839.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
[I 2024-09-20 16:48:36,220] Trial 1 finished with value: 0.6506724620829086 and parameters: {'iterations': 447, 'learning_rate': 2.8082553948990834e-05, 'depth': 4, 'l2_leaf_reg': 8}. Best is trial 0 with value: 0.6692362052862839.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
[I 2024-09-20 16:48:49,527] Trial 2 finished with value: 0.654830888402111 and parameters: {'iterations': 333, 'learning_rate': 1.2239344744909578e-05, 'depth': 5, 'l2_leaf_reg': 8}. Best is trial 0 

Лучшие гиперпараметры: {'iterations': 278, 'learning_rate': 0.07010603469493519, 'depth': 5, 'l2_leaf_reg': 8}
Лучший ROC-AUC: 0.6892048719444132
0:	learn: 0.5918680	total: 74.8ms	remaining: 20.7s
1:	learn: 0.5085208	total: 149ms	remaining: 20.5s
2:	learn: 0.4424157	total: 219ms	remaining: 20.1s
3:	learn: 0.3878410	total: 297ms	remaining: 20.3s
4:	learn: 0.3445888	total: 365ms	remaining: 19.9s
5:	learn: 0.3099300	total: 433ms	remaining: 19.6s
6:	learn: 0.2817651	total: 521ms	remaining: 20.2s
7:	learn: 0.2575964	total: 593ms	remaining: 20s
8:	learn: 0.2392098	total: 659ms	remaining: 19.7s
9:	learn: 0.2236924	total: 735ms	remaining: 19.7s
10:	learn: 0.2113918	total: 812ms	remaining: 19.7s
11:	learn: 0.2009904	total: 884ms	remaining: 19.6s
12:	learn: 0.1925061	total: 949ms	remaining: 19.4s
13:	learn: 0.1856028	total: 1.02s	remaining: 19.2s
14:	learn: 0.1795714	total: 1.09s	remaining: 19.1s
15:	learn: 0.1747811	total: 1.16s	remaining: 19s
16:	learn: 0.1706174	total: 1.23s	remaining: 18.9s


160:	learn: 0.1451226	total: 11.4s	remaining: 8.3s
161:	learn: 0.1451177	total: 11.5s	remaining: 8.23s
162:	learn: 0.1451138	total: 11.6s	remaining: 8.16s
163:	learn: 0.1451097	total: 11.6s	remaining: 8.09s
164:	learn: 0.1451062	total: 11.7s	remaining: 8.01s
165:	learn: 0.1450988	total: 11.8s	remaining: 7.94s
166:	learn: 0.1450887	total: 11.8s	remaining: 7.87s
167:	learn: 0.1450820	total: 11.9s	remaining: 7.8s
168:	learn: 0.1450761	total: 12s	remaining: 7.73s
169:	learn: 0.1450713	total: 12.1s	remaining: 7.66s
170:	learn: 0.1450672	total: 12.1s	remaining: 7.59s
171:	learn: 0.1450582	total: 12.2s	remaining: 7.52s
172:	learn: 0.1450497	total: 12.3s	remaining: 7.45s
173:	learn: 0.1450436	total: 12.3s	remaining: 7.38s
174:	learn: 0.1450391	total: 12.4s	remaining: 7.31s
175:	learn: 0.1450369	total: 12.5s	remaining: 7.24s
176:	learn: 0.1450322	total: 12.6s	remaining: 7.16s
177:	learn: 0.1450220	total: 12.6s	remaining: 7.09s
178:	learn: 0.1450163	total: 12.7s	remaining: 7.02s
179:	learn: 0.14

Итог: 0.6865942379239678

### 5 эксперимент
Объдиняем создание признаков из 1 эксперимента

In [3]:
# Импортируем данные
import pandas as pd
df = pd.read_parquet("train_data")
train_target = pd.read_csv("train_target.csv", index_col="id")

In [4]:
# Приводим enc_paym_N к одному диапазону
import numpy as np
value_mapping = {
    1: 0,
    2: 1,
    3: 2,
    4: 3
}

columns_to_transform = ['enc_paym_11', 'enc_paym_20', 'enc_paym_24']

for column in columns_to_transform:
    df[column] = df[column].replace(value_mapping)
    
# Подсчитываем кол-во статусов
enc_paym_columns = [f'enc_paym_{i}' for i in range(25)] 
df[f'enc_paym_status_0'] = np.sum(df[enc_paym_columns].values == 0, axis=1)
df[f'enc_paym_status_1'] = np.sum(df[enc_paym_columns].values == 1, axis=1)
df[f'enc_paym_status_2'] = np.sum(df[enc_paym_columns].values == 2, axis=1)
df[f'enc_paym_status_3'] = np.sum(df[enc_paym_columns].values == 3, axis=1)
df.drop(enc_paym_columns, axis=1, inplace=True)

In [5]:
# Создаем новые признаки на основе других, по описанию как в 1 эксперименте.
df["total_overdue_count"] = df["pre_loans5"] + df["pre_loans530"] + df["pre_loans3060"] + df["pre_loans6090"] + df["pre_loans90"]
df.drop(["pre_loans5", "pre_loans530", "pre_loans3060", "pre_loans6090", "pre_loans90"], axis=1, inplace=True)
df["has_no_debt_flag"] = df["is_zero_util"] & df["is_zero_over2limit"] & df["is_zero_maxover2limit"]
df.drop(["is_zero_util", "is_zero_over2limit", "is_zero_maxover2limit"], axis=1, inplace=True)
df["has_overdue_flag"] = 1 - (df["is_zero_loans5"] & df["is_zero_loans530"] & df["is_zero_loans3060"] & df["is_zero_loans6090"] & df["is_zero_loans90"])
df.drop(["is_zero_loans5", "is_zero_loans530", "is_zero_loans3060", "is_zero_loans6090", "is_zero_loans90"], axis=1, inplace=True)
df["term_difference"] = df["pre_pterm"] - df["pre_fterm"]
df["close_difference"] = df["pre_till_pclose"] - df["pre_till_fclose"]
df.drop(["pre_fterm", "pre_pterm", "pre_till_fclose", "pre_till_pclose"], axis=1, inplace=True)

In [6]:
# Функция для подсчёта уникальных значений каждого признака
def create_count_columns_and_remove(df, columns_to_count):
    for column in columns_to_count:
        if column in df.columns and pd.api.types.is_numeric_dtype(df[column]):
            unique_values = df[column].unique()
            for value in unique_values:
                df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
            df.drop(columns=[column], inplace=True)
    
    return df

In [7]:
# Опредеяем признаки, которые следует подсчитать и преобразуем
columns_to_count = ['pre_since_opened', 'pre_since_confirmed', 'pre_loans_credit_limit', 'pre_loans_next_pay_summ', 'pre_loans_outstanding', 'pre_loans_total_overdue', 'pre_loans_max_overdue_sum', 'pre_loans_credit_cost_rate', 'pre_util', 'pre_over2limit', 'pre_maxover2limit', 'enc_loans_account_holder_type', 'enc_loans_credit_status', 'enc_loans_credit_type', 'enc_loans_account_cur', 'pclose_flag', 'fclose_flag']
df_with_counts = create_count_columns_and_remove(df, columns_to_count)
del df

  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] ==

  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] ==

  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] ==

  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] ==

  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)


In [8]:
df_with_counts.columns[:20]

Index(['id', 'rn', 'enc_paym_status_0', 'enc_paym_status_1',
       'enc_paym_status_2', 'enc_paym_status_3', 'total_overdue_count',
       'has_no_debt_flag', 'has_overdue_flag', 'term_difference',
       'close_difference', 'pre_since_opened_18_count',
       'pre_since_opened_4_count', 'pre_since_opened_5_count',
       'pre_since_opened_3_count', 'pre_since_opened_2_count',
       'pre_since_opened_1_count', 'pre_since_opened_7_count',
       'pre_since_opened_8_count', 'pre_since_opened_15_count'],
      dtype='object')

In [9]:
# Определяем правила аггрегации
aggregations = {
    'rn': 'max', 
    'has_no_debt_flag': 'median',
    'has_overdue_flag': 'median',
    'term_difference': 'mean',
    'close_difference': 'mean',
    **{col: 'sum' for col in df_with_counts.columns if col not in ['rn', 'has_no_debt_flag', 'has_overdue_flag', 'term_difference', 'close_difference', 'id']}
}

In [10]:
# Группируем данные
grouped_df = df_with_counts.groupby('id').agg(aggregations).reset_index()

In [11]:
del df_with_counts

In [12]:
# Объединяем с таргетом
grouped_df = grouped_df.merge(train_target, how="left", on="id")

In [13]:
grouped_df.head()

Unnamed: 0,id,rn,has_no_debt_flag,has_overdue_flag,term_difference,close_difference,enc_paym_status_0,enc_paym_status_1,enc_paym_status_2,enc_paym_status_3,...,enc_loans_credit_type_6_count,enc_loans_account_cur_1_count,enc_loans_account_cur_2_count,enc_loans_account_cur_0_count,enc_loans_account_cur_3_count,pclose_flag_0_count,pclose_flag_1_count,fclose_flag_0_count,fclose_flag_1_count,flag
0,0,10,1.0,0.0,-0.4,0.7,117,1,0,132,...,0,10,0,0,0,9,1,8,2,0
1,1,14,1.0,1.0,-1.285714,3.071429,152,7,2,189,...,0,14,0,0,0,13,1,12,2,0
2,2,3,0.0,0.0,1.0,-4.0,21,10,2,42,...,0,3,0,0,0,1,2,1,2,0
3,3,15,1.0,0.0,-0.2,-2.933333,246,0,0,129,...,0,15,0,0,0,10,5,9,6,0
4,4,1,1.0,0.0,-4.0,-10.0,0,0,0,25,...,0,1,0,0,0,0,1,0,1,0


In [14]:
# Разделяем значения на X, y, train/test и стандартизируем их.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
X = grouped_df.drop(columns=['id', 'flag'])
y = grouped_df['flag']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
scaler = StandardScaler()

# Здесь мы убираем из списка для стандартизации бинарные признаки.
features_to_scale = X_train.columns.difference(['has_no_debt_flag', 'has_overdue_flag'])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, features_to_scale),
        ('passthrough', 'passthrough', ['has_no_debt_flag', 'has_overdue_flag'])
    ]
)

X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=list(features_to_scale) + ['has_no_debt_flag', 'has_overdue_flag'])
X_test_scaled = pd.DataFrame(X_test_scaled, columns=list(features_to_scale) + ['has_no_debt_flag', 'has_overdue_flag'])

In [21]:
# Тестируем на CatBoostClassifier, подбирая параметры с помощью optuna
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
import optuna
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight('balanced', classes=[0, 1], y=y_train)
class_weights_dict = {0: class_weights[0], 1: class_weights[1]}

def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 10),
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'random_seed': 42,
        'verbose': 0,
        'class_weights': class_weights_dict
    }
    
    model = CatBoostClassifier(**params)
    model.fit(X_train_scaled, y_train, eval_set=(X_test_scaled, y_test), early_stopping_rounds=50, verbose=False)
    
    preds = model.predict_proba(X_test_scaled)[:, 1]
    auc = roc_auc_score(y_test, preds)
    
    return auc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

print("Лучшие гиперпараметры:", study.best_params)
print("Лучший ROC-AUC:", study.best_value)

best_params = study.best_params
final_model = CatBoostClassifier(**best_params)
final_model.fit(X_train_scaled, y_train)

final_preds = final_model.predict_proba(X_test_scaled)[:, 1]
final_auc = roc_auc_score(y_test, final_preds)
print("Финальный ROC-AUC:", final_auc)

[I 2024-09-21 11:00:52,575] A new study created in memory with name: no-name-ee2e9f32-0e32-48eb-875a-6176a3d20bbd
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
[I 2024-09-21 11:01:45,704] Trial 0 finished with value: 0.7138660903116149 and parameters: {'iterations': 179, 'learning_rate': 0.0021202434561103114, 'depth': 9, 'l2_leaf_reg': 3}. Best is trial 0 with value: 0.7138660903116149.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
[I 2024-09-21 11:02:14,864] Trial 1 finished with value: 0.6984954075393497 and parameters: {'iterations': 128, 'learning_rate': 0.0023195243414088507, 'depth': 6, 'l2_leaf_reg': 5}. Best is trial 0 with value: 0.7138660903116149.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
[I 2024-09-21 11:04:27,402] Trial 2 finished with value: 0.7211251451166132 and parameters: {'iterations': 735, 'learning_rate': 0.0035082706783929065, 'depth': 4, 'l2_leaf_reg': 10}. Best is trial 2 wi

Лучшие гиперпараметры: {'iterations': 361, 'learning_rate': 0.07525302007219146, 'depth': 8, 'l2_leaf_reg': 1}
Лучший ROC-AUC: 0.7528255479379675
0:	learn: 0.5836798	total: 181ms	remaining: 1m 5s
1:	learn: 0.4962392	total: 333ms	remaining: 59.8s
2:	learn: 0.4281020	total: 521ms	remaining: 1m 2s
3:	learn: 0.3739988	total: 670ms	remaining: 59.8s
4:	learn: 0.3304111	total: 802ms	remaining: 57.1s
5:	learn: 0.2962838	total: 944ms	remaining: 55.9s
6:	learn: 0.2682574	total: 1.13s	remaining: 56.9s
7:	learn: 0.2463348	total: 1.29s	remaining: 57.1s
8:	learn: 0.2275873	total: 1.5s	remaining: 58.8s
9:	learn: 0.2131081	total: 1.66s	remaining: 58.2s
10:	learn: 0.2011463	total: 1.85s	remaining: 59s
11:	learn: 0.1910770	total: 2.06s	remaining: 59.8s
12:	learn: 0.1830643	total: 2.25s	remaining: 1m
13:	learn: 0.1770041	total: 2.4s	remaining: 59.6s
14:	learn: 0.1718037	total: 2.59s	remaining: 59.7s
15:	learn: 0.1670133	total: 2.8s	remaining: 1m
16:	learn: 0.1632394	total: 3.01s	remaining: 1m
17:	learn: 

159:	learn: 0.1379984	total: 33.3s	remaining: 41.8s
160:	learn: 0.1379777	total: 33.5s	remaining: 41.6s
161:	learn: 0.1379614	total: 33.7s	remaining: 41.4s
162:	learn: 0.1379474	total: 33.9s	remaining: 41.2s
163:	learn: 0.1379400	total: 34.1s	remaining: 40.9s
164:	learn: 0.1379269	total: 34.3s	remaining: 40.7s
165:	learn: 0.1379154	total: 34.5s	remaining: 40.6s
166:	learn: 0.1379095	total: 34.7s	remaining: 40.3s
167:	learn: 0.1378966	total: 34.9s	remaining: 40.1s
168:	learn: 0.1378782	total: 35.1s	remaining: 39.9s
169:	learn: 0.1378642	total: 35.3s	remaining: 39.7s
170:	learn: 0.1378549	total: 35.5s	remaining: 39.4s
171:	learn: 0.1378283	total: 35.7s	remaining: 39.2s
172:	learn: 0.1378191	total: 35.9s	remaining: 39s
173:	learn: 0.1378103	total: 36.1s	remaining: 38.8s
174:	learn: 0.1377956	total: 36.3s	remaining: 38.6s
175:	learn: 0.1377880	total: 36.5s	remaining: 38.3s
176:	learn: 0.1377791	total: 36.6s	remaining: 38.1s
177:	learn: 0.1377604	total: 36.9s	remaining: 37.9s
178:	learn: 0.

318:	learn: 0.1363961	total: 1m 3s	remaining: 8.43s
319:	learn: 0.1363896	total: 1m 4s	remaining: 8.22s
320:	learn: 0.1363785	total: 1m 4s	remaining: 8.02s
321:	learn: 0.1363736	total: 1m 4s	remaining: 7.82s
322:	learn: 0.1363646	total: 1m 4s	remaining: 7.61s
323:	learn: 0.1363576	total: 1m 4s	remaining: 7.41s
324:	learn: 0.1363500	total: 1m 5s	remaining: 7.21s
325:	learn: 0.1363434	total: 1m 5s	remaining: 7.01s
326:	learn: 0.1363254	total: 1m 5s	remaining: 6.81s
327:	learn: 0.1363181	total: 1m 5s	remaining: 6.61s
328:	learn: 0.1363102	total: 1m 5s	remaining: 6.41s
329:	learn: 0.1363034	total: 1m 6s	remaining: 6.21s
330:	learn: 0.1362993	total: 1m 6s	remaining: 6s
331:	learn: 0.1362887	total: 1m 6s	remaining: 5.8s
332:	learn: 0.1362806	total: 1m 6s	remaining: 5.6s
333:	learn: 0.1362742	total: 1m 6s	remaining: 5.4s
334:	learn: 0.1362623	total: 1m 7s	remaining: 5.2s
335:	learn: 0.1362540	total: 1m 7s	remaining: 5s
336:	learn: 0.1362482	total: 1m 7s	remaining: 4.8s
337:	learn: 0.1362439	t

Итог: Более 0.75, берём данные преобразования в рассмотрение.

### 6 эксперимент
Используя успехи 5 эксперимента, добавляем средние значения каждого признака.

In [44]:
# Имопртируем данные
import pandas as pd
df = pd.read_parquet("train_data")
train_target = pd.read_csv("train_target.csv", index_col="id")

In [45]:
# Приводим enc_paym_N к одному диапазону
import numpy as np
value_mapping = {
    1: 0,
    2: 1,
    3: 2,
    4: 3
}

columns_to_transform = ['enc_paym_11', 'enc_paym_20', 'enc_paym_24']

for column in columns_to_transform:
    df[column] = df[column].replace(value_mapping)
# Подсчитываем кол-во статусов
enc_paym_columns = [f'enc_paym_{i}' for i in range(25)] 
df[f'enc_paym_status_0'] = np.sum(df[enc_paym_columns].values == 0, axis=1)
df[f'enc_paym_status_1'] = np.sum(df[enc_paym_columns].values == 1, axis=1)
df[f'enc_paym_status_2'] = np.sum(df[enc_paym_columns].values == 2, axis=1)
df[f'enc_paym_status_3'] = np.sum(df[enc_paym_columns].values == 3, axis=1)
df.drop(enc_paym_columns, axis=1, inplace=True)

In [46]:
# Создаем новые признаки на основе других, по описанию как в 1 эксперименте.
df["total_overdue_count"] = df["pre_loans5"] + df["pre_loans530"] + df["pre_loans3060"] + df["pre_loans6090"] + df["pre_loans90"]
df.drop(["pre_loans5", "pre_loans530", "pre_loans3060", "pre_loans6090", "pre_loans90"], axis=1, inplace=True)
df["has_no_debt_flag"] = df["is_zero_util"] & df["is_zero_over2limit"] & df["is_zero_maxover2limit"]
df.drop(["is_zero_util", "is_zero_over2limit", "is_zero_maxover2limit"], axis=1, inplace=True)
df["has_overdue_flag"] = 1 - (df["is_zero_loans5"] & df["is_zero_loans530"] & df["is_zero_loans3060"] & df["is_zero_loans6090"] & df["is_zero_loans90"])
df.drop(["is_zero_loans5", "is_zero_loans530", "is_zero_loans3060", "is_zero_loans6090", "is_zero_loans90"], axis=1, inplace=True)
df["term_difference"] = df["pre_pterm"] - df["pre_fterm"]
df["close_difference"] = df["pre_till_pclose"] - df["pre_till_fclose"]
df.drop(["pre_fterm", "pre_pterm", "pre_till_fclose", "pre_till_pclose"], axis=1, inplace=True)

In [47]:
# Функция для подсчёта уникальных значений каждого признака
def create_count_columns_and_remove(df, columns_to_count):
    for column in columns_to_count:
        if column in df.columns and pd.api.types.is_numeric_dtype(df[column]):
            unique_values = df[column].unique()
            for value in unique_values:
                df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
    
    return df

In [48]:
# Опредеяем признаки, которые следует подсчитать и преобразуем. Прежде убрал бинарные признаки fclose/pclose_flag
columns_to_agg = ['pre_since_opened', 'pre_since_confirmed', 'pre_loans_credit_limit', 'pre_loans_next_pay_summ', 'pre_loans_outstanding', 'pre_loans_total_overdue', 'pre_loans_max_overdue_sum', 'pre_loans_credit_cost_rate', 'pre_util', 'pre_over2limit', 'pre_maxover2limit', 'enc_loans_account_holder_type', 'enc_loans_credit_status', 'enc_loans_credit_type', 'enc_loans_account_cur']
df_with_counts_max_means = create_count_columns_and_remove(df, columns_to_agg)
del df

  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] ==

  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] ==

  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] ==

  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] ==

  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)


In [49]:
# Определил правила аггрегации и сгруппировал данные
aggregations = {
    'has_no_debt_flag': 'median',
    'has_overdue_flag': 'median',
    'pclose_flag': 'median',
    'fclose_flag': 'median',
    **{col: 'sum' for col in df_with_counts_max_means.columns if col.endswith('_count')},
    **{col: 'mean' for col in df_with_counts_max_means.columns if col not in ['has_no_debt_flag', 'has_overdue_flag', 'id', 'fclose_flag', 'pclose_flag']}
}
grouped_df = df_with_counts_max_means.groupby('id').agg(aggregations).reset_index()

In [50]:
del df_with_counts_max_means

In [51]:
# Объединение с таргетом
grouped_df = grouped_df.merge(train_target, how="left", on="id")

In [52]:
# Разделяем значения на X, y, train/test и стандартизируем их.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
X = grouped_df.drop(columns=['id', 'flag'])
y = grouped_df['flag']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
scaler = StandardScaler()
# Здесь мы убираем из списка для стандартизации бинарные признаки
features_to_scale = X_train.columns.difference(['has_no_debt_flag', 'has_overdue_flag', 'fclose_flag', 'pclose_flag'])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, features_to_scale),
        ('passthrough', 'passthrough', ['has_no_debt_flag', 'has_overdue_flag', 'fclose_flag', 'pclose_flag'])
    ]
)

X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=list(features_to_scale) + ['has_no_debt_flag', 'has_overdue_flag', 'fclose_flag', 'pclose_flag'])
X_test_scaled = pd.DataFrame(X_test_scaled, columns=list(features_to_scale) + ['has_no_debt_flag', 'has_overdue_flag', 'fclose_flag', 'pclose_flag'])

In [53]:
# чекпоинт скалера
import pickle
with open('scaler.pkl','wb') as f:
    pickle.dump(preprocessor, f)

In [None]:
# Тестируем на CatBoostClassifier
# Взял параметры из предыдущего эксперимента
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight('balanced', classes=[0, 1], y=y_train)
class_weights_dict = {0: class_weights[0], 1: class_weights[1]}

params = {
    'iterations': 361,
    'learning_rate': 0.07525302007219146,
    'depth': 8,
    'l2_leaf_reg': 1,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'random_seed': 42,
    'verbose': 0,
    'class_weights': class_weights_dict
}
    
model = CatBoostClassifier(**params)
model.fit(X_train_scaled, y_train, eval_set=(X_test_scaled, y_test), early_stopping_rounds=50, verbose=False)
    
preds = model.predict_proba(X_test_scaled)[:, 1]
auc = roc_auc_score(y_test, preds)
    
print(auc)

Итог: 0.7532005635650828. Эти данные лучше всего подходят, так как имеют самую высокую метрику на одной модели.

### Эксперименты с моделями

In [15]:
# Подбор гиперпараметров CatBoost на финальных данных.
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
import optuna
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight('balanced', classes=[0, 1], y=y_train)
class_weights_dict = {0: class_weights[0], 1: class_weights[1]}

def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 10),
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'random_seed': 42,
        'verbose': 0,
        'class_weights': class_weights_dict
    }
    
    model = CatBoostClassifier(**params)
    model.fit(X_train_scaled, y_train)
    
    preds = model.predict_proba(X_test_scaled)[:, 1]
    auc = roc_auc_score(y_test, preds)
    
    return auc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

print("Лучшие гиперпараметры:", study.best_params)
print("Лучший ROC-AUC:", study.best_value)

[I 2024-09-23 15:29:45,361] A new study created in memory with name: no-name-d0d4938d-8705-421a-88ca-486afe2a734a
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
[I 2024-09-23 15:30:40,925] Trial 0 finished with value: 0.7542534786574895 and parameters: {'iterations': 387, 'learning_rate': 0.08538574222674168, 'depth': 7, 'l2_leaf_reg': 9}. Best is trial 0 with value: 0.7542534786574895.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
[I 2024-09-23 15:33:39,945] Trial 1 finished with value: 0.7503892726431716 and parameters: {'iterations': 960, 'learning_rate': 0.07298846903025896, 'depth': 9, 'l2_leaf_reg': 2}. Best is trial 0 with value: 0.7542534786574895.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
[I 2024-09-23 15:34:02,393] Trial 2 finished with value: 0.6860244358714618 and parameters: {'iterations': 211, 'learning_rate': 3.869704859051864e-05, 'depth': 4, 'l2_leaf_reg': 6}. Best is trial 0 with va

Лучшие гиперпараметры: {'iterations': 935, 'learning_rate': 0.05312609024208735, 'depth': 6, 'l2_leaf_reg': 1}
Лучший ROC-AUC: 0.7551199922862032


Лучшие гиперпараметры: {'iterations': 935, 'learning_rate': 0.05312609024208735, 'depth': 6, 'l2_leaf_reg': 1}
Лучший ROC-AUC: 0.7551199922862032

In [11]:
# Подбор гиперпараметров LGBM на финальных данных.
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
import optuna
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight('balanced', classes=[0, 1], y=y_train)
class_weights_dict = {0: class_weights[0], 1: class_weights[1]}

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'class_weight': 'balanced'
    }

    model = LGBMClassifier(**params)
    model.fit(X_train_scaled, y_train)

    preds = model.predict_proba(X_test_scaled)[:, 1]
    auc = roc_auc_score(y_test, preds)
    
    return auc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

print("Лучшие гиперпараметры:", study.best_params)
print("Лучший ROC-AUC:", study.best_value)

[I 2024-09-23 14:53:44,066] A new study created in memory with name: no-name-bcd484c7-1343-447b-9411-8774ac1cfa56
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),


[LightGBM] [Info] Number of positive: 74509, number of negative: 2025491
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.505227 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21992
[LightGBM] [Info] Number of data points in the train set: 2100000, number of used features: 199
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000












[I 2024-09-23 14:54:20,554] Trial 0 finished with value: 0.736452417419333 and parameters: {'n_estimators': 599, 'learning_rate': 0.008857730135169845, 'max_depth': 4, 'num_leaves': 55}. Best is trial 0 with value: 0.736452417419333.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),


[LightGBM] [Info] Number of positive: 74509, number of negative: 2025491
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.480728 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21992
[LightGBM] [Info] Number of data points in the train set: 2100000, number of used features: 199
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


















[I 2024-09-23 14:55:44,064] Trial 1 finished with value: 0.6967378149714947 and parameters: {'n_estimators': 926, 'learning_rate': 1.729600045148328e-05, 'max_depth': 5, 'num_leaves': 97}. Best is trial 0 with value: 0.736452417419333.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),


[LightGBM] [Info] Number of positive: 74509, number of negative: 2025491
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.525218 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21992
[LightGBM] [Info] Number of data points in the train set: 2100000, number of used features: 199
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2024-09-23 14:56:11,023] Trial 2 finished with value: 0.70818645571019 and parameters: {'n_estimators': 132, 'learning_rate': 4.331610449633374e-05, 'max_depth': 9, 'num_leaves': 65}. Best is trial 0 with value: 0.736452417419333.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),


[LightGBM] [Info] Number of positive: 74509, number of negative: 2025491
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.455533 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21992
[LightGBM] [Info] Number of data points in the train set: 2100000, number of used features: 199
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2024-09-23 14:57:43,598] Trial 3 finished with value: 0.7055935228727128 and parameters: {'n_estimators': 804, 'learning_rate': 7.954908136093614e-05, 'max_depth': 6, 'num_leaves': 38}. Best is trial 0 with value: 0.736452417419333.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),


[LightGBM] [Info] Number of positive: 74509, number of negative: 2025491
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.139429 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 21992
[LightGBM] [Info] Number of data points in the train set: 2100000, number of used features: 199
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2024-09-23 14:58:35,019] Trial 4 finished with value: 0.7539273090731384 and parameters: {'n_estimators': 896, 'learning_rate': 0.034363631126211436, 'max_depth': 9, 'num_leaves': 27}. Best is trial 4 with value: 0.7539273090731384.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),


[LightGBM] [Info] Number of positive: 74509, number of negative: 2025491
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.181322 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 21992
[LightGBM] [Info] Number of data points in the train set: 2100000, number of used features: 199
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2024-09-23 14:59:20,105] Trial 5 finished with value: 0.7094946243635231 and parameters: {'n_estimators': 459, 'learning_rate': 0.00026523034448260746, 'max_depth': 7, 'num_leaves': 45}. Best is trial 4 with value: 0.7539273090731384.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),


[LightGBM] [Info] Number of positive: 74509, number of negative: 2025491
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.459845 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21992
[LightGBM] [Info] Number of data points in the train set: 2100000, number of used features: 199
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000








[I 2024-09-23 14:59:46,515] Trial 6 finished with value: 0.6860034284275083 and parameters: {'n_estimators': 349, 'learning_rate': 3.585158444231599e-05, 'max_depth': 4, 'num_leaves': 98}. Best is trial 4 with value: 0.7539273090731384.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),


[LightGBM] [Info] Number of positive: 74509, number of negative: 2025491
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.494391 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21992
[LightGBM] [Info] Number of data points in the train set: 2100000, number of used features: 199
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2024-09-23 15:00:44,088] Trial 7 finished with value: 0.7137596746999739 and parameters: {'n_estimators': 407, 'learning_rate': 0.0009962133272080336, 'max_depth': 7, 'num_leaves': 38}. Best is trial 4 with value: 0.7539273090731384.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),


[LightGBM] [Info] Number of positive: 74509, number of negative: 2025491
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.539611 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21992
[LightGBM] [Info] Number of data points in the train set: 2100000, number of used features: 199
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2024-09-23 15:01:32,884] Trial 8 finished with value: 0.7542063324714363 and parameters: {'n_estimators': 437, 'learning_rate': 0.04778428283852397, 'max_depth': 10, 'num_leaves': 81}. Best is trial 8 with value: 0.7542063324714363.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),


[LightGBM] [Info] Number of positive: 74509, number of negative: 2025491
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.498510 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21992
[LightGBM] [Info] Number of data points in the train set: 2100000, number of used features: 199
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2024-09-23 15:02:39,175] Trial 9 finished with value: 0.7498644216635804 and parameters: {'n_estimators': 586, 'learning_rate': 0.014747241514021354, 'max_depth': 7, 'num_leaves': 55}. Best is trial 8 with value: 0.7542063324714363.


Лучшие гиперпараметры: {'n_estimators': 437, 'learning_rate': 0.04778428283852397, 'max_depth': 10, 'num_leaves': 81}
Лучший ROC-AUC: 0.7542063324714363


Лучшие гиперпараметры: {'n_estimators': 437, 'learning_rate': 0.04778428283852397, 'max_depth': 10, 'num_leaves': 81}
Лучший ROC-AUC: 0.7542063324714363

In [14]:
# Подбор гиперпараметров XGBoost на финальных данных.
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
import optuna
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight('balanced', classes=[0, 1], y=y_train)
class_weights_dict = {0: class_weights[0], 1: class_weights[1]}

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'scale_pos_weight': class_weights_dict[1] / class_weights_dict[0]
    }

    model = XGBClassifier(**params)
    model.fit(X_train_scaled, y_train)

    preds = model.predict_proba(X_test_scaled)[:, 1]
    auc = roc_auc_score(y_test, preds)
    
    return auc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

print("Лучшие гиперпараметры:", study.best_params)
print("Лучший ROC-AUC:", study.best_value)

[I 2024-09-23 15:14:56,909] A new study created in memory with name: no-name-6df32c6f-1a7c-4581-8e3d-72044e42ebb2
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
[I 2024-09-23 15:15:24,864] Trial 0 finished with value: 0.7412145704504709 and parameters: {'n_estimators': 264, 'learning_rate': 0.014657180347155012, 'max_depth': 6, 'min_child_weight': 4, 'subsample': 0.9705712357453846, 'colsample_bytree': 0.8530119505529817}. Best is trial 0 with value: 0.7412145704504709.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
[I 2024-09-23 15:17:27,748] Trial 1 finished with value: 0.7454640599912297 and parameters: {'n_estimators': 892, 'learning_rate': 0.003042887596450597, 'max_depth': 9, 'min_child_weight': 1, 'subsample': 0.7593033255073088, 'colsample_bytree': 0.799968608027691}. Best is trial 1 with value: 0.7454640599912297.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
[I 2024-09-23 15:18:48,323] Trial 2 f

Лучшие гиперпараметры: {'n_estimators': 873, 'learning_rate': 0.09607318384069156, 'max_depth': 4, 'min_child_weight': 3, 'subsample': 0.6109521676209237, 'colsample_bytree': 0.7385982319190236}
Лучший ROC-AUC: 0.7534491687454309


Лучшие гиперпараметры: {'n_estimators': 873, 'learning_rate': 0.09607318384069156, 'max_depth': 4, 'min_child_weight': 3, 'subsample': 0.6109521676209237, 'colsample_bytree': 0.7385982319190236}
Лучший ROC-AUC: 0.7534491687454309

In [55]:
# Заполняю словарь с лучшими параметрами для каждой из модели
best_params = {'cat_iterations': 935, 'cat_learning_rate': 0.05312609024208735, 'cat_depth': 6, 'cat_l2_leaf_reg': 1, 'lgb_n_estimators': 437, 'lgb_learning_rate': 0.04778428283852397, 'lgb_max_depth': 10, 'lgb_num_leaves': 81, 'xgb_n_estimators': 873, 'xgb_learning_rate': 0.09607318384069156, 'xgb_max_depth': 4, 'xgb_min_child_weight': 3, 'xgb_subsample': 0.6109521676209237, 'xgb_colsample_bytree': 0.7385982319190236}

#### Объединение 3-х моделей бустинга по наилучшим параметрам

In [56]:
# Создаю ансамлбя из трёх моделей на основе Voting Classifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier

class_weights = compute_class_weight('balanced', classes=[0, 1], y=y_train)
class_weights_dict = {0: class_weights[0], 1: class_weights[1]}

final_cat_model = CatBoostClassifier(class_weights = class_weights_dict, **{k[4:]: best_params[k] for k in best_params if 'cat_' in k})
final_lgb_model = LGBMClassifier(class_weight = 'balanced', **{k[4:]: best_params[k] for k in best_params if 'lgb_' in k})
final_xgb_model = XGBClassifier(scale_pos_weight = class_weights_dict[1] / class_weights_dict[0], **{k[4:]: best_params[k] for k in best_params if 'xgb_' in k})

final_ensemble_model = VotingClassifier(estimators=[
    ('catboost', final_cat_model),
    ('lgbm', final_lgb_model),
    ('xgb', final_xgb_model)
], voting='soft')

final_ensemble_model.fit(X_train_scaled, y_train)

final_preds = final_ensemble_model.predict_proba(X_test_scaled)[:, 1]
final_auc = roc_auc_score(y_test, final_preds)
print("ROC-AUC:", final_auc)

0:	learn: 0.6867491	total: 203ms	remaining: 3m 9s
1:	learn: 0.6812500	total: 411ms	remaining: 3m 11s
2:	learn: 0.6759802	total: 608ms	remaining: 3m 9s
3:	learn: 0.6713522	total: 808ms	remaining: 3m 8s
4:	learn: 0.6673067	total: 1.03s	remaining: 3m 11s
5:	learn: 0.6633780	total: 1.24s	remaining: 3m 12s
6:	learn: 0.6598701	total: 1.44s	remaining: 3m 11s
7:	learn: 0.6564973	total: 1.65s	remaining: 3m 11s
8:	learn: 0.6534603	total: 1.89s	remaining: 3m 14s
9:	learn: 0.6506242	total: 2.1s	remaining: 3m 13s
10:	learn: 0.6480755	total: 2.3s	remaining: 3m 13s
11:	learn: 0.6457556	total: 2.52s	remaining: 3m 13s
12:	learn: 0.6436336	total: 2.75s	remaining: 3m 14s
13:	learn: 0.6416114	total: 2.96s	remaining: 3m 15s
14:	learn: 0.6396915	total: 3.18s	remaining: 3m 15s
15:	learn: 0.6380700	total: 3.41s	remaining: 3m 15s
16:	learn: 0.6362786	total: 3.61s	remaining: 3m 14s
17:	learn: 0.6348180	total: 3.82s	remaining: 3m 14s
18:	learn: 0.6333265	total: 4.02s	remaining: 3m 13s
19:	learn: 0.6319169	total:

158:	learn: 0.5945679	total: 31.8s	remaining: 2m 35s
159:	learn: 0.5944838	total: 32s	remaining: 2m 35s
160:	learn: 0.5944114	total: 32.2s	remaining: 2m 35s
161:	learn: 0.5943204	total: 32.5s	remaining: 2m 34s
162:	learn: 0.5942501	total: 32.7s	remaining: 2m 34s
163:	learn: 0.5941999	total: 32.8s	remaining: 2m 34s
164:	learn: 0.5941543	total: 33s	remaining: 2m 34s
165:	learn: 0.5940959	total: 33.2s	remaining: 2m 33s
166:	learn: 0.5940155	total: 33.4s	remaining: 2m 33s
167:	learn: 0.5939276	total: 33.6s	remaining: 2m 33s
168:	learn: 0.5938663	total: 33.8s	remaining: 2m 33s
169:	learn: 0.5938200	total: 34s	remaining: 2m 33s
170:	learn: 0.5937609	total: 34.2s	remaining: 2m 32s
171:	learn: 0.5936962	total: 34.4s	remaining: 2m 32s
172:	learn: 0.5935966	total: 34.6s	remaining: 2m 32s
173:	learn: 0.5935457	total: 34.8s	remaining: 2m 32s
174:	learn: 0.5934972	total: 35s	remaining: 2m 32s
175:	learn: 0.5934271	total: 35.2s	remaining: 2m 31s
176:	learn: 0.5933425	total: 35.4s	remaining: 2m 31s
1

314:	learn: 0.5862997	total: 1m 3s	remaining: 2m 4s
315:	learn: 0.5862465	total: 1m 3s	remaining: 2m 4s
316:	learn: 0.5861935	total: 1m 3s	remaining: 2m 4s
317:	learn: 0.5861521	total: 1m 3s	remaining: 2m 4s
318:	learn: 0.5861059	total: 1m 4s	remaining: 2m 3s
319:	learn: 0.5860378	total: 1m 4s	remaining: 2m 3s
320:	learn: 0.5859810	total: 1m 4s	remaining: 2m 3s
321:	learn: 0.5859397	total: 1m 4s	remaining: 2m 3s
322:	learn: 0.5859007	total: 1m 5s	remaining: 2m 3s
323:	learn: 0.5858520	total: 1m 5s	remaining: 2m 3s
324:	learn: 0.5858081	total: 1m 5s	remaining: 2m 2s
325:	learn: 0.5857561	total: 1m 5s	remaining: 2m 2s
326:	learn: 0.5857238	total: 1m 5s	remaining: 2m 2s
327:	learn: 0.5856682	total: 1m 6s	remaining: 2m 2s
328:	learn: 0.5856166	total: 1m 6s	remaining: 2m 2s
329:	learn: 0.5855617	total: 1m 6s	remaining: 2m 1s
330:	learn: 0.5855120	total: 1m 6s	remaining: 2m 1s
331:	learn: 0.5854684	total: 1m 6s	remaining: 2m 1s
332:	learn: 0.5854143	total: 1m 7s	remaining: 2m 1s
333:	learn: 

468:	learn: 0.5797879	total: 1m 34s	remaining: 1m 33s
469:	learn: 0.5797592	total: 1m 34s	remaining: 1m 33s
470:	learn: 0.5797257	total: 1m 34s	remaining: 1m 33s
471:	learn: 0.5796846	total: 1m 34s	remaining: 1m 33s
472:	learn: 0.5796475	total: 1m 35s	remaining: 1m 32s
473:	learn: 0.5796111	total: 1m 35s	remaining: 1m 32s
474:	learn: 0.5795766	total: 1m 35s	remaining: 1m 32s
475:	learn: 0.5795434	total: 1m 35s	remaining: 1m 32s
476:	learn: 0.5795140	total: 1m 35s	remaining: 1m 31s
477:	learn: 0.5794838	total: 1m 35s	remaining: 1m 31s
478:	learn: 0.5794491	total: 1m 36s	remaining: 1m 31s
479:	learn: 0.5794212	total: 1m 36s	remaining: 1m 31s
480:	learn: 0.5793801	total: 1m 36s	remaining: 1m 31s
481:	learn: 0.5793365	total: 1m 36s	remaining: 1m 30s
482:	learn: 0.5792976	total: 1m 36s	remaining: 1m 30s
483:	learn: 0.5792720	total: 1m 37s	remaining: 1m 30s
484:	learn: 0.5792390	total: 1m 37s	remaining: 1m 30s
485:	learn: 0.5792042	total: 1m 37s	remaining: 1m 30s
486:	learn: 0.5791728	total:

622:	learn: 0.5745896	total: 2m 4s	remaining: 1m 2s
623:	learn: 0.5745531	total: 2m 5s	remaining: 1m 2s
624:	learn: 0.5745241	total: 2m 5s	remaining: 1m 2s
625:	learn: 0.5744930	total: 2m 5s	remaining: 1m 1s
626:	learn: 0.5744682	total: 2m 5s	remaining: 1m 1s
627:	learn: 0.5744339	total: 2m 5s	remaining: 1m 1s
628:	learn: 0.5744028	total: 2m 6s	remaining: 1m 1s
629:	learn: 0.5743703	total: 2m 6s	remaining: 1m 1s
630:	learn: 0.5743408	total: 2m 6s	remaining: 1m
631:	learn: 0.5743000	total: 2m 6s	remaining: 1m
632:	learn: 0.5742706	total: 2m 6s	remaining: 1m
633:	learn: 0.5742448	total: 2m 6s	remaining: 1m
634:	learn: 0.5742116	total: 2m 7s	remaining: 1m
635:	learn: 0.5741851	total: 2m 7s	remaining: 59.9s
636:	learn: 0.5741548	total: 2m 7s	remaining: 59.7s
637:	learn: 0.5741224	total: 2m 7s	remaining: 59.5s
638:	learn: 0.5740830	total: 2m 7s	remaining: 59.3s
639:	learn: 0.5740501	total: 2m 8s	remaining: 59.1s
640:	learn: 0.5740180	total: 2m 8s	remaining: 58.9s
641:	learn: 0.5739816	total

779:	learn: 0.5698446	total: 2m 35s	remaining: 31s
780:	learn: 0.5698135	total: 2m 35s	remaining: 30.8s
781:	learn: 0.5697873	total: 2m 36s	remaining: 30.6s
782:	learn: 0.5697562	total: 2m 36s	remaining: 30.4s
783:	learn: 0.5697288	total: 2m 36s	remaining: 30.2s
784:	learn: 0.5696923	total: 2m 36s	remaining: 30s
785:	learn: 0.5696687	total: 2m 36s	remaining: 29.8s
786:	learn: 0.5696408	total: 2m 37s	remaining: 29.6s
787:	learn: 0.5696175	total: 2m 37s	remaining: 29.3s
788:	learn: 0.5695873	total: 2m 37s	remaining: 29.2s
789:	learn: 0.5695591	total: 2m 37s	remaining: 29s
790:	learn: 0.5695312	total: 2m 37s	remaining: 28.8s
791:	learn: 0.5695039	total: 2m 38s	remaining: 28.6s
792:	learn: 0.5694754	total: 2m 38s	remaining: 28.4s
793:	learn: 0.5694424	total: 2m 38s	remaining: 28.2s
794:	learn: 0.5694170	total: 2m 38s	remaining: 28s
795:	learn: 0.5693850	total: 2m 38s	remaining: 27.8s
796:	learn: 0.5693533	total: 2m 39s	remaining: 27.6s
797:	learn: 0.5693169	total: 2m 39s	remaining: 27.4s
7

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.746785 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21992
[LightGBM] [Info] Number of data points in the train set: 2100000, number of used features: 199
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
ROC-AUC: 0.7564616310712784


['final_ensemble_model.pkl']

In [112]:
import joblib
joblib.dump(final_ensemble_model, "final_ensemble_model.pkl")

['final_ensemble_model.pkl']

In [59]:
# Формируем файл с предиктами.
predictions = final_ensemble_model.predict(X_test_scaled)
predictions = pd.DataFrame(predictions, columns=['flag'])
predictions.index.rename('id', inplace=True)
predictions.to_csv("predictions.csv")

Итог ансамбля: 0.7564616310712784

#### Нейросетевой метод с помощью PyTorch GRU RNN

In [30]:
# Реализую модель RNN с GRU слоем
import torch
import torch.nn as nn
import torch.optim as optim
import optuna

class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(SimpleRNN, self).__init__()
        self.rnn = nn.GRU(input_size, hidden_size, batch_first=True, num_layers=2)
        self.fc = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout(0.5)
        self.relu = nn.ReLU()

    def forward(self, x):
        out, _ = self.rnn(x)
        out = out[:, -1, :]
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc(out)
        return out

# Подбираем гиперпараметры с помощью optuna
def objective(trial):

    input_size = X_train_scaled.shape[1]
    hidden_size = trial.suggest_int('hidden_size', 64, 128)
    
    nn_model = SimpleRNN(input_size, hidden_size)
    
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(nn_model.parameters(), lr=trial.suggest_loguniform('nn_learning_rate', 0.01, 0.05))
    
    X_train_tensor = torch.FloatTensor(X_train_scaled.values).unsqueeze(1)
    y_train_tensor = torch.FloatTensor(y_train.values).view(-1, 1)
    X_test_tensor = torch.FloatTensor(X_test_scaled.values).unsqueeze(1)
    
    nn_model.train()
    for epoch in range(20):
        optimizer.zero_grad()
        outputs = nn_model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()
    
    nn_model.eval()
    with torch.no_grad():
        nn_preds = nn_model(X_test_tensor).numpy()
    
    auc = roc_auc_score(y_test, nn_preds.flatten())
    
    return auc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)
print("Лучшие гиперпараметры:", study.best_params)
print("Лучший ROC-AUC:", study.best_value)

[I 2024-09-23 16:44:39,916] A new study created in memory with name: no-name-506cbf81-61dc-4607-acbc-803ab1170483
  optimizer = optim.Adam(nn_model.parameters(), lr=trial.suggest_loguniform('nn_learning_rate', 0.008, 0.042))
[I 2024-09-23 16:48:10,205] Trial 0 finished with value: 0.7038999450795886 and parameters: {'hidden_size': 78, 'nn_learning_rate': 0.030119025598515585}. Best is trial 0 with value: 0.7038999450795886.
  optimizer = optim.Adam(nn_model.parameters(), lr=trial.suggest_loguniform('nn_learning_rate', 0.008, 0.042))
[I 2024-09-23 16:53:51,030] Trial 1 finished with value: 0.7054451351221079 and parameters: {'hidden_size': 128, 'nn_learning_rate': 0.041033332276468173}. Best is trial 1 with value: 0.7054451351221079.
  optimizer = optim.Adam(nn_model.parameters(), lr=trial.suggest_loguniform('nn_learning_rate', 0.008, 0.042))
[I 2024-09-23 16:57:11,624] Trial 2 finished with value: 0.6904154543203619 and parameters: {'hidden_size': 75, 'nn_learning_rate': 0.008992655736

Лучшие гиперпараметры: {'hidden_size': 128, 'nn_learning_rate': 0.041033332276468173}
Лучший ROC-AUC: 0.7054451351221079


Лучшие гиперпараметры: {'hidden_size': 94, 'nn_learning_rate': 0.020327253698045836}
Лучший ROC-AUC: 0.7112043343992882

In [63]:
# Использую лучшие гиперпараметры и обучаю на 100 эпох.
import torch
import torch.nn as nn
import torch.optim as optim

class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(SimpleRNN, self).__init__()
        self.rnn = nn.GRU(input_size, hidden_size, batch_first=True, num_layers=2)
        self.fc = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout(0.5)
        self.relu = nn.ReLU()

    def forward(self, x):
        out, _ = self.rnn(x)
        out = out[:, -1, :]
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc(out)
        return out
    

input_size = X_train_scaled.shape[1]
hidden_size = 128

nn_model = SimpleRNN(input_size, hidden_size)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(nn_model.parameters(), lr=0.020327253698045836)

X_train_tensor = torch.FloatTensor(X_train_scaled.values).unsqueeze(1)
y_train_tensor = torch.FloatTensor(y_train.values).view(-1, 1)
X_test_tensor = torch.FloatTensor(X_test_scaled.values).unsqueeze(1)

nn_model.train()
for epoch in range(100):
    optimizer.zero_grad()
    outputs = nn_model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

nn_model.eval()
with torch.no_grad():
    nn_preds = nn_model(X_test_tensor).numpy()

auc = roc_auc_score(y_test, nn_preds.flatten())
nn_preds_binary = (nn_preds.flatten() > -2.35).astype(int)
f1 = f1_score(y_test, nn_preds_binary)
print(auc, f1)

0.7408235685764727 0.0


In [78]:
np.mean(nn_preds.flatten())

-3.8828955

In [127]:
nn_preds_binary = (nn_preds.flatten() > -2.35).astype(int)

In [128]:
f1_score(y_test, nn_preds_binary)

0.16432441880790072

In [129]:
confusion_matrix(y_test, nn_preds_binary)

array([[794981,  73086],
       [ 22532,   9401]], dtype=int64)

In [130]:
# Сохраняю лучшую модель (ячейка заполнена после проведения всех тестов с моделями)
torch.save(nn_model.state_dict(), 'rnn_gru_model_weights.pth')

In [None]:
# Здесь реализован алгоритм по загрузке модели из файла.
class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(SimpleRNN, self).__init__()
        self.rnn = nn.GRU(input_size, hidden_size, batch_first=True, num_layers=2)
        self.fc = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout(0.5)
        self.relu = nn.ReLU()

    def forward(self, x):
        out, _ = self.rnn(x)
        out = out[:, -1, :]
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc(out)
        return out
nn_model = SimpleRNN(input_size=X_train_scaled.shape[1], hidden_size=128)
nn_model.load_state_dict(torch.load('rnn_gru_model_weights.pth'))
nn_model.eval()

#### Нейросетевой метод с помощью PyTorch RNN

In [35]:
# Реализую простую модель RNN
import torch
import torch.nn as nn
import torch.optim as optim
import optuna

class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(SimpleRNN, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout(0.5)
        self.relu = nn.ReLU()

    def forward(self, x):
        out, _ = self.rnn(x)
        out = out[:, -1, :]
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc(out)
        return out
    
def objective(trial):

    input_size = X_train_scaled.shape[1]
    hidden_size = trial.suggest_int('hidden_size', 100, 128)
    num_layers = 3
    
    nn_model = SimpleRNN(input_size, hidden_size, num_layers)
    
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(nn_model.parameters(), lr=trial.suggest_loguniform('nn_learning_rate', 0.01116, 0.01118))
    
    X_train_tensor = torch.FloatTensor(X_train_scaled.values).unsqueeze(1)
    y_train_tensor = torch.FloatTensor(y_train.values).view(-1, 1)
    X_test_tensor = torch.FloatTensor(X_test_scaled.values).unsqueeze(1)
    
    nn_model.train()
    for epoch in range(20):
        optimizer.zero_grad()
        outputs = nn_model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()
    
    nn_model.eval()
    with torch.no_grad():
        nn_preds = nn_model(X_test_tensor).numpy()
    
    auc = roc_auc_score(y_test, nn_preds.flatten())
    
    return auc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=3)
print("Лучшие гиперпараметры:", study.best_params)
print("Лучший ROC-AUC:", study.best_value)

[I 2024-09-23 10:06:17,080] A new study created in memory with name: no-name-b415bc78-dc84-4019-b67f-aa0040bd443f
  optimizer = optim.Adam(nn_model.parameters(), lr=trial.suggest_loguniform('nn_learning_rate', 0.01116, 0.01118))
[I 2024-09-23 10:07:49,694] Trial 0 finished with value: 0.6961709299120658 and parameters: {'hidden_size': 105, 'nn_learning_rate': 0.011178042097046028}. Best is trial 0 with value: 0.6961709299120658.
  optimizer = optim.Adam(nn_model.parameters(), lr=trial.suggest_loguniform('nn_learning_rate', 0.01116, 0.01118))
[I 2024-09-23 10:09:40,861] Trial 1 finished with value: 0.7028123600531386 and parameters: {'hidden_size': 119, 'nn_learning_rate': 0.011173626980773737}. Best is trial 1 with value: 0.7028123600531386.
  optimizer = optim.Adam(nn_model.parameters(), lr=trial.suggest_loguniform('nn_learning_rate', 0.01116, 0.01118))
[I 2024-09-23 10:11:30,000] Trial 2 finished with value: 0.6865238922652402 and parameters: {'hidden_size': 128, 'nn_learning_rate': 

Лучшие гиперпараметры: {'hidden_size': 119, 'nn_learning_rate': 0.011173626980773737}
Лучший ROC-AUC: 0.7028123600531386


In [20]:
# Использую лучшие гиперпараметры и обучаю на 100 эпох.
import torch
import torch.nn as nn
import torch.optim as optim
import optuna

class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(SimpleRNN, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout(0.5)
        self.relu = nn.ReLU()

    def forward(self, x):
        out, _ = self.rnn(x)
        out = out[:, -1, :]
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc(out)
        return out
    

input_size = X_train_scaled.shape[1]
hidden_size = 128
num_layers = 3

nn_model = SimpleRNN(input_size, hidden_size, num_layers)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(nn_model.parameters(), lr=0.01117498392633228)

X_train_tensor = torch.FloatTensor(X_train_scaled.values).unsqueeze(1)
y_train_tensor = torch.FloatTensor(y_train.values).view(-1, 1)
X_test_tensor = torch.FloatTensor(X_test_scaled.values).unsqueeze(1)

nn_model.train()
for epoch in range(100):
    optimizer.zero_grad()
    outputs = nn_model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

nn_model.eval()
with torch.no_grad():
    nn_preds = nn_model(X_test_tensor).numpy()

auc = roc_auc_score(y_test, nn_preds.flatten())

print(auc)

0.7358548122478354


#### Нейросетевой метод с помощью PyTorch SimpleNN

In [23]:
# Опрелеяю простую нейросеть с 3-мя линейными слоями и Relu активатором c dropout слоем в конце
# Использую гиперпараметры на основе своего опыта.
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import roc_auc_score

class SimpleNN(nn.Module):
    def __init__(self, input_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

input_size = X_train_scaled.shape[1]
nn_model = SimpleNN(input_size)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(nn_model.parameters(), lr=0.0407158632296223)

X_train_tensor = torch.FloatTensor(X_train_scaled.values)
y_train_tensor = torch.FloatTensor(y_train.values).view(-1, 1)
X_test_tensor = torch.FloatTensor(X_test_scaled.values)

best_train_loss = float('inf')
patience = 10
patience_counter = 0

nn_model.train()
for epoch in range(100): 
    optimizer.zero_grad()
    outputs = nn_model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    
    if loss < best_train_loss:
        best_train_loss = loss
        patience_counter = 0
        best_model_state = nn_model.state_dict()
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print(f'Early stopping at epoch {epoch + 1}')
        break

nn_model.load_state_dict(best_model_state)

nn_model.eval()
with torch.no_grad():
    nn_preds = nn_model(X_test_tensor).numpy()

auc = roc_auc_score(y_test, nn_preds.flatten())
print(auc)

0.7341119871851572


Лучший метод это RNN GRU с AUC: 0.7422890378103878 

In [59]:
# Проводим объединение результатов ансамбля и нейросети, подбирая коэффициент объединения.
for i in range(1, 50):
    print(roc_auc_score(y_test, (preds_proba * i + nn_preds.flatten()/i)/2), i)

0.7477950390473449 1
0.7545207456454753 2
0.7570618447760735 3
0.7576040956397523 4
0.7575704787366386 5
0.7574127691190169 6
0.7572528356905486 7
0.7571178125943583 8
0.757009418482262 9
0.7569227506457156 10
0.7568539072786464 11
0.7567986282414352 12
0.7567539488137035 13
0.7567170033371814 14
0.7566867053754359 15
0.7566613465580427 16
0.7566398377068646 17
0.756621594189519 18
0.7566060206954067 19
0.7565925582775863 20
0.7565808568997745 21
0.7565705250032968 22
0.7565615848825387 23
0.7565536159744796 24
0.756546655003528 25
0.7565403230733545 26
0.7565348045275033 27
0.7565297266384798 28
0.7565251866646394 29
0.7565210977733182 30
0.7565174268115387 31
0.7565140796234004 32
0.7565110175725169 33
0.756508153357249 34
0.7565056274538706 35
0.7565032690830593 36
0.7565010674222981 37
0.7564990971469558 38
0.7564972105296719 39
0.7564954804781376 40
0.7564938662636133 41
0.7564923528788747 42
0.7564909626183075 43
0.7564896655396138 44
0.7564884022993241 45
0.7564872769739794 46
0

Итог: Объединение энсембла бустинга с RNN, GRU. + коэффициент 4. AUC_Score = 0.7576040956397523

## Создание финального пайплайна (собственный класс)

In [41]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.utils.class_weight import compute_class_weight
import joblib

class CustomPipeline:
    def __init__(self):
        self.preprocessor = None
        self.final_ensemble_model = None
        self.features_to_scale = []
        self.value_mapping = {
            1: 0,
            2: 1,
            3: 2,
            4: 3
        }
        
    def fit(self, train_data_path, train_target_path):
        # Загрузка данных
        df = pd.read_parquet(train_data_path)
        train_target = pd.read_csv(train_target_path, index_col="id")

        # Преобразование данных
        columns_to_transform = ['enc_paym_11', 'enc_paym_20', 'enc_paym_24']
        for column in columns_to_transform:
            df[column] = df[column].replace(self.value_mapping)

        enc_paym_columns = [f'enc_paym_{i}' for i in range(25)]
        for status in range(4):
            df[f'enc_paym_status_{status}'] = np.sum(df[enc_paym_columns].values == status, axis=1)
        df.drop(enc_paym_columns, axis=1, inplace=True)

        # Дополнительные вычисления
        df["total_overdue_count"] = df[["pre_loans5", "pre_loans530", "pre_loans3060", "pre_loans6090", "pre_loans90"]].sum(axis=1)
        df.drop(["pre_loans5", "pre_loans530", "pre_loans3060", "pre_loans6090", "pre_loans90"], axis=1, inplace=True)
        
        # Создание признаков
        df["has_no_debt_flag"] = df["is_zero_util"] & df["is_zero_over2limit"] & df["is_zero_maxover2limit"]
        df.drop(["is_zero_util", "is_zero_over2limit", "is_zero_maxover2limit"], axis=1, inplace=True)
        df["has_overdue_flag"] = 1 - (df[["is_zero_loans5", "is_zero_loans530", "is_zero_loans3060", "is_zero_loans6090", "is_zero_loans90"]].all(axis=1))
        df.drop(["is_zero_loans5", "is_zero_loans530", "is_zero_loans3060", "is_zero_loans6090", "is_zero_loans90"], axis=1, inplace=True)
        df["term_difference"] = df["pre_pterm"] - df["pre_fterm"]
        df["close_difference"] = df["pre_till_pclose"] - df["pre_till_fclose"]
        df.drop(["pre_fterm", "pre_pterm", "pre_till_fclose", "pre_till_pclose"], axis=1, inplace=True)

        # Обработка числовых признаков
        columns_to_agg = ['pre_since_opened', 'pre_since_confirmed', 'pre_loans_credit_limit', 'pre_loans_next_pay_summ', 'pre_loans_outstanding', 'pre_loans_total_overdue', 'pre_loans_max_overdue_sum', 'pre_loans_credit_cost_rate', 'pre_util', 'pre_over2limit', 'pre_maxover2limit', 'enc_loans_account_holder_type', 'enc_loans_credit_status', 'enc_loans_credit_type', 'enc_loans_account_cur']
        df_with_counts_max_means = self.create_count_columns(df, columns_to_agg)
        del df

        # Группировка и агрегация
        grouped_df = df_with_counts_max_means.groupby('id').agg({
            'has_no_debt_flag': 'median',
            'has_overdue_flag': 'median',
            'pclose_flag': 'median',
            'fclose_flag': 'median',
            **{col: 'sum' for col in df_with_counts_max_means.columns if col.endswith('_count')},
            **{col: 'mean' for col in df_with_counts_max_means.columns if col not in ['has_no_debt_flag', 'has_overdue_flag', 'id', 'fclose_flag', 'pclose_flag']}
        }).reset_index()
        del df_with_counts_max_means
        
        # Объединение с целевой переменной
        grouped_df = grouped_df.merge(train_target, how="left", on="id")
        X = grouped_df.drop(columns=['id', 'flag'])
        y = grouped_df['flag']

        # Препроцессинг
        self.features_to_scale = X.columns.difference(['has_no_debt_flag', 'has_overdue_flag', 'fclose_flag', 'pclose_flag'])
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), self.features_to_scale),
                ('passthrough', 'passthrough', ['has_no_debt_flag', 'has_overdue_flag', 'fclose_flag', 'pclose_flag'])
            ]
        )
        X_scaled = self.preprocessor.fit_transform(X)
        del X
        
        # Обучение моделей
        class_weights = compute_class_weight('balanced', classes=[0, 1], y=y)
        class_weights_dict = {0: class_weights[0], 1: class_weights[1]}

        final_cat_model = CatBoostClassifier(class_weights=class_weights_dict, iterations=935, learning_rate=0.05312609024208735, depth=6, l2_leaf_reg=1)
        final_lgb_model = LGBMClassifier(class_weight='balanced', n_estimators=437, learning_rate=0.04778428283852397, max_depth=10, num_leaves=81)
        final_xgb_model = XGBClassifier(scale_pos_weight=class_weights_dict[1] / class_weights_dict[0], n_estimators=873, learning_rate=0.09607318384069156, max_depth=4, min_child_weight=3, subsample=0.6109521676209237, colsample_bytree=0.73859823191902369)

        self.final_ensemble_model = VotingClassifier(estimators=[
            ('catboost', final_cat_model),
            ('lgbm', final_lgb_model),
            ('xgb', final_xgb_model)
        ], voting='soft')

        self.final_ensemble_model.fit(X_scaled, y)
        joblib.dump(self.final_ensemble_model, "final_ensemble_model.pkl")

    def predict(self, test_data_path):
        test_data = pd.read_parquet(test_data_path)
        
        # Преобразование данных
        columns_to_transform = ['enc_paym_11', 'enc_paym_20', 'enc_paym_24']
        for column in columns_to_transform:
            test_data[column] = test_data[column].replace(self.value_mapping)

        enc_paym_columns = [f'enc_paym_{i}' for i in range(25)]
        for status in range(4):
            test_data[f'enc_paym_status_{status}'] = np.sum(test_data[enc_paym_columns].values == status, axis=1)
        test_data.drop(enc_paym_columns, axis=1, inplace=True)

        # Дополнительные вычисления
        test_data["total_overdue_count"] = test_data[["pre_loans5", "pre_loans530", "pre_loans3060", "pre_loans6090", "pre_loans90"]].sum(axis=1)
        test_data.drop(["pre_loans5", "pre_loans530", "pre_loans3060", "pre_loans6090", "pre_loans90"], axis=1, inplace=True)
        
        # Создание признаков
        test_data["has_no_debt_flag"] = test_data["is_zero_util"] & test_data["is_zero_over2limit"] & test_data["is_zero_maxover2limit"]
        test_data.drop(["is_zero_util", "is_zero_over2limit", "is_zero_maxover2limit"], axis=1, inplace=True)
        test_data["has_overdue_flag"] = 1 - (test_data[["is_zero_loans5", "is_zero_loans530", "is_zero_loans3060", "is_zero_loans6090", "is_zero_loans90"]].all(axis=1))
        test_data.drop(["is_zero_loans5", "is_zero_loans530", "is_zero_loans3060", "is_zero_loans6090", "is_zero_loans90"], axis=1, inplace=True)
        test_data["term_difference"] = test_data["pre_pterm"] - test_data["pre_fterm"]
        test_data["close_difference"] = test_data["pre_till_pclose"] - test_data["pre_till_fclose"]
        test_data.drop(["pre_fterm", "pre_pterm", "pre_till_fclose", "pre_till_pclose"], axis=1, inplace=True)

        # Обработка числовых признаков
        columns_to_agg = ['pre_since_opened', 'pre_since_confirmed', 'pre_loans_credit_limit', 'pre_loans_next_pay_summ', 'pre_loans_outstanding', 'pre_loans_total_overdue', 'pre_loans_max_overdue_sum', 'pre_loans_credit_cost_rate', 'pre_util', 'pre_over2limit', 'pre_maxover2limit', 'enc_loans_account_holder_type', 'enc_loans_credit_status', 'enc_loans_credit_type', 'enc_loans_account_cur']
        df_with_counts_max_means = self.create_count_columns(test_data, columns_to_agg)
        del test_data

        # Группировка и агрегация
        grouped_df = df_with_counts_max_means.groupby('id').agg({
            'has_no_debt_flag': 'median',
            'has_overdue_flag': 'median',
            'pclose_flag': 'median',
            'fclose_flag': 'median',
            **{col: 'sum' for col in df_with_counts_max_means.columns if col.endswith('_count')},
            **{col: 'mean' for col in df_with_counts_max_means.columns if col not in ['has_no_debt_flag', 'has_overdue_flag', 'id', 'fclose_flag', 'pclose_flag']}
        }).reset_index()
        del df_with_counts_max_means
        grouped_df.drop(["id"], axis=1, inplace=True)
        test_scaled = self.preprocessor.transform(grouped_df)
        return self.final_ensemble_model.predict(test_scaled)
    
    def predict_proba(self, test_data_path):
        test_data = pd.read_parquet(test_data_path)
        # Преобразование данных
        columns_to_transform = ['enc_paym_11', 'enc_paym_20', 'enc_paym_24']
        for column in columns_to_transform:
            test_data[column] = test_data[column].replace(self.value_mapping)

        enc_paym_columns = [f'enc_paym_{i}' for i in range(25)]
        for status in range(4):
            test_data[f'enc_paym_status_{status}'] = np.sum(test_data[enc_paym_columns].values == status, axis=1)
        test_data.drop(enc_paym_columns, axis=1, inplace=True)

        # Дополнительные вычисления
        test_data["total_overdue_count"] = test_data[["pre_loans5", "pre_loans530", "pre_loans3060", "pre_loans6090", "pre_loans90"]].sum(axis=1)
        test_data.drop(["pre_loans5", "pre_loans530", "pre_loans3060", "pre_loans6090", "pre_loans90"], axis=1, inplace=True)
        
        # Создание признаков
        test_data["has_no_debt_flag"] = test_data["is_zero_util"] & test_data["is_zero_over2limit"] & test_data["is_zero_maxover2limit"]
        test_data.drop(["is_zero_util", "is_zero_over2limit", "is_zero_maxover2limit"], axis=1, inplace=True)
        test_data["has_overdue_flag"] = 1 - (test_data[["is_zero_loans5", "is_zero_loans530", "is_zero_loans3060", "is_zero_loans6090", "is_zero_loans90"]].all(axis=1))
        test_data.drop(["is_zero_loans5", "is_zero_loans530", "is_zero_loans3060", "is_zero_loans6090", "is_zero_loans90"], axis=1, inplace=True)
        test_data["term_difference"] = test_data["pre_pterm"] - test_data["pre_fterm"]
        test_data["close_difference"] = test_data["pre_till_pclose"] - test_data["pre_till_fclose"]
        test_data.drop(["pre_fterm", "pre_pterm", "pre_till_fclose", "pre_till_pclose"], axis=1, inplace=True)

        # Обработка числовых признаков
        columns_to_agg = ['pre_since_opened', 'pre_since_confirmed', 'pre_loans_credit_limit', 'pre_loans_next_pay_summ', 'pre_loans_outstanding', 'pre_loans_total_overdue', 'pre_loans_max_overdue_sum', 'pre_loans_credit_cost_rate', 'pre_util', 'pre_over2limit', 'pre_maxover2limit', 'enc_loans_account_holder_type', 'enc_loans_credit_status', 'enc_loans_credit_type', 'enc_loans_account_cur']
        df_with_counts_max_means = self.create_count_columns(test_data, columns_to_agg)
        del test_data

        # Группировка и агрегация
        grouped_df = df_with_counts_max_means.groupby('id').agg({
            'has_no_debt_flag': 'median',
            'has_overdue_flag': 'median',
            'pclose_flag': 'median',
            'fclose_flag': 'median',
            **{col: 'sum' for col in df_with_counts_max_means.columns if col.endswith('_count')},
            **{col: 'mean' for col in df_with_counts_max_means.columns if col not in ['has_no_debt_flag', 'has_overdue_flag', 'id', 'fclose_flag', 'pclose_flag']}
        }).reset_index()
        del df_with_counts_max_means
        grouped_df.drop(["id"], axis=1, inplace=True)
        test_scaled = self.preprocessor.transform(grouped_df)
        return self.final_ensemble_model.predict_proba(test_scaled)

    def save_model(self, filepath):
        with open(filepath, 'wb') as file:
            pickle.dump(self, file)

    @classmethod
    def load_model(cls, filepath):
        with open(filepath, 'rb') as file:
            return pickle.load(file)
        
    def create_count_columns(self, df, columns_to_count):
        for column in columns_to_count:
            if column in df.columns and pd.api.types.is_numeric_dtype(df[column]):
                unique_values = df[column].unique()
                for value in unique_values:
                    df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
        return df

In [43]:
pipeline = CustomPipeline()
pipeline.fit('train_data', 'train_target.csv')
pipeline.save_model('custom_pipeline_model.pkl')

KeyboardInterrupt: 

## Пайплайн Sklearn

In [104]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

def preprocess_data(X):
    columns_to_transform = ['enc_paym_11', 'enc_paym_20', 'enc_paym_24']
    for column in columns_to_transform:
        X[column] = X[column].replace(value_mapping)

    enc_paym_columns = [f'enc_paym_{i}' for i in range(25)]
    for status in range(4):
        X[f'enc_paym_status_{status}'] = np.sum(X[enc_paym_columns].values == status, axis=1)
    X.drop(enc_paym_columns, axis=1, inplace=True)

    # Дополнительные вычисления
    X["total_overdue_count"] = X[["pre_loans5", "pre_loans530", "pre_loans3060", "pre_loans6090", "pre_loans90"]].sum(axis=1)
    X.drop(["pre_loans5", "pre_loans530", "pre_loans3060", "pre_loans6090", "pre_loans90"], axis=1, inplace=True)

    # Создание признаков
    X["has_no_debt_flag"] = X["is_zero_util"] & X["is_zero_over2limit"] & X["is_zero_maxover2limit"]
    X.drop(["is_zero_util", "is_zero_over2limit", "is_zero_maxover2limit"], axis=1, inplace=True)
    X["has_overdue_flag"] = 1 - (X[["is_zero_loans5", "is_zero_loans530", "is_zero_loans3060", "is_zero_loans6090", "is_zero_loans90"]].all(axis=1))
    X.drop(["is_zero_loans5", "is_zero_loans530", "is_zero_loans3060", "is_zero_loans6090", "is_zero_loans90"], axis=1, inplace=True)
    X["term_difference"] = X["pre_pterm"] - X["pre_fterm"]
    X["close_difference"] = X["pre_till_pclose"] - X["pre_till_fclose"]
    X.drop(["pre_fterm", "pre_pterm", "pre_till_fclose", "pre_till_pclose"], axis=1, inplace=True)
    def create_count_columns(df, columns_to_count):
        for column in columns_to_count:
            if column in df.columns and pd.api.types.is_numeric_dtype(df[column]):
                unique_values = df[column].unique()
                for value in unique_values:
                    df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
        return df
    # Обработка числовых признаков
    columns_to_agg = ['pre_since_opened', 'pre_since_confirmed', 'pre_loans_credit_limit', 'pre_loans_next_pay_summ', 'pre_loans_outstanding', 'pre_loans_total_overdue', 'pre_loans_max_overdue_sum', 'pre_loans_credit_cost_rate', 'pre_util', 'pre_over2limit', 'pre_maxover2limit', 'enc_loans_account_holder_type', 'enc_loans_credit_status', 'enc_loans_credit_type', 'enc_loans_account_cur']
    X = create_count_columns(X, columns_to_agg)

    # Группировка и агрегация
    X = X.groupby('id').agg({
        'has_no_debt_flag': 'median',
        'has_overdue_flag': 'median',
        'pclose_flag': 'median',
        'fclose_flag': 'median',
        **{col: 'sum' for col in X.columns if col.endswith('_count')},
        **{col: 'mean' for col in X.columns if col not in ['has_no_debt_flag', 'has_overdue_flag', 'id', 'fclose_flag', 'pclose_flag']}
    }).reset_index()
    X.drop(["id"], axis=1, inplace=True)
    return X


preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), features_to_scale),
        ('passthrough', 'passthrough', ['has_no_debt_flag', 'has_overdue_flag', 'fclose_flag', 'pclose_flag'])
    ]
)



pipeline = Pipeline(steps=[
    ('preprocess', FunctionTransformer(preprocess_data, validate=False)),
    ('scaler', preprocessor),
    ('classifier', VotingClassifier(estimators=[
        ('catboost', CatBoostClassifier(class_weights=class_weights_dict, iterations=935, learning_rate=0.05312609024208735, depth=6, l2_leaf_reg=1)),
        ('xgb', XGBClassifier(scale_pos_weight=class_weights_dict[1] / class_weights_dict[0], n_estimators=873, learning_rate=0.09607318384069156, max_depth=4, min_child_weight=3, subsample=0.6109521676209237, colsample_bytree=0.73859823191902369)),
        ('lgbm', LGBMClassifier(class_weight='balanced', n_estimators=437, learning_rate=0.04778428283852397, max_depth=10, num_leaves=81))
    ], voting='soft'))
])

X = pd.read_parquet("train_data")
y = pd.read_csv("train_target.csv", index_col="id")

pipeline.fit(X, y["flag"])

  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] ==

  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] ==

  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] ==

  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] == value).astype(int)
  df[f'{column}_{value}_count'] = (df[column] ==

0:	learn: 0.6869567	total: 300ms	remaining: 4m 40s
1:	learn: 0.6812104	total: 649ms	remaining: 5m 2s
2:	learn: 0.6761697	total: 975ms	remaining: 5m 2s
3:	learn: 0.6716120	total: 1.28s	remaining: 4m 59s
4:	learn: 0.6673141	total: 1.6s	remaining: 4m 57s
5:	learn: 0.6635073	total: 1.93s	remaining: 4m 58s
6:	learn: 0.6599179	total: 2.22s	remaining: 4m 54s
7:	learn: 0.6567497	total: 2.53s	remaining: 4m 53s
8:	learn: 0.6538219	total: 2.87s	remaining: 4m 55s
9:	learn: 0.6512200	total: 3.21s	remaining: 4m 56s
10:	learn: 0.6487638	total: 3.53s	remaining: 4m 56s
11:	learn: 0.6464888	total: 3.88s	remaining: 4m 58s
12:	learn: 0.6442096	total: 4.21s	remaining: 4m 58s
13:	learn: 0.6421736	total: 4.52s	remaining: 4m 57s
14:	learn: 0.6404621	total: 4.86s	remaining: 4m 58s
15:	learn: 0.6387424	total: 5.17s	remaining: 4m 56s
16:	learn: 0.6370087	total: 5.48s	remaining: 4m 55s
17:	learn: 0.6355151	total: 5.83s	remaining: 4m 57s
18:	learn: 0.6340831	total: 6.15s	remaining: 4m 56s
19:	learn: 0.6328350	tota

158:	learn: 0.5953091	total: 47.7s	remaining: 3m 52s
159:	learn: 0.5952530	total: 48s	remaining: 3m 52s
160:	learn: 0.5951606	total: 48.4s	remaining: 3m 52s
161:	learn: 0.5951038	total: 48.6s	remaining: 3m 52s
162:	learn: 0.5950285	total: 48.9s	remaining: 3m 51s
163:	learn: 0.5949856	total: 49.2s	remaining: 3m 51s
164:	learn: 0.5949047	total: 49.5s	remaining: 3m 51s
165:	learn: 0.5947706	total: 49.8s	remaining: 3m 50s
166:	learn: 0.5947025	total: 50.1s	remaining: 3m 50s
167:	learn: 0.5946201	total: 50.4s	remaining: 3m 49s
168:	learn: 0.5945583	total: 50.7s	remaining: 3m 49s
169:	learn: 0.5944985	total: 50.9s	remaining: 3m 49s
170:	learn: 0.5944197	total: 51.3s	remaining: 3m 49s
171:	learn: 0.5943501	total: 51.6s	remaining: 3m 48s
172:	learn: 0.5942795	total: 51.9s	remaining: 3m 48s
173:	learn: 0.5942402	total: 52.1s	remaining: 3m 48s
174:	learn: 0.5941756	total: 52.5s	remaining: 3m 47s
175:	learn: 0.5941169	total: 52.8s	remaining: 3m 47s
176:	learn: 0.5940713	total: 53.1s	remaining: 3m

312:	learn: 0.5878893	total: 1m 33s	remaining: 3m 5s
313:	learn: 0.5878420	total: 1m 33s	remaining: 3m 5s
314:	learn: 0.5877953	total: 1m 34s	remaining: 3m 5s
315:	learn: 0.5877440	total: 1m 34s	remaining: 3m 5s
316:	learn: 0.5877057	total: 1m 34s	remaining: 3m 4s
317:	learn: 0.5876644	total: 1m 35s	remaining: 3m 4s
318:	learn: 0.5876280	total: 1m 35s	remaining: 3m 4s
319:	learn: 0.5875819	total: 1m 35s	remaining: 3m 3s
320:	learn: 0.5875407	total: 1m 35s	remaining: 3m 3s
321:	learn: 0.5874986	total: 1m 36s	remaining: 3m 3s
322:	learn: 0.5874652	total: 1m 36s	remaining: 3m 2s
323:	learn: 0.5874204	total: 1m 36s	remaining: 3m 2s
324:	learn: 0.5873625	total: 1m 37s	remaining: 3m 2s
325:	learn: 0.5873238	total: 1m 37s	remaining: 3m 2s
326:	learn: 0.5872858	total: 1m 37s	remaining: 3m 1s
327:	learn: 0.5872523	total: 1m 38s	remaining: 3m 1s
328:	learn: 0.5872129	total: 1m 38s	remaining: 3m 1s
329:	learn: 0.5871549	total: 1m 38s	remaining: 3m
330:	learn: 0.5871188	total: 1m 38s	remaining: 3m

466:	learn: 0.5824566	total: 2m 19s	remaining: 2m 19s
467:	learn: 0.5824165	total: 2m 19s	remaining: 2m 19s
468:	learn: 0.5823906	total: 2m 19s	remaining: 2m 18s
469:	learn: 0.5823666	total: 2m 20s	remaining: 2m 18s
470:	learn: 0.5823333	total: 2m 20s	remaining: 2m 18s
471:	learn: 0.5823074	total: 2m 20s	remaining: 2m 18s
472:	learn: 0.5822797	total: 2m 21s	remaining: 2m 17s
473:	learn: 0.5822409	total: 2m 21s	remaining: 2m 17s
474:	learn: 0.5822196	total: 2m 21s	remaining: 2m 17s
475:	learn: 0.5821921	total: 2m 22s	remaining: 2m 16s
476:	learn: 0.5821552	total: 2m 22s	remaining: 2m 16s
477:	learn: 0.5821285	total: 2m 22s	remaining: 2m 16s
478:	learn: 0.5820935	total: 2m 22s	remaining: 2m 16s
479:	learn: 0.5820640	total: 2m 23s	remaining: 2m 15s
480:	learn: 0.5820350	total: 2m 23s	remaining: 2m 15s
481:	learn: 0.5820117	total: 2m 23s	remaining: 2m 15s
482:	learn: 0.5819847	total: 2m 24s	remaining: 2m 14s
483:	learn: 0.5819542	total: 2m 24s	remaining: 2m 14s
484:	learn: 0.5819275	total:

620:	learn: 0.5782964	total: 3m 5s	remaining: 1m 33s
621:	learn: 0.5782735	total: 3m 5s	remaining: 1m 33s
622:	learn: 0.5782467	total: 3m 6s	remaining: 1m 33s
623:	learn: 0.5782222	total: 3m 6s	remaining: 1m 32s
624:	learn: 0.5781968	total: 3m 6s	remaining: 1m 32s
625:	learn: 0.5781721	total: 3m 6s	remaining: 1m 32s
626:	learn: 0.5781518	total: 3m 7s	remaining: 1m 31s
627:	learn: 0.5781288	total: 3m 7s	remaining: 1m 31s
628:	learn: 0.5781013	total: 3m 7s	remaining: 1m 31s
629:	learn: 0.5780765	total: 3m 8s	remaining: 1m 31s
630:	learn: 0.5780549	total: 3m 8s	remaining: 1m 30s
631:	learn: 0.5780307	total: 3m 8s	remaining: 1m 30s
632:	learn: 0.5780065	total: 3m 8s	remaining: 1m 30s
633:	learn: 0.5779829	total: 3m 9s	remaining: 1m 29s
634:	learn: 0.5779568	total: 3m 9s	remaining: 1m 29s
635:	learn: 0.5779280	total: 3m 9s	remaining: 1m 29s
636:	learn: 0.5779061	total: 3m 10s	remaining: 1m 29s
637:	learn: 0.5778859	total: 3m 10s	remaining: 1m 28s
638:	learn: 0.5778587	total: 3m 10s	remainin

774:	learn: 0.5746800	total: 3m 50s	remaining: 47.6s
775:	learn: 0.5746533	total: 3m 50s	remaining: 47.3s
776:	learn: 0.5746282	total: 3m 51s	remaining: 47s
777:	learn: 0.5746079	total: 3m 51s	remaining: 46.7s
778:	learn: 0.5745858	total: 3m 51s	remaining: 46.4s
779:	learn: 0.5745630	total: 3m 52s	remaining: 46.1s
780:	learn: 0.5745442	total: 3m 52s	remaining: 45.8s
781:	learn: 0.5745240	total: 3m 52s	remaining: 45.5s
782:	learn: 0.5745070	total: 3m 52s	remaining: 45.2s
783:	learn: 0.5744849	total: 3m 53s	remaining: 44.9s
784:	learn: 0.5744664	total: 3m 53s	remaining: 44.6s
785:	learn: 0.5744476	total: 3m 53s	remaining: 44.3s
786:	learn: 0.5744302	total: 3m 54s	remaining: 44s
787:	learn: 0.5744087	total: 3m 54s	remaining: 43.7s
788:	learn: 0.5743862	total: 3m 54s	remaining: 43.4s
789:	learn: 0.5743622	total: 3m 54s	remaining: 43.1s
790:	learn: 0.5743489	total: 3m 55s	remaining: 42.8s
791:	learn: 0.5743313	total: 3m 55s	remaining: 42.5s
792:	learn: 0.5743078	total: 3m 55s	remaining: 42.

930:	learn: 0.5713728	total: 4m 36s	remaining: 1.19s
931:	learn: 0.5713509	total: 4m 36s	remaining: 890ms
932:	learn: 0.5713278	total: 4m 36s	remaining: 594ms
933:	learn: 0.5713124	total: 4m 37s	remaining: 297ms
934:	learn: 0.5712925	total: 4m 37s	remaining: 0us
[LightGBM] [Info] Number of positive: 106442, number of negative: 2893558
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.931119 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21993
[LightGBM] [Info] Number of data points in the train set: 3000000, number of used features: 199
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


In [105]:
import joblib

joblib.dump(pipeline, 'model_pipeline.pkl')

['model_pipeline.pkl']