# School of Quants hackathon 2025 – Finals

## Импорты и настройки

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score
from catboost import CatBoostClassifier, Pool
# ЯЧЕЙКА 8 — GRU по risk-последовательности
import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("DEVICE:", DEVICE)

DEVICE: cpu


## Данные

In [21]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [96]:
X_train = pd.read_csv('res_2/X_train.csv')
y_train = pd.read_csv('res_2/y_train.csv')
X_test = pd.read_csv('res_2/X_test.csv')

In [11]:
df = X_train.copy()
df = df.sort_values(by='id').set_index('id')

In [12]:
df['target'] = y_train.set_index('id')['flag']

In [8]:
df.columns

Index(['credit_number_for_user', 'days_since_confirmed', 'maturity_plan',
       'maturity_fact', 'credit_limit', 'next_payment_sum', 'sum_left_to_pay',
       'current_overdue_debt', 'max_overdue_debt', 'full_credit_cost',
       'overdues_5d', 'overdues_5d_30d', 'overdues_30d_60d',
       'overdues_60d_90d', 'overdues_90d', 'no_overdues_5d',
       'no_overdues_5d_30d', 'no_overdues_30d_60d', 'no_overdues_60d_90d',
       'no_overdues_90d', 'enc_paym_0', 'enc_paym_1', 'enc_paym_2',
       'enc_paym_3', 'enc_paym_4', 'enc_paym_5', 'enc_paym_6', 'enc_paym_7',
       'enc_paym_8', 'enc_paym_9', 'enc_paym_10', 'enc_paym_11', 'enc_paym_12',
       'enc_paym_13', 'enc_paym_14', 'enc_paym_15', 'enc_paym_16',
       'enc_paym_17', 'enc_paym_18', 'enc_paym_19', 'enc_paym_20',
       'enc_paym_21', 'enc_paym_22', 'enc_paym_23', 'enc_paym_24',
       'credit_type', 'credit_currency', 'target'],
      dtype='object')

## Исследование данных и анализ признаков

### enc_paym_

#### Анализ

Надо однозначно понять, что за статусы платежей представлены, вот динамика платежей по месяцам (`enc_paym_{0..N}` - Статусы ежемесячных платежей за последние N месяцев***):

In [9]:
all_data = []
for month in range(0, 25):
    payment_counts = X_train.groupby(f'enc_paym_{month}')['id'].count()
    month_data = {
        'month': month,
        'status_0': payment_counts.get(0, 0),
        'status_1': payment_counts.get(1, 0),
        'status_2': payment_counts.get(2, 0),
        'status_3': payment_counts.get(3, 0)
    }
    all_data.append(month_data)
result_df = pd.DataFrame(all_data)
result_df

Unnamed: 0,month,status_0,status_1,status_2,status_3
0,0,1695569,53323,3950,74562
1,1,1528964,103377,8043,187020
2,2,1447139,92128,7369,280768
3,3,1374843,87819,7042,357700
4,4,1302740,82808,6396,435460
5,5,1227907,76923,5967,516607
6,6,1142545,70554,5145,609160
7,7,1050430,65498,4804,706672
8,8,988710,60230,4347,774117
9,9,927778,55380,3935,840311


Можно заметить, что 11, 20, 24ый месяцы отсутствуте значение status_0, но это ОБМАН, по динамике как раз видно, что данные сдвинуты и не хвататет как раз `status_3`. Посмотрим на тестовые данные:

In [10]:
all_data = []
for month in range(0, 25):
    payment_counts = X_train.groupby(f'enc_paym_{month}')['id'].count()

    month_data = {
        'month': month,
        'status_0': payment_counts.get(0, 0),
        'status_1': payment_counts.get(1, 0),
        'status_2': payment_counts.get(2, 0),
        'status_3': payment_counts.get(3, 0)
    }
    all_data.append(month_data)
result_df = pd.DataFrame(all_data)
result_df

Unnamed: 0,month,status_0,status_1,status_2,status_3
0,0,1695569,53323,3950,74562
1,1,1528964,103377,8043,187020
2,2,1447139,92128,7369,280768
3,3,1374843,87819,7042,357700
4,4,1302740,82808,6396,435460
5,5,1227907,76923,5967,516607
6,6,1142545,70554,5145,609160
7,7,1050430,65498,4804,706672
8,8,988710,60230,4347,774117
9,9,927778,55380,3935,840311


In [11]:
all_data = []
for month in range(0, 25):
    payment_counts = X_train.groupby(f'enc_paym_{month}')['id'].count()

    month_data = {
        'month': month,
        'status_0': payment_counts.get(0, 0),
        'status_1': payment_counts.get(1, 0),
        'status_2': payment_counts.get(2, 0),
        'status_3': payment_counts.get(3, 0)
    }
    all_data.append(month_data)
result_df = pd.DataFrame(all_data)
result_df

Unnamed: 0,month,status_0,status_1,status_2,status_3
0,0,1695569,53323,3950,74562
1,1,1528964,103377,8043,187020
2,2,1447139,92128,7369,280768
3,3,1374843,87819,7042,357700
4,4,1302740,82808,6396,435460
5,5,1227907,76923,5967,516607
6,6,1142545,70554,5145,609160
7,7,1050430,65498,4804,706672
8,8,988710,60230,4347,774117
9,9,927778,55380,3935,840311


In [12]:
# Сдвигаем определённые месяца
map_enc_paym_ = {1:0, 2:1, 3:2, 4:3}

df['enc_paym_11'] = df['enc_paym_11'].map(map_enc_paym_)
df['enc_paym_20'] = df['enc_paym_20'].map(map_enc_paym_)
df['enc_paym_24'] = df['enc_paym_24'].map(map_enc_paym_)

X_test['enc_paym_11'] = X_test['enc_paym_11'].map(map_enc_paym_)
X_test['enc_paym_20'] = X_test['enc_paym_20'].map(map_enc_paym_)
X_test['enc_paym_24'] = X_test['enc_paym_24'].map(map_enc_paym_)

Так всё же, что это за статусы? Если исходить из простой банковской логики и посчитать распределение дефолтов согласно статусам:
|N|default_rate|
|----------|----------|
|2 |0.125000 |
|1| 0.051819 |
|3 |0.034540 |
|0| 0.028840 |


- **Статус 0**: Своевременный платёж (timely paid). Доминирует в недавних месяцах (382k в месяц 1, ~80% активных кредитов), корреляция с target=False (не-дефолт) выше. Это "заплочено вовремя".
- **Статус 3**: Лёгкая/средняя просрочка (past due 1–60 дней). Редкий (25k–6k, ~5–1%), чаще у дефолтеров (target=True, до 25% в примерах), коррелирует с overdues_5d_30d. Это "просрочено" на ранних стадиях.
- **Статус 1**: Тяжёлая просрочка (past due 60+ дней или charge-off). Очень редкий (<0.5%, 2k–0.5k), почти только у дефолтеров, коррелирует с overdues_60d_90d/overdues_90d. Это "сильно просрочено".
- **Статус 2**: Это очень серьёзный маркер дефолта.

На основе датасета, идея - создать метрики, учитывающие:
- Количество и тяжесть просрочек (1, 2) в последних месяцах.
- Последовательности (streaks) или переходы (например, 0→1→2 хуже, чем 1→0).
- Веса для recency: недавние месяцы (enc_paym_0–2) важнее, чем enc_paym_3–5.

#### Сложные признаки

Заменяем случайные веса на осмысленные, которые увеличиваются с увеличением риска:

In [13]:
enc_paym_corrected = {
    0: 0,  #  → 0 (лучшая ситуация)
    3: 1,  #  → 1 (средний риск)
    1: 2,  #  → 2 (высокий риск)
    2: 3   #  → 3 (максимальный риск)
}
for month in range(0, 25):
    df[f'enc_paym_{month}'] = df[f'enc_paym_{month}'].map(enc_paym_corrected)
    X_test[f'enc_paym_{month}'] = X_test[f'enc_paym_{month}'].map(enc_paym_corrected)

#### Взвешенная сумма статусов (дисконтированная по времени)

$$S=s_0​⋅1+s_1​⋅0.9+s_2​⋅0.8+…$$

Смысл:
- чем ближе к текущему моменту плохие статусы, тем сильнее они влияют;
- штрафы за «старые» косяки постепенно уменьшаются.
Это очень информативный признак, потому что модель получит сглаженный показатель уровня риска клиента с учётом динамики.

In [14]:
def weighted_status_score(df, w):
    df = df.copy()
    df['weighted_status_score'] = 0
    for month in range(0, 25):
        df['weighted_status_score'] += df[f'enc_paym_{month}'] * w[month]
    return df

In [15]:
df = weighted_status_score(df, np.flip((np.arange(25) * 1/25)))

In [16]:
X_test = weighted_status_score(X_test, np.flip((np.arange(25) * 1/25)))

#### Максимальная длина подряд идущих «плохих» статусов

Смысл:
- Если клиент допустил разовую просрочку, это не так страшно.
- Если у него 3–4 месяца подряд delinquent или defaulted, это почти всегда дефолтный сценарий.
Такой признак даёт сильный сигнал модели, потому что «устойчивость» плохого поведения предсказывает вероятность дефолта лучше, чем единичные выбросы.

In [17]:
def longest_run_bad_status(df, bad_statuses={2, 3}):
    df = df.copy()
    df['longest_run_bad_status'] = 0
    df['maxi'] = 0
    for month in tqdm(range(0, 25)):
        mask = df['maxi'] > df['longest_run_bad_status']
        df.loc[mask, 'longest_run_bad_status'] = df.loc[mask, 'maxi']

        # Обновляем maxi для всех строк
        df['maxi'] = df.apply(
            lambda row: row['maxi'] + 1 if row[f'enc_paym_{month}'] in bad_statuses else 0,
            axis=1
        )
    mask = df['maxi'] > df['longest_run_bad_status']
    df.loc[mask, 'longest_run_bad_status'] = df.loc[mask, 'maxi']
    return df['longest_run_bad_status']

In [19]:
df['longest_run_bad_status'] = longest_run_bad_status(df)

100%|██████████| 25/25 [05:48<00:00, 13.93s/it]


In [20]:
X_test['longest_run_bad_status'] = longest_run_bad_status(X_test)

100%|██████████| 25/25 [01:17<00:00,  3.11s/it]


#### Количество переходов «ухудшения положения»

Cчитается сколько раз статус ухудшился в истории:
- 0 → 1
- 1 → 2
- 2 → 3

Это отражает «тренд»: клиент движется к дефолту или колеблется.
Если много ухудшений подряд → клиент явно уходит в риск-зону.

In [21]:
def count_deterioration_transitions(df):
    df = df.copy()
    df['deterioration_count'] = 0
    for month in range(1, 25):
        current_col = f'enc_paym_{month}'
        prev_col = f'enc_paym_{month-1}'

        # Определяем условия ухудшения
        condition_0_to_1 = (df[prev_col] == 0) & (df[current_col] == 1)
        condition_1_to_2 = (df[prev_col] == 1) & (df[current_col] == 2)
        condition_2_to_3 = (df[prev_col] == 2) & (df[current_col] == 3)
        # Суммируем все случаи ухудшения
        deterioration_mask = condition_0_to_1 | condition_1_to_2 | condition_2_to_3
        df.loc[deterioration_mask, 'deterioration_count'] += 1
    return df['deterioration_count']

In [22]:
df['deterioration_count'] = count_deterioration_transitions(df)

In [23]:
X_test['deterioration_count'] = count_deterioration_transitions(X_test)

### credit_number_for_user

`credit_number_for_user` - Порядковый номер кредитного продукта в кредитной истории. Большему номеру соответствует продукт с более поздней датой открытия.

Этот признак и так хорош, он отражает насколько можно доверять клиенту, ведь, если он брал кредиты ранее и возвращал, то у него хорошая кредитная история и выше шанс отдать деньги.

### days_since_confirmed

`days_since_confirmed` - Дней с даты подтверждения информации по кредиту до даты сбора данных*

Этот признак я вообше не понимаю, как он связан с дефолтом?

### maturity_fact

`maturity_fact` - Фактическое количество дней с даты открытия кредита до даты закрытия*
Само по себе знание сколько дней длился кредит важно - чем больше срок кредита, тем выше вероятность дефолта (невыплаты). И наоборот досрочное погашение это круто

### maturity_plan

`maturity_plan` - Плановое количество дней с даты открытия кредита до даты закрытия*

Само по себе знание сколько дней должен был длился кредит не важно, но понимание того, насколько был превышен ожидаемый срок:

Если maturity_ratio больше 1, то срок превышен, если меньше 1, то было досрочное погашение!

In [24]:
def maturity_ratio(maturity_plan, maturity_fact):
    return maturity_plan / (maturity_fact + 1)

In [25]:
df['maturity_ratio'] = maturity_ratio(df['maturity_plan'], df['maturity_fact'])

In [26]:
X_test['maturity_ratio'] = maturity_ratio(X_test['maturity_plan'], X_test['maturity_fact'])

### credit_limit

`credit_limit` - Кредитный лимит. Уже очень крутой признак. Показывает уровень доверия к клиенту.

### next_payment_sum

`next_payment_sum` - Сумма следующего платежа по кредиту*. Уже хороший признак. Но его надо превратить в безразмерный:

In [27]:
def next_payment_sum_ratio(next_payment_sum, full_credit_cost):
    return next_payment_sum / (full_credit_cost + 1)

In [28]:
df['next_payment_sum_ratio'] = next_payment_sum_ratio(df['next_payment_sum'], df['full_credit_cost'])

In [29]:
X_test['next_payment_sum_ratio'] = next_payment_sum_ratio(X_test['next_payment_sum'], X_test['full_credit_cost'])

### sum_left_to_pay

`sum_left_to_pay` - Оставшаяся невыплаченная сумма кредита*. Уже хороший признак. Но его надо превратить в безразмерный:

In [30]:
def sum_left_to_pay_progress(sum_left_to_pay, full_credit_cost):
    return 1 - (sum_left_to_pay / (full_credit_cost + 1))

In [31]:
df['sum_left_to_pay_progress'] = sum_left_to_pay_progress(df['sum_left_to_pay'], df['full_credit_cost'])

In [32]:
X_test['sum_left_to_pay_progress'] = sum_left_to_pay_progress(X_test['sum_left_to_pay'], X_test['full_credit_cost'])

### current_overdue_debt

`current_overdue_debt` - Текущая просроченная задолженность*. Абсолютно пустой признак - не нужен

### max_overdue_debt

`max_overdue_debt` - Максимальная просроченная задолженность*. Можно сделать относительной величиной

In [33]:
def max_overdue_debt_ratio(max_overdue_debt, full_credit_cost):
    return max_overdue_debt / (full_credit_cost + 1)

In [34]:
df['max_overdue_debt_ratio'] = max_overdue_debt_ratio(df['max_overdue_debt'], df['full_credit_cost'])

In [35]:
X_test['max_overdue_debt_ratio'] = max_overdue_debt_ratio(X_test['max_overdue_debt'], X_test['full_credit_cost'])

### full_credit_cost

`full_credit_cost` - Полная стоимость кредита*. Можно сделать относительной от общего кредитного лимита

In [36]:
def full_credit_cost_ef_rate(full_credit_cost, credit_limit):
    return (full_credit_cost / (credit_limit + 1) - 1)

In [37]:
df['full_credit_cost_ef_rate'] = full_credit_cost_ef_rate(df['full_credit_cost'], df['credit_limit'])

In [38]:
X_test['full_credit_cost_ef_rate'] = full_credit_cost_ef_rate(X_test['full_credit_cost'], X_test['credit_limit'])

### overdues_Xd_Yd

`overdues_Xd_Yd` - Число просрочек сроком между X дней и Y дней (либо менее 5 дней/более 90 дней)*

In [39]:
def total_overdues(df):
    df = df.copy()
    df["total_overdues"] = (
        df["overdues_5d"] + df["overdues_5d_30d"] + df["overdues_30d_60d"] +
        df["overdues_60d_90d"] + df["overdues_90d"]
    )
    return df

In [40]:
def has_overdue(df):
    df = df.copy()
    df["has_long_overdue"] = (df["overdues_60d_90d"] > 0) | (df["overdues_90d"] > 0)
    df["has_mid_overdue"] = (df["overdues_30d_60d"] > 0).astype(int)
    return df

In [41]:
def overdue_severity_score(df):
    df = df.copy()
    df["overdue_severity_score"] = (
        1*df["overdues_5d"] +
        2*df["overdues_5d_30d"] +
        4*df["overdues_30d_60d"] +
        6*df["overdues_60d_90d"] +
        10*df["overdues_90d"]
    )
    return df

In [42]:
def overdue_ratio(df):
    df = df.copy()
    df["overdue_ratio"] = df["total_overdues"] / (df["credit_number_for_user"] + 1)
    return df

In [43]:
df = total_overdues(df)
df = has_overdue(df)
df = overdue_severity_score(df)
df = overdue_ratio(df)

In [44]:
X_test = total_overdues(X_test)
X_test = has_overdue(X_test)
X_test = overdue_severity_score(X_test)
X_test = overdue_ratio(X_test)

### no_overdues_Xd_Yd

`no_overdues_Xd_Yd` - нет просрочек сроком между X дней и Y дней (либо менее 5 дней/более 90 дней)

In [49]:
def has_clean_history(df):
    df = df.copy()
    df["has_clean_history"] = (
    (df["no_overdues_5d"]==1) &
    (df["no_overdues_5d_30d"]==1) &
    (df["no_overdues_30d_60d"]==1) &
    (df["no_overdues_60d_90d"]==1) &
    (df["no_overdues_90d"]==1)
    ).astype(int)
    return df

In [50]:
def max_overdue_level(df):
    df = df.copy()
    df["max_overdue_level"] = (
        df[["overdues_5d","overdues_5d_30d","overdues_30d_60d",
            "overdues_60d_90d","overdues_90d"]] > 0
    ).idxmax(axis=1)
    return df

In [51]:
df = has_clean_history(df)
df = max_overdue_level(df)

In [52]:
X_test = has_clean_history(X_test)
X_test = max_overdue_level(X_test)

### credit_type

`credit_type` - Тип кредита***

### credit_currency

`credit_currency` - Валюта кредита**

### другие фичи

In [None]:
def _fit_rate_map(s_values: np.ndarray, y_values: np.ndarray, prior: float, smoothing: float = 20.0):
    """
    Строит сглаженную карту риска: value -> P(default|value).
    - s_values: массив значений признака на train-части fold'а
    - y_values: соответствующий массив 0/1
    - prior: глобальная доля дефолтов (сглаживающий приор)
    - smoothing: сила сглаживания (чем больше, тем ближе к prior при редких значениях)
    Возвращает словарь {значение: риск}
    """
    tmp = pd.DataFrame({'val': s_values, 'y': y_values})
    grp = tmp.groupby('val')['y']
    mean = grp.mean()
    cnt  = grp.size()
    # сглаженный риск: (mean*count + prior*smoothing) / (count + smoothing)
    risk = (mean*cnt + prior*smoothing) / (cnt + smoothing)
    return risk.to_dict()

In [54]:
target = df['target'].astype(int).values

enc_cols = [c for c in df.columns if c.startswith('enc_paym_')]
enc_cols = sorted(enc_cols, key=lambda s: int(s.split('_')[-1]))  # гарантированно 0..24 по возрастанию

overdue_cols = [
    'overdues_5d','overdues_5d_30d','overdues_30d_60d','overdues_60d_90d','overdues_90d',
    'no_overdues_5d','no_overdues_5d_30d','no_overdues_30d_60d','no_overdues_60d_90d','no_overdues_90d'
]

starred_cols = [  # поля со звёздочкой из условия (разбиты по интервалам → «категории» в числах)
    'days_since_confirmed','maturity_plan','maturity_fact','credit_limit',
    'next_payment_sum','sum_left_to_pay','current_overdue_debt','max_overdue_debt','full_credit_cost'
]
cat_cols = ['credit_type','credit_currency']  # настоящие категории (перекодированные числами)
base_num = ['credit_number_for_user']         # числовая, но мы тоже ей присвоим риск-ранг

print(len(df), len(X_test), len(enc_cols))

1827404 456852 25


In [14]:
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
folds = list(skf.split(df, target))

In [56]:
def oof_target_rate_map(train_series: pd.Series, y: np.ndarray, folds, smoothing: float = 20.0):
    """
    Возвращает:
      - oof_risk: массив OOF-рисков для train_series (на валидации каждого фолда считаем по карте, обученной на его train-части)
      - full_map: карта риска по всей обучающей части для применения на тесте
    """
    prior = y.mean().item()
    oof_risk = np.zeros(len(train_series), dtype=np.float32)

    # Проход по фолдам: для каждого валидного куска используем карту, обученную на тренировочном
    for tr_idx, vl_idx in folds:
        s_tr = train_series.iloc[tr_idx].values
        y_tr = y[tr_idx]
        s_vl = train_series.iloc[vl_idx].values

        risk_map = _fit_rate_map(s_tr, y_tr, prior, smoothing)
        # OOF для валидации
        oof_risk[vl_idx] = np.array([risk_map.get(v, prior) for v in s_vl], dtype=np.float32)

    # Полная карта риска для теста
    risk_map_full = _fit_rate_map(train_series.values, y, prior, smoothing)
    return oof_risk, risk_map_full

def apply_rate_map(series: pd.Series, rate_map: dict, default_rate: float):
    """Применяем карту риска к Series (неизвестные значения -> default_rate)."""
    return series.map(rate_map).fillna(default_rate).astype(np.float32)


In [57]:
# ЯЧЕЙКА 3a — строим risk-последовательности (N x 25) для train и test
def build_risk_sequence(df_tr: pd.DataFrame, df_te: pd.DataFrame, enc_cols, y: np.ndarray, folds):
    """
    На каждый enc_paym_k строим OOF риск (train) и full-map (test).
    Склеиваем по столбцам -> матрица [N, 25] риска, согласованная с порядком enc_cols.
    """
    prior = y.mean().item()
    mats_tr = []
    mats_te = []

    for c in tqdm(enc_cols):
        oof, full_map = oof_target_rate_map(df_tr[c], y, folds, smoothing=20.0)
        mats_tr.append(oof.reshape(-1, 1))
        mats_te.append(apply_rate_map(df_te[c], full_map, prior).values.reshape(-1, 1))

    risk_tr = np.concatenate(mats_tr, axis=1).astype(np.float32)  # [N_train, 25]
    risk_te = np.concatenate(mats_te, axis=1).astype(np.float32)  # [N_test,  25]
    return risk_tr, risk_te

In [58]:
risk_seq_tr, risk_seq_te = build_risk_sequence(df, X_test, enc_cols, target, folds)
risk_seq_tr.shape, risk_seq_te.shape

100%|██████████| 25/25 [00:22<00:00,  1.12it/s]


((1827404, 25), (456852, 25))

In [59]:
# ЯЧЕЙКА 3b — агрегаты по risk-последовательности (без тяжёлых циклов)
def ema_last_axis(mat: np.ndarray, alpha: float = 0.85):
    """
    Применяет EMA слева направо по оси признаков:
    y_t = alpha*y_{t-1} + (1-alpha)*x_t
    Возвращает 1 столбец: последний EMA (сильнее учитывает свежие месяцы).
    """
    out = np.zeros(mat.shape[0], dtype=np.float32)
    for j in range(mat.shape[1]):
        if j == 0:
            out = mat[:, j]
        else:
            out = alpha * out + (1.0 - alpha) * mat[:, j]
    return out.astype(np.float32)

def build_payment_risk_aggs(risk_tr: np.ndarray, risk_te: np.ndarray):
    """
    Быстрые агрегаты из risk-последовательности (векторно):
    - среднее за все 25
    - среднее за последние 12, 6
    - EMA (последнее значение)
    - bad_count_12, bad_count_6 (где bad >= глобальной медианы по train-последовательности)
    """
    # глобальный порог "плохости" по train
    thr = np.median(risk_tr)

    def agg_side(mat: np.ndarray):
        feats = {}
        feats['risk_mean_25']      = mat.mean(axis=1).astype(np.float32)
        feats['risk_mean_last_12'] = mat[:, -12:].mean(axis=1).astype(np.float32)
        feats['risk_mean_last_6']  = mat[:, -6:].mean(axis=1).astype(np.float32)
        feats['risk_ema']          = ema_last_axis(mat, alpha=0.85)  # последний EMA

        bad = (mat >= thr)  # bool
        feats['bad_count_12'] = bad[:, -12:].sum(axis=1).astype(np.int16)
        feats['bad_count_6']  = bad[:, -6:].sum(axis=1).astype(np.int16)
        return pd.DataFrame(feats)

    tr_aggs = agg_side(risk_tr)
    te_aggs = agg_side(risk_te)
    return tr_aggs, te_aggs

In [60]:
tr_pay_aggs, te_pay_aggs = build_payment_risk_aggs(risk_seq_tr, risk_seq_te)
tr_pay_aggs.shape, te_pay_aggs.shape

((1827404, 6), (456852, 6))

In [61]:
# ЯЧЕЙКА 5 — риск-фичи для starred и base_num
def build_risk_features_block(df_tr: pd.DataFrame, df_te: pd.DataFrame, cols, y: np.ndarray, folds):
    prior = y.mean().item()
    out_tr = {}
    out_te = {}

    for c in cols:
        oof, mp = oof_target_rate_map(df_tr[c], y, folds, smoothing=30.0)
        out_tr[f'{c}_risk'] = oof.astype(np.float32)
        out_te[f'{c}_risk'] = apply_rate_map(df_te[c], mp, prior).values

    return pd.DataFrame(out_tr), pd.DataFrame(out_te)

In [62]:
tr_star, te_star = build_risk_features_block(df, X_test, starred_cols + base_num, target, folds)
tr_star.shape, te_star.shape

((1827404, 10), (456852, 10))

In [63]:
# ЯЧЕЙКА 5b — сглаженное target encoding для категорий (credit_type, credit_currency)
def build_smoothed_te(df_tr: pd.DataFrame, df_te: pd.DataFrame, cols, y: np.ndarray, folds, min_count=50, prior=None):
    if prior is None: prior = y.mean().item()
    out_tr = {}
    out_te = {}

    for c in cols:
        oof_vals = np.zeros(len(df_tr), dtype=np.float32)
        for tr_idx, vl_idx in folds:
            s_tr = df_tr[c].iloc[tr_idx].values
            y_tr = y[tr_idx]
            # считаем частоты и средние на train части
            tmp = pd.DataFrame({'v': s_tr, 'y': y_tr})
            grp = tmp.groupby('v')['y']
            mean = grp.mean()
            cnt  = grp.size()
            # сглаживание к prior при малых cnt
            smoothed = (mean*cnt + prior*min_count) / (cnt + min_count)
            mp = smoothed.to_dict()
            oof_vals[vl_idx] = np.array([mp.get(v, prior) for v in df_tr[c].iloc[vl_idx].values], dtype=np.float32)

        # full map для теста
        tmp_all = pd.DataFrame({'v': df_tr[c].values, 'y': y})
        grp_all = tmp_all.groupby('v')['y']
        mean_a = grp_all.mean()
        cnt_a  = grp_all.size()
        smoothed_a = (mean_a*cnt_a + prior*min_count) / (cnt_a + min_count)
        mp_all = smoothed_a.to_dict()

        out_tr[f'{c}_te'] = oof_vals
        out_te[f'{c}_te'] = df_te[c].map(mp_all).fillna(prior).astype(np.float32).values

    return pd.DataFrame(out_tr), pd.DataFrame(out_te)

In [64]:
def safe_div(a, b):
    return np.where(b!=0, a/b, 0.0).astype(np.float32)

In [65]:
tr_cat, te_cat = build_smoothed_te(df, X_test, cat_cols, target, folds, min_count=100)
tr_cat.shape, te_cat.shape

((1827404, 2), (456852, 2))

In [None]:
from tqdm import tqdm
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis

def ema_last_axis(mat, alpha=0.85):
    """Векторизованная версия EMA с прогресс-баром"""
    out = np.zeros(mat.shape[0], dtype=np.float32)
    for j in tqdm(range(mat.shape[1]), desc="EMA calculation", leave=False):
        out = alpha * out + (1.0 - alpha) * mat[:, j] if j > 0 else mat[:, 0]
    return out.astype(np.float32)

def poly_trend_slope_vectorized(mat):
    """Векторизованная версия расчета наклона тренда"""
    with tqdm(total=6, desc="Trend slope calculation", leave=False) as pbar:
        n = mat.shape[1]
        t = np.arange(n, dtype=np.float32)
        t_mean = t.mean()
        pbar.update(1)

        # Векторизованные вычисления
        x_mean = mat.mean(axis=1, keepdims=True)
        pbar.update(1)

        t_diff = t - t_mean
        x_diff = mat - x_mean
        pbar.update(1)

        cov = (t_diff * x_diff).mean(axis=1)
        pbar.update(1)

        t_std = t_diff.std()
        x_std = x_diff.std(axis=1)
        pbar.update(1)

        denom = t_std * x_std + 1e-9
        result = (cov / denom).astype(np.float32)
        pbar.update(1)

    return result

def hist_feats_vectorized(mat, bins=10, desc="Histogram features"):
    """Векторизованная версия гистограммных фич с прогресс-баром"""
    with tqdm(total=4, desc=desc, leave=False) as pbar:
        h = np.apply_along_axis(lambda x: np.histogram(x, bins=bins, range=(0.0, 1.0))[0], 1, mat)
        pbar.update(1)

        h = h.astype(np.float32)
        pbar.update(1)

        h_sum = h.sum(axis=1, keepdims=True)
        pbar.update(1)

        dens = np.divide(h, h_sum, out=np.zeros_like(h), where=h_sum != 0)
        result = np.concatenate([h, dens], axis=1).astype(np.float32)
        pbar.update(1)

    return result

def safe_div(a, b):
    return np.divide(a, b, out=np.zeros_like(a), where=b != 0)

def build_payment_aggs(risk_tr, risk_te):
    """Оптимизированная версия с векторизацией и полным отслеживанием"""

    def side(mat, dataset_name):
        F = {}

        with tqdm(total=20, desc=f"Processing {dataset_name} features") as pbar:
            # Предварительные вычисления для оптимизации
            mat_mean = mat.mean(axis=1)
            mat_std = mat.std(axis=1)
            mat_min = mat.min(axis=1)
            mat_max = mat.max(axis=1)
            pbar.update(4)

            # Глобальные окна - векторизованные вычисления
            F['mean_25'] = mat_mean.astype(np.float32)
            F['std_25'] = mat_std.astype(np.float32)
            F['min_25'] = mat_min.astype(np.float32)
            F['max_25'] = mat_max.astype(np.float32)
            pbar.update(4)

            # Квантили вычисляем за один проход
            with tqdm(total=1, desc="Quantiles calculation", leave=False) as quant_pbar:
                quantiles = np.quantile(mat, [0.10, 0.50, 0.90], axis=1)
                F['q10_25'] = quantiles[0].astype(np.float32)
                F['q50_25'] = quantiles[1].astype(np.float32)
                F['q90_25'] = quantiles[2].astype(np.float32)
                quant_pbar.update(1)
            pbar.update(1)

            # Асимметрия и эксцесс - векторизованные версии
            with tqdm(total=2, desc="Skewness/Kurtosis", leave=False) as skew_pbar:
                F['skew_25'] = skew(mat, axis=1).astype(np.float32)
                skew_pbar.update(1)
                F['kurt_25'] = kurtosis(mat, axis=1).astype(np.float32)
                skew_pbar.update(1)
            pbar.update(1)

            # EMA и slope
            F['ema_25'] = ema_last_axis(mat, alpha=0.85)
            pbar.update(1)

            F['slope_25'] = poly_trend_slope_vectorized(mat)
            pbar.update(1)

            # Последние окна
            window_sizes = [12, 6, 3]
            for W in tqdm(window_sizes, desc="Window processing", leave=False):
                chunk = mat[:, -W:]

                chunk_mean = chunk.mean(axis=1)
                chunk_std = chunk.std(axis=1)
                chunk_min = chunk.min(axis=1)
                chunk_max = chunk.max(axis=1)

                chunk_quantiles = np.quantile(chunk, [0.10, 0.50, 0.90], axis=1)

                F[f'mean_{W}'] = chunk_mean.astype(np.float32)
                F[f'std_{W}'] = chunk_std.astype(np.float32)
                F[f'min_{W}'] = chunk_min.astype(np.float32)
                F[f'max_{W}'] = chunk_max.astype(np.float32)
                F[f'q10_{W}'] = chunk_quantiles[0].astype(np.float32)
                F[f'q50_{W}'] = chunk_quantiles[1].astype(np.float32)
                F[f'q90_{W}'] = chunk_quantiles[2].astype(np.float32)
                F[f'ema_{W}'] = ema_last_axis(chunk, alpha=0.85)
                F[f'slope_{W}'] = poly_trend_slope_vectorized(chunk)
            pbar.update(3)

            # Гистограммы
            # H25 = hist_feats_vectorized(mat, bins=10, desc="H25 histograms")
            # pbar.update(1)

            # H12 = hist_feats_vectorized(mat[:, -12:], bins=10, desc="H12 histograms")
            # pbar.update(1)

            # Создаем DataFrame
            with tqdm(total=3, desc="Creating DataFrames", leave=False) as df_pbar:
                F_df = pd.DataFrame(F).astype(np.float32)
                df_pbar.update(1)

                # H25_df = pd.DataFrame(H25, columns=[f'h25_{i}' for i in range(H25.shape[1])])
                # df_pbar.update(1)

                # H12_df = pd.DataFrame(H12, columns=[f'h12_{i}' for i in range(H12.shape[1])])
                # df_pbar.update(1)

            result = pd.concat([F_df], axis=1)
            pbar.update(1)

        return result

    # Обработка train и test с прогресс-баром
    with tqdm(total=2, desc="Overall processing") as main_pbar:
        tr = side(risk_tr, "train")
        main_pbar.update(1)
        main_pbar.set_description("Train processing completed")

        te = side(risk_te, "test")
        main_pbar.update(1)
        main_pbar.set_description("Test processing completed")

    return tr, te

In [68]:
tr_pay_aggs, te_pay_aggs = build_payment_aggs(risk_seq_tr, risk_seq_te)
tr_pay_aggs.shape, te_pay_aggs.shape

Overall processing:   0%|          | 0/2 [00:00<?, ?it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A
[A

[A[A

[A[A

[A[A

[A[A

[A[A
[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Processing train features:  90%|█████████ | 18/20 [06:34<00:43, 21.94s/it]
Train processing completed:  50%|█████     | 1/2 [06:34<06:34, 394.97s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A

[A[A

[A[A

[A[A

[A[A
[A

[A[A

[A[A

[A[A

[A[A
[A

[A[A

[A[A

[A[A

[A[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Processing test features:  90%|█████████ | 18/20 [01:29<00:09,  4.96s/it]
Test processing completed: 100%|██████████| 2/2 [08:04<00:00, 242.15s/it] 


((1827404, 78), (456852, 78))

Кросс

In [69]:
engineered_from_df = [
    'maturity_ratio','weighted_status_score','longest_run_bad_status','deterioration_count',
    'next_payment_sum_ratio','sum_left_to_pay_progress','max_overdue_debt_ratio','full_credit_cost_ef_rate',
    'total_overdues','overdue_severity_score','overdue_ratio','has_clean_history','max_overdues_level',
    'credit_type','credit_currency'
]

In [70]:
# Подхватим только реально существующие столбцы
present_in_train = [c for c in engineered_from_df if c in df.columns]
present_in_test  = [c for c in engineered_from_df if c in X_test.columns]
assert set(present_in_train)==set(present_in_test)

df_num = df[present_in_train].copy()
te_num = X_test[present_in_test].copy()

# Превратим bool/binary в float32, NaN/inf -> 0
df_num = df_num.replace([np.inf, -np.inf], np.nan).fillna(0).astype(np.float32)
te_num = te_num.replace([np.inf, -np.inf], np.nan).fillna(0).astype(np.float32)

# Выберем базовый набор для кроссов (только числовые, исключим сырьё категорий)
cross_base = [c for c in df_num.columns if c not in ['credit_type','credit_currency']]

# Ограничим число колонок для кроссов (чтобы память не улетела)
# Возьмём топ-15 по корреляции с OOF-риском одной из сильных фич (например, mean_25)
tmp_corr = pd.DataFrame({'col': cross_base})
tmp_corr['corr'] = [np.corrcoef(df_num[c].values, tr_pay_aggs['mean_25'].values)[0,1] if df_num[c].std()>0 else 0 for c in cross_base]
tmp_corr['corr'] = tmp_corr['corr'].fillna(0).abs()
top_cross = tmp_corr.sort_values('corr', ascending=False)['col'].head(15).tolist()

def make_crosses(A: pd.DataFrame, cols: list) -> pd.DataFrame:
    """
    Сгенерируем пары признаков (i<j) и вычислим:
    - prod, diff, ratio, min, max
    Это даёт ~ 5 * C(len(cols),2) фич.
    """
    feats = {}
    for i in tqdm(range(len(cols))):
        for j in range(i+1, len(cols)):
            a, b = cols[i], cols[j]
            a_v, b_v = A[a].values, A[b].values
            feats[f'{a}__x__{b}'] = (a_v * b_v).astype(np.float32)
            feats[f'{a}__m__{b}'] = (a_v - b_v).astype(np.float32)
            feats[f'{a}__d__{b}'] = safe_div(a_v, b_v)
            feats[f'{a}__mn__{b}'] = np.minimum(a_v, b_v).astype(np.float32)
            feats[f'{a}__mx__{b}'] = np.maximum(a_v, b_v).astype(np.float32)
    return pd.DataFrame(feats)

tr_cross = make_crosses(df_num, top_cross)
te_cross = make_crosses(te_num, top_cross)

# Контроль памяти
print("cross feats:", tr_cross.shape[1])

# Финальные блоки фич — пока без объединения
for blk in [tr_pay_aggs, tr_star, tr_cat, df_num, tr_cross]:
    blk.index = df.index
for blk in [te_pay_aggs, te_star, te_cat, te_num, te_cross]:
    blk.index = X_test.index

100%|██████████| 12/12 [00:01<00:00,  6.00it/s]
100%|██████████| 12/12 [00:00<00:00, 22.83it/s]


cross feats: 330


In [78]:
# tr_pay_aggs.to_csv(f'features/tr_pay_aggs.csv')
# tr_star.to_csv(f'features/tr_star.csv') 
# tr_cat.to_csv(f'features/tr_cat.csv') 
# df.reset_index()[use_cols_num].to_csv(f'features/df_reset_index_use_cols_num.csv') 
# df.reset_index()[use_cols_cat_raw].to_csv(f'features/f_reset_index_use_cols_cat_raw.csv') 
# df_num.to_csv(f'features/df_num.csv') 
# tr_cross.to_csv(f'features/tr_cross.csv')

In [None]:
# te_pay_aggs.to_csv(f'features1/te_pay_aggs.csv')
# te_star.to_csv(f'features1/te_star.csv') 
# te_cat.to_csv(f'features1/te_cat.csv') 
# X_test[use_cols_num].to_csv(f'features1/X_test_use_cols_num.csv') 
# X_test[use_cols_cat_raw].to_csv(f'features1/X_test_use_cols_cat_raw.csv') 
# te_num.to_csv(f'features1/te_num.csv') 
# te_cross.to_csv(f'features1/te_cross.csv') 

In [2]:
tr_pay_aggs = pd.read_csv(f'features/tr_pay_aggs.csv')
tr_star = pd.read_csv(f'features/tr_star.csv') 
tr_cat= pd.read_csv(f'features/tr_cat.csv') 
df_reser_use_cols_nums = pd.read_csv(f'features/df_reset_index_use_cols_num.csv') 
f_reset_index_use_cols_cat_raw = pd.read_csv(f'features/f_reset_index_use_cols_cat_raw.csv') 
df_num = pd.read_csv(f'features/df_num.csv') 
tr_cross = pd.read_csv(f'features/tr_cross.csv')

In [5]:
te_pay_aggs= pd.read_csv(f'features1/te_pay_aggs.csv')
te_star= pd.read_csv(f'features1/te_star.csv') 
te_cat= pd.read_csv(f'features1/te_cat.csv') 
X_test_use_cols_num= pd.read_csv(f'features1/X_test_use_cols_num.csv') 
X_test_use_cols_cat_raw= pd.read_csv(f'features1/X_test_use_cols_cat_raw.csv') 
te_num= pd.read_csv(f'features1/te_num.csv') 
te_cross= pd.read_csv(f'features1/te_cross.csv') 

In [76]:
# Базовая сборка
use_cols_num = [
    'maturity_ratio',
    'weighted_status_score',
    'longest_run_bad_status',
    'deterioration_count',
    'next_payment_sum_ratio',
    'sum_left_to_pay_progress',
    'max_overdue_debt_ratio',
    'full_credit_cost_ef_rate',
    'total_overdues',
    'overdue_severity_score',
    'overdue_ratio',
    'has_clean_history',        # как числовая псевдо-ординальная
    # опционально:
    # 'credit_number_for_user',
]

use_cols_cat_raw = ['credit_type', 'credit_currency']  # пометим как категории в CatBoost


In [None]:
X_tr = pd.concat(
    [tr_pay_aggs, tr_star, tr_cat, df_reser_use_cols_nums, f_reset_index_use_cols_cat_raw, df_num, tr_cross],
    axis=1
)

In [10]:
X_te.to_csv('features1/X_te.csv')

In [9]:
X_te = pd.concat(
    [te_pay_aggs, te_star, te_cat, X_test_use_cols_num, X_test_use_cols_cat_raw, te_num, te_cross],
    axis=1
)

In [6]:
target = df['target'].astype(int).values

In [8]:
df['target'].to_csv('features/target.csv')

In [6]:
X_tr.to_csv('features/X_tr_light.csv')

In [3]:
top_features_clean = ['days_since_confirmed_risk',
 'credit_limit_risk',
 'full_credit_cost_risk',
 'sum_left_to_pay_risk',
 'maturity_fact_risk',
 'credit_type_te',
 'maturity_plan_risk',
 'credit_type',
 'max_25',
 'std_25',
 'max_overdue_debt_ratio__d__sum_left_to_pay_progress',
 'next_payment_sum_risk',
 'kurt_25',
 'weighted_status_score__m__maturity_ratio',
 'max_overdue_debt_ratio__mn__sum_left_to_pay_progress',
 'total_overdues__mn__full_credit_cost_ef_rate',
 'sum_left_to_pay_progress__mx__next_payment_sum_ratio',
 'weighted_status_score__m__deterioration_count',
 'maturity_ratio__x__next_payment_sum_ratio',
 'overdue_severity_score__x__overdue_ratio',
 'weighted_status_score__mx__longest_run_bad_status',
 'overdue_ratio__d__full_credit_cost_ef_rate',
 'weighted_status_score__m__sum_left_to_pay_progress',
 'full_credit_cost_ef_rate__m__max_overdue_debt_ratio',
 'min_12',
 'maturity_ratio__d__full_credit_cost_ef_rate',
 'deterioration_count__d__maturity_ratio',
 'max_overdue_debt_ratio__x__sum_left_to_pay_progress',
 'total_overdues__x__sum_left_to_pay_progress',
 'overdue_ratio__x__has_clean_history',
 'sum_left_to_pay_progress__m__next_payment_sum_ratio',
 'credit_number_for_user_risk',
 'total_overdues__mn__overdue_ratio',
 'overdue_ratio__mn__full_credit_cost_ef_rate',
 'deterioration_count__mn__max_overdue_debt_ratio',
 'credit_currency_te',
 'overdue_ratio__d__has_clean_history',
 'overdue_severity_score__d__sum_left_to_pay_progress',
 'max_12',
 'deterioration_count__mx__overdue_ratio',
 'overdue_ratio__mx__sum_left_to_pay_progress',
 'longest_run_bad_status__mx__overdue_ratio',
 'overdue_ratio__mn__max_overdue_debt_ratio',
 'weighted_status_score__mn__total_overdues',
 'deterioration_count__mn__maturity_ratio',
 'has_clean_history__d__sum_left_to_pay_progress',
 'full_credit_cost_ef_rate__m__sum_left_to_pay_progress',
 'maturity_ratio__d__next_payment_sum_ratio',
 'total_overdues__x__max_overdue_debt_ratio',
 'has_clean_history__x__next_payment_sum_ratio',
 'maturity_ratio__mx__max_overdue_debt_ratio',
 'maturity_ratio__x__max_overdue_debt_ratio',
 'deterioration_count__mn__sum_left_to_pay_progress',
 'sum_left_to_pay_progress',
 'total_overdues__d__next_payment_sum_ratio',
 'q50_6',
 'overdue_severity_score__mn__next_payment_sum_ratio',
 'overdue_ratio__d__next_payment_sum_ratio',
 'weighted_status_score__d__maturity_ratio',
 'maturity_ratio__mx__sum_left_to_pay_progress',
 'overdue_severity_score__x__sum_left_to_pay_progress',
 'sum_left_to_pay_progress__x__next_payment_sum_ratio',
 'overdue_ratio__mx__max_overdue_debt_ratio',
 'max_6',
 'maturity_ratio__mn__max_overdue_debt_ratio',
 'overdue_severity_score__m__maturity_ratio',
 'std_3',
 'maturity_ratio__mn__next_payment_sum_ratio',
 'total_overdues__m__maturity_ratio',
 'maturity_ratio__m__max_overdue_debt_ratio',
 'has_clean_history__mx__full_credit_cost_ef_rate',
 'overdue_severity_score__mn__overdue_ratio',
 'total_overdues__d__sum_left_to_pay_progress',
 'weighted_status_score__mx__deterioration_count',
 'maturity_ratio',
 'total_overdues__d__full_credit_cost_ef_rate',
 'weighted_status_score__mn__max_overdue_debt_ratio',
 'overdue_severity_score__d__max_overdue_debt_ratio',
 'ema_3',
 'total_overdues__mn__has_clean_history',
 'weighted_status_score__x__overdue_severity_score',
 'longest_run_bad_status__mx__sum_left_to_pay_progress',
 'overdue_severity_score__d__overdue_ratio',
 'maturity_ratio__d__sum_left_to_pay_progress',
 'maturity_ratio__d__max_overdue_debt_ratio',
 'q90_25',
 'q50_3',
 'slope_6',
 'maturity_ratio__mx__next_payment_sum_ratio',
 'q50_12',
 'has_clean_history__mx__next_payment_sum_ratio',
 'overdue_ratio__mn__sum_left_to_pay_progress',
 'max_overdue_debt_ratio__mx__next_payment_sum_ratio',
 'total_overdues__mn__next_payment_sum_ratio',
 'maturity_ratio__x__sum_left_to_pay_progress',
 'maturity_ratio__m__full_credit_cost_ef_rate',
 'has_clean_history__x__full_credit_cost_ef_rate',
 'has_clean_history__m__sum_left_to_pay_progress',
 'weighted_status_score__x__maturity_ratio',
 'longest_run_bad_status__mx__maturity_ratio',
 'overdue_ratio__x__maturity_ratio',
 'weighted_status_score__mx__full_credit_cost_ef_rate',
 'overdue_severity_score__d__next_payment_sum_ratio',
 'max_overdue_debt_risk',
 'deterioration_count__mx__full_credit_cost_ef_rate',
 'total_overdues__x__maturity_ratio',
 'overdue_ratio__m__sum_left_to_pay_progress',
 'weighted_status_score__mx__next_payment_sum_ratio',
 'weighted_status_score__mx__overdue_ratio',
 'has_clean_history__m__next_payment_sum_ratio',
 'weighted_status_score__mn__overdue_ratio',
 'overdue_ratio__d__maturity_ratio',
 'weighted_status_score__mx__max_overdue_debt_ratio',
 'full_credit_cost_ef_rate__d__sum_left_to_pay_progress',
 'total_overdues__x__overdue_ratio',
 'overdue_severity_score__x__max_overdue_debt_ratio',
 'weighted_status_score__m__max_overdue_debt_ratio',
 'full_credit_cost_ef_rate__d__next_payment_sum_ratio',
 'total_overdues__d__maturity_ratio',
 'total_overdues__m__overdue_ratio',
 'weighted_status_score__x__overdue_ratio',
 'longest_run_bad_status__mx__full_credit_cost_ef_rate',
 'full_credit_cost_ef_rate__x__next_payment_sum_ratio',
 'weighted_status_score__d__deterioration_count',
 'overdue_ratio__mn__next_payment_sum_ratio',
 'slope_25',
 'overdue_ratio__mx__has_clean_history',
 'overdue_severity_score__m__next_payment_sum_ratio',
 'longest_run_bad_status__m__overdue_ratio',
 'sum_left_to_pay_progress__mn__next_payment_sum_ratio',
 'maturity_ratio__m__sum_left_to_pay_progress',
 'overdue_ratio__mx__full_credit_cost_ef_rate',
 'deterioration_count__x__full_credit_cost_ef_rate',
 'total_overdues__mn__max_overdue_debt_ratio',
 'longest_run_bad_status__mx__next_payment_sum_ratio',
 'min_3',
 'weighted_status_score__x__deterioration_count',
 'total_overdues__x__full_credit_cost_ef_rate',
 'total_overdues__m__full_credit_cost_ef_rate',
 'deterioration_count__x__sum_left_to_pay_progress',
 'min_25',
 'maturity_ratio__mn__full_credit_cost_ef_rate',
 'weighted_status_score__d__has_clean_history',
 'weighted_status_score__x__longest_run_bad_status',
 'ema_12',
 'slope_12',
 'q90_6',
 'deterioration_count__x__overdue_severity_score',
 'deterioration_count__mx__maturity_ratio',
 'has_clean_history__m__full_credit_cost_ef_rate',
 'max_overdue_debt_ratio__mx__sum_left_to_pay_progress',
 'max_overdue_debt_ratio__d__next_payment_sum_ratio',
 'overdue_severity_score__mn__max_overdue_debt_ratio',
 'weighted_status_score__m__total_overdues',
 'longest_run_bad_status__m__next_payment_sum_ratio',
 'deterioration_count__x__max_overdue_debt_ratio',
 'overdue_ratio__m__max_overdue_debt_ratio',
 'weighted_status_score__mn__has_clean_history',
 'full_credit_cost_ef_rate__mx__sum_left_to_pay_progress',
 'max_3',
 'overdue_ratio__m__has_clean_history',
 'longest_run_bad_status__mx__max_overdue_debt_ratio',
 'std_12',
 'full_credit_cost_ef_rate__m__next_payment_sum_ratio',
 'has_clean_history__x__sum_left_to_pay_progress',
 'deterioration_count__m__has_clean_history',
 'weighted_status_score__m__longest_run_bad_status',
 'weighted_status_score__d__overdue_severity_score',
 'full_credit_cost_ef_rate__x__sum_left_to_pay_progress',
 'total_overdues__mn__sum_left_to_pay_progress',
 'deterioration_count__x__overdue_ratio',
 'q90_3',
 'q10_12',
 'overdue_ratio__x__next_payment_sum_ratio',
 'has_clean_history__m__max_overdue_debt_ratio',
 'overdue_ratio__d__max_overdue_debt_ratio',
 'full_credit_cost_ef_rate__d__max_overdue_debt_ratio',
 'total_overdues__x__next_payment_sum_ratio',
 'longest_run_bad_status__mn__overdue_ratio',
 'weighted_status_score__d__max_overdue_debt_ratio',
 'overdue_severity_score__mn__maturity_ratio',
 'longest_run_bad_status__m__maturity_ratio',
 'overdue_severity_score__m__sum_left_to_pay_progress',
 'weighted_status_score__mn__overdue_severity_score',
 'has_clean_history__mn__full_credit_cost_ef_rate',
 'max_overdue_debt_ratio__m__next_payment_sum_ratio',
 'deterioration_count__x__next_payment_sum_ratio',
 'overdue_ratio',
 'full_credit_cost_ef_rate__mn__sum_left_to_pay_progress',
 'weighted_status_score__mn__sum_left_to_pay_progress',
 'maturity_ratio__m__next_payment_sum_ratio',
 'overdue_severity_score__mn__full_credit_cost_ef_rate',
 'full_credit_cost_ef_rate__mx__next_payment_sum_ratio',
 'weighted_status_score__mn__maturity_ratio',
 'weighted_status_score__x__total_overdues',
 'overdue_ratio__mx__next_payment_sum_ratio',
 'overdue_severity_score__m__overdue_ratio',
 'has_clean_history__mn__sum_left_to_pay_progress',
 'weighted_status_score__m__overdue_ratio',
 'maturity_ratio__mx__has_clean_history',
 'full_credit_cost_ef_rate__mx__max_overdue_debt_ratio',
 'weighted_status_score__d__next_payment_sum_ratio',
 'deterioration_count__m__max_overdue_debt_ratio',
 'std_6',
 'weighted_status_score__d__overdue_ratio',
 'weighted_status_score',
 'deterioration_count__d__longest_run_bad_status',
 'has_clean_history__mn__next_payment_sum_ratio',
 'weighted_status_score__mn__full_credit_cost_ef_rate',
 'overdue_severity_score__x__maturity_ratio',
 'total_overdues__m__sum_left_to_pay_progress',
 'deterioration_count__m__maturity_ratio',
 'overdue_severity_score__x__next_payment_sum_ratio',
 'longest_run_bad_status__mx__has_clean_history',
 'has_clean_history__d__max_overdue_debt_ratio',
 'longest_run_bad_status__m__sum_left_to_pay_progress',
 'deterioration_count__m__longest_run_bad_status',
 'maturity_ratio__mn__has_clean_history',
 'q10_25',
 'longest_run_bad_status__mn__overdue_severity_score',
 'overdue_ratio__mn__maturity_ratio',
 'weighted_status_score__x__has_clean_history',
 'overdue_severity_score__mx__has_clean_history',
 'total_overdues__d__overdue_ratio',
 'q50_25',
 'longest_run_bad_status__mn__full_credit_cost_ef_rate',
 'overdue_ratio__x__max_overdue_debt_ratio',
 'has_clean_history__d__next_payment_sum_ratio',
 'has_clean_history__x__max_overdue_debt_ratio',
 'mean_3',
 'weighted_status_score__m__full_credit_cost_ef_rate',
 'weighted_status_score__m__next_payment_sum_ratio',
 'has_clean_history',
 'deterioration_count__mx__has_clean_history',
 'has_clean_history__mx__max_overdue_debt_ratio',
 'deterioration_count__d__next_payment_sum_ratio',
 'overdue_severity_score__d__full_credit_cost_ef_rate',
 'maturity_ratio__m__has_clean_history',
 'longest_run_bad_status__x__max_overdue_debt_ratio',
 'overdue_ratio__mn__has_clean_history',
 'total_overdues__m__has_clean_history',
 'weighted_status_score__x__next_payment_sum_ratio',
 'deterioration_count__d__full_credit_cost_ef_rate',
 'slope_3',
 'longest_run_bad_status__m__full_credit_cost_ef_rate',
 'total_overdues__d__has_clean_history',
 'deterioration_count__mx__next_payment_sum_ratio',
 'deterioration_count__x__maturity_ratio',
 'deterioration_count__d__sum_left_to_pay_progress',
 'ema_25',
 'max_overdue_debt_ratio__x__next_payment_sum_ratio',
 'overdue_ratio__d__sum_left_to_pay_progress',
 'overdue_ratio__m__maturity_ratio',
 'min_6',
 'overdue_severity_score__m__full_credit_cost_ef_rate',
 'overdue_ratio__mx__maturity_ratio',
 'full_credit_cost_ef_rate__x__max_overdue_debt_ratio',
 'weighted_status_score__mx__has_clean_history',
 'deterioration_count__m__next_payment_sum_ratio',
 'has_clean_history__d__full_credit_cost_ef_rate',
 'maturity_ratio__d__has_clean_history',
 'deterioration_count__d__overdue_ratio',
 'overdue_severity_score__mn__sum_left_to_pay_progress',
 'overdue_ratio__x__full_credit_cost_ef_rate',
 'skew_25',
 'max_overdue_debt_ratio',
 'weighted_status_score__d__sum_left_to_pay_progress',
 'weighted_status_score__m__overdue_severity_score',
 'total_overdues__m__max_overdue_debt_ratio',
 'total_overdues__mx__full_credit_cost_ef_rate',
 'weighted_status_score__m__has_clean_history',
 'has_clean_history__mx__sum_left_to_pay_progress',
 'overdue_severity_score__mx__sum_left_to_pay_progress',
 'mean_25',
 'longest_run_bad_status__d__maturity_ratio',
 'weighted_status_score__mn__longest_run_bad_status',
 'overdue_severity_score__d__maturity_ratio',
 'full_credit_cost_ef_rate',
 'weighted_status_score__d__longest_run_bad_status',
 'overdue_ratio__x__sum_left_to_pay_progress',
 'max_overdue_debt_ratio__mn__next_payment_sum_ratio',
 'weighted_status_score__d__total_overdues',
 'mean_6',
 'overdue_severity_score__x__full_credit_cost_ef_rate',
 'longest_run_bad_status',
 'sum_left_to_pay_progress__d__next_payment_sum_ratio',
 'deterioration_count__d__overdue_severity_score',
 'maturity_ratio__x__full_credit_cost_ef_rate',
 'deterioration_count__d__total_overdues',
 'ema_6',
 'full_credit_cost_ef_rate__mn__next_payment_sum_ratio',
 'weighted_status_score__x__full_credit_cost_ef_rate',
 'overdue_severity_score__x__has_clean_history',
 'deterioration_count__m__full_credit_cost_ef_rate',
 'deterioration_count__mx__overdue_severity_score',
 'longest_run_bad_status__d__overdue_ratio',
 'has_clean_history__mn__max_overdue_debt_ratio',
 'deterioration_count__m__overdue_ratio',
 'maturity_ratio__mx__full_credit_cost_ef_rate',
 'full_credit_cost_ef_rate__mn__max_overdue_debt_ratio',
 'deterioration_count__x__longest_run_bad_status',
 'total_overdues__mx__sum_left_to_pay_progress',
 'weighted_status_score__mx__maturity_ratio',
 'deterioration_count__mn__has_clean_history',
 'total_overdues__mn__maturity_ratio',
 'longest_run_bad_status__x__overdue_ratio',
 'weighted_status_score__mx__sum_left_to_pay_progress',
 'longest_run_bad_status__d__full_credit_cost_ef_rate',
 'longest_run_bad_status__mn__next_payment_sum_ratio',
 'maturity_ratio__x__has_clean_history',
 'deterioration_count__mn__next_payment_sum_ratio',
 'longest_run_bad_status__m__max_overdue_debt_ratio',
 'longest_run_bad_status__mx__total_overdues',
 'total_overdues',
 'q10_3',
 'weighted_status_score__mn__deterioration_count',
 'longest_run_bad_status__m__overdue_severity_score',
 'q90_12',
 'credit_currency',
 'max_overdue_debt_ratio__m__sum_left_to_pay_progress',
 'longest_run_bad_status__m__has_clean_history',
 'weighted_status_score__x__max_overdue_debt_ratio',
 'total_overdues__mx__has_clean_history',
 'mean_12',
 'weighted_status_score__mn__next_payment_sum_ratio',
 'longest_run_bad_status__d__overdue_severity_score',
 'deterioration_count__m__overdue_severity_score',
 'longest_run_bad_status__x__full_credit_cost_ef_rate',
 'deterioration_count__d__max_overdue_debt_ratio',
 'weighted_status_score__x__sum_left_to_pay_progress',
 'longest_run_bad_status__x__maturity_ratio',
 'longest_run_bad_status__mn__total_overdues',
 'longest_run_bad_status__x__total_overdues',
 'overdue_severity_score__mx__overdue_ratio',
 'q10_6',
 'deterioration_count__m__sum_left_to_pay_progress',
 'total_overdues__x__overdue_severity_score',
 'longest_run_bad_status__mn__maturity_ratio',
 'overdue_ratio__m__next_payment_sum_ratio',
 'total_overdues__d__max_overdue_debt_ratio',
 'longest_run_bad_status__d__next_payment_sum_ratio',
 'longest_run_bad_status__m__total_overdues',
 'longest_run_bad_status__x__next_payment_sum_ratio',
 'deterioration_count__mn__overdue_severity_score',
 'longest_run_bad_status__d__max_overdue_debt_ratio',
 'deterioration_count__x__has_clean_history',
 'overdue_severity_score__m__max_overdue_debt_ratio',
 'total_overdues__m__overdue_severity_score',
 'total_overdues__mx__maturity_ratio',
 'total_overdues__mn__overdue_severity_score',
 'overdue_severity_score__mx__full_credit_cost_ef_rate',
 'total_overdues__mx__next_payment_sum_ratio',
 'longest_run_bad_status__x__overdue_severity_score',
 'deterioration_count__mn__overdue_ratio',
 'weighted_status_score__d__full_credit_cost_ef_rate',
 'total_overdues__d__overdue_severity_score',
 'overdue_ratio__m__full_credit_cost_ef_rate',
 'longest_run_bad_status__mx__overdue_severity_score',
 'longest_run_bad_status__x__sum_left_to_pay_progress',
 'total_overdues__m__next_payment_sum_ratio',
 'weighted_status_score__mx__total_overdues',
 'overdue_severity_score',
 'maturity_ratio__mn__sum_left_to_pay_progress',
 'deterioration_count__mx__total_overdues',
 'total_overdues__mx__overdue_severity_score',
 'longest_run_bad_status__mn__sum_left_to_pay_progress',
 'total_overdues__mx__max_overdue_debt_ratio',
 'longest_run_bad_status__d__sum_left_to_pay_progress',
 'deterioration_count',
 'overdue_severity_score__mx__maturity_ratio',
 'deterioration_count__d__has_clean_history',
 'h12_16',
 'h12_15',
 'h12_14',
 'h12_13',
 'h25_6',
 'h25_7',
 'h25_8']

In [4]:
X_tr = pd.read_csv('features/X_tr.csv')[top_features_clean].astype(np.float32)

In [None]:
X_tr.info()

In [5]:
X_te = pd.read_csv('features1/X_te.csv')[top_features_clean].astype(np.float32)

In [6]:
target = pd.read_csv('features/target.csv')['target'].astype(int).values

In [5]:
pd.read_csv('features/target.csv').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1827404 entries, 0 to 1827403
Data columns (total 2 columns):
 #   Column  Dtype
---  ------  -----
 0   id      int64
 1   target  bool 
dtypes: bool(1), int64(1)
memory usage: 15.7 MB


In [19]:
set(y_tr)

{0, 1}

In [31]:
set(y_tr)

{np.int64(0), np.int64(1)}

In [9]:
y_tr = pd.read_csv('features/target.csv')['target'].astype(int).values.astype(int)
X_tr.shape, X_te.shape

((1827404, 378), (456852, 378))

In [None]:
# Если где-то есть NaN/inf — сразу чистим:
X_tr = X_tr.replace([np.inf, -np.inf], np.nan).fillna(0)
X_te = X_te.replace([np.inf, -np.inf], np.nan).fillna(0)

In [7]:
del X_tr
import gc
gc.collect()

20

In [7]:
X_tr

Unnamed: 0,days_since_confirmed_risk,credit_limit_risk,full_credit_cost_risk,sum_left_to_pay_risk,maturity_fact_risk,credit_type_te,maturity_plan_risk,credit_type,max_25,std_25,...,deterioration_count,overdue_severity_score__mx__maturity_ratio,deterioration_count__d__has_clean_history,h12_16,h12_15,h12_14,h12_13,h25_6,h25_7,h25_8
0,0.036442,0.035327,0.034828,0.031592,0.031765,0.069495,0.030992,5.0,0.047831,0.005730,...,1.0,162.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.033437,0.031628,0.034834,0.031667,0.030732,0.031906,0.030417,4.0,0.037585,0.002770,...,1.0,162.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.033437,0.032114,0.035352,0.031667,0.038862,0.031906,0.035473,4.0,0.043827,0.003179,...,1.0,162.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.037797,0.032976,0.034167,0.031592,0.034909,0.031392,0.036833,3.0,0.046522,0.003494,...,1.0,162.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.028799,0.040180,0.034167,0.031592,0.029911,0.031392,0.025565,3.0,0.048816,0.004078,...,1.0,162.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1827399,0.028598,0.032039,0.029464,0.031667,0.029100,0.031354,0.036625,3.0,0.041704,0.002938,...,1.0,162.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1827400,0.028799,0.035327,0.037589,0.031592,0.059022,0.031867,0.036833,4.0,0.046522,0.003494,...,1.0,162.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1827401,0.028598,0.032114,0.024460,0.031667,0.031163,0.031906,0.030634,4.0,0.034486,0.002623,...,1.0,162.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1827402,0.029256,0.030438,0.030119,0.044242,0.034640,0.031518,0.036958,3.0,0.046808,0.006300,...,1.0,162.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Обучение первичное

In [None]:
features = [
    'credit_number_for_user',
    'maturity_fact',
    'maturity_ratio',
    'credit_limit',
    'weighted_status_score',
    'longest_run_bad_status',
    'deterioration_count',
    'next_payment_sum_ratio',
    'sum_left_to_pay_progress',
    'max_overdue_debt_ratio',
    'full_credit_cost_ef_rate',

    'total_overdues', 'overdue_severity_score', 'overdue_ratio', 'has_clean_history',
    'max_overdue_level',

    'credit_type',
    'credit_currency']
cat_features = ['credit_type', 'credit_currency', 'max_overdue_level']

In [None]:
X = df[features]
y = df['target']

x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)
x_test = X_test[features]

In [None]:
model = CatBoostClassifier(
    iterations=2000,
    depth=11,
    learning_rate=0.05,
    l2_leaf_reg=7,
    bagging_temperature=0.5,
    auto_class_weights='Balanced',
    eval_metric='AUC',  # Оптимизация по F1
    random_seed=42,
    verbose=100,
    early_stopping_rounds=100,
    cat_features=cat_features
)

# Обучение модели
model.fit(
    x_train, y_train, eval_set=(x_val, y_val),
    early_stopping_rounds=100
)

0:	test: 0.6046038	best: 0.6046038 (0)	total: 1.44s	remaining: 48m 1s
100:	test: 0.6170486	best: 0.6170486 (100)	total: 1m 56s	remaining: 36m 32s
200:	test: 0.6183562	best: 0.6184386 (194)	total: 3m 32s	remaining: 31m 39s
300:	test: 0.6181507	best: 0.6186464 (261)	total: 5m 17s	remaining: 29m 51s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6186464119
bestIteration = 261

Shrink model to first 262 iterations.


<catboost.core.CatBoostClassifier at 0x28175c47550>

In [13]:
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                            f1_score, roc_auc_score, confusion_matrix,
                            classification_report, precision_recall_curve,
                            average_precision_score)
import numpy as np

def evaluate_model(y_true, y_pred, y_pred_proba=None):
    """
    Выводит все основные метрики качества для бинарной классификации

    Parameters:
    y_true: истинные значения
    y_pred: предсказанные классы
    y_pred_proba: предсказанные вероятности (для ROC-AUC)
    """

    print("=" * 50)
    print("МЕТРИКИ КАЧЕСТВА МОДЕЛИ")
    print("=" * 50)

    # Основные метрики
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-score:  {f1:.4f}")

    # ROC-AUC если есть вероятности
    if y_pred_proba is not None:
        try:
            roc_auc = roc_auc_score(y_true, y_pred_proba)
            print(f"ROC-AUC:   {roc_auc:.4f}")

            # Average Precision Score
            avg_precision = average_precision_score(y_true, y_pred_proba)
            print(f"Avg Precision: {avg_precision:.4f}")
        except:
            print("ROC-AUC: Не удалось вычислить (проверьте y_pred_proba)")

    print("\n" + "-" * 30)
    print("MATRIXA SOWMESHENIY (CONFUSION MATRIX)")
    print("-" * 30)

    # Матрица ошибок
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print(f"True Negative (TN):  {tn}")
    print(f"False Positive (FP): {fp}")
    print(f"False Negative (FN): {fn}")
    print(f"True Positive (TP):  {tp}")
    print(f"\nМатрица в виде таблицы:")
    print(f"[[TN {tn}   FP {fp}]")
    print(f" [FN {fn}   TP {tp}]]")

    # Дополнительные метрики из матрицы ошибок
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0  # False Negative Rate
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0  # False Positive Rate

    print(f"\nSpecificity (TNR): {specificity:.4f}")
    print(f"False Positive Rate (FPR): {fpr:.4f}")
    print(f"False Negative Rate (FNR): {fnr:.4f}")

    print("\n" + "-" * 30)
    print("DETALNY OTCHET (CLASSIFICATION REPORT)")
    print("-" * 30)

    # Детальный отчет
    print(classification_report(y_true, y_pred, target_names=['Class 0', 'Class 1'], zero_division=0))

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
        'confusion_matrix': cm,
        'tn': tn, 'fp': fp, 'fn': fn, 'tp': tp
    }

In [126]:
set(X_test['current_overdue_debt'])

{0}

In [125]:
pr = 0.06
re = 0.587

(2 *pr*re)/(pr+re)

0.10887171561051003

In [None]:
base_vetrics = evaluate_model(y_val, model.predict(x_val), model.predict_proba(x_val)[:,1])

МЕТРИКИ КАЧЕСТВА МОДЕЛИ
Accuracy:  0.6806
Precision: 0.0502
Recall:    0.4787
F1-score:  0.0909
ROC-AUC:   0.6186
Avg Precision: 0.0574

------------------------------
MATRIXA SOWMESHENIY (CONFUSION MATRIX)
------------------------------
True Negative (TN):  303613
False Positive (FP): 137994
False Negative (FN): 7947
True Positive (TP):  7297

Матрица в виде таблицы:
[[TN 303613   FP 137994]
 [FN 7947   TP 7297]]

Specificity (TNR): 0.6875
False Positive Rate (FPR): 0.3125
False Negative Rate (FNR): 0.5213

------------------------------
DETALNY OTCHET (CLASSIFICATION REPORT)
------------------------------
              precision    recall  f1-score   support

     Class 0       0.97      0.69      0.81    441607
     Class 1       0.05      0.48      0.09     15244

    accuracy                           0.68    456851
   macro avg       0.51      0.58      0.45    456851
weighted avg       0.94      0.68      0.78    456851



In [None]:
feature_imp = pd.DataFrame({
        'feature': x_train.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', key=abs, ascending=False)
feature_imp

Unnamed: 0,feature,importance
4,weighted_status_score,11.235639
8,sum_left_to_pay_progress,9.909159
16,credit_type,8.989772
2,maturity_ratio,8.400929
7,next_payment_sum_ratio,7.90396
1,maturity_fact,7.837513
9,max_overdue_debt_ratio,7.707635
10,full_credit_cost_ef_rate,5.922321
3,credit_limit,5.717225
14,has_clean_history,5.597286


In [None]:
y_val_pred_proba = model.predict_proba(x_train)[:, 1]

# Сетка порогов
thresholds = np.linspace(0.01, 0.99, 199)

best_threshold = 0.5
best_f1 = 0

for t in thresholds:
    y_val_pred = (y_val_pred_proba >= t).astype(int)
    f1 = f1_score(y_train, y_val_pred)
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

print(f"Best F1: {best_f1:.4f} at threshold {best_threshold:.2f}")

Best F1: 0.1400 at threshold 0.63


In [None]:
y_val_pred_proba = model.predict_proba(x_val)[:, 1]

# Сетка порогов
thresholds = np.linspace(0.01, 0.99, 199)

best_threshold = 0.5
best_f1 = 0

for t in thresholds:
    y_val_pred = (y_val_pred_proba >= t).astype(int)
    f1 = f1_score(y_val, y_val_pred)
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

print(f"Best F1: {best_f1:.4f} at threshold {best_threshold:.2f}")

Best F1: 0.1052 at threshold 0.59


#### Итоговое обучение и сабмишен

In [None]:
x_tra, x_va, y_tra, y_va = train_test_split(X, y, test_size=0.001, random_state=42)

In [None]:
model_itog = CatBoostClassifier(
    iterations=500,
    depth=11,
    learning_rate=0.05,
    l2_leaf_reg=20,
    bagging_temperature=0.5,
    auto_class_weights='Balanced',
    eval_metric='AUC',  # Оптимизация по F1
    random_seed=42,
    verbose=100,
    cat_features=cat_features
)

# Обучение модели
model.fit(
    x_tra, y_tra, eval_set=(x_va, y_va)
)

0:	test: 0.6178258	best: 0.6178258 (0)	total: 1.75s	remaining: 58m 26s
100:	test: 0.6243863	best: 0.6276787 (70)	total: 2m 30s	remaining: 47m 5s
200:	test: 0.6362800	best: 0.6362800 (200)	total: 4m 34s	remaining: 40m 53s
300:	test: 0.6396113	best: 0.6402932 (295)	total: 6m 37s	remaining: 37m 23s
400:	test: 0.6462254	best: 0.6485340 (380)	total: 9m 21s	remaining: 37m 19s
500:	test: 0.6476671	best: 0.6506673 (465)	total: 12m 2s	remaining: 36m 2s
600:	test: 0.6579827	best: 0.6584015 (599)	total: 14m 45s	remaining: 34m 21s
700:	test: 0.6613920	best: 0.6614699 (698)	total: 17m 31s	remaining: 32m 28s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6614699006
bestIteration = 698

Shrink model to first 699 iterations.


<catboost.core.CatBoostClassifier at 0x28175c47550>

In [None]:
model_itog_max = CatBoostClassifier(
    iterations=00,
    depth=11,
    learning_rate=0.05,
    l2_leaf_reg=20,
    bagging_temperature=0.5,
    auto_class_weights='Balanced',
    eval_metric='AUC',  # Оптимизация по F1
    random_seed=42,
    verbose=100,
    cat_features=cat_features
)

# Обучение модели
model.fit(
    df[features], df['target']
)

In [None]:
base_vetrics = evaluate_model(y_va, model.predict(x_va), model.predict_proba(x_va)[:,1])

МЕТРИКИ КАЧЕСТВА МОДЕЛИ
Accuracy:  0.7194
Precision: 0.0617
Recall:    0.5517
F1-score:  0.1109
ROC-AUC:   0.6615
Avg Precision: 0.1048

------------------------------
MATRIXA SOWMESHENIY (CONFUSION MATRIX)
------------------------------
True Negative (TN):  1283
False Positive (FP): 487
False Negative (FN): 26
True Positive (TP):  32

Матрица в виде таблицы:
[[TN 1283   FP 487]
 [FN 26   TP 32]]

Specificity (TNR): 0.7249
False Positive Rate (FPR): 0.2751
False Negative Rate (FNR): 0.4483

------------------------------
DETALNY OTCHET (CLASSIFICATION REPORT)
------------------------------
              precision    recall  f1-score   support

     Class 0       0.98      0.72      0.83      1770
     Class 1       0.06      0.55      0.11        58

    accuracy                           0.72      1828
   macro avg       0.52      0.64      0.47      1828
weighted avg       0.95      0.72      0.81      1828



In [None]:
test_pred = model.predict(x_test)

In [None]:
y_val_pred_proba = model.predict_proba(x_va)[:, 1]

# Сетка порогов
thresholds = np.linspace(0.01, 0.99, 99)

best_threshold = 0.5
best_f1 = 0

for t in thresholds:
    y_val_pred = (y_val_pred_proba >= t).astype(int)
    f1 = f1_score(y_val_pred, y_va)
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

print(f"Best F1: {best_f1:.4f} at threshold {best_threshold:.2f}")

Best F1: 0.1159 at threshold 0.74


In [None]:
y_val_pred_proba = model.predict_proba(x_tra)[:, 1]

# Сетка порогов
thresholds = np.linspace(0.01, 0.99, 99)

best_threshold = 0.5
best_f1 = 0

for t in thresholds:
    y_val_pred = (y_val_pred_proba >= t).astype(int)
    f1 = f1_score(y_val_pred, y_tra)
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

print(f"Best F1: {best_f1:.4f} at threshold {best_threshold:.2f}")

Best F1: 0.1963 at threshold 0.65


In [None]:
proba = model.predict_proba(x_test)[:, 1]
test_pred = (proba >= 0.74).astype(bool)

In [None]:
test_pred

array([False, False, False, ..., False, False, False], shape=(456852,))

In [None]:
X_test['flag'] = test_pred

In [None]:
X_test[['id', 'flag']].set_index('id').to_csv('sub_mission.csv')

## Мощное обучение

### credit_type == ?

In [None]:
diction = {}
detph = {}

In [None]:
df.groupby('credit_type')['credit_number_for_user'].count() * 0.25

credit_type
0     15963.25
1      2328.25
2     10630.25
3    137111.75
4    258849.75
5     20398.75
6      1256.25
7     10312.75
Name: credit_number_for_user, dtype: float64

In [None]:
typee = 7
df_0 = df[df['credit_type'] == typee]

In [None]:
features = [
    'credit_number_for_user',
    'maturity_fact',
    'maturity_ratio',
    'credit_limit',
    'weighted_status_score',
    'longest_run_bad_status',
    'deterioration_count',
    'next_payment_sum_ratio',
    'sum_left_to_pay_progress',
    'max_overdue_debt_ratio',
    'full_credit_cost_ef_rate',
    'credit_currency']

X = df_0[features]
y = df_0['target']

x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)
x_test = X_test[features]

depth = 9
model = CatBoostClassifier(
    iterations=2000,
    depth=depth,
    learning_rate=0.07,
    l2_leaf_reg=5,
    bagging_temperature=0.5,
    auto_class_weights='Balanced',
    eval_metric='AUC',  # Оптимизация по F1
    random_seed=42,
    verbose=100,
    early_stopping_rounds=100,
    cat_features=['credit_currency']
)

# Обучение модели
model.fit(
    x_train, y_train, eval_set=(x_val, y_val),
    early_stopping_rounds=100
)

diction[typee] = evaluate_model(y_val, model.predict(x_val), model.predict_proba(x_val)[:,1])
detph[typee] = depth

feature_imp = pd.DataFrame({
        'feature': x_train.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', key=abs, ascending=False)
feature_imp

0:	test: 0.5586678	best: 0.5586678 (0)	total: 32.3ms	remaining: 1m 4s
100:	test: 0.5327881	best: 0.5685026 (6)	total: 2.92s	remaining: 54.9s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.5685026315
bestIteration = 6

Shrink model to first 7 iterations.
МЕТРИКИ КАЧЕСТВА МОДЕЛИ
Accuracy:  0.7027
Precision: 0.0446
Recall:    0.4098
F1-score:  0.0804
ROC-AUC:   0.5685
Avg Precision: 0.0423

------------------------------
MATRIXA SOWMESHENIY (CONFUSION MATRIX)
------------------------------
True Negative (TN):  7113
False Positive (FP): 2873
False Negative (FN): 193
True Positive (TP):  134

Матрица в виде таблицы:
[[TN 7113   FP 2873]
 [FN 193   TP 134]]

Specificity (TNR): 0.7123
False Positive Rate (FPR): 0.2877
False Negative Rate (FNR): 0.5902

------------------------------
DETALNY OTCHET (CLASSIFICATION REPORT)
------------------------------
              precision    recall  f1-score   support

     Class 0       0.97      0.71      0.82      9986
     Class 1

Unnamed: 0,feature,importance
5,longest_run_bad_status,17.528091
0,credit_number_for_user,16.898484
1,maturity_fact,14.885166
4,weighted_status_score,12.307858
3,credit_limit,8.092802
9,max_overdue_debt_ratio,8.014381
10,full_credit_cost_ef_rate,7.521686
8,sum_left_to_pay_progress,6.358995
2,maturity_ratio,3.284717
6,deterioration_count,2.743928


In [None]:
for k, v in diction.items():
    print(k, v['f1'])

0 0.06322861094645327
1 0.1016949152542373
2 0.06269020085209982
3 0.08201916690388082
4 0.07979600017816578
5 0.16505293638584742
6 0.13020833333333334
7 0.08038392321535692


### credit_type == 4

In [None]:
typee = 4
df_4 = df[df['credit_type'] == typee]

In [None]:
import os
import random
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import f1_score, roc_auc_score, precision_recall_curve

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# -------------------------
# 0) Reproducibility
# -------------------------
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("DEVICE:", DEVICE)

DEVICE: cpu


In [None]:
(df[features] == np.inf).sum()

credit_number_for_user           0
maturity_fact                    0
maturity_ratio              100255
credit_limit                     0
weighted_status_score            0
longest_run_bad_status           0
deterioration_count              0
next_payment_sum_ratio      104816
sum_left_to_pay_progress         0
max_overdue_debt_ratio      108960
full_credit_cost_ef_rate     94592
dtype: int64

In [None]:

features = [
    'credit_number_for_user',
    'maturity_fact',
    'maturity_ratio',
    'credit_limit',
    'weighted_status_score',
    'longest_run_bad_status',
    'deterioration_count',
    'next_payment_sum_ratio',
    'sum_left_to_pay_progress',
    'max_overdue_debt_ratio',
    'full_credit_cost_ef_rate',
]

# проверим, что все фичи есть в данных
missing = [f for f in features if f not in df.columns]
if missing:
    raise ValueError(f"Отсутствуют признаки в X_train: {missing}\n"
                     f"Убедись, что они посчитаны и присутствуют до запуска MLP.")

# на всякий случай — заменить NaN/inf
df_4[features] = df[features].fillna(0.0)
X_test_4[features] = X_test_4[features].fillna(0.0)

X = df_4[features].astype(float)
y = df_4['target'].astype(int).values

# train/val split (стратифицированный)
x_tr, x_val, y_tr, y_val = train_test_split(
    X, y, test_size=0.25, random_state=SEED, stratify=y
)

# масштабирование (только по train)
scaler = RobustScaler()
x_tr_sc = scaler.fit_transform(x_tr)
x_val_sc = scaler.transform(x_val)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


ValueError: Input X contains infinity or a value too large for dtype('float64').

In [None]:
# -------------------------
# 2) PyTorch Dataset/Dataloader
# -------------------------
class TabDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.from_numpy(X).float()
        self.y = None if y is None else torch.from_numpy(y).float()
    def __len__(self):
        return self.X.shape[0]
    def __getitem__(self, idx):
        if self.y is None:
            return self.X[idx]
        return self.X[idx], self.y[idx]

train_ds = TabDataset(x_tr_sc, y_tr)
val_ds   = TabDataset(x_val_sc, y_val)
# sub_ds   = TabDataset(x_sub_sc, None)

train_loader = DataLoader(train_ds, batch_size=4096, shuffle=True, num_workers=0)
val_loader   = DataLoader(val_ds,   batch_size=8192, shuffle=False, num_workers=0)
# sub_loader   = DataLoader(sub_ds,   batch_size=8192, shuffle=False, num_workers=0)

In [None]:

# -------------------------
# 3) Model
# -------------------------
class MLP(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, 256),
            nn.BatchNorm1d(256),
            nn.GELU(),
            nn.Dropout(0.20),

            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.GELU(),
            nn.Dropout(0.20),

            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.GELU(),
            nn.Dropout(0.10),

            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.GELU(),
            nn.Dropout(0.10),

            nn.Linear(32, 1)  # логит
        )
    def forward(self, x):
        return self.net(x).squeeze(1)  # [B]

in_dim = len(features)
model = MLP(in_dim).to(DEVICE)

# -------------------------
# 4) Loss (pos_weight) & Optimizer & Scheduler
# -------------------------
# дисбаланс: pos_weight = N_neg/N_pos
pos = y_tr.sum()
neg = len(y_tr) - pos
pos_weight_value = (neg / max(1, pos))
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight_value, device=DEVICE))

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-3, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

In [None]:

# -------------------------
# 5) Train loop with early stopping
# -------------------------
def evaluate(model, loader):
    model.eval()
    all_logits, all_y = [], []
    with torch.no_grad():
        for batch in loader:
            xb, yb = batch
            xb = xb.to(DEVICE); yb = yb.to(DEVICE)
            logits = model(xb)
            all_logits.append(logits.detach().cpu().numpy())
            all_y.append(yb.detach().cpu().numpy())
    logits = np.concatenate(all_logits)
    y_true = np.concatenate(all_y)
    probs = 1.0 / (1.0 + np.exp(-logits))
    # AUC для контроля, F1 посчитаем с порогом отдельно
    auc = roc_auc_score(y_true, probs)
    return auc, probs, y_true

best_auc = -np.inf
best_state = None
patience, patience_left = 8, 8
EPOCHS = 10
scaler_amp = torch.cuda.amp.GradScaler(enabled=(DEVICE=="cuda"))

for epoch in tqdm(range(1, EPOCHS+1)):
    model.train()
    epoch_loss = 0.0
    for batch in train_loader:
        xb, yb = batch
        xb = xb.to(DEVICE); yb = yb.to(DEVICE)

        optimizer.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
            logits = model(xb)
            loss = criterion(logits, yb)
        scaler_amp.scale(loss).backward()
        scaler_amp.step(optimizer)
        scaler_amp.update()
        epoch_loss += loss.item()

    scheduler.step()

    val_auc, val_probs, val_true = evaluate(model, val_loader)
    print(f"Epoch {epoch:02d} | train_loss={epoch_loss/len(train_loader):.5f} | val_auc={val_auc:.5f}")

    if val_auc > best_auc + 1e-4:
        best_auc = val_auc
        best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
        patience_left = patience
    else:
        patience_left -= 1
        if patience_left == 0:
            print("Early stopping.")
            break

# restore best
if best_state is not None:
    model.load_state_dict({k: v.to(DEVICE) for k, v in best_state.items()})

# -------------------------
# 6) Threshold tuning for F1
# -------------------------
# вычислим ещё раз на валидации лучшие пороги

# # (опционально) можно посмотреть и PR-кривую:
# # precisions, recalls, thr = precision_recall_curve(val_true, val_probs)

# # -------------------------
# # 7) Inference on test + submission
# # -------------------------
# # прогон по test
# model.eval()
# test_probs_all = []
# with torch.no_grad():
#     for xb in sub_loader:
#         xb = xb.to(DEVICE)
#         logits = model(xb)
#         probs = torch.sigmoid(logits)
#         test_probs_all.append(probs.detach().cpu().numpy())
# test_probs = np.concatenate(test_probs_all)

# # бинаризация по найденному порогу
# test_pred = (test_probs >= best_t).astype(int)

# submission = pd.DataFrame({
#     "id": X_test["id"].values,
#     "flag": test_pred
# })
# submission_path = "submission_mlp_tabular.csv"
# submission.to_csv(submission_path, index=False)
# print("Saved:", submission_path)

  scaler_amp = torch.cuda.amp.GradScaler(enabled=(DEVICE=="cuda"))
  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
 10%|█         | 1/10 [00:25<03:50, 25.61s/it]

Epoch 01 | train_loss=1.33416 | val_auc=0.56409


  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
 20%|██        | 2/10 [00:51<03:26, 25.77s/it]

Epoch 02 | train_loss=1.32950 | val_auc=0.56670


  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
 30%|███       | 3/10 [01:16<02:58, 25.49s/it]

Epoch 03 | train_loss=1.32829 | val_auc=0.57015


  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
 40%|████      | 4/10 [01:42<02:32, 25.48s/it]

Epoch 04 | train_loss=1.32712 | val_auc=0.57142


  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
 50%|█████     | 5/10 [02:07<02:06, 25.34s/it]

Epoch 05 | train_loss=1.32686 | val_auc=0.57075


  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
 60%|██████    | 6/10 [02:32<01:41, 25.36s/it]

Epoch 06 | train_loss=1.32614 | val_auc=0.57508


  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
 70%|███████   | 7/10 [02:59<01:17, 25.85s/it]

Epoch 07 | train_loss=1.32530 | val_auc=0.57355


  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
 80%|████████  | 8/10 [03:24<00:51, 25.66s/it]

Epoch 08 | train_loss=1.32493 | val_auc=0.57485


  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
 90%|█████████ | 9/10 [03:49<00:25, 25.52s/it]

Epoch 09 | train_loss=1.32414 | val_auc=0.57607


  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
100%|██████████| 10/10 [04:15<00:00, 25.51s/it]

Epoch 10 | train_loss=1.32345 | val_auc=0.57609





In [None]:
model.eval()
_, val_probs, val_true = evaluate(model, val_loader)

thresholds = np.linspace(0.01, 0.99, 99)
best_t, best_f1 = 0.5, 0.0
for t in thresholds:
    preds = (val_probs >= t).astype(int)
    f1 = f1_score(val_true, preds)
    if f1 > best_f1:
        best_f1, best_t = f1, t

print(f"Best F1 on VAL = {best_f1:.4f} at threshold={best_t:.3f}")

Best F1 on VAL = 0.0820 at threshold=0.550


## Ансамбль моделей с подбором точности

### Подготовка данных

#### Модель A (Catboost)

In [None]:
oof_cb  = np.zeros(len(X_tr), dtype=np.float32)
test_cb = np.zeros(len(X_te), dtype=np.float32)
models = {}

for fold, (tr_idx, vl_idx) in enumerate(folds, 1):
    Xtr, Xvl = X_tr.iloc[tr_idx], X_tr.iloc[vl_idx]
    ytr, yvl = y_tr[tr_idx],     y_tr[vl_idx]

    model_cb = CatBoostClassifier(
        iterations=3000,
        depth=10,
        learning_rate=0.03,
        l2_leaf_reg=12,
        bagging_temperature=0.4,
        auto_class_weights='Balanced',  # важный момент при 3.3% позитивов
        eval_metric='AUC',
        random_seed=42,
        verbose=50,
        early_stopping_rounds=200,

    )
    model_cb.fit(Xtr, ytr, eval_set=(Xvl, yvl), use_best_model=True)

    models[fold] = model_cb
    oof_cb[vl_idx] = model_cb.predict_proba(Xvl)[:, 1]
    test_cb       += model_cb.predict_proba(X_te)[:, 1] / skf.n_splits

0:	test: 0.6045104	best: 0.6045104 (0)	total: 2.06s	remaining: 1h 42m 56s
50:	test: 0.6217217	best: 0.6217217 (50)	total: 52.4s	remaining: 50m 27s
100:	test: 0.6252702	best: 0.6252702 (100)	total: 1m 43s	remaining: 49m 40s
150:	test: 0.6270390	best: 0.6270390 (150)	total: 2m 33s	remaining: 48m 7s
200:	test: 0.6282392	best: 0.6282392 (200)	total: 3m 21s	remaining: 46m 39s
250:	test: 0.6286820	best: 0.6286830 (248)	total: 4m 11s	remaining: 45m 52s
300:	test: 0.6288116	best: 0.6289138 (287)	total: 5m	remaining: 44m 58s
350:	test: 0.6292577	best: 0.6293020 (349)	total: 5m 48s	remaining: 43m 53s
400:	test: 0.6293799	best: 0.6294236 (397)	total: 6m 38s	remaining: 43m
450:	test: 0.6293790	best: 0.6294236 (397)	total: 7m 26s	remaining: 42m 1s
500:	test: 0.6292565	best: 0.6294236 (397)	total: 8m 15s	remaining: 41m 12s
550:	test: 0.6288378	best: 0.6294236 (397)	total: 9m 4s	remaining: 40m 20s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.6294236467
bestIteration = 397

Shr

In [None]:
oof_cb_e  = np.zeros(len(X_tr), dtype=np.float32)
for fold, (tr_idx, vl_idx) in enumerate(folds, 1):
    Xtr, Xvl = X_tr.iloc[tr_idx], X_tr.iloc[vl_idx]
    ytr, yvl = y_tr[tr_idx],     y_tr[vl_idx]

    oof_cb_e[vl_idx] = models[fold].predict(Xvl)

In [None]:
auc_cb = roc_auc_score(y_tr, oof_cb)
f1_cb = f1_score(y_tr, oof_cb_e)
print(f"CatBoost OOF AUC: {auc_cb:.5f}")
print(f"CatBoost OOF AUC: {f1_cb:.5f}")

CatBoost OOF AUC: 0.62969
CatBoost OOF AUC: 0.09242


#### Модель Лёгкая GRU на risk-последовательности

In [None]:
class SeqDS(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.from_numpy(X.astype(np.float32))
        self.y = None if y is None else torch.from_numpy(y.astype(np.float32))
    def __len__(self): return len(self.X)
    def __getitem__(self, i):
        return (self.X[i], self.y[i]) if self.y is not None else self.X[i]

class GRUSimple(nn.Module):
    def __init__(self, hidden=64):
        super().__init__()
        self.gru = nn.GRU(input_size=1, hidden_size=hidden, num_layers=2, batch_first=True, dropout=0.1)
        self.head = nn.Sequential(
            nn.Linear(hidden, 64), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(64, 1)
        )
    def forward(self, x):            # x: [B, 25]
        x = x.unsqueeze(-1)         # -> [B, 25, 1]
        out, _ = self.gru(x)        # -> [B, 25, H]
        feat = out[:, -1, :]        # берём последнее скрытое состояние
        logit = self.head(feat)     # -> [B, 1]
        return logit.squeeze(1)     # -> [B]

In [None]:

def train_gru_oof(X, y, Xtest, folds, epochs=5, bs=8192, pos_weight=30.0, lr=2e-3):
    oof = np.zeros(len(X), dtype=np.float32)
    tpred = np.zeros(len(Xtest), dtype=np.float32)

    for fold, (tr_idx, vl_idx) in enumerate(folds, 1):
        ds_tr = SeqDS(X[tr_idx], y[tr_idx])
        ds_vl = SeqDS(X[vl_idx], y[vl_idx])
        ds_te = SeqDS(Xtest, None)

        dl_tr = DataLoader(ds_tr, batch_size=bs, shuffle=True)
        dl_vl = DataLoader(ds_vl, batch_size=bs, shuffle=False)
        dl_te = DataLoader(ds_te, batch_size=bs, shuffle=False)

        model = GRUSimple(hidden=64).to(DEVICE)
        crit = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight, device=DEVICE))
        opt  = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)

        best_auc, best_state = -1, None
        for ep in tqdm(range(epochs)):
            # train
            model.train()
            for xb, yb in dl_tr:
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                opt.zero_grad(set_to_none=True)
                loss = crit(model(xb), yb)
                loss.backward(); opt.step()
            # valid
            model.eval()
            logits, yt = [], []
            with torch.no_grad():
                for xb, yb in dl_vl:
                    xb = xb.to(DEVICE)
                    lg = model(xb).detach().cpu().numpy()
                    logits.append(lg); yt.append(yb.numpy())
            p  = 1 / (1 + np.exp(-np.concatenate(logits)))
            yt = np.concatenate(yt)
            auc = roc_auc_score(yt, p)
            print('auc: ', auc)
            if auc > best_auc:
                best_auc = auc
                best_state = {k:v.cpu().clone() for k,v in model.state_dict().items()}

        # restore best
        model.load_state_dict({k:v.to(DEVICE) for k,v in best_state.items()})
        # OOF
        model.eval()
        logits=[]
        with torch.no_grad():
            for xb, yb in dl_vl:
                xb = xb.to(DEVICE)
                logits.append(model(xb).detach().cpu().numpy())
        oof[vl_idx] = 1/(1+np.exp(-np.concatenate(logits)))
        # test
        logits=[]
        with torch.no_grad():
            for xb in dl_te:
                xb = xb.to(DEVICE)
                logits.append(model(xb).detach().cpu().numpy())
        tpred += (1/(1+np.exp(-np.concatenate(logits)))) / len(folds)

    return oof, tpred

In [None]:
oof_gru, test_gru = train_gru_oof(risk_seq_tr, y_tr, risk_seq_te, folds, epochs=10, bs=8192, pos_weight=30.0, lr=2e-3)

 10%|█         | 1/10 [00:20<03:01, 20.22s/it]

auc:  0.5448132653868698


 20%|██        | 2/10 [00:39<02:38, 19.82s/it]

auc:  0.5311316110345211


 30%|███       | 3/10 [01:00<02:20, 20.07s/it]

auc:  0.5531873686131482


 40%|████      | 4/10 [01:19<01:59, 19.94s/it]

auc:  0.5549074713817976


 50%|█████     | 5/10 [01:39<01:39, 19.92s/it]

auc:  0.5566238254904868


 60%|██████    | 6/10 [01:59<01:19, 19.97s/it]

auc:  0.5587478612305072


 70%|███████   | 7/10 [02:19<00:59, 19.83s/it]

auc:  0.5617406570542861


 80%|████████  | 8/10 [02:39<00:39, 19.84s/it]

auc:  0.5600456704489001


 90%|█████████ | 9/10 [02:58<00:19, 19.63s/it]

auc:  0.5595705706117914


100%|██████████| 10/10 [03:18<00:00, 19.86s/it]

auc:  0.5592571126289064



 10%|█         | 1/10 [00:20<03:01, 20.21s/it]

auc:  0.5355539200738294


 20%|██        | 2/10 [00:39<02:37, 19.72s/it]

auc:  0.5425713336778037


 30%|███       | 3/10 [00:59<02:19, 19.88s/it]

auc:  0.5551699821933274


 40%|████      | 4/10 [01:19<01:58, 19.78s/it]

auc:  0.5577740190749482


 50%|█████     | 5/10 [01:38<01:38, 19.69s/it]

auc:  0.5588613788486939


 60%|██████    | 6/10 [01:58<01:19, 19.83s/it]

auc:  0.5611135957843044


 70%|███████   | 7/10 [02:18<00:59, 19.71s/it]

auc:  0.5634886663695995


 80%|████████  | 8/10 [02:38<00:39, 19.87s/it]

auc:  0.5658665892577034


 90%|█████████ | 9/10 [02:57<00:19, 19.67s/it]

auc:  0.5721471884376614


100%|██████████| 10/10 [03:18<00:00, 19.82s/it]

auc:  0.5723954858553728



 10%|█         | 1/10 [00:20<03:02, 20.23s/it]

auc:  0.5272246875430122


 20%|██        | 2/10 [00:39<02:36, 19.62s/it]

auc:  0.53367665841613


 30%|███       | 3/10 [00:59<02:18, 19.74s/it]

auc:  0.54992276604653


 40%|████      | 4/10 [01:18<01:58, 19.72s/it]

auc:  0.5555955866038408


 50%|█████     | 5/10 [01:38<01:38, 19.64s/it]

auc:  0.5572265981943275


 60%|██████    | 6/10 [01:58<01:19, 19.76s/it]

auc:  0.5075715733212469


 70%|███████   | 7/10 [02:17<00:58, 19.60s/it]

auc:  0.5587356300833521


 80%|████████  | 8/10 [02:37<00:39, 19.73s/it]

auc:  0.5606500192080847


 90%|█████████ | 9/10 [02:56<00:19, 19.57s/it]

auc:  0.4404313642222964


100%|██████████| 10/10 [03:16<00:00, 19.69s/it]

auc:  0.5594337451129523



 10%|█         | 1/10 [00:20<03:01, 20.17s/it]

auc:  0.5249995947286259


 20%|██        | 2/10 [00:39<02:36, 19.60s/it]

auc:  0.5352099607082692


 30%|███       | 3/10 [00:59<02:19, 19.92s/it]

auc:  0.5543081676980742


 40%|████      | 4/10 [01:19<01:59, 19.84s/it]

auc:  0.5582351207618965


 50%|█████     | 5/10 [01:39<01:39, 19.84s/it]

auc:  0.559918974611463


 60%|██████    | 6/10 [01:59<01:19, 19.90s/it]

auc:  0.5602546196747361


 70%|███████   | 7/10 [02:18<00:59, 19.73s/it]

auc:  0.562788147333708


 80%|████████  | 8/10 [02:38<00:39, 19.87s/it]

auc:  0.5638551708866413


 90%|█████████ | 9/10 [02:58<00:19, 19.69s/it]

auc:  0.5661346008715407


100%|██████████| 10/10 [03:18<00:00, 19.82s/it]

auc:  0.5678605082603955



 10%|█         | 1/10 [00:20<03:02, 20.23s/it]

auc:  0.5405854232630216


 20%|██        | 2/10 [00:39<02:36, 19.61s/it]

auc:  0.5538089903709853


 30%|███       | 3/10 [00:59<02:19, 19.88s/it]

auc:  0.5575015978851395


 40%|████      | 4/10 [01:19<01:58, 19.71s/it]

auc:  0.5607100572753552


 50%|█████     | 5/10 [01:38<01:38, 19.78s/it]

auc:  0.5621521457356247


 60%|██████    | 6/10 [01:58<01:19, 19.80s/it]

auc:  0.5659252425723955


 70%|███████   | 7/10 [02:18<00:58, 19.67s/it]

auc:  0.42989065423264416


 80%|████████  | 8/10 [02:38<00:39, 19.82s/it]

auc:  0.4321305830052229


 90%|█████████ | 9/10 [02:57<00:19, 19.65s/it]

auc:  0.5694318632324534


100%|██████████| 10/10 [03:17<00:00, 19.76s/it]

auc:  0.572552969934361





In [None]:
auc_gru = roc_auc_score(y_tr, oof_gru)
print(f"GRU OOF AUC: {auc_gru:.5f}")

GRU OOF AUC: 0.51479


#### Бленд и подбор порога под F1

In [None]:
oof_blend  = oof_cb.copy()
test_blend = test_cb.copy()

try:
    oof_blend  = 0.6*oof_cb + 0.4*oof_gru
    test_blend = 0.6*test_cb + 0.4*test_gru
except NameError:
    pass  # GRU не считали — используем только CatBoost

best_t, best_f1 = 0.5, 0.0
for t in np.linspace(0.01, 0.7, 40):
    f1 = f1_score(y_tr, (oof_blend >= t).astype(int))
    if f1 > best_f1:
        best_f1, best_t = f1, t
print(f"OOF F1(best): {best_f1:.4f} @ threshold={best_t:.3f}")


OOF F1(best): 0.1093 @ threshold=0.558


In [None]:
submission = pd.DataFrame({
    'id': X_test['id'].values,
    'flag': (test_blend >= best_t).astype(int)
})
submission.to_csv("submission_soq_final_blend.csv", index=False)
print("Saved: submission_soq_final_blend.csv")

## Много фичей

In [15]:
from catboost import CatBoostClassifier
import os, gc, math, warnings
warnings.filterwarnings("ignore")

oof_cb  = np.zeros(len(X_tr), dtype=np.float32)
test_cb = np.zeros(len(X_te), dtype=np.float32)
models = {}
for fold, (tr_idx, vl_idx) in enumerate(folds, 1):
    Xtr, Xvl = X_tr.iloc[tr_idx], X_tr.iloc[vl_idx]
    ytr_, yvl_ = y_tr[tr_idx], y_tr[vl_idx]

    model_cb = CatBoostClassifier(
        iterations=2500,
        depth=12,
        learning_rate=0.04,
        l2_leaf_reg=12,
        bagging_temperature=0.5,
        auto_class_weights='Balanced',
        eval_metric='AUC',
        random_seed=42,
        verbose=20,
        early_stopping_rounds=200
    )
    model_cb.fit(Xtr, ytr_, eval_set=(Xvl, yvl_), use_best_model=True)
    models[fold] = model_cb
    oof_cb[vl_idx] = model_cb.predict_proba(Xvl)[:, 1]
    test_cb       += model_cb.predict_proba(X_te)[:, 1] / skf.n_splits

auc_cb = roc_auc_score(y_tr, oof_cb)
print(f"CatBoost OOF AUC: {auc_cb:.5f}")
gc.collect()

0:	test: 0.5989733	best: 0.5989733 (0)	total: 6.4s	remaining: 4h 26m 39s


KeyboardInterrupt: 

In [4]:
f1 = pd.read_csv('models/features_1.csv')
f2 = pd.read_csv('models/features_2.csv')

In [5]:
# 2) Нормализация и блендинг важностей
def normalize_minmax(s):
    s = s.astype(float); m, M = s.min(), s.max()
    return (s - m) / (M - m + 1e-12) if M > m else s*0

def normalize_l1(s):
    s = s.astype(float).clip(lower=0); S = s.sum()
    return s / S if S > 0 else s

f1['imp_minmax'] = normalize_minmax(f1['importance'])
f2['imp_minmax'] = normalize_minmax(f2['importance'])
f1['imp_l1']     = normalize_l1(f1['importance'])
f2['imp_l1']     = normalize_l1(f2['importance'])

blend = f1.merge(f2[['feature','imp_minmax','imp_l1']], on='feature', how='outer', suffixes=('_1','_2')).fillna(0.0)
# blended score = усреднение minmax и l1 между запусками
blend['score'] = 0.5*(0.5*(blend['imp_minmax_1']+blend['imp_minmax_2']) + 0.5*(blend['imp_l1_1']+blend['imp_l1_2']))

TOP_K = 400  # можно 300–600; 400 — хороший старт
blend_sorted = blend.sort_values('score', ascending=False).reset_index(drop=True)
top_features = blend_sorted.head(TOP_K)['feature'].tolist()

# 3) Чистим мусор (Unnamed, id, явные дубликаты с суффиксами .1/.2 ...)
def clean_feature_list(feats):
    cleaned, seen = [], set()
    for s in map(str, feats):
        if s.lower().startswith('unnamed'): continue
        if s == 'id' or s.startswith('id.'): continue
        if s.endswith(('.1','.2','.3','.4')): continue
        if s not in seen:
            cleaned.append(s); seen.add(s)
    return cleaned

top_features_clean = clean_feature_list(top_features)
print("Selected features:", len(top_features_clean))
# При использовании ОБЯЗАТЕЛЬНО пересечь со своими текущими колонками X_tr:
# top_features_clean = [c for c in top_features_clean if c in X_tr.columns]


Selected features: 378


In [None]:
y_tr = y_tr.values

AttributeError: 'numpy.ndarray' object has no attribute 'values'

In [60]:
x_train, x_val, y_train, y_val = train_test_split(X_tr, y_tr, test_size=0.00001, random_state=42)

In [65]:
depth = 13
model_p = CatBoostClassifier(
    iterations=220,
    depth=depth,
    learning_rate=0.05,
    l2_leaf_reg=13,
    bagging_temperature=0.5,
    auto_class_weights='Balanced',
    eval_metric='Precision',  # Оптимизация по F1
    random_seed=42,
    verbose=20
)

# Обучение модели
model_p.fit(x_train, y_train)

0:	learn: 0.6065944	total: 11s	remaining: 40m 13s
20:	learn: 0.6425812	total: 3m 23s	remaining: 32m 10s
40:	learn: 0.6607436	total: 6m 41s	remaining: 29m 10s
60:	learn: 0.6754747	total: 9m 47s	remaining: 25m 32s
80:	learn: 0.6860088	total: 13m	remaining: 22m 19s
100:	learn: 0.6926854	total: 15m 53s	remaining: 18m 43s
120:	learn: 0.7033047	total: 18m 52s	remaining: 15m 26s
140:	learn: 0.7120634	total: 21m 58s	remaining: 12m 18s
160:	learn: 0.7189710	total: 24m 56s	remaining: 9m 8s
180:	learn: 0.7273515	total: 28m 2s	remaining: 6m 2s
200:	learn: 0.7338458	total: 31m 7s	remaining: 2m 56s
219:	learn: 0.7407929	total: 33m 52s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x2def1c27110>

In [80]:
model_p.save_model('models/model_5.cbm')

In [64]:
evaluate_model(y_train, model_p.predict(x_train), model_p.predict_proba(x_train)[:,1])

CatBoostError: There is no trained model to use predict(). Use fit() to train model. Then use this method.

In [81]:
feature_imp = pd.DataFrame({
        'feature': x_train.columns,
        'importance': model_p.feature_importances_
    }).sort_values('importance', key=abs, ascending=False)
feature_imp[:50]

Unnamed: 0,feature,importance
0,days_since_confirmed_risk,4.784701
1,credit_limit_risk,3.071058
2,full_credit_cost_risk,3.050712
3,sum_left_to_pay_risk,2.602537
5,credit_type_te,2.498753
4,maturity_fact_risk,2.133988
7,credit_type,2.112325
6,maturity_plan_risk,2.093736
31,credit_number_for_user_risk,1.52242
8,max_25,1.37932


In [None]:
model = CatBoostClassifier().load_model('catboost_model.cbm')
model.load_model('catboost_model.cbm')

In [66]:
oof_blend = model_p.predict_proba(x_train)[:,1]

In [59]:
oof_blend >= t

array([ True,  True,  True, ...,  True, False,  True], shape=(1827385,))

In [67]:
best_t, best_f1 = 0.5, 0.0
for t in np.linspace(0.01, 0.8, 99):
    f1 = f1_score(y_train, (oof_blend >= t).astype(int))
    if f1 > best_f1:
        best_f1, best_t = f1, t
print(f"OOF F1(best): {best_f1:.4f} @ threshold={best_t:.3f}")

OOF F1(best): 0.2412 @ threshold=0.631


In [73]:
baza = (model_p.predict_proba(X_te)[:,1] >=0.623).astype(int)

In [70]:
g = pd.read_csv('submission_soq_final_blend.csv')

In [112]:
roc_auc_score(g['flag'], baza)

np.float64(0.8547827341294307)

In [48]:
x_val

Unnamed: 0,days_since_confirmed_risk,credit_limit_risk,full_credit_cost_risk,sum_left_to_pay_risk,maturity_fact_risk,credit_type_te,maturity_plan_risk,credit_type,max_25,std_25,...,deterioration_count,overdue_severity_score__mx__maturity_ratio,deterioration_count__d__has_clean_history,h12_16,h12_15,h12_14,h12_13,h25_6,h25_7,h25_8
268646,0.028799,0.022817,0.024520,0.033441,0.031848,0.031867,0.032476,4.0,0.035877,0.002728,...,1.0,162.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1075397,0.034983,0.033305,0.025237,0.031667,0.030653,0.031396,0.030285,7.0,0.046564,0.003921,...,1.0,162.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
833234,0.037793,0.035948,0.034146,0.038332,0.034675,0.031916,0.031658,4.0,0.048503,0.004062,...,1.0,162.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
238404,0.035731,0.035327,0.029981,0.031592,0.031765,0.031867,0.030992,4.0,0.043483,0.003036,...,1.0,162.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1724757,0.037409,0.033130,0.029329,0.031562,0.039055,0.031380,0.025411,3.0,0.048503,0.004062,...,1.0,162.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1719040,0.036269,0.022919,0.025238,0.033156,0.034640,0.019914,0.024937,0.0,0.034098,0.001811,...,1.0,162.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1636739,0.037744,0.034624,0.033788,0.031600,0.029812,0.031518,0.024937,3.0,0.031210,0.001032,...,0.0,162.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1719281,0.034821,0.022919,0.030317,0.031600,0.039202,0.021153,0.035653,2.0,0.034886,0.002766,...,1.0,162.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
556439,0.026359,0.033203,0.025398,0.031592,0.029112,0.031867,0.030261,4.0,0.035486,0.002737,...,1.0,162.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
evaluate_model(y_train, model_p.predict(x_train), model_p.predict_proba(x_train)[:,1])

МЕТРИКИ КАЧЕСТВА МОДЕЛИ
Accuracy:  0.7488
Precision: 0.0826
Recall:    0.6584
F1-score:  0.1468
ROC-AUC:   0.7857
Avg Precision: 0.1418

------------------------------
MATRIXA SOWMESHENIY (CONFUSION MATRIX)
------------------------------
True Negative (TN):  1262375
False Positive (FP): 416679
False Negative (FN): 19464
True Positive (TP):  37515

Матрица в виде таблицы:
[[TN 1262375   FP 416679]
 [FN 19464   TP 37515]]

Specificity (TNR): 0.7518
False Positive Rate (FPR): 0.2482
False Negative Rate (FNR): 0.3416

------------------------------
DETALNY OTCHET (CLASSIFICATION REPORT)
------------------------------
              precision    recall  f1-score   support

     Class 0       0.98      0.75      0.85   1679054
     Class 1       0.08      0.66      0.15     56979

    accuracy                           0.75   1736033
   macro avg       0.53      0.71      0.50   1736033
weighted avg       0.96      0.75      0.83   1736033



{'accuracy': 0.7487703286746278,
 'precision': 0.08259686389516374,
 'recall': 0.6584004633285947,
 'f1': 0.14678005293706828,
 'roc_auc': np.float64(0.7857371198737821),
 'confusion_matrix': array([[1262375,  416679],
        [  19464,   37515]]),
 'tn': np.int64(1262375),
 'fp': np.int64(416679),
 'fn': np.int64(19464),
 'tp': np.int64(37515)}

In [18]:
import lightgbm as lgb

oof_lgb  = np.zeros(len(X_tr), dtype=np.float32)
test_lgb = np.zeros(len(X_te), dtype=np.float32)

lgb_params = dict(
    objective='binary',
    metric='auc',
    boosting_type='gbdt',
    learning_rate=0.03,
    num_leaves=255,
    max_depth=-1,
    min_data_in_leaf=50,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l1=0.0,
    lambda_l2=1.0,
    scale_pos_weight=( (y_tr==0).sum() / max(1,(y_tr==1).sum()) ),  # ~30
    verbose=-1,
    seed=42
)

for fold, (tr_idx, vl_idx) in enumerate(folds, 1):
    Xtr, Xvl = X_tr.iloc[tr_idx], X_tr.iloc[vl_idx]
    ytr_, yvl_ = y_tr[tr_idx], y_tr[vl_idx]

    dtr = lgb.Dataset(Xtr, label=ytr_)
    dvl = lgb.Dataset(Xvl, label=yvl_, reference=dtr)

    model_lgb = lgb.train(
        lgb_params, dtr,
        num_boost_round=8000,
        valid_sets=[dtr, dvl],
        valid_names=['train','valid'],
        verbose_eval=20
    )
    oof_lgb[vl_idx] = model_lgb.predict(Xvl, num_iteration=model_lgb.best_iteration)
    test_lgb       += model_lgb.predict(X_te, num_iteration=model_lgb.best_iteration) / skf.n_splits

auc_lgb = roc_auc_score(y_tr, oof_lgb)
print(f"LightGBM OOF AUC: {auc_lgb:.5f}")
gc.collect()

MemoryError: Unable to allocate 4.50 GiB for an array with shape (441, 1370553) and data type float64

In [62]:
del target

In [11]:
import lightgbm as lgb

In [19]:
x_tr, x_val, y_tr_, y_val_ = train_test_split(X_tr, y_tr, test_size=0.05, stratify=y_tr)

# Очищаем названия признаков от специальных символов
x_tr_clean = x_tr
x_val_clean = x_val

if hasattr(x_tr, 'columns'):
    x_tr.columns = [str(col).replace('"', '').replace("'", "").replace("{", "").replace("}", "").replace("[", "").replace("]", "") 
                         for col in x_tr.columns]
    x_val.columns = [str(col).replace('"', '').replace("'", "").replace("{", "").replace("}", "").replace("[", "").replace("]", "") 
                          for col in x_val.columns]

# Создаем датасеты с очищенными признаками
train_data = lgb.Dataset(x_tr, label=y_tr_)
valid_data = lgb.Dataset(x_val, label=y_val_, reference=train_data)

# Параметры
params = {
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.1,
    'num_leaves': 255,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'min_data_in_leaf': 50,
    'lambda_l2': 1.0,
    'scale_pos_weight': (y_tr == 0).sum() / max(1, (y_tr == 1).sum()),
    'seed': 42,
    'verbosity': 2
}

# Callbacks
callbacks = [
    lgb.early_stopping(stopping_rounds=30),
    lgb.log_evaluation(period=200)
]

# Обучение
model_ligh = lgb.train(
    params,
    train_data,
    num_boost_round=4000,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'valid'],
    callbacks=callbacks
)

MemoryError: Unable to allocate 2.85 GiB for an array with shape (441, 1736033) and data type float32

In [None]:
imp_gain = pd.Series(model_ligh.feature_importance(importance_type='gain'),
                        index=x_tr.columns, name=f'lgb_imp_seed{42}')

In [63]:
model_li, ip_g = train_lgb_and_importance(X_tr, y_tr, seed=42)

TypeError: train() got an unexpected keyword argument 'early_stopping_rounds'

In [None]:
# делим полный train на train/val для скорой оценки и тюнинга порога
x_train, x_val, y_train, y_val = train_test_split(
    X_tr_sel, y_tr, test_size=0.15, random_state=RANDOM_STATE, stratify=y_tr
)

# --- CatBoost финальный ---
final_cb = CatBoostClassifier(
    iterations=3000,
    depth=8,
    learning_rate=0.04,
    l2_leaf_reg=12,
    bagging_temperature=0.5,
    auto_class_weights='Balanced',
    eval_metric='AUC',
    random_seed=RANDOM_STATE,
    verbose=200,
    early_stopping_rounds=200
)
final_cb.fit(x_train, y_train, eval_set=(x_val, y_val), use_best_model=True, cat_features=[c for c in present_cats if c in X_tr_sel.columns])

val_cb_proba = final_cb.predict_proba(x_val)[:,1]

# --- LightGBM финальный ---
dtr_f = lgb.Dataset(x_train, label=y_train, categorical_feature=[c for c in present_cats if c in X_tr_sel.columns])
dvl_f = lgb.Dataset(x_val, label=y_val, reference=dtr_f, categorical_feature=[c for c in present_cats if c in X_tr_sel.columns])

params_f = dict(
    objective='binary', metric='auc',
    learning_rate=0.03,
    num_leaves=255, feature_fraction=0.8, bagging_fraction=0.8, bagging_freq=1,
    min_data_in_leaf=50, lambda_l2=1.0,
    scale_pos_weight=((y_tr==0).sum()/max(1,(y_tr==1).sum())),
    seed=RANDOM_STATE, verbose=-1
)
final_lgb = lgb.train(
    params_f, dtr_f,
    num_boost_round=8000,
    valid_sets=[dtr_f, dvl_f],
    early_stopping_rounds=300,
    verbose_eval=200
)
val_lgb_proba = final_lgb.predict(x_val, num_iteration=final_lgb.best_iteration)

# сохраним финальные модели
final_cb.save_model("final_cb_topk.cbm")
final_lgb.save_model("final_lgb_topk.txt")


In [None]:
from sklearn.linear_model import LogisticRegression

# стек из двух вероятностей
val_stack = np.vstack([val_cb_proba, val_lgb_proba]).T
meta = LogisticRegression(
    penalty='l2', C=1.0, solver='lbfgs', max_iter=2000,
    class_weight={0:1.0, 1: ((y_tr==0).sum()/max(1,(y_tr==1).sum()))}
)
meta.fit(val_stack, y_val)
val_meta = meta.predict_proba(val_stack)[:,1]

# тюнинг порога под F1
best_t, best_f1 = 0.5, 0.0
for t in np.linspace(0.01, 0.6, 60):
    f1 = f1_score(y_val, (val_meta>=t).astype(int))
    if f1 > best_f1:
        best_f1, best_t = f1, t

auc_cb  = roc_auc_score(y_val, val_cb_proba)
auc_lgb = roc_auc_score(y_val, val_lgb_proba)
auc_meta= roc_auc_score(y_val, val_meta)

print(f"Val AUC: CB={auc_cb:.4f} | LGB={auc_lgb:.4f} | STACK={auc_meta:.4f}")
print(f"Val F1(best threshold): {best_f1:.4f} @ thr={best_t:.3f}")

# инференс на тесте
test_cb_proba  = final_cb.predict_proba(X_te_sel)[:,1]
test_lgb_proba = final_lgb.predict(X_te_sel, num_iteration=final_lgb.best_iteration)
test_stack = np.vstack([test_cb_proba, test_lgb_proba]).T
test_meta = meta.predict_proba(test_stack)[:,1]

submission = pd.DataFrame({
    'id': X_test['id'],
    'flag': (test_meta >= best_t).astype(int)
})
submission.to_csv("submission_topk_stack.csv", index=False)
submission.head()


In [None]:
# 1) Собираем стек-фичи из OOF предсказаний базовых моделей
stack_tr = np.vstack([oof_cb, oof_lgb]).T  # [N, 2]
stack_te = np.vstack([test_cb, test_lgb]).T

# 2) Логрег как мета-модель (учим на OOF → безопасно; на тест — применяем сразу)
meta = LogisticRegression(
    penalty='l2', C=1.0,
    solver='lbfgs',
    max_iter=1000,
    class_weight={0:1.0, 1: ( (y_tr==0).sum()/max(1,(y_tr==1).sum()) )}  # сбалансировать
)
meta.fit(stack_tr, y_tr)
oof_meta  = meta.predict_proba(stack_tr)[:,1]
test_meta = meta.predict_proba(stack_te)[:,1]

# 3) Тюним порог по F1 на OOF
best_t, best_f1 = 0.5, 0.0
for t in np.linspace(0.01, 0.7, 60):
    f1 = f1_score(y_tr, (oof_meta>=t).astype(int))
    if f1 > best_f1: best_f1, best_t = f1, t

auc_meta = roc_auc_score(y_tr, oof_meta)
print(f"STACK OOF AUC: {auc_meta:.5f}")
print(f"OOF F1(best): {best_f1:.5f} @ thr={best_t:.3f}")


In [79]:
submission = pd.DataFrame({
    'id': X_test['id'].values,
    'flag': baza
})
submission.to_csv("submission_blend_stack_max.csv", index=False)
submission.head()


Unnamed: 0,id,flag
0,532674,0
1,1048835,0
2,184294,0
3,1075748,0
4,2034965,0


In [94]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression

# Very small slice to fit strict time limit
ROWS = 200000

X_part = X_train[:ROWS]
y_df = y_train

df = X_part.merge(y_df, on="id", how="inner")
df["flag"] = df["flag"].astype(int)

# Balanced sample up to ~8k
pos = df[df.flag == 1]
neg = df[df.flag == 0]
n_pos = min(len(pos), 4000)
pos_sample = pos.sample(n=n_pos, random_state=42) if len(pos) > n_pos else pos
neg_sample = neg.sample(n=len(pos_sample), random_state=42)

sample = pd.concat([pos_sample, neg_sample], axis=0).sample(frac=1.0, random_state=42).reset_index(drop=True)

y = sample["flag"].values
X_sample = sample.drop(columns=["flag"])
feature_cols = [c for c in X_sample.columns if c != "id"]
X_feat = X_sample[feature_cols]

X_train, X_val, y_train, y_val = train_test_split(X_feat, y, test_size=0.2, stratify=y, random_state=42)

# Fast LR
lr = LogisticRegression(max_iter=200, class_weight="balanced", solver="liblinear")
lr.fit(X_train, y_train)

# Подбираем порог на тренировочных данных
train_proba = lr.predict_proba(X_train)[:, 1]
thresholds = np.linspace(0.05, 0.95, 181)
f1s_train = [f1_score(y_train, (train_proba >= t).astype(int)) for t in thresholds]
best_idx = int(np.argmax(f1s_train))
best_t = float(thresholds[best_idx])

# Применяем лучший порог на валидационных данных
val_proba = lr.predict_proba(X_val)[:, 1]
val_preds = (val_proba >= best_t).astype(int)
best_f1 = float(f1_score(y_val, val_preds))

res = pd.DataFrame({
    "model": ["LogReg (very small slice)"],
    "best_F1": [best_f1],
    "best_threshold": [best_t],
    "train_rows": [len(X_train)],
    "val_rows": [len(X_val)],
    "X_rows_loaded": [ROWS]
})

print("Best F1 (baseline):", round(best_f1, 4), "at threshold", round(best_t, 3))

Best F1 (baseline): 0.6647 at threshold 0.325


In [97]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.linear_model import LogisticRegression

# Объединяем признаки и целевую переменную
df = X_train.merge(y_train, on="id", how="inner")
df["flag"] = df["flag"].astype(int)

# Разделяем на признаки и целевую переменную
X = df.drop(columns=["flag", "id"])
y = df["flag"].values

# Разделяем на тренировочную и валидационную выборки
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42, shuffle=True
)

print(f"Размер тренировочной выборки: {len(X_train_split)}")
print(f"Размер валидационной выборки: {len(X_val)}")
print(f"Баланс классов в тренировочной: {np.bincount(y_train_split)}")
print(f"Баланс классов в валидационной: {np.bincount(y_val)}")

# Обучаем модель на тренировочной выборке
lr = LogisticRegression(
    max_iter=1000, 
    class_weight="balanced", 
    solver="liblinear",
    random_state=42
)
lr.fit(X_train_split, y_train_split)

# Подбираем порог на тренировочной выборке
train_proba = lr.predict_proba(X_train_split)[:, 1]
thresholds = np.linspace(0.05, 0.95, 181)
f1s_train = [f1_score(y_train_split, (train_proba >= t).astype(int)) for t in thresholds]
best_idx = int(np.argmax(f1s_train))
best_threshold = float(thresholds[best_idx])
best_f1_train = float(f1s_train[best_idx])

print(f"\nЛучший порог на тренировочной выборке: {best_threshold:.3f}")
print(f"F1-score на тренировочной выборке: {best_f1_train:.4f}")

# Делаем предсказания на валидационной выборке (без подглядывания!)
val_proba = lr.predict_proba(X_val)[:, 1]
val_preds = (val_proba >= best_threshold).astype(int)

# Оцениваем реальную точность на валидационной выборке
val_f1 = f1_score(y_val, val_preds)
val_accuracy = accuracy_score(y_val, val_preds)
val_precision = precision_score(y_val, val_preds)
val_recall = recall_score(y_val, val_preds)

print("\n" + "="*50)
print("РЕАЛЬНЫЕ РЕЗУЛЬТАТЫ НА ВАЛИДАЦИОННОЙ ВЫБОРКЕ:")
print("="*50)
print(f"F1-score: {val_f1:.4f}")
print(f"Accuracy: {val_accuracy:.4f}")
print(f"Precision: {val_precision:.4f}")
print(f"Recall: {val_recall:.4f}")

# Матрица ошибок
cm = confusion_matrix(y_val, val_preds)
print(f"\nМатрица ошибок:")
print(cm)
print(f"TN: {cm[0,0]}, FP: {cm[0,1]}")
print(f"FN: {cm[1,0]}, TP: {cm[1,1]}")

# Сравнение с дефолтным порогом 0.5
default_preds = (val_proba >= 0.5).astype(int)
default_f1 = f1_score(y_val, default_preds)

print(f"\nСравнение с порогом 0.5:")
print(f"F1-score с порогом 0.5: {default_f1:.4f}")
print(f"Улучшение: {val_f1 - default_f1:.4f}")

# Результаты
results = pd.DataFrame({
    'metric': ['F1-score', 'Accuracy', 'Precision', 'Recall'],
    'train': [best_f1_train, accuracy_score(y_train_split, (train_proba >= best_threshold).astype(int)), 
             precision_score(y_train_split, (train_proba >= best_threshold).astype(int)),
             recall_score(y_train_split, (train_proba >= best_threshold).astype(int))],
    'validation': [val_f1, val_accuracy, val_precision, val_recall]
})

print(f"\nСравнение результатов:")
print(results)

Размер тренировочной выборки: 1461923
Размер валидационной выборки: 365481
Баланс классов в тренировочной: [1413952   47971]
Баланс классов в валидационной: [353488  11993]

Лучший порог на тренировочной выборке: 0.585
F1-score на тренировочной выборке: 0.0911

РЕАЛЬНЫЕ РЕЗУЛЬТАТЫ НА ВАЛИДАЦИОННОЙ ВЫБОРКЕ:
F1-score: 0.0932
Accuracy: 0.8727
Precision: 0.0608
Recall: 0.1995

Матрица ошибок:
[[316548  36940]
 [  9600   2393]]
TN: 316548, FP: 36940
FN: 9600, TP: 2393

Сравнение с порогом 0.5:
F1-score с порогом 0.5: 0.0820
Улучшение: 0.0112

Сравнение результатов:
      metric     train  validation
0   F1-score  0.091055    0.093247
1   Accuracy  0.871643    0.872661
2  Precision  0.059309    0.060839
3     Recall  0.195931    0.199533


In [113]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Объединяем признаки и целевую переменную
df = X_train.merge(y_train, on="id", how="inner")
df["flag"] = df["flag"].astype(int)

# Разделяем на признаки и целевую переменную
X = df.drop(columns=["flag", "id"])
y = df["flag"].values

# Разделяем на тренировочную и валидационную выборки
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42, shuffle=True
)

print(f"Размер тренировочной выборки: {len(X_train_split)}")
print(f"Размер валидационной выборки: {len(X_val)}")
print(f"Баланс классов в тренировочной: {np.bincount(y_train_split)}")
print(f"Баланс классов в валидационной: {np.bincount(y_val)}")

# Балансируем только тренировочную выборку
ros = RandomOverSampler(random_state=42)
X_train_balanced, y_train_balanced = ros.fit_resample(X_train_split, y_train_split)

# # Вместо oversampling можно использовать undersampling
# rus = RandomUnderSampler(random_state=42)
# X_train_balanced, y_train_balanced = rus.fit_resample(X_train_split, y_train_split)

# print(f"\nПосле undersampling тренировочной выборки:")
# print(f"Размер сбалансированной тренировочной: {len(X_train_balanced)}")
# print(f"Баланс классов: {np.bincount(y_train_balanced)}")


print(f"\nПосле балансировки тренировочной выборки:")
print(f"Размер сбалансированной тренировочной: {len(X_train_balanced)}")
print(f"Баланс классов: {np.bincount(y_train_balanced)}")

# Обучаем модель на сбалансированной тренировочной выборке
lr = LogisticRegression(
    max_iter=1000, 
    class_weight=None,  # Не используем class_weight т.к. уже сбалансировали
    solver="liblinear",
    random_state=42
)
lr.fit(X_train_balanced, y_train_balanced)

# Подбираем порог на сбалансированной тренировочной выборке
train_proba = lr.predict_proba(X_train_balanced)[:, 1]
thresholds = np.linspace(0.05, 0.95, 181)
f1s_train = [f1_score(y_train_balanced, (train_proba >= t).astype(int)) for t in thresholds]
best_idx = int(np.argmax(f1s_train))
best_threshold = float(thresholds[best_idx])
best_f1_train = float(f1s_train[best_idx])

print(f"\nЛучший порог на тренировочной выборке: {best_threshold:.3f}")
print(f"F1-score на тренировочной выборке: {best_f1_train:.4f}")

# Делаем предсказания на валидационной выборке (без балансировки!)
val_proba = lr.predict_proba(X_val)[:, 1]
val_preds = (val_proba >= best_threshold).astype(int)

# Оцениваем реальную точность на несбалансированной валидационной выборке
val_f1 = f1_score(y_val, val_preds)
val_accuracy = accuracy_score(y_val, val_preds)
val_precision = precision_score(y_val, val_preds)
val_recall = recall_score(y_val, val_preds)

print("\n" + "="*50)
print("РЕАЛЬНЫЕ РЕЗУЛЬТАТЫ НА НЕСБАЛАНСИРОВАННОЙ ВАЛИДАЦИОННОЙ ВЫБОРКЕ:")
print("="*50)
print(f"F1-score: {val_f1:.4f}")
print(f"Accuracy: {val_accuracy:.4f}")
print(f"Precision: {val_precision:.4f}")
print(f"Recall: {val_recall:.4f}")

# Матрица ошибок
cm = confusion_matrix(y_val, val_preds)
print(f"\nМатрица ошибок:")
print(cm)
print(f"TN: {cm[0,0]}, FP: {cm[0,1]}")
print(f"FN: {cm[1,0]}, TP: {cm[1,1]}")

# Сравнение с дефолтным порогом 0.5
default_preds = (val_proba >= 0.5).astype(int)
default_f1 = f1_score(y_val, default_preds)

print(f"\nСравнение с порогом 0.5:")
print(f"F1-score с порогом 0.5: {default_f1:.4f}")
print(f"Улучшение: {val_f1 - default_f1:.4f}")

# Результаты
results = pd.DataFrame({
    'metric': ['F1-score', 'Accuracy', 'Precision', 'Recall'],
    'train_balanced': [best_f1_train, 
                      accuracy_score(y_train_balanced, (train_proba >= best_threshold).astype(int)), 
                      precision_score(y_train_balanced, (train_proba >= best_threshold).astype(int)),
                      recall_score(y_train_balanced, (train_proba >= best_threshold).astype(int))],
    'validation_original': [val_f1, val_accuracy, val_precision, val_recall]
})

print(f"\nСравнение результатов:")
print(results.round(4))

Размер тренировочной выборки: 1461923
Размер валидационной выборки: 365481
Баланс классов в тренировочной: [1413952   47971]
Баланс классов в валидационной: [353488  11993]

После балансировки тренировочной выборки:
Размер сбалансированной тренировочной: 2827904
Баланс классов: [1413952 1413952]

Лучший порог на тренировочной выборке: 0.360
F1-score на тренировочной выборке: 0.6676

РЕАЛЬНЫЕ РЕЗУЛЬТАТЫ НА НЕСБАЛАНСИРОВАННОЙ ВАЛИДАЦИОННОЙ ВЫБОРКЕ:
F1-score: 0.0646
Accuracy: 0.0637
Precision: 0.0334
Recall: 0.9851

Матрица ошибок:
[[ 11455 342033]
 [   179  11814]]
TN: 11455, FP: 342033
FN: 179, TP: 11814

Сравнение с порогом 0.5:
F1-score с порогом 0.5: 0.0821
Улучшение: -0.0175

Сравнение результатов:
      metric  train_balanced  validation_original
0   F1-score          0.6676               0.0646
1   Accuracy          0.5091               0.0637
2  Precision          0.5047               0.0334
3     Recall          0.9858               0.9851


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Объединяем признаки и целевую переменную
df = X_train.merge(y_train, on="id", how="inner")
df["flag"] = df["flag"].astype(int)

# Разделяем на признаки и целевую переменную
X = df.drop(columns=["flag", "id"])
y = df["flag"].values

# Разделяем на тренировочную и валидационную выборки
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42, shuffle=True
)

print(f"Размер тренировочной выборки: {len(X_train_split)}")
print(f"Размер валидационной выборки: {len(X_val)}")
print(f"Баланс классов в тренировочной: {np.bincount(y_train_split)}")
print(f"Баланс классов в валидационной: {np.bincount(y_val)}")

# # Балансируем только тренировочную выборку
# ros = RandomOverSampler(random_state=42)
# X_train_balanced, y_train_balanced = ros.fit_resample(X_train_split, y_train_split)

# Вместо oversampling можно использовать undersampling
rus = RandomUnderSampler(random_state=42)
X_train_balanced, y_train_balanced = rus.fit_resample(X_train_split, y_train_split)

print(f"\nПосле undersampling тренировочной выборки:")
print(f"Размер сбалансированной тренировочной: {len(X_train_balanced)}")
print(f"Баланс классов: {np.bincount(y_train_balanced)}")


print(f"\nПосле балансировки тренировочной выборки:")
print(f"Размер сбалансированной тренировочной: {len(X_train_balanced)}")
print(f"Баланс классов: {np.bincount(y_train_balanced)}")

# Обучаем модель на сбалансированной тренировочной выборке
lr = CatBoostClassifier(
    iterations=500,
    depth=9,
    learning_rate=0.04,
    l2_leaf_reg=12,
    bagging_temperature=0.5,
    auto_class_weights='Balanced',
    eval_metric='AUC',
    verbose=200,
    early_stopping_rounds=200
)
lr.fit(X_train_balanced, y_train_balanced)

# Подбираем порог на сбалансированной тренировочной выборке
train_proba = lr.predict_proba(X_train_balanced)[:, 1]
thresholds = np.linspace(0.05, 0.95, 181)
f1s_train = [f1_score(y_train_balanced, (train_proba >= t).astype(int)) for t in thresholds]
best_idx = int(np.argmax(f1s_train))
best_threshold = float(thresholds[best_idx])
best_f1_train = float(f1s_train[best_idx])

print(f"\nЛучший порог на тренировочной выборке: {best_threshold:.3f}")
print(f"F1-score на тренировочной выборке: {best_f1_train:.4f}")

# Делаем предсказания на валидационной выборке (без балансировки!)
val_proba = lr.predict_proba(X_val)[:, 1]
val_preds = (val_proba >= best_threshold).astype(int)

# Оцениваем реальную точность на несбалансированной валидационной выборке
val_f1 = f1_score(y_val, val_preds)
val_accuracy = accuracy_score(y_val, val_preds)
val_precision = precision_score(y_val, val_preds)
val_recall = recall_score(y_val, val_preds)

print("\n" + "="*50)
print("РЕАЛЬНЫЕ РЕЗУЛЬТАТЫ НА НЕСБАЛАНСИРОВАННОЙ ВАЛИДАЦИОННОЙ ВЫБОРКЕ:")
print("="*50)
print(f"F1-score: {val_f1:.4f}")
print(f"Accuracy: {val_accuracy:.4f}")
print(f"Precision: {val_precision:.4f}")
print(f"Recall: {val_recall:.4f}")

# Матрица ошибок
cm = confusion_matrix(y_val, val_preds)
print(f"\nМатрица ошибок:")
print(cm)
print(f"TN: {cm[0,0]}, FP: {cm[0,1]}")
print(f"FN: {cm[1,0]}, TP: {cm[1,1]}")

# Сравнение с дефолтным порогом 0.5
default_preds = (val_proba >= 0.62).astype(int)
default_f1 = f1_score(y_val, default_preds)

print(f"\nСравнение с порогом 0.5:")
print(f"F1-score с порогом 0.5: {default_f1:.4f}")
print(f"Улучшение: {val_f1 - default_f1:.4f}")

# Результаты
results = pd.DataFrame({
    'metric': ['F1-score', 'Accuracy', 'Precision', 'Recall'],
    'train_balanced': [best_f1_train, 
                      accuracy_score(y_train_balanced, (train_proba >= best_threshold).astype(int)), 
                      precision_score(y_train_balanced, (train_proba >= best_threshold).astype(int)),
                      recall_score(y_train_balanced, (train_proba >= best_threshold).astype(int))],
    'validation_original': [val_f1, val_accuracy, val_precision, val_recall]
})

print(f"\nСравнение результатов:")
print(results.round(4))

Размер тренировочной выборки: 1461923
Размер валидационной выборки: 365481
Баланс классов в тренировочной: [1413952   47971]
Баланс классов в валидационной: [353488  11993]

После undersampling тренировочной выборки:
Размер сбалансированной тренировочной: 95942
Баланс классов: [47971 47971]

После балансировки тренировочной выборки:
Размер сбалансированной тренировочной: 95942
Баланс классов: [47971 47971]
0:	total: 49.2ms	remaining: 24.6s
200:	total: 6.13s	remaining: 9.12s
400:	total: 12.5s	remaining: 3.09s
499:	total: 15.8s	remaining: 0us

Лучший порог на тренировочной выборке: 0.400
F1-score на тренировочной выборке: 0.6893

РЕАЛЬНЫЕ РЕЗУЛЬТАТЫ НА НЕСБАЛАНСИРОВАННОЙ ВАЛИДАЦИОННОЙ ВЫБОРКЕ:
F1-score: 0.0714
Accuracy: 0.2553
Precision: 0.0372
Recall: 0.8731

Матрица ошибок:
[[ 82833 270655]
 [  1522  10471]]
TN: 82833, FP: 270655
FN: 1522, TP: 10471

Сравнение с порогом 0.5:
F1-score с порогом 0.5: 0.0886
Улучшение: -0.0172

Сравнение результатов:
      metric  train_balanced  validati

In [None]:
default_preds = (val_proba >= 0.62).astype(int)
default_f1 = f1_score(y_val, default_preds)

print(f"\nСравнение с порогом 0.5:")
print(f"F1-score с порогом 0.5: {default_f1:.4f}")
print(f"Улучшение: {val_f1 - default_f1:.4f}")

# Результаты
results = pd.DataFrame({
    'metric': ['F1-score', 'Accuracy', 'Precision', 'Recall'],
    'train_balanced': [best_f1_train, 
                      accuracy_score(y_train_balanced, (train_proba >= best_threshold).astype(int)), 
                      precision_score(y_train_balanced, (train_proba >= best_threshold).astype(int)),
                      recall_score(y_train_balanced, (train_proba >= best_threshold).astype(int))],
    'validation_original': [val_f1, val_accuracy, val_precision, val_recall]
})

print(f"\nСравнение результатов:")
print(results.round(4))

In [117]:
# Делаем предсказания на валидационной выборке (без балансировки!)
val_proba = lr.predict_proba(X_val)[:, 1]
val_preds = (val_proba >= 0.62).astype(int)

# Оцениваем реальную точность на несбалансированной валидационной выборке
val_f1 = f1_score(y_val, val_preds)
val_accuracy = accuracy_score(y_val, val_preds)
val_precision = precision_score(y_val, val_preds)
val_recall = recall_score(y_val, val_preds)

print("\n" + "="*50)
print("РЕАЛЬНЫЕ РЕЗУЛЬТАТЫ НА НЕСБАЛАНСИРОВАННОЙ ВАЛИДАЦИОННОЙ ВЫБОРКЕ:")
print("="*50)
print(f"F1-score: {val_f1:.4f}")
print(f"Accuracy: {val_accuracy:.4f}")
print(f"Precision: {val_precision:.4f}")
print(f"Recall: {val_recall:.4f}")

# Матрица ошибок
cm = confusion_matrix(y_val, val_preds)
print(f"\nМатрица ошибок:")
print(cm)
print(f"TN: {cm[0,0]}, FP: {cm[0,1]}")
print(f"FN: {cm[1,0]}, TP: {cm[1,1]}")

# Сравнение с дефолтным порогом 0.5
default_preds = (val_proba >= 0.5).astype(int)
default_f1 = f1_score(y_val, default_preds)

print(f"\nСравнение с порогом 0.5:")
print(f"F1-score с порогом 0.5: {default_f1:.4f}")
print(f"Улучшение: {val_f1 - default_f1:.4f}")

# Результаты
results = pd.DataFrame({
    'metric': ['F1-score', 'Accuracy', 'Precision', 'Recall'],
    'train_balanced': [best_f1_train, 
                      accuracy_score(y_train_balanced, (train_proba >= best_threshold).astype(int)), 
                      precision_score(y_train_balanced, (train_proba >= best_threshold).astype(int)),
                      recall_score(y_train_balanced, (train_proba >= best_threshold).astype(int))],
    'validation_original': [val_f1, val_accuracy, val_precision, val_recall]
})

print(f"\nСравнение результатов:")
print(results.round(4))


РЕАЛЬНЫЕ РЕЗУЛЬТАТЫ НА НЕСБАЛАНСИРОВАННОЙ ВАЛИДАЦИОННОЙ ВЫБОРКЕ:
F1-score: 0.1079
Accuracy: 0.8805
Precision: 0.0714
Recall: 0.2202

Матрица ошибок:
[[319155  34333]
 [  9352   2641]]
TN: 319155, FP: 34333
FN: 9352, TP: 2641

Сравнение с порогом 0.5:
F1-score с порогом 0.5: 0.0886
Улучшение: 0.0193

Сравнение результатов:
      metric  train_balanced  validation_original
0   F1-score          0.6893               0.1079
1   Accuracy          0.5927               0.8805
2  Precision          0.5572               0.0714
3     Recall          0.9034               0.2202


In [131]:
submission_boosted = pd.read_csv('submission_soft_strong.csv')

In [122]:
456852*0.033

15076.116

In [135]:
f1_score(submission_boosted['flag'], submission['flag'])

0.8704889971235463

In [134]:
submission_boosted

Unnamed: 0,id,flag
0,532674,0
1,1048835,0
2,184294,0
3,1075748,0
4,2034965,0
...,...,...
456847,1502480,0
456848,1469413,0
456849,263710,0
456850,1119056,0


In [129]:
submission

Unnamed: 0,id,flag
0,532674,0
1,1048835,0
2,184294,0
3,1075748,0
4,2034965,0
...,...,...
456847,1502480,0
456848,1469413,0
456849,263710,0
456850,1119056,0
