# School of Quants hackathon 2025 – Finals

## Импорты и настройки

In [96]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score
from catboost import CatBoostClassifier, Pool
# ЯЧЕЙКА 8 — GRU по risk-последовательности
import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("DEVICE:", DEVICE)


DEVICE: cpu


## Данные

In [2]:
X_train = pd.read_csv('res_2/X_train.csv')
y_train = pd.read_csv('res_2/y_train.csv')
X_test = pd.read_csv('res_2/X_test.csv')

In [3]:
df = X_train.copy()
df = df.sort_values(by='id').set_index('id')

In [4]:
df['target'] = y_train.set_index('id')['flag']

In [5]:
df.columns

Index(['credit_number_for_user', 'days_since_confirmed', 'maturity_plan',
       'maturity_fact', 'credit_limit', 'next_payment_sum', 'sum_left_to_pay',
       'current_overdue_debt', 'max_overdue_debt', 'full_credit_cost',
       'overdues_5d', 'overdues_5d_30d', 'overdues_30d_60d',
       'overdues_60d_90d', 'overdues_90d', 'no_overdues_5d',
       'no_overdues_5d_30d', 'no_overdues_30d_60d', 'no_overdues_60d_90d',
       'no_overdues_90d', 'enc_paym_0', 'enc_paym_1', 'enc_paym_2',
       'enc_paym_3', 'enc_paym_4', 'enc_paym_5', 'enc_paym_6', 'enc_paym_7',
       'enc_paym_8', 'enc_paym_9', 'enc_paym_10', 'enc_paym_11', 'enc_paym_12',
       'enc_paym_13', 'enc_paym_14', 'enc_paym_15', 'enc_paym_16',
       'enc_paym_17', 'enc_paym_18', 'enc_paym_19', 'enc_paym_20',
       'enc_paym_21', 'enc_paym_22', 'enc_paym_23', 'enc_paym_24',
       'credit_type', 'credit_currency', 'target'],
      dtype='object')

## Исследование данных и анализ признаков

### enc_paym_

#### Анализ

Надо однозначно понять, что за статусы платежей представлены, вот динамика платежей по месяцам (`enc_paym_{0..N}` - Статусы ежемесячных платежей за последние N месяцев***):

In [6]:
all_data = []
for month in range(0, 25):
    payment_counts = X_train.groupby(f'enc_paym_{month}')['id'].count()
    month_data = {
        'month': month,
        'status_0': payment_counts.get(0, 0),
        'status_1': payment_counts.get(1, 0),
        'status_2': payment_counts.get(2, 0),
        'status_3': payment_counts.get(3, 0)
    }
    all_data.append(month_data)
result_df = pd.DataFrame(all_data)
result_df

Unnamed: 0,month,status_0,status_1,status_2,status_3
0,0,1695569,53323,3950,74562
1,1,1528964,103377,8043,187020
2,2,1447139,92128,7369,280768
3,3,1374843,87819,7042,357700
4,4,1302740,82808,6396,435460
5,5,1227907,76923,5967,516607
6,6,1142545,70554,5145,609160
7,7,1050430,65498,4804,706672
8,8,988710,60230,4347,774117
9,9,927778,55380,3935,840311


Можно заметить, что 11, 20, 24ый месяцы отсутствуте значение status_0, но это ОБМАН, по динамике как раз видно, что данные сдвинуты и не хвататет как раз `status_3`. Посмотрим на тестовые данные:

In [7]:
all_data = []
for month in range(0, 25):
    payment_counts = X_train.groupby(f'enc_paym_{month}')['id'].count()
    
    month_data = {
        'month': month,
        'status_0': payment_counts.get(0, 0),
        'status_1': payment_counts.get(1, 0),
        'status_2': payment_counts.get(2, 0),
        'status_3': payment_counts.get(3, 0)
    }
    all_data.append(month_data)
result_df = pd.DataFrame(all_data)
result_df

Unnamed: 0,month,status_0,status_1,status_2,status_3
0,0,1695569,53323,3950,74562
1,1,1528964,103377,8043,187020
2,2,1447139,92128,7369,280768
3,3,1374843,87819,7042,357700
4,4,1302740,82808,6396,435460
5,5,1227907,76923,5967,516607
6,6,1142545,70554,5145,609160
7,7,1050430,65498,4804,706672
8,8,988710,60230,4347,774117
9,9,927778,55380,3935,840311


In [8]:
all_data = []
for month in range(0, 25):
    payment_counts = X_train.groupby(f'enc_paym_{month}')['id'].count()
    
    month_data = {
        'month': month,
        'status_0': payment_counts.get(0, 0),
        'status_1': payment_counts.get(1, 0),
        'status_2': payment_counts.get(2, 0),
        'status_3': payment_counts.get(3, 0)
    }
    all_data.append(month_data)
result_df = pd.DataFrame(all_data)
result_df

Unnamed: 0,month,status_0,status_1,status_2,status_3
0,0,1695569,53323,3950,74562
1,1,1528964,103377,8043,187020
2,2,1447139,92128,7369,280768
3,3,1374843,87819,7042,357700
4,4,1302740,82808,6396,435460
5,5,1227907,76923,5967,516607
6,6,1142545,70554,5145,609160
7,7,1050430,65498,4804,706672
8,8,988710,60230,4347,774117
9,9,927778,55380,3935,840311


In [9]:
# Сдвигаем определённые месяца
map_enc_paym_ = {1:0, 2:1, 3:2, 4:3}

df['enc_paym_11'] = df['enc_paym_11'].map(map_enc_paym_)
df['enc_paym_20'] = df['enc_paym_20'].map(map_enc_paym_)
df['enc_paym_24'] = df['enc_paym_24'].map(map_enc_paym_)

X_test['enc_paym_11'] = X_test['enc_paym_11'].map(map_enc_paym_)
X_test['enc_paym_20'] = X_test['enc_paym_20'].map(map_enc_paym_)
X_test['enc_paym_24'] = X_test['enc_paym_24'].map(map_enc_paym_)

Так всё же, что это за статусы? Если исходить из простой банковской логики и посчитать распределение дефолтов согласно статусам:
|N|default_rate|
|----------|----------|
|2 |0.125000 |
|1| 0.051819 |
|3 |0.034540 |
|0| 0.028840 |


- **Статус 0**: Своевременный платёж (timely paid). Доминирует в недавних месяцах (382k в месяц 1, ~80% активных кредитов), корреляция с target=False (не-дефолт) выше. Это "заплочено вовремя".
- **Статус 3**: Лёгкая/средняя просрочка (past due 1–60 дней). Редкий (25k–6k, ~5–1%), чаще у дефолтеров (target=True, до 25% в примерах), коррелирует с overdues_5d_30d. Это "просрочено" на ранних стадиях.
- **Статус 1**: Тяжёлая просрочка (past due 60+ дней или charge-off). Очень редкий (<0.5%, 2k–0.5k), почти только у дефолтеров, коррелирует с overdues_60d_90d/overdues_90d. Это "сильно просрочено".
- **Статус 2**: Это очень серьёзный маркер дефолта.

На основе датасета, идея - создать метрики, учитывающие:
- Количество и тяжесть просрочек (1, 2) в последних месяцах.
- Последовательности (streaks) или переходы (например, 0→1→2 хуже, чем 1→0).
- Веса для recency: недавние месяцы (enc_paym_0–2) важнее, чем enc_paym_3–5.

#### Сложные признаки

Заменяем случайные веса на осмысленные, которые увеличиваются с увеличением риска:

In [10]:
enc_paym_corrected = {
    0: 0,  #  → 0 (лучшая ситуация)
    3: 1,  #  → 1 (средний риск)
    1: 2,  #  → 2 (высокий риск)
    2: 3   #  → 3 (максимальный риск)
}
for month in range(0, 25):
    df[f'enc_paym_{month}'] = df[f'enc_paym_{month}'].map(enc_paym_corrected)
    X_test[f'enc_paym_{month}'] = X_test[f'enc_paym_{month}'].map(enc_paym_corrected)

#### Взвешенная сумма статусов (дисконтированная по времени)

$$S=s_0​⋅1+s_1​⋅0.9+s_2​⋅0.8+…$$

Смысл: 
- чем ближе к текущему моменту плохие статусы, тем сильнее они влияют;
- штрафы за «старые» косяки постепенно уменьшаются.
Это очень информативный признак, потому что модель получит сглаженный показатель уровня риска клиента с учётом динамики.

In [11]:
def weighted_status_score(df, w):
    df = df.copy()
    df['weighted_status_score'] = 0
    for month in range(0, 25):
        df['weighted_status_score'] += df[f'enc_paym_{month}'] * w[month]
    return df

In [12]:
df = weighted_status_score(df, np.flip((np.arange(25) * 1/25)))

In [13]:
X_test = weighted_status_score(X_test, np.flip((np.arange(25) * 1/25)))

#### Максимальная длина подряд идущих «плохих» статусов

Смысл:
- Если клиент допустил разовую просрочку, это не так страшно.
- Если у него 3–4 месяца подряд delinquent или defaulted, это почти всегда дефолтный сценарий.
Такой признак даёт сильный сигнал модели, потому что «устойчивость» плохого поведения предсказывает вероятность дефолта лучше, чем единичные выбросы.

In [14]:
def longest_run_bad_status(df, bad_statuses={2, 3}):
    df = df.copy()
    df['longest_run_bad_status'] = 0
    df['maxi'] = 0
    for month in tqdm(range(0, 25)):
        mask = df['maxi'] > df['longest_run_bad_status']
        df.loc[mask, 'longest_run_bad_status'] = df.loc[mask, 'maxi']
        
        # Обновляем maxi для всех строк
        df['maxi'] = df.apply(
            lambda row: row['maxi'] + 1 if row[f'enc_paym_{month}'] in bad_statuses else 0,
            axis=1
        )
    mask = df['maxi'] > df['longest_run_bad_status']
    df.loc[mask, 'longest_run_bad_status'] = df.loc[mask, 'maxi']
    return df['longest_run_bad_status']

In [15]:
df['longest_run_bad_status'] = longest_run_bad_status(df)

100%|██████████| 25/25 [07:01<00:00, 16.85s/it]


In [16]:
X_test['longest_run_bad_status'] = longest_run_bad_status(X_test)

100%|██████████| 25/25 [01:16<00:00,  3.05s/it]


#### Количество переходов «ухудшения положения»

Cчитается сколько раз статус ухудшился в истории:
- 0 → 1
- 1 → 2
- 2 → 3

Это отражает «тренд»: клиент движется к дефолту или колеблется.
Если много ухудшений подряд → клиент явно уходит в риск-зону.

In [17]:
def count_deterioration_transitions(df):
    df = df.copy()
    df['deterioration_count'] = 0
    for month in range(1, 25):
        current_col = f'enc_paym_{month}'
        prev_col = f'enc_paym_{month-1}'
        
        # Определяем условия ухудшения
        condition_0_to_1 = (df[prev_col] == 0) & (df[current_col] == 1)
        condition_1_to_2 = (df[prev_col] == 1) & (df[current_col] == 2)
        condition_2_to_3 = (df[prev_col] == 2) & (df[current_col] == 3)
        # Суммируем все случаи ухудшения
        deterioration_mask = condition_0_to_1 | condition_1_to_2 | condition_2_to_3
        df.loc[deterioration_mask, 'deterioration_count'] += 1
    return df['deterioration_count']

In [18]:
df['deterioration_count'] = count_deterioration_transitions(df)

In [19]:
X_test['deterioration_count'] = count_deterioration_transitions(X_test)

### credit_number_for_user

`credit_number_for_user` - Порядковый номер кредитного продукта в кредитной истории. Большему номеру соответствует продукт с более поздней датой открытия. 

Этот признак и так хорош, он отражает насколько можно доверять клиенту, ведь, если он брал кредиты ранее и возвращал, то у него хорошая кредитная история и выше шанс отдать деньги.

### days_since_confirmed

`days_since_confirmed` - Дней с даты подтверждения информации по кредиту до даты сбора данных*

Этот признак я вообше не понимаю, как он связан с дефолтом?

### maturity_fact

`maturity_fact` - Фактическое количество дней с даты открытия кредита до даты закрытия*
Само по себе знание сколько дней длился кредит важно - чем больше срок кредита, тем выше вероятность дефолта (невыплаты). И наоборот досрочное погашение это круто

### maturity_plan

`maturity_plan` - Плановое количество дней с даты открытия кредита до даты закрытия*

Само по себе знание сколько дней должен был длился кредит не важно, но понимание того, насколько был превышен ожидаемый срок:

Если maturity_ratio больше 1, то срок превышен, если меньше 1, то было досрочное погашение!

In [20]:
def maturity_ratio(maturity_plan, maturity_fact):
    return maturity_plan / (maturity_fact + 1)

In [21]:
df['maturity_ratio'] = maturity_ratio(df['maturity_plan'], df['maturity_fact'])

In [22]:
X_test['maturity_ratio'] = maturity_ratio(X_test['maturity_plan'], X_test['maturity_fact'])

### credit_limit

`credit_limit` - Кредитный лимит. Уже очень крутой признак. Показывает уровень доверия к клиенту.

### next_payment_sum

`next_payment_sum` - Сумма следующего платежа по кредиту*. Уже хороший признак. Но его надо превратить в безразмерный:

In [23]:
def next_payment_sum_ratio(next_payment_sum, full_credit_cost):
    return next_payment_sum / (full_credit_cost + 1)

In [24]:
df['next_payment_sum_ratio'] = next_payment_sum_ratio(df['next_payment_sum'], df['full_credit_cost'])

In [25]:
X_test['next_payment_sum_ratio'] = next_payment_sum_ratio(X_test['next_payment_sum'], X_test['full_credit_cost'])

### sum_left_to_pay

`sum_left_to_pay` - Оставшаяся невыплаченная сумма кредита*. Уже хороший признак. Но его надо превратить в безразмерный:

In [26]:
def sum_left_to_pay_progress(sum_left_to_pay, full_credit_cost):
    return 1 - (sum_left_to_pay / (full_credit_cost + 1))

In [27]:
df['sum_left_to_pay_progress'] = sum_left_to_pay_progress(df['sum_left_to_pay'], df['full_credit_cost'])

In [28]:
X_test['sum_left_to_pay_progress'] = sum_left_to_pay_progress(X_test['sum_left_to_pay'], X_test['full_credit_cost'])

### current_overdue_debt

`current_overdue_debt` - Текущая просроченная задолженность*. Абсолютно пустой признак - не нужен

### max_overdue_debt

`max_overdue_debt` - Максимальная просроченная задолженность*. Можно сделать относительной величиной

In [29]:
def max_overdue_debt_ratio(max_overdue_debt, full_credit_cost):
    return max_overdue_debt / (full_credit_cost + 1)

In [30]:
df['max_overdue_debt_ratio'] = max_overdue_debt_ratio(df['max_overdue_debt'], df['full_credit_cost'])

In [31]:
X_test['max_overdue_debt_ratio'] = max_overdue_debt_ratio(X_test['max_overdue_debt'], X_test['full_credit_cost'])

### full_credit_cost

`full_credit_cost` - Полная стоимость кредита*. Можно сделать относительной от общего кредитного лимита

In [32]:
def full_credit_cost_ef_rate(full_credit_cost, credit_limit):
    return (full_credit_cost / (credit_limit + 1) - 1)

In [33]:
df['full_credit_cost_ef_rate'] = full_credit_cost_ef_rate(df['full_credit_cost'], df['credit_limit'])

In [34]:
X_test['full_credit_cost_ef_rate'] = full_credit_cost_ef_rate(X_test['full_credit_cost'], X_test['credit_limit'])

### overdues_Xd_Yd

`overdues_Xd_Yd` - Число просрочек сроком между X дней и Y дней (либо менее 5 дней/более 90 дней)*

In [35]:
def total_overdues(df):
    df = df.copy()
    df["total_overdues"] = (
        df["overdues_5d"] + df["overdues_5d_30d"] + df["overdues_30d_60d"] +
        df["overdues_60d_90d"] + df["overdues_90d"]
    )
    return df

In [36]:
def has_overdue(df):
    df = df.copy()
    df["has_long_overdue"] = (df["overdues_60d_90d"] > 0) | (df["overdues_90d"] > 0)
    df["has_mid_overdue"] = (df["overdues_30d_60d"] > 0).astype(int)
    return df

In [37]:
def overdue_severity_score(df):
    df = df.copy()
    df["overdue_severity_score"] = (
        1*df["overdues_5d"] +
        2*df["overdues_5d_30d"] +
        4*df["overdues_30d_60d"] +
        6*df["overdues_60d_90d"] +
        10*df["overdues_90d"]
    )
    return df

In [38]:
def overdue_ratio(df):
    df = df.copy()
    df["overdue_ratio"] = df["total_overdues"] / (df["credit_number_for_user"] + 1)
    return df

In [39]:
df = total_overdues(df)
df = has_overdue(df)
df = overdue_severity_score(df)
df = overdue_ratio(df)

In [40]:
X_test = total_overdues(X_test)
X_test = has_overdue(X_test)
X_test = overdue_severity_score(X_test)
X_test = overdue_ratio(X_test)

### no_overdues_Xd_Yd

`no_overdues_Xd_Yd` - нет просрочек сроком между X дней и Y дней (либо менее 5 дней/более 90 дней)

In [41]:
def has_clean_history(df):
    df = df.copy()
    df["has_clean_history"] = (
    (df["no_overdues_5d"]==1) &
    (df["no_overdues_5d_30d"]==1) &
    (df["no_overdues_30d_60d"]==1) &
    (df["no_overdues_60d_90d"]==1) &
    (df["no_overdues_90d"]==1)
    ).astype(int)
    return df

In [42]:
def max_overdue_level(df):
    df = df.copy()
    df["max_overdue_level"] = (
        df[["overdues_5d","overdues_5d_30d","overdues_30d_60d",
            "overdues_60d_90d","overdues_90d"]] > 0
    ).idxmax(axis=1)
    return df

In [43]:
df = has_clean_history(df)
df = max_overdue_level(df)

In [44]:
X_test = has_clean_history(X_test)
X_test = max_overdue_level(X_test)

### credit_type

`credit_type` - Тип кредита***

### credit_currency

`credit_currency` - Валюта кредита**

### другие фичи

In [49]:
def _fit_rate_map(s_values: np.ndarray, y_values: np.ndarray, prior: float, smoothing: float = 20.0):
    """
    Строит сглаженную карту риска: value -> P(default|value).
    - s_values: массив значений признака на train-части fold'а
    - y_values: соответствующий массив 0/1
    - prior: глобальная доля дефолтов (сглаживающий приор)
    - smoothing: сила сглаживания (чем больше, тем ближе к prior при редких значениях)
    Возвращает словарь {значение: риск}
    """
    tmp = pd.DataFrame({'val': s_values, 'y': y_values})
    grp = tmp.groupby('val')['y']
    mean = grp.mean()
    cnt  = grp.size()
    # сглаженный риск: (mean*count + prior*smoothing) / (count + smoothing)
    risk = (mean*cnt + prior*smoothing) / (cnt + smoothing)
    return risk.to_dict()

In [51]:
target = df['target'].astype(int).values

enc_cols = [c for c in df.columns if c.startswith('enc_paym_')]
enc_cols = sorted(enc_cols, key=lambda s: int(s.split('_')[-1]))  # гарантированно 0..24 по возрастанию

overdue_cols = [
    'overdues_5d','overdues_5d_30d','overdues_30d_60d','overdues_60d_90d','overdues_90d',
    'no_overdues_5d','no_overdues_5d_30d','no_overdues_30d_60d','no_overdues_60d_90d','no_overdues_90d'
]

starred_cols = [  # поля со звёздочкой из условия (разбиты по интервалам → «категории» в числах)
    'days_since_confirmed','maturity_plan','maturity_fact','credit_limit',
    'next_payment_sum','sum_left_to_pay','current_overdue_debt','max_overdue_debt','full_credit_cost'
]
cat_cols = ['credit_type','credit_currency']  # настоящие категории (перекодированные числами)
base_num = ['credit_number_for_user']         # числовая, но мы тоже ей присвоим риск-ранг

print(len(df), len(X_test), len(enc_cols))

1827404 456852 25


In [54]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds = list(skf.split(df, target))

In [52]:
def oof_target_rate_map(train_series: pd.Series, y: np.ndarray, folds, smoothing: float = 20.0):
    """
    Возвращает:
      - oof_risk: массив OOF-рисков для train_series (на валидации каждого фолда считаем по карте, обученной на его train-части)
      - full_map: карта риска по всей обучающей части для применения на тесте
    """
    prior = y.mean().item()
    oof_risk = np.zeros(len(train_series), dtype=np.float32)

    # Проход по фолдам: для каждого валидного куска используем карту, обученную на тренировочном
    for tr_idx, vl_idx in folds:
        s_tr = train_series.iloc[tr_idx].values
        y_tr = y[tr_idx]
        s_vl = train_series.iloc[vl_idx].values

        risk_map = _fit_rate_map(s_tr, y_tr, prior, smoothing)
        # OOF для валидации
        oof_risk[vl_idx] = np.array([risk_map.get(v, prior) for v in s_vl], dtype=np.float32)

    # Полная карта риска для теста
    risk_map_full = _fit_rate_map(train_series.values, y, prior, smoothing)
    return oof_risk, risk_map_full

def apply_rate_map(series: pd.Series, rate_map: dict, default_rate: float):
    """Применяем карту риска к Series (неизвестные значения -> default_rate)."""
    return series.map(rate_map).fillna(default_rate).astype(np.float32)


In [57]:
# ЯЧЕЙКА 3a — строим risk-последовательности (N x 25) для train и test
def build_risk_sequence(df_tr: pd.DataFrame, df_te: pd.DataFrame, enc_cols, y: np.ndarray, folds):
    """
    На каждый enc_paym_k строим OOF риск (train) и full-map (test).
    Склеиваем по столбцам -> матрица [N, 25] риска, согласованная с порядком enc_cols.
    """
    prior = y.mean().item()
    mats_tr = []
    mats_te = []

    for c in tqdm(enc_cols):
        oof, full_map = oof_target_rate_map(df_tr[c], y, folds, smoothing=20.0)
        mats_tr.append(oof.reshape(-1, 1))
        mats_te.append(apply_rate_map(df_te[c], full_map, prior).values.reshape(-1, 1))

    risk_tr = np.concatenate(mats_tr, axis=1).astype(np.float32)  # [N_train, 25]
    risk_te = np.concatenate(mats_te, axis=1).astype(np.float32)  # [N_test,  25]
    return risk_tr, risk_te

In [58]:
risk_seq_tr, risk_seq_te = build_risk_sequence(df, X_test, enc_cols, target, folds)
risk_seq_tr.shape, risk_seq_te.shape

100%|██████████| 25/25 [00:24<00:00,  1.03it/s]


((1827404, 25), (456852, 25))

In [59]:
# ЯЧЕЙКА 3b — агрегаты по risk-последовательности (без тяжёлых циклов)
def ema_last_axis(mat: np.ndarray, alpha: float = 0.85):
    """
    Применяет EMA слева направо по оси признаков:
    y_t = alpha*y_{t-1} + (1-alpha)*x_t
    Возвращает 1 столбец: последний EMA (сильнее учитывает свежие месяцы).
    """
    out = np.zeros(mat.shape[0], dtype=np.float32)
    for j in range(mat.shape[1]):
        if j == 0:
            out = mat[:, j]
        else:
            out = alpha * out + (1.0 - alpha) * mat[:, j]
    return out.astype(np.float32)

def build_payment_risk_aggs(risk_tr: np.ndarray, risk_te: np.ndarray):
    """
    Быстрые агрегаты из risk-последовательности (векторно):
    - среднее за все 25
    - среднее за последние 12, 6
    - EMA (последнее значение)
    - bad_count_12, bad_count_6 (где bad >= глобальной медианы по train-последовательности)
    """
    # глобальный порог "плохости" по train
    thr = np.median(risk_tr)

    def agg_side(mat: np.ndarray):
        feats = {}
        feats['risk_mean_25']      = mat.mean(axis=1).astype(np.float32)
        feats['risk_mean_last_12'] = mat[:, -12:].mean(axis=1).astype(np.float32)
        feats['risk_mean_last_6']  = mat[:, -6:].mean(axis=1).astype(np.float32)
        feats['risk_ema']          = ema_last_axis(mat, alpha=0.85)  # последний EMA

        bad = (mat >= thr)  # bool
        feats['bad_count_12'] = bad[:, -12:].sum(axis=1).astype(np.int16)
        feats['bad_count_6']  = bad[:, -6:].sum(axis=1).astype(np.int16)
        return pd.DataFrame(feats)

    tr_aggs = agg_side(risk_tr)
    te_aggs = agg_side(risk_te)
    return tr_aggs, te_aggs

In [61]:
tr_pay_aggs, te_pay_aggs = build_payment_risk_aggs(risk_seq_tr, risk_seq_te)
tr_pay_aggs.shape, te_pay_aggs.shape

((1827404, 6), (456852, 6))

In [64]:
# ЯЧЕЙКА 5 — риск-фичи для starred и base_num
def build_risk_features_block(df_tr: pd.DataFrame, df_te: pd.DataFrame, cols, y: np.ndarray, folds):
    prior = y.mean().item()
    out_tr = {}
    out_te = {}

    for c in cols:
        oof, mp = oof_target_rate_map(df_tr[c], y, folds, smoothing=30.0)
        out_tr[f'{c}_risk'] = oof.astype(np.float32)
        out_te[f'{c}_risk'] = apply_rate_map(df_te[c], mp, prior).values

    return pd.DataFrame(out_tr), pd.DataFrame(out_te)

In [65]:
tr_star, te_star = build_risk_features_block(df, X_test, starred_cols + base_num, target, folds)
tr_star.shape, te_star.shape

((1827404, 10), (456852, 10))

In [66]:
# ЯЧЕЙКА 5b — сглаженное target encoding для категорий (credit_type, credit_currency)
def build_smoothed_te(df_tr: pd.DataFrame, df_te: pd.DataFrame, cols, y: np.ndarray, folds, min_count=50, prior=None):
    if prior is None: prior = y.mean().item()
    out_tr = {}
    out_te = {}

    for c in cols:
        oof_vals = np.zeros(len(df_tr), dtype=np.float32)
        for tr_idx, vl_idx in folds:
            s_tr = df_tr[c].iloc[tr_idx].values
            y_tr = y[tr_idx]
            # считаем частоты и средние на train части
            tmp = pd.DataFrame({'v': s_tr, 'y': y_tr})
            grp = tmp.groupby('v')['y']
            mean = grp.mean()
            cnt  = grp.size()
            # сглаживание к prior при малых cnt
            smoothed = (mean*cnt + prior*min_count) / (cnt + min_count)
            mp = smoothed.to_dict()
            oof_vals[vl_idx] = np.array([mp.get(v, prior) for v in df_tr[c].iloc[vl_idx].values], dtype=np.float32)

        # full map для теста
        tmp_all = pd.DataFrame({'v': df_tr[c].values, 'y': y})
        grp_all = tmp_all.groupby('v')['y']
        mean_a = grp_all.mean()
        cnt_a  = grp_all.size()
        smoothed_a = (mean_a*cnt_a + prior*min_count) / (cnt_a + min_count)
        mp_all = smoothed_a.to_dict()

        out_tr[f'{c}_te'] = oof_vals
        out_te[f'{c}_te'] = df_te[c].map(mp_all).fillna(prior).astype(np.float32).values

    return pd.DataFrame(out_tr), pd.DataFrame(out_te)

In [67]:
tr_cat, te_cat = build_smoothed_te(df, X_test, cat_cols, target, folds, min_count=100)
tr_cat.shape, te_cat.shape

((1827404, 2), (456852, 2))

In [81]:
# Базовая сборка
use_cols_num = [
    'maturity_ratio',
    'weighted_status_score',
    'longest_run_bad_status',
    'deterioration_count',
    'next_payment_sum_ratio',
    'sum_left_to_pay_progress',
    'max_overdue_debt_ratio',
    'full_credit_cost_ef_rate',
    'total_overdues',
    'overdue_severity_score',
    'overdue_ratio',
    'has_clean_history',        # как числовая псевдо-ординальная
    # опционально:
    # 'credit_number_for_user',
]

use_cols_cat_raw = ['credit_type', 'credit_currency']  # пометим как категории в CatBoost

X_tr = pd.concat(
    [tr_pay_aggs, tr_star, tr_cat, df.reset_index()[use_cols_num], df.reset_index()[use_cols_cat_raw]],
    axis=1
)
X_te = pd.concat(
    [te_pay_aggs, te_star, te_cat, X_test[use_cols_num], X_test[use_cols_cat_raw]],
    axis=1
)

# Если где-то есть NaN/inf — сразу чистим:
X_tr = X_tr.replace([np.inf, -np.inf], np.nan).fillna(0)
X_te = X_te.replace([np.inf, -np.inf], np.nan).fillna(0)

y_tr = target.astype(int)
X_tr.shape, X_te.shape

((1827404, 32), (456852, 32))

## Обучение первичное

In [112]:
features = [
    'credit_number_for_user', 
    'maturity_fact',
    'maturity_ratio',
    'credit_limit',
    'weighted_status_score',
    'longest_run_bad_status',
    'deterioration_count',
    'next_payment_sum_ratio',
    'sum_left_to_pay_progress',
    'max_overdue_debt_ratio',
    'full_credit_cost_ef_rate',

    'total_overdues', 'overdue_severity_score', 'overdue_ratio', 'has_clean_history', 
    'max_overdue_level',

    'credit_type',  
    'credit_currency']
cat_features = ['credit_type', 'credit_currency', 'max_overdue_level']

In [113]:
X = df[features]
y = df['target']

x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)
x_test = X_test[features]

In [124]:
model = CatBoostClassifier(
    iterations=2000,
    depth=11,
    learning_rate=0.05,
    l2_leaf_reg=7,
    bagging_temperature=0.5,
    auto_class_weights='Balanced',
    eval_metric='AUC',  # Оптимизация по F1
    random_seed=42,
    verbose=100, 
    early_stopping_rounds=100,
    cat_features=cat_features
)

# Обучение модели
model.fit(
    x_train, y_train, eval_set=(x_val, y_val),
    early_stopping_rounds=100
)

0:	test: 0.6046038	best: 0.6046038 (0)	total: 1.44s	remaining: 48m 1s
100:	test: 0.6170486	best: 0.6170486 (100)	total: 1m 56s	remaining: 36m 32s
200:	test: 0.6183562	best: 0.6184386 (194)	total: 3m 32s	remaining: 31m 39s
300:	test: 0.6181507	best: 0.6186464 (261)	total: 5m 17s	remaining: 29m 51s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6186464119
bestIteration = 261

Shrink model to first 262 iterations.


<catboost.core.CatBoostClassifier at 0x28175c47550>

In [111]:
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                            f1_score, roc_auc_score, confusion_matrix, 
                            classification_report, precision_recall_curve, 
                            average_precision_score)
import numpy as np

def evaluate_model(y_true, y_pred, y_pred_proba=None):
    """
    Выводит все основные метрики качества для бинарной классификации
    
    Parameters:
    y_true: истинные значения
    y_pred: предсказанные классы
    y_pred_proba: предсказанные вероятности (для ROC-AUC)
    """
    
    print("=" * 50)
    print("МЕТРИКИ КАЧЕСТВА МОДЕЛИ")
    print("=" * 50)
    
    # Основные метрики
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-score:  {f1:.4f}")
    
    # ROC-AUC если есть вероятности
    if y_pred_proba is not None:
        try:
            roc_auc = roc_auc_score(y_true, y_pred_proba)
            print(f"ROC-AUC:   {roc_auc:.4f}")
            
            # Average Precision Score
            avg_precision = average_precision_score(y_true, y_pred_proba)
            print(f"Avg Precision: {avg_precision:.4f}")
        except:
            print("ROC-AUC: Не удалось вычислить (проверьте y_pred_proba)")
    
    print("\n" + "-" * 30)
    print("MATRIXA SOWMESHENIY (CONFUSION MATRIX)")
    print("-" * 30)
    
    # Матрица ошибок
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    
    print(f"True Negative (TN):  {tn}")
    print(f"False Positive (FP): {fp}")
    print(f"False Negative (FN): {fn}")
    print(f"True Positive (TP):  {tp}")
    print(f"\nМатрица в виде таблицы:")
    print(f"[[TN {tn}   FP {fp}]")
    print(f" [FN {fn}   TP {tp}]]")
    
    # Дополнительные метрики из матрицы ошибок
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0  # False Negative Rate
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0  # False Positive Rate
    
    print(f"\nSpecificity (TNR): {specificity:.4f}")
    print(f"False Positive Rate (FPR): {fpr:.4f}")
    print(f"False Negative Rate (FNR): {fnr:.4f}")
    
    print("\n" + "-" * 30)
    print("DETALNY OTCHET (CLASSIFICATION REPORT)")
    print("-" * 30)
    
    # Детальный отчет
    print(classification_report(y_true, y_pred, target_names=['Class 0', 'Class 1'], zero_division=0))
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
        'confusion_matrix': cm,
        'tn': tn, 'fp': fp, 'fn': fn, 'tp': tp
    }

In [125]:
base_vetrics = evaluate_model(y_val, model.predict(x_val), model.predict_proba(x_val)[:,1])

МЕТРИКИ КАЧЕСТВА МОДЕЛИ
Accuracy:  0.6806
Precision: 0.0502
Recall:    0.4787
F1-score:  0.0909
ROC-AUC:   0.6186
Avg Precision: 0.0574

------------------------------
MATRIXA SOWMESHENIY (CONFUSION MATRIX)
------------------------------
True Negative (TN):  303613
False Positive (FP): 137994
False Negative (FN): 7947
True Positive (TP):  7297

Матрица в виде таблицы:
[[TN 303613   FP 137994]
 [FN 7947   TP 7297]]

Specificity (TNR): 0.6875
False Positive Rate (FPR): 0.3125
False Negative Rate (FNR): 0.5213

------------------------------
DETALNY OTCHET (CLASSIFICATION REPORT)
------------------------------
              precision    recall  f1-score   support

     Class 0       0.97      0.69      0.81    441607
     Class 1       0.05      0.48      0.09     15244

    accuracy                           0.68    456851
   macro avg       0.51      0.58      0.45    456851
weighted avg       0.94      0.68      0.78    456851



In [126]:
feature_imp = pd.DataFrame({
        'feature': x_train.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', key=abs, ascending=False)
feature_imp

Unnamed: 0,feature,importance
4,weighted_status_score,11.235639
8,sum_left_to_pay_progress,9.909159
16,credit_type,8.989772
2,maturity_ratio,8.400929
7,next_payment_sum_ratio,7.90396
1,maturity_fact,7.837513
9,max_overdue_debt_ratio,7.707635
10,full_credit_cost_ef_rate,5.922321
3,credit_limit,5.717225
14,has_clean_history,5.597286


In [127]:
y_val_pred_proba = model.predict_proba(x_train)[:, 1]

# Сетка порогов
thresholds = np.linspace(0.01, 0.99, 199)

best_threshold = 0.5
best_f1 = 0

for t in thresholds:
    y_val_pred = (y_val_pred_proba >= t).astype(int)
    f1 = f1_score(y_train, y_val_pred)
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

print(f"Best F1: {best_f1:.4f} at threshold {best_threshold:.2f}")

Best F1: 0.1400 at threshold 0.63


In [128]:
y_val_pred_proba = model.predict_proba(x_val)[:, 1]

# Сетка порогов
thresholds = np.linspace(0.01, 0.99, 199)

best_threshold = 0.5
best_f1 = 0

for t in thresholds:
    y_val_pred = (y_val_pred_proba >= t).astype(int)
    f1 = f1_score(y_val, y_val_pred)
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

print(f"Best F1: {best_f1:.4f} at threshold {best_threshold:.2f}")

Best F1: 0.1052 at threshold 0.59


#### Итоговое обучение и сабмишен

In [130]:
x_tra, x_va, y_tra, y_va = train_test_split(X, y, test_size=0.001, random_state=42)

In [131]:
model_itog = CatBoostClassifier(
    iterations=500,
    depth=11,
    learning_rate=0.05,
    l2_leaf_reg=20,
    bagging_temperature=0.5,
    auto_class_weights='Balanced',
    eval_metric='AUC',  # Оптимизация по F1
    random_seed=42,
    verbose=100, 
    cat_features=cat_features
)

# Обучение модели
model.fit(
    x_tra, y_tra, eval_set=(x_va, y_va)
)

0:	test: 0.6178258	best: 0.6178258 (0)	total: 1.75s	remaining: 58m 26s
100:	test: 0.6243863	best: 0.6276787 (70)	total: 2m 30s	remaining: 47m 5s
200:	test: 0.6362800	best: 0.6362800 (200)	total: 4m 34s	remaining: 40m 53s
300:	test: 0.6396113	best: 0.6402932 (295)	total: 6m 37s	remaining: 37m 23s
400:	test: 0.6462254	best: 0.6485340 (380)	total: 9m 21s	remaining: 37m 19s
500:	test: 0.6476671	best: 0.6506673 (465)	total: 12m 2s	remaining: 36m 2s
600:	test: 0.6579827	best: 0.6584015 (599)	total: 14m 45s	remaining: 34m 21s
700:	test: 0.6613920	best: 0.6614699 (698)	total: 17m 31s	remaining: 32m 28s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6614699006
bestIteration = 698

Shrink model to first 699 iterations.


<catboost.core.CatBoostClassifier at 0x28175c47550>

In [115]:
model_itog_max = CatBoostClassifier(
    iterations=600,
    depth=11,
    learning_rate=0.05,
    l2_leaf_reg=20,
    bagging_temperature=0.5,
    auto_class_weights='Balanced',
    eval_metric='AUC',  # Оптимизация по F1
    random_seed=42,
    verbose=100, 
    cat_features=cat_features
)

# Обучение модели
model_itog_max.fit(
    df[features], df['target']
)

0:	total: 1.99s	remaining: 19m 53s
100:	total: 2m 34s	remaining: 12m 43s
200:	total: 4m 40s	remaining: 9m 15s
300:	total: 6m 50s	remaining: 6m 47s
400:	total: 10m 1s	remaining: 4m 58s
500:	total: 13m 30s	remaining: 2m 40s
599:	total: 17m 1s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x2c9c5adf0d0>

In [117]:
base_vetrics = evaluate_model(df['target'], model_itog_max.predict(df[features]), model_itog_max.predict_proba(df[features])[:,1])

МЕТРИКИ КАЧЕСТВА МОДЕЛИ
Accuracy:  0.7104
Precision: 0.0659
Recall:    0.5941
F1-score:  0.1187
ROC-AUC:   0.7204
Avg Precision: 0.1040

------------------------------
MATRIXA SOWMESHENIY (CONFUSION MATRIX)
------------------------------
True Negative (TN):  1262642
False Positive (FP): 504798
False Negative (FN): 24337
True Positive (TP):  35627

Матрица в виде таблицы:
[[TN 1262642   FP 504798]
 [FN 24337   TP 35627]]

Specificity (TNR): 0.7144
False Positive Rate (FPR): 0.2856
False Negative Rate (FNR): 0.4059

------------------------------
DETALNY OTCHET (CLASSIFICATION REPORT)
------------------------------
              precision    recall  f1-score   support

     Class 0       0.98      0.71      0.83   1767440
     Class 1       0.07      0.59      0.12     59964

    accuracy                           0.71   1827404
   macro avg       0.52      0.65      0.47   1827404
weighted avg       0.95      0.71      0.80   1827404



In [119]:
test_pred = model_itog_max.predict(x_test)

In [121]:
y_val_pred_proba = model_itog_max.predict_proba(df[features])[:, 1]

# Сетка порогов
thresholds = np.linspace(0.01, 0.99, 99)

best_threshold = 0.5
best_f1 = 0

for t in thresholds:
    y_val_pred = (y_val_pred_proba >= t).astype(int)
    f1 = f1_score(y_val_pred, df['target'])
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

print(f"Best F1: {best_f1:.4f} at threshold {best_threshold:.2f}")

Best F1: 0.1710 at threshold 0.63


In [135]:
y_val_pred_proba = model.predict_proba(x_tra)[:, 1]

# Сетка порогов
thresholds = np.linspace(0.01, 0.99, 99)

best_threshold = 0.5
best_f1 = 0

for t in thresholds:
    y_val_pred = (y_val_pred_proba >= t).astype(int)
    f1 = f1_score(y_val_pred, y_tra)
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

print(f"Best F1: {best_f1:.4f} at threshold {best_threshold:.2f}")

Best F1: 0.1963 at threshold 0.65


In [None]:
proba = model.predict_proba(x_test)[:, 1]
test_pred = (proba >= 0.63).astype(bool)

In [125]:
fpo_test = model_itog_max.predict_proba(X_test[features])[:, 1]

In [140]:
test_pred

array([False, False, False, ..., False, False, False], shape=(456852,))

In [141]:
X_test['flag'] = test_pred

In [142]:
X_test[['id', 'flag']].set_index('id').to_csv('sub_mission.csv')

## Мощное обучение

### credit_type == ?

In [197]:
diction = {}
detph = {}

In [251]:
df.groupby('credit_type')['credit_number_for_user'].count() * 0.25

credit_type
0     15963.25
1      2328.25
2     10630.25
3    137111.75
4    258849.75
5     20398.75
6      1256.25
7     10312.75
Name: credit_number_for_user, dtype: float64

In [259]:
typee = 7
df_0 = df[df['credit_type'] == typee]

In [261]:
features = [
    'credit_number_for_user', 
    'maturity_fact',
    'maturity_ratio',
    'credit_limit',
    'weighted_status_score',
    'longest_run_bad_status',
    'deterioration_count',
    'next_payment_sum_ratio',
    'sum_left_to_pay_progress',
    'max_overdue_debt_ratio',
    'full_credit_cost_ef_rate',  
    'credit_currency']

X = df_0[features]
y = df_0['target']

x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)
x_test = X_test[features]

depth = 9
model = CatBoostClassifier(
    iterations=2000,
    depth=depth,
    learning_rate=0.07,
    l2_leaf_reg=5,
    bagging_temperature=0.5,
    auto_class_weights='Balanced',
    eval_metric='AUC',  # Оптимизация по F1
    random_seed=42,
    verbose=100, 
    early_stopping_rounds=100,
    cat_features=['credit_currency']
)

# Обучение модели
model.fit(
    x_train, y_train, eval_set=(x_val, y_val),
    early_stopping_rounds=100
)

diction[typee] = evaluate_model(y_val, model.predict(x_val), model.predict_proba(x_val)[:,1])
detph[typee] = depth

feature_imp = pd.DataFrame({
        'feature': x_train.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', key=abs, ascending=False)
feature_imp

0:	test: 0.5586678	best: 0.5586678 (0)	total: 32.3ms	remaining: 1m 4s
100:	test: 0.5327881	best: 0.5685026 (6)	total: 2.92s	remaining: 54.9s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.5685026315
bestIteration = 6

Shrink model to first 7 iterations.
МЕТРИКИ КАЧЕСТВА МОДЕЛИ
Accuracy:  0.7027
Precision: 0.0446
Recall:    0.4098
F1-score:  0.0804
ROC-AUC:   0.5685
Avg Precision: 0.0423

------------------------------
MATRIXA SOWMESHENIY (CONFUSION MATRIX)
------------------------------
True Negative (TN):  7113
False Positive (FP): 2873
False Negative (FN): 193
True Positive (TP):  134

Матрица в виде таблицы:
[[TN 7113   FP 2873]
 [FN 193   TP 134]]

Specificity (TNR): 0.7123
False Positive Rate (FPR): 0.2877
False Negative Rate (FNR): 0.5902

------------------------------
DETALNY OTCHET (CLASSIFICATION REPORT)
------------------------------
              precision    recall  f1-score   support

     Class 0       0.97      0.71      0.82      9986
     Class 1

Unnamed: 0,feature,importance
5,longest_run_bad_status,17.528091
0,credit_number_for_user,16.898484
1,maturity_fact,14.885166
4,weighted_status_score,12.307858
3,credit_limit,8.092802
9,max_overdue_debt_ratio,8.014381
10,full_credit_cost_ef_rate,7.521686
8,sum_left_to_pay_progress,6.358995
2,maturity_ratio,3.284717
6,deterioration_count,2.743928


In [265]:
for k, v in diction.items():
    print(k, v['f1'])

0 0.06322861094645327
1 0.1016949152542373
2 0.06269020085209982
3 0.08201916690388082
4 0.07979600017816578
5 0.16505293638584742
6 0.13020833333333334
7 0.08038392321535692


### credit_type == 4

In [269]:
typee = 4
df_4 = df[df['credit_type'] == typee]

In [274]:
import os
import random
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import f1_score, roc_auc_score, precision_recall_curve

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# -------------------------
# 0) Reproducibility
# -------------------------
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("DEVICE:", DEVICE)

DEVICE: cpu


In [305]:
(df[features] == np.inf).sum()

credit_number_for_user           0
maturity_fact                    0
maturity_ratio              100255
credit_limit                     0
weighted_status_score            0
longest_run_bad_status           0
deterioration_count              0
next_payment_sum_ratio      104816
sum_left_to_pay_progress         0
max_overdue_debt_ratio      108960
full_credit_cost_ef_rate     94592
dtype: int64

In [302]:

features = [
    'credit_number_for_user', 
    'maturity_fact',
    'maturity_ratio',
    'credit_limit',
    'weighted_status_score',
    'longest_run_bad_status',
    'deterioration_count',
    'next_payment_sum_ratio',
    'sum_left_to_pay_progress',
    'max_overdue_debt_ratio',
    'full_credit_cost_ef_rate',
]

# проверим, что все фичи есть в данных
missing = [f for f in features if f not in df.columns]
if missing:
    raise ValueError(f"Отсутствуют признаки в X_train: {missing}\n"
                     f"Убедись, что они посчитаны и присутствуют до запуска MLP.")

# на всякий случай — заменить NaN/inf
df_4[features] = df[features].fillna(0.0)
X_test_4[features] = X_test_4[features].fillna(0.0)

X = df_4[features].astype(float)
y = df_4['target'].astype(int).values

# train/val split (стратифицированный)
x_tr, x_val, y_tr, y_val = train_test_split(
    X, y, test_size=0.25, random_state=SEED, stratify=y
)

# масштабирование (только по train)
scaler = RobustScaler()
x_tr_sc = scaler.fit_transform(x_tr)
x_val_sc = scaler.transform(x_val)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


ValueError: Input X contains infinity or a value too large for dtype('float64').

In [None]:
# -------------------------
# 2) PyTorch Dataset/Dataloader
# -------------------------
class TabDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.from_numpy(X).float()
        self.y = None if y is None else torch.from_numpy(y).float()
    def __len__(self):
        return self.X.shape[0]
    def __getitem__(self, idx):
        if self.y is None:
            return self.X[idx]
        return self.X[idx], self.y[idx]

train_ds = TabDataset(x_tr_sc, y_tr)
val_ds   = TabDataset(x_val_sc, y_val)
# sub_ds   = TabDataset(x_sub_sc, None)

train_loader = DataLoader(train_ds, batch_size=4096, shuffle=True, num_workers=0)
val_loader   = DataLoader(val_ds,   batch_size=8192, shuffle=False, num_workers=0)
# sub_loader   = DataLoader(sub_ds,   batch_size=8192, shuffle=False, num_workers=0)

In [298]:

# -------------------------
# 3) Model
# -------------------------
class MLP(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, 256),
            nn.BatchNorm1d(256),
            nn.GELU(),
            nn.Dropout(0.20),

            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.GELU(),
            nn.Dropout(0.20),

            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.GELU(),
            nn.Dropout(0.10),

            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.GELU(),
            nn.Dropout(0.10),

            nn.Linear(32, 1)  # логит
        )
    def forward(self, x):
        return self.net(x).squeeze(1)  # [B]

in_dim = len(features)
model = MLP(in_dim).to(DEVICE)

# -------------------------
# 4) Loss (pos_weight) & Optimizer & Scheduler
# -------------------------
# дисбаланс: pos_weight = N_neg/N_pos
pos = y_tr.sum()
neg = len(y_tr) - pos
pos_weight_value = (neg / max(1, pos))
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight_value, device=DEVICE))

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-3, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

In [299]:

# -------------------------
# 5) Train loop with early stopping
# -------------------------
def evaluate(model, loader):
    model.eval()
    all_logits, all_y = [], []
    with torch.no_grad():
        for batch in loader:
            xb, yb = batch
            xb = xb.to(DEVICE); yb = yb.to(DEVICE)
            logits = model(xb)
            all_logits.append(logits.detach().cpu().numpy())
            all_y.append(yb.detach().cpu().numpy())
    logits = np.concatenate(all_logits)
    y_true = np.concatenate(all_y)
    probs = 1.0 / (1.0 + np.exp(-logits))
    # AUC для контроля, F1 посчитаем с порогом отдельно
    auc = roc_auc_score(y_true, probs)
    return auc, probs, y_true

best_auc = -np.inf
best_state = None
patience, patience_left = 8, 8
EPOCHS = 10
scaler_amp = torch.cuda.amp.GradScaler(enabled=(DEVICE=="cuda"))

for epoch in tqdm(range(1, EPOCHS+1)):
    model.train()
    epoch_loss = 0.0
    for batch in train_loader:
        xb, yb = batch
        xb = xb.to(DEVICE); yb = yb.to(DEVICE)

        optimizer.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
            logits = model(xb)
            loss = criterion(logits, yb)
        scaler_amp.scale(loss).backward()
        scaler_amp.step(optimizer)
        scaler_amp.update()
        epoch_loss += loss.item()

    scheduler.step()

    val_auc, val_probs, val_true = evaluate(model, val_loader)
    print(f"Epoch {epoch:02d} | train_loss={epoch_loss/len(train_loader):.5f} | val_auc={val_auc:.5f}")

    if val_auc > best_auc + 1e-4:
        best_auc = val_auc
        best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
        patience_left = patience
    else:
        patience_left -= 1
        if patience_left == 0:
            print("Early stopping.")
            break

# restore best
if best_state is not None:
    model.load_state_dict({k: v.to(DEVICE) for k, v in best_state.items()})

# -------------------------
# 6) Threshold tuning for F1
# -------------------------
# вычислим ещё раз на валидации лучшие пороги

# # (опционально) можно посмотреть и PR-кривую:
# # precisions, recalls, thr = precision_recall_curve(val_true, val_probs)

# # -------------------------
# # 7) Inference on test + submission
# # -------------------------
# # прогон по test
# model.eval()
# test_probs_all = []
# with torch.no_grad():
#     for xb in sub_loader:
#         xb = xb.to(DEVICE)
#         logits = model(xb)
#         probs = torch.sigmoid(logits)
#         test_probs_all.append(probs.detach().cpu().numpy())
# test_probs = np.concatenate(test_probs_all)

# # бинаризация по найденному порогу
# test_pred = (test_probs >= best_t).astype(int)

# submission = pd.DataFrame({
#     "id": X_test["id"].values,
#     "flag": test_pred
# })
# submission_path = "submission_mlp_tabular.csv"
# submission.to_csv(submission_path, index=False)
# print("Saved:", submission_path)

  scaler_amp = torch.cuda.amp.GradScaler(enabled=(DEVICE=="cuda"))
  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
 10%|█         | 1/10 [00:25<03:50, 25.61s/it]

Epoch 01 | train_loss=1.33416 | val_auc=0.56409


  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
 20%|██        | 2/10 [00:51<03:26, 25.77s/it]

Epoch 02 | train_loss=1.32950 | val_auc=0.56670


  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
 30%|███       | 3/10 [01:16<02:58, 25.49s/it]

Epoch 03 | train_loss=1.32829 | val_auc=0.57015


  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
 40%|████      | 4/10 [01:42<02:32, 25.48s/it]

Epoch 04 | train_loss=1.32712 | val_auc=0.57142


  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
 50%|█████     | 5/10 [02:07<02:06, 25.34s/it]

Epoch 05 | train_loss=1.32686 | val_auc=0.57075


  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
 60%|██████    | 6/10 [02:32<01:41, 25.36s/it]

Epoch 06 | train_loss=1.32614 | val_auc=0.57508


  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
 70%|███████   | 7/10 [02:59<01:17, 25.85s/it]

Epoch 07 | train_loss=1.32530 | val_auc=0.57355


  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
 80%|████████  | 8/10 [03:24<00:51, 25.66s/it]

Epoch 08 | train_loss=1.32493 | val_auc=0.57485


  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
 90%|█████████ | 9/10 [03:49<00:25, 25.52s/it]

Epoch 09 | train_loss=1.32414 | val_auc=0.57607


  with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
100%|██████████| 10/10 [04:15<00:00, 25.51s/it]

Epoch 10 | train_loss=1.32345 | val_auc=0.57609





In [300]:
model.eval()
_, val_probs, val_true = evaluate(model, val_loader)

thresholds = np.linspace(0.01, 0.99, 99)
best_t, best_f1 = 0.5, 0.0
for t in thresholds:
    preds = (val_probs >= t).astype(int)
    f1 = f1_score(val_true, preds)
    if f1 > best_f1:
        best_f1, best_t = f1, t

print(f"Best F1 on VAL = {best_f1:.4f} at threshold={best_t:.3f}")

Best F1 on VAL = 0.0820 at threshold=0.550


## Ансамбль моделей с подбором точности

### Подготовка данных

#### Модель A (Catboost)

In [86]:
oof_cb  = np.zeros(len(X_tr), dtype=np.float32)
test_cb = np.zeros(len(X_te), dtype=np.float32)
models = {}

for fold, (tr_idx, vl_idx) in enumerate(folds, 1):
    Xtr, Xvl = X_tr.iloc[tr_idx], X_tr.iloc[vl_idx]
    ytr, yvl = y_tr[tr_idx],     y_tr[vl_idx]

    model_cb = CatBoostClassifier(
        iterations=3000,
        depth=10,
        learning_rate=0.03,
        l2_leaf_reg=12,
        bagging_temperature=0.4,
        auto_class_weights='Balanced',  # важный момент при 3.3% позитивов
        eval_metric='AUC',
        random_seed=42,
        verbose=50,
        early_stopping_rounds=200,
        
    )
    model_cb.fit(Xtr, ytr, eval_set=(Xvl, yvl), use_best_model=True)

    models[fold] = model_cb
    oof_cb[vl_idx] = model_cb.predict_proba(Xvl)[:, 1]
    test_cb       += model_cb.predict_proba(X_te)[:, 1] / skf.n_splits

0:	test: 0.6045104	best: 0.6045104 (0)	total: 670ms	remaining: 33m 28s
50:	test: 0.6217217	best: 0.6217217 (50)	total: 25s	remaining: 24m 4s
100:	test: 0.6252702	best: 0.6252702 (100)	total: 51.3s	remaining: 24m 31s
150:	test: 0.6267972	best: 0.6268309 (148)	total: 1m 15s	remaining: 23m 53s
200:	test: 0.6277178	best: 0.6277178 (200)	total: 1m 40s	remaining: 23m 24s
250:	test: 0.6282214	best: 0.6282214 (250)	total: 2m 6s	remaining: 23m 9s
300:	test: 0.6285178	best: 0.6285226 (290)	total: 2m 33s	remaining: 22m 53s
350:	test: 0.6286735	best: 0.6288668 (325)	total: 3m	remaining: 22m 44s
400:	test: 0.6287140	best: 0.6288668 (325)	total: 3m 27s	remaining: 22m 24s
450:	test: 0.6288189	best: 0.6288668 (325)	total: 3m 51s	remaining: 21m 48s
500:	test: 0.6287415	best: 0.6289372 (480)	total: 4m 15s	remaining: 21m 15s
550:	test: 0.6281765	best: 0.6289372 (480)	total: 4m 40s	remaining: 20m 46s
600:	test: 0.6278386	best: 0.6289372 (480)	total: 5m 5s	remaining: 20m 18s
650:	test: 0.6273008	best: 0.62

In [93]:
oof_cb_e  = np.zeros(len(X_tr), dtype=np.float32)
for fold, (tr_idx, vl_idx) in enumerate(folds, 1):
    Xtr, Xvl = X_tr.iloc[tr_idx], X_tr.iloc[vl_idx]
    ytr, yvl = y_tr[tr_idx],     y_tr[vl_idx]

    oof_cb_e[vl_idx] = models[fold].predict(Xvl)

In [94]:
auc_cb = roc_auc_score(y_tr, oof_cb)
f1_cb = f1_score(y_tr, oof_cb_e)
print(f"CatBoost OOF AUC: {auc_cb:.5f}")
print(f"CatBoost OOF AUC: {f1_cb:.5f}")

CatBoost OOF AUC: 0.62959
CatBoost OOF AUC: 0.09270


#### Модель Лёгкая GRU на risk-последовательности

In [97]:
class SeqDS(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.from_numpy(X.astype(np.float32))
        self.y = None if y is None else torch.from_numpy(y.astype(np.float32))
    def __len__(self): return len(self.X)
    def __getitem__(self, i):
        return (self.X[i], self.y[i]) if self.y is not None else self.X[i]

class GRUSimple(nn.Module):
    def __init__(self, hidden=64):
        super().__init__()
        self.gru = nn.GRU(input_size=1, hidden_size=hidden, num_layers=2, batch_first=True, dropout=0.1)
        self.head = nn.Sequential(
            nn.Linear(hidden, 64), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(64, 1)
        )
    def forward(self, x):            # x: [B, 25]
        x = x.unsqueeze(-1)         # -> [B, 25, 1]
        out, _ = self.gru(x)        # -> [B, 25, H]
        feat = out[:, -1, :]        # берём последнее скрытое состояние
        logit = self.head(feat)     # -> [B, 1]
        return logit.squeeze(1)     # -> [B]

In [100]:

def train_gru_oof(X, y, Xtest, folds, epochs=5, bs=8192, pos_weight=30.0, lr=2e-3):
    oof = np.zeros(len(X), dtype=np.float32)
    tpred = np.zeros(len(Xtest), dtype=np.float32)

    for fold, (tr_idx, vl_idx) in enumerate(folds, 1):
        ds_tr = SeqDS(X[tr_idx], y[tr_idx])
        ds_vl = SeqDS(X[vl_idx], y[vl_idx])
        ds_te = SeqDS(Xtest, None)

        dl_tr = DataLoader(ds_tr, batch_size=bs, shuffle=True)
        dl_vl = DataLoader(ds_vl, batch_size=bs, shuffle=False)
        dl_te = DataLoader(ds_te, batch_size=bs, shuffle=False)

        model = GRUSimple(hidden=64).to(DEVICE)
        crit = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight, device=DEVICE))
        opt  = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)

        best_auc, best_state = -1, None
        for ep in tqdm(range(epochs)):
            # train
            model.train()
            for xb, yb in dl_tr:
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                opt.zero_grad(set_to_none=True)
                loss = crit(model(xb), yb)
                loss.backward(); opt.step()
            # valid
            model.eval()
            logits, yt = [], []
            with torch.no_grad():
                for xb, yb in dl_vl:
                    xb = xb.to(DEVICE)
                    lg = model(xb).detach().cpu().numpy()
                    logits.append(lg); yt.append(yb.numpy())
            p  = 1 / (1 + np.exp(-np.concatenate(logits)))
            yt = np.concatenate(yt)
            auc = roc_auc_score(yt, p)
            print('auc: ', auc)
            if auc > best_auc:
                best_auc = auc
                best_state = {k:v.cpu().clone() for k,v in model.state_dict().items()}

        # restore best
        model.load_state_dict({k:v.to(DEVICE) for k,v in best_state.items()})
        # OOF
        model.eval()
        logits=[]
        with torch.no_grad():
            for xb, yb in dl_vl:
                xb = xb.to(DEVICE)
                logits.append(model(xb).detach().cpu().numpy())
        oof[vl_idx] = 1/(1+np.exp(-np.concatenate(logits)))
        # test
        logits=[]
        with torch.no_grad():
            for xb in dl_te:
                xb = xb.to(DEVICE)
                logits.append(model(xb).detach().cpu().numpy())
        tpred += (1/(1+np.exp(-np.concatenate(logits)))) / len(folds)

    return oof, tpred

In [101]:
oof_gru, test_gru = train_gru_oof(risk_seq_tr, y_tr, risk_seq_te, folds, epochs=6, bs=8192, pos_weight=30.0, lr=2e-3)

 17%|█▋        | 1/6 [04:39<23:17, 279.51s/it]

auc:  0.5331281027001792


 33%|███▎      | 2/6 [10:12<20:43, 310.96s/it]

auc:  0.49041007993867813


 50%|█████     | 3/6 [14:36<14:28, 289.55s/it]

auc:  0.5435256411445505


 67%|██████▋   | 4/6 [18:48<09:09, 274.75s/it]

auc:  0.5506590510301184


 83%|████████▎ | 5/6 [23:22<04:34, 274.33s/it]

auc:  0.5514856447043527


100%|██████████| 6/6 [27:58<00:00, 279.76s/it]

auc:  0.5541296732679302



 17%|█▋        | 1/6 [04:35<22:58, 275.69s/it]

auc:  0.5441724694485535


 33%|███▎      | 2/6 [09:09<18:19, 274.84s/it]

auc:  0.5571701174328637


 50%|█████     | 3/6 [13:45<13:45, 275.19s/it]

auc:  0.5452484111654339


 67%|██████▋   | 4/6 [18:21<09:10, 275.34s/it]

auc:  0.5593424266052103


 83%|████████▎ | 5/6 [22:52<04:33, 273.90s/it]

auc:  0.5608492910790548


100%|██████████| 6/6 [27:09<00:00, 271.65s/it]

auc:  0.564024205210587



 17%|█▋        | 1/6 [04:35<22:56, 275.31s/it]

auc:  0.5387813402125681


 33%|███▎      | 2/6 [09:10<18:20, 275.14s/it]

auc:  0.5536234251377548


 50%|█████     | 3/6 [13:35<13:32, 270.80s/it]

auc:  0.5507426070849298


 67%|██████▋   | 4/6 [17:50<08:48, 264.40s/it]

auc:  0.5500814496862709


 83%|████████▎ | 5/6 [23:13<04:45, 285.68s/it]

auc:  0.5548812275776493


100%|██████████| 6/6 [28:01<00:00, 280.32s/it]

auc:  0.5308226187973175



 17%|█▋        | 1/6 [06:12<31:04, 372.82s/it]

auc:  0.4945372119633191


 33%|███▎      | 2/6 [12:13<24:22, 365.67s/it]

auc:  0.5153153387383305


 50%|█████     | 3/6 [18:09<18:03, 361.04s/it]

auc:  0.5496171161600252


 67%|██████▋   | 4/6 [23:58<11:53, 356.59s/it]

auc:  0.5518872407782766


 83%|████████▎ | 5/6 [29:44<05:52, 352.73s/it]

auc:  0.5468426104291914


100%|██████████| 6/6 [35:44<00:00, 357.34s/it]

auc:  0.5547404364060662



 17%|█▋        | 1/6 [05:15<26:16, 315.27s/it]

auc:  0.5375974116921729


 33%|███▎      | 2/6 [09:44<19:12, 288.10s/it]

auc:  0.533565468375702


 50%|█████     | 3/6 [14:28<14:18, 286.10s/it]

auc:  0.5145364462335472


 67%|██████▋   | 4/6 [18:37<09:03, 271.76s/it]

auc:  0.5529589760709148


 83%|████████▎ | 5/6 [22:47<04:23, 263.81s/it]

auc:  0.5547697019793474


100%|██████████| 6/6 [27:41<00:00, 276.84s/it]

auc:  0.5593875829078722





In [102]:
auc_gru = roc_auc_score(y_tr, oof_gru)
print(f"GRU OOF AUC: {auc_gru:.5f}")

GRU OOF AUC: 0.51149


#### Бленд и подбор порога под F1

In [127]:
oof_blend  = oof_cb.copy()
test_blend = test_cb.copy()

try:
    oof_blend  = 0.7*y_val_pred_proba + 0.2*oof_cb + 0.1*oof_gru
    test_blend = 0.7*fpo_test+0.2*test_cb + 0.1*test_gru
except NameError:
    pass  # GRU не считали — используем только CatBoost

best_t, best_f1 = 0.5, 0.0
for t in np.linspace(0.01, 0.7, 70):
    f1 = f1_score(y_tr, (oof_blend >= t).astype(int))
    if f1 > best_f1:
        best_f1, best_t = f1, t
print(f"OOF F1(best): {best_f1:.4f} @ threshold={best_t:.3f}")

OOF F1(best): 0.1606 @ threshold=0.620


In [135]:
import numpy as np
from sklearn.metrics import f1_score

oof_blend = oof_cb.copy()
test_blend = test_cb.copy()

try:
    best_f1 = 0.0
    best_params = {}
    f = np.linspace(0.1, 0.9, 12)
    
    # Перебираем различные комбинации коэффициентов
    for i in tqdm(range(0, len(f))):
        coef_fpo = f[i]
        for coef_cb in np.linspace(0.1, 0.9, 12):
            coef_gru = 1.0 - coef_fpo - coef_cb
            
            # Пропускаем невалидные комбинации
            if coef_gru < 0:
                continue
            
            # Создаем блендинг
            current_oof = coef_fpo * y_val_pred_proba + coef_cb * oof_cb + coef_gru * oof_gru
            
            # Оптимизируем порог для этой комбинации коэффициентов
            for t in np.linspace(0.01, 0.8, 30):
                f1 = f1_score(y_tr, (current_oof >= t).astype(int))
                
                if f1 > best_f1:
                    best_f1 = f1
                    best_params = {
                        'coef_fpo': coef_fpo,
                        'coef_cb': coef_cb,
                        'coef_gru': coef_gru,
                        'threshold': t
                    }
    
    print(f"Optimal coefficients: FPO={best_params['coef_fpo']:.3f}, "
          f"CB={best_params['coef_cb']:.3f}, GRU={best_params['coef_gru']:.3f}")
    print(f"Optimal threshold: {best_params['threshold']:.3f}")
    print(f"OOF F1(best): {best_f1:.4f}")
    
    # Создаем финальные предсказания
    oof_blend = (best_params['coef_fpo'] * y_val_pred_proba + 
                 best_params['coef_cb'] * oof_cb + 
                 best_params['coef_gru'] * oof_gru)
    
    test_blend = (best_params['coef_fpo'] * fpo_test + 
                  best_params['coef_cb'] * test_cb + 
                  best_params['coef_gru'] * test_gru)

except NameError:
    # Если GRU не доступен, оптимизируем только порог
    print("GRU not available - optimizing threshold only")
    
    best_t, best_f1 = 0.5, 0.0
    for t in np.linspace(0.01, 0.7, 70):
        f1 = f1_score(y_tr, (oof_blend >= t).astype(int))
        if f1 > best_f1:
            best_f1, best_t = f1, t
    
    print(f"OOF F1(best): {best_f1:.4f} @ threshold={best_t:.3f}")

100%|██████████| 12/12 [04:17<00:00, 21.47s/it]

Optimal coefficients: FPO=0.827, CB=0.100, GRU=0.073
Optimal threshold: 0.629
OOF F1(best): 0.1660





In [132]:
np.linspace(0.1, 0.9, 9)

array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

In [136]:
test_blend = 0.827*fpo_test+0.1*test_cb + 0.073*test_gru

In [137]:
submission = pd.DataFrame({
    'id': X_test['id'].values,
    'flag': (test_blend >= 0.629).astype(int)
})
submission.to_csv("submission_soq_final_blend.csv", index=False)
print("Saved: submission_soq_final_blend.csv")

Saved: submission_soq_final_blend.csv
