In [1]:
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
import numpy as np
from tqdm.auto import tqdm



In [2]:
# загружаем данные
triggers = pd.read_csv('triggers.csv', parse_dates=['date'])
actions = pd.read_csv('actions.csv', parse_dates=['date'])

In [3]:
# 2. Определяем max_date для окон
max_date = max(triggers['date'].max(), actions['date'].max())

In [4]:
# 3. Функция агрегации признаков по временным окнам с прогресс-баром
def aggregate_features(df, by, date_col, windows, col, max_date):
    chunks = []
    for w in tqdm(windows, desc=f'agg {col} windows'):
        start = max_date - pd.Timedelta(days=w)
        df_w = df[df[date_col] >= start]
        agg = (
            df_w
            .groupby(by)[col]
            .agg(count='count', unique='nunique')
            .rename(columns={
                'count': f'count_{w}d',
                'unique': f'unique_{w}d'
            })
        )
        chunks.append(agg)
    return pd.concat(chunks, axis=1).fillna(0)


# 4. Строим фичи из triggers
windows = [7, 14, 30]
f_tr = aggregate_features(triggers, 'guid', 'date', windows, 'trigger', max_date)
f_ty = aggregate_features(triggers, 'guid', 'date', windows, 'type', max_date)

# 5. Добавляем префиксы, чтобы избежать пересечений имен столбцов
f_tr = f_tr.add_prefix('tr_')
f_ty = f_ty.add_prefix('ty_')

# 6. Агрегация признаков из actions
act_agg = (
    actions
    .groupby('guid')
    .agg(
        shows_count=('date', 'count'),
        positive_shows=('result', 'sum'),
        last_show=('date', 'max')
    )
)
act_agg['days_since_last_show'] = (
        max_date - act_agg['last_show']
).dt.days.fillna(999)

# 7. Объединяем все признаки в один DataFrame
features = (
    f_tr
    .join(f_ty, how='outer')
    .join(act_agg, how='outer')
    .fillna({
        'shows_count': 0,
        'positive_shows': 0,
        'days_since_last_show': 999
    })
)
features['user_ctr'] = (
        features['positive_shows'] / features['shows_count'].replace(0, 1)
)

# 8. Добавляем recency из triggers
last_visit = triggers.groupby('guid')['date'].max()
features['days_since_last_visit'] = (
        max_date - last_visit
).dt.days.fillna(999)

# 9. Сохраняем готовые фичи
features.reset_index().to_csv('features_xgb.csv', index=False)
print("features_xgb.csv успешно сохранён, размер:", features.shape)

agg trigger windows:   0%|          | 0/3 [00:00<?, ?it/s]

agg type windows:   0%|          | 0/3 [00:00<?, ?it/s]

features_xgb.csv успешно сохранён, размер: (1009806, 18)


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, classification_report

# загружаем признаки и метки
feat = pd.read_csv('features_xgb.csv')
acts = pd.read_csv('actions.csv', parse_dates=['date'])
df = feat.merge(acts[['guid', 'result']], on='guid').dropna(subset=['result'])

In [26]:
# готовим X и y
drop = ['guid', 'last_show']
X = df.drop(columns=drop + ['result'])
y = df['result']

# split и обучение
X_train, X_vt, y_train, y_vt = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

X_test, X_val, y_test, y_val = train_test_split(
    X_vt, y_vt, test_size=0.5, random_state=42, stratify=y_vt)

In [27]:
# считаем ratio = n_negative / n_positive
ratio = (y_train == 0).sum() / (y_train == 1).sum()

In [53]:
model = XGBClassifier(eval_metric='logloss', scale_pos_weight=ratio, num_boost_round=1000,
                      max_depth=7,  # Глубина дерева
                      learning_rate=0.01,  # Скорость обучения
                      subsample=0.8,  # Доля выборки для каждого дерева
                      colsample_bytree=0.8,  # Доля признаков для каждого дерева
                      min_child_weight=1,  # Минимальный вес в листе
                      gamma=0,  # Минимальное уменьшение потерь
                      reg_alpha=0,  # L1 регуляризация
                      reg_lambda=1,  # L2 регуляризация)
                      )
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=100)

[0]	validation_0-logloss:0.68345


Parameters: { "num_boost_round" } are not used.



[99]	validation_0-logloss:0.22829


In [61]:
# оценка
y_proba = model.predict_proba(X_test)[:, 1]
roc = roc_auc_score(y_test, y_proba)
y_pred = (y_proba >= 0.2).astype(int)
report = classification_report(y_test, y_pred)

print(f"ROC-AUC: {roc:.4f}")
print("Report (threshold=0.2):\n", report)


ROC-AUC: 0.9967
Report (threshold=0.2):
               precision    recall  f1-score   support

           0       1.00      0.99      0.99     36706
           1       0.69      1.00      0.81      1114

    accuracy                           0.99     37820
   macro avg       0.84      0.99      0.90     37820
weighted avg       0.99      0.99      0.99     37820

