In [51]:
import pandas as pd

#type
train_df = pd.read_csv('../data/train_df.csv')
test_df = pd.read_csv('../data/test_df.csv')

In [52]:
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score

features = ['name_lower', 'type_lower', 'category_l2', 'category_l4', 'type_in_name']
cat_features = [0, 1, 2, 3]

X_train = train_df[features]
y_train = train_df['target']
X_test = test_df[features]
y_test = test_df['target']

train_pool = Pool(X_train, label=y_train, cat_features=cat_features)
test_pool = Pool(X_test, label=y_test, cat_features=cat_features)

model = CatBoostClassifier(
    iterations=300,
    learning_rate=0.2,
    depth=6,
    loss_function='Logloss',
    eval_metric='AUC',
    verbose=50,
    random_state=42
)
model.fit(train_pool, eval_set=test_pool, early_stopping_rounds=30)

0:	test: 0.9977446	best: 0.9977446 (0)	total: 177ms	remaining: 52.9s
50:	test: 0.9995858	best: 0.9995858 (28)	total: 4.7s	remaining: 23s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.9995858085
bestIteration = 28

Shrink model to first 29 iterations.


<catboost.core.CatBoostClassifier at 0x2202e6517c0>

In [53]:
y_pred_proba = model.predict_proba(test_pool)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f'ROC-AUC на тесте: {roc_auc:.4f}')

ROC-AUC на тесте: 0.9996


In [None]:
unique_types = train_df['type_lower'].unique()

def get_top_10_types(sample_row):
    candidate_rows = []
    for t in unique_types:
        row = sample_row.copy()
        row['type_lower'] = t
        row['type_in_name'] = int(t in row['name_lower'])
        candidate_rows.append(row)
    df_candidates = pd.DataFrame(candidate_rows)

    # Обязательно те же признаки и порядок
    preds = model.predict_proba(df_candidates[features])[:, 1]
    top_10_idx = preds.argsort()[::-1][:10]
    top_10_types = df_candidates.iloc[top_10_idx][['type_lower']].copy()
    top_10_types['probability'] = preds[top_10_idx]
    return top_10_types.reset_index(drop=True)

# Пример: ранжирование для первой строки теста
sample = X_test.iloc[7].copy()
top_10 = get_top_10_types(sample)
print(X_test.iloc[7])
print(top_10)

name_lower      картина на холсте "пионы в черно-белом стиле, ...
type_lower                                                картина
category_l2                                             Дом и сад
category_l4                                               Картина
type_in_name                                                 True
Name: 7, dtype: object
                          type_lower  probability
0                            картина     0.999900
1                            рассада     0.503371
2                            саженец     0.503371
3                             семена     0.373324
4         комплект постельного белья     0.373324
5                             кружка     0.361741
6                             постер     0.272891
7  печатная книга: пособие для школы     0.182506
8                   фильтр топливный     0.182506
9                            саквояж     0.182506


In [55]:
feature_importances = model.get_feature_importance(train_pool)
for name, score in zip(features, feature_importances):
    print(f"{name}: {score:.3f}")

name_lower: 1.051
type_lower: 48.704
category_l2: 23.584
category_l4: 4.500
type_in_name: 22.161
