Задание:

Есть датасет. Собран он следующим образом:

Из логов были взяты пары ([запрос] - [объект на который кликнул пользователь]). Это положительные примеры (метка 1 в датасете)
Для каждого запроса был подобран в пару негативный объект (метка 0) следующим образом: определяем к какой рубрике относится положительный пример; выбираем случайный объект из другой рубрики. Идея в том, что этот пример маловероятно будет релевантным.
На этих данных, используя кросс-валидацию, обучались различные модели. Метрики качества были хорошими. При попытки тестирования на реальных данных, качество моделей сильно уступало тестовым метрикам.

Задача: выявить особенности датасета, которые приводили к данным результатам и объяснить почему так происходило.


---



Отчет описан в файле **report.pdf** и **report.md**

In [4]:
from sklearn.model_selection import cross_validate, TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd
import numpy as np

In [None]:
path_dataset = '/content/clicks_dataset_msk_20230101_20230725_spec.csv'

In [21]:
df = pd.read_csv(path_dataset, names=['query_id','object_id', 'target'], nrows=1000000)

In [22]:
df.head()

Unnamed: 0,query_id,object_id,target
0,1590973,168299,1
1,1590973,718560,0
2,1234953,325828,1
3,1234953,135968,0
4,3326557,334526,1


In [23]:
# Преобразование типов данных
df['query_id'] = df['query_id'].astype('int32')
df['object_id'] = df['object_id'].astype('int32')
df['target'] = df['target'].astype('int8')

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 3 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   query_id   1000000 non-null  int32
 1   object_id  1000000 non-null  int32
 2   target     1000000 non-null  int8 
dtypes: int32(2), int8(1)
memory usage: 8.6 MB


In [25]:
# Распределение целевой переменной
df['target'].value_counts()

1    500000
0    500000
Name: target, dtype: int64

In [26]:
# Всего рубрик
df['object_id'].nunique()

488829

In [27]:
# Распределение рубрик
df['object_id'].value_counts()

135367    255
161420    187
128803    173
91490     166
356746    149
         ... 
183184      1
562640      1
103683      1
851916      1
612541      1
Name: object_id, Length: 488829, dtype: int64

### Расчет метрик для среза датасета в 1 млн строк

In [28]:
# Разделение на признаки и целевую переменную
X = df.drop('target', axis=1)
y = df['target']

# Создание моделей
rf_model = RandomForestClassifier(n_estimators=100)
xgb_model = xgb.XGBClassifier(n_estimators=100)
catboost_model = CatBoostClassifier(n_estimators=100, verbose=False)
lgb_model = lgb.LGBMClassifier(n_estimators=100)

# Метрики для оценки моделей
scoring_metrics = {
    'f1': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score)
}

tscv = TimeSeriesSplit(n_splits=5)

# Оценка моделей с кросс-валидацией
rf_cv_results = cross_validate(rf_model, X, y, cv=tscv, scoring=scoring_metrics)
xgb_cv_results = cross_validate(xgb_model, X, y, cv=tscv, scoring=scoring_metrics)
catboost_cv_results = cross_validate(catboost_model, X, y, cv=tscv, scoring=scoring_metrics)
lgb_cv_results = cross_validate(lgb_model, X, y, cv=tscv, scoring=scoring_metrics)

[LightGBM] [Info] Number of positive: 83335, number of negative: 83335
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004047 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 166670, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 166668, number of negative: 166668
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017167 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 333336, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 250001, number of negative: 250001
[LightGBM] [Info] Auto-choosing col-wise multi-th

#### Вывод результатов

In [29]:
print("RandomForest:")
for metric, values in rf_cv_results.items():
    if metric != 'fit_time' and metric != 'score_time':
        print(f"{metric}: Mean {np.mean(values):.4f}")

print("XGBoost:")
for metric, values in xgb_cv_results.items():
    if metric != 'fit_time' and metric != 'score_time':
        print(f"{metric}: Mean {np.mean(values):.4f}")

print("CatBoost:")
for metric, values in catboost_cv_results.items():
    if metric != 'fit_time' and metric != 'score_time':
        print(f"{metric}: Mean {np.mean(values):.4f}")

print("LightGBM:")
for metric, values in lgb_cv_results.items():
    if metric != 'fit_time' and metric != 'score_time':
        print(f"{metric}: Mean {np.mean(values):.4f}")

RandomForest:
test_f1: Mean 0.7313
test_roc_auc: Mean 0.7437
XGBoost:
test_f1: Mean 0.6743
test_roc_auc: Mean 0.7222
CatBoost:
test_f1: Mean 0.6691
test_roc_auc: Mean 0.7193
LightGBM:
test_f1: Mean 0.6628
test_roc_auc: Mean 0.7174


### Расчет метрик для всего датасета

In [30]:
full_df = pd.read_csv(path_dataset, names=['query_id','object_id', 'target'])

# Преобразование типов данных
full_df['query_id'] = full_df['query_id'].astype('int32')
full_df['object_id'] = full_df['object_id'].astype('int32')
full_df['target'] = full_df['target'].astype('int8')

# Разделение на признаки и целевую переменную
X_full = full_df.drop('target', axis=1)
y_full = full_df['target']

# Оценка моделей с кросс-валидацией
xgb_cv_results_full = cross_validate(xgb_model, X_full, y_full, cv=tscv, scoring=scoring_metrics)
catboost_cv_results_full = cross_validate(catboost_model, X_full, y_full, cv=tscv, scoring=scoring_metrics)
lgb_cv_results_full = cross_validate(lgb_model, X_full, y_full, cv=tscv, scoring=scoring_metrics)

[LightGBM] [Info] Number of positive: 83335, number of negative: 83335
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006438 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 166670, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 166668, number of negative: 166668
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012646 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 333336, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 250001, number of negative: 250001
[LightGBM] [Info] Auto-choosing col-wise multi-th

#### Выввод результатов

In [31]:
print("XGBoost:")
for metric, values in xgb_cv_results_full.items():
    if metric != 'fit_time' and metric != 'score_time':
        print(f"{metric}: Mean {np.mean(values):.4f}")

print("CatBoost:")
for metric, values in catboost_cv_results_full.items():
    if metric != 'fit_time' and metric != 'score_time':
        print(f"{metric}: Mean {np.mean(values):.4f}")

print("LightGBM:")
for metric, values in lgb_cv_results_full.items():
    if metric != 'fit_time' and metric != 'score_time':
        print(f"{metric}: Mean {np.mean(values):.4f}")

XGBoost:
test_f1: Mean 0.6743
test_roc_auc: Mean 0.7222
CatBoost:
test_f1: Mean 0.6691
test_roc_auc: Mean 0.7193
LightGBM:
test_f1: Mean 0.6628
test_roc_auc: Mean 0.7174


### Пример датасета, который должен и на обучении, и на тестировании на реальных данных показывать хорошие результаты

In [32]:
feature_df = pd.read_csv(path_dataset, names=['query_id','object_id', 'target'], nrows=100)

# Количество дополнительных записей с target 0
additional_records = 8

for i in feature_df['query_id'].unique():
    additional_data = {
        'query_id': i,
        'object_id': np.random.choice(df['object_id'], additional_records),
        'target': np.zeros(additional_records)
    }

    additional_df = pd.DataFrame(additional_data)
    feature_df = pd.concat([feature_df, additional_df], ignore_index=True)

*Возьмем для пример query_id = 1590973*

In [33]:
feature_df[feature_df['query_id'] == 1590973]

Unnamed: 0,query_id,object_id,target
0,1590973,168299,1.0
1,1590973,718560,0.0
100,1590973,265715,0.0
101,1590973,602912,0.0
102,1590973,1062984,0.0
103,1590973,105214,0.0
104,1590973,468598,0.0
105,1590973,1047166,0.0
106,1590973,522739,0.0
107,1590973,102560,0.0
