In [1]:
import pandas as pd
import numpy as np


In [2]:

# Load the pre-split feature (X_*) and target (y_*) tables from parquet files
# located in the current working directory.
X_train, y_train = pd.read_parquet("X_train.parquet"), pd.read_parquet("y_train.parquet")
X_valid, y_valid = pd.read_parquet("X_valid.parquet"), pd.read_parquet("y_valid.parquet")
X_test, y_test = pd.read_parquet("X_test.parquet"), pd.read_parquet("y_test.parquet")

# Sanity check: feature-frame shapes first, then the column means of each
# target frame for the three splits.
print(X_train.shape, X_valid.shape, X_test.shape)
print(y_train.mean(), y_valid.mean(), y_test.mean())


(3032808, 26) (1516404, 26) (1516404, 26)
vacation_flag    0.156954
dtype: float64 vacation_flag    0.154247
dtype: float64 vacation_flag    0.138047
dtype: float64


In [3]:
# Print the index range, column names, non-null counts and dtypes of the test feature frame.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1516404 entries, 0 to 1516403
Data columns (total 26 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   year                    1516404 non-null  int32  
 1   month                   1516404 non-null  int32  
 2   is_summer_month         1516404 non-null  int8   
 3   department              1516404 non-null  string 
 4   department_size         1516404 non-null  float64
 5   region                  1516404 non-null  string 
 6   is_manager              1516404 non-null  int8   
 7   manager_avoids_summer   1516404 non-null  int8   
 8   is_north_region         1516404 non-null  int8   
 9   north_bonus_days        1516404 non-null  float64
 10  age_years               1516404 non-null  float64
 11  birth_month             1516404 non-null  float64
 12  is_married              1516404 non-null  int8   
 13  children_count          1516404 non-null  float64
 14  sc

In [4]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [5]:
from sklearn.model_selection import train_test_split

# Number of training rows kept for fitting; the remainder of the split is
# returned as Xdrop / ydrop.
sample_size = 1_000_000

# Stratify on the target and fix the seed so the draw is repeatable.
Xsample, Xdrop, ysample, ydrop = train_test_split(
    X_train, y_train,
    random_state=42,
    stratify=y_train,
    train_size=sample_size,
)

# Report the sample shape and the mean of the sampled target.
print("Размер сэмпла:", Xsample.shape)
print("Доля класса 1:", ysample.mean())


Размер сэмпла: (1000000, 26)
Доля класса 1: vacation_flag    0.156954
dtype: float64


In [6]:
# Collect the text-like columns that will be passed to CatBoost as categorical
# features. X_test.info() reports `department` and `region` with dtype
# `string`, so asking for "object" alone depends on how the installed pandas
# maps string columns onto "object"; requesting "string" (and "category")
# explicitly makes the selection independent of that.
catcols = X_train.select_dtypes(include=["object", "string", "category"]).columns.tolist()
print("Категориальные признаки:", catcols)


Категориальные признаки: ['department', 'region']


In [7]:
from catboost import CatBoostClassifier

# Unfitted classifier configuration; fitting is done in a separate cell.
model = CatBoostClassifier(
    # Objective and the metric reported on the evaluation set.
    loss_function="Logloss",
    eval_metric="AUC",
    # Ensemble size, step size and tree depth.
    iterations=200,
    learning_rate=0.1,
    depth=6,
    # Class re-weighting mode.
    auto_class_weights="Balanced",
    # Fixed seed and logging cadence (a log line every 25 iterations).
    random_seed=42,
    verbose=25,
)


In [8]:
# Fit on the stratified sample. The validation split is supplied as the
# evaluation set, and use_best_model=True is requested on top of it.
model.fit(
    Xsample, ysample,
    cat_features=catcols,
    eval_set=(X_valid, y_valid),
    use_best_model=True,
)


0:	test: 0.7390289	best: 0.7390289 (0)	total: 1.89s	remaining: 6m 15s
25:	test: 0.7609446	best: 0.7609446 (25)	total: 42.5s	remaining: 4m 44s
50:	test: 0.7648991	best: 0.7650931 (49)	total: 1m 20s	remaining: 3m 53s
75:	test: 0.7668474	best: 0.7668474 (75)	total: 2m	remaining: 3m 16s
100:	test: 0.7675849	best: 0.7675849 (100)	total: 2m 37s	remaining: 2m 34s
125:	test: 0.7675846	best: 0.7676685 (111)	total: 3m 15s	remaining: 1m 54s
150:	test: 0.7676484	best: 0.7679490 (138)	total: 3m 53s	remaining: 1m 15s
175:	test: 0.7677318	best: 0.7679560 (161)	total: 4m 30s	remaining: 36.9s
199:	test: 0.7682742	best: 0.7682742 (199)	total: 5m 6s	remaining: 0us

bestTest = 0.7682742022
bestIteration = 199



<catboost.core.CatBoostClassifier at 0x7806b531d310>

In [11]:
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import precision_score, recall_score, f1_score

# Positive-class probabilities on the validation split. Computed in this cell
# so it runs on a fresh kernel instead of relying on a variable that no
# remaining cell defines.
valid_proba = model.predict_proba(X_valid)[:, 1]

# Threshold-free metrics computed from the raw probabilities.
roc = roc_auc_score(y_valid, valid_proba)
pr = average_precision_score(y_valid, valid_proba)

# Hard labels at a 0.5 cut-off for the threshold-dependent metrics.
valid_pred = (valid_proba >= 0.5).astype(int)

precision = precision_score(y_valid, valid_pred)
recall = recall_score(y_valid, valid_pred)
f1 = f1_score(y_valid, valid_pred)

print("Validation ROC AUC:", roc)
print("Validation PR AUC:", pr)
print("Validation Precision:", precision)
print("Validation Recall:", recall)
print("Validation F1:", f1)


Validation ROC AUC: 0.7682742022340834
Validation PR AUC: 0.3776923448576305
Validation Precision: 0.3200371297214675
Validation Recall: 0.6883724310712652
Validation F1: 0.43693504223869134
