In [1]:
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
import joblib
from lightgbm import LGBMClassifier
import optuna
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier


TRAIN_SIZE = 5_000_001
RANDOM_SEED = 42

categorical_features = ["Gender", "Region_Code", "Vehicle_Age", "Vehicle_Damage"]
numeric_features = ["Age", "Driving_License", "Previously_Insured", "Annual_Premium", "Policy_Sales_Channel", "Vintage"]

In [2]:
df = pd.read_csv('train.csv').drop('id', axis=1)

In [3]:
train_df, test_df = df[:TRAIN_SIZE], df[TRAIN_SIZE:]

In [5]:
train_df.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0
1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1
2,Female,25,1,14.0,1,< 1 Year,No,38043.0,152.0,254,0
3,Female,35,1,1.0,0,1-2 Year,Yes,2630.0,156.0,76,0
4,Female,36,1,15.0,1,1-2 Year,No,31951.0,152.0,294,0


In [6]:
test_df.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
5000001,Male,61,1,28.0,0,> 2 Years,Yes,41372.0,30.0,77,1
5000002,Male,44,1,0.0,1,1-2 Year,No,2630.0,122.0,107,0
5000003,Male,25,1,28.0,0,< 1 Year,No,2630.0,152.0,248,0
5000004,Female,26,1,2.0,1,< 1 Year,No,38414.0,160.0,24,0
5000005,Male,28,1,8.0,1,< 1 Year,No,44203.0,152.0,185,0


Категориальные фичи энкодим через onehot, числовые нормализуем

In [4]:
train_df[categorical_features] = train_df[categorical_features].astype('category')

X_train, y_train = train_df.drop(columns=["Response"]), train_df["Response"]
X_test,  y_test  = test_df.drop(columns=["Response"]),  test_df["Response"]

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", StandardScaler(), numeric_features)  # Масштабирование числовых признаков
    ]
)

X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[categorical_features] = train_df[categorical_features].astype('category')


Пытались сделать стекинг алгоритмов, для каждого из которых гиперпараметры находились через `optuna`, но из-за объема датасета не хватало мощностей. Взяли найденные лучшие гиперпараметры с чужого решения на kaggle

In [8]:
def objective(trial, model_type):
    if model_type == "logreg":
        params = {
            "C": trial.suggest_float("C", 1e-5, 10, log=True),
            "solver": trial.suggest_categorical("solver", ["liblinear", "lbfgs"])
        }
        model = LogisticRegression(**params, max_iter=1000, random_state=RANDOM_SEED, class_weight="balanced")
    elif model_type == "svc":
        params = {
            "C": trial.suggest_float("C", 1e-5, 10, log=True),
            "kernel": trial.suggest_categorical("kernel", ["linear", "rbf", "poly"])
        }
        model = SVC(**params, random_state=RANDOM_SEED, probability=True, class_weight="balanced")
    elif model_type == "rf":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 10, 200),
            "max_depth": trial.suggest_int("max_depth", 3, 15)
        }
        model = RandomForestClassifier(**params, random_state=RANDOM_SEED, class_weight="balanced")
    elif model_type == "lgbm":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 1000),
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 15),
            "num_leaves": trial.suggest_int("num_leaves", 10, 100),
            "scale_pos_weight": trial.suggest_float("scale_pos_weight", 1, 10)
        }
        model = LGBMClassifier(**params, random_state=RANDOM_SEED)
    elif model_type == "xgb":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 1000),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "gamma": trial.suggest_float("gamma", 0, 10),
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 100, log=True),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 100, log=True),
            "scale_pos_weight": trial.suggest_float("scale_pos_weight", 1, 10),
        }
        model = XGBClassifier(**params, random_state=RANDOM_SEED, eval_metric="logloss")

    model.fit(X_train_preprocessed, y_train)

    X_test_preprocessed = preprocessor.transform(X_test)
    y_pred = model.predict(X_test_preprocessed)

    f1 = f1_score(y_test, y_pred)
    return f1

models = ["logreg", "svc", "rf", "lgbm", "xgb"]
best_models = {}

for model_type in models:
    study = optuna.create_study(direction="maximize")
    study.optimize(lambda trial: objective(trial, model_type), n_trials=20)
    
    print(f"Лучшие гиперпараметры для {model_type}:", study.best_params)
    print(f"Лучший F1-score для {model_type}:", study.best_value)
    
    best_models[model_type] = study.best_params

base_models = [
    ("logreg", LogisticRegression(**best_models["logreg"], max_iter=1000, random_state=RANDOM_SEED, class_weight="balanced")),
    ("svc", SVC(**best_models["svc"], probability=True, random_state=RANDOM_SEED, class_weight="balanced")),
    ("rf", RandomForestClassifier(**best_models["rf"], random_state=RANDOM_SEED, class_weight="balanced")),
    ("lgbm", LGBMClassifier(**best_models["lgbm"], random_state=RANDOM_SEED)),
    ("xgb", XGBClassifier(**best_models["xgb"], random_state=RANDOM_SEED, eval_metric="logloss"))
]

[I 2025-04-13 20:58:31,577] A new study created in memory with name: no-name-84014964-87d3-4778-8dd0-324e8513340e
[I 2025-04-13 20:59:22,586] Trial 0 finished with value: 0.4159577174653459 and parameters: {'C': 3.7060157204687862, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.4159577174653459.
[I 2025-04-13 21:00:39,684] Trial 1 finished with value: 0.41575953509535435 and parameters: {'C': 0.014770156865537072, 'solver': 'liblinear'}. Best is trial 0 with value: 0.4159577174653459.
[I 2025-04-13 21:01:40,705] Trial 2 finished with value: 0.41413018147664743 and parameters: {'C': 0.0009431182313478506, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.4159577174653459.
[I 2025-04-13 21:02:20,489] Trial 3 finished with value: 0.4042670762660273 and parameters: {'C': 8.001681216183339e-05, 'solver': 'liblinear'}. Best is trial 0 with value: 0.4159577174653459.
[I 2025-04-13 21:03:36,373] Trial 4 finished with value: 0.41588385107536086 and parameters: {'C': 6.177038761297039, 'solve

KeyboardInterrupt: 

In [None]:
cat_params = {
    'iterations': 10000,
    'eval_metric': 'F1',
    'task_type': 'GPU',
    'learning_rate': 0.05,
    'depth': 9,
    'l2_leaf_reg': 55.37964307854247,
    'max_bin': 404,
    'bagging_temperature': 0.017138393608280057,
    'random_strength': 9.256288011643901,
    'auto_class_weights': 'Balanced'
}

model = CatBoostClassifier(**cat_params, random_state=RANDOM_SEED)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

In [None]:
pipeline.fit(X_train, y_train)

In [None]:
f1_score(y_test, pipeline.predict(X_test))

0.4616219979379776

In [None]:
joblib.dump(pipeline, 'pipeline.joblib')

['pipeline.joblib']