In [47]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, SplineTransformer, StandardScaler

# Load the training data and show its dimensions and column names.
df_train = pd.read_csv("train.csv")
print("Shape:", df_train.shape)
print("Columns:", list(df_train.columns))

Shape: (750000, 18)
Columns: ['id', 'age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']


# Target variable normalization

# Binary target variable

# Check that both classes are present

# Features

# Check that all expected features are present

In [None]:
# Normalize the raw target column to the two labels "no" / "yes".
# Missing values are recorded BEFORE the string cast: `astype(str)` can turn NaN
# into the literal string "nan", after which `fillna` has nothing left to fill.
target_raw = df_train["y"]
target_missing = target_raw.isna()
target_norm = (
    target_raw.astype(str)
    .str.strip()
    .str.lower()  # accept "Yes"/"NO" spellings
    # Accept integer- and float-coded targets as well as text labels.
    .replace({"0": "no", "1": "yes", "0.0": "no", "1.0": "yes"})
    .mask(target_missing, "no")  # missing targets are treated as negatives
)

# Any other label would be silently counted as class 0 by the `== "yes"` test
# below, so fail loudly instead of training on mislabelled rows.
unexpected_labels = sorted(set(target_norm.unique()) - {"no", "yes"})
if unexpected_labels:
    raise ValueError(f"Unexpected values in target 'y': {unexpected_labels}")

df_train["y"] = target_norm
print("Unique values in 'y' after normalization:", df_train["y"].unique())
print("Value counts in 'y':\n", df_train["y"].value_counts(normalize=True))


# Binary target: 1 for "yes", 0 for "no".
y_train = (df_train["y"] == "yes").astype(int)
print("Unique values in y_train:", np.unique(y_train))
print("Value counts in y_train:\n", y_train.value_counts(normalize=True))


# A classifier cannot be trained or scored on a single class.
if y_train.nunique() < 2:
    raise ValueError(
        "y_train contains only one class. Check the target variable transformation."
    )


# Feature matrix: everything except the target; the "id" column is dropped when present.
X_train = df_train.drop(columns="y")
X_train = X_train.drop(columns="id", errors="ignore")

# Columns routed to the numeric (spline) branch of the preprocessor.
num_features = ["age", "balance", "day", "duration", "campaign", "pdays", "previous"]
# Columns routed to the categorical (one-hot) branch of the preprocessor.
cat_features = [
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "month",
    "poutcome",
]


# Stop early if any configured feature is absent from the training frame.
required_features = [*num_features, *cat_features]
missing_features = [name for name in required_features if name not in X_train.columns]
if missing_features:
    raise ValueError(f"Missing features in X_train: {missing_features}")

Unique values in 'y' after normalization: ['no' 'yes']
Value counts in 'y':
 y
no     0.879349
yes    0.120651
Name: proportion, dtype: float64
Unique values in y_train: [0 1]
Value counts in y_train:
 y
0    0.879349
1    0.120651
Name: proportion, dtype: float64


In [None]:
# Numeric branch: expand every numeric column into a cubic spline basis
# (5 knots, bias column included) and then standardize the basis columns.
spline_step = ("splines", SplineTransformer(degree=3, n_knots=5, include_bias=True))
scaler_step = ("scaler", StandardScaler())
num_transformer = Pipeline(steps=[spline_step, scaler_step])


# Preprocessor: spline pipeline for the numeric columns, one-hot encoding for
# the categorical ones. Columns not listed in either group are not passed on
# (ColumnTransformer's default `remainder="drop"`).
# NOTE(review): with drop="first" and handle_unknown="ignore", a category not
# seen during fit is encoded as all zeros, i.e. identically to the dropped
# first category — confirm this aliasing is intended.
categorical_encoder = OneHotEncoder(drop="first", handle_unknown="ignore")
preprocessor = ColumnTransformer(
    [
        ("num", num_transformer, num_features),
        ("cat", categorical_encoder, cat_features),
    ]
)








# Model: preprocessing followed by a class-weighted logistic regression.
classifier = LogisticRegression(
    class_weight="balanced", max_iter=1000, random_state=42
)
model = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", classifier),
    ]
)

# Train/validation split (80/20, stratified on the target).
# Several seeds are tried in order; the first split in which both the training
# and the validation part contain both classes is kept.
# NOTE(review): with stratify=y_train the first seed is expected to succeed on
# any reasonably sized dataset; the retry loop is only a safety net.
split_success = False
for rs in (42, 123, 456, 777, 0):
    X_train_split, X_val, y_train_split, y_val = train_test_split(
        X_train, y_train, test_size=0.2, stratify=y_train, random_state=rs, shuffle=True
    )
    if len(np.unique(y_train_split)) < 2 or len(np.unique(y_val)) < 2:
        # One of the parts lost a class: report it and try the next seed.
        print(
            f"Split failed with rs={rs}: train_classes={np.unique(y_train_split)}, val_classes={np.unique(y_val)}"
        )
        continue

    print(f"Split OK with random_state={rs}")
    print("Classes in train_split:", np.unique(y_train_split))
    print("Classes in val:", np.unique(y_val))
    split_success = True
    break

if split_success:
    # Hold-out evaluation: fit on the training part of the split only, so the
    # validation rows stay unseen while the metrics are computed.
    model.fit(X_train_split, y_train_split)
    y_pred = model.predict(X_val)
    y_pred_proba = model.predict_proba(X_val)[:, 1]
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred))
    print(f"AUC-ROC: {roc_auc_score(y_val, y_pred_proba):.4f}")
else:
    # No usable hold-out split: estimate performance with stratified 5-fold CV.
    print("Split failed — using cross-validation.")
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(
        model, X_train, y_train, cv=cv, scoring="roc_auc", n_jobs=-1
    )
    print(f"Cross-validation AUC-ROC scores: {cv_scores}")
    print(f"Mean AUC-ROC: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Final fit on ALL labelled rows, whichever evaluation path ran. The metrics
# printed above come from a model that never saw the evaluation rows; the model
# used for later predictions is trained on the full dataset so that no
# labelled data is discarded (and both branches leave `model` in the same state).
model.fit(X_train, y_train)

# Score the test set and write the submission file
# (one row per test id, with the predicted probability of the positive class).
df_test = pd.read_csv("test.csv")
X_test = df_test.drop(columns=["id"])
test_predictions = model.predict_proba(X_test)[:, 1]
submission = df_test[["id"]].assign(y=test_predictions)
submission.to_csv("lr_with_splines_submission.csv", index=False)
print("Submission file created!")

Split OK with random_state=42
Classes in train_split: [0 1]
Classes in val: [0 1]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.86      0.92    131902
           1       0.46      0.90      0.61     18098

    accuracy                           0.86    150000
   macro avg       0.72      0.88      0.76    150000
weighted avg       0.92      0.86      0.88    150000

AUC-ROC: 0.9471
Submission file created!
