In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# ---------- data ----------
# Read both tables from the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# The label is the "target" column; every other train column is a feature.
y = train["target"]
X = train.drop("target", axis=1)
X_test = test.copy()

# ---------- pipeline ----------
# Standardize the features, then fit a class-weighted liblinear logistic regression.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(
        class_weight="balanced",
        solver="liblinear",
        tol=1e-3,
        max_iter=300,
    )),
])

# ---------- cross-validation ----------
# Shuffled, seeded 5-fold stratified splitter.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Each metric is registered under its own scorer name.
scoring = {name: name for name in ("accuracy", "f1", "roc_auc")}

cv_results = cross_validate(
    pipe, X, y,
    scoring=scoring,
    cv=cv,
    return_train_score=False,
    n_jobs=-1,
)

# ---------- CV results ----------
# One line per metric: mean, std and the per-fold values rounded to 4 decimals.
for metric in scoring:
    scores = cv_results[f"test_{metric}"]
    print(
        f"{metric}: mean = {scores.mean():.4f}, std = {scores.std():.4f}, "
        f"folds = {np.round(scores, 4)}"
    )

# ---------- final training ----------
# Refit on all training rows, then predict labels and class-1 probabilities for X_test.
pipe.fit(X, y)

test_proba = pipe.predict_proba(X_test)[:, 1]
test_pred = pipe.predict(X_test)

### Baseline: cross-validation and grid search

In [None]:
# ---------- hold-out split ----------
# No cell in this notebook defines X_train / y_train / y_test, so this cell
# raised NameError on a fresh kernel. Carve a stratified hold-out set out of
# X, y (built in the data-loading cell) instead. The hold-out is named
# X_valid / y_valid so that X_test (the copy of test.csv) is not overwritten.
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# ---------- baseline pipeline ----------
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(max_iter=1000))
])

scoring = {
    "accuracy": "accuracy",
    "f1": "f1",
    "roc_auc": "roc_auc"
}

# 5-fold cross-validation of the untuned pipeline on the training part only.
cv_results = cross_validate(
    pipe,
    X_train,
    y_train,
    cv=5,
    scoring=scoring
)

for metric in scoring:
    print(metric, np.mean(cv_results[f"test_{metric}"]))

# ---------- grid search ----------
# Every combination uses the liblinear solver, which is set explicitly here
# for both the "l1" and "l2" penalties.
param_grid = {
    "logreg__C": [0.01, 0.1, 1, 10, 100],
    "logreg__penalty": ["l1", "l2"],
    "logreg__solver": ["liblinear"],
    "logreg__class_weight": [None, "balanced"]
}

gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1
)

gs.fit(X_train, y_train)

best_model = gs.best_estimator_

# ---------- hold-out evaluation ----------
# Score the selected model on rows that were not used for the search.
y_pred = best_model.predict(X_valid)
y_proba = best_model.predict_proba(X_valid)[:, 1]

print("Accuracy:", accuracy_score(y_valid, y_pred))
print("F1:", f1_score(y_valid, y_pred))
print("ROC-AUC:", roc_auc_score(y_valid, y_proba))

### Randomized search CV

In [None]:
import pandas as pd
from scipy.stats import loguniform

from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# ---------- data ----------
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Label column is "target"; the remaining train columns are the features.
y = train["target"]
X = train.drop("target", axis=1)
X_test = test.copy()

# ---------- model ----------
# Scaler followed by a liblinear logistic regression.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(
        tol=1e-3,
        max_iter=300,   # speed: lower iteration cap
        solver="liblinear",
    )),
])

# speed: only 3 folds
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# C is drawn log-uniformly from a narrowed range; the other two are discrete choices.
param_dist = dict(
    logreg__C=loguniform(1e-3, 1e2),
    logreg__penalty=["l1", "l2"],
    logreg__class_weight=[None, "balanced"],
)

# Seeded random search with a hard limit of 20 sampled candidates, ranked by ROC-AUC.
rs = RandomizedSearchCV(
    pipe,
    param_distributions=param_dist,
    scoring="roc_auc",
    n_iter=20,
    cv=cv,
    random_state=42,
    n_jobs=-1,
)

rs.fit(X, y)

# Best pipeline found by the search.
best_model = rs.best_estimator_

# Class-1 probabilities and hard labels for X_test.
test_proba = best_model.predict_proba(X_test)[:, 1]
test_pred = best_model.predict(X_test)

### Bayes search CV

In [9]:
pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.7.0-py3-none-any.whl.metadata (12 kB)
Collecting PyYAML (from pyaml>=16.9->scikit-optimize)
  Downloading pyyaml-6.0.3-cp311-cp311-win_amd64.whl.metadata (2.4 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
Downloading pyaml-25.7.0-py3-none-any.whl (26 kB)
Downloading pyyaml-6.0.3-cp311-cp311-win_amd64.whl (158 kB)
Installing collected packages: PyYAML, pyaml, scikit-optimize

   -------------------------- ------------- 2/3 [scikit-optimize]
   ---------------------------------------- 3/3 [scikit-optimize]

Successfully installed PyYAML-6.0.3 pyaml-25.7.0 scikit-optimize-0.10.2
Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from skopt import BayesSearchCV
from skopt.space import Real, Categorical

# ---------- data ----------
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Split the training table into the "target" label and the feature columns.
y = train["target"]
X = train.drop("target", axis=1)
X_test = test.copy()

# ---------- model ----------
# Scaler followed by a liblinear logistic regression.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(
        max_iter=300,   # speed: lower iteration cap
        tol=1e-3,
        solver="liblinear",
    )),
])

# speed: only 3 folds
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Search space: log-uniform C plus two categorical switches.
search_space = dict(
    logreg__C=Real(1e-3, 1e2, prior="log-uniform"),
    logreg__penalty=Categorical(["l1", "l2"]),
    logreg__class_weight=Categorical([None, "balanced"]),
)

# Seeded search capped at 25 iterations, scored by ROC-AUC.
opt = BayesSearchCV(
    pipe,
    search_space,
    scoring="roc_auc",
    n_iter=25,
    cv=cv,
    random_state=42,
    n_jobs=-1,
)

opt.fit(X, y)

# Best pipeline found by the search.
best_model = opt.best_estimator_

# Class-1 probabilities and hard labels for X_test.
test_proba = best_model.predict_proba(X_test)[:, 1]
test_pred = best_model.predict(X_test)

### Optuna

In [11]:
pip install optuna

Collecting optuna
  Downloading optuna-4.7.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.18.3-py3-none-any.whl.metadata (7.2 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Collecting sqlalchemy>=1.4.2 (from optuna)
  Downloading sqlalchemy-2.0.46-cp311-cp311-win_amd64.whl.metadata (9.8 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading mako-1.3.10-py3-none-any.whl.metadata (2.9 kB)
Collecting greenlet>=1 (from sqlalchemy>=1.4.2->optuna)
  Downloading greenlet-3.3.1-cp311-cp311-win_amd64.whl.metadata (3.8 kB)
Collecting MarkupSafe>=0.9.2 (from Mako->alembic>=1.5.0->optuna)
  Downloading markupsafe-3.0.3-cp311-cp311-win_amd64.whl.metadata (2.8 kB)
Downloading optuna-4.7.0-py3-none-any.whl (413 kB)
Downloading alembic-1.18.3-py3-none-any.whl (262 kB)
Downloading sqlalchemy-2.0.46-cp311-cp311-win_amd64.whl (2.1 MB)
   ---------------------------------------- 0.0/2.1 MB 

In [None]:
import pandas as pd
import optuna

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# ---------- data ----------
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Label column is "target"; the remaining train columns are the features.
y = train["target"]
X = train.drop("target", axis=1)
X_test = test.copy()

# Shuffled, seeded stratified splitter; 3 folds to keep each trial fast.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

def objective(trial):
    """Score one hyperparameter configuration for the Optuna study.

    Samples ``C`` (log scale, 1e-3 to 1e2), ``penalty`` ("l1" or "l2") and
    ``class_weight`` (None or "balanced") from ``trial``, builds a
    StandardScaler -> liblinear LogisticRegression pipeline with them, and
    cross-validates it on the notebook-level ``X`` and ``y`` using the
    notebook-level splitter ``cv``.

    Returns:
        The mean ROC-AUC across the cross-validation folds.
    """
    # Ask the trial for each hyperparameter in a fixed order.
    c_value = trial.suggest_float("C", 1e-3, 1e2, log=True)
    penalty_kind = trial.suggest_categorical("penalty", ["l1", "l2"])
    weighting = trial.suggest_categorical("class_weight", [None, "balanced"])

    classifier = LogisticRegression(
        C=c_value,
        penalty=penalty_kind,
        class_weight=weighting,
        solver="liblinear",
        max_iter=300,   # speed: lower iteration cap
        tol=1e-3,
    )
    candidate = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("logreg", classifier),
    ])

    fold_auc = cross_val_score(
        candidate, X, y,
        scoring="roc_auc",
        cv=cv,
        n_jobs=-1,
    )
    return fold_auc.mean()

# ---------- study ----------
# Seed the sampler so the sequence of suggested hyperparameters is repeatable,
# matching the random_state=42 used by the other search cells. The run is
# bounded by wall-clock time, so the number of completed trials can still
# differ between runs.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
    # NOTE: a pruner only acts when the objective calls trial.report() and
    # trial.should_prune(). The objective in this notebook does neither, so
    # no trial is stopped early; the pruner is kept for when it does.
    pruner=optuna.pruners.MedianPruner()
)

study.optimize(
    objective,
    timeout=300        # time budget: 300 seconds (5 minutes)
)

# ---------- final model ----------
# best_params holds the keys "C", "penalty" and "class_weight" suggested in
# the objective, which map directly onto LogisticRegression arguments.
best_model = Pipeline([
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(
        **study.best_params,
        solver="liblinear",
        max_iter=300,
        tol=1e-3
    ))
])

# Refit on all training rows, then predict labels and class-1 probabilities for X_test.
best_model.fit(X, y)

test_pred = best_model.predict(X_test)
test_proba = best_model.predict_proba(X_test)[:, 1]