In [None]:
def get_model_search(trial, name):
    """Build an unfitted estimator whose hyperparameters are sampled from ``trial``.

    Only the branch matching ``name`` calls ``trial.suggest_*``, so each call
    registers just the parameters of the selected model. The "logreg", "knn",
    "svm" and "gnb" models are wrapped in a ``Pipeline`` that applies
    ``StandardScaler`` first; their parameter names carry a ``clf__`` prefix.
    The tree/boosting models are returned as bare estimators.

    Args:
        trial: Object exposing ``suggest_float`` / ``suggest_int`` /
            ``suggest_categorical`` (presumably an ``optuna.Trial`` --
            confirm against the caller).
        name: One of "logreg", "knn", "svm", "gnb", "dt", "rf", "extra",
            "ada", "gb", "lgbm", "xgb".

    Returns:
        An unfitted ``Pipeline`` or classifier configured with the sampled
        hyperparameters.

    Raises:
        ValueError: If ``name`` is not one of the supported model keys.
    """

    if name == "logreg":
        return Pipeline([
            ("scaler", StandardScaler()),
            ("clf", LogisticRegression(
                # l1_ratio only takes effect with the elastic-net penalty;
                # under the default penalty, scikit-learn releases before 1.8
                # ignore it (with a warning), which made this search
                # dimension a no-op.
                # NOTE(review): scikit-learn >= 1.8 deprecates `penalty` in
                # favour of `l1_ratio` alone -- revisit when upgrading.
                penalty="elasticnet",
                C=trial.suggest_float("clf__C", 1e-3, 10, log=True),
                l1_ratio=trial.suggest_float("clf__l1_ratio", 0.0, 1.0),
                solver="saga",
                class_weight="balanced",
                max_iter=5000,
                random_state=42
            ))
        ])

    if name == "knn":
        return Pipeline([
            ("scaler", StandardScaler()),
            ("clf", KNeighborsClassifier(
                n_neighbors=trial.suggest_int("clf__n_neighbors", 3, 25),
                weights=trial.suggest_categorical("clf__weights", ["uniform", "distance"])
            ))
        ])

    if name == "svm":
        return Pipeline([
            ("scaler", StandardScaler()),
            ("clf", SVC(
                C=trial.suggest_float("clf__C", 1e-2, 10, log=True),
                kernel="rbf",
                # predict_proba is only available when probability=True.
                probability=True,
                class_weight="balanced",
                random_state=42
            ))
        ])

    if name == "gnb":
        return Pipeline([
            ("scaler", StandardScaler()),
            ("clf", GaussianNB(
                var_smoothing=trial.suggest_float("clf__var_smoothing", 1e-12, 1e-8, log=True)
            ))
        ])

    if name == "dt":
        return DecisionTreeClassifier(
            max_depth=trial.suggest_int("max_depth", 2, 15),
            min_samples_split=trial.suggest_int("min_samples_split", 2, 20),
            class_weight="balanced",
            random_state=42
        )

    if name == "rf":
        return RandomForestClassifier(
            n_estimators=trial.suggest_int("n_estimators", 200, 600),
            max_depth=trial.suggest_int("max_depth", 3, 15),
            class_weight="balanced",
            n_jobs=-1,
            random_state=42
        )

    if name == "extra":
        return ExtraTreesClassifier(
            n_estimators=trial.suggest_int("n_estimators", 200, 600),
            max_depth=trial.suggest_int("max_depth", 3, 15),
            class_weight="balanced",
            n_jobs=-1,
            random_state=42
        )

    if name == "ada":
        return AdaBoostClassifier(
            n_estimators=trial.suggest_int("n_estimators", 100, 400),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 1.0),
            random_state=42
        )

    if name == "gb":
        return GradientBoostingClassifier(
            n_estimators=trial.suggest_int("n_estimators", 100, 400),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.2),
            max_depth=trial.suggest_int("max_depth", 2, 5),
            random_state=42
        )

    if name == "lgbm":
        return LGBMClassifier(
            n_estimators=trial.suggest_int("n_estimators", 200, 600),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.2),
            num_leaves=trial.suggest_int("num_leaves", 16, 64),
            class_weight="balanced",
            random_state=42
        )

    if name == "xgb":
        return XGBClassifier(
            n_estimators=trial.suggest_int("n_estimators", 200, 600),
            max_depth=trial.suggest_int("max_depth", 3, 10),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.2),
            subsample=trial.suggest_float("subsample", 0.6, 1.0),
            colsample_bytree=trial.suggest_float("colsample_bytree", 0.6, 1.0),
            eval_metric="logloss",
            tree_method="hist",
            random_state=42
        )

    # No branch matched: fail loudly instead of returning None.
    raise ValueError(f"Unknown model name: {name!r}")


In [None]:
def make_objective(model_name, X, y):
    """Create an objective function bound to one model and one dataset.

    The returned callable takes a ``trial``, builds the estimator for
    ``model_name`` via ``get_model_search`` and returns the mean of the
    ``cross_val_score`` results computed with ``scoring="f1"`` on the
    module-level ``cv`` splitter.

    Args:
        model_name: Model key forwarded to ``get_model_search``.
        X: Features passed unchanged to ``cross_val_score``.
        y: Targets passed unchanged to ``cross_val_score``.

    Returns:
        A one-argument function ``objective(trial)``.
    """

    def objective(trial):
        estimator = get_model_search(trial, model_name)
        fold_scores = cross_val_score(
            estimator, X, y, scoring="f1", cv=cv, n_jobs=-1
        )
        return fold_scores.mean()

    return objective


In [None]:
def objective_catboost(trial, X, y):
    """Return the mean cross-validated F1 of a CatBoost model for one trial.

    Samples ``iterations``, ``depth`` and ``learning_rate`` from ``trial``,
    then fits a fresh ``CatBoostClassifier`` on every split produced by the
    module-level ``cv`` splitter and scores its predictions with ``f1_score``.

    Args:
        trial: Object exposing ``suggest_int`` / ``suggest_float``
            (presumably an ``optuna.Trial`` -- confirm against the caller).
        X: Feature table; rows are selected positionally with ``.iloc``.
        y: Target labels; rows are selected positionally with ``.iloc``.

    Returns:
        The mean of the per-fold F1 scores.
    """
    # Sample in a fixed order: iterations, depth, learning_rate.
    n_iterations = trial.suggest_int("iterations", 200, 600)
    tree_depth = trial.suggest_int("depth", 4, 10)
    step_size = trial.suggest_float("learning_rate", 0.01, 0.2)

    catboost_kwargs = dict(
        iterations=n_iterations,
        depth=tree_depth,
        learning_rate=step_size,
        loss_function="Logloss",
        auto_class_weights="Balanced",
        verbose=False,
        random_seed=42,
    )

    def _fold_f1(train_idx, valid_idx):
        # A new classifier per fold, trained on the train rows only.
        clf = CatBoostClassifier(**catboost_kwargs)
        clf.fit(X.iloc[train_idx], y.iloc[train_idx])
        return f1_score(y.iloc[valid_idx], clf.predict(X.iloc[valid_idx]))

    return np.mean(
        [_fold_f1(train_idx, valid_idx) for train_idx, valid_idx in cv.split(X, y)]
    )


In [None]:
def f1_from_scores(model, X, y, threshold=0.5):
    """Compute F1 after thresholding column 1 of ``model.predict_proba(X)``.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        X: Samples to score.
        y: True labels, passed as the first argument to ``f1_score``.
        threshold: Scores greater than or equal to this value become label 1,
            all others label 0.

    Returns:
        The value returned by ``f1_score`` for the thresholded predictions.
    """
    # Column 1 is used as the score (assumes a binary classifier whose second
    # column is the positive class -- TODO confirm).
    class_probabilities = model.predict_proba(X)
    is_positive = class_probabilities[:, 1] >= threshold
    return f1_score(y, is_positive.astype(int))


In [None]:
def objective(trial):
    """Return the mean cross-validated F1 at a 0.5 threshold for one trial.

    Reads the module-level ``model_name``, ``X_sel``, ``y_train`` and ``cv``.
    The estimator is built once via ``get_model_search`` and refit on the
    training rows of each split; each validation fold is scored through
    ``f1_from_scores`` so the thresholding logic lives in a single place.

    Args:
        trial: Trial object forwarded to ``get_model_search``.

    Returns:
        The mean of the per-fold F1 scores.
    """
    model = get_model_search(trial, model_name)

    fold_f1s = []
    for tr, va in cv.split(X_sel, y_train):
        model.fit(X_sel.iloc[tr], y_train.iloc[tr])
        # Reuse the shared helper rather than duplicating the
        # predict_proba -> threshold -> f1_score steps here.
        fold_f1s.append(
            f1_from_scores(model, X_sel.iloc[va], y_train.iloc[va], threshold=0.5)
        )

    return np.mean(fold_f1s)
