In [1]:
# ============================================
#  Library
# ============================================
import os
import math
import time
import random
import numpy as np
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
import numpy as np
import sys
# Extend the module search path so the project-local `MUFJ` package imported
# on the next line can be resolved (presumably it lives under this directory
# -- confirm). NOTE(review): this is an absolute, machine-specific path.
sys.path.append("G:/マイドライブ/signate_MUFJ2023/")
from MUFJ.utils import get_score, seed_everything
import polars as pl
from sklearn.metrics import log_loss
import optuna
# Lower optuna's log level to WARNING so routine per-trial messages are not printed.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [2]:
# ============================================
#  CFG
# ============================================
class CFG:
    """Static configuration for this experiment notebook.

    Attributes are read directly from the class (``CFG.seed``, ``CFG.save_dir``,
    ...); the class is never instantiated in the visible cells.
    """

    # Name of this experiment; used as the stem of the written submission file.
    filename: str = "exp012"

    # Seed handed to `seed_everything` and to the optuna TPE sampler.
    seed: int = 42

    # Directories are joined to file names by plain string concatenation,
    # so both must keep their trailing slash.
    data_dir: str = "G:/マイドライブ/signate_MUFJ2023/data/"
    save_dir: str = "G:/マイドライブ/signate_MUFJ2023/exp/"

In [3]:
# ============================================
#  Utils
# ============================================
# Apply the configured seed through the project helper from MUFJ.utils.
# NOTE(review): presumably this seeds the Python / NumPy RNGs -- confirm in
# MUFJ.utils. The optuna sampler is seeded separately where the study is created.
seed_everything(CFG.seed)

In [4]:
# ============================================
#  Data Loading
# ============================================
# Build one frame holding the out-of-fold predictions of the three base
# experiments (renamed to pred1..pred3) next to the training data.
# Every join is a left join on "index", so the row set is that of exp009's
# OOF file; rows missing from a right-hand file would come through as nulls.
df = (
    pl.read_csv(CFG.save_dir + "oof_df_exp009.csv")
    .rename({"pred": "pred1"})
    .join(
        pl.read_csv(CFG.save_dir + "oof_df_exp010.csv").rename({"pred": "pred2"}),
        on="index",
        how="left",
    )
    .join(
        pl.read_csv(CFG.save_dir + "oof_df_exp011.csv").rename({"pred": "pred3"}),
        on="index",
        how="left",
    )
    # train.csv supplies the "is_fraud?" target used by the later cells.
    .join(
        pl.read_csv(CFG.data_dir + "train.csv"),
        on="index",
        how="left",
    )
)

In [5]:
# ============================================
#  optuna
# ============================================
def objective(trial):
    """Optuna objective: log loss of a weighted blend of the three OOF predictions.

    One weight per base model ("a", "b", "c") is sampled independently from
    [0, 1], and ``pred1*a + pred2*b + pred3*c`` is scored against the
    ``is_fraud?`` column of the module-level ``df``.

    The weights are not constrained to sum to 1, so the raw blend can exceed 1
    and stop being a valid probability. It is clipped to [0, 1] before scoring:
    scikit-learn versions that clip predictions internally return the same
    value as without the clip, while versions that validate the range would
    otherwise raise and abort the study.

    Args:
        trial: the optuna trial used to sample the three weights.

    Returns:
        Log loss of the clipped blend (lower is better).
    """
    a = trial.suggest_float("a", 0, 1)
    b = trial.suggest_float("b", 0, 1)
    c = trial.suggest_float("c", 0, 1)
    pred = df["pred1"]*a + df["pred2"]*b + df["pred3"]*c
    # Keep the blend inside the probability range expected by log_loss.
    pred = np.clip(pred.to_numpy(), 0.0, 1.0)
    return log_loss(y_true=df["is_fraud?"], y_pred=pred)


# Seeded TPE sampler so the weight search is repeatable for a given CFG.seed.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=CFG.seed),
    study_name="Hyperparameter Optimization",
)
study.optimize(objective, n_trials=1000, show_progress_bar=True)

# Raw (un-normalised) weights of the best trial and its log loss; the later
# cells read `best_params` to blend the OOF and test predictions.
best_params = study.best_params
best_value = study.best_value
print(best_params, best_value)

  0%|          | 0/1000 [00:00<?, ?it/s]

{'a': 0.914367472905719, 'b': 0.032955738198027086, 'c': 0.07060997192225493} 0.11795440757466388


In [6]:
# ============================================
#  Check
# ============================================
# Blend the OOF predictions with the tuned weights and store the result as "pred4".
df = df.with_columns(
    (
        pl.col("pred1") * best_params["a"]
        + pl.col("pred2") * best_params["b"]
        + pl.col("pred3") * best_params["c"]
    ).alias("pred4")
)

# Score the blend with the project metric helper; it also returns the decision
# threshold that the test cell reuses.
# NOTE(review): presumably get_score sweeps thresholds in increments of `step`
# and keeps the best one -- confirm in MUFJ.utils.
best_score, threshold = get_score(
    df["is_fraud?"],
    df["pred4"],
    step=0.005,
    return_threshold=True,
    disable=False,
)

# ANSI escape codes: switch to green, print, then reset the colour.
print('\033[32m', "====== CV score ======", '\033[0m', sep="")
print('\033[32m', f'{best_score} (threshold: {threshold})', '\033[0m', sep="")

  0%|          | 0/200 [00:00<?, ?it/s]

[32m0.686212636105139 (threshold: 0.36)[0m


In [7]:
# ============================================
#  test
# ============================================
# Build the submission from the three test-prediction files.
# The files carry no header row, so column names are assigned on read.
# NOTE: `df` is rebound to the test frame here, replacing the OOF frame; the
# test frame has no "is_fraud?" column, so the Data Loading cell must be
# re-run before re-running the optuna / check cells.
df = (
    pl.read_csv(
        CFG.save_dir + "exp009.csv",
        has_header=False,
        new_columns=["index", "pred1"],
    )
    .join(
        pl.read_csv(
            CFG.save_dir + "exp010.csv",
            has_header=False,
            new_columns=["index", "pred2"],
        ),
        on="index",
        how="left",
    )
    .join(
        pl.read_csv(
            CFG.save_dir + "exp011.csv",
            has_header=False,
            new_columns=["index", "pred3"],
        ),
        on="index",
        how="left",
    )
    # Step 1: same weighted blend as used on the OOF predictions.
    .with_columns(
        (
            pl.col("pred1") * best_params["a"]
            + pl.col("pred2") * best_params["b"]
            + pl.col("pred3") * best_params["c"]
        ).alias("pred")
    )
    # Step 2 (kept separate because it reads the "pred" column created above):
    # binarise with the threshold found on the OOF blend.
    .with_columns(
        pl.when(pl.col("pred") > threshold).then(1).otherwise(0).alias("pred")
    )
)

# Write the header-less (index, label) submission, then preview the first rows.
df.select(["index", "pred"]).write_csv(CFG.save_dir + f"{CFG.filename}.csv", has_header=False)
df.select(["index", "pred"]).head(5)

index,pred
i64,i32
471283,0
471284,0
471285,0
471286,1
471287,0
