In [1]:
# ===============================================================
#  Library
# ===============================================================
import numpy as np
import polars as pl

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import warnings
warnings.simplefilter("ignore")

import lightgbm as lgb

import sys
sys.path.append("G:/マイドライブ/signate_MUFJ2023/")
from MUFJ.utils import get_score, seed_everything
from MUFJ.preprocessing import CustomOrdinalEncoder

from math import comb
import xgboost as xgb
from tqdm.auto import tqdm
from itertools import combinations

In [2]:
# ===============================================================
#  CFG
# ===============================================================
class CFG:
    """Experiment-wide settings: run switches, paths and feature lists."""

    debug = False  # when True, the data-loading cell subsamples train/test
    seed = 42
    n_splits = 5  # number of CV folds
    num_cores = 4  # forwarded to LightGBM as "num_threads"
    data_dir = "G:/マイドライブ/signate_MUFJ2023/data/"
    stopping_rounds = 100  # early-stopping patience used by the trainers
    save_dir = "G:/マイドライブ/signate_MUFJ2023/exp/"
    filename = "exp003"  # stem of the OOF / submission files

    # Columns fed to the models as plain numeric inputs.
    numerical_features = [
        "amount",
        "cards_issued",
        "credit_limit",
        "year_pin_last_changed",
        "current_age",
        "retirement_age",
        "birth_year",
        "birth_month",
        "latitude",
        "longitude",
        "per_capita_income_zipcode",
        "yearly_income_person",
        "total_debt",
        "fico_score",
        "num_credit_cards",
        "expires_month",
        "expires_year",
        "acct_open_date_month",
        "acct_open_date_year",
        "YearsFromAcctOpenToPinChange",
        "DiffNonFraudAvgAmount_per_user",
    ]

    # Columns that are ordinal-encoded per fold before being used.
    categorical_features = [
        "errors?",
        "merchant_id",
        "merchant_city",
        "merchant_state",
        "zip",
        "mcc",
        "use_chip",
        "card_brand",
        "card_type",
        "has_chip",
        "gender",
        "city",
        "state",
        "zipcode",
        "card_id",
        "user_id",
        "same_zipcode_as_zip",
        "city_is_ONLINE",
    ]

    target_cols = ["is_fraud?"]

In [3]:
# ===============================================================
#  Utils
# ===============================================================
# Fix the random state once, up front, using the project helper from
# MUFJ.utils (presumably seeds random / numpy -- confirm in that module).
seed_everything(CFG.seed)

In [4]:
# ===============================================================
#  Data Loading
# ===============================================================
# Read the four CSV tables from CFG.data_dir.
train, test, card, user = [
    pl.read_csv(CFG.data_dir+csv_name)
    for csv_name in ("train.csv", "test.csv", "card.csv", "user.csv")
]

# Tag each row with its origin so the combined frame can be split again later.
# Test rows also get a null Int64 target so both frames share the same columns.
train_flag = pl.lit("train").alias("flag")
test_extra_cols = [
    pl.lit(None, dtype=pl.Int64).alias("is_fraud?"),
    pl.lit("test").alias("flag"),
]
train = train.with_columns(train_flag)
test = test.with_columns(test_extra_cols)

# Debug mode: work on a small, seeded sample of each frame.
if CFG.debug:
    train, test = (
        train.sample(n=10000, seed=CFG.seed),
        test.sample(n=1000, seed=CFG.seed),
    )

# Combine train and test, then attach card attributes (per user/card pair)
# and user attributes (per user) with left joins.
all_data = (
    pl.concat([train, test], how="align")
    .join(card, on=["user_id", "card_id"], how="left")
    .join(user, on="user_id", how="left")
)

In [5]:
# ===============================================================
#  Preprocessing
# ===============================================================
def preprocessing(all_data: pl.DataFrame) -> pl.DataFrame:
    """Parse raw string columns and add the row-wise engineered features.

    Steps, in order:
      1. The money-like string columns are converted to Float64 after dropping
         their first character (presumably a currency symbol -- confirm
         against the CSVs).
      2. "expires" and "acct_open_date" ("%m/%Y" strings) become Date columns.
      3. Boolean flags "same_zipcode_as_zip" and "city_is_ONLINE" and the
         string key "user_card_id" ("<user_id>-<card_id>") are added.
      4. Year / month parts of the two dates and "income_transaction_ratio"
         are added.
      5. "YearsFromAcctOpenToPinChange" is added.

    Args:
        all_data: Combined train + test frame already joined with the card
            and user tables.

    Returns:
        A new DataFrame with the converted and added columns.
    """
    money_columns = [
        "amount",
        "total_debt",
        "credit_limit",
        "yearly_income_person",
        "per_capita_income_zipcode",
    ]

    all_data = all_data.with_columns(
        # str -> float: `str.slice(1)` drops the first character natively,
        # replacing a per-row Python lambda (`x[1:]`) with the same result.
        [pl.col(name).str.slice(1).cast(pl.Float64) for name in money_columns]
        + [
            # str -> Date
            pl.col("expires").str.strptime(dtype=pl.Date, format="%m/%Y"),
            pl.col("acct_open_date").str.strptime(dtype=pl.Date, format="%m/%Y"),

            # bool
            (pl.col("zip") == pl.col("zipcode")).alias("same_zipcode_as_zip"),
            #(pl.col("state") == pl.col("merchant_state")).alias("same_state"),
            #(pl.col("city") == pl.col("merchant_city")).alias("same_city"),
            (pl.col("merchant_city") == "ONLINE").alias("city_is_ONLINE"),
            #pl.when((pl.col("merchant_city").is_null())&(pl.col("merchant_city") != "ONLINE")) ## TODO: can this be expressed more neatly?
            #.then(pl.lit(True))
            #.otherwise(pl.lit(False))
            #.alias("city_is_not_America"),

            # user_id + card_id
            (pl.col("user_id").cast(pl.Utf8) + "-" + pl.col("card_id").cast(pl.Utf8)).alias("user_card_id"),
        ]
    )

    all_data = all_data.with_columns(
        [
            # Date -> year, month
            pl.col("expires").dt.year().suffix("_year"),
            pl.col("expires").dt.month().suffix("_month"),
            pl.col("acct_open_date").dt.year().suffix("_year"),
            pl.col("acct_open_date").dt.month().suffix("_month"),

            # feature engineering (1e-9 guards against division by zero)
            #(pl.col("amount") - pl.col("credit_limit")).cast(pl.Float64).alias("remaining_credit"),
            #(pl.col("amount") / (pl.col("yearly_income_person") + 1e-9)).alias("income_transaction_ratio"),
            (pl.col("amount") / (pl.col('per_capita_income_zipcode') + 1e-9)).alias("income_transaction_ratio"),
        ]
    )

    # Needs "acct_open_date_year" from the previous step, hence a separate call.
    all_data = all_data.with_columns(
        [
            #(2023 - pl.col('year_pin_last_changed')).alias("YearsSincePinChange"),
            (pl.col("year_pin_last_changed") - pl.col("acct_open_date_year")).alias("YearsFromAcctOpenToPinChange"),
            #(pl.col("retirement_age") - pl.col("current_age")).alias("YearsUntilRetirement"),
            #(pl.col("expires_year") - pl.col("year_pin_last_changed")).alias("YearsFromPinChangeToExpires"),
        ]
    )

    return all_data
all_data = preprocessing(all_data)

In [6]:
# ===================================================================
#  Cross Validation
# ===================================================================
# Add a placeholder "fold" column, then split the combined frame back into
# its train / test parts (train rows are ordered by "index").
all_data = all_data.with_columns(pl.lit(None).alias("fold"))
train = all_data.filter(
    pl.col("flag") == "train"
).sort(by="index")
test = all_data.filter(
    pl.col("flag") == "test"
)

# Build the CV so that user_card_id and is_fraud? are spread across all folds.
skf = MultilabelStratifiedKFold(n_splits=CFG.n_splits, shuffle=True, random_state=CFG.seed)

# `split` yields row POSITIONS, not values of the "index" column.  Assign the
# fold ids by position so the result is also correct when "index" is not a
# contiguous 0..n-1 range (e.g. after the CFG.debug subsampling).
fold_ids = np.full(train.height, -1, dtype=np.int32)
for i, (_, val) in enumerate(skf.split(X=train, y=train[["is_fraud?", "user_card_id"]])):
    fold_ids[val] = i
assert (fold_ids >= 0).all(), "every train row must belong to exactly one fold"
train = train.with_columns(pl.Series("fold", fold_ids))

# Sanity checks: fold sizes and the class balance inside each fold.
display(train["fold"].value_counts())
display(train.groupby("fold").agg(
    pl.col("is_fraud?").value_counts()
    ).explode(columns="is_fraud?"))

fold,counts
i32,u32
0,94257
1,94257
2,94257
3,94256
4,94256


fold,is_fraud?
i32,struct[2]
0,"{1,6526}"
0,"{0,87731}"
1,"{1,6527}"
1,"{0,87730}"
2,"{1,6526}"
2,"{0,87731}"
3,"{0,87730}"
3,"{1,6526}"
4,"{0,87730}"
4,"{1,6526}"


In [7]:
# ===============================================================
#  Preprocessing_per_fold
# ===============================================================
def preprocessing_per_fold(CFG, train: pl.DataFrame, test: pl.DataFrame, fold:int):
    """Build the fold-specific training, validation and test frames.

    Every statistic is fitted on the training part of the fold only and then
    applied unchanged to the validation and test parts.

    Args:
        CFG: Config object; only ``CFG.categorical_features`` is read here.
        train: Training frame with a "fold" column.
        test: Test frame.
        fold: Fold id used as the validation part.

    Returns:
        Tuple ``(X_train, X_valid, test_df)`` with the added columns
        "NonFraudAvgAmount_per_user", "DiffNonFraudAvgAmount_per_user" and
        whatever columns ``CustomOrdinalEncoder`` produces.
    """
    # data split
    fit_part = train.filter(pl.col("fold") != fold)
    valid_part = train.filter(pl.col("fold") == fold)
    test_part = test.clone()

    # Mean transaction amount of each user's non-fraud rows (training part only).
    nonfraud_avg = (
        fit_part.groupby(["user_id", "is_fraud?"])
        .agg(pl.col("amount").mean())
        .filter(pl.col("is_fraud?") == 0)
        .rename({"amount": "NonFraudAvgAmount_per_user"})
        .select(["user_id", "NonFraudAvgAmount_per_user"])
    )
    # Difference between the row's own amount and that per-user mean.
    diff_features = [
        (pl.col("amount") - pl.col("NonFraudAvgAmount_per_user")).alias("DiffNonFraudAvgAmount_per_user"),
    ]
    fit_part, valid_part, test_part = [
        frame.join(nonfraud_avg, on="user_id", how="left").with_columns(diff_features)
        for frame in (fit_part, valid_part, test_part)
    ]

    # Experiments that were tried here and are currently disabled (their CV
    # scores are recorded in the results table further down the notebook):
    #   - fraud / non-fraud average amount per user_card_id, plus differences
    #   - fraud / non-fraud average amount per (user_id, merchant_id), plus differences
    #   - merchant_id counts per user and per user & card
    #   - target encoding over 1-, 2- and 3-way combinations of the categorical
    #     features, and per user / per user & card
    #   - count encoding and frequency encoding per user / per user & card

    # OrdinalEncoder: fitted on the training part, reused for valid / test.
    # Downstream cells expect "<col>_category" columns, so the encoder
    # presumably names its outputs that way -- confirm in MUFJ.preprocessing.
    oe = CustomOrdinalEncoder(encoded_missing_value=-1)
    category_cols = CFG.categorical_features
    encoded_fit = oe.fit_transform(fit_part[category_cols])
    encoded_valid = oe.transform(valid_part[category_cols])
    encoded_test = oe.transform(test_part[category_cols])

    fit_part = pl.concat([fit_part, encoded_fit], how="horizontal")
    valid_part = pl.concat([valid_part, encoded_valid], how="horizontal")
    test_part = pl.concat([test_part, encoded_test], how="horizontal")

    return fit_part, valid_part, test_part

In [8]:
# ===================================================================
#  evaluate
# ===================================================================
def train_lgb(CFG, lgb_param):
    """Train LightGBM with K-fold CV and collect OOF / test predictions.

    Reads the module-level ``train`` and ``test`` frames.  For each fold the
    data is rebuilt with ``preprocessing_per_fold``, a model is trained with
    early stopping on the validation part, and predictions at the best
    iteration are stored for the validation and test rows.

    Args:
        CFG: Config object; reads ``n_splits``, ``use_features``,
            ``target_cols`` and ``stopping_rounds``.
        lgb_param: Parameter dict passed to ``lgb.train``.

    Returns:
        Tuple ``(score, threshold, oof_df, test_df)``: the values returned by
        ``get_score`` on the OOF predictions, the validation rows of all
        folds with a "pred" column, and the last fold's test frame with
        "pred" set to the mean of the per-fold test predictions.
    """
    oof_frames = []
    preds = []
    # Loop-invariant: only the encoded "*_category" columns are categorical.
    categorical_features = [col for col in CFG.use_features if "_category" in col]

    for fold in range(CFG.n_splits):
        X_train, X_valid, test_df = preprocessing_per_fold(CFG, train, test, fold)

        # train
        lgb_train = lgb.Dataset(
            X_train[CFG.use_features].to_pandas(),
            X_train[CFG.target_cols].to_pandas(),
            categorical_feature=categorical_features,
        )
        lgb_valid = lgb.Dataset(
            X_valid[CFG.use_features].to_pandas(),
            X_valid[CFG.target_cols].to_pandas(),
            categorical_feature=categorical_features,
        )
        # The categorical columns are already declared on both Datasets, so
        # they are not repeated in the `lgb.train` call.
        model = lgb.train(
            lgb_param,
            lgb_train,
            valid_sets=[lgb_valid],
            callbacks=[
                lgb.early_stopping(stopping_rounds=CFG.stopping_rounds, verbose=True),
                lgb.log_evaluation(period=200),
            ],
        )

        # valid
        X_valid = X_valid.with_columns(
            pl.Series(model.predict(X_valid[CFG.use_features].to_pandas(), num_iteration=model.best_iteration)).alias("pred")
        )
        #print(f"fold{fold}:", get_score(y_true=X_valid[CFG.target_cols], y_pred=X_valid["pred"]))

        # oof: collect per-fold frames and concatenate once after the loop
        oof_frames.append(X_valid)

        # predict
        preds.append(model.predict(test_df[CFG.use_features].to_pandas(), num_iteration=model.best_iteration))

    oof_df = pl.concat(oof_frames)
    # Average the per-fold test predictions.
    test_df = test_df.with_columns(
        pl.Series(np.mean(preds, axis=0)).alias("pred")
    )
    score, threshold = get_score(oof_df[CFG.target_cols], oof_df["pred"], step=0.005, return_threshold=True)
    return score, threshold, oof_df, test_df


def train_xgb(CFG, xgb_param):
    """Train XGBoost with K-fold CV and collect OOF / test predictions.

    Reads the module-level ``train`` and ``test`` frames.  Mirrors
    ``train_lgb``: per fold the data is rebuilt with
    ``preprocessing_per_fold``, a booster is trained with early stopping, and
    predictions from the best iteration are stored.

    Args:
        CFG: Config object; reads ``n_splits``, ``use_features``,
            ``target_cols`` and ``stopping_rounds``.
        xgb_param: Parameter dict passed to ``xgb.train``.

    Returns:
        Tuple ``(score, threshold, oof_df, test_df)``: the values returned by
        ``get_score`` on the OOF predictions, the validation rows of all
        folds with a "pred" column, and the last fold's test frame with
        "pred" set to the mean of the per-fold test predictions.
    """
    oof_frames = []
    preds = []
    for fold in range(CFG.n_splits):
        X_train, X_valid, test_df = preprocessing_per_fold(CFG, train, test, fold)

        # All matrices are built the same way (numpy input) so training and
        # prediction see identically laid-out features.
        d_train = xgb.DMatrix(data=X_train[CFG.use_features].to_numpy(), label=X_train[CFG.target_cols].to_numpy(), enable_categorical=True)
        d_valid = xgb.DMatrix(data=X_valid[CFG.use_features].to_numpy(), label=X_valid[CFG.target_cols].to_numpy(), enable_categorical=True)
        d_test = xgb.DMatrix(data=test_df[CFG.use_features].to_numpy(), enable_categorical=True)
        # Early stopping monitors the last entry of the watchlist ('valid').
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]

        # train
        model = xgb.train(
            params=xgb_param,
            dtrain=d_train,
            num_boost_round=1000,
            evals=watchlist,
            early_stopping_rounds=CFG.stopping_rounds,
            verbose_eval=50,
        )

        # `ntree_limit` / `best_ntree_limit` are deprecated in XGBoost;
        # `iteration_range` selects the same boosting rounds.
        best_range = (0, model.best_iteration + 1)

        # valid: reuse the matrix that was already built for early stopping
        X_valid = X_valid.with_columns(
            pl.Series(model.predict(d_valid, iteration_range=best_range)).alias("pred")
        )
        print(f"fold{fold}:", get_score(y_true=X_valid[CFG.target_cols], y_pred=X_valid["pred"]))

        # oof: collect per-fold frames and concatenate once after the loop
        oof_frames.append(X_valid)

        # predict
        preds.append(model.predict(d_test, iteration_range=best_range))

    oof_df = pl.concat(oof_frames)
    # Average the per-fold test predictions so the returned frame carries a
    # "pred" column, as it does in train_lgb.
    test_df = test_df.with_columns(
        pl.Series(np.mean(preds, axis=0)).alias("pred")
    )
    score, threshold = get_score(oof_df[CFG.target_cols], oof_df["pred"], step=0.005, return_threshold=True)
    return score, threshold, oof_df, test_df

In [9]:
# ===================================================================
#  evaluate
# ===================================================================
# Model inputs: the numeric columns plus the "<col>_category" version of each
# categorical column (train_lgb treats every "*_category" column as categorical).
CFG.use_features = CFG.numerical_features + [col+"_category" for col in CFG.categorical_features]

# LightGBM parameters for the binary fraud classifier.
lgb_param = {
    "task":"train",
    "objective": "binary",
    "boosting":"gbdt",
    "num_iterations": 10000, # upper bound only; early stopping picks the actual count (default: 100)
    "learning_rate": 0.05, # default: 0.1
    "num_leaves": int((2**6) * 0.7), # max leaves per tree, chosen as 0.7 * 2**max_depth (= 44)
    "max_depth": 6, # default -1 (no limit); set to match the xgboost / catboost setting
    "min_child_weight":1e-3, # double: minimal sum hessian in one leaf
    "min_data_in_leaf":20, # minimal number of data in one leaf
    # NOTE(review): in LightGBM `alpha` is the Huber / quantile-loss parameter,
    # not L1 regularisation (that is `lambda_l1` / `reg_alpha`), so it looks
    # unused with objective="binary" -- confirm the intent.
    "alpha":0.9,
    # Fraction of features sampled for each tree (alias of feature_fraction), 0 < value <= 1.
    "colsample_bytree":0.4,
    "lambda": 0, # alias of lambda_l2 (L2 regularisation), >= 0.0
    "subsample":1, # alias of bagging_fraction, 0.0 < value <= 1.0; 1 disables row subsampling
    "num_threads": CFG.num_cores,
    "metric": 'binary_logloss',
    "seed" : CFG.seed,
    "verbosity": -1, 
}

# Run the full CV and report the OOF score with its decision threshold
# (printed in green via ANSI escape codes).
best_score, threshold, oof_df, test_df = train_lgb(CFG, lgb_param)
print('\033[32m'+"====== CV score ======"+'\033[0m')
print('\033[32m'+f'{best_score} (threshold: {threshold})'+'\033[0m')

Training until validation scores don't improve for 100 rounds
[200]	valid_0's binary_logloss: 0.1398
[400]	valid_0's binary_logloss: 0.133975
[600]	valid_0's binary_logloss: 0.131843
[800]	valid_0's binary_logloss: 0.130667
[1000]	valid_0's binary_logloss: 0.130154
[1200]	valid_0's binary_logloss: 0.129845
[1400]	valid_0's binary_logloss: 0.129695
Early stopping, best iteration is:
[1468]	valid_0's binary_logloss: 0.129578
Training until validation scores don't improve for 100 rounds
[200]	valid_0's binary_logloss: 0.142141
[400]	valid_0's binary_logloss: 0.135999
[600]	valid_0's binary_logloss: 0.134072
[800]	valid_0's binary_logloss: 0.132976
[1000]	valid_0's binary_logloss: 0.132201
[1200]	valid_0's binary_logloss: 0.131993
[1400]	valid_0's binary_logloss: 0.131819
Early stopping, best iteration is:
[1368]	valid_0's binary_logloss: 0.13174
Training until validation scores don't improve for 100 rounds
[200]	valid_0's binary_logloss: 0.141513
[400]	valid_0's binary_logloss: 0.13547
[6

  0%|          | 0/200 [00:00<?, ?it/s]

[32m0.6532026768642447 (threshold: 0.355)[0m


| id | description | CV |  |
| - | - | - | - |
| 0 | baseline | 0.6503219747786423 (threshold: 0.33) | |
| 1 | add same_zipcode_as_zip | 0.6522468256956091 (threshold: 0.36) | 採用！ |
| 2 | add user_card_id | 0.6477621196981704 (threshold: 0.37) | 不採用 |  
| 3 | add city_is_ONLINE | 0.6527855745513311 (threshold: 0.35000000000000003) | 採用！ |
| 4 | add city_is_not_America | 0.6527855745513311 (threshold: 0.35000000000000003) | 不採用(これ変わらないのか) |
| 5 | add remaining_credit | 0.6522429938135965 (threshold: 0.34500000000000003) | 不採用 |
| 6 | add "YearsSincePinChange" | 0.6497795839647335 (threshold: 0.36) | 不採用 |
| 7 | add "YearsFromAcctOpenToPinChange" | **0.6528226893020749** (threshold: 0.34) | 採用 |
| 8 | add "YearsUntilRetirement" | 0.6520358071036674 (threshold: 0.35000000000000003) | 不採用 |
| 9 | add YearsFromPinChangeToExpires | 0.6521451248705454 (threshold: 0.35000000000000003)| 不採用 |
| 10 | change 'num_credit_cards' numerical to categorical | 0.651323243426059 (threshold: 0.34500000000000003) | 不採用 |
| 11 | add "same_state" | 0.6507434409729858 (threshold: 0.34500000000000003) | 不採用 |
| 12 | add "same_city" | 0.6497568352494373 (threshold: 0.365) | 不採用 |
| 13 | add FraudAvgAmount_per_user | 0.6490824157490824 (threshold: 0.325) | 不採用 |
| 14 | add "NonFraudAvgAmount_per_user" | 0.6524137458457532 (threshold: 0.36) | 不採用 |
| 15 | add DiffFraudAvgAmount_per_user | 0.6521343926743872 (threshold: 0.34) | 不採用 |
| 16 | add DiffNonFraudAvgAmount_per_user | **0.6531318219983208** (threshold: 0.335) | 採用！ |
| 17 | add "FraudAvgAmount_per_user_card_id" | 0.6506323435532013 (threshold: 0.37) | 不採用 |
| 18 | add "NonFraudAvgAmount_per_user_card_id" | 0.6494871491576404 (threshold: 0.34) | 不採用 |
| 19 | add "DiffFraudAvgAmount_per_user_card_id" | 0.6497280129742878 (threshold: 0.34) | 不採用 |
| 20 | add "DiffNonFraudAvgAmount_per_user_card_id" | 0.6509064934181436 (threshold: 0.35000000000000003) | 不採用 |
| 21 | add "FraudAvgAmount_per_user*merchant_id" | 0.6103046901745978 (threshold: 0.28500000000000003) | 不採用 |
| 22 | add "NonFraudAvgAmount_per_user*merchant_id" | 0.5241076138561631 (threshold: 0.38) | 不採用 |
| 23 | add "DiffFraudAvgAmount_per_user*merchant_id" | 0.6083466812173269 (threshold: 0.27) | 不採用 |
| 24 | add "DiffNonFraudAvgAmount_per_user*merchant_id" | 0.5209444807765119 (threshold: 0.365) | 不採用 |
| 25 | add "merchant_id_count_per_user" | 0.6525988603004662 (threshold: 0.375) | 不採用(これ変わらないのか) |
| 26 | add "merchant_id_count_per_user_card" | 0.651119717124095 (threshold: 0.34) | 不採用 |
| 27 | n_splits: 5->10 | **0.6551747132568686 (threshold: 0.325)** | 保留(パラメータいじれば0.657くらいはいきそう?) |
| 28 | add "income_transaction_ratio" | 0.6513817187974492 (threshold: 0.34) | 不採用 |
| 28 | add "income_transaction_ratio"(ver2) | 0.6497960571040109 (threshold: 0.35000000000000003) | 不採用 |

target_encoding, count_encoding, frequency_encodingは後でやりましょう

num_leaves = int((2**max_depth) * 0.7)にした

| max_depth | CV |
| - | - |
| 6 | 0.6508343165346369 (threshold: 0.34) |
| 8 | 0.6522853769607017 (threshold: 0.335) |
| 10 | 0.6502891162281452 (threshold: 0.34500000000000003) |

learning_rateを変更

| max_depth | CV |
| - | - |
| 4 | 0.6514412566008543 (threshold: 0.34) |
| 6 | 0.6532026768642447 (threshold: 0.355) |
| 8 | 0.6530890498508736 (threshold: 0.34500000000000003) |
| 10 | 0.6524644104217029 (threshold: 0.32) |

In [14]:
# ===================================================================
#  save_data
# ===================================================================

# oof_df
# Out-of-fold predictions (raw scores), written with a header row.
oof_path = CFG.save_dir+f"oof_df_{CFG.filename}.csv"
oof_df.select(["index", "pred"]).write_csv(oof_path, has_header=True)

# Test predictions: turn the averaged scores into 0/1 labels using the
# threshold found on the OOF predictions, then write them without a header.
binary_pred = (
    pl.when(pl.col("pred") > threshold)
    .then(1)
    .otherwise(0)
    .alias("pred")
)
test_df = test_df.with_columns(binary_pred)

submission_path = CFG.save_dir+f"{CFG.filename}.csv"
test_df.select(["index", "pred"]).write_csv(submission_path, has_header=False)

In [15]:
# Preview the two columns that were written to the submission file.
test_df[["index", "pred"]]

index,pred
i64,i32
471283,0
471284,0
471285,0
471286,1
471287,0
471288,0
471289,0
471290,0
471291,0
471292,0


In [11]:
# Disabled XGBoost experiment, kept as an inert string (nothing here runs).
# Before re-enabling, note:
#   - `X_train` only exists inside the training functions, so the
#     "target_mean" feature selection below needs a frame that is in scope;
#   - the second `xgb_param` dict uses Optuna-style `suggest_*` calls and
#     must live inside an objective function that receives the trial object.
"""
CFG.use_features = CFG.numerical_features + [col+"_category" for col in CFG.categorical_features] + [col for col in X_train.columns if "target_mean" in col]

xgb_param = {
    "booster":"gbtree",
    "verbosity": 1,
    "nthread": 4,
    "eta": 0.3,
    "gamma": 0,
    "max_depth": 6,
    "task": "train",
    "objective": "binary:logistic", # binary:logistic, binary:logitraw, binary:hinge
    "eval_metric": "logloss", # auc
    "seed": CFG.seed,
}

# The Kaggle Book: Hyperparameter optimization

xgb_param = {
    #"n_estimators": study.suggest_int("n_estimators", 10, 5000),
    "learning_rate": study.suggest_float("learning_rate", 0.01, 1.0, log=True),
    "min_child_weight": study.suggest_int("min_child_weight", 1, 10),
    "max_depth": study.suggest_int("max_depth", 1, 50),
    "max_delta_step": study.suggest_int("max_delta_step", 0, 20),
    "subsample": study.suggest_float("subsample", 0.1, 1.0),
    "colsample_bytree": study.suggest_float("colsample_bytree", 0.1, 1.0),
    "colsample_bylevel": study.suggest_float("colsample_bylevel", 0.1, 1.0),
    "reg_lambda": study.suggest_float("reg_lambda", 1e-9, 100.0, log=True),
    "reg_alpha": study.suggest_float("reg_alpha", 1e-9, 100.0, log=True),
    "gamma": study.suggest_float("gamma", 1e-9, 100.0, log=True),
    "scale_pos_weight": study.suggest_float("scale_pos_weight", 1e-6, 500.0, log=True),
    "task": "train",
    "objective": "binary:logistic", # binary:logistic, binary:logitraw, binary:hinge
    "eval_metric": "logloss", # auc
    "seed": CFG.seed,
}


xgb_best_score, xgb_threshold, xgb_oof_df, xgb_test_df = train_xgb(CFG, xgb_param)
print('\033[32m'+"====== CV score ======"+'\033[0m')
print('\033[32m'+f'{xgb_best_score} (threshold: {xgb_threshold})'+'\033[0m')
"""


