In [1]:
# ===============================================================
#  Library
# ===============================================================
import numpy as np
import polars as pl

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import warnings
warnings.simplefilter("ignore")

import lightgbm as lgb

import sys
sys.path.append("G:/マイドライブ/signate_MUFJ2023/")
from MUFJ.utils import get_score, seed_everything
from MUFJ.preprocessing import CustomOrdinalEncoder

from tqdm.auto import tqdm

In [2]:
# ===============================================================
#  CFG
# ===============================================================
class CFG:
    """Single place for every tunable setting used by the notebook cells."""

    # When True, the data-loading cell works on a small seeded sample.
    debug = False
    # Tree depth; also used to derive num_leaves in the LightGBM parameters.
    max_depth = 6
    # Seed passed to seed_everything, sampling, the CV splitter and LightGBM.
    seed = 42
    # Number of cross-validation folds.
    n_splits = 5
    # Passed to LightGBM as num_threads.
    num_cores = 4
    # Folder the raw CSV files are read from.
    data_dir = "G:/マイドライブ/signate_MUFJ2023/data/"
    # Folder the OOF / submission CSV files are written to.
    save_dir = "G:/マイドライブ/signate_MUFJ2023/exp/"
    # Experiment tag used in the output file names.
    filename = "exp008"

    # Columns handed to the model as numeric features.
    numerical_features = [
        "amount",
        "cards_issued",
        "credit_limit",
        "year_pin_last_changed",
        "current_age",
        "retirement_age",
        "birth_year",
        "birth_month",
        "latitude",
        "longitude",
        "per_capita_income_zipcode",
        "yearly_income_person",
        "total_debt",
        "fico_score",
        "num_credit_cards",
        "expires_month",
        "expires_year",
        "acct_open_date_month",
        "acct_open_date_year",
        "NonFraudAvgAmount_per_user_card",
        "merchant_id_count_per_user",
    ]

    # Columns that are ordinal-encoded per fold before being handed to the model.
    categorical_features = [
        "errors?",
        "merchant_id",
        "merchant_city",
        "merchant_state",
        "zip",
        "mcc",
        "use_chip",
        "card_brand",
        "card_type",
        "has_chip",
        "gender",
        "city",
        "state",
        "zipcode",
        "card_id",
        "user_id",
        "same_zipcode_as_zip",
        "city_is_not_America",
    ]

    # Label column(s).
    target_cols = ["is_fraud?"]
    # When True, train_lgb_per_user also binarises predictions with a per-user threshold.
    threshold_per_user = False

In [3]:
# ===============================================================
#  Utils
# ===============================================================
# Seed the run with CFG.seed through the project helper (implementation lives in MUFJ.utils).
seed_everything(CFG.seed)

In [4]:
# ===============================================================
#  Data Loading
# ===============================================================
# Read the four raw CSV tables from CFG.data_dir.
train = pl.read_csv(CFG.data_dir + "train.csv")
test = pl.read_csv(CFG.data_dir + "test.csv")
card = pl.read_csv(CFG.data_dir + "card.csv")
user = pl.read_csv(CFG.data_dir + "user.csv")
if CFG.debug:
    # Debug runs work on a small, seeded sample of each set.
    train = train.sample(n=10000, seed=CFG.seed)
    test = test.sample(n=1000, seed=CFG.seed)

# Tag every row with its origin so the two sets can be separated again later.
train = train.with_columns(pl.lit("train").alias("flag"))
test = test.with_columns(
    [
        # All-null label column so train and test expose the same columns.
        pl.lit(None, dtype=pl.Int64).alias("is_fraud?"),
        pl.lit("test").alias("flag"),
    ]
)

# Stack train on top of test. "diagonal" matches columns by name, so the
# differing column order of the two frames is not a problem. The previous
# how="align" is a join-style alignment (outer join on every shared column,
# then a sort), which is unnecessary work for a plain row-wise stack.
all_data = pl.concat([train, test], how="diagonal")

# Attach card-level and user-level attributes to each row.
all_data = all_data.join(card, on=["user_id", "card_id"], how="left")
all_data = all_data.join(user, on="user_id", how="left")

In [5]:
# ===============================================================
#  Preprocessing
# ===============================================================
def preprocessing(all_data: pl.DataFrame) -> pl.DataFrame:
    """Apply the fold-independent cleaning and feature derivation.

    Steps performed on the combined train/test frame:
      * strip the first character of the money-like string columns and cast
        them to Float64,
      * parse ``expires`` / ``acct_open_date`` ("%m/%Y") into dates and derive
        ``*_year`` / ``*_month`` columns from them,
      * build ``user_card_id`` as "<user_id>-<card_id>",
      * add the boolean flags ``same_zipcode_as_zip`` and ``city_is_not_America``,
      * add an all-null ``fold`` placeholder column.

    Args:
        all_data: Combined frame produced by the data-loading cell.

    Returns:
        A new frame with the columns above added or replaced.
    """
    # Columns stored as strings with a one-character prefix (presumably a
    # currency sign -- verify against the raw CSVs).
    money_cols = [
        "amount",
        "total_debt",
        "credit_limit",
        "yearly_income_person",
        "per_capita_income_zipcode",
    ]

    all_data = all_data.with_columns(
        [
            # str -> float: vectorised slice instead of a per-row Python lambda.
            pl.col(money_cols).str.slice(1).cast(pl.Float64),

            # str -> Date
            pl.col("expires").str.strptime(dtype=pl.Date, format="%m/%Y"),
            pl.col("acct_open_date").str.strptime(dtype=pl.Date, format="%m/%Y"),

            # user_id + card_id
            (pl.col("user_id").cast(pl.Utf8) + "-" + pl.col("card_id").cast(pl.Utf8)).alias("user_card_id"),

            # bool
            (pl.col("zip") == pl.col("zipcode")).alias("same_zipcode_as_zip"),
            # NOTE(review): this condition can never be true. When
            # merchant_city is null, `merchant_city != "ONLINE"` evaluates to
            # null, so the AND is never True and the column is constant False.
            # Presumably the null check was meant for a different column
            # (e.g. zip or merchant_state) -- confirm the intent before
            # changing it; the logic is kept as-is here so scores stay
            # comparable.
            pl.when((pl.col("merchant_city").is_null()) & (pl.col("merchant_city") != "ONLINE"))
            .then(pl.lit(True))
            .otherwise(pl.lit(False))
            .alias("city_is_not_America"),
        ]
    )

    all_data = all_data.with_columns(
        [
            # Date -> year / month
            pl.col("expires").dt.year().suffix("_year"),
            pl.col("expires").dt.month().suffix("_month"),
            pl.col("acct_open_date").dt.year().suffix("_year"),
            pl.col("acct_open_date").dt.month().suffix("_month"),

            # Placeholder; the real fold numbers are assigned in kfold().
            pl.lit(None).alias("fold"),
        ]
    )

    return all_data
all_data = preprocessing(all_data)

In [6]:
# ===================================================================
#  Cross Validation
# ===================================================================
def kfold(df: pl.DataFrame) -> pl.DataFrame:
    """Assign a cross-validation fold number to every row.

    Rows are sorted by ``index``, given a positional ``id`` column, and split
    with ``MultilabelStratifiedKFold`` stratified on ``is_fraud?`` and
    ``card_id``. The fold number (0 .. CFG.n_splits - 1) of the validation
    split a row belongs to is stored in the ``fold`` column.

    Args:
        df: Frame containing at least ``index``, ``is_fraud?`` and ``card_id``.

    Returns:
        The sorted frame with ``id`` and ``fold`` (Int32) columns set.
    """
    df = df.sort("index")
    df = df.with_columns(pl.Series("id", range(df.height)))

    splitter = MultilabelStratifiedKFold(n_splits=CFG.n_splits, shuffle=True, random_state=CFG.seed)

    # Collect the assignment in one vector and write the column once, instead
    # of rewriting the whole column with an is_in() scan for every split.
    # This also removes the need for a pre-existing "fold" column.
    fold_of_row = np.full(df.height, -1, dtype=np.int32)
    for fold_no, (_, valid_idx) in enumerate(splitter.split(X=df, y=df[["is_fraud?", "card_id"]])):
        # valid_idx holds row positions, which equal the "id" values above.
        fold_of_row[valid_idx] = fold_no

    return df.with_columns(pl.Series("fold", fold_of_row))

CFG.n_splits = 50でも使えそう

In [7]:
# ===============================================================
#  Preprocessing_per_fold
# ===============================================================
def preprocessing_per_fold(CFG, train: pl.DataFrame, test: pl.DataFrame, fold:int):
    """Build the fold-specific features for one CV split.

    All statistics (mean amounts, counts, ordinal encoding) are computed on
    the training part only and then joined onto the training part, the
    validation part and a copy of the test frame.

    Args:
        CFG: Settings object; only ``categorical_features`` is read here.
        train: Training rows carrying a ``fold`` column.
        test: Test rows.
        fold: Fold number used as the validation part.

    Returns:
        Tuple ``(X_train, X_valid, test_df)`` with the extra columns attached.
    """
    # [training part, validation part, test copy]; kept together so that each
    # enrichment step is applied to all three in the same way.
    frames = [
        train.filter(pl.col("fold") != fold),
        train.filter(pl.col("fold") == fold),
        test.clone(),
    ]

    # Mean transaction amount per user_card_id, separately for fraudulent and
    # non-fraudulent transactions, computed on the training part.
    mean_amount = frames[0].groupby(by=["user_card_id", "is_fraud?"], maintain_order=True).agg(
        pl.col("amount").mean()
    )
    non_fraud_mean = (
        mean_amount.filter(pl.col("is_fraud?") == 0)
        .rename({"amount": "NonFraudAvgAmount_per_user_card"})
    )[["user_card_id", "NonFraudAvgAmount_per_user_card"]]
    fraud_mean = (
        mean_amount.filter(pl.col("is_fraud?") == 1)
        .rename({"amount": "FraudAvgAmount_per_user_card"})
    )[["user_card_id", "FraudAvgAmount_per_user_card"]]

    frames = [
        frame.join(non_fraud_mean, on="user_card_id", how="left")
             .join(fraud_mean, on="user_card_id", how="left")
        for frame in frames
    ]

    # Count encoding: how often each value occurs per user, then per user and
    # card, again counted on the training part only.
    for col in ["merchant_id"]:
        groupings = (
            (["user_id", col], "per_user"),
            (["user_id", "card_id", col], "per_user_card"),
        )
        for keys, scope in groupings:
            counts = frames[0].groupby(by=keys, maintain_order=True).count().rename(
                {"count": f"{col}_count_{scope}"})
            frames = [frame.join(counts, on=keys, how="left") for frame in frames]

    X_train, X_valid, test_df = frames

    # Ordinal encoding fitted on the training part. The encoder is a project
    # class; presumably it returns "<col>_category" columns, since those are
    # the names the model cell asks for -- verify in MUFJ.preprocessing.
    oe = CustomOrdinalEncoder(encoded_missing_value=-1)
    train_codes = oe.fit_transform(X_train[CFG.categorical_features])
    X_train = pl.concat([X_train, train_codes], how="horizontal")
    valid_codes = oe.transform(X_valid[CFG.categorical_features])
    X_valid = pl.concat([X_valid, valid_codes], how="horizontal")
    test_codes = oe.transform(test_df[CFG.categorical_features])
    test_df = pl.concat([test_df, test_codes], how="horizontal")

    return X_train, X_valid, test_df

In [8]:
# ===============================================================
#  model
# ===============================================================
# LightGBM parameters shared by every per-user model trained in train_lgb_per_user.
lgb_param = {
    "task":"train",
    "objective": "binary",
    "boosting":"gbdt",
    "num_iterations": 10000, # upper bound on boosting rounds (default: 100); training also uses an early-stopping callback
    "learning_rate": 0.05, # default: 0.1
    "num_leaves": int((2**CFG.max_depth) * 0.7), # max number of leaves in one tree: 70% of a full tree of max_depth (44 for max_depth=6)
    "max_depth": CFG.max_depth, # default -1, int: limit the max depth for tree model; kept in line with the xgboost / catboost settings
    "min_child_weight":1e-3, # double: minimal sum hessian in one leaf
    "min_data_in_leaf":20, # minimal number of data in one leaf
    # NOTE(review): in the LightGBM parameter docs `alpha` belongs to the
    # huber / quantile objectives; L1 regularisation is `lambda_l1` (alias
    # `reg_alpha`). With objective="binary" this entry presumably has no
    # effect -- confirm whether L1 regularisation was intended.
    "alpha":0.9, # double, constraints: alpha > 0.0
    "colsample_bytree":0.4, # 0 < "colsample_bytree" < 1
    # LightGBM will randomly select a subset of features on each iteration (tree) if feature_fraction is smaller than 1.0
    "lambda": 0, # lambda_l2 >= 0.0: L2 regularization
    "subsample":1, # 0.0 < bagging_fraction <= 1.0
    "num_threads": CFG.num_cores,
    "metric": 'binary_logloss',
    "seed" : CFG.seed,
    "verbosity": -1, 
}

# Model inputs: the numeric columns plus the "<col>_category" version of every categorical column.
CFG.use_features = CFG.numerical_features + [col+"_category" for col in CFG.categorical_features]


def train_lgb_per_user(CFG, train, test):
    """Cross-validate a LightGBM classifier on the rows passed in.

    For each of ``CFG.n_splits`` folds the fold-specific features are built,
    a model is trained with early stopping on the validation part, and
    predictions are made for the validation part and for ``test``.

    Args:
        CFG: Settings object (``n_splits``, ``use_features``, ``target_cols``,
            ``threshold_per_user`` are read here).
        train: Labelled rows to cross-validate on.
        test: Unlabelled rows to predict.

    Returns:
        Tuple ``(oof_df, test_df)``:
          * ``oof_df`` -- validation rows of all folds with their out-of-fold
            prediction in ``pred_``.
          * ``test_df`` -- the test frame as prepared for the last fold, with
            the fold-averaged prediction in ``pred_``.
        When ``CFG.threshold_per_user`` is set, both frames also get a binary
        ``pred`` column based on the threshold returned by ``get_score``.
    """
    train = kfold(train)

    # Loop-invariant: the encoded categorical columns among the model inputs.
    categorical_features = [col for col in CFG.use_features if "_category" in col]

    fold_test_preds = []
    # Collected per fold and concatenated once after the loop, rather than
    # growing a frame by repeated concat from an empty placeholder.
    oof_parts = []
    for fold in range(CFG.n_splits):
        X_train, X_valid, test_df = preprocessing_per_fold(CFG, train, test, fold)

        lgb_train = lgb.Dataset(
            X_train[CFG.use_features].to_pandas(),
            X_train[CFG.target_cols].to_pandas(),
            categorical_feature=categorical_features,
        )
        lgb_valid = lgb.Dataset(
            X_valid[CFG.use_features].to_pandas(),
            X_valid[CFG.target_cols].to_pandas(),
            categorical_feature=categorical_features,
        )
        model = lgb.train(
            lgb_param,
            lgb_train,
            valid_sets=[lgb_valid],
            categorical_feature=categorical_features,
            callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)],
        )

        # Out-of-fold prediction for this fold's validation rows.
        valid_pred = model.predict(X_valid[CFG.use_features].to_pandas(), num_iteration=model.best_iteration)
        oof_parts.append(X_valid.with_columns(pl.Series("pred_", valid_pred)))

        # Test prediction of this fold's model; averaged over folds below.
        fold_test_preds.append(
            model.predict(test_df[CFG.use_features].to_pandas(), num_iteration=model.best_iteration)
        )

    oof_df = pl.concat(oof_parts)
    test_df = test_df.with_columns(
        pl.Series("pred_", np.mean(fold_test_preds, axis=0))
    )

    if CFG.threshold_per_user:
        _, threshold = get_score(y_true=oof_df["is_fraud?"], y_pred=oof_df["pred_"], step=0.01, return_threshold=True, disable=True)

        test_df = test_df.with_columns(
            pl.when(pl.col("pred_") > threshold)
            .then(1)
            .otherwise(0)
            .alias("pred")
        )
        oof_df = oof_df.with_columns(
            pl.when(pl.col("pred_") > threshold)
            .then(1)
            .otherwise(0)
            .alias("pred")
        )

    return oof_df, test_df

In [9]:
# ===============================================================
#  train
# ===============================================================
# Train one cross-validated model per user and gather the results.
# The loop variables are deliberately NOT called `user`, `train` or `test`:
# those names hold the frames loaded in the data-loading cell, and rebinding
# them here would silently change what later or re-run cells see.
oof_parts = []
test_parts = []

for user_id in tqdm(all_data["user_id"].unique(maintain_order=True)):
    # Rows of this user, split back into labelled and unlabelled parts.
    user_rows = all_data.filter(pl.col("user_id") == user_id)
    user_train = user_rows.filter(pl.col("flag") == "train")
    user_test = user_rows.filter(pl.col("flag") == "test")

    _oof_df, _test_df = train_lgb_per_user(CFG, user_train, user_test)
    oof_parts.append(_oof_df)
    test_parts.append(_test_df)

# Concatenate once at the end instead of growing the frames inside the loop.
oof_df = pl.concat(oof_parts)
test_df = pl.concat(test_parts)


  0%|          | 0/97 [00:00<?, ?it/s]

TypeError: DataFrame.rename() got an unexpected keyword argument 'columns'

In [None]:
# ===================================================================
#  CV score
# ===================================================================
# Search the decision threshold on the out-of-fold predictions (step 0.005)
# and keep both the best score and the threshold; the save cell reuses `threshold`.
best_score, threshold = get_score(
    oof_df[CFG.target_cols],
    oof_df["pred_"],
    step=0.005,
    return_threshold=True,
    disable=False,
)

# Print the banner and the result in green.
for line in ("====== CV score ======", f'{best_score} (threshold: {threshold})'):
    print('\033[32m' + line + '\033[0m')

  0%|          | 0/200 [00:00<?, ?it/s]

[32m0.6651096184663643 (threshold: 0.33)[0m


0.6650350009413455 (threshold: 0.335)  
0.6651491553851684 (threshold: 0.335)  
0.6651380856233531 (threshold: 0.335)  
0.6651837656560127 (threshold: 0.335)  
0.6651096184663643 (threshold: 0.33)

n_splits  
-> 5にしたらCVは0.6650696416960407 (threshold: 0.335)に。(?分)  
-> 10にしたらCVは0.6704852553752019 (threshold: 0.33)に。(4分)  
-> 25にしたらCVは0.6747956356671342 (threshold: 0.34)に。(10分)

In [None]:
# ===================================================================
#  save_data
# ===================================================================

# Out-of-fold predictions: written with a header, probability column named "pred".
oof_df = oof_df.sort("index")
(
    oof_df
    .select([pl.col("index"), pl.col("pred_").alias("pred")])
    .write_csv(CFG.save_dir+f"oof_df_{CFG.filename}.csv", has_header=True)
)

# Test predictions: binarised with the threshold found in the CV-score cell
# and written without a header.
binary_pred = (
    pl.when(pl.col("pred_") > threshold)
    .then(1)
    .otherwise(0)
    .alias("pred")
)
test_df = test_df.sort("index").with_columns(binary_pred)
submission = test_df[["index", "pred"]]
submission.write_csv(CFG.save_dir+f"{CFG.filename}.csv", has_header=False)
submission.head(2)

index,pred
i64,i32
471283,0
471284,0
