In [1]:
# ===============================================================
#  Library
# ===============================================================
import numpy as np
import pandas as pd

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import warnings
warnings.simplefilter("ignore")

import lightgbm as lgb
from sklearn.preprocessing import OrdinalEncoder

import sys
sys.path.append("G:/マイドライブ/signate_MUFJ2023/")
from MUFJ.utils import get_score, seed_everything
from MUFJ.model_selection import kfold
#from MUFJ.preprocessing import CustomOrdinalEncoder

from tqdm.auto import tqdm

In [2]:
# ===============================================================
#  CFG
# ===============================================================
class CFG:
    """Experiment configuration shared by the cells of this notebook."""

    # --- run control -------------------------------------------------
    debug = False      # True: sub-sample train/test right after loading
    patience = 2       # not referenced by the code visible in this notebook
    max_depth = 3      # tree depth; also used to derive num_leaves in lgb_param
    seed = 42          # passed to seed_everything, sampling and LightGBM
    n_splits = 50      # number of folds iterated over during training
    num_cores = 4      # forwarded to LightGBM as num_threads

    # --- input / output ----------------------------------------------
    data_dir = "G:/マイドライブ/signate_MUFJ2023/data/"
    save_dir = "G:/マイドライブ/signate_MUFJ2023/exp/"
    filename = "exp013"  # stem of the files written by the save cell

    # --- model inputs ------------------------------------------------
    # Columns fed to the model as plain numeric values.
    numerical_features = [
        # raw / parsed columns
        "amount",
        "cards_issued",
        "credit_limit",
        "year_pin_last_changed",
        "current_age",
        "retirement_age",
        "birth_year",
        "birth_month",
        "latitude",
        "longitude",
        "per_capita_income_zipcode",
        "yearly_income_person",
        "total_debt",
        "fico_score",
        "num_credit_cards",
        "expires_month",
        "expires_year",
        "acct_open_date_month",
        "acct_open_date_year",
        # per-fold engineered features
        "NonFraudAvgAmount_per_user_card",
        "merchant_id_count_encoding",
        # predictions of earlier experiments (stacking)
        "pred_1",
        "pred_2",
        "pred_3",
    ]

    # Columns that are ordinal-encoded per fold into "<name>_category".
    categorical_features = [
        "errors?",
        "merchant_id",
        "merchant_city",
        "merchant_state",
        "zip",
        "mcc",
        "use_chip",
        "card_brand",
        "card_type",
        "has_chip",
        "gender",
        "city",
        "state",
        "zipcode",
        "card_id",
        "user_id",
        "same_zipcode_as_zip",
        "city_is_not_America",
    ]

    target_cols = ["is_fraud?"]
    threshold_per_user = False  # True: binarise predictions inside train_lgb_per_user

In [3]:
# ===============================================================
#  Utils
# ===============================================================
# Seed the run from CFG.seed through the project helper.
# NOTE(review): presumably this seeds every RNG the pipeline relies on --
# confirm which ones in MUFJ.utils.seed_everything.
seed_everything(CFG.seed)

In [4]:
# ===============================================================
#  Data Loading
# ===============================================================
# Read the four raw tables.
train = pd.read_csv(CFG.data_dir + "train.csv")
test = pd.read_csv(CFG.data_dir + "test.csv")
card = pd.read_csv(CFG.data_dir + "card.csv")
user = pd.read_csv(CFG.data_dir + "user.csv")

# Debug mode: continue with a small seeded sample of each table.
if CFG.debug:
    train = train.sample(n=10000, random_state=CFG.seed)
    test = test.sample(n=1000, random_state=CFG.seed)

# Predictions of earlier experiments that are joined in as features:
# (train-side file, test-side file, feature column).
stacked_experiments = [
    ("oof_df_exp009.csv", "exp009.csv", "pred_1"),
    ("oof_df_exp010.csv", "exp010.csv", "pred_2"),
    ("oof_df_exp011.csv", "exp011.csv", "pred_3"),
]

# Train rows: mark them, then attach each experiment's "pred" column
# (renamed to pred_N) by joining on "index".
train["flag"] = "train"
for oof_file, _, pred_col in stacked_experiments:
    oof_pred = pd.read_csv(CFG.save_dir + oof_file).rename(columns={"pred": pred_col})
    train = pd.merge(train, oof_pred, on="index", how="left")

# Test rows: same idea, but these files are read without a header row,
# so the column names are supplied here.
test["flag"] = "test"
for _, test_file, pred_col in stacked_experiments:
    test_pred = pd.read_csv(CFG.save_dir + test_file, header=None, names=["index", pred_col])
    test = pd.merge(test, test_pred, on="index", how="left")

# Stack train on top of test, then attach the card and user attributes.
all_data = (
    pd.concat([train, test])
    .merge(card, on=["user_id", "card_id"], how="left")
    .merge(user, on="user_id", how="left")
)

In [5]:
# ===============================================================
#  Preprocessing
# ===============================================================
def preprocessing(all_data: pd.DataFrame) -> pd.DataFrame:
    """Apply the fold-independent preprocessing and return a new frame.

    Steps:
      * money-like string columns: drop the first character (the currency
        symbol) and cast to float; missing values stay missing, and columns
        that are already numeric are left as they are;
      * "expires" / "acct_open_date": parse as month/year datetimes and add
        "<col>_year" and "<col>_month" columns;
      * "user_card_id": "<user_id>-<card_id>" string key;
      * "same_zipcode_as_zip": whether "zip" equals "zipcode";
      * "city_is_not_America": "zip" is missing and "merchant_city" is not
        "ONLINE".

    Args:
        all_data: frame holding the columns listed above. It is not modified.

    Returns:
        A copy of ``all_data`` with the converted and added columns.
    """
    # Work on a copy so the caller's frame is never mutated.
    all_data = all_data.copy()

    # str -> float
    for col in ["amount", "total_debt", "credit_limit", "yearly_income_person", "per_capita_income_zipcode"]:
        if pd.api.types.is_numeric_dtype(all_data[col]):
            # Already converted (e.g. the cell was re-run): nothing to strip.
            continue
        # .str[1:] propagates missing values instead of raising on them,
        # unlike slicing each cell with a Python lambda.
        all_data[col] = all_data[col].str[1:].astype(float)

    # str -> datetime
    for col in ["expires", "acct_open_date"]:
        all_data[col] = pd.to_datetime(all_data[col], format="%m/%Y")
        all_data[col+"_year"] = all_data[col].dt.year
        all_data[col+"_month"] = all_data[col].dt.month

    # user_id + card_id
    all_data["user_card_id"] = all_data["user_id"].astype(str) + "-" + all_data["card_id"].astype(str)

    # bool
    all_data["same_zipcode_as_zip"] = (all_data["zip"] == all_data["zipcode"])
    all_data["city_is_not_America"] = ((all_data["zip"].isnull())&(all_data["merchant_city"] != "ONLINE"))

    return all_data
# Run the fold-independent preprocessing on the combined train + test frame.
all_data = preprocessing(all_data)

In [6]:
# ===============================================================
#  Preprocessing_per_fold
# ===============================================================
def preprocessing_per_fold(CFG, train: pd.DataFrame, test: pd.DataFrame, fold:int):
    """Build the train / validation / test frames for one fold.

    Every statistic and encoder below is derived from the training part of
    the fold only and then applied to the validation and test frames.

    Args:
        CFG: configuration object; ``categorical_features`` is read and
            ``categorical_features_`` (the encoded column names) is set on it.
        train: training rows with a "fold" column.
        test: test rows; a copy is used, the argument itself is not changed.
        fold: fold id whose rows become the validation set.

    Returns:
        ``(X_train, X_valid, test_df)`` with the added feature columns.
    """
    # Split by fold id; the test frame is copied before columns are added.
    X_train = train[train["fold"] != fold].reset_index(drop=True)
    X_valid = train[train["fold"] == fold].reset_index(drop=True)
    test_df = test.copy()

    # Mean transaction amount per user_card_id, computed separately for
    # fraudulent and non-fraudulent rows; only the non-fraud mean is kept.
    mean_amount = X_train.groupby(by=["user_card_id", "is_fraud?"])["amount"].mean().reset_index()
    non_fraud_avg = mean_amount.loc[
        mean_amount["is_fraud?"] == 0, ["user_card_id", "amount"]
    ].rename(columns={"amount": "NonFraudAvgAmount_per_user_card"})
    X_train, X_valid, test_df = (
        frame.merge(non_fraud_avg, on="user_card_id", how="left")
        for frame in (X_train, X_valid, test_df)
    )

    # Count encoding: occurrences of each value within the training part.
    for col in ["merchant_id"]:
        counts = X_train[col].value_counts().to_dict()
        for frame in (X_train, X_valid, test_df):
            frame[col+"_count_encoding"] = frame[col].map(counts)

    # Ordinal encoding (original author's note: this one would not have to
    # be done per fold). Unseen categories map to 9999, missing values to -1.
    oe = OrdinalEncoder(
        categories="auto",
        handle_unknown="use_encoded_value",
        unknown_value=9999,
        encoded_missing_value=-1,
    )
    encoded_names = [feature + "_category" for feature in CFG.categorical_features]
    CFG.categorical_features_ = encoded_names
    X_train[encoded_names] = oe.fit_transform(X_train[CFG.categorical_features].values)
    X_valid[encoded_names] = oe.transform(X_valid[CFG.categorical_features].values)
    test_df[encoded_names] = oe.transform(test_df[CFG.categorical_features].values)
    return X_train, X_valid, test_df

In [7]:
# ===============================================================
#  model
# ===============================================================
# LightGBM parameters shared by every model trained in this notebook.
lgb_param = {
    "task":"train",
    "objective": "binary",
    "boosting":"gbdt",
    "num_iterations": 10000, # default: 100; upper bound only, training below uses early stopping
    "learning_rate": 0.05, # default: 0.1
    "num_leaves": int((2**CFG.max_depth) * 0.7), # max number of leaves in one tree (70% of a full tree of max_depth)
    "max_depth": CFG.max_depth, # default -1, int: limit the max depth for tree model; matched to the xgboost / catboost settings
    "min_child_weight":1e-3, # double: minimal sum hessian in one leaf
    "min_data_in_leaf":20, # minimal number of data in one leaf
    "colsample_bytree":0.4, # 0 < "colsample_bytree" < 1
    # (feature_fraction) LightGBM randomly selects a subset of features on each iteration (tree) when this is smaller than 1.0
    "lambda": 0, # lambda_l2 >= 0.0: L2 regularization
    "subsample":1, # 0.0 < bagging_fraction <= 1.0
    "num_threads": CFG.num_cores,
    "metric": 'binary_logloss',
    "seed" : CFG.seed,
    "verbosity": -1, 
}

# Model input columns: the numerical features followed by the "<name>_category"
# columns that preprocessing_per_fold creates for each categorical feature.
CFG.use_features = CFG.numerical_features + [col+"_category" for col in CFG.categorical_features]


def train_lgb_per_user(CFG, train, test, stopping_rounds: int = 100):
    """Cross-validate LightGBM on ``train`` and predict ``test``.

    One model is trained per fold (``CFG.n_splits`` folds) on the frames
    returned by ``preprocessing_per_fold``. Each model scores its own
    validation fold (out-of-fold prediction) and the test rows; the test
    predictions are averaged over the folds.

    Args:
        CFG: configuration object (``n_splits``, ``use_features``,
            ``target_cols`` and ``threshold_per_user`` are read).
        train: training rows with a "fold" column.
        test: rows to predict.
        stopping_rounds: early-stopping patience, in boosting rounds, on the
            validation fold. Defaults to 100, the value previously hard-coded.

    Returns:
        ``(oof_df, test_df)``. ``oof_df`` is the concatenation of every
        validation fold with a "_pred" column. ``test_df`` is the test frame
        produced by the last fold's preprocessing with "_pred" set to the
        fold-averaged prediction. When ``CFG.threshold_per_user`` is true both
        frames also get a binary "pred" column.

    Raises:
        ValueError: if ``CFG.n_splits`` is smaller than 1.
    """
    if CFG.n_splits < 1:
        # Without at least one fold there is no model and no test frame.
        raise ValueError(f"CFG.n_splits must be >= 1, got {CFG.n_splits}")

    # Loop-invariant: the encoded categorical columns among the model inputs.
    categorical_features = [col for col in CFG.use_features if "_category" in col]

    preds = []
    oof_parts = []  # one validation frame per fold, concatenated once at the end
    for fold in range(CFG.n_splits):
        X_train, X_valid, test_df = preprocessing_per_fold(CFG, train, test, fold)
        lgb_train = lgb.Dataset(X_train[CFG.use_features], X_train[CFG.target_cols], categorical_feature = categorical_features,)
        lgb_valid = lgb.Dataset(X_valid[CFG.use_features], X_valid[CFG.target_cols], categorical_feature = categorical_features,)
        model = lgb.train(
                        lgb_param,
                        lgb_train,
                        valid_sets=[lgb_valid],
                        categorical_feature = categorical_features,
                        callbacks=[lgb.early_stopping(stopping_rounds=stopping_rounds, verbose=False),
                                   #lgb.log_evaluation(period=200)
                                   ],
                        )

        # valid: out-of-fold prediction at the best iteration
        X_valid["_pred"] = model.predict(X_valid[CFG.use_features], num_iteration=model.best_iteration)
        oof_parts.append(X_valid)

        # predict the test rows with this fold's model
        preds.append(model.predict(test_df[CFG.use_features], num_iteration=model.best_iteration))

    # A single concat avoids re-copying the accumulated frame on every fold.
    oof_df = pd.concat(oof_parts)

    # test_df is the frame built for the last fold; its prediction is the fold mean.
    test_df["_pred"] = np.mean(preds, axis=0)

    if CFG.threshold_per_user:
        # Pick the threshold on this call's OOF predictions and binarise with it.
        _, threshold = get_score(y_true=oof_df["is_fraud?"], y_pred=oof_df["_pred"], step=0.01, return_threshold=True, disable=True)
        test_df["pred"] = np.where(test_df["_pred"] > threshold, 1, 0)
        oof_df["pred"] = np.where(oof_df["_pred"] > threshold, 1, 0)

    return oof_df, test_df

In [8]:
# ===================================================================
#  Cross Validation
# ===================================================================
# Recover the train / test partitions of the combined frame via its "flag" column.
train = all_data.loc[all_data["flag"].eq("train")].reset_index(drop=True)
test = all_data.loc[all_data["flag"].eq("test")].reset_index(drop=True)

# Assign folds with the project helper.
# NOTE(review): presumably this adds the "fold" column read later by
# preprocessing_per_fold -- confirm in MUFJ.model_selection.kfold.
train = kfold(CFG, train)

Making fold:   0%|          | 0/97 [00:00<?, ?it/s]

Check_fold_bias:   0%|          | 0/97 [00:00<?, ?it/s]

Fold of User_id 156 is fold
17.0    88
29.0    87
49.0    87
32.0    87
12.0    87
35.0    87
34.0    87
16.0    87
25.0    87
31.0    87
7.0     87
48.0    87
26.0    87
10.0    87
33.0    87
43.0    87
27.0    87
45.0    87
6.0     87
9.0     87
42.0    87
40.0    87
5.0     87
4.0     87
38.0    87
23.0    87
46.0    87
39.0    87
11.0    87
24.0    87
18.0    87
2.0     87
22.0    87
13.0    87
0.0     87
44.0    87
15.0    87
3.0     87
28.0    87
21.0    87
14.0    87
37.0    87
30.0    87
1.0     87
47.0    87
8.0     87
36.0    87
19.0    86
20.0    86
41.0    86
Name: count, dtype: int64
The folds for each user have been correctly allocated.


In [9]:
# ===============================================================
#  train
# ===============================================================
# Per-user results are collected in lists and concatenated once at the end,
# instead of growing two DataFrames with pd.concat on every iteration.
oof_parts = []
test_parts = []

# The loop variable is `user_id` (not `user`) so that the `user` DataFrame
# loaded in the data-loading cell is not overwritten by a scalar id.
for user_id in tqdm(all_data["user_id"].unique()):
    user_train = train[train["user_id"] == user_id].reset_index(drop=True)
    user_test = test[test["user_id"] == user_id].reset_index(drop=True)

    # train, inference
    _oof_df, _test_df = train_lgb_per_user(CFG, user_train, user_test)

    oof_parts.append(_oof_df)
    test_parts.append(_test_df)

# pd.concat rejects an empty list, so fall back to an empty frame when there are no users.
oof_df = pd.concat(oof_parts) if oof_parts else pd.DataFrame()
test_df = pd.concat(test_parts) if test_parts else pd.DataFrame()

  0%|          | 0/97 [00:00<?, ?it/s]

In [10]:
# ===================================================================
#  CV score
# ===================================================================
# Score the out-of-fold predictions; `threshold` is reused by the save cell
# to binarise the test predictions.
best_score, threshold = get_score(
    oof_df[CFG.target_cols],
    oof_df["_pred"],
    step=0.005,
    return_threshold=True,
    disable=False,
)

# Print the banner and the result wrapped in ANSI colour codes (green, then reset).
for line in ("====== CV score ======", f'{best_score} (threshold: {threshold})'):
    print('\033[32m' + line + '\033[0m')

  0%|          | 0/200 [00:00<?, ?it/s]

[32m0.6789977790116603 (threshold: 0.35000000000000003)[0m


0.6854378393428929 (threshold: 0.36)

| No | CV | description |
| - | - | - |
| 0 | 0.6605582855582856 (threshold: 0.33) | baseline: results are now reproducible! (再現性が取れるようになった！) |
| 1 | 0.6612461572059738 (threshold: 0.31) | add same_zipcode_as_zip |
| 2 | 0.6613739523655713 (threshold: 0.325) | add "city_is_not_America" |
| 3 | 0.661542928387218 (threshold: 0.33) | add "NonFraudAvgAmount_per_user_card" |
| 4 | **0.6652354618211794 (threshold: 0.34)** | add "merchant_id_count_encoding" |
| 5 | 0.6649124932920181 (threshold: 0.35000000000000003) | add "mcc_count_encoding" |
| 6 | 0.6646481732070365 (threshold: 0.32) | add "merchant_city_count_encoding" |
| 7 | 0.6647920368485517 (threshold: 0.34500000000000003) | add "merchant_state_count_encoding" |

In [11]:
# ===================================================================
#  save_data
# ===================================================================

# Out-of-fold predictions: ordered by "index", written with a header as
# (index, pred) where pred is the raw "_pred" score.
oof_df = oof_df.sort_values("index")
oof_out = oof_df[["index", "_pred"]].rename(columns={"_pred": "pred"})
oof_out.to_csv(CFG.save_dir + f"oof_df_{CFG.filename}.csv", index=False)

# Test predictions: ordered by "index"; a binary "pred" column is added to
# test_df using the threshold found in the CV-score cell.
test_df = test_df.sort_values("index")
test_df["pred"] = np.where(test_df["_pred"] > threshold, 1, 0)

# The file written here holds the raw "_pred" score (not the binary column)
# and has no header row.
test_out = test_df[["index", "_pred"]].rename(columns={"_pred": "pred"})
test_out.to_csv(CFG.save_dir + f"{CFG.filename}.csv", index=False, header=False)
test_out.head(2)

Unnamed: 0,index,pred
0,471283,0.031423
0,471284,0.108292
