In [2]:
# ===============================================================
#  Library
# ===============================================================
import numpy as np
import polars as pl

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import warnings
warnings.simplefilter("ignore")

import lightgbm as lgb

import sys
sys.path.append("G:/マイドライブ/signate_MUFJ2023/")
from MUFJ.utils import get_score, seed_everything
from MUFJ.preprocessing import CustomOrdinalEncoder

from math import comb
import xgboost as xgb
from tqdm.auto import tqdm
from itertools import combinations

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# ===============================================================
#  CFG
# ===============================================================
class CFG:
    """Experiment-wide settings read by the cells of this notebook."""

    # --- run control -------------------------------------------------
    debug = False  # when True, the data-loading cell subsamples train/test
    seed = 42
    n_splits = 5
    num_cores = 4
    stopping_rounds = 100  # presumably early-stopping patience — confirm at the training call site

    # --- locations ---------------------------------------------------
    data_dir = "G:/マイドライブ/signate_MUFJ2023/data/"
    save_dir = "G:/マイドライブ/signate_MUFJ2023/exp/"
    filename = "exp003"

    # --- columns -----------------------------------------------------
    # Column names treated as numerical features.
    numerical_features = [
        "amount",
        "cards_issued",
        "credit_limit",
        "year_pin_last_changed",
        "current_age",
        "retirement_age",
        "birth_year",
        "birth_month",
        "latitude",
        "longitude",
        "per_capita_income_zipcode",
        "yearly_income_person",
        "total_debt",
        "fico_score",
        "num_credit_cards",
        "expires_month",
        "expires_year",
        "acct_open_date_month",
        "acct_open_date_year",
        "YearsFromAcctOpenToPinChange",
    ]
    # Column names treated as categorical features.
    categorical_features = [
        "errors?",
        "merchant_id",
        "merchant_city",
        "merchant_state",
        "zip",
        "mcc",
        "use_chip",
        "card_brand",
        "card_type",
        "has_chip",
        "gender",
        "city",
        "state",
        "zipcode",
        "card_id",
        "user_id",
        "same_zipcode_as_zip",
        "city_is_ONLINE",
    ]
    # Label column(s).
    target_cols = ["is_fraud?"]

In [4]:
# ===============================================================
#  Utils
# ===============================================================
# Seed via the project helper imported from MUFJ.utils, using the configured seed.
# NOTE(review): which RNGs the helper covers is not visible in this notebook — confirm.
seed_everything(CFG.seed)

In [5]:
# ===============================================================
#  Data Loading
# ===============================================================
# Read the four raw tables from the configured data directory.
train = pl.read_csv(f"{CFG.data_dir}train.csv")
test = pl.read_csv(f"{CFG.data_dir}test.csv")
card = pl.read_csv(f"{CFG.data_dir}card.csv")
user = pl.read_csv(f"{CFG.data_dir}user.csv")

# Tag every row with its origin so the combined frame can be split again later.
train = train.with_columns(pl.lit("train").alias("flag"))
# The test frame gets an all-null Int64 label column plus its origin tag.
test = test.with_columns(
    [
        pl.lit(None, dtype=pl.Int64).alias("is_fraud?"),
        pl.lit("test").alias("flag"),
    ]
)

# Debug mode: work on a small, seeded subsample of each split.
if CFG.debug:
    train, test = (
        train.sample(n=10000, seed=CFG.seed),
        test.sample(n=1000, seed=CFG.seed),
    )

# Combine both splits, then attach card attributes (per user/card pair)
# and user attributes (per user) with left joins.
# NOTE(review): how="align" combines frames on their shared columns rather than
# simply stacking rows — confirm the resulting row order and cost are intended.
all_data = (
    pl.concat([train, test], how="align")
    .join(card, on=["user_id", "card_id"], how="left")
    .join(user, on="user_id", how="left")
)

In [6]:
# ===============================================================
#  Preprocessing
# ===============================================================
def preprocessing(all_data: pl.DataFrame) -> pl.DataFrame:
    """Convert raw text columns to typed columns and append derived features.

    Steps, in order:
      1. Drop the first character of the prefixed numeric text columns and
         cast the remainder to Float64.
      2. Parse ``expires`` and ``acct_open_date`` ("%m/%Y" text) into Date.
      3. Add boolean comparison flags and a combined ``user_card_id`` key.
      4. Split the two Date columns into ``*_year`` / ``*_month`` columns.
      5. Add year-difference features built from the columns of step 4.

    Args:
        all_data: Combined frame in which the columns touched in steps 1-2
            are still strings. Calling this on an already-processed frame
            fails, because the string operations no longer apply.

    Returns:
        A new frame with the converted columns replaced in place and the
        derived columns appended.
    """
    # Text columns that carry a one-character prefix before the number
    # (presumably a currency symbol — confirm against the raw CSV).
    prefixed_numeric_cols = [
        "amount",
        "total_debt",
        "credit_limit",
        "yearly_income_person",
        "per_capita_income_zipcode",
    ]

    all_data = all_data.with_columns(
        [
            # str -> float: native string slicing instead of a per-row Python
            # lambda; drops exactly the first character, as before.
            pl.col(prefixed_numeric_cols).str.slice(1).cast(pl.Float64),

            # str -> Date
            pl.col("expires").str.strptime(dtype=pl.Date, format="%m/%Y"),
            pl.col("acct_open_date").str.strptime(dtype=pl.Date, format="%m/%Y"),

            # bool
            (pl.col("zip") == pl.col("zipcode")).alias("same_zipcode_as_zip"),
            (pl.col("state") == pl.col("merchant_state")).alias("same_state"),
            #(pl.col("city") == pl.col("merchant_city")).alias("same_city"),
            (pl.col("merchant_city") == "ONLINE").alias("city_is_ONLINE"),
            #pl.when((pl.col("merchant_city").is_null())&(pl.col("merchant_city") != "ONLINE")) ## TODO: find a cleaner way to express this
            #.then(pl.lit(True))
            #.otherwise(pl.lit(False))
            #.alias("city_is_not_America"),

            # user_id + card_id
            (pl.col("user_id").cast(pl.Utf8) + "-" + pl.col("card_id").cast(pl.Utf8)).alias("user_card_id"),
        ]
    )

    all_data = all_data.with_columns(
        [
            # Date -> year / month parts
            pl.col("expires").dt.year().suffix("_year"),
            pl.col("expires").dt.month().suffix("_month"),
            pl.col("acct_open_date").dt.year().suffix("_year"),
            pl.col("acct_open_date").dt.month().suffix("_month"),

            # feature_engineering
            #(pl.col("amount") - pl.col("credit_limit")).cast(pl.Float64).alias("remaining_credit"),
        ]
    )

    # Needs the *_year columns created above, hence a separate with_columns call.
    all_data = all_data.with_columns(
        [
            #(2023 - pl.col('year_pin_last_changed')).alias("YearsSincePinChange"),
            (pl.col("year_pin_last_changed") - pl.col("acct_open_date_year")).alias("YearsFromAcctOpenToPinChange"),
            #(pl.col("retirement_age") - pl.col("current_age")).alias("YearsUntilRetirement"),
            (pl.col("expires_year") - pl.col("year_pin_last_changed")).alias("YearsFromPinChangeToExpires"),
        ]
    )

    return all_data
# Rebinds all_data to the preprocessed frame. Re-running this cell without first
# re-running the data-loading cell feeds already-converted columns back in.
all_data = preprocessing(all_data)

In [7]:
# Mean transaction amount per merchant category code, restricted to rows labelled as fraud.
(
    all_data
    .select(["amount", "mcc", "is_fraud?"])
    .filter(pl.col("is_fraud?") == 1)
    .groupby("mcc")
    .agg(pl.col("amount").mean())
)

mcc,amount
i64,f64
3504,161.08938
4784,32.57881
3640,142.836434
3144,116.004
5192,72.001414
7832,99.84344
3000,146.455369
5712,104.569021
3256,104.427245
5816,139.30034


In [8]:
# Inspect the column names of the combined frame.
all_data.columns

['index',
 'user_id',
 'card_id',
 'amount',
 'errors?',
 'is_fraud?',
 'merchant_id',
 'merchant_city',
 'merchant_state',
 'zip',
 'mcc',
 'use_chip',
 'flag',
 'card_brand',
 'card_type',
 'expires',
 'has_chip',
 'cards_issued',
 'credit_limit',
 'acct_open_date',
 'year_pin_last_changed',
 'current_age',
 'retirement_age',
 'birth_year',
 'birth_month',
 'gender',
 'address',
 'city',
 'state',
 'zipcode',
 'latitude',
 'longitude',
 'per_capita_income_zipcode',
 'yearly_income_person',
 'total_debt',
 'fico_score',
 'num_credit_cards',
 'same_zipcode_as_zip',
 'same_state',
 'city_is_ONLINE',
 'user_card_id',
 'expires_year',
 'expires_month',
 'acct_open_date_year',
 'acct_open_date_month',
 'YearsFromAcctOpenToPinChange',
 'YearsFromPinChangeToExpires']

In [19]:
# Display every column of the combined frame.
# The saved cell ended in a dangling attribute access (`pl.all().`), which is a
# SyntaxError; the unfinished method call has been dropped so the cell runs.
all_data.select(
    pl.all()
)

index,user_id,card_id,amount,errors?,is_fraud?,merchant_id,merchant_city,merchant_state,zip,mcc,use_chip,flag,card_brand,card_type,expires,has_chip,cards_issued,credit_limit,acct_open_date,year_pin_last_changed,current_age,retirement_age,birth_year,birth_month,gender,address,city,state,zipcode,latitude,longitude,per_capita_income_zipcode,yearly_income_person,total_debt,fico_score,num_credit_cards,same_zipcode_as_zip,same_state,city_is_ONLINE,user_card_id,expires_year,expires_month,acct_open_date_year,acct_open_date_month,YearsFromAcctOpenToPinChange,YearsFromPinChangeToExpires
i64,i64,i64,f64,str,i64,i64,str,str,f64,i64,str,str,str,str,date,str,i64,f64,date,i64,i64,i64,i64,i64,str,str,str,str,i64,f64,f64,f64,f64,f64,i64,i64,bool,bool,bool,str,i32,u32,i32,u32,i64,i64
0,1721,0,2.623,"""OK""",0,209237,"""Joliet""","""IL""",60436.0,5541,"""Swipe Transact…","""train""","""Mastercard""","""Credit""",2021-10-01,"""YES""",1,6900.0,1995-09-01,2009,61,65,1958,5,"""Male""","""206 Pine Lane""","""Joliet""","""IL""",60436,41.52,-88.12,17567.0,35823.0,96691.0,732,3,true,true,false,"""1721-0""",2021,10,1995,9,14,12
1,1629,3,6.4,"""OK""",0,2568,"""Edgerton""","""WI""",53534.0,5814,"""Swipe Transact…","""train""","""Mastercard""","""Debit (Prepaid…",2022-12-01,"""YES""",2,110.0,1999-06-01,2017,50,69,1969,4,"""Female""","""8886 Little Cr…","""Edgerton""","""WI""",53534,42.83,-89.07,21348.0,43529.0,126175.0,797,6,true,true,false,"""1629-3""",2022,12,1999,6,18,5
2,655,3,123.5,"""OK""",0,345310,"""Ridgefield""","""WA""",98642.0,7538,"""Swipe Transact…","""train""","""Mastercard""","""Debit""",2024-05-01,"""YES""",1,24090.0,1998-09-01,2009,56,68,1963,7,"""Male""","""273 Ocean Stre…","""Ridgefield""","""WA""",98642,45.79,-122.69,27308.0,55682.0,82696.0,750,4,true,true,false,"""655-3""",2024,5,1998,9,11,15
3,492,0,51.287,"""OK""",0,4295,"""Milton""","""FL""",32583.0,5912,"""Chip Transacti…","""train""","""Visa""","""Credit""",2022-03-01,"""YES""",1,10300.0,1993-04-01,2013,68,65,1951,3,"""Female""","""7276 Valley Dr…","""Sioux Falls""","""SD""",57107,43.54,-96.73,20153.0,39082.0,16870.0,722,4,false,false,false,"""492-0""",2022,3,1993,4,20,9
4,1969,4,17.561,"""OK""",0,350447,"""Irvington""","""NJ""",7111.0,4214,"""Swipe Transact…","""train""","""Visa""","""Debit""",2023-06-01,"""YES""",1,11592.0,1998-08-01,2015,59,68,1960,3,"""Female""","""5238 Park Stre…","""Union City""","""NJ""",7087,40.76,-74.03,16770.0,34190.0,39242.0,810,7,false,true,false,"""1969-4""",2023,6,1998,8,17,8
5,1612,6,43.454,"""OK""",0,231941,"""Warren""","""OH""",44485.0,5499,"""Swipe Transact…","""train""","""Mastercard""","""Debit""",2001-06-01,"""NO""",2,15007.0,2000-06-01,2013,81,65,1939,1,"""Female""","""374 Lexington …","""Warren""","""OH""",44485,41.23,-80.81,12406.0,11613.0,427.0,790,8,true,true,false,"""1612-6""",2001,6,2000,6,13,-12
6,783,4,13.75,"""OK""",0,212122,"""Camillus""","""NY""",13031.0,8049,"""Swipe Transact…","""train""","""Mastercard""","""Debit""",2020-06-01,"""YES""",1,23481.0,1996-06-01,2012,73,65,1946,9,"""Female""","""8258 Jefferson…","""Camillus""","""NY""",13031,43.03,-76.3,24172.0,48750.0,18724.0,709,8,true,true,false,"""783-4""",2020,6,1996,6,16,8
7,1629,5,14.85,"""OK""",0,78396,"""Edgerton""","""WI""",53534.0,7230,"""Swipe Transact…","""train""","""Mastercard""","""Debit (Prepaid…",2021-01-01,"""YES""",2,95.0,1998-10-01,2012,50,69,1969,4,"""Female""","""8886 Little Cr…","""Edgerton""","""WI""",53534,42.83,-89.07,21348.0,43529.0,126175.0,797,6,true,true,false,"""1629-5""",2021,1,1998,10,14,9
8,986,0,19.9,"""OK""",0,405337,"""Rio de Janeiro…","""Brazil""",,4121,"""Swipe Transact…","""train""","""Mastercard""","""Debit (Prepaid…",2020-05-01,"""NO""",2,22.0,2005-03-01,2011,75,67,1944,12,"""Male""","""2890 Eighth La…","""Maywood""","""IL""",60153,41.88,-87.84,15451.0,22158.0,19101.0,681,5,,false,false,"""986-0""",2020,5,2005,3,6,9
9,541,3,159.374,"""OK""",0,194570,"""Orlando""","""FL""",32839.0,4829,"""Chip Transacti…","""train""","""Mastercard""","""Credit""",2018-10-01,"""YES""",2,10800.0,1999-12-01,2005,68,67,1951,7,"""Male""","""5828 Wessex Dr…","""Orlando""","""FL""",32818,28.5,-81.37,15849.0,43004.0,15304.0,761,6,false,true,false,"""541-3""",2018,10,1999,12,6,13
