In [27]:
import polars as pl
from matplotlib_venn import venn2
import matplotlib.pyplot as plt

In [28]:
train = pl.read_csv("G:/マイドライブ/signate_MUFJ2023/data/train.csv")
test = pl.read_csv("G:/マイドライブ/signate_MUFJ2023/data/test.csv")
card = pl.read_csv("G:/マイドライブ/signate_MUFJ2023/data/card.csv")
user = pl.read_csv("G:/マイドライブ/signate_MUFJ2023/data/user.csv")

train = train.with_columns(
    pl.lit("train").alias("flag")
)
test = test.with_columns(
    [
        pl.lit("test").alias("flag"),
        pl.lit(None).cast(pl.Int64).alias("is_fraud?")
    ]
)

all_data = pl.concat([train, test], how="align")

all_data = all_data.join(card, on=["user_id", "card_id"], how="left")
all_data = all_data.join(user, on="user_id", how="left")

def apply_fe(df: pl.DataFrame) -> pl.DataFrame:
    df = df.with_columns(
        [   
            # str -> float
            pl.col("amount").apply(lambda x: x[1:]).cast(pl.Float64),
            pl.col("total_debt").apply(lambda x: x[1:]).cast(pl.Float64),
            pl.col("credit_limit").apply(lambda x: x[1:]).cast(pl.Float64),
            pl.col("yearly_income_person").apply(lambda x: x[1:]).cast(pl.Float64),
            pl.col("per_capita_income_zipcode").apply(lambda x: x[1:]).cast(pl.Float64),
            
            # str -> Datetime
            pl.col("expires").str.strptime(dtype=pl.Date, format="%m/%Y"),
            pl.col("acct_open_date").str.strptime(dtype=pl.Date, format="%m/%Y"),
            
            # bool
            (pl.col("zip") == pl.col("zipcode")).alias("same_zipcode_as_zip"),
            pl.when((pl.col("merchant_city").is_null())&(pl.col("merchant_city") != "ONLINE")) ## TODO: 上手くまとめられないかな
            .then(pl.lit(True))
            .otherwise(pl.lit(False))
            .alias("city_is_not_America"),            
        ]
    )
    
    df = df.with_columns(
        [
            # Datetime -> Month, Year
            pl.col("expires").dt.year().suffix("_year"),
            pl.col("expires").dt.month().suffix("_month"),
            pl.col("acct_open_date").dt.year().suffix("_year"),
            pl.col("acct_open_date").dt.month().suffix("_month"),        
        ]
    )
    return df

all_data = apply_fe(all_data)

all_data

index,user_id,card_id,amount,errors?,is_fraud?,merchant_id,merchant_city,merchant_state,zip,mcc,use_chip,flag,card_brand,card_type,expires,has_chip,cards_issued,credit_limit,acct_open_date,year_pin_last_changed,current_age,retirement_age,birth_year,birth_month,gender,address,city,state,zipcode,latitude,longitude,per_capita_income_zipcode,yearly_income_person,total_debt,fico_score,num_credit_cards,same_zipcode_as_zip,city_is_not_America,expires_year,expires_month,acct_open_date_year,acct_open_date_month
i64,i64,i64,f64,str,i64,i64,str,str,f64,i64,str,str,str,str,date,str,i64,f64,date,i64,i64,i64,i64,i64,str,str,str,str,i64,f64,f64,f64,f64,f64,i64,i64,bool,bool,i32,u32,i32,u32
0,1721,0,2.623,"""OK""",0,209237,"""Joliet""","""IL""",60436.0,5541,"""Swipe Transact…","""train""","""Mastercard""","""Credit""",2021-10-01,"""YES""",1,6900.0,1995-09-01,2009,61,65,1958,5,"""Male""","""206 Pine Lane""","""Joliet""","""IL""",60436,41.52,-88.12,17567.0,35823.0,96691.0,732,3,true,false,2021,10,1995,9
1,1629,3,6.4,"""OK""",0,2568,"""Edgerton""","""WI""",53534.0,5814,"""Swipe Transact…","""train""","""Mastercard""","""Debit (Prepaid…",2022-12-01,"""YES""",2,110.0,1999-06-01,2017,50,69,1969,4,"""Female""","""8886 Little Cr…","""Edgerton""","""WI""",53534,42.83,-89.07,21348.0,43529.0,126175.0,797,6,true,false,2022,12,1999,6
2,655,3,123.5,"""OK""",0,345310,"""Ridgefield""","""WA""",98642.0,7538,"""Swipe Transact…","""train""","""Mastercard""","""Debit""",2024-05-01,"""YES""",1,24090.0,1998-09-01,2009,56,68,1963,7,"""Male""","""273 Ocean Stre…","""Ridgefield""","""WA""",98642,45.79,-122.69,27308.0,55682.0,82696.0,750,4,true,false,2024,5,1998,9
3,492,0,51.287,"""OK""",0,4295,"""Milton""","""FL""",32583.0,5912,"""Chip Transacti…","""train""","""Visa""","""Credit""",2022-03-01,"""YES""",1,10300.0,1993-04-01,2013,68,65,1951,3,"""Female""","""7276 Valley Dr…","""Sioux Falls""","""SD""",57107,43.54,-96.73,20153.0,39082.0,16870.0,722,4,false,false,2022,3,1993,4
4,1969,4,17.561,"""OK""",0,350447,"""Irvington""","""NJ""",7111.0,4214,"""Swipe Transact…","""train""","""Visa""","""Debit""",2023-06-01,"""YES""",1,11592.0,1998-08-01,2015,59,68,1960,3,"""Female""","""5238 Park Stre…","""Union City""","""NJ""",7087,40.76,-74.03,16770.0,34190.0,39242.0,810,7,false,false,2023,6,1998,8
5,1612,6,43.454,"""OK""",0,231941,"""Warren""","""OH""",44485.0,5499,"""Swipe Transact…","""train""","""Mastercard""","""Debit""",2001-06-01,"""NO""",2,15007.0,2000-06-01,2013,81,65,1939,1,"""Female""","""374 Lexington …","""Warren""","""OH""",44485,41.23,-80.81,12406.0,11613.0,427.0,790,8,true,false,2001,6,2000,6
6,783,4,13.75,"""OK""",0,212122,"""Camillus""","""NY""",13031.0,8049,"""Swipe Transact…","""train""","""Mastercard""","""Debit""",2020-06-01,"""YES""",1,23481.0,1996-06-01,2012,73,65,1946,9,"""Female""","""8258 Jefferson…","""Camillus""","""NY""",13031,43.03,-76.3,24172.0,48750.0,18724.0,709,8,true,false,2020,6,1996,6
7,1629,5,14.85,"""OK""",0,78396,"""Edgerton""","""WI""",53534.0,7230,"""Swipe Transact…","""train""","""Mastercard""","""Debit (Prepaid…",2021-01-01,"""YES""",2,95.0,1998-10-01,2012,50,69,1969,4,"""Female""","""8886 Little Cr…","""Edgerton""","""WI""",53534,42.83,-89.07,21348.0,43529.0,126175.0,797,6,true,false,2021,1,1998,10
8,986,0,19.9,"""OK""",0,405337,"""Rio de Janeiro…","""Brazil""",,4121,"""Swipe Transact…","""train""","""Mastercard""","""Debit (Prepaid…",2020-05-01,"""NO""",2,22.0,2005-03-01,2011,75,67,1944,12,"""Male""","""2890 Eighth La…","""Maywood""","""IL""",60153,41.88,-87.84,15451.0,22158.0,19101.0,681,5,,false,2020,5,2005,3
9,541,3,159.374,"""OK""",0,194570,"""Orlando""","""FL""",32839.0,4829,"""Chip Transacti…","""train""","""Mastercard""","""Credit""",2018-10-01,"""YES""",2,10800.0,1999-12-01,2005,68,67,1951,7,"""Male""","""5828 Wessex Dr…","""Orlando""","""FL""",32818,28.5,-81.37,15849.0,43004.0,15304.0,761,6,false,false,2018,10,1999,12
