In [2]:
# !pip install polars
# !pip install pyarrow
# !pip install -U scikit-learn
# !pip install lightgbm

In [3]:
import os
import gc
from pathlib import Path

import numpy as np
import polars as pl
import polars.selectors as cs
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score

import lightgbm as lgb

In [4]:
# for k,v  in os.environ.items():
#     if "smouz" in v:
#         print(k, ":", v)

### Data Dictionary

https://www.kaggle.com/code/jetakow/home-credit-2024-starter-notebook

https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/data

Depth values:

- depth=0: These are static features directly tied to a specific case_id
- depth=1: Each case_id has an associated historical record, indexed by num_group 1.
- depth=2: Each case_id has an associated historical record, indexed by both num_group1 and num_group2.

You can read more about Credit bureau (CB) here https://en.wikipedia.org/wiki/Credit_bureau.

In [5]:
def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    # implement here all desired dtypes for tables
    # the following is just an example
    for col in df.columns:
        # last letter of column name will help you determine the type
        if col[-1] in ("P", "A"):
            df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))
    return df

def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.columns:  
        if df[col].dtype.name in ['object', 'string']:
            df[col] = df[col].astype("string").astype('category')
            current_categories = df[col].cat.categories
            new_categories = current_categories.to_list() + ["Unknown"]
            new_dtype = pd.CategoricalDtype(categories=new_categories, ordered=True)
            df[col] = df[col].astype(new_dtype)
    return df


def fill_missing(df: pd.DataFrame) -> pd.DataFrame:
    numeric_columns = [x for x in df.columns if 'int' in df[x].dtype.name or 'float' in df[x].dtype.name]
    str_columns = [x for x in df.columns if 'category' in df[x].dtype.name or 'string' in df[x].dtype.name]
    object_columns = [x for x in df.columns if 'object' in df[x].dtype.name]
    bool_columns = [x for x in df.columns if 'bool' in df[x].dtype.name]
    assert len(str_columns) + len(numeric_columns) + len(object_columns) + len(bool_columns) == df.shape[1]

    df[numeric_columns].fillna(0.0, inplace=True)
    df[str_columns] = df[str_columns].astype(str).fillna("-1").astype('category')
    df[bool_columns].fillna(False, inplace=True)
    return df

## Read Files

### base file

In [6]:
DRIVE= "I:"
USERNAME = os.environ['USERNAME']

DATA_DIR = Path(DRIVE).joinpath(USERNAME, "data", "Home Credit - Credit Risk Model Stability")
DATA_FILES_DIR = DATA_DIR.joinpath("parquet_files")
DATA_FILES_DIR

filename = DATA_FILES_DIR.joinpath("train", "train_base.parquet")
train_basetable = pl.read_parquet(filename)

CREDIT_BUREAU_FILES = DATA_FILES_DIR.joinpath("train").glob("*credit_bureau_*parquet")
PERSON_FILES = DATA_FILES_DIR.joinpath("train").glob("*person_*parquet")
STATIC_FILES = DATA_FILES_DIR.joinpath("train").glob("*static_0_*parquet")
STATIC_CB_FILES = DATA_FILES_DIR.joinpath("train").glob("*static_cb_*parquet")
APPLPREV_FILES = DATA_FILES_DIR.joinpath("train").glob("*applprev_*parquet")
DEBIT_FILES = DATA_FILES_DIR.joinpath("train").glob("*debitcard_*parquet")
DEPOSIT_FILES = DATA_FILES_DIR.joinpath("train").glob("*deposit_*parquet")
TAX_REG_FILES = DATA_FILES_DIR.joinpath("train").glob("*tax_registry_*parquet")
OTHER_FILES = DATA_FILES_DIR.joinpath("train").glob("*other_*parquet")

PERSON_FILES_LIST = list(PERSON_FILES)
CREDIT_BUREAU_FILES_LIST = list(CREDIT_BUREAU_FILES)
CREDIT_BUREAU_FILES_LIST[-3:]

[WindowsPath('I:smouz/data/Home Credit - Credit Risk Model Stability/parquet_files/train/train_credit_bureau_a_2_9.parquet'),
 WindowsPath('I:smouz/data/Home Credit - Credit Risk Model Stability/parquet_files/train/train_credit_bureau_b_1.parquet'),
 WindowsPath('I:smouz/data/Home Credit - Credit Risk Model Stability/parquet_files/train/train_credit_bureau_b_2.parquet')]

In [7]:
# filename = DATA_FILES_DIR.joinpath("train", "train_static_cb_0.parquet")
# static_df = pl.read_parquet(filename)

train_static = pl.concat(
    [
        pl.read_parquet(filepath).pipe(set_table_dtypes) for filepath in STATIC_FILES
    ],
    how="vertical_relaxed",
)

train_static_cb = pl.concat(
    [
         pl.read_parquet(filepath).pipe(set_table_dtypes) for filepath in STATIC_CB_FILES      
    ],
    how='vertical_relaxed'
)


train_person_1 = pl.read_parquet(str(PERSON_FILES_LIST[0])).pipe(set_table_dtypes)
train_person_2 = pl.read_parquet(str(PERSON_FILES_LIST[1])).pipe(set_table_dtypes)

# train_credit_bureau_b_2 = pl.concat(
#     [
#          pl.read_parquet(filepath).pipe(set_table_dtypes) for filepath in CREDIT_BUREAU_FILES      
#     ],
#     how='vertical_relaxed'
# )

train_credit_bureau_b_2 = pl.concat(
    [
         pl.read_parquet(filepath).pipe(set_table_dtypes) for filepath in CREDIT_BUREAU_FILES_LIST[:3]      
    ],
    how='vertical_relaxed'
)
gc.collect()

0

In [8]:
train_static.estimated_size() / 1024**2

1745.8659162521362

In [9]:
train_static_cb.head(3)

case_id,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,dateofbirth_342D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,for3years_128L,for3years_504L,for3years_584L,formonth_118L,formonth_206L,formonth_535L,forquarter_1017L,forquarter_462L,forquarter_634L,fortoday_1092L,forweek_1077L,forweek_528L,forweek_601L,foryear_618L,foryear_818L,foryear_850L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtcount_4527229L,pmtcount_4955617L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,riskassesment_302T,riskassesment_940T,secondquarter_766L,thirdquarter_1082L
i64,str,str,str,str,f64,str,str,f64,f64,f64,f64,f64,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str,str,str,f64,f64,f64
357,,,,"""1988-04-01""",,,,,,,,,"""a55475b1""","""a55475b1""","""a55475b1""",,,,,,,,,,,,,,,,,,,"""a55475b1""","""a55475b1""",,,,,,,,6.0,6301.4,,"""2019-01-25""",,,,,,
381,,,,"""1973-11-01""",,,,,,,,,"""a55475b1""","""a55475b1""","""a55475b1""",,,,,,,,,,,,,,,,,,,"""a55475b1""","""a55475b1""",,,,,,,,6.0,4019.6,,"""2019-01-25""",,,,,,
388,,,,"""1989-04-01""",,"""1989-04-01""",,6.0,8.0,2.0,10.0,4.0,"""a55475b1""","""a55475b1""","""a55475b1""",2.0,,,,,,,,,,,,,,,,,6.0,"""a55475b1""","""a55475b1""",10.0,,,,,,,6.0,14548.0,,"""2019-01-28""",,,,,3.0,5.0


#### train_static
each row contain unique case_id

In [10]:
train_static.head(3)

case_id,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,bankacctype_710L,cardtype_51L,clientscnt12m_3712952L,clientscnt3m_3712950L,clientscnt6m_3712949L,clientscnt_100L,clientscnt_1022L,clientscnt_1071L,clientscnt_1130L,clientscnt_136L,clientscnt_157L,clientscnt_257L,clientscnt_304L,clientscnt_360L,clientscnt_493L,clientscnt_533L,clientscnt_887L,…,numinstpaidearlyest_4493214L,numinstpaidlastcontr_4325080L,numinstpaidlate1d_3546852L,numinstregularpaid_973L,numinstregularpaidest_4493210L,numinsttopaygr_769L,numinsttopaygrest_4493213L,numinstunpaidmax_3546851L,numinstunpaidmaxest_4493212L,numnotactivated_1143L,numpmtchanneldd_318L,numrejects9m_859L,opencred_647L,paytype1st_925L,paytype_783L,payvacationpostpone_4187118D,pctinstlsallpaidearl3d_427L,pctinstlsallpaidlat10d_839L,pctinstlsallpaidlate1d_3546856L,pctinstlsallpaidlate4d_3546849L,pctinstlsallpaidlate6d_3546844L,pmtnum_254L,posfpd10lastmonth_333P,posfpd30lastmonth_3976960P,posfstqpd30lastmonth_3976962P,previouscontdistrict_112M,price_1097A,sellerplacecnt_915L,sellerplacescnt_216L,sumoutstandtotal_3546847A,sumoutstandtotalest_4493215A,totaldebt_9A,totalsettled_863A,totinstallast1m_4525188A,twobodfilling_608L,typesuite_864L,validfrom_1069D
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,bool,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str
0,,,1917.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,0.0,0.0,0.0,,"""OTHER""","""OTHER""",,,,,,,24.0,0.0,0.0,,"""a55475b1""",,0.0,0.0,,,0.0,0.0,,"""BO""",,
1,,,3134.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,0.0,0.0,0.0,,"""OTHER""","""OTHER""",,,,,,,18.0,0.0,0.0,,"""a55475b1""",,0.0,0.0,,,0.0,0.0,,"""BO""",,
2,,,4937.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,0.0,0.0,0.0,False,"""OTHER""","""OTHER""",,,,,,,36.0,0.0,0.0,,"""a55475b1""",,0.0,0.0,,,0.0,0.0,,"""BO""","""AL""",


In [11]:
def drop_columns_by_nulls(df: pl.DataFrame, null_threshold: float, dry_run: bool=False) -> pl.DataFrame:
    null_count = (train_static.null_count() / train_static.shape[0]).to_pandas()
    drop_cols = null_count.T[null_count.T.values >= null_threshold].index.tolist()
    print(f"Drop columns: {len(drop_cols)}, Nulls >= {null_threshold}")
    if dry_run:
        print(f"Columns exceeding threshold ({null_threshold}):\n\t{drop_cols}")
        return
    return df.drop(drop_cols)

In [12]:
drop_columns_by_nulls(train_person_2, 0.8, dry_run=True)

Drop columns: 17, Nulls >= 0.8
Columns exceeding threshold (0.8):
	['avglnamtstart24m_4525187A', 'cardtype_51L', 'clientscnt_136L', 'datelastinstal40dpd_247D', 'equalitydataagreement_891L', 'equalityempfrom_62L', 'inittransactionamount_650A', 'interestrategrace_34L', 'isbidproductrequest_292L', 'isdebitcard_729L', 'lastdependentsnum_448L', 'lastotherinc_902A', 'lastotherlnsexpense_631A', 'lastrepayingdate_696D', 'maxannuity_4075009A', 'payvacationpostpone_4187118D', 'validfrom_1069D']


In [13]:
train_static = drop_columns_by_nulls(train_static, 0.80)
train_person_1 = drop_columns_by_nulls(train_person_1, 0.80)
train_person_2 = drop_columns_by_nulls(train_person_2, 0.80)
train_static_cb = drop_columns_by_nulls(train_static_cb, 0.80)
train_credit_bureau_b_2 = drop_columns_by_nulls(train_credit_bureau_b_2, 0.80)

Drop columns: 17, Nulls >= 0.8
Drop columns: 0, Nulls >= 0.8
Drop columns: 0, Nulls >= 0.8
Drop columns: 0, Nulls >= 0.8
Drop columns: 0, Nulls >= 0.8


In [14]:
gc.collect()

0

In [15]:
train_basetable.head(3)

case_id,date_decision,MONTH,WEEK_NUM,target
i64,str,i64,i64,i64
0,"""2019-01-03""",201901,0,0
1,"""2019-01-03""",201901,0,0
2,"""2019-01-04""",201901,0,0


In [16]:
train_person_2.head(3)

case_id,addres_district_368M,addres_role_871L,addres_zip_823M,conts_role_79M,empls_economicalst_849M,empls_employedfrom_796D,empls_employer_name_740M,num_group1,num_group2,relatedpersons_role_762T
i64,str,str,str,str,str,str,str,i64,i64,str
5,"""a55475b1""",,"""a55475b1""","""a55475b1""","""a55475b1""",,"""a55475b1""",0,0,
6,"""P55_110_32""","""CONTACT""","""P10_68_40""","""P38_92_157""","""P164_110_33""",,"""a55475b1""",0,0,
6,"""P55_110_32""","""PERMANENT""","""P10_68_40""","""a55475b1""","""a55475b1""",,"""a55475b1""",0,1,


In [17]:
train_person_1.head(3)

case_id,birth_259D,birthdate_87D,childnum_185L,contaddr_district_15M,contaddr_matchlist_1032L,contaddr_smempladdr_334L,contaddr_zipcode_807M,education_927M,empl_employedfrom_271D,empl_employedtotal_800L,empl_industry_691L,empladdr_district_926M,empladdr_zipcode_114M,familystate_447L,gender_992L,housetype_905L,housingtype_772L,incometype_1044T,isreference_387L,language1_981M,mainoccupationinc_384A,maritalst_703L,num_group1,personindex_1023L,persontype_1072L,persontype_792L,registaddr_district_1083M,registaddr_zipcode_184M,relationshiptoclient_415T,relationshiptoclient_642T,remitter_829L,role_1084L,role_993L,safeguarantyflag_411L,sex_738L,type_25L
i64,str,str,f64,str,bool,bool,str,str,str,str,str,str,str,str,str,str,str,str,bool,str,f64,str,i64,f64,f64,f64,str,str,str,str,bool,str,str,bool,str,str
0,"""1986-07-01""",,,"""P88_18_84""",False,False,"""P167_100_165""","""P97_36_170""","""2017-09-15""","""MORE_FIVE""","""OTHER""","""P142_57_166""","""P167_100_165""","""MARRIED""",,,,"""SALARIED_GOVT""",,"""P10_39_147""",10800.0,,0,0.0,1.0,1.0,"""P88_18_84""","""P167_100_165""",,,,"""CL""",,True,"""F""","""PRIMARY_MOBILE…"
0,,,,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,"""a55475b1""","""a55475b1""",,,,,,,"""a55475b1""",,,1,1.0,1.0,4.0,"""a55475b1""","""a55475b1""","""SPOUSE""",,False,"""EM""",,,,"""PHONE"""
0,,,,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,"""a55475b1""","""a55475b1""",,,,,,,"""a55475b1""",,,2,2.0,4.0,5.0,"""a55475b1""","""a55475b1""","""COLLEAGUE""","""SPOUSE""",False,"""PE""",,,,"""PHONE"""


In [18]:
train_person_2.filter(pl.col('case_id') == 6)

case_id,addres_district_368M,addres_role_871L,addres_zip_823M,conts_role_79M,empls_economicalst_849M,empls_employedfrom_796D,empls_employer_name_740M,num_group1,num_group2,relatedpersons_role_762T
i64,str,str,str,str,str,str,str,i64,i64,str
6,"""P55_110_32""","""CONTACT""","""P10_68_40""","""P38_92_157""","""P164_110_33""",,"""a55475b1""",0,0,
6,"""P55_110_32""","""PERMANENT""","""P10_68_40""","""a55475b1""","""a55475b1""",,"""a55475b1""",0,1,
6,"""P204_92_178""","""CONTACT""","""P65_136_169""","""P38_92_157""","""P164_110_33""",,"""a55475b1""",1,0,"""OTHER_RELATIVE…"
6,"""P191_109_75""","""CONTACT""","""P10_68_40""","""P7_147_157""","""a55475b1""",,"""a55475b1""",1,1,"""OTHER_RELATIVE…"
6,"""P204_92_178""","""CONTACT""","""P164_28_170""","""P38_92_157""","""a55475b1""",,"""a55475b1""",1,2,
6,"""P55_110_32""","""PERMANENT""","""P10_68_40""","""P38_92_157""","""a55475b1""",,"""a55475b1""",1,3,
6,"""P204_92_178""","""PERMANENT""","""P65_136_169""","""a55475b1""","""a55475b1""",,"""a55475b1""",1,4,
6,"""P204_92_178""","""PERMANENT""","""P164_28_170""","""a55475b1""","""a55475b1""",,"""a55475b1""",1,5,


In [19]:
train_person_1.select(["case_id", "num_group1", "housetype_905L"]).sample(3)

case_id,num_group1,housetype_905L
i64,i64,str
1744695,0,
821590,2,
8244,1,


## Feature Engineering

In [20]:
train_credit_bureau_b_2.shape

(13861214, 79)

In [21]:
# train_credit_bureau_b_2['pmts_pmtsoverdue_635A']

In [25]:
# train_credit_bureau_b_2.columns

In [26]:
# We need to use aggregation functions in tables with depth > 1, so tables that contain num_group1 column or 
# also num_group2 column.
train_person_1_feats_1 = train_person_1.group_by("case_id").agg(
    pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
    (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed")
)

# Here num_group1=0 has special meaning, it is the person who applied for the loan.
train_person_1_feats_2 = train_person_1.select(["case_id", "num_group1", "housetype_905L"]).filter(
    pl.col("num_group1") == 0
).drop("num_group1").rename({"housetype_905L": "person_housetype"})
del train_person_1

# Here we have num_goup1 and num_group2, so we need to aggregate again.
train_credit_bureau_b_2_feats = train_credit_bureau_b_2.group_by("case_id").agg(
    pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
    (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31")
)
del train_credit_bureau_b_2

# We will process in this examples only A-type and M-type columns, so we need to select them.
selected_static_cols = []
for col in train_static.columns:
    if col[-1] in ("A", "M"):
        selected_static_cols.append(col)
print(selected_static_cols)

selected_static_cb_cols = []
for col in train_static_cb.columns:
    if col[-1] in ("A", "M"):
        selected_static_cb_cols.append(col)
print(selected_static_cb_cols)

# Join all tables together.
data = train_basetable.join(
    train_static.select(["case_id"]+selected_static_cols), how="left", on="case_id"
).join(
    train_static_cb.select(["case_id"]+selected_static_cb_cols), how="left", on="case_id"
).join(
    train_person_1_feats_1, how="left", on="case_id"
).join(
    train_person_1_feats_2, how="left", on="case_id"
).join(
    train_credit_bureau_b_2_feats, how="left", on="case_id"
)
gc.collect()

ColumnNotFoundError: pmts_pmtsoverdue_635A

Error originated just after this operation:
DF ["case_id", "annualeffectiverate_199L", "annualeffectiverate_63L", "classificationofcontr_13M"]; PROJECT */79 COLUMNS; SELECTION: "None"

In [11]:
del train_basetable, train_static, train_static_cb, train_person_1_feats_2, train_person_1_feats_1, train_credit_bureau_b_2_feats, selected_static_cols, selected_static_cb_cols
gc.collect()

0

In [12]:
case_ids = data["case_id"].unique().shuffle(seed=1)
# NOTE, need to convert above variable into numpy array
case_ids_train, case_ids_test = train_test_split(case_ids.to_numpy(), train_size=0.6, random_state=1)
case_ids_valid, case_ids_test = train_test_split(case_ids_test, train_size=0.5, random_state=1)
del case_ids
gc.collect()

0

In [13]:
cols_pred = []
for col in data.columns:
    if col[-1].isupper() and col[:-1].islower():
        cols_pred.append(col)

print(cols_pred)

def from_polars_to_pandas(case_ids: pl.DataFrame) -> pl.DataFrame:
    return (
        data.filter(pl.col("case_id").is_in(case_ids))[["case_id", "WEEK_NUM", "target"]].to_pandas(),
        data.filter(pl.col("case_id").is_in(case_ids))[cols_pred].to_pandas(),
        data.filter(pl.col("case_id").is_in(case_ids))["target"].to_pandas()
    )

base_train, X_train, y_train = from_polars_to_pandas(case_ids_train)
base_valid, X_valid, y_valid = from_polars_to_pandas(case_ids_valid)
base_test, X_test, y_test = from_polars_to_pandas(case_ids_test)

for df in [X_train, X_valid, X_test]:
    df = convert_strings(df)

['amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'credamount_770A', 'currdebt_22A', 'currdebtcredtyperange_828A', 'disbursedcredamount_1113A', 'downpmt_116A', 'inittransactionamount_650A', 'lastapprcommoditycat_1041M', 'lastapprcommoditytypec_5251766M', 'lastapprcredamount_781A', 'lastcancelreason_561M', 'lastotherinc_902A', 'lastotherlnsexpense_631A', 'lastrejectcommoditycat_161M', 'lastrejectcommodtypec_5251769M', 'lastrejectcredamount_222A', 'lastrejectreason_759M', 'lastrejectreasonclient_4145040M', 'maininc_215A', 'maxannuity_159A', 'maxannuity_4075009A', 'maxdebt4_972A', 'maxinstallast24m_3658928A', 'maxlnamtstart6m_4525199A', 'maxoutstandbalancel12m_4187113A', 'maxpmtlast3m_4525190A', 'previouscontdistrict_112M', 'price_1097A', 'sumoutstandtotal_3546847A', 'sumoutstandtotalest_4493215A', 'totaldebt_9A', 'totalsettled_863A', 'totinstallas

In [66]:
del df
gc.collect()

17710

In [15]:
print(f"Train: {X_train.shape}")
print(f"Valid: {X_valid.shape}")
print(f"Test: {X_test.shape}")

Train: (915995, 48)
Valid: (305332, 48)
Test: (305332, 48)


## Train Model

**TODO:**
- metric: try other metrics
- predicttions:
  - what is the output?
  - how to convert into binary or probabilities to correctly evalaute precision, recall, and F1 scores?

In [86]:
print(gc.collect())
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)

params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "is_unbalance": "true",
    "metric": "auc",
    "max_depth": 3,
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "n_estimators": 1000,
    "verbose": -1,
}

gbm = lgb.train(
    params,
    lgb_train,
    valid_sets=lgb_valid,
    callbacks=[lgb.log_evaluation(50), lgb.early_stopping(8)]
)



Training until validation scores don't improve for 8 rounds
[50]	valid_0's auc: 0.715839
[100]	valid_0's auc: 0.728213
[150]	valid_0's auc: 0.733725
[200]	valid_0's auc: 0.737732
[250]	valid_0's auc: 0.740725
[300]	valid_0's auc: 0.742567
[350]	valid_0's auc: 0.744519
[400]	valid_0's auc: 0.746128
[450]	valid_0's auc: 0.747383
[500]	valid_0's auc: 0.748473
[550]	valid_0's auc: 0.749145
[600]	valid_0's auc: 0.749833
[650]	valid_0's auc: 0.750607
[700]	valid_0's auc: 0.751741
[750]	valid_0's auc: 0.752269
[800]	valid_0's auc: 0.752785
[850]	valid_0's auc: 0.753432
Early stopping, best iteration is:
[875]	valid_0's auc: 0.753813


In [87]:
for base, X in [(base_train, X_train), (base_valid, X_valid), (base_test, X_test)]:
    # y_pred = gbm.predict(X, num_iteration=gbm.best_iteration)
    y_pred = gbm.predict(X, num_iteration=gbm.best_iteration)
    base["pred"] = y_pred
del base, X
gc.collect()

25

In [88]:
# help(gbm.predict)

In [89]:
# NOTE, for F1 score need to convert into binary output from probabilities
auc_train = roc_auc_score(base_train["target"], base_train["pred"])
auc_valid = roc_auc_score(base_valid["target"], base_valid["pred"])
auc_test = roc_auc_score(base_test["target"], base_test["pred"])

threshold = 0.5

eval_df = pd.DataFrame(index=['train', 'valid', 'test'])
for name, df in (('train', base_train), ('valid', base_valid), ('test', base_test)):
    eval_df.loc[name, 'auc'] = roc_auc_score(df["target"], df["pred"])
    eval_df.loc[name,'f1'] = f1_score(df["target"], np.where(df["pred"] > threshold, 1, 0))
    eval_df.loc[name,'precision'] = precision_score(df["target"], np.where(df["pred"] > threshold, 1, 0))
    eval_df.loc[name,'recall'] = recall_score(df["target"], np.where(df["pred"] > threshold, 1, 0))

eval_df

Unnamed: 0,auc,f1,precision,recall
train,0.769118,0.125179,0.068752,0.698254
valid,0.753813,0.118632,0.065037,0.674327
test,0.750145,0.120891,0.06642,0.672004


In [90]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    gini_in_time = base.loc[:, ["WEEK_NUM", "target", "score"]]\
        .sort_values("WEEK_NUM")\
        .groupby("WEEK_NUM")[["target", "score"]]\
        .apply(lambda x: 2*roc_auc_score(x["target"], x["score"])-1).tolist()
    
    x = np.arange(len(gini_in_time))
    y = gini_in_time
    a, b = np.polyfit(x, y, 1)
    y_hat = a*x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_in_time)
    return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

stability_score_train = gini_stability(base_train)
stability_score_valid = gini_stability(base_valid)
stability_score_test = gini_stability(base_test)

print(f'The stability score on the train set is: {stability_score_train}') 
print(f'The stability score on the valid set is: {stability_score_valid}') 
print(f'The stability score on the test set is: {stability_score_test}')  

The stability score on the train set is: 0.4963348065611609
The stability score on the valid set is: 0.47224260766376086
The stability score on the test set is: 0.45759281450065276
