In [None]:
!pip3 install catboost



In [None]:
from google.colab import drive
import re

# Mount Google Drive at /content/drive so the competition files can be read and written.
drive.mount('/content/drive')

base_dir ="drive/My Drive/Colab Notebooks/competition/202401/" # Relative Drive folder for this competition (CFG.base_dir holds the same string)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import sys
# ====================================================
# Library
# ====================================================
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path
import joblib
import pickle
import itertools
from tqdm.auto import tqdm

import torch
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GroupKFold
from sklearn.metrics import log_loss, roc_auc_score, matthews_corrcoef, f1_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import xgboost as xgb
from catboost import Pool, CatBoostRegressor, CatBoostClassifier

In [None]:
# ====================================================
# Configurations
# ====================================================
class CFG:
    """Notebook-wide settings: run identity, Drive locations, CV setup and model parameters."""

    # --- run identity (used in the names of saved models, OOF files and submissions) ---
    VER = 1
    AUTHOR = 'shisa07'
    COMPETITION = 'FUDA2'

    # --- locations on the mounted Drive ---
    base_dir = "drive/My Drive/Colab Notebooks/competition/202401/"
    DATA_PATH = Path('/content/drive/MyDrive/Colab Notebooks/competition/202401/')
    OOF_DATA_PATH = Path('/content/drive/MyDrive/Colab Notebooks/competition/202401/oof')
    MODEL_DATA_PATH = Path('/content/drive/MyDrive/Colab Notebooks/competition/202401/models')
    SUB_DATA_PATH = Path('/content/drive/MyDrive/Colab Notebooks/competition/202401/submission')

    # --- task definition ---
    target_col = 'MIS_Status'
    metric = 'f1_score'
    metric_maximize_flag = True  # a larger F1 is better

    # --- cross-validation and boosting schedule ---
    METHOD_LIST = ['lightgbm', 'xgboost', 'catboost']
    seed = 42
    n_folds = 7
    num_boost_round = 500
    early_stopping_round = 200
    verbose = 25

    # --- per-library parameters (all share the same learning rate and seed) ---
    classification_lgb_params = dict(
        objective='binary',
        metric='auc',
        learning_rate=0.05,
        seed=seed,
    )
    classification_xgb_params = dict(
        objective='binary:logistic',
        eval_metric='logloss',
        learning_rate=0.05,
        random_state=seed,
    )
    classification_cat_params = dict(
        learning_rate=0.05,
        iterations=num_boost_round,
        random_seed=seed,
    )

    # Blend weights applied to each library's predicted probability.
    model_weight_dict = dict(lightgbm=0.50, xgboost=0.10, catboost=0.40)
    # model_weight_dict = {'lightgbm': 0.40, 'xgboost': 0.3, 'catboost': 0.30}
    # model_weight_dict = {'lightgbm': 0.40, 'xgboost': 0.3, 'catboost': 0.30}


In [None]:
# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    """Seed Python's `random`, NumPy's global RNG, and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Seed the random generators once with the configured seed.
seed_everything(CFG.seed)

In [None]:
# ====================================================
# Metric
# ====================================================
# f1_score

# ====================================================
# LightGBM Metric
# ====================================================
def lgb_metric(y_pred, y_true):
    """Custom LightGBM evaluation function: macro F1 at a 0.5 cut-off.

    Args:
        y_pred: Predicted scores; values >= 0.5 are counted as class 1.
        y_true: Object exposing `get_label()` (the evaluated dataset).

    Returns:
        Tuple ('f1score', score, CFG.metric_maximize_flag).
    """
    labels = y_true.get_label()
    hard_labels = np.where(y_pred >= 0.5, 1, 0)
    score = f1_score(labels, hard_labels, average='macro')
    return 'f1score', score, CFG.metric_maximize_flag

# ====================================================
# XGBoost Metric
# ====================================================
def xgb_metric(y_pred, y_true):
    """Custom XGBoost evaluation function: macro F1 at a 0.5 cut-off.

    Args:
        y_pred: Predicted scores; values >= 0.5 are counted as class 1
            (so this assumes probabilities, not raw margins).
        y_true: Object exposing `get_label()` (the evaluated DMatrix).

    Returns:
        Tuple ('f1score', score).
    """
    labels = y_true.get_label()
    hard_labels = np.where(y_pred >= 0.5, 1, 0)
    score = f1_score(labels, hard_labels, average='macro')
    return 'f1score', score

In [None]:
# Load the raw train/test tables from CFG.DATA_PATH; the first CSV column becomes the index.
train_df_raw = pd.read_csv(CFG.DATA_PATH / 'train.csv', index_col=0)
test_df_raw = pd.read_csv(CFG.DATA_PATH / 'test.csv', index_col=0)

In [None]:
# df_majority = train_df_raw[train_df_raw.MIS_Status == 1]
# df_minority = train_df_raw[train_df_raw.MIS_Status == 0]
# df_majority_downsampled = df_majority.sample(n=len(df_minority), random_state=42)
# train_df_raw = pd.concat([df_majority_downsampled, df_minority])

In [None]:
def Preprocessing(input_df: pd.DataFrame, categorical_cols=None) -> pd.DataFrame:
    """Clean one raw loan table and derive the model features.

    The frame is processed on its own. Count encodings (``*_ce``) are computed
    from ``input_df`` itself, whereas the categorical codes (flags and
    year-month columns) are data-independent, so two frames passed through this
    function separately receive identical codes for identical values.

    Args:
        input_df: Raw frame. Money columns are expected as ``$1,234.00``-style
            strings and date columns as ``DD-Mon-YY`` strings.
        categorical_cols: Columns to integer-encode. Defaults to the
            notebook-level ``categorical_features`` list, which must then be
            defined before this function is called.

    Returns:
        A new frame with the engineered features; ``input_df`` is not modified.

    Raises:
        KeyError: If a required raw column (for example ``ApprovalFY``) is absent.
    """
    if categorical_cols is None:
        # Fall back to the notebook-level list (resolved at call time).
        categorical_cols = categorical_features

    base_year = 1950  # two-digit years are mapped to 1950-2049 below
    month_to_num = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4,
                    'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8,
                    'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}

    # Recession periods as inclusive 'YYYY-MM' bounds.
    recession_periods = [
        {'start': '1973-11', 'end': '1975-03'},
        {'start': '1980-01', 'end': '1980-07'},
        {'start': '1981-07', 'end': '1982-11'},
        {'start': '1990-07', 'end': '1991-03'},
        {'start': '2001-03', 'end': '2001-11'},
        {'start': '2007-12', 'end': '2009-06'},
    ]

    # Ordered (pattern, replacement) pairs applied to the upper-cased city name.
    # Patterns are regular expressions; all but PACIFIC are plain literals.
    city_rewrites = [
        ('FORT ', 'FT '),
        ('SAINT ', 'ST '),
        ('PK ', 'PARK '),
        ('TOWN OF ', ''),
        ('TOWNSHIP', ''),
        ('CITY OF INDUSTRY INDUSTRY', 'CITY OF INDUSTRY'),
        ('DEL REY OAKS', 'DEL REY'),
        ('GLEMDALE', 'GLENDALE'),
        ('NORTHRIDGE NORTH LOS ANGELES', 'NORTHRIDGE'),
        # The lookahead keeps an already-correct PACIFICA from becoming PACIFICAA.
        (r'PACIFIC(?!A)', 'PACIFICA'),
        ('PLEANSANTON', 'PLEASANTON'),
        ('RANCHO CORDOVA MILLS', 'RANCHO CORDOVA'),
        ('SIMI VALLEY SIMI', 'SIMI VALLEY'),
        ('COLORADO SRINGS', 'COLORADO SPRINGS'),
        ('JACKSONVILLE BEACH', 'JACKSONVILLE'),
        ('COEUR DALENE', 'COEUR D ALENE'),
        ('VILLIAGE', 'VILLAGE'),
        ('HALIFAX HALIFAX BEACH', 'HALIFAX'),
        ('NEWTON CENTER', 'NEWTON'),
        ('KANSAS CITY', 'KANSAS'),
        ('LAS CRUSES', 'LAS CRUCES'),
        ('N LAS VEGAS ', 'NORTH LAS VEGAS'),
        ('BAYSHORE', 'BAY SHORE'),
        ('MILWAUKIE', 'MILWAUKEE'),
        ('PORTLAMD', 'PORTLAND'),
        ('HUNTINGDON VALLEY', 'HUNTINGDON'),
        ('MC KEES ROCKS', 'MCKEES ROCKS'),
        ('E GREENWICH', 'EAST GREENWICH'),
        ('W COLUMBIA', 'WEST COLUMBIA'),
        ('TYLERR', 'TYLER'),
        ('SALT LAKE CITY', 'SALT LAKE'),
        ('TOMAHAWK', 'TOMAH'),
        ('CHARLESTON', 'CHARLES TOWN'),
    ]

    def deal_missing(input_df: pd.DataFrame) -> pd.DataFrame:
        """Record missingness, then fill the string columns with '[UNK]'."""
        output_df = input_df.copy()
        output_df["NullCount"] = output_df.isna().sum(axis=1)  # number of nulls per row
        output_df['NullFlag_DisbursementDate'] = output_df['DisbursementDate'].isna()
        for col in ['RevLineCr', 'LowDoc', 'BankState', 'DisbursementDate']:
            output_df[col] = input_df[col].fillna('[UNK]')
        return output_df

    def clean_money(input_df: pd.DataFrame) -> pd.DataFrame:
        """Turn '$1,234.00 '-style strings into floats."""
        output_df = input_df.copy()
        for col in ['DisbursementGross', 'GrAppv', 'SBA_Appv']:
            # Drop the first character, then thousands separators and spaces.
            stripped = input_df[col].str[1:].str.replace(',', '', regex=False).str.replace(' ', '', regex=False)
            output_df[col] = stripped.astype(float)
        return output_df

    def split_month_year(dates: pd.Series):
        """Split 'DD-Mon-YY' strings into (month number, four-digit year); NaN where absent."""
        # reindex guarantees three columns even if no row has all three parts.
        parts = dates.str.split('-', expand=True).reindex(columns=[0, 1, 2])
        month = parts[1].map(month_to_num)
        two_digit_year = pd.to_numeric(parts[2])
        # Two-digit years below 50 belong to the 2000s, the rest to the 1900s.
        year = two_digit_year + np.where(two_digit_year < 50, 2000, 1900)
        return month, year

    def year_month_code(year: pd.Series, month: pd.Series) -> pd.Series:
        """Encode (year, month) as months since January of base_year; -1 when missing."""
        code = (year - base_year) * 12 + (month - 1)
        return code.fillna(-1).astype(int)

    def bound_code(year_month: str) -> int:
        """Encode a 'YYYY-MM' bound on the same scale as year_month_code."""
        year, month = (int(part) for part in year_month.split('-'))
        return (year - base_year) * 12 + (month - 1)

    def make_features(input_df: pd.DataFrame) -> pd.DataFrame:
        """Build every engineered feature and drop the raw columns they replace."""
        output_df = input_df.copy()

        # Sector codes 31-33, 44-45 and 48-49 are reportedly the same sector each:
        # map 32/33 -> 31, 45 -> 44, 49 -> 48.
        code_dict = {32: 31, 33: 31, 45: 44, 49: 48}
        output_df["Sector"] = output_df["Sector"].replace(code_dict)

        # Revolving-credit column: keep 'Y' / 'N', everything else becomes missing.
        output_df['RevLineCr'] = output_df['RevLineCr'].where(output_df['RevLineCr'].isin(['Y', 'N']))

        # Classification of states into urban (1) and rural (2) based on general knowledge
        urban_states = ['CA', 'NY', 'TX', 'FL', 'IL', 'PA', 'OH', 'GA', 'NC', 'MI', 'NJ', 'VA', 'WA', 'AZ', 'MA', 'TN', 'IN', 'MO', 'MD', 'WI', 'MN', 'CO', 'AL', 'SC', 'LA', 'KY', 'OR', 'OK', 'CT', 'IA', 'MS', 'AR', 'NV', 'NM', 'UT', 'WV', 'NE', 'ID', 'HI', 'ME', 'NH', 'RI', 'MT', 'DE', 'SD', 'ND', 'AK', 'VT', 'WY']
        rural_states = ['MT', 'WY', 'ND', 'SD', 'AK', 'VT', 'MS', 'ID', 'WV', 'NE', 'AR', 'NM', 'ME', 'NH', 'HI']

        # Re-label rows whose UrbanRural is 0 from their state. Every rural state
        # also appears in urban_states, so rural must win or label 2 is never assigned.
        unknown = output_df['UrbanRural'] == 0
        in_rural = output_df['State'].isin(rural_states)
        in_urban = output_df['State'].isin(urban_states)
        output_df.loc[unknown & in_rural, 'UrbanRural'] = 2
        output_df.loc[unknown & ~in_rural & in_urban, 'UrbanRural'] = 1

        # City: strip special characters and unify case (e.g. LOS ANGELES / Los angeles).
        city = output_df['City'].str.replace(r'[^\w\s]', '', regex=True).str.upper()

        # Names carrying one of these markers are reduced to their first word.
        # na=False keeps missing cities missing instead of raising.
        patterns = ["CENSUS NAME", "RR NAME", "LOCAL NAME", "CORPORATE NAME", "PR NAME", "BOROUGH OF"]
        has_marker = city.str.contains('|'.join(patterns), regex=True, na=False)
        city = city.mask(has_marker, city.str.split(' ').str[0])

        for pattern, replacement in city_rewrites:
            city = city.str.replace(pattern, replacement, regex=True)
        output_df['City'] = city

        # Sanitise column names (spaces, commas, quotes and colons become underscores).
        output_df.columns = [col.replace(' ', '_').replace(',', '_').replace("'", '_').replace('"', '_').replace(':', '_') for col in output_df.columns]
        output_df['StateMatch'] = output_df['BankState'] == output_df['State']  # bank in the same state as the borrower?

        # Amount features, computed on the raw amounts before the log transform.
        output_df['SBA_GuaranteedPercentage'] = output_df['SBA_Appv'] / output_df['GrAppv']  # SBA-guaranteed share
        output_df['BankAppv'] = output_df['GrAppv'] - output_df['SBA_Appv']  # gross minus SBA amount (presumably the bank's share)
        output_df['DisbursementGross'] = np.log1p(output_df['DisbursementGross'])

        approval_month, approval_year = split_month_year(output_df['ApprovalDate'])
        disbursement_month, disbursement_year = split_month_year(output_df['DisbursementDate'])

        # The approval year is shifted to the fiscal year: October onwards counts
        # towards the next year. ApprovalYM labels each month with that fiscal year.
        fiscal_year = approval_year + (approval_month >= 10).astype(int)
        output_df['ApprovalYM'] = year_month_code(fiscal_year, approval_month)
        output_df['DisbursementYM'] = year_month_code(disbursement_year, disbursement_month)

        # Recession flag: tested on the CALENDAR approval month, not the
        # fiscal-year label, so October-December approvals are not shifted a year.
        approval_calendar = year_month_code(approval_year, approval_month)
        in_recession = pd.Series(False, index=output_df.index)
        for period in recession_periods:
            in_recession |= approval_calendar.between(bound_code(period['start']), bound_code(period['end']))
        output_df['RecessionFlag'] = in_recession

        # Drop the raw date columns.
        output_df = output_df.drop(columns=['DisbursementDate', 'ApprovalDate', 'ApprovalFY'])

        output_df['Term_NoEmp'] = output_df['Term'] * output_df['NoEmp']  # interaction
        output_df['RetainedJob-CreateJob'] = output_df['RetainedJob'] + output_df['CreateJob']

        # Count encoding. NOTE(review): counts come from this frame only, so
        # train and test are encoded with their own frequencies.
        for col in ['FranchiseCode', 'RevLineCr', 'LowDoc', 'UrbanRural', 'State', 'BankState', 'City', 'Sector']:
            count_dict = output_df[col].value_counts().to_dict()
            output_df[f'{col}_ce'] = output_df[col].map(count_dict)

        # Integer-encode the categorical columns without fitting on the data.
        for col in categorical_cols:
            column = output_df[col]
            if pd.api.types.is_bool_dtype(column):
                output_df[col] = column.astype(int)
            elif pd.api.types.is_integer_dtype(column):
                continue  # already an integer code
            else:
                # Fallback for other columns: codes depend on this frame's values,
                # so they are NOT comparable between separately processed frames.
                output_df[col] = LabelEncoder().fit_transform(column)

        # Raw categoricals are replaced by their count encodings.
        output_df = output_df.drop(columns=['FranchiseCode', 'RevLineCr', 'LowDoc', 'State', 'BankState', 'City', 'Sector'])
        # Dropped to reduce multicollinearity with the derived amount features.
        output_df = output_df.drop(columns=['GrAppv', 'SBA_Appv'])

        return output_df

    output_df = deal_missing(input_df)
    output_df = clean_money(output_df)
    output_df['NewExist'] = np.where(input_df['NewExist'] == 1, 1, 0)

    df_processed = make_features(output_df)

    return df_processed


In [None]:
# default_numerical_features = ['Term', 'NoEmp', 'CreateJob', 'RetainedJob', 'DisbursementGross', 'GrAppv', 'SBA_Appv', 'ApprovalFY']
# default_categorical_features = ['NewExist', 'FranchiseCode', 'RevLineCr', 'LowDoc', 'UrbanRural', 'State', 'BankState', 'City', 'Sector']
# add_numerical_features = ['FranchiseCode_count_encoding', 'RevLineCr_count_encoding', 'LowDoc_count_encoding', 'UrbanRural_count_encoding', 'State_count_encoding', 'BankState_count_encoding', 'City_count_encoding', 'Sector_count_encoding']
# numerical_features = add_numerical_features + default_numerical_features
# categorical_features = ['RevLineCr', 'LowDoc', 'UrbanRural', 'State', 'Sector']
# features = numerical_features + categorical_features


# default_numerical_features = ['Term', 'NoEmp', 'CreateJob', 'RetainedJob', 'DisbursementGross', 'GrAppv', 'SBA_Appv']
# default_numerical_features = ['Term', 'NoEmp', 'CreateJob', 'RetainedJob', 'DisbursementGross']
# default_categorical_features = ['NewExist', 'RevLineCr', 'LowDoc', 'UrbanRural', 'State', 'BankState', 'City', 'Sector']
# add_numerical_features = ['FranchiseCode_ce', 'RevLineCr_ce', 'LowDoc_ce', 'UrbanRural_ce', 'State_ce', 'BankState_ce', 'City_ce', 'Sector_ce']
# numerical_features = add_numerical_features + default_numerical_features
# categorical_features = []
# features = numerical_features + categorical_features




# default_numerical_features = ['Term', 'NoEmp', 'CreateJob', 'RetainedJob', 'DisbursementGross', 'GrAppv', 'SBA_Appv']
# Raw numeric columns fed to the models unchanged.
default_numerical_features = [
    'Term',
    'NoEmp',
    'CreateJob',
    'RetainedJob',
    'DisbursementGross',
]

# Engineered numeric columns: row-level aggregates first, then one count encoding per raw categorical.
add_numerical_features = [
    'NullCount',
    'SBA_GuaranteedPercentage',
    'BankAppv',
    'Term_NoEmp',
    'RetainedJob-CreateJob',
] + [
    f'{raw_col}_ce'
    for raw_col in ('FranchiseCode', 'RevLineCr', 'LowDoc', 'UrbanRural', 'State', 'BankState', 'City', 'Sector')
]

numerical_features = [*add_numerical_features, *default_numerical_features]

# Columns passed to the models as categorical features.
categorical_features = [
    'NewExist',
    'NullFlag_DisbursementDate',
    'StateMatch',
    'ApprovalYM',
    'DisbursementYM',
    'RecessionFlag',
]

# Full model input: numeric columns followed by categorical columns.
features = [*numerical_features, *categorical_features]


In [None]:
features

['NullCount',
 'SBA_GuaranteedPercentage',
 'BankAppv',
 'Term_NoEmp',
 'RetainedJob-CreateJob',
 'FranchiseCode_ce',
 'RevLineCr_ce',
 'LowDoc_ce',
 'UrbanRural_ce',
 'State_ce',
 'BankState_ce',
 'City_ce',
 'Sector_ce',
 'Term',
 'NoEmp',
 'CreateJob',
 'RetainedJob',
 'DisbursementGross',
 'NewExist',
 'NullFlag_DisbursementDate',
 'StateMatch',
 'ApprovalYM',
 'DisbursementYM',
 'RecessionFlag']

In [None]:
# Run the same feature pipeline on the train and test tables, each frame on its own.
train_df = Preprocessing(train_df_raw)
test_df = Preprocessing(test_df_raw)

In [None]:
# Count-encode with TRAIN frequencies. Preprocessing() drops most of these raw
# columns after building its own `<col>_ce` features, so only the columns that
# are still present in both frames are encoded; looking up a dropped column
# would raise KeyError.
for col in ['FranchiseCode', 'RevLineCr', 'LowDoc', 'UrbanRural', 'State', 'BankState', 'City', 'Sector']:
    if col not in train_df.columns or col not in test_df.columns:
        continue
    count_dict = dict(train_df[col].value_counts())
    train_df[f'{col}_count_encoding'] = train_df[col].map(count_dict)
    # Values never seen in train get a count of 1.
    test_df[f'{col}_count_encoding'] = test_df[col].map(count_dict).fillna(1).astype(int)

# Re-index the categorical columns to consecutive integers. The encoder is
# fitted on train and test together so that a value present only in test does
# not make `transform` raise ValueError.
for col in categorical_features:
    encoder = LabelEncoder()
    encoder.fit(pd.concat([train_df[col], test_df[col]], ignore_index=True))
    train_df[col] = encoder.transform(train_df[col])
    test_df[col] = encoder.transform(test_df[col])

In [None]:
train_df

Unnamed: 0,Term,NoEmp,NewExist,CreateJob,RetainedJob,MIS_Status,DisbursementGross,UrbanRural,NullCount,NullFlag_DisbursementDate,...,Term_NoEmp,RetainedJob-CreateJob,FranchiseCode_ce,RevLineCr_ce,LowDoc_ce,UrbanRural_ce,State_ce,BankState_ce,City_ce,Sector_ce
0,163,21,1,0,0,1,11.289794,1,0,0,...,3423,0,14033,27618.0,34313,35571,768,2382,324,9798
1,84,6,1,4,0,1,12.567241,1,0,0,...,504,4,26392,,34313,35571,527,453,8,1191
2,242,45,1,4,90,1,10.372991,1,0,0,...,10890,94,26392,27618.0,34313,35571,488,195,6,7337
3,237,4,1,0,0,1,12.341482,1,0,0,...,948,0,26392,27618.0,34313,35571,1147,2382,600,6053
4,184,0,1,0,0,1,13.171155,1,0,0,...,0,0,26392,27618.0,34313,35571,6893,6476,584,9798
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42302,283,14,1,0,0,1,11.289794,1,0,0,...,3962,0,14033,27618.0,34313,35571,2849,1307,614,9798
42303,53,2,1,0,0,1,8.517393,1,0,0,...,106,0,26392,7353.0,34313,35571,6893,2382,414,7337
42304,59,6,0,0,0,1,11.002117,1,0,0,...,354,0,14033,27618.0,34313,35571,1229,2785,167,7337
42305,295,18,1,0,8,1,12.591338,1,0,0,...,5310,8,26392,27618.0,34313,35571,1004,824,2,7337


In [None]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42307 entries, 0 to 42306
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Term                       42307 non-null  int64  
 1   NoEmp                      42307 non-null  int64  
 2   NewExist                   42307 non-null  int64  
 3   CreateJob                  42307 non-null  int64  
 4   RetainedJob                42307 non-null  int64  
 5   MIS_Status                 42307 non-null  int64  
 6   DisbursementGross          42307 non-null  float64
 7   UrbanRural                 42307 non-null  int64  
 8   NullCount                  42307 non-null  int64  
 9   NullFlag_DisbursementDate  42307 non-null  int64  
 10  StateMatch                 42307 non-null  int64  
 11  SBA_GuaranteedPercentage   42307 non-null  float64
 12  BankAppv                   42307 non-null  float64
 13  ApprovalYM                 42307 non-null  int

In [None]:
# Drop the names of the raw frames; only the processed frames are used from here on.
del train_df_raw,test_df_raw

In [None]:
# y=train_df['MIS_Status']
# train_df=train_df.drop('MIS_Status', axis=1)


# from imblearn.over_sampling import SMOTE

# smote = SMOTE(random_state=42)

# # SMOTEを使用してオーバーサンプリング
# X_resampled, y_resampled = smote.fit_resample(train_df, y)

In [None]:
def lightgbm_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list, categorical_features: list):
    """Fit one LightGBM booster on a fold and predict its validation rows.

    Args:
        x_train, y_train: Training features and labels.
        x_valid, y_valid: Validation features and labels (also used for early stopping).
        features: Accepted for signature parity with the other trainers; not used here.
        categorical_features: Columns declared as categorical to LightGBM.

    Returns:
        Tuple (model, valid_pred) with the booster and its predictions on x_valid.
    """
    train_set = lgb.Dataset(x_train, y_train, categorical_feature=categorical_features)
    valid_set = lgb.Dataset(x_valid, y_valid, categorical_feature=categorical_features)

    stopper = lgb.early_stopping(stopping_rounds=CFG.early_stopping_round,
                                 verbose=CFG.verbose)
    booster = lgb.train(
        params=CFG.classification_lgb_params,
        train_set=train_set,
        num_boost_round=CFG.num_boost_round,
        valid_sets=[train_set, valid_set],
        feval=lgb_metric,  # macro F1 is reported alongside the built-in metric
        callbacks=[stopper],
    )
    # Predict validation
    return booster, booster.predict(x_valid)
def xgboost_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list, categorical_features: list):
    """Fit one XGBoost booster on a fold and predict its validation rows.

    Args:
        x_train, y_train: Training features and labels.
        x_valid, y_valid: Validation features and labels (also used for early stopping).
        features: Accepted for signature parity with the other trainers; not used here.
        categorical_features: Accepted for signature parity; not used here.

    Returns:
        Tuple (model, valid_pred) with the booster and its predictions on x_valid.
    """
    xgb_train = xgb.DMatrix(data=x_train, label=y_train)
    xgb_valid = xgb.DMatrix(data=x_valid, label=y_valid)
    model = xgb.train(
                CFG.classification_xgb_params,
                dtrain = xgb_train,
                num_boost_round = CFG.num_boost_round,
                evals = [(xgb_train, 'train'), (xgb_valid, 'eval')],
                early_stopping_rounds = CFG.early_stopping_round,
                verbose_eval = CFG.verbose,
                # `custom_metric` (XGBoost >= 1.6) replaces the deprecated `feval`.
                # With a built-in objective it receives transformed predictions
                # (probabilities), which is what xgb_metric's 0.5 cut-off expects;
                # `feval` passed raw margins instead.
                custom_metric = xgb_metric,
                maximize = CFG.metric_maximize_flag,
            )
    # Predict validation
    valid_pred = model.predict(xgb.DMatrix(x_valid))
    return model, valid_pred
def catboost_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list, categorical_features: list):
    """Fit one CatBoost classifier on a fold and predict its validation rows.

    Args:
        x_train, y_train: Training features and labels.
        x_valid, y_valid: Validation features and labels (used as the eval set).
        features: Accepted for signature parity with the other trainers; not used here.
        categorical_features: Columns declared as categorical to CatBoost.

    Returns:
        Tuple (model, valid_pred) where valid_pred is the class-1 probability on x_valid.
    """
    train_pool = Pool(data=x_train, label=y_train, cat_features=categorical_features)
    valid_pool = Pool(data=x_valid, label=y_valid, cat_features=categorical_features)

    classifier = CatBoostClassifier(**CFG.classification_cat_params)
    fit_options = dict(
        eval_set=[valid_pool],
        early_stopping_rounds=CFG.early_stopping_round,
        verbose=CFG.verbose,
        use_best_model=True,
    )
    classifier.fit(train_pool, **fit_options)

    # Predict validation: keep the probability of the positive class only.
    positive_proba = classifier.predict_proba(x_valid)[:, 1]
    return classifier, positive_proba

def gradient_boosting_model_cv_training(method: str, train_df: pd.DataFrame, features: list, categorical_features: list):
    """Train `method` with shuffled K-fold CV and persist models and OOF predictions.

    For every fold the fitted model is pickled under CFG.MODEL_DATA_PATH; after
    the last fold the out-of-fold predictions are written to CFG.OOF_DATA_PATH
    and the macro F1 at a 0.5 cut-off is printed.

    Args:
        method: One of 'lightgbm', 'xgboost', 'catboost'.
        train_df: Processed training frame containing `features` and CFG.target_col.
        features: Model input columns.
        categorical_features: Columns treated as categorical by the trainers.

    Raises:
        ValueError: If `method` is not a supported library name.
    """
    trainers = {
        'lightgbm': lightgbm_training,
        'xgboost': xgboost_training,
        'catboost': catboost_training,
    }
    if method not in trainers:
        raise ValueError(f'unknown method: {method!r} (expected one of {sorted(trainers)})')
    train_one_fold = trainers[method]

    # Make sure the output folders exist before any expensive training starts.
    for out_dir in (CFG.MODEL_DATA_PATH, CFG.OOF_DATA_PATH):
        out_dir.mkdir(parents=True, exist_ok=True)

    # Arrays holding the out-of-fold prediction and the fold number of every row.
    oof_predictions = np.zeros(len(train_df))
    oof_fold = np.zeros(len(train_df))
    # kfold = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
    kfold = KFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)

    # For the stratified variant, pass both features and labels to .split:
    # for fold, (train_index, valid_index) in enumerate(kfold.split(train_df[features], train_df[CFG.target_col])):
    for fold, (train_index, valid_index) in enumerate(kfold.split(train_df)):

        print('-'*50)
        print(f'{method} training fold {fold+1}')

        x_train = train_df[features].iloc[train_index]
        y_train = train_df[CFG.target_col].iloc[train_index]
        x_valid = train_df[features].iloc[valid_index]
        y_valid = train_df[CFG.target_col].iloc[valid_index]

        model, valid_pred = train_one_fold(x_train, y_train, x_valid, y_valid, features, categorical_features)

        # Save the fold model; the context manager closes the file even on error.
        model_path = CFG.MODEL_DATA_PATH / f'{method}_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl'
        with open(model_path, 'wb') as model_file:
            pickle.dump(model, model_file)
        # Store this fold's predictions in the out-of-fold arrays.
        oof_predictions[valid_index] = valid_pred
        oof_fold[valid_index] = fold + 1
        del x_train, x_valid, y_train, y_valid, model, valid_pred
        gc.collect()

    # Out-of-fold metric.
    score = f1_score(train_df[CFG.target_col], oof_predictions >= 0.5, average='macro')
    print(f'{method} out of folds CV f1score is {score}')
    # Persist the out-of-fold predictions (row order matches train_df; the index is not written).
    oof_df = pd.DataFrame({CFG.target_col: train_df[CFG.target_col], f'{method}_prediction': oof_predictions, 'fold': oof_fold})
    oof_df.to_csv(CFG.OOF_DATA_PATH / f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv', index = False)

def Learning(input_df: pd.DataFrame, features: list, categorical_features: list):
    """Run the cross-validated training once for every library in CFG.METHOD_LIST."""
    for model_name in CFG.METHOD_LIST:
        gradient_boosting_model_cv_training(
            method=model_name,
            train_df=input_df,
            features=features,
            categorical_features=categorical_features,
        )

In [None]:
features

['NullCount',
 'SBA_GuaranteedPercentage',
 'BankAppv',
 'Term_NoEmp',
 'RetainedJob-CreateJob',
 'FranchiseCode_ce',
 'RevLineCr_ce',
 'LowDoc_ce',
 'UrbanRural_ce',
 'State_ce',
 'BankState_ce',
 'City_ce',
 'Sector_ce',
 'Term',
 'NoEmp',
 'CreateJob',
 'RetainedJob',
 'DisbursementGross',
 'NewExist',
 'NullFlag_DisbursementDate',
 'StateMatch',
 'ApprovalYM',
 'DisbursementYM',
 'RecessionFlag']

In [None]:
# Train every configured library with cross-validation on the processed training frame.
Learning(train_df, features, categorical_features)

--------------------------------------------------
lightgbm training fold 1
[LightGBM] [Info] Number of positive: 32336, number of negative: 3927
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006876 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2432
[LightGBM] [Info] Number of data points in the train set: 36263, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.891708 -> initscore=2.108305
[LightGBM] [Info] Start training from score 2.108305
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[52]	training's auc: 0.884537	training's f1score: 0.649669	valid_1's auc: 0.741416	valid_1's f1score: 0.600318
--------------------------------------------------
lightgbm training fold 2
[LightGBM] [Info] Number of positive: 32389, number of negative: 3874
[LightGBM] [Inf

Baseline out-of-fold CV F1 scores (macro average):
- lightgbm: 0.643594
- xgboost: 0.6399419
- catboost: 0.643645


In [None]:
def lightgbm_inference(x_test: pd.DataFrame):
    """Average the predictions of the saved LightGBM fold models on `x_test`.

    Args:
        x_test: Feature frame to score.

    Returns:
        Array with the mean prediction over CFG.n_folds models.
    """
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model_path = CFG.MODEL_DATA_PATH / f'lightgbm_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl'
        # NOTE: pickle can execute arbitrary code on load; only load model files written by this notebook.
        with open(model_path, 'rb') as model_file:
            model = pickle.load(model_file)
        # Predict
        test_pred += model.predict(x_test)
    return test_pred / CFG.n_folds
def xgboost_inference(x_test: pd.DataFrame):
    """Average the predictions of the saved XGBoost fold models on `x_test`.

    Args:
        x_test: Feature frame to score.

    Returns:
        Array with the mean prediction over CFG.n_folds models.
    """
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model_path = CFG.MODEL_DATA_PATH / f'xgboost_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl'
        # NOTE: pickle can execute arbitrary code on load; only load model files written by this notebook.
        with open(model_path, 'rb') as model_file:
            model = pickle.load(model_file)
        # Predict
        test_pred += model.predict(xgb.DMatrix(x_test))
    return test_pred / CFG.n_folds

def catboost_inference(x_test: pd.DataFrame):
    """Average the class-1 probabilities of the saved CatBoost fold models on `x_test`.

    Args:
        x_test: Feature frame to score.

    Returns:
        Array with the mean class-1 probability over CFG.n_folds models.
    """
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model_path = CFG.MODEL_DATA_PATH / f'catboost_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl'
        # NOTE: pickle can execute arbitrary code on load; only load model files written by this notebook.
        with open(model_path, 'rb') as model_file:
            model = pickle.load(model_file)
        # Predict
        test_pred += model.predict_proba(x_test)[:, 1]
    return test_pred / CFG.n_folds

def gradient_boosting_model_inference(method: str, test_df: pd.DataFrame, features: list, categorical_features: list):
    """Score `test_df[features]` with the saved fold models of one library.

    Args:
        method: One of 'lightgbm', 'xgboost', 'catboost'.
        test_df: Frame containing the `features` columns.
        features: Model input columns.
        categorical_features: Accepted for signature parity; not used here.

    Returns:
        Fold-averaged predictions for every row of `test_df`.

    Raises:
        ValueError: If `method` is not a supported library name.
    """
    x_test = test_df[features]
    if method == 'lightgbm':
        test_pred = lightgbm_inference(x_test)
    elif method == 'xgboost':
        test_pred = xgboost_inference(x_test)
    elif method == 'catboost':
        test_pred = catboost_inference(x_test)
    else:
        # Fail with a clear message instead of an UnboundLocalError on `test_pred`.
        raise ValueError(f'unknown method: {method!r}')
    return test_pred

def Predicting(input_df: pd.DataFrame, features: list, categorical_features: list):
    """Add per-library probabilities and their weighted blend to a copy of `input_df`.

    Returns:
        Copy of `input_df` with one `<method>_pred_prob` column per entry of
        CFG.METHOD_LIST and a `pred_prob` column holding the blend weighted by
        CFG.model_weight_dict.
    """
    output_df = input_df.copy()
    output_df['pred_prob'] = 0
    for method in CFG.METHOD_LIST:
        method_col = f'{method}_pred_prob'
        weight = CFG.model_weight_dict[method]
        output_df[method_col] = gradient_boosting_model_inference(method, input_df, features, categorical_features)
        output_df['pred_prob'] = output_df['pred_prob'] + weight * output_df[method_col]
    return output_df

In [None]:
# Score the test frame with every saved fold model and add the blended `pred_prob` column.
test_df = Predicting(test_df, features, categorical_features)

In [None]:
def Postprocessing(train_df: pd.DataFrame, test_df: pd.DataFrame) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Blend the OOF predictions, tune the decision threshold, and label the test rows.

    The saved out-of-fold predictions of every library in CFG.METHOD_LIST are
    blended with CFG.model_weight_dict into `train_df['pred_prob']`. The
    threshold in {0.000, 0.001, ..., 0.999} with the best macro F1 on that blend
    is then applied to `test_df['pred_prob']` to create `test_df['target']`.
    Both frames are modified in place and returned.

    Args:
        train_df: Training frame, in the same row order as the saved OOF files.
        test_df: Test frame that already has a `pred_prob` column.

    Returns:
        Tuple (train_df, test_df).

    Raises:
        ValueError: If an OOF file does not have one row per row of `train_df`.
    """
    train_df['pred_prob'] = 0.0
    for method in CFG.METHOD_LIST:
        oof_df = pd.read_csv(CFG.OOF_DATA_PATH / f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv')
        if len(oof_df) != len(train_df):
            raise ValueError(f'{method} OOF file has {len(oof_df)} rows, expected {len(train_df)}')
        # The OOF file is saved without its index, so combine by position;
        # index-aligned addition would yield NaN whenever train_df's index is not 0..n-1.
        train_df['pred_prob'] += CFG.model_weight_dict[method] * oof_df[f'{method}_prediction'].to_numpy()

    # Labels come from the last OOF file read.
    y_true = oof_df[CFG.target_col].to_numpy()
    blended = train_df['pred_prob'].to_numpy()
    best_score = 0
    best_v = 0
    for v in tqdm(np.arange(1000) / 1000):
        score = f1_score(y_true, blended >= v, average='macro')
        if score > best_score:
            best_score = score
            best_v = v
    print(best_score, best_v)
    test_df['target'] = np.where(test_df['pred_prob'] >= best_v, 1, 0)
    return train_df, test_df


In [None]:
# Tune the decision threshold on the blended OOF predictions and label the test rows with it.
train_df, test_df = Postprocessing(train_df, test_df)

  0%|          | 0/1000 [00:00<?, ?it/s]

0.6766397510095233 0.748


- Baseline: best blended OOF F1 score 0.6814 at threshold 0.736

In [None]:
# train_df.to_csv(CFG.DATA_PATH / f'train_prprocessed.csv', header=True)
# test_df.to_csv(CFG.DATA_PATH / f'test_prprocessed.csv', header=True)

In [None]:
# pred_dl = pd.read_csv(CFG.SUB_DATA_PATH / 'submission_dl.csv')

In [None]:
# Write the predicted labels (frame index plus `target`, no header row) to the submission folder.
test_df[['target']].to_csv(CFG.SUB_DATA_PATH / f'seed{CFG.seed}_ver{CFG.VER}_{CFG.AUTHOR}_submission.csv', header=False)

特徴量の重要度を確認する方法

In [None]:
# Load the fold-1 LightGBM model; the context manager closes the file after reading.
with open(CFG.MODEL_DATA_PATH / f'lightgbm_fold1_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
# Importance per feature, normalised so that the values sum to 1.
importance_df = pd.DataFrame(model.feature_importance(), index=features, columns=['importance'])
importance_df['importance'] = importance_df['importance'] / np.sum(importance_df['importance'])
importance_df.sort_values('importance', ascending=False)

Unnamed: 0,importance
ApprovalYM,0.319872
DisbursementYM,0.305769
Term_NoEmp,0.052564
LowDoc_ce,0.049359
NoEmp,0.028205
RevLineCr_ce,0.025641
CreateJob,0.023718
Sector_ce,0.022436
Term,0.021154
RetainedJob,0.020513


In [None]:
# Load the fold-5 LightGBM model; the context manager closes the file after reading.
with open(CFG.MODEL_DATA_PATH / f'lightgbm_fold5_seed{CFG.seed}_ver{CFG.VER}.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
# Importance per feature, normalised so that the values sum to 1.
importance_df = pd.DataFrame(model.feature_importance(), index=features, columns=['importance'])
importance_df['importance'] = importance_df['importance'] / np.sum(importance_df['importance'])
importance_df.sort_values('importance', ascending=False)

Unnamed: 0,importance
ApprovalYM,0.320755
DisbursementYM,0.313208
LowDoc_ce,0.046541
Term_NoEmp,0.041509
NoEmp,0.028931
Term,0.028302
RevLineCr_ce,0.027044
City_ce,0.027044
RetainedJob,0.022642
RetainedJob-CreateJob,0.021384
