# Objective

Many thanks to @ragnar123 for his notebook and to @raddar for his integer dataset (which I further processed).
The notebook has been modified to run successfully within the memory restrictions of a Kaggle kernel by training only one fold per run, with the models saved for prediction in a separate notebook.

# Training & Inference

In [None]:
# ====================================================
# Library
# ====================================================
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
import joblib
import itertools
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from itertools import combinations

# ====================================================
# Configurations
# ====================================================
class CFG:
    """Notebook-wide settings read by the data-loading and training code."""

    # Name of the label column in the training frame.
    target = 'target'
    # Number of stratified cross-validation folds.
    n_folds = 5
    # Seed shared by the RNG seeding, the fold split and the LightGBM params.
    seed = 39
    # Directory prefix of the feature parquet files (used via string
    # concatenation, so the trailing slash matters).
    input_dir = '../input/amex-compress/'

# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    """Seed the global Python and NumPy RNGs and export PYTHONHASHSEED.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed``; its
            string form is stored in the ``PYTHONHASHSEED`` environment
            variable.
    """
    # Seed the stdlib generator first, then NumPy's global generator.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ['PYTHONHASHSEED'] = f'{seed}'
    


# ====================================================
# Read data
# ====================================================
def read_data():
    """Load the first fifth of the training features and the full test features.

    Returns:
        A ``(train_section, test)`` tuple: the first of the five sections
        produced by ``np.array_split`` on the training frame, and the test
        frame read from ``CFG.input_dir``.
    """
    train_path = CFG.input_dir + 'train_fe_compress.parquet'
    test_path = CFG.input_dir + 'test_fe_compress.parquet'

    # The full training frame is never bound to a name, so only the kept
    # section stays referenced once this statement finishes.
    first_section = np.array_split(pd.read_parquet(train_path), 5)[0]
    # Collect garbage before the test frame is loaded.
    gc.collect()
    return first_section, pd.read_parquet(test_path)

# ====================================================
# Amex metric
# ====================================================
def amex_metric(y_true, y_pred):
    """Return the mean of a normalized weighted Gini and a top-4% capture rate.

    Rows whose target equals 0 carry weight 20; all other rows carry weight 1.

    Args:
        y_true: Sequence of target values (one per row).
        y_pred: Sequence of predicted scores, aligned with ``y_true``.

    Returns:
        ``0.5 * (gini_by_prediction / gini_by_target + top_four)``.
    """
    # Column 0 holds the target, column 1 the prediction.
    pairs = np.transpose(np.array([y_true, y_pred]))

    def ranked_by(column):
        # Rows ordered by the chosen column, largest value first.
        return pairs[pairs[:, column].argsort()[::-1]]

    def weighted_gini(column):
        ranked = ranked_by(column)
        target = ranked[:, 0]
        row_weight = np.where(target == 0, 20, 1)
        random_curve = np.cumsum(row_weight / np.sum(row_weight))
        weighted_pos = target * row_weight
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - random_curve) * row_weight)

    # Share of all positives found in the highest-scored rows whose
    # cumulative weight stays within 4% of the total weight.
    by_pred = ranked_by(1)
    weights = np.where(by_pred[:, 0] == 0, 20, 1)
    budget = int(0.04 * np.sum(weights))
    head = by_pred[np.cumsum(weights) <= budget]
    top_four = np.sum(head[:, 0]) / np.sum(by_pred[:, 0])

    # Gini of the prediction ordering, normalized by the Gini obtained
    # when ordering by the target itself.
    gini_pred = weighted_gini(1)
    gini_true = weighted_gini(0)
    return 0.5 * (gini_pred / gini_true + top_four)

# ====================================================
# LGBM amex metric
# ====================================================
def lgb_amex_metric(y_pred, y_true):
    """Adapt ``amex_metric`` to the tuple form used as ``feval`` in ``lgb.train``.

    Args:
        y_pred: Predicted scores.
        y_true: Object exposing ``get_label()`` that returns the targets
            (despite the name, this is not the raw label array).

    Returns:
        ``('amex_metric', score, True)``; the trailing ``True`` is the
        higher-is-better flag.
    """
    labels = y_true.get_label()
    score = amex_metric(labels, y_pred)
    return 'amex_metric', score, True

In [None]:
# Fix the RNG seeds before any data is loaded or any model is trained.
seed_everything(CFG.seed)
# Disabled all-in-one pipeline, kept for reference only.
# NOTE(review): train_and_evaluate is not defined in the visible part of this
# notebook, so the second line would fail if simply uncommented.
#train, test = read_data()
#train_and_evaluate(train, test)

In [None]:
# ====================================================
# Read train only
# ====================================================
def read_train():
    """Load and return the complete training feature frame from ``CFG.input_dir``."""
    return pd.read_parquet(CFG.input_dir + 'train_fe_compress.parquet')
# ====================================================
#  Incremental Training
# ====================================================
def train_model(train, x):
    """Engineer features on ``train`` and fit LightGBM on fold ``x`` only.

    Only one fold is trained per call, so each of the ``CFG.n_folds`` folds
    can be produced by a separate run. The fitted model is written to
    ``./lgbm_fold{fold}_seed{CFG.seed}.pkl`` and the fold's validation score
    is printed.

    Args:
        train: Training DataFrame containing ``customer_ID``, the
            ``CFG.target`` column and the feature columns. It is modified
            in place: ``*_round2`` and ``*_last_mean_diff`` columns are
            added and float32/float64 columns are cast to float16.
        x: Zero-based index of the fold to train.

    Raises:
        ValueError: If ``x`` is not a valid fold index. Previously such a
            value ran the whole feature engineering and then silently
            trained nothing.
    """
    # Fail fast, before the expensive feature engineering below.
    if x not in range(CFG.n_folds):
        raise ValueError(
            f'x must be a fold index between 0 and {CFG.n_folds - 1}, got {x!r}'
        )

    # Categorical columns passed to LightGBM (the "_last" variant of each).
    cat_features = [
        f"{name}_last"
        for name in (
            "B_30",
            "B_38",
            "D_114",
            "D_116",
            "D_117",
            "D_120",
            "D_126",
            "D_63",
            "D_64",
            "D_66",
            "D_68",
        )
    ]

    def float_columns():
        # Names of the columns currently stored as float32 or float64.
        dtypes = train.dtypes
        return list(dtypes[(dtypes == 'float32') | (dtypes == 'float64')].index)

    # Add a 2-decimal rounded copy of every float column whose name
    # contains 'last'.
    for col in [c for c in float_columns() if 'last' in c]:
        train[col + '_round2'] = train[col].round(2)

    # Add "<base>_last - <base>_mean" wherever both columns exist. Only
    # names that really end in '_last' are considered, so stripping the
    # suffix always yields the intended base name.
    bases = [
        c[:-len('_last')]
        for c in train.columns
        if c.endswith('_last') and 'round' not in c
    ]
    for base in bases:
        try:
            diff = train[f'{base}_last'] - train[f'{base}_mean']
        except (KeyError, TypeError):
            # No matching "_mean" column, or dtypes that cannot be
            # subtracted: skip this feature (best-effort), but let any
            # other error (e.g. MemoryError, KeyboardInterrupt) propagate.
            continue
        train[f'{base}_last_mean_diff'] = diff

    # Downcast every float32/float64 column to float16 to reduce memory use.
    for col in tqdm(float_columns()):
        train[col] = train[col].astype(np.float16)

    # Everything except the ID and the label is a model feature.
    features = [col for col in train.columns if col not in ['customer_ID', CFG.target]]
    params = {
        'objective': 'binary',
        'metric': "binary_logloss",
        'boosting': 'dart',
        'seed': CFG.seed,
        'num_leaves': 100,
        'learning_rate': 0.01,
        'feature_fraction': 0.20,
        'bagging_freq': 10,
        'bagging_fraction': 0.50,
        'n_jobs': -1,
        'lambda_l2': 2,
        'min_data_in_leaf': 40,
        'force_col_wise': True,
    }

    kfold = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train[CFG.target])):
        if fold != x:
            continue

        print(' ')
        print('-'*50)
        print(f'Training fold {fold} with {len(features)} features...')

        x_train, x_val = train[features].iloc[trn_ind], train[features].iloc[val_ind]
        y_train, y_val = train[CFG.target].iloc[trn_ind], train[CFG.target].iloc[val_ind]

        lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=cat_features)
        lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature=cat_features)

        # NOTE(review): LightGBM reports that early stopping is not
        # available in dart mode, so early_stopping_rounds is expected to
        # have no effect and the model saved below is the final booster,
        # not a "best iteration" -- confirm against the installed version.
        # NOTE(review): early_stopping_rounds / verbose_eval appear to have
        # been removed from lgb.train in LightGBM 4.0 in favour of
        # callbacks; this call targets the 3.x API -- confirm before
        # upgrading.
        model = lgb.train(
            params=params,
            train_set=lgb_train,
            num_boost_round=10500,
            valid_sets=[lgb_train, lgb_valid],
            early_stopping_rounds=100,
            verbose_eval=500,
            feval=lgb_amex_metric,
        )
        # Persist the trained booster for the separate inference notebook.
        joblib.dump(model, f'./lgbm_fold{fold}_seed{CFG.seed}.pkl')

        # Score the held-out part of this fold.
        val_pred = model.predict(x_val)
        score = amex_metric(y_val, val_pred)
        print(f'Our fold {fold} CV score is {score}')

        del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
        gc.collect()
        # The requested fold is done; the remaining splits are not needed.
        break
    
# ====================================================
#  Helper
# ====================================================

def helper(x):
    """Read the full training frame and train the fold with index ``x`` on it."""
    train_model(read_train(), x)

In [None]:
# Run this notebook once per fold (CFG.n_folds runs in total), passing the
# zero-based index of the fold to train each time; this run trains fold 4.
helper(4)