In [None]:
import pandas as pd
import numpy as np
import gc
import os
import dill as pickle
import lightgbm as lgb
from sklearn.model_selection import train_test_split

In [None]:
def learning_rate_02_decay_power_099(current_iter):
    """Return the learning rate for boosting iteration `current_iter`.

    The rate starts at 0.2, is multiplied by 0.99 for every iteration and
    is never allowed to fall below 1e-4.
    """
    floor = 1e-4
    decayed = .2 * np.power(.99, current_iter)
    if decayed > floor:
        return decayed
    return floor

In [None]:
class CFG:
    """Notebook-wide settings: run mode, input directories and random seed."""

    # Seed shared with the LightGBM parameters.
    seed = 42

    # True -> the training cell runs; False -> the inference cell runs.
    _TRAIN = True

    # Input directories, relative to the notebook's working directory.
    input_dir_default = '../input/amex-default-prediction'
    input_dir_parquet = '../input/amex-denoised-aggregated-features'
    input_dir_feather = '../input/amex-agg-dataset-feather'
    input_dir_pickle = '../input/amex-agg-data-pickle'
    
class LGBCFG:
    """LightGBM settings: hold-out share, number of rounds and the params dict."""

    # Fraction of rows given to the validation side of train_test_split.
    test_size = .2

    # Value passed to lgb.train as num_boost_round.
    boosting_rounds = 5000

    params = {
        # Task, built-in metric and booster type
        'objective': 'binary',
        'metric': "binary_logloss",
        'boosting': 'dart',
        'seed': CFG.seed,

        # Tree shape / histogram resolution
        'max_bin': 250,
        'num_leaves': 300,

        # NOTE(review): this entry lives inside the params dict; lgb.train
        # normally receives callbacks through its own `callbacks=` argument,
        # so confirm the learning-rate schedule is really applied.
        'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_02_decay_power_099)],

        # Column / row subsampling
        'feature_fraction': .5,
        'bagging_freq': 10,
        'bagging_fraction': .7,

        'n_jobs': -1,

        # Regularisation
        'lambda_l2': 2,
        'min_data_in_leaf': 40,
    }
    
def import_df(train:bool = True, columns:list = None):
    """Load the aggregated train or test feature table from feather.

    Parameters
    ----------
    train : bool
        True  -> read 'train_agg_nonoise.ftr' and drop every column that has
                 fewer than 80% non-missing values.
        False -> read 'test_agg_nonoise.ftr' and keep only `columns`.
    columns : list, optional
        Columns to keep in the test table. When omitted, the notebook-level
        variable `columns` is used (the previous, implicit behaviour).

    Returns
    -------
    pd.DataFrame

    Raises
    ------
    NameError
        If `train` is False and no column list is available.
    """
    if train:
        df = pd.read_feather(os.path.join(CFG.input_dir_feather, 'train_agg_nonoise.ftr'))
        # thresh = minimum number of non-NA values a column needs to be kept
        return df.dropna(axis=1, thresh=int(.8 * len(df)))

    if columns is None:
        # Resolve the column list before the (expensive) read so a missing
        # list fails immediately and with an actionable message.
        try:
            columns = globals()['columns']
        except KeyError:
            raise NameError(
                "import_df(train=False) needs the list of columns to keep: pass "
                "`columns=...` or define a notebook-level `columns` variable first"
            ) from None

    df = pd.read_feather(os.path.join(CFG.input_dir_feather, 'test_agg_nonoise.ftr'))
    return df.loc[:, columns]

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of a dataframe in place to reduce memory usage.

    * object / string columns are converted to 'category'
    * signed and unsigned integer columns get the smallest integer type of
      the same signedness (up to 32 bit) whose range contains the column's
      min and max (limits inclusive)
    * float columns get float16 or float32 when min and max lie inside that
      type's finite range (the check is on range only, not on precision)
    * every other column (bool, datetime, categorical, other extension
      dtypes) and every column that fits none of the candidates keeps its
      dtype

    The frame is modified in place and also returned. Memory usage before
    and after is printed.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_candidates = (np.int8, np.int16, np.int32)
    unsigned_candidates = (np.uint8, np.uint16, np.uint32)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (categorical, nullable ints, ...) have no numpy
        # range to compare against and may not even support min()/max().
        if not isinstance(col_type, np.dtype):
            continue

        if np.issubdtype(col_type, np.signedinteger):
            candidates, limits = signed_candidates, np.iinfo
        elif np.issubdtype(col_type, np.unsignedinteger):
            candidates, limits = unsigned_candidates, np.iinfo
        elif np.issubdtype(col_type, np.floating):
            candidates, limits = float_candidates, np.finfo
        else:
            # bool, datetime, timedelta, complex: nothing sensible to downcast to
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            info = limits(candidate)
            # NaN / inf compare False here, so such columns keep their dtype
            if c_min >= info.min and c_max <= info.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
def preprocess_and_train(df:pd.DataFrame = None, use_gpu:bool = False, drop_first:bool = True, ram_reduc:bool = True):
    """Engineer features on the aggregated training table and fit a LightGBM model.

    Parameters
    ----------
    df : pd.DataFrame
        Training table containing 'TARGET', 'customer_ID' and the '*_last'
        columns referenced below. The last/first ratio columns are added to
        the passed frame in place.
    use_gpu : bool
        Add device='gpu' to the LightGBM parameters of this run.
    drop_first : bool
        Drop every column whose name matches 'first' once the last/first
        ratios have been computed.
    ram_reduc : bool
        Add histogram_pool_size=1024 to the LightGBM parameters of this run.

    Returns
    -------
    lgb.Booster
        The trained model.
    """
    sep_line = '-'*20

    print(f'[Info] Define categorical features\n{sep_line}')
    cat_features = [
        f'{cf}_last'
        for cf in ['B_30','B_38','D_114','D_116','D_117','D_120','D_126','D_63','D_64','D_66','D_68']
    ]

    print(f'[Info] Calculate additional features\n{sep_line}')
    # Iterate over a snapshot of the column names because columns are added inside the loop.
    for col in list(df.columns):
        first_col = col.replace('last', 'first')
        if 'last' in col and first_col in df:
            df[col + '_lag_div'] = df[col] / df[first_col]
    if drop_first:
        print('[Info] Drop "First"-columns')
        # drop() returns a new frame, so the assignments below do not write into a slice
        df = df.drop(columns=list(df.filter(regex='first')))

    df["c_PD_239"]=df["D_39_last"]/(df["P_2_last"]*(-1)+0.0001)
    df["c_PB_29"]=df["P_2_last"]*(-1)/(df["B_9_last"]*(1)+0.0001)
    df["c_PR_21"]=df["P_2_last"]*(-1)/(df["R_1_last"]+0.0001)

    df["c_BBBB"]=(df["B_9_last"]+0.001)/(df["B_23_last"]+df["B_3_last"]+0.0001)
    df["c_BBBB1"]=(df["B_33_last"]*(-1))+(df["B_18_last"]*(-1)+df["S_25_last"]*(1)+0.0001)
    df["c_BBBB2"]=(df["B_19_last"]+df["B_20_last"]+df["B_4_last"]+0.0001)

    df["c_RRR0"]=(df["R_3_last"]+0.001)/(df["R_2_last"]+df["R_4_last"]+0.0001)
    df["c_RRR1"]=(df["D_62_last"]+0.001)/(df["D_112_last"]+df["R_27_last"]+0.0001)

    df["c_PD_348"]=df["D_48_last"]/(df["P_3_last"]+0.0001)
    df["c_PD_355"]=df["D_55_last"]/(df["P_3_last"]+0.0001)

    df["c_PD_439"]=df["D_39_last"]/(df["P_4_last"]+0.0001)
    df["c_PB_49"]=df["B_9_last"]/(df["P_4_last"]+0.0001)
    df["c_PR_41"]=df["R_1_last"]/(df["P_4_last"]+0.0001)

    print(f'[Info] Total number of features: {len(df.columns)}')
    gc.collect()

    print('[Info] Optimize memory usage')
    df = reduce_mem_usage(df)

    print(f'[Info] Preprocess categorical features\n{sep_line}')
    # Only declare categorical columns that actually exist in the frame.
    cat_features = [cf for cf in cat_features if cf in df.columns]

    print(f'[Info] Train-test-split\n{sep_line}')
    y = df.pop('TARGET')
    df.pop('customer_ID')  # identifier, not a feature
    # Seeded so the hold-out set is the same on every run.
    X_train, X_test, y_train, y_test = train_test_split(
        df, y, test_size=LGBCFG.test_size, random_state=CFG.seed
    )

    print(f'[Info] Delete unused dfs\n{sep_line}')
    del df
    del y
    gc.collect()

    print(f'[Info] Define amex eval-metric\n{sep_line}')
    def amex_metric(y_true, y_pred):
        """Mean of a normalised weighted Gini and the top-4% capture rate.

        Rows with label 0 get weight 20, all others weight 1.
        """
        # Capture rate: share of all positives found among the highest
        # predictions that make up 4% of the total weight.
        labels = np.transpose(np.array([y_true, y_pred]))
        labels = labels[labels[:, 1].argsort()[::-1]]
        weights = np.where(labels[:,0]==0, 20, 1)
        cut_vals = labels[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
        top_four = np.sum(cut_vals[:,0]) / np.sum(labels[:,0])
        # gini[1]: ordering by prediction, gini[0]: ordering by the true label
        gini = [0,0]
        for i in [1,0]:
            labels = np.transpose(np.array([y_true, y_pred]))
            labels = labels[labels[:, i].argsort()[::-1]]
            weight = np.where(labels[:,0]==0, 20, 1)
            weight_random = np.cumsum(weight / np.sum(weight))
            total_pos = np.sum(labels[:, 0] *  weight)
            cum_pos_found = np.cumsum(labels[:, 0] * weight)
            lorentz = cum_pos_found / total_pos
            gini[i] = np.sum((lorentz - weight_random) * weight)
        return 0.5 * (gini[1]/gini[0] + top_four)

    def lgb_amex_metric(y_pred, y_true):
        """Adapter to LightGBM's feval signature (preds, dataset) -> (name, value, higher_is_better)."""
        y_true = y_true.get_label()
        return 'amex_metric', amex_metric(y_true, y_pred), True

    print(f'[Info] Build lgb datasets\n{sep_line}')
    lgb_train = lgb.Dataset(X_train, y_train, categorical_feature = cat_features)
    lgb_valid = lgb.Dataset(X_test, y_test, categorical_feature = cat_features)

    print(f'[Info] Delete unused dfs\n{sep_line}')
    del X_train
    del X_test
    del y_train
    del y_test
    gc.collect()

    print(f'[Info] Define lgb-model and train it\n{sep_line}')
    # Work on a copy: the per-run switches below must not leak into the
    # shared LGBCFG.params dict and affect later calls.
    params = dict(LGBCFG.params)
    # Callbacks are not a LightGBM parameter; inside `params` they are ignored.
    # Pull them out and hand them to lgb.train so the schedule is applied.
    callbacks = list(params.pop('callbacks', None) or [])
    if use_gpu:
        print('[Info] Train on GPU')
        params['device'] = 'gpu'
    if ram_reduc:
        print(f'[Info] Limit RAM-usage\n{sep_line}')
        params['histogram_pool_size'] = 1024

    # Log every 100 rounds. `verbose_eval` no longer exists in LightGBM 4;
    # older releases without `log_evaluation` still get the keyword.
    train_kwargs = {}
    if hasattr(lgb, 'log_evaluation'):
        callbacks.append(lgb.log_evaluation(period=100))
    else:
        train_kwargs['verbose_eval'] = 100

    m = lgb.train(
            params = params,
            train_set = lgb_train,
            num_boost_round = LGBCFG.boosting_rounds,
            valid_sets = [lgb_train, lgb_valid],
            feval = lgb_amex_metric,
            callbacks = callbacks or None,
            **train_kwargs
    )

    print('*'*20)
    print('[FINISH Info] TRAINING FINISHED')
    print(f'[Info] Returning trained model\n{sep_line}')
    return m

In [None]:
def preprocess_and_pred(df:pd.DataFrame = None, clf:object = None, drop_first:bool = True):
    """Engineer the training-time features on the test table and predict in chunks.

    Parameters
    ----------
    df : pd.DataFrame
        Test table containing 'customer_ID' and the '*_last' columns
        referenced below. The last/first ratio columns are added to the
        passed frame in place.
    clf : object
        Fitted model exposing `predict(frame)`.
    drop_first : bool
        Drop every column whose name matches 'first' once the last/first
        ratios have been computed.

    Returns
    -------
    pd.DataFrame
        Columns 'customer_ID' and 'prediction', in the row order of `df`.
    """
    sep_line = '-'*20
    n_chunks = 10

    print(f'[Info] Calculate additional features\n{sep_line}')
    # Iterate over a snapshot of the column names because columns are added inside the loop.
    for col in list(df.columns):
        first_col = col.replace('last', 'first')
        if 'last' in col and first_col in df:
            df[col + '_lag_div'] = df[col] / df[first_col]
    if drop_first:
        print('[Info] Drop "First"-columns')
        # drop() returns a new frame, so the assignments below do not write into a slice
        df = df.drop(columns=list(df.filter(regex='first')))

    df["c_PD_239"]=df["D_39_last"]/(df["P_2_last"]*(-1)+0.0001)
    df["c_PB_29"]=df["P_2_last"]*(-1)/(df["B_9_last"]*(1)+0.0001)
    df["c_PR_21"]=df["P_2_last"]*(-1)/(df["R_1_last"]+0.0001)

    df["c_BBBB"]=(df["B_9_last"]+0.001)/(df["B_23_last"]+df["B_3_last"]+0.0001)
    df["c_BBBB1"]=(df["B_33_last"]*(-1))+(df["B_18_last"]*(-1)+df["S_25_last"]*(1)+0.0001)
    df["c_BBBB2"]=(df["B_19_last"]+df["B_20_last"]+df["B_4_last"]+0.0001)

    df["c_RRR0"]=(df["R_3_last"]+0.001)/(df["R_2_last"]+df["R_4_last"]+0.0001)
    df["c_RRR1"]=(df["D_62_last"]+0.001)/(df["D_112_last"]+df["R_27_last"]+0.0001)

    df["c_PD_348"]=df["D_48_last"]/(df["P_3_last"]+0.0001)
    df["c_PD_355"]=df["D_55_last"]/(df["P_3_last"]+0.0001)

    df["c_PD_439"]=df["D_39_last"]/(df["P_4_last"]+0.0001)
    df["c_PB_49"]=df["B_9_last"]/(df["P_4_last"]+0.0001)
    df["c_PR_41"]=df["R_1_last"]/(df["P_4_last"]+0.0001)

    # +1 as in the original message (the test table has no 'TARGET' column)
    print(f'[Info] Total number of features: {len(df.columns)+1}')
    gc.collect()

    print('[Info] Optimize memory usage')
    df = reduce_mem_usage(df)

    print('[Info] Split dataframe into chunks')
    # Positional slicing instead of np.array_split(df, ...), which relies on
    # the deprecated DataFrame.swapaxes. Ceil division gives at most
    # `n_chunks` non-empty chunks.
    chunk_size = max(1, -(-len(df) // n_chunks))

    submission_dfs = []
    for i, start in enumerate(range(0, len(df), chunk_size)):
        sdf = df.iloc[start:start + chunk_size]
        customer_ID = sdf['customer_ID']

        print(f'[Info] Predict testset {i+1}')
        yhats = clf.predict(sdf.drop(columns='customer_ID'))
        submission_dfs.append(pd.DataFrame({'customer_ID':customer_ID, 'prediction':yhats}))

    del df
    gc.collect()

    if not submission_dfs:
        # Empty input: return an empty submission rather than failing in pd.concat
        return pd.DataFrame({'customer_ID': [], 'prediction': []})

    return pd.concat(submission_dfs, axis=0)

In [None]:
# Switch to training mode: the next cell only runs while this flag is True.
CFG._TRAIN = True

In [None]:
# Training: load the train table, remember its columns, fit and save the model.
if CFG._TRAIN:
    df = import_df(train=True)
    # Notebook-level list read later by import_df(train=False) to select the
    # same columns from the test table; 'TARGET' is removed from it here.
    columns = list(df.columns)
    columns.remove('TARGET')
    clf = preprocess_and_train(df=df, use_gpu=False, drop_first=True, ram_reduc=True)
    
    # Release the training frame before saving.
    del df
    gc.collect()

    # Written to the working directory; the inference cell reloads it from './amex-lgb'.
    clf.save_model('amex-lgb', num_iteration=clf.best_iteration)
    print('[Info] Model saved')

In [None]:
# Switch to inference mode: the next cell only runs while this flag is False.
CFG._TRAIN = False

In [None]:
# Inference: reload the saved booster and score the test table.
# import_df(train=False) relies on the `columns` list created by the training cell.
if not CFG._TRAIN:
    clf = lgb.Booster(model_file='./amex-lgb')
    sub_df = import_df(train=False)
    yhats = preprocess_and_pred(df=sub_df, clf=clf, drop_first=True)

In [None]:
# Write the submission frame produced by the inference cell (`yhats`) without the index column.
yhats.to_csv('submission.csv', index=False)