In [1]:
import warnings
warnings.simplefilter('ignore')

import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import gc,os,random
import time,datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold,GroupKFold

from utils import *
# root = args.root
# seed = args.seed
# remark = args.remark
# save_dir = args.save_dir
import lightgbm as lgb

## Config

In [2]:
# --- Paths and run bookkeeping ---------------------------------------------
root = '.'        # base directory for inputs (train_labels.csv, extra/ feature files)
save_dir = ''     # output sub-directory used by Lgb_train_and_predict when no run_id is given
remark = ''       # free-text note stored in the 'remark' column of experiment_log.csv

# --- Reproducibility --------------------------------------------------------
seed = 42         # copied into lgb_config['seed']

# --- Column names -----------------------------------------------------------
id_name = 'customer_ID'
label_name = 'target'

eps = 1e-3        # NOTE(review): not referenced anywhere in the visible cells

## Preprocessing

In [19]:
# Per the original note, LightGBM "did not like" the normalized values, so every column
# whose name contains 'target' is rescaled by floor-dividing it by 0.001.
# NOTE(review): the result overwrites the input file, so running this cell a second time
# floor-divides the already rescaled values again. Run it exactly once per freshly
# generated tmp_feature file.
tmp_feature_path = f'{root}/extra/tmp_feature'
zz = pd.read_parquet(tmp_feature_path)
target_cols = [name for name in zz.columns if 'target' in name]
for col in target_cols:
    zz[col] = zz[col] // 0.001
zz.to_parquet(tmp_feature_path, compression="gzip", index=False)

In [3]:
# Assemble the hand-made feature groups into one wide table (polars), keyed on customer_ID,
# then split it into a labelled training file and an unlabelled test file.
df = pl.read_parquet(f'{root}/extra/cat_feature')
for fn in ['num','diff','rank_num','last3_cat','last3_num','last3_diff', 'last6_num','ym_rank_num','tmp']:
    group = pl.read_parquet(f'{root}/extra/{fn}_feature')
    df = df.join(group, on="customer_ID", how="left")

train_y = pl.read_csv(f'{root}/train_labels.csv')
# A customer belongs to the training set when its id appears in train_labels.csv.
has_label = pl.col("customer_ID").is_in(train_y["customer_ID"])

# Customers without a label form the test feature file.
df.filter(~has_label).write_parquet(f'{root}/extra/lgb_main_test_feature', compression='gzip')

# Labelled customers get the label columns attached and form the training feature file.
df = df.filter(has_label).join(train_y, on="customer_ID", how="left")
df.write_parquet(f'{root}/extra/lgb_main_train_feature', compression='gzip')

## Training

In [3]:
# train_y =  pl.read_csv(f'{root}train_labels.csv')

In [3]:
def Metric(labels,preds):
    """Score predictions with the notebook's evaluation metric.

    Thin alias: forwards both arguments unchanged to `amex_metric_mod`
    and returns whatever it returns.
    """
    score = amex_metric_mod(labels, preds)
    return score

def amex_metric_mod(y_true, y_pred):
    """Return 0.5 * (normalized weighted Gini + top-4% capture rate).

    Parameters
    ----------
    y_true : array-like of 0/1 labels (one per row).
    y_pred : array-like of scores, same length as `y_true`; higher means "more positive".

    Rows with label 0 carry weight 20, all other rows weight 1.

    * top-4% capture: rows are ranked by descending score; the leading rows whose
      cumulative weight stays within int(4% of the total weight) are kept, and the
      share of all positives found among them is reported.
    * normalized Gini: the weighted Gini obtained when ranking by score, divided by
      the weighted Gini obtained when ranking by the true label.
    """
    # Column 0 holds the labels, column 1 the predictions.
    pairs = np.array([y_true, y_pred]).T

    def _ranked_by(column):
        # Rows ordered by descending value of `column` (argsort, then reversed).
        return pairs[pairs[:, column].argsort()[::-1]]

    def _row_weights(rows):
        # Negatives (label == 0) count 20x, everything else 1x.
        return np.where(rows[:, 0] == 0, 20, 1)

    def _weighted_gini(rows):
        w = _row_weights(rows)
        weight_share = np.cumsum(w / np.sum(w))    # cumulative share of total weight
        found = rows[:, 0] * w                     # weighted positives per row
        lorentz = np.cumsum(found) / np.sum(found) # cumulative share of positives
        return np.sum((lorentz - weight_share) * w)

    # --- top 4% capture rate ---
    by_score = _ranked_by(1)
    score_weights = _row_weights(by_score)
    within_cut = np.cumsum(score_weights) <= int(0.04 * np.sum(score_weights))
    captured = by_score[within_cut]
    top_four = np.sum(captured[:, 0]) / np.sum(by_score[:, 0])

    # --- Gini of the model ranking, normalized by the Gini of the perfect ranking ---
    gini_model = _weighted_gini(_ranked_by(1))
    gini_ideal = _weighted_gini(_ranked_by(0))

    return 0.5 * (gini_model / gini_ideal + top_four)

def Write_log(logFile,text,isPrint=False):
    """Append `text` followed by a newline to `logFile`.

    Parameters
    ----------
    logFile : object with a ``write`` method (e.g. an open text file).
    text    : the message to record.
    isPrint : when true, the message is echoed with ``print`` before it is written.

    Returns None.
    """
    if isPrint:
        print(text)
    # Message first, then the line terminator, as two separate writes.
    for piece in (text, '\n'):
        logFile.write(piece)
    return None

In [6]:
def Lgb_train_and_predict(train, test, config, gkf=False, aug=None, output_root='./output/', run_id=None,
                          submission_name='submission_1.csv.zip'):
    """Run K-fold LightGBM training and/or fold-averaged test inference.

    Relies on the module-level names `id_name`, `label_name`, `remark`, `save_dir`,
    `Metric` and `Write_log`.

    Parameters
    ----------
    train : pandas.DataFrame or None
        Training rows holding `id_name`, `label_name` and every column listed in
        config['feature_name']. Fold indices are used with ``.loc``, so the frame is
        expected to carry a default 0..n-1 index. ``None`` skips training.
    test : pandas.DataFrame or None
        Rows to score with the saved fold models. ``None`` skips inference.
    config : dict
        Needs the keys 'lgb_params', 'feature_name', 'rounds', 'verbose_eval',
        'early_stopping_rounds', 'folds' and 'seed'. config['lgb_params']['seed'] is
        overwritten with config['seed'] (the dict is mutated).
    gkf : bool
        When true, folds are stratified over unique ids so that all rows of one id land
        in the same fold; otherwise rows are stratified directly.
    aug : pandas.DataFrame or None
        Optional extra rows; for each fold the rows whose id is in the training part are
        appended to that fold's training data.
    output_root : str
        Root directory for outputs; expected to end with a path separator.
    run_id : str or None
        Name of the run directory under `output_root`. When empty, a timestamped id is
        generated for the experiment log and files go to ``output_root + save_dir``.
    submission_name : str
        File name of the zipped submission written during inference.

    Files written under the run directory: ``fold<k>.ckpt``, ``feature_importance.csv``,
    ``oof.csv``, ``train_<mean metric>.log`` and the submission; one row is appended to
    ``experiment_log.csv`` under `output_root`.

    Returns None.
    """
    if not run_id:
        run_id = 'run_lgb_' + datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
        # Wait until the timestamp-based id does not collide with an existing directory.
        while os.path.exists(output_root+run_id+'/'):
            time.sleep(1)
            run_id = 'run_lgb_' + datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
        # Auto-named runs write into the module-level `save_dir`, not into `run_id`.
        output_path = output_root + f'{save_dir}/'
    else:
        output_path = output_root + run_id + '/'
    # makedirs (unlike mkdir) also creates a missing `output_root`.
    os.makedirs(output_path, exist_ok=True)

    config['lgb_params']['seed'] = config['seed']
    features = config['feature_name']
    folds = config['folds']
    oof, sub = None, None

    if train is not None:
        params = config['lgb_params']
        rounds = config['rounds']
        verbose = config['verbose_eval']
        early_stopping_rounds = config['early_stopping_rounds']
        seed = config['seed']

        # Explicit copy + float column: avoids SettingWithCopyWarning and the
        # int -> float upcast when predictions are written into the column.
        oof = train[[id_name]].copy()
        oof[label_name] = 0.0

        skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
        if gkf:
            # Stratify over unique ids, then map each id fold back to row indices.
            tmp = train.loc[:,[id_name,label_name]].drop_duplicates(id_name).reset_index(drop=True)
            split = []
            for trn_index, val_index in skf.split(tmp,tmp[label_name]):
                trn_uids = tmp.loc[trn_index,id_name].values
                val_uids = tmp.loc[val_index,id_name].values
                split.append((train.index[train[id_name].isin(trn_uids)],
                              train.index[train[id_name].isin(val_uids)]))
            del tmp
            _ = gc.collect()
        else:
            split = skf.split(train,train[label_name])

        # Continue from a previous save only when that checkpoint actually exists;
        # otherwise train from scratch instead of failing on a missing file.
        warm_start_ckpt = output_path + 'foldX.ckpt'
        init_model = warm_start_ckpt if os.path.exists(warm_start_ckpt) else None

        all_valid_metric,feature_importance = [],[]
        # The context manager closes the log even when a fold raises.
        with open(output_path + '/train.log','w',buffering=1) as log:
            log.write(str(config)+'\n')
            Write_log(log,'init_model: %s'%init_model)

            for fold, (trn_index, val_index) in enumerate(split):
                evals_result_dic = {}

                trn_x = train.loc[trn_index,features]
                trn_y = train.loc[trn_index,label_name]
                if aug is not None:  # `if aug:` is ambiguous (ValueError) for a DataFrame
                    train_cids = train.loc[trn_index,id_name].values
                    train_aug = aug.loc[aug[id_name].isin(train_cids)]
                    # pd.concat replaces DataFrame/Series.append (removed in pandas 2).
                    trn_x = pd.concat([trn_x, train_aug[features]])
                    trn_y = pd.concat([trn_y, train_aug[label_name]])
                trn_data = lgb.Dataset(trn_x, label=trn_y)

                val_x = train.loc[val_index,features]
                val_y = train.loc[val_index,label_name]
                val_data = lgb.Dataset(val_x, label=val_y)

                model = lgb.train(params,
                    init_model = init_model,
                    train_set  = trn_data,
                    num_boost_round   = rounds,
                    valid_sets = [trn_data,val_data],
                    evals_result = evals_result_dic,
                    early_stopping_rounds = early_stopping_rounds,
                    verbose_eval = verbose
                )
                model.save_model(output_path + '/fold%s.ckpt'%fold)

                # Out-of-fold predictions replace the placeholder zeros for this fold's rows.
                valid_preds = model.predict(val_x, num_iteration=model.best_iteration)
                oof.loc[val_index,label_name] = valid_preds

                all_valid_metric.append(Metric(val_y,valid_preds))
                Write_log(log,'- fold%s valid metric: %.6f\n'%(fold,all_valid_metric[-1]))

                # 'gain': total gain of the splits using a feature; 'split': number of uses.
                feature_importance.append(pd.DataFrame({
                    'feature_name':model.feature_name(),
                    'importance_gain':model.feature_importance(importance_type='gain'),
                    'importance_split':model.feature_importance(importance_type='split'),
                }))

            # Average the per-fold importances and rank by gain.
            feature_importance_df = pd.concat(feature_importance)
            feature_importance_df = feature_importance_df.groupby(['feature_name']).mean().reset_index()
            feature_importance_df = feature_importance_df.sort_values(by=['importance_gain'],ascending=False)
            feature_importance_df.to_csv(output_path + '/feature_importance.csv',index=False)

            mean_valid_metric = np.mean(all_valid_metric)
            global_valid_metric = Metric(train[label_name].values,oof[label_name].values)
            Write_log(log,'all valid mean metric:%.6f, global valid metric:%.6f'%(mean_valid_metric,global_valid_metric))

            oof.to_csv(output_path + '/oof.csv',index=False)

        # Encode the score in the log file name; os.replace also overwrites on Windows.
        os.replace(output_path + '/train.log', output_path + '/train_%.6f.log'%mean_valid_metric)

        log_df = pd.DataFrame({'run_id':[run_id],'mean metric':[round(mean_valid_metric,6)],'global metric':[round(global_valid_metric,6)],'remark':[remark]})
        if not os.path.exists(output_root + '/experiment_log.csv'):
            log_df.to_csv(output_root + '/experiment_log.csv',index=False)
        else:
            log_df.to_csv(output_root + '/experiment_log.csv',index=False,header=False,mode='a')

    if test is not None:
        sub = test[[id_name]].copy()
        sub['prediction'] = 0.0
        test_x = test[features]  # selected once instead of once per fold
        for fold in range(folds):
            model = lgb.Booster(model_file=output_path + '/fold%s.ckpt'%fold)
            test_preds = model.predict(test_x, num_iteration=model.best_iteration)
            # Equal-weight average over the fold models.
            sub['prediction'] += (test_preds / folds)
        sub[[id_name,'prediction']].to_csv(output_path + '/' + submission_name, compression='zip',index=False)

In [5]:
# Settings consumed by Lgb_train_and_predict:
#   lgb_params            -> parameter dict passed to lgb.train
#   feature_name          -> model input columns (filled in right before each run)
#   rounds                -> num_boost_round
#   early_stopping_rounds -> passed through to lgb.train
#   verbose_eval          -> passed through to lgb.train
#   folds                 -> number of cross-validation folds / fold models to average
#   seed                  -> fold shuffling seed, also copied into lgb_params['seed']
# NOTE(review): boosting is 'dart'; LightGBM reportedly ignores early stopping in dart
# mode, so confirm that early_stopping_rounds has any effect here.
lgb_config = dict(
    lgb_params=dict(
        objective='binary',
        metric='binary_logloss',
        boosting='dart',
        max_depth=-1,
        num_leaves=64,
        learning_rate=0.0100,  # 0.035
        bagging_freq=5,
        bagging_fraction=0.75,
        feature_fraction=0.05,
        min_data_in_leaf=256,
        max_bin=63,
        min_data_in_bin=256,
        # min_sum_hessian_in_leaf=10,
        tree_learner='serial',
        boost_from_average='false',
        lambda_l1=0.1,
        lambda_l2=30,
        num_threads=11,  # cpu cores
        verbosity=0,  # 1
    ),
    feature_name=[],
    rounds=500,
    early_stopping_rounds=100,
    verbose_eval=50,
    folds=5,
    seed=seed,
)


In [6]:
# Load the labelled training feature table written to {root}/extra/lgb_main_train_feature
# by the feature-assembly cell in the Preprocessing section.
train = pd.read_parquet(f'{root}/extra/lgb_main_train_feature')

In [17]:
# Run "LGB_with_manual_feature": use every column except the id, the label, 'S_2'
# and any column whose name contains 'target'.
lgb_config['feature_name'] = [
    col for col in train.columns
    if col not in {id_name, label_name, 'S_2'} and 'target' not in col
]
test = None  # training only, no inference in this call
Lgb_train_and_predict(
    train, test, lgb_config,
    aug=None,
    output_root='./o_debug/',
    run_id='LGB_with_manual_feature',
)
# 0.1736 fold4

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[10050]	training's binary_logloss: 0.174014	valid_1's binary_logloss: 0.200838
[10100]	training's binary_logloss: 0.17385	valid_1's binary_logloss: 0.200843
[10150]	training's binary_logloss: 0.173746	valid_1's binary_logloss: 0.200839
[10200]	training's binary_logloss: 0.173673	valid_1's binary_logloss: 0.200845
[10250]	training's binary_logloss: 0.173499	valid_1's binary_logloss: 0.200856
[10300]	training's binary_logloss: 0.173352	valid_1's binary_logloss: 0.200864
[10350]	training's binary_logloss: 0.173212	valid_1's binary_logloss: 0.200868
[10400]	training's binary_logloss: 0.173134	valid_1's binary_logloss: 0.200869
[10450]	training's binary_logloss: 0.173082	valid_1's binary_logloss: 0.200865
[10500]	training's binary_logloss: 0.172827	valid_1's binary_logloss: 0.20088
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you

In [7]:
# Run "LGB_with_manual_feature_and_series_oof": same exclusions (id, label, 'S_2') but
# columns whose name contains 'target' are kept as model inputs this time.
lgb_config['feature_name'] = [
    col for col in train.columns
    if col not in {id_name, label_name, 'S_2'}
]
test = None  # training only, no inference in this call
Lgb_train_and_predict(
    train, test, lgb_config,
    aug=None,
    output_root='./o_debug/',
    run_id='LGB_with_manual_feature_and_series_oof',
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[5550]	training's binary_logloss: 0.180955	valid_1's binary_logloss: 0.204425
[5600]	training's binary_logloss: 0.180759	valid_1's binary_logloss: 0.2044
[5650]	training's binary_logloss: 0.180625	valid_1's binary_logloss: 0.204378
[5700]	training's binary_logloss: 0.18054	valid_1's binary_logloss: 0.204381
[5750]	training's binary_logloss: 0.180306	valid_1's binary_logloss: 0.204343
[5800]	training's binary_logloss: 0.18012	valid_1's binary_logloss: 0.20432
[5850]	training's binary_logloss: 0.179942	valid_1's binary_logloss: 0.204289
[5900]	training's binary_logloss: 0.179847	valid_1's binary_logloss: 0.204288
[5950]	training's binary_logloss: 0.179779	valid_1's binary_logloss: 0.204272
[6000]	training's binary_logloss: 0.179466	valid_1's binary_logloss: 0.204232
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `for

In [8]:
# Display the run history; Lgb_train_and_predict appends one row per training run
# (run_id, mean metric, global metric, remark) to this file.
pd.read_csv('o_debug/experiment_log.csv')

Unnamed: 0,run_id,mean metric,global metric,remark
0,LGB_with_series_feature,0.731081,0.730968,
1,LGB_with_series_feature,0.731057,0.730957,
2,LGB_with_series_feature,0.731057,0.730957,
3,LGB_with_manual_feature_and_series_oof,0.812716,0.81252,
4,LGB_with_manual_feature,0.797596,0.797599,
5,LGB_with_manual_feature,0.841293,0.841101,
6,LGB_with_manual_feature,0.854025,0.853507,
7,LGB_with_manual_feature,0.859919,0.859501,
8,LGB_with_manual_feature_and_series_oof,0.847463,0.84762,


## Inference

In [7]:
# Load the unlabelled test feature table written to {root}/extra/lgb_main_test_feature
# by the feature-assembly cell in the Preprocessing section.
test = pd.read_parquet(f'{root}/extra/lgb_main_test_feature')

In [10]:
# First inference chunk: keep the first 462310 rows of the test features.
# NOTE(review): this rebinds `test`. Run either this cell or the "second half" cell
# below after each reload of `test`, never both in sequence, or the second slice
# comes back empty.
test = test.iloc[:462310]
_ = gc.collect()

In [8]:
# Second inference chunk: keep everything from row 462310 onwards.
# NOTE(review): this rebinds `test`. It must be applied to the freshly loaded frame,
# not to the output of the "first half" cell above.
test = test.iloc[462310:]
_ = gc.collect()

In [8]:
# Sanity check of the current chunk: row range, column count, dtypes and memory use.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 462311 entries, 462310 to 924620
Columns: 6386 entries, customer_ID to target13
dtypes: float32(6328), object(1), uint8(57)
memory usage: 10.9+ GB


In [9]:
# Score the current `test` chunk with the fold models saved by the
# "LGB_with_manual_feature" run (columns containing 'target' are excluded, as in training).
# NOTE(review): Lgb_train_and_predict writes its predictions to 'submission_1.csv.zip',
# while the Concat section also reads 'submission.csv.zip'. How the file for the other
# chunk gets that name is not shown in this notebook.
lgb_config['feature_name'] = [
    col for col in test.columns
    if col not in {id_name, label_name, 'S_2'} and 'target' not in col
]
train = None  # inference only, no training in this call
Lgb_train_and_predict(
    train, test, lgb_config,
    aug=None,
    output_root='./o_debug/',
    run_id='LGB_with_manual_feature',
)

In [10]:
# Score the current `test` chunk with the fold models saved by the
# "LGB_with_manual_feature_and_series_oof" run (columns containing 'target' are kept).
# NOTE(review): Lgb_train_and_predict writes its predictions to 'submission_1.csv.zip',
# while the Concat section also reads 'submission.csv.zip'. How the file for the other
# chunk gets that name is not shown in this notebook.
lgb_config['feature_name'] = [
    col for col in test.columns
    if col not in {id_name, label_name, 'S_2'}
]
train = None  # inference only, no training in this call
Lgb_train_and_predict(
    train, test, lgb_config,
    aug=None,
    output_root='./o_debug/',
    run_id='LGB_with_manual_feature_and_series_oof',
)

## Concat

In [11]:
# Stitch the two per-chunk prediction files of the "LGB_with_manual_feature" run
# into one submission, in chunk order.
sub, sub_1 = (
    pd.read_csv(part)
    for part in (
        './o_debug/LGB_with_manual_feature/submission.csv.zip',
        './o_debug/LGB_with_manual_feature/submission_1.csv.zip',
    )
)
tmp = pd.concat([sub, sub_1])
tmp.to_csv(
    './o_debug/LGB_with_manual_feature/submission_fnl.csv.zip',
    compression='zip',
    index=False,
)

In [12]:
# Stitch the two per-chunk prediction files of the "LGB_with_manual_feature_and_series_oof"
# run into one submission, in chunk order.
sub, sub_1 = (
    pd.read_csv(part)
    for part in (
        './o_debug/LGB_with_manual_feature_and_series_oof/submission.csv.zip',
        './o_debug/LGB_with_manual_feature_and_series_oof/submission_1.csv.zip',
    )
)
tmp = pd.concat([sub, sub_1])
tmp.to_csv(
    './o_debug/LGB_with_manual_feature_and_series_oof/submission_fnl.csv.zip',
    compression='zip',
    index=False,
)