In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

import optuna

from sklearn.model_selection import train_test_split
import sklearn.metrics

from xgboost import XGBClassifier

import cupy, cudf # GPU libraries
import matplotlib.pyplot as plt, gc, os

import gc
import warnings
warnings.filterwarnings('ignore')

In [2]:
def read_train_file(path = '', usecols = None):
    # LOAD DATAFRAME
    if usecols is not None: df = cudf.read_parquet(path, columns=usecols)
    else: df = cudf.read_parquet(path)
    # REDUCE DTYPE FOR CUSTOMER AND DATE
    df['customer_ID'] = df['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    df.S_2 = cudf.to_datetime( df.S_2 )
    # SORT BY CUSTOMER AND DATE (so agg('last') works correctly)
    #df = df.sort_values(['customer_ID','S_2'])
    #df = df.reset_index(drop=True)
    # FILL NAN
    df = df.fillna(0) 
    print('shape of data:', df.shape)
    
    return df

print('Reading train data...')
TRAIN_PATH = '../input/amex-data-integer-dtypes-parquet-format/train.parquet'
train = read_train_file(path = TRAIN_PATH)

Reading train data...
shape of data: (5531451, 190)


In [3]:
def process_and_feature_engineer(df):
    # FEATURE ENGINEERING FROM 
    # https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created
    all_cols = [c for c in list(df.columns) if c not in ['customer_ID','S_2']]
    cat_features = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
    num_features = [col for col in all_cols if col not in cat_features]

    test_num_agg = df.groupby("customer_ID")[num_features].agg(['mean', 'std', 'min', 'max', 'last'])
    test_num_agg.columns = ['_'.join(x) for x in test_num_agg.columns]

    test_cat_agg = df.groupby("customer_ID")[cat_features].agg(['count', 'last', 'nunique'])
    test_cat_agg.columns = ['_'.join(x) for x in test_cat_agg.columns]

    df = cudf.concat([test_num_agg, test_cat_agg], axis=1)
    del test_num_agg, test_cat_agg
    print('shape after engineering', df.shape )
    
    return df

train = process_and_feature_engineer(train)

shape after engineering (458913, 918)


In [4]:
# ADD TARGETS
targets = cudf.read_csv('../input/amex-default-prediction/train_labels.csv')
targets['customer_ID'] = targets['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
targets = targets.set_index('customer_ID')
train = train.merge(targets, left_index=True, right_index=True, how='left')
train.target = train.target.astype('int8')
del targets

# NEEDED TO MAKE CV DETERMINISTIC (cudf merge above randomly shuffles rows)
train = train.sort_index().reset_index()

# FEATURES
FEATURES = train.columns[1:-1]
print(f'There are {len(FEATURES)} features!')

There are 918 features!


In [5]:
train_pd = train.to_pandas()
del train
_ = gc.collect()

In [6]:
train_df, test_df = train_test_split(train_pd, test_size=0.25, stratify=train_pd['target'])
del train_pd
_ = gc.collect()

In [7]:
len(train_df),len(test_df)

(344184, 114729)

In [8]:
X_train = train_df.drop(['customer_ID', 'target'], axis=1)
X_test = test_df.drop(['customer_ID', 'target'], axis=1)

In [9]:
X_train

Unnamed: 0,P_2_mean,P_2_std,P_2_min,P_2_max,P_2_last,D_39_mean,D_39_std,D_39_min,D_39_max,D_39_last,...,D_63_nunique,D_64_count,D_64_last,D_64_nunique,D_66_count,D_66_last,D_66_nunique,D_68_count,D_68_last,D_68_nunique
266636,0.993001,0.023384,0.928278,1.009935,0.965443,5.461538,4.611858,0,16,16,...,1,13,0,1,13,-1,1,13,6,1
245443,0.477611,0.041451,0.385553,0.524959,0.524759,0.846154,2.303843,0,8,0,...,1,13,2,1,13,-1,1,13,6,1
35791,0.510347,0.093577,0.381381,0.595612,0.590560,6.875000,8.871101,0,20,18,...,2,8,3,2,8,-1,1,8,1,2
248343,0.846316,0.018792,0.803425,0.885623,0.803425,7.384615,6.970634,0,24,8,...,1,13,0,1,13,1,1,13,6,1
279305,0.745176,0.041000,0.671002,0.803394,0.728161,1.307692,4.150996,0,15,0,...,1,13,0,1,13,-1,1,13,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274558,0.868150,0.014132,0.849938,0.893494,0.866048,0.153846,0.375534,0,1,0,...,1,13,0,1,13,-1,1,13,6,1
133954,0.456201,0.041485,0.386068,0.503760,0.464785,0.000000,0.000000,0,0,0,...,2,13,3,1,13,-1,1,13,6,1
443708,0.663713,0.040872,0.587852,0.718465,0.718465,5.076923,6.291019,0,18,12,...,1,13,3,2,13,1,1,13,3,2
310320,0.719256,0.023928,0.689424,0.753959,0.692723,11.384615,14.268488,0,42,42,...,1,13,0,1,13,-1,1,13,6,1


In [10]:
y_train = train_df['target']
y_test = test_df['target']

In [11]:
y_train

266636    0
245443    0
35791     0
248343    0
279305    0
         ..
274558    0
133954    0
443708    0
310320    0
23931     1
Name: target, Length: 344184, dtype: int8

In [12]:
del train_df, test_df
_ = gc.collect()

In [13]:
# optuna

def objective(trial):
    
    param = {
        'booster':'gbtree',
        'tree_method':'gpu_hist', 
        "objective": "binary:logistic",
        'lambda': trial.suggest_loguniform(
            'lambda', 1e-3, 10.0
        ),
        'alpha': trial.suggest_loguniform(
            'alpha', 1e-3, 10.0
        ),
        'colsample_bytree': trial.suggest_float(
            'colsample_bytree', 0.5,1,step=0.1
        ),
        'subsample': trial.suggest_float(
            'subsample', 0.5,1,step=0.1
        ),
        'learning_rate': trial.suggest_float(
            'learning_rate', 0.001,0.05,step=0.001
        ),
        'n_estimators': trial.suggest_int(
            "n_estimators", 80,1000,10
        ),
        'max_depth': trial.suggest_int(
            'max_depth', 2,10,1
        ),
        'random_state': 99,
        'min_child_weight': trial.suggest_int(
            'min_child_weight', 1,256,1
        ),
    }
    
    model = XGBClassifier(**param, enable_categorical = True) 
    
    model.fit(X_train,y_train)
    
    preds = pd.DataFrame(model.predict(X_test))
    
    accuracy = sklearn.metrics.accuracy_score(pd.DataFrame(y_test.reset_index()['target']),preds)
    
    return accuracy


In [14]:
%%time
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials= 100)

[32m[I 2022-08-05 11:22:02,880][0m A new study created in memory with name: no-name-b11cdb4a-de3b-4083-97ac-cc2f7d339aff[0m
[32m[I 2022-08-05 11:22:46,470][0m Trial 0 finished with value: 0.9024047973921153 and parameters: {'lambda': 0.0024661416749729944, 'alpha': 3.104098599282743, 'colsample_bytree': 0.7, 'subsample': 0.9, 'learning_rate': 0.049, 'n_estimators': 290, 'max_depth': 8, 'min_child_weight': 101}. Best is trial 0 with value: 0.9024047973921153.[0m
[32m[I 2022-08-05 11:24:00,296][0m Trial 1 finished with value: 0.9007574370908837 and parameters: {'lambda': 0.009285449215382179, 'alpha': 9.582805283914348, 'colsample_bytree': 0.5, 'subsample': 0.5, 'learning_rate': 0.012, 'n_estimators': 840, 'max_depth': 6, 'min_child_weight': 143}. Best is trial 0 with value: 0.9024047973921153.[0m
[32m[I 2022-08-05 11:24:43,674][0m Trial 2 finished with value: 0.9003390598715233 and parameters: {'lambda': 0.0012640944492645165, 'alpha': 0.11476324991786277, 'colsample_bytree':

CPU times: user 2h 16min 18s, sys: 3min 23s, total: 2h 19min 42s
Wall time: 2h 51s


In [15]:
best_params = study.best_trial.params
"""
best_params = {"alpha":0.5420861968603762, "base_score":0.5, "booster1":'gbtree',
              "callbacks":None, "colsample_bylevel":1, "colsample_bynode":1,
              "colsample_bytree":0.7, "early_stopping_rounds":None,
              "enable_categorical":True, "eval_metric":None, "gamma":0, "gpu_id":0,
              "grow_policy":'depthwise', "importance_type":None,
              "interaction_constraints":'', "lambda": 0.6898903764598321,
              "learning_rate":0.030000000000000002, "max_bin":256,
              "max_cat_to_onehot":4, "max_delta_step":0, "max_depth":9, "max_leaves":0,
              "min_child_weight":72, "missing":np.nan, "monotone_constraints":'()',
              "n_estimators":1000, "n_jobs":0, "num_parallel_tree":1,
              "predictor":'auto', "random_state":0, "enable_categorical":True}
"""
best_params['tree_method'] = 'gpu_hist'
best_params['booster'] = 'gbtree'


In [16]:
print(best_params)

{'lambda': 8.212715002119015, 'alpha': 0.3588508007814575, 'colsample_bytree': 0.6, 'subsample': 0.6, 'learning_rate': 0.042, 'n_estimators': 1000, 'max_depth': 7, 'min_child_weight': 67, 'tree_method': 'gpu_hist', 'booster': 'gbtree'}


In [17]:
final_model = XGBClassifier(**best_params,enable_categorical = True)

In [18]:
final_model.fit(X_train,y_train)

XGBClassifier(alpha=0.3588508007814575, base_score=0.5, booster='gbtree',
              callbacks=None, colsample_bylevel=1, colsample_bynode=1,
              colsample_bytree=0.6, early_stopping_rounds=None,
              enable_categorical=True, eval_metric=None, gamma=0, gpu_id=0,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', lambda=8.212715002119015,
              learning_rate=0.042, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=7, max_leaves=0, min_child_weight=67,
              missing=nan, monotone_constraints='()', n_estimators=1000,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0, ...)

In [19]:
del X_train,X_test,y_train,y_test
_ = gc.collect()

In [20]:
def read_test_file(path = '', usecols = None):
    # LOAD DATAFRAME
    if usecols is not None: df = cudf.read_parquet(path, columns=usecols)
    else: df = cudf.read_parquet(path)
    # REDUCE DTYPE FOR CUSTOMER AND DATE
    #df['customer_ID'] = df['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    df.S_2 = cudf.to_datetime( df.S_2 )
    # SORT BY CUSTOMER AND DATE (so agg('last') works correctly)
    #df = df.sort_values(['customer_ID','S_2'])
    #df = df.reset_index(drop=True)
    # FILL NAN
    df = df.fillna(0) 
    print('shape of data:', df.shape)
    
    return df

print('Reading test data...')
TEST_PATH = '../input/amex-data-integer-dtypes-parquet-format/test.parquet'
test = read_test_file(path = TEST_PATH)

Reading test data...
shape of data: (11363762, 190)


In [21]:
test.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-02-19,0.631315,0,0.010728,0.814497,0.0,0.168651,0.0,0.002347,...,-1,-1,-1,-1,0,0.0,0.0,-1,0.008281,-1
1,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-03-25,0.587042,0,0.011026,0.810848,0.0,0.241389,0.0,0.009132,...,-1,-1,-1,0,0,0.0,0.0,0,0.003753,0
2,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-04-25,0.609056,0,0.01639,1.00462,0.0,0.266976,0.0,0.004192,...,-1,-1,-1,0,0,0.0,0.0,0,0.002156,0
3,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-05-20,0.614911,0,0.021672,0.816549,0.0,0.188947,0.0,0.015325,...,-1,-1,-1,0,0,0.0,0.0,0,0.005206,0
4,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-06-15,0.591673,8,0.015923,0.810456,0.0,0.180035,0.0,0.011281,...,-1,-1,-1,0,0,0.0,0.0,0,0.007421,0


In [22]:
test = process_and_feature_engineer(test)

shape after engineering (924621, 918)


In [23]:
test['prediction'] = final_model.predict_proba(test)[:,1]

In [24]:
final = pd.DataFrame(test['prediction'].to_pandas())

In [25]:
final.to_csv("submission.csv", index=True)