In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amex-default-prediction/sample_submission.csv
/kaggle/input/amex-default-prediction/train_data.csv
/kaggle/input/amex-default-prediction/test_data.csv
/kaggle/input/amex-default-prediction/train_labels.csv


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sn

import optuna
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
import sklearn.metrics


import cupy, cudf # GPU libraries

import gc, os
import warnings
warnings.filterwarnings('ignore')

In [None]:
def read_train_file(path = '', usecols = None):
    """Load the training parquet file into a cudf DataFrame and normalise it.

    Parameters
    ----------
    path : str
        Location of the parquet file.
    usecols : list or None
        Columns to read; ``None`` reads every column.

    Returns
    -------
    cudf.DataFrame
        Frame with ``customer_ID`` reduced to an int64 built from the last
        16 hex characters of the original ID, ``S_2`` parsed to datetime and
        every missing value replaced by 0.
    """
    # Only restrict the read when the caller supplied a column list.
    read_kwargs = {} if usecols is None else {'columns': usecols}
    df = cudf.read_parquet(path, **read_kwargs)

    # Shrink the ID column: keep the last 16 hex characters and store them as int64.
    customer_hex = df['customer_ID'].str[-16:]
    df['customer_ID'] = customer_hex.str.hex_to_int().astype('int64')
    df['S_2'] = cudf.to_datetime(df['S_2'])

    # No sort by (customer_ID, S_2) is done here, so a later agg('last')
    # depends on the row order of the file itself.
    df = df.fillna(0)
    print('shape of data:', df.shape)

    return df

# Load the complete training set (all columns) through read_train_file.
print('Reading train data...')
TRAIN_PATH = '../input/amex-data-integer-dtypes-parquet-format/train.parquet'
train = read_train_file(path = TRAIN_PATH)

In [None]:
def process_and_feature_engineer(df):
    """Collapse row-level data into one row of aggregate features per customer.

    Every column except ``customer_ID`` and ``S_2`` is aggregated per
    ``customer_ID``: numeric columns with mean/std/min/max/last, the eleven
    categorical columns listed below with count/last/nunique. Output columns
    are named ``<column>_<statistic>``.

    Feature engineering taken from
    https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created

    Parameters
    ----------
    df : DataFrame
        Row-level data containing ``customer_ID``, ``S_2`` and the feature
        columns. 'last' follows the row order of ``df``; no sorting happens here.

    Returns
    -------
    DataFrame
        Indexed by ``customer_ID``; numeric aggregates first, then categorical.
    """
    cat_features = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
    not_numeric = {'customer_ID', 'S_2', *cat_features}
    num_features = [name for name in df.columns if name not in not_numeric]

    def _aggregate(columns, stats):
        # Group per customer, apply the statistics and flatten the
        # (column, statistic) pairs into 'column_statistic' names.
        agg = df.groupby("customer_ID")[columns].agg(stats)
        agg.columns = ['_'.join(parts) for parts in agg.columns]
        return agg

    num_agg = _aggregate(num_features, ['mean', 'std', 'min', 'max', 'last'])
    cat_agg = _aggregate(cat_features, ['count', 'last', 'nunique'])

    features = cudf.concat([num_agg, cat_agg], axis=1)
    # Drop the intermediates right away to release their memory.
    del num_agg, cat_agg
    print('shape after engineering', features.shape )

    return features

# Replace the row-level frame with the per-customer aggregate features.
# Rebinding `train` means this cell cannot be re-run without reloading the data.
train = process_and_feature_engineer(train)

In [None]:
# Load the labels and key them by the same int64 customer_ID encoding as `train`.
targets = cudf.read_csv('../input/amex-default-prediction/train_labels.csv')
targets['customer_ID'] = (
    targets['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
)
targets = targets.set_index('customer_ID')

# Index-on-index left join: every aggregated customer row picks up its label.
train = train.merge(targets, how='left', left_index=True, right_index=True)
train['target'] = train['target'].astype('int8')
del targets

# NEEDED TO MAKE CV DETERMINISTIC (cudf merge above randomly shuffles rows)
train = train.sort_index().reset_index()

# FEATURES: everything between the first column (the reset index) and the
# last column (the merged target).
FEATURES = train.columns[1:-1]
print(f'There are {len(FEATURES)} features!')

In [None]:
# Move the aggregated frame from cudf to pandas and drop the cudf copy.
train_pd = train.to_pandas()
del train
_ = gc.collect()

# Stratified 75/25 hold-out split. The seed is fixed (same value as the
# XGBoost `random_state` used during tuning) so that the split, and therefore
# every tuning score computed on it, is reproducible across runs.
train_df, test_df = train_test_split(
    train_pd,
    test_size=0.25,
    stratify=train_pd['target'],
    random_state=99,
)
del train_pd
_ = gc.collect()

In [None]:
# Separate predictors from the label; customer_ID and target are not used as features.
X_train, y_train = train_df.drop(columns=['customer_ID', 'target']), train_df['target']
X_test, y_test = test_df.drop(columns=['customer_ID', 'target']), test_df['target']

In [None]:
# The split frames are no longer needed once X/y have been extracted.
del train_df, test_df
_ = gc.collect()

In [None]:
def objective(trial):
    """Optuna objective: train one XGBoost model and score it on the hold-out set.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. The search space and parameter names are unchanged, so
    ``study.best_trial.params`` keeps the same keys.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Accuracy of the hard class predictions on ``X_test`` (maximised by the study).
    """
    param = {
        # Fixed settings (not tuned, hence absent from trial.params).
        'booster': 'gbtree',
        'tree_method': 'gpu_hist',
        'objective': 'binary:logistic',
        'random_state': 99,
        # Regularisation strengths, sampled on a log scale.
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1, step=0.1),
        'subsample': trial.suggest_float('subsample', 0.5, 1, step=0.1),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.05, step=0.001),
        # `step` is passed by keyword; positional use is deprecated in recent Optuna.
        'n_estimators': trial.suggest_int('n_estimators', 80, 1000, step=10),
        'max_depth': trial.suggest_int('max_depth', 2, 10, step=1),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 256, step=1),
    }

    model = XGBClassifier(**param, enable_categorical = True)
    model.fit(X_train, y_train)

    # accuracy_score compares the two sequences positionally, so the Series
    # and the prediction array can be passed directly.
    # NOTE(review): this is plain accuracy on 0/1 predictions; if the classes
    # are imbalanced a probability-based metric may be a better target — confirm.
    preds = model.predict(X_test)
    return sklearn.metrics.accuracy_score(y_test, preds)

In [None]:
%%time
# TPE is Optuna's default sampler; creating it explicitly only adds a seed so
# the sequence of sampled hyperparameters is repeatable.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=99),
)
study.optimize(objective, n_trials= 100)

In [None]:
# Start from the tuned values of the best trial (copied so the study's own
# dict is not modified).
best_params = dict(study.best_trial.params)

# Previously recorded parameter set, kept for reference only:
#   alpha=0.5420861968603762, lambda=0.6898903764598321, colsample_bytree=0.7,
#   learning_rate=0.030000000000000002, max_depth=9, min_child_weight=72,
#   n_estimators=1000

# trial.params holds only the *sampled* values. Re-apply the fixed settings
# that `objective` used so the final model matches the tuned configuration.
best_params['tree_method'] = 'gpu_hist'
best_params['booster'] = 'gbtree'
best_params['objective'] = 'binary:logistic'
best_params['random_state'] = 99

In [None]:
# Fit the final XGBoost model on the training split with the selected parameters.
final_model = XGBClassifier(**best_params,enable_categorical = True)
final_model.fit(X_train,y_train)

In [None]:
# Release the XGBoost split; the LGBM section below creates its own X/y under the same names.
del X_train,X_test,y_train,y_test
_ = gc.collect()

**LGBM**

In [None]:
class CONFIG:
    """Shared settings for the LightGBM part of the notebook."""

    # Directory holding the feather-format data files.
    path: str = '../input/amexfeather'
    # Alternative data location; empty and not referenced in the visible cells.
    local_path: str = ''
    # Environment flag; not referenced in the visible cells.
    kaggle: bool = True
    # Seed passed to train_test_split for the LightGBM hold-out split.
    random_state: int = 1001

In [None]:
# Keep only the last row of each customer, indexed by customer_ID.
# "Last" follows the row order of the file, since S_2 is dropped before grouping.
df_train = (
    pd.read_feather(f'{CONFIG.path}/train_data.ftr')
    .drop(columns=['S_2'])
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID')
)

In [None]:
# Split the columns into the categorical ones (imputed and encoded below) and
# the remaining numeric ones; label and bookkeeping columns are excluded.
total_cols = list(df_train.columns)
cat_features = ['B_30', 'B_38', 'D_126', 'D_63', 'D_64']
num_features = [
    col
    for col in total_cols
    if col not in cat_features and col not in ("target", "customer_ID", "S_2")
]
# Total number of model inputs.
len(cat_features) + len(num_features)

In [None]:
from sklearn.impute import SimpleImputer

# Fill missing categorical values with each column's most frequent value.
imputer=SimpleImputer(strategy="most_frequent")

# fit_transform returns a bare array. The frame built from it must carry
# df_train's own index: assigning a DataFrame aligns on index labels, and a
# default 0..n-1 index would not match the customer_ID index, turning every
# categorical value into NaN.
transformed_df = pd.DataFrame(
    imputer.fit_transform(df_train[cat_features]),
    columns=cat_features,
    index=df_train.index,
)
df_train[cat_features] = transformed_df[cat_features]

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Integer-encode the categorical columns; values not seen while fitting are
# mapped to -999 at transform time.
oe = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-999)
df_train_enc = oe.fit(df_train[cat_features]).transform(df_train[cat_features])
# The encoder returns an array, so this assignment is positional.
df_train[cat_features] = df_train_enc

In [None]:
# Recompute the feature lists after imputation/encoding. Those steps neither
# add nor remove columns, so this repeats the earlier column bookkeeping.
total_cols = list(df_train.columns)
cat_features = ['B_30', 'B_38', 'D_126', 'D_63', 'D_64']
num_features = [
    col
    for col in total_cols
    if col not in cat_features and col not in ("target", "customer_ID", "S_2")
]

In [None]:
# Model inputs (categorical columns first, then numeric) and the label.
x = df_train[cat_features + num_features]
y = df_train['target']

# Displayed as a quick shape check.
x.shape, y.shape

In [None]:
# Seeded, stratified 70/30 hold-out split for the LightGBM model.
X_train, X_test, y_train, y_test = train_test_split(x,y,
                            test_size=0.3,random_state=CONFIG.random_state, 
                                                    stratify = y)
# Displayed as a quick shape check.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Disabled experiment: the whole cell is one string literal, so none of the
# code inside it runs. The text sets up a LightGBMTunerCV search over `fixed_params`.
# NOTE(review): the text indexes `X_train[features]`, but no visible cell
# defines `features` — define it before re-enabling this code.
'''
import optuna
import optuna.integration.lightgbm as lgb
from sklearn.model_selection import RepeatedKFold

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

rkf = RepeatedKFold(n_splits = 3, n_repeats = 3, random_state=42)

fixed_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'dart',
    'force_row_wise' : True,
    'random_state' : CONFIG.random_state,
    'extra_trees' : True,
    'feature_pre_filter': False,
    'verbose' : -1,
    'n_estimators': 300,
    'early_stopping_round': 30
}

X = np.array(X_train[features])
y = np.array(y_train).flatten()

dtrain = lgb.Dataset(X, label = y, categorical_feature = 'auto')    

tuner = lgb.LightGBMTunerCV(
        fixed_params, dtrain, 
        verbose_eval = None,
        time_budget = 1000,
        folds = rkf,
        num_boost_round = 10,
        shuffle = True
)

tuner.run()
'''

In [None]:
# Hard-coded LightGBM parameters (the tuner call that would produce them is
# disabled: #best_params = tuner.best_params).
# This rebinds `best_params`, which earlier held the XGBoost parameters.
best_params = dict(
    # fixed settings
    objective='binary',
    metric='auc',
    boosting_type='dart',
    force_row_wise=True,
    random_state=1001,
    extra_trees=True,
    feature_pre_filter=False,
    verbose=-1,
    n_estimators=300,
    early_stopping_round=30,
    # regularisation / tree-shape / sampling settings
    lambda_l1=0.0,
    lambda_l2=0.0,
    num_leaves=31,
    feature_fraction=0.8,
    bagging_fraction=1.0,
    bagging_freq=0,
    min_child_samples=20,
)

In [None]:
# Disabled alternative parameter sets: the cell is a single string literal,
# so `search_params` and `fixed_params` below are never actually defined.
'''
search_params = { 
    'learning_rate' : 0.065,
    'lambda_l1': 0.9018017181896126,
    'lambda_l2': 0.06256451709708931,
    'num_leaves': 31,
    'feature_fraction': 0.5,
    'bagging_fraction': 1.0,
    'bagging_freq': 0,
    'min_child_samples': 20
}

fixed_params={
    'objective': 'binary',
     'metric': 'auc',
     'boosting_type': 'dart',
     'force_row_wise': True,
     'random_state': 1001,
     'extra_trees': True,
     'feature_pre_filter': False,
     'verbose': -1,
     'n_estimators': 300,
     'early_stopping_round': 30
}
'''

In [None]:
# Fit LightGBM on the training split, evaluating on the hold-out split and
# logging the evaluation metric every 100 iterations.
# NOTE(review): best_params combines boosting_type 'dart' with
# early_stopping_round — confirm early stopping actually takes effect in this mode.
model = LGBMClassifier(**best_params)
model.fit(
    X_train, y_train, 
    eval_set=[(X_test,y_test)],
    callbacks=[log_evaluation(100)]
)

In [None]:
# Free the LightGBM training artefacts. Names are removed with
# globals().pop(name, None) instead of `del`, because the original list
# includes `missing_df`, which no visible cell defines: a plain `del` raises
# NameError on a fresh Restart & Run All and aborts the notebook.
for _name in ('X_train', 'X_test', 'y_train', 'y_test', 'df_train', 'x', 'y',
              'missing_df', 'transformed_df', 'df_train_enc'):
    globals().pop(_name, None)
del _name
_ = gc.collect()

**Importing the test data and ensembling**

In [None]:
def read_test_file(path = '', usecols = None):
    """Load the test parquet file into a cudf DataFrame and normalise it.

    Unlike the training loader, ``customer_ID`` is left untouched here.

    Parameters
    ----------
    path : str
        Location of the parquet file.
    usecols : list or None
        Columns to read; ``None`` reads every column.

    Returns
    -------
    cudf.DataFrame
        Frame with ``S_2`` parsed to datetime and every missing value replaced by 0.
    """
    # Only restrict the read when the caller supplied a column list.
    read_kwargs = {} if usecols is None else {'columns': usecols}
    df = cudf.read_parquet(path, **read_kwargs)

    df['S_2'] = cudf.to_datetime(df['S_2'])

    # No sort by (customer_ID, S_2) is done here, so a later agg('last')
    # depends on the row order of the file itself.
    df = df.fillna(0)
    print('shape of data:', df.shape)

    return df

# Load the complete test set (all columns) for the XGBoost pipeline.
print('Reading test data...')
TEST_PATH = '../input/amex-data-integer-dtypes-parquet-format/test.parquet'
test_XBG = read_test_file(path = TEST_PATH)


# Test data for LightGBM: the last row of each customer, indexed by
# customer_ID, with the date column removed — the same shaping as df_train.
#
# The former `drop(columns=dropped_columns)` step is removed: no cell defines
# `dropped_columns` (NameError on a fresh run), the training frame never had
# columns dropped, and prediction selects `cat_features + num_features`
# explicitly, so any extra columns are ignored anyway.
test_LGBM = (
    pd.read_feather('/kaggle/input/amexfeather/test_data.ftr')
    .groupby('customer_ID')
    .tail(1)
    .set_index('customer_ID')
    .drop(columns=['S_2'])
)

In [None]:
# XGBoost input: per-customer aggregate features, built exactly as for training.
test_XBG = process_and_feature_engineer(test_XBG)

# LightGBM input: apply the already-fitted imputer and encoder.
# The imputed frame must carry test_LGBM's own index; assigning a DataFrame
# aligns on index labels, and a default 0..n-1 index would not match the
# customer_ID index, turning every categorical value into NaN.
transformed_test = pd.DataFrame(
    imputer.transform(test_LGBM[cat_features]),
    columns=cat_features,
    index=test_LGBM.index,
)
test_LGBM[cat_features] = transformed_test[cat_features]
df_test_enc = oe.transform(test_LGBM[cat_features])
# The encoder returns an array, so this assignment is positional.
test_LGBM[cat_features]=df_test_enc

del df_test_enc, transformed_test

In [None]:
# Score the test customers with both models. The right-hand side is evaluated
# before the 'prediction' column exists, so it is never fed in as a feature.
test_XBG['prediction'] = final_model.predict_proba(test_XBG)[:,1]
test_LGBM["prediction"] = model.predict_proba(test_LGBM[cat_features + num_features])[:,1]

# Build the ensemble in pandas. Double brackets keep a DataFrame: with a
# Series, adding a second column and taking a row-wise mean is not possible.
df = test_XBG[['prediction']].to_pandas()
# Align the LightGBM scores to the XGBoost rows by customer_ID label.
df["prediction1"] = test_LGBM["prediction"].reindex(df.index)
# mean(axis=1) skips NaN, so an ID mismatch would silently fall back to a
# single model; fail loudly instead.
assert df["prediction1"].notna().all(), "LGBM predictions missing for some customer_IDs"

##Ensemble
df['mean'] = df[['prediction', 'prediction1']].mean(axis=1)

# Name the output so the written CSV header reads customer_ID,prediction
# (the column layout of sample_submission.csv — TODO confirm).
final = (
    df[['mean']]
    .rename(columns={'mean': 'prediction'})
    .rename_axis('customer_ID')
)


In [None]:
# Write the ensemble to disk; index=True emits the frame's index as the first CSV column.
final.to_csv("submission.csv", index=True)