In [None]:
# import libraries
import numpy as np
import pandas as pd
import cupy, cudf # GPU libraries
import matplotlib.pyplot as plt
import gc

In [None]:
# Read the training parquet file into a cudf DataFrame
train_parquet = "../input/amex-data-integer-dtypes-parquet-format/train.parquet"
train = cudf.read_parquet(train_parquet)
train.shape

In [None]:
# REDUCE DTYPE FOR CUSTOMER AND DATE
# The customer key is replaced by the int64 value of its last 16 hex characters,
# and the statement date S_2 is parsed into a datetime column.
customer_hex_tail = train['customer_ID'].str[-16:]
train['customer_ID'] = customer_hex_tail.str.hex_to_int().astype('int64')
train['S_2'] = cudf.to_datetime(train['S_2'])

In [None]:
# check categorical variables
cat_features = [
    "B_30", "B_38",
    "D_114", "D_116", "D_117", "D_120", "D_126",
    "D_63", "D_64", "D_66", "D_68",
]

# list the distinct values found in each categorical column
for col in cat_features:
    print(col, train[col].unique())

In [None]:
# data preprocessing
def preprocessing(df,training_data=False):
    """Reduce a frame to one model-ready row per customer.

    Steps, in order:
      1. keep only the last row of each ``customer_ID`` (done in place, so the
         frame passed in by the caller is modified);
      2. when ``training_data`` is true, left-join the labels read from
         ``train_labels.csv`` on ``customer_ID`` and then drop ``customer_ID``;
      3. replace the columns named in the module-level ``cat_features`` list
         with their dummy (one-hot) columns;
      4. drop ``S_2`` and fill the remaining nulls with -127.

    Parameters
    ----------
    df : cudf.DataFrame
        Must contain ``customer_ID``, ``S_2`` and every column listed in
        ``cat_features``. For training data ``customer_ID`` has to be encoded
        the same way the labels are encoded below (int64 value of the last 16
        hex characters), otherwise the join will not match.
    training_data : bool, default False
        If true, attach the ``target`` column and remove ``customer_ID``.
        If false, ``customer_ID`` is kept in the returned frame.

    Returns
    -------
    cudf.DataFrame
        The preprocessed frame.
    """
    # keep only the last record for each customer
    df.drop_duplicates('customer_ID',keep='last',inplace=True)

    if training_data:
        labels = cudf.read_csv("../input/amex-default-prediction/train_labels.csv")
        # encode the label key exactly like the feature key so the join matches
        labels['customer_ID'] = labels['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
        # merge labels and train datasets to get the labelled data for training
        df = cudf.merge(df,labels,on="customer_ID",how='left')
        # `columns=` instead of a positional axis: the positional form is
        # deprecated/removed in the pandas API that cudf mirrors
        df.drop(columns=['customer_ID'],inplace=True)
        del labels
        gc.collect()

    # convert into dummies
    dummies = cudf.get_dummies(df[cat_features])

    # drop categorical variables
    print("dropping cat features")
    df.drop(columns=cat_features,inplace=True)
    # concat dummy variables with the remaining columns
    df = cudf.concat([df, dummies], axis=1)

    df.drop(columns=['S_2'],inplace=True)

    # -127 is used as the missing-value marker
    df = df.fillna(-127)

    del dummies
    gc.collect()
    return df

In [None]:
# perform preprocessing
train = preprocessing(train,training_data=True)
# pull the labels out before removing them from the feature frame
x_target = train.target.values
# `columns=` instead of a positional axis argument (deprecated/removed in the
# pandas API that cudf mirrors)
train.drop(columns=['target'],inplace = True)

In [None]:
# Convert the preprocessed cudf frame into a pandas DataFrame.
train = train.to_pandas() # free GPU memory
gc.collect()

In [None]:
# Import the PCA module
from sklearn.decomposition import PCA
# No n_components is passed, so the number of components is left to the
# library default; random_state fixes the seed of the randomized solver.
pca = PCA(svd_solver='randomized', random_state=42)

In [None]:
# Fit the exploratory PCA on the preprocessed training features
pca.fit(train)

In [None]:
# Show the fitted principal components. Since n_components was not set, there
# should be as many components as input variables (assuming more rows than
# columns -- TODO confirm).
pca.components_

In [None]:
#Plotting the scree plot
#Making the screeplot - plotting the cumulative variance against the number of components
%matplotlib inline
fig = plt.figure(figsize = (12,8))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()

In [None]:
# Use IncrementalPCA for efficiency - it saves a lot of time on larger datasets
from sklearn.decomposition import IncrementalPCA
# Keep 30 components (presumably chosen from the scree plot -- TODO confirm)
pca_final = IncrementalPCA(n_components=30)

In [None]:
# Fit the incremental PCA on the training features and store the projected
# components in a cudf DataFrame, then attach the labels as a 'target' column
df_pca = cudf.DataFrame(pca_final.fit_transform(train))
df_pca['target'] = x_target
df_pca.shape

In [None]:
# free memory: the raw features and the label array now live in df_pca
del train,x_target
gc.collect()

In [None]:
# FEATURES
# df_pca also carries the 'target' label column. It must not be listed as a
# model input: that would leak the label into training and give the model one
# more feature than the test matrix has.
FEATURES = [col for col in df_pca.columns if col != 'target']
print(f'There are {len(FEATURES)} features!')

In [None]:
# LOAD XGB LIBRARY
from sklearn.model_selection import KFold
import xgboost as xgb
print('XGB Version',xgb.__version__)

# XGB MODEL PARAMETERS
# Binary classifier trained with the GPU histogram tree method and scored
# with log-loss.
xgb_parms = dict(
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.6,
    eval_metric='logloss',
    objective='binary:logistic',
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    random_state=42,
)

In [None]:
# NEEDED WITH DeviceQuantileDMatrix BELOW
class IterLoadForDMatrix(xgb.core.DataIter):
    """Batch iterator that hands a dataframe to XGBoost one chunk at a time.

    Every call to ``next`` takes up to ``batch_size`` rows of ``df`` by
    position, wraps them in a cudf DataFrame and passes the ``features``
    columns as data and the ``target`` column as label to the callback
    supplied by XGBoost.
    """

    def __init__(self, df=None, features=None, target=None, batch_size=256*1024):
        self.df = df
        self.features = features
        self.target = target
        self.batch_size = batch_size
        self.it = 0  # index of the next batch to hand out
        # number of batches needed to cover every row (the last may be short)
        self.batches = int(np.ceil(len(df) / self.batch_size))
        super().__init__()

    def reset(self):
        """Rewind the iterator to the first batch."""
        self.it = 0

    def next(self, input_data):
        """Feed the next batch to ``input_data``; return 1, or 0 when exhausted."""
        if self.it == self.batches:
            return 0  # no batch left

        start = self.it * self.batch_size
        stop = min(start + self.batch_size, len(self.df))
        chunk = cudf.DataFrame(self.df.iloc[start:stop])
        input_data(data=chunk[self.features], label=chunk[self.target])
        self.it += 1
        return 1

In [None]:
def amex_metric_mod(y_true, y_pred):
    """Return the mean of the normalized weighted Gini and the top-4% capture rate.

    Rows whose true label is 0 get weight 20, all other rows weight 1.

    * Capture rate: rows are ranked by prediction (descending); the rows whose
      cumulative weight stays within 4% of the total weight are kept, and the
      sum of their labels is divided by the sum of all labels.
    * Normalized Gini: the weighted Gini of the prediction ranking divided by
      the weighted Gini of the ranking by true label.

    Parameters
    ----------
    y_true, y_pred : array-like, equal length
        True labels and predicted scores.

    Returns
    -------
    float
        ``0.5 * (gini_pred / gini_true + capture_rate)``.
    """
    pairs = np.array([y_true, y_pred]).T  # column 0: label, column 1: score

    # ---- top 4% capture rate ----
    by_score = pairs[pairs[:, 1].argsort()[::-1]]
    score_weights = np.where(by_score[:, 0] == 0, 20, 1)
    cutoff = int(0.04 * np.sum(score_weights))
    captured = by_score[np.cumsum(score_weights) <= cutoff]
    top_four = np.sum(captured[:, 0]) / np.sum(by_score[:, 0])

    # ---- weighted Gini for a ranking by the given column ----
    def _weighted_gini(sort_col):
        ranked = pairs[pairs[:, sort_col].argsort()[::-1]]
        weight = np.where(ranked[:, 0] == 0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos = np.sum(ranked[:, 0] * weight)
        cum_pos_found = np.cumsum(ranked[:, 0] * weight)
        lorentz = cum_pos_found / total_pos
        return np.sum((lorentz - weight_random) * weight)

    gini_pred = _weighted_gini(1)  # ranking by prediction
    gini_true = _weighted_gini(0)  # ranking by true label (normalizer)

    return 0.5 * (gini_pred / gini_true + top_four)

In [None]:
# VERSION NAME FOR SAVED MODEL FILES
VER = 1

# 5-fold shuffled split with a fixed seed.
# NOTE: despite the name `skf`, this is a plain KFold, not a stratified one.
skf = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# build and save models
# One booster is trained per split produced by `skf` and saved to its own file.
banner = '#' * 25

for fold, (train_idx, valid_idx) in enumerate(skf.split(df_pca, df_pca.target)):
    print(banner)
    print('### Fold', fold + 1)
    print('### Train size', len(train_idx), 'Valid size', len(valid_idx))
    print(banner)

    # TRAIN DATA FOR FOLD K (fed to XGBoost in batches through the iterator)
    Xy_train = IterLoadForDMatrix(df_pca.loc[train_idx], FEATURES, 'target')
    dtrain = xgb.DeviceQuantileDMatrix(Xy_train, max_bin=256)

    # VALID DATA FOR FOLD K
    X_valid = df_pca.loc[valid_idx, FEATURES]
    y_valid = df_pca.loc[valid_idx, 'target']
    dvalid = xgb.DMatrix(data=X_valid, label=y_valid)

    # TRAIN MODEL FOLD K
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
    model = xgb.train(
        xgb_parms,
        dtrain=dtrain,
        evals=watchlist,
        num_boost_round=9999,
        early_stopping_rounds=100,
        verbose_eval=100,
    )
    model.save_model(f'XGB_v{VER}_fold{fold}.xgb')

In [None]:
# CLEAN RAM: drop the training components now that the fold models are saved
del df_pca
_ = gc.collect()

In [None]:
# load test data
test = cudf.read_parquet("../input/amex-data-integer-dtypes-parquet-format/test.parquet")
test = preprocessing(test,training_data=False)

test = test.to_pandas()
# keep the customer ids for the submission, then remove them from the features
test_customers = test.customer_ID
# `columns=` instead of a positional axis argument (removed in pandas 2.0)
test.drop(columns=['customer_ID'],inplace=True)

# One-hot encoding can yield a different column set/order on test than on
# train. When the fitted PCA recorded its input column names, align to them:
# dummy columns missing from test are added as all-zero, extras are dropped.
train_columns = getattr(pca_final, 'feature_names_in_', None)
if train_columns is not None:
    test = test.reindex(columns=train_columns, fill_value=0)

# Project with the PCA fitted on the training data. Re-fitting here
# (fit_transform) would put the test components in a different basis than the
# one the models were trained on.
test = pd.DataFrame(pca_final.transform(test))

In [None]:
# make predictions
FOLDS = 5
test_preds = []

dtest = xgb.DMatrix(data=test)
del test
gc.collect()

# INFER XGB MODELS ON TEST DATA
# The prediction is the plain average over the saved fold models.
fold_model_files = [f'XGB_v{VER}_fold{k}.xgb' for k in range(FOLDS)]

model = xgb.Booster()
model.load_model(fold_model_files[0])
preds = model.predict(dtest)  # the first fold starts the running sum
for model_file in fold_model_files[1:]:
    model.load_model(model_file)
    preds += model.predict(dtest)
preds /= FOLDS  # running sum -> mean
test_preds.append(preds)

# CLEAN MEMORY
del dtest, model
_ = gc.collect()

In [None]:
# Write the submission file: one averaged prediction per test customer
test_preds = np.concatenate(test_preds)
sub = pd.DataFrame({'customer_ID': test_customers, 'prediction': test_preds})
sub.to_csv('submission.csv',index=False)