In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amexfeather/test_data_f32.ftr
/kaggle/input/amexfeather/train_data.ftr
/kaggle/input/amexfeather/train_data_f32.ftr
/kaggle/input/amexfeather/test_data.ftr
/kaggle/input/amex-default-prediction/sample_submission.csv
/kaggle/input/amex-default-prediction/train_data.csv
/kaggle/input/amex-default-prediction/test_data.csv
/kaggle/input/amex-default-prediction/train_labels.csv


In [2]:
#pip install lightgbm --install-option=--gpu

In [3]:
import pandas as pd
import numpy as np

from lightgbm import LGBMClassifier, early_stopping, log_evaluation

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import seaborn as sns
import matplotlib.pyplot as plt

import gc

In [4]:
class CONFIG:
    random_state = 1001
    kaggle = True
    path = '../input/amexfeather'
    local_path = ''

In [5]:
df_train = pd.read_feather(f'{CONFIG.path}/train_data.ftr')

In [6]:
df_train = df_train.drop(['S_2'], axis=1)
df_train = df_train.groupby('customer_ID')
df_train = df_train.tail(1)
df_train.set_index('customer_ID', inplace=True) 
#df_train.tail(2)

In [7]:
total_cols = df_train.columns.to_list()
cat_features = ['B_30', 'B_38', 'D_126', 'D_63', 'D_64']
num_features = [col for col in total_cols if col not in cat_features + ["target", "customer_ID", "S_2"] ]
len(num_features) + len(cat_features)

188

In [8]:
temp = df_train.isna().sum().mul(100).div(len(df_train)).sort_values(ascending=False)
missing_df = pd.DataFrame(temp).reset_index()

dropped_columns = missing_df[missing_df[0]>0.7]["index"].values
df_train.drop(columns = dropped_columns,axis=1, inplace=True)

In [9]:
from sklearn.impute import SimpleImputer
imputer=SimpleImputer(strategy="most_frequent")
transformed_df = pd.DataFrame(imputer.fit_transform(df_train[cat_features]),columns = cat_features)
df_train[cat_features] = transformed_df[cat_features]

In [10]:
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-999)
oe.fit(df_train[cat_features])
df_train_enc = oe.transform(df_train[cat_features])
df_train[cat_features]=df_train_enc

### **Competition Metric¶**

In [11]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

In [12]:
total_cols = df_train.columns.to_list()
cat_features = ['B_30', 'B_38', 'D_126', 'D_63', 'D_64']
num_features = [col for col in total_cols if col not in cat_features + ["target", "customer_ID", "S_2"] ]

In [13]:
x = df_train[cat_features + num_features]
y = df_train['target']

x.shape, y.shape

((458913, 124), (458913,))

In [14]:
features = cat_features + num_features

In [15]:
X_train, X_test, y_train, y_test = train_test_split(x,y,
                            test_size=0.3,random_state=CONFIG.random_state, 
                                                    stratify = y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((321239, 124), (137674, 124), (321239,), (137674,))

In [16]:
'''
import optuna
import optuna.integration.lightgbm as lgb
from sklearn.model_selection import RepeatedKFold

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

rkf = RepeatedKFold(n_splits = 3, n_repeats = 3, random_state=42)

fixed_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'dart',
    'force_row_wise' : True,
    'random_state' : CONFIG.random_state,
    'extra_trees' : True,
    'feature_pre_filter': False,
    'verbose' : -1,
    'n_estimators': 300,
    'early_stopping_round': 30
}

X = np.array(X_train[features])
y = np.array(y_train).flatten()

dtrain = lgb.Dataset(X, label = y, categorical_feature = 'auto')    

tuner = lgb.LightGBMTunerCV(
        fixed_params, dtrain, 
        verbose_eval = None,
        time_budget = 1000,
        folds = rkf,
        num_boost_round = 10,
        shuffle = True
)

tuner.run()
'''



In [17]:
#best_params = tuner.best_params

In [18]:
best_params={'objective': 'binary',
 'metric': 'auc',
 'boosting_type': 'dart',
 'force_row_wise': True,
 'random_state': 1001,
 'extra_trees': True,
 'feature_pre_filter': False,
 'verbose': -1,
 'n_estimators': 300,
 'early_stopping_round': 30,
 'lambda_l1': 0.0,
 'lambda_l2': 0.0,
 'num_leaves': 31,
 'feature_fraction': 0.8,
 'bagging_fraction': 1.0,
 'bagging_freq': 0,
 'min_child_samples': 20}

In [19]:
'''
search_params = { 
    'learning_rate' : 0.065,
    'lambda_l1': 0.9018017181896126,
    'lambda_l2': 0.06256451709708931,
    'num_leaves': 31,
    'feature_fraction': 0.5,
    'bagging_fraction': 1.0,
    'bagging_freq': 0,
    'min_child_samples': 20
}

fixed_params={
    'objective': 'binary',
     'metric': 'auc',
     'boosting_type': 'dart',
     'force_row_wise': True,
     'random_state': 1001,
     'extra_trees': True,
     'feature_pre_filter': False,
     'verbose': -1,
     'n_estimators': 300,
     'early_stopping_round': 30
}
'''

"\nsearch_params = { \n    'learning_rate' : 0.065,\n    'lambda_l1': 0.9018017181896126,\n    'lambda_l2': 0.06256451709708931,\n    'num_leaves': 31,\n    'feature_fraction': 0.5,\n    'bagging_fraction': 1.0,\n    'bagging_freq': 0,\n    'min_child_samples': 20\n}\n\nfixed_params={\n    'objective': 'binary',\n     'metric': 'auc',\n     'boosting_type': 'dart',\n     'force_row_wise': True,\n     'random_state': 1001,\n     'extra_trees': True,\n     'feature_pre_filter': False,\n     'verbose': -1,\n     'n_estimators': 300,\n     'early_stopping_round': 30\n}\n"

In [20]:
model = LGBMClassifier(**best_params)#(**fixed_params, **search_params)

In [21]:
model.fit(
    X_train, y_train, 
    eval_set=[(X_test,y_test)],
    callbacks=[log_evaluation(100)]
)





[100]	valid_0's auc: 0.953823
[200]	valid_0's auc: 0.955498
[300]	valid_0's auc: 0.956111


LGBMClassifier(bagging_fraction=1.0, bagging_freq=0, boosting_type='dart',
               early_stopping_round=30, extra_trees=True, feature_fraction=0.8,
               feature_pre_filter=False, force_row_wise=True, lambda_l1=0.0,
               lambda_l2=0.0, metric='auc', n_estimators=300,
               objective='binary', random_state=1001, verbose=-1)

In [22]:
del X_train, X_test, y_train, y_test, df_train, x, y, missing_df, transformed_df, df_train_enc
_ = gc.collect()

In [23]:
test = pd.read_feather('/kaggle/input/amexfeather/test_data.ftr')
test = test.groupby('customer_ID')
test = test.tail(1)
test.set_index('customer_ID', inplace=True)

test.drop(['S_2'], axis=1, inplace=True)
test.drop(columns = dropped_columns,axis=1, inplace=True)

In [24]:
_ = gc.collect()

In [25]:
#imputer 
transformed_test = pd.DataFrame(imputer.transform(test[cat_features]),columns = cat_features)
test[cat_features] = transformed_test[cat_features]

In [26]:
_ = gc.collect()

In [27]:
#encodeing
df_test_enc = oe.transform(test[cat_features])
test[cat_features]=df_test_enc

In [28]:
del df_test_enc, transformed_test
_ = gc.collect()

In [29]:
#test.set_index('customer_ID', inplace=True)
test["prediction"] = model.predict_proba(test[cat_features + num_features])[:,1]
test.head()
test["prediction"].to_csv("submission.csv", index=True)