In [38]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, RocCurveDisplay, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import random
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amexfeather/test_data_f32.ftr
/kaggle/input/amexfeather/train_data.ftr
/kaggle/input/amexfeather/train_data_f32.ftr
/kaggle/input/amexfeather/test_data.ftr


In [39]:
from imblearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline as mp
# importing FeatureUnion for combining transformers
from sklearn.pipeline import FeatureUnion

In [40]:
from sklearn.impute import SimpleImputer
from sklearn.impute import MissingIndicator
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.decomposition import PCA

In [41]:
# importing classifiers to try with
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn import metrics

In [7]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

In [8]:
import gc
from sklearn.model_selection import train_test_split

In [9]:
train_df_ = pd.read_feather('../input/amexfeather/train_data.ftr')


In [10]:
train_df_.dtypes

customer_ID            object
S_2            datetime64[ns]
P_2                   float16
D_39                  float16
B_1                   float16
                    ...      
D_142                 float16
D_143                 float16
D_144                 float16
D_145                 float16
target                  int64
Length: 191, dtype: object

In [11]:
train_df_.shape

(5531451, 191)

# **Feature Engineering**

In [12]:
train_df = (train_df_.groupby('customer_ID')
            .tail(1).set_index('customer_ID',drop=True)
            .sort_index()
            .drop(['S_2'],axis='columns')
           )

In [13]:
del train_df_
gc.collect()

23

# Drop columns with more 50% missing values.

In [14]:
to_drop = [column for column in train_df.columns if (train_df[column].isnull().sum()/len(train_df[column])*100) >=50]
train_df = train_df.drop(columns = to_drop)
print('number of columns with >= 50% missing value = ', len(to_drop))

number of columns with >= 50% missing value =  29


In [15]:
to_drop

['D_42',
 'D_49',
 'D_50',
 'D_53',
 'D_56',
 'B_17',
 'D_66',
 'D_73',
 'D_76',
 'R_9',
 'D_82',
 'B_29',
 'D_87',
 'D_88',
 'D_105',
 'D_106',
 'R_26',
 'D_108',
 'D_110',
 'D_111',
 'B_39',
 'B_42',
 'D_132',
 'D_134',
 'D_135',
 'D_136',
 'D_137',
 'D_138',
 'D_142']

# Remove correlated values

In [16]:
train_df_wo_target = train_df.drop(["target"],axis=1)

cor_matrix = train_df_wo_target.corr()
col_core = set()

for i in range(len(cor_matrix.columns)):
    for j in range(i):
        if(cor_matrix.iloc[i, j] > 0.9):
            col_name = cor_matrix.columns[i]
            col_core.add(col_name)
col_core

{'B_11',
 'B_13',
 'B_15',
 'B_23',
 'B_33',
 'B_37',
 'D_104',
 'D_119',
 'D_141',
 'D_143',
 'D_74',
 'D_75',
 'D_77',
 'S_24',
 'S_7'}

In [13]:
train_df = train_df.drop(col_core, axis=1)

In [17]:
all_cols = train_df.columns.to_list()
cat_cols = train_df.select_dtypes("category").columns.tolist()
num_cols = train_df.select_dtypes(include =['float16','int64']).columns.tolist()
num_cols=list(set(train_df[num_cols]) - {'target'})

In [18]:

x = train_df[cat_cols+num_cols]
y = train_df['target']
print(x.shape,y.shape)

(458913, 159) (458913,)


# Categorical pipeline to replace missing values with most frequent value, encode categorical variables with ordinal encorder and to do standard scaling.

In [19]:
# define categorical pipeline
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent', missing_values=np.nan)),
    ('encoder', OrdinalEncoder())
#     ('scaler', StandardScaler())
])
print(cat_pipe)

Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OrdinalEncoder())])


# Numerical pipeline to replace missing values with median value and to do standard scaling.

In [20]:
# define numerical pipeline
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median', missing_values=np.nan))
#   ('scaler', StandardScaler())
])
print(num_pipe)

Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))])


In [22]:
preprocess = ColumnTransformer([
    ('cat', cat_pipe, cat_cols),
    ('num', num_pipe, num_cols)
])

In [23]:
## Split the train set 
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(x, y, test_size=0.2,random_state=20)
print(xtrain.shape,'\n',xtest.shape)

(367130, 159) 
 (91783, 159)


In [23]:
xtrain.head()

Unnamed: 0_level_0,D_63,D_64,D_68,B_30,B_38,D_114,D_116,D_117,D_120,D_126,...,D_51,D_83,R_24,B_10,D_139,D_128,B_21,S_23,D_93,D_96
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ae1310f501ec09d5ec57e60ff9887cfd760a6ffc104e3320cd1d3c3de5e8d370,CR,O,6.0,0.0,2.0,1.0,0.0,-1.0,0.0,1.0,...,1.0,0.00914,0.008026,0.278076,0.003878,1.006836,0.005066,0.138916,0.007874,0.003441
b074d9521f89e2239ad0b4abc6f016b1f9cfe11bb4326a2be70ac751bba4aced,CO,R,6.0,0.0,2.0,1.0,0.0,-1.0,0.0,1.0,...,0.336182,0.006077,0.003904,0.294434,0.009117,1.007812,0.009628,0.136719,0.000365,0.004093
bcedf353a230cdf47c8e027f453d13b3c44a62dd8c8d83ff512056192db6f1d4,CO,R,4.0,0.0,3.0,0.0,0.0,1.0,0.0,0.0,...,0.004215,0.004631,0.006218,0.125366,0.00621,0.001267,0.009659,0.140625,0.009544,0.003492
912e2cfc8e9be9e38ca23739dafb786158f33195f01923aeb15ec1e5a9b8fbe7,CO,O,3.0,0.0,1.0,1.0,0.0,5.0,1.0,1.0,...,0.003796,0.001546,0.003983,0.019501,1.004883,1.004883,0.004234,0.138306,0.001263,0.002991
da845f64b9b409f0385622d792fe6f0239b1b265a6e32c1845ada5b957a39561,CR,O,6.0,0.0,2.0,1.0,0.0,-1.0,0.0,1.0,...,0.666992,0.004612,0.000614,0.24707,0.008087,1.001953,0.005314,0.132935,0.002342,0.004253


In [24]:
xtrain_cus_id = xtrain.index
xtest_cus_id = xtest.index

In [25]:
preprocess.fit(xtrain)
xtrain = preprocess.transform(xtrain)
xtest = preprocess.transform(xtest)

In [27]:
xtrain = pd.DataFrame(xtrain, columns = cat_cols + num_cols )
xtest = pd.DataFrame(xtest, columns = cat_cols + num_cols )

In [27]:
xtrain.head()

Unnamed: 0,D_63,D_64,D_68,B_30,B_38,D_114,D_116,D_117,D_120,D_126,...,D_51,D_83,R_24,B_10,D_139,D_128,B_21,S_23,D_93,D_96
0,2.0,1.0,5.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.00914,0.008026,0.278076,0.003878,1.006836,0.005066,0.138916,0.007874,0.003441
1,1.0,2.0,5.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.336182,0.006077,0.003904,0.294434,0.009117,1.007812,0.009628,0.136719,0.000365,0.004093
2,1.0,2.0,3.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,...,0.004215,0.004631,0.006218,0.125366,0.00621,0.001267,0.009659,0.140625,0.009544,0.003492
3,1.0,1.0,2.0,0.0,0.0,1.0,0.0,5.0,1.0,1.0,...,0.003796,0.001546,0.003983,0.019501,1.004883,1.004883,0.004234,0.138306,0.001263,0.002991
4,2.0,1.0,5.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.666992,0.004612,0.000614,0.24707,0.008087,1.001953,0.005314,0.132935,0.002342,0.004253


In [28]:
xtrain['customer_ID']=xtrain_cus_id

In [25]:
xtrain.head()

Unnamed: 0,D_63,D_64,D_68,B_30,B_38,D_114,D_116,D_117,D_120,D_126,...,D_115,D_127,R_19,R_5,P_4,D_54,B_5,B_18,R_7,customer_ID
0,2.0,1.0,5.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.388428,1.000977,0.001179,0.007015,0.007187,1.003906,0.078125,1.004883,0.001063,ae1310f501ec09d5ec57e60ff9887cfd760a6ffc104e33...
1,1.0,2.0,5.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.385986,0.002756,0.003948,0.006565,0.00019,1.006836,0.104309,1.009766,0.009888,b074d9521f89e2239ad0b4abc6f016b1f9cfe11bb4326a...
2,1.0,2.0,3.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,...,0.094727,8.2e-05,0.008476,0.003895,0.007,1.00293,0.024551,0.50293,0.004745,bcedf353a230cdf47c8e027f453d13b3c44a62dd8c8d83...
3,1.0,1.0,2.0,0.0,0.0,1.0,0.0,5.0,1.0,1.0,...,0.047394,0.004944,0.006824,0.008331,0.96582,1.000977,0.00626,0.406982,0.002607,912e2cfc8e9be9e38ca23739dafb786158f33195f01923...
4,2.0,1.0,5.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,1.053711,1.004883,0.00242,0.001918,0.001436,1.003906,0.149536,0.689453,0.007683,da845f64b9b409f0385622d792fe6f0239b1b265a6e32c...


In [29]:
xtrain = (xtrain.groupby('customer_ID')
            .tail(1).set_index('customer_ID',drop=True)
            .sort_index()
           )

In [30]:
import gc
gc.collect()

187

In [32]:
train_df.shape

(458913, 160)

In [31]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score

In [34]:
from sklearn.model_selection import StratifiedKFold 

In [36]:
gc.collect()

115

# Test Data

In [37]:
%%time 

## Load the test dataset 
test_df =pd.read_feather('../input/amexfeather/test_data.ftr')
test_df.head(5)

CPU times: user 18.5 s, sys: 12.8 s, total: 31.3 s
Wall time: 36.1 s


Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-02-19,0.631348,0.001912,0.010727,0.814453,0.007545,0.168701,0.009972,0.002348,...,,,,,0.004669,,,,0.008278,
1,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-03-25,0.586914,0.005276,0.011024,0.811035,0.001817,0.241333,0.000166,0.009132,...,,,,0.000142,0.00494,0.009018,,0.003695,0.003754,0.00146
2,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-04-25,0.608887,0.003326,0.016388,1.004883,0.000114,0.26709,0.004196,0.004192,...,,,,7.4e-05,0.002113,0.004658,,0.003155,0.002155,0.006481
3,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-05-20,0.614746,0.009064,0.021667,0.816406,0.00972,0.188965,0.004124,0.015327,...,,,,0.004742,0.006393,0.00289,,0.006042,0.005207,0.007858
4,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-06-15,0.591797,0.23877,0.01593,0.810547,0.002026,0.180054,0.000731,0.011284,...,,,,0.008133,0.00433,0.008385,,0.001008,0.00742,0.009468


In [38]:

import gc
gc.collect()

23

In [39]:
# Group and sort the test dataset by customer_id

test_df = (test_df.groupby('customer_ID').tail(1)
          .set_index('customer_ID',drop=True)
          .sort_index()
          .drop(['S_2'],axis='columns'))
test_df.head(5)


Unnamed: 0_level_0,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,D_43,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,0.568848,0.121399,0.01078,1.009766,0.006924,0.149414,0.000396,0.003576,0.10376,0.007397,...,,,,0.005913,0.00125,0.006542,,0.009163,0.003691,0.00322
00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5,0.841309,0.126465,0.016556,1.008789,0.009712,0.112183,0.006191,0.011383,,,...,,,,0.004345,0.000866,0.009117,,0.002197,0.000247,0.007778
0000210045da4f81e5f122c6bde5c2a617d03eef67f82c5e400fc98e7bd43ce8,0.697754,0.002724,0.001485,0.810059,0.002621,0.166138,0.004887,0.015945,,0.105286,...,,,,1.000977,0.008896,0.895996,0.150146,1.009766,0.457764,0.092041
00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976cf6e56734528702d694,0.513184,0.324707,0.149536,0.205688,0.002277,0.181152,0.005814,0.498535,,0.21167,...,,,,1.007812,0.003754,0.919922,0.255371,1.007812,0.500977,0.182983
00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9a4693dd914fca22557,0.254395,0.768066,0.563477,0.038025,0.50293,0.168335,0.009483,0.831055,,0.071899,...,,,,0.006622,0.00114,0.009529,,0.009407,0.001557,0.000525


In [40]:
test_df = test_df.drop(columns = to_drop)

In [69]:
gc.collect()

443

In [42]:
# test_df = test_df.drop(col_core, axis=1)

In [43]:
test_cat_cols = test_df.select_dtypes("category").columns.tolist()
test_num_cols = test_df.select_dtypes(include =['float16','int64']).columns.tolist()
# test_num_cols=list(set(test_df[test_num_cols]) - {'target'})

In [44]:
x_test = test_df[test_cat_cols+test_num_cols]

In [45]:
x_test_cus_id = x_test.index

In [46]:
x_test_ = preprocess.transform(x_test)

In [47]:
x_test_ = pd.DataFrame(x_test, columns = test_cat_cols + test_num_cols )

In [48]:
x_test_['customer_ID']=x_test_cus_id

In [49]:
x_test_.drop(['customer_ID'],axis='columns')

In [50]:
x_test_ = (x_test.groupby('customer_ID')
            .tail(1).set_index('customer_ID',drop=True)
            .sort_index()
           )

# KNN

In [70]:
knn_model=KNeighborsClassifier(n_neighbors=5)

knn_model.fit(xtrain,ytrain)

y_pred_knn=knn_model.predict(xtest)
y_pred_prob_knn = knn_model.predict_proba(xtest)[:,1]

In [71]:
y_pred_prob_knn=pd.DataFrame(data={'prediction':y_pred_prob_knn})
y_true=pd.DataFrame(data={'target':ytest.reset_index(drop=True)})

print('Model accuracy score : {0:0.4f}'. format(accuracy_score(ytest, y_pred_knn)))
print('F1 Score: %.3f' % f1_score(ytest, y_pred_knn))
print('Precision: %.3f' % precision_score(ytest, y_pred_knn))
print('Recall: %.3f' % recall_score(ytest, y_pred_knn))
print("Model amex score ", amex_metric(y_true = y_true, y_pred = y_pred_prob_knn))

Model accuracy score : 0.6876
F1 Score: 0.155
Precision: 0.258
Recall: 0.111
Model amex score  0.020269799468200973


# SVM

In [None]:
from sklearn.svm import SVC
svm_model=SVC(kernel='linear', C=10.0, random_state=1)

svm_model.fit(xtrain,ytrain)

y_pred_svm=svm_model.predict(xtest)
y_pred_prob_svm = knn_model.predict_proba(xtest)[:,1]

In [None]:
y_pred_prob_svm=pd.DataFrame(data={'prediction':y_pred_prob_svm})
y_true=pd.DataFrame(data={'target':ytest.reset_index(drop=True)})

print('Model accuracy score : {0:0.4f}'. format(accuracy_score(ytest, y_pred_svm)))
print('F1 Score: %.3f' % f1_score(ytest, y_pred_svm))
print('Precision: %.3f' % precision_score(ytest, y_pred_svm))
print('Recall: %.3f' % recall_score(ytest, y_pred_svm))
print("Model amex score ", amex_metric(y_true = y_true, y_pred = y_pred_prob_svm))

# XGBoost

In [49]:
xgb_model = XGBClassifier(objective='binary:logistic', 
                      n_estimators=200,
                      eta=0.2,
                      seed=12,
                      learning_rate=0.02,
                      use_label_encoder=False,
                      eval_metric='aucpr'
                            )
xgb_model.fit(xtrain,ytrain)
y_pred = xgb_model.predict(xtest)
y_pred_prob_xgb = xgb_model.predict_proba(xtest)[:,1]



In [68]:
y_pred_prob_xgb=pd.DataFrame(data={'prediction':y_pred_prob_xgb})
y_true=pd.DataFrame(data={'target':ytest.reset_index(drop=True)})

print('Model accuracy score : {0:0.4f}'. format(accuracy_score(ytest, y_pred)))
# print('F1 Score: %.3f' % f1_score(ytest, y_pred))
# print('Precision: %.3f' % precision_score(ytest, y_pred))
# print('Recall: %.3f' % recall_score(ytest, y_pred))
#print("Model amex score ", amex_metric(y_true = y_true, y_pred = y_pred_prob_xgb))


Model accuracy score : 0.7416


# CatBoost

In [33]:
model_cat= CatBoostClassifier(iterations=2000, random_state=22, task_type='GPU')

model_cat.fit(xtrain,ytrain)

y_pred2=model_cat.predict(xtest)
y_pred_prob = model_cat.predict_proba(xtest)[:,1]

Learning rate set to 0.013091
0:	learn: 0.6898541	total: 119ms	remaining: 3m 57s
1:	learn: 0.6866509	total: 249ms	remaining: 4m 8s
2:	learn: 0.6835368	total: 341ms	remaining: 3m 46s
3:	learn: 0.6805048	total: 451ms	remaining: 3m 44s
4:	learn: 0.6775543	total: 492ms	remaining: 3m 16s
5:	learn: 0.6746849	total: 584ms	remaining: 3m 13s
6:	learn: 0.6718928	total: 667ms	remaining: 3m 9s
7:	learn: 0.6691768	total: 750ms	remaining: 3m 6s
8:	learn: 0.6665363	total: 836ms	remaining: 3m 5s
9:	learn: 0.6639691	total: 895ms	remaining: 2m 58s
10:	learn: 0.6614695	total: 975ms	remaining: 2m 56s
11:	learn: 0.6590413	total: 1.01s	remaining: 2m 46s
12:	learn: 0.6566772	total: 1.07s	remaining: 2m 43s
13:	learn: 0.6543774	total: 1.13s	remaining: 2m 40s
14:	learn: 0.6521416	total: 1.22s	remaining: 2m 41s
15:	learn: 0.6499664	total: 1.31s	remaining: 2m 42s
16:	learn: 0.6478492	total: 1.47s	remaining: 2m 51s
17:	learn: 0.6457910	total: 1.64s	remaining: 3m
18:	learn: 0.6437869	total: 1.75s	remaining: 3m 2s
1

In [35]:
y_pred_prob=pd.DataFrame(data={'prediction':y_pred_prob})
y_true=pd.DataFrame(data={'target':ytest.reset_index(drop=True)})

# # compute and print accuracy score
print('Model accuracy score : {0:0.4f}'. format(accuracy_score(ytest, y_pred2)))
print("Model amex score ", amex_metric(y_true = y_true, y_pred = y_pred_prob))

Model accuracy score : 0.7416
Model amex score  0.009388634631613012


# LightGBM

In [52]:
best_param= {"n_estimators":1500,
            "learning_rate":0.04,
            "max_depth":16,
            "subsample":0.32,
             "bagging_freq": 3,
             "random_state": 37,
             "boosting_type":'gbdt',
             "min_child_samples": 2000,
             'objective': 'binary'
            }

In [53]:
gbm_test_preds, gini=[],[]
ft_importance=pd.DataFrame(index=x.columns)
# cross - validation
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=32)
cross_val_score = []
amex_scores=[]
for fold_index, (train_id, val_id) in enumerate(cv.split(x,y)):
    # get the train and val set for this cross validation
    print("="*20, end=" ")
    print("Fold ", fold_index, end = " ")
    print("="*20, )
    X_train, X_val = x.iloc[train_id], x.iloc[val_id]
    y_train, y_val = y[train_id], y[val_id]

    # define the model
    model =  LGBMClassifier(**best_param)
    # fit the model
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=200, eval_metric=["auc"])

    # predict 
    y_pred = model.predict_proba(X_val)[:,1]

    y_pred=pd.DataFrame(data={'prediction':y_pred})
    y_true=pd.DataFrame(data={'target':y_val.reset_index(drop=True)})
    amex_score=amex_metric(y_true = y_true, y_pred = y_pred)

    cross_val_score.append(roc_auc_score(y_val, y_pred))
    print("Amex score {} --- cross validation score {}".format(amex_score,cross_val_score))
    if len(amex_scores)!=0:
        if max(amex_scores)<amex_score:
            x_test_pred = model.predict_proba(x_test)[:,1]
    else:
        x_test_pred = model.predict_proba(x_test)[:,1]
    
    amex_scores.append(amex_score)
    
    del X_train, X_val,y_train, y_val
    gc.collect()

gc.collect()

[200]	valid_0's auc: 0.957136	valid_0's binary_logloss: 0.2294
[400]	valid_0's auc: 0.958404	valid_0's binary_logloss: 0.226014
[600]	valid_0's auc: 0.958602	valid_0's binary_logloss: 0.225508
Amex score 0.7807267762405102 --- cross validation score [0.9586291154738787]
[200]	valid_0's auc: 0.959031	valid_0's binary_logloss: 0.224617
[400]	valid_0's auc: 0.960232	valid_0's binary_logloss: 0.221108
Amex score 0.7878051699069024 --- cross validation score [0.9586291154738787, 0.9603216173475442]
[200]	valid_0's auc: 0.958045	valid_0's binary_logloss: 0.227362
[400]	valid_0's auc: 0.959294	valid_0's binary_logloss: 0.223782
[600]	valid_0's auc: 0.959469	valid_0's binary_logloss: 0.223271
Amex score 0.784546271749896 --- cross validation score [0.9586291154738787, 0.9603216173475442, 0.9595032028230337]
[200]	valid_0's auc: 0.957472	valid_0's binary_logloss: 0.228754
[400]	valid_0's auc: 0.958914	valid_0's binary_logloss: 0.224837
[600]	valid_0's auc: 0.959137	valid_0's binary_logloss: 0.2

0

In [58]:
print("Max amex score ", max(amex_scores))

Max amex score  0.7878051699069024


In [55]:
gc.collect()

46

# Make Prediction

In [60]:
x_test['prediction']=x_test_pred

In [61]:
x_test['prediction'].head()

customer_ID
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7    0.814266
00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5    0.439110
0000210045da4f81e5f122c6bde5c2a617d03eef67f82c5e400fc98e7bd43ce8    0.863213
00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976cf6e56734528702d694    0.902025
00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9a4693dd914fca22557    0.555565
Name: prediction, dtype: float64

# Output

In [62]:
import base64
from IPython.display import HTML
def download_csv(df, title = "Download CSV file", filename = "submission.csv"):  
    csv = df.to_csv()
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)

In [None]:
download_csv(x_test['prediction'])