In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import multiprocessing
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import gc
from time import time
import datetime
from tqdm import tqdm, tqdm_notebook
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, train_test_split, TimeSeriesSplit
from sklearn.metrics import roc_auc_score, auc
warnings.simplefilter('ignore')
sns.set()
from scipy.stats import ks_2samp

import pickle

In [2]:
local_test=True
TARGET='isFraud'

In [3]:
%%time
test_id=pd.read_csv('input/test_identity.csv')
test_tr=pd.read_csv('input/test_transaction.csv')
train_id =pd.read_csv('input/train_identity.csv')
train_tr=pd.read_csv('input/train_transaction.csv')
sub=pd.read_csv('input/sample_submission.csv')

Wall time: 2min 32s


In [4]:
train = pd.merge(train_tr, train_id, on='TransactionID', how='left')
test = pd.merge(test_tr, test_id, on='TransactionID', how='left')

del test_id, test_tr, train_id, train_tr
gc.collect()

39

In [5]:
# def reduce_mem_usage(df, verbose=True):
#     numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
#     start_mem = df.memory_usage().sum() / 1024**2    
#     for col in tqdm_notebook(df.columns):
#         col_type = df[col].dtypes
#         if col_type in numerics:
#             c_min = df[col].min()
#             c_max = df[col].max()
#             if str(col_type)[:3] == 'int':
#                 if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
#                     df[col] = df[col].astype(np.int8)
#                 elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
#                     df[col] = df[col].astype(np.int16)
#                 elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
#                     df[col] = df[col].astype(np.int32)
#                 elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
#                     df[col] = df[col].astype(np.int64)  
#             else:
#                 if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
#                     df[col] = df[col].astype(np.float16)
#                 elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
#                     df[col] = df[col].astype(np.float32)
#                 else:
#                     df[col] = df[col].astype(np.float64)    
#     end_mem = df.memory_usage().sum() / 1024**2
#     if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
#     return df

# train=reduce_mem_usage(train)
# test=reduce_mem_usage(test)

In [6]:
i_cols = ['card1','card2','card3','card4','card5','card6']

for col in i_cols:
    print(col,':',train[col].isna().sum())

########################### Let's play "sudoku" and fill nans in cards columns
i_cols = ['TransactionID','card1','card2','card3','card4','card5','card6']

full_df = pd.concat([train[i_cols], test[i_cols]])

## I've used frequency encoding before so we have ints here
## we will drop very rare cards
full_df['card6'] = np.where(full_df['card6']==30, np.nan, full_df['card6'])
full_df['card6'] = np.where(full_df['card6']==16, np.nan, full_df['card6'])


i_cols = ['card2','card5']
for col in i_cols:
    temp_df = full_df.groupby(['card1',col])[col].agg(['count']).reset_index()
    temp_df = temp_df.sort_values(by=['card1','count'], ascending=False).reset_index(drop=True)
    del temp_df['count']
    temp_df = temp_df.drop_duplicates(keep='first').reset_index(drop=True)
    temp_df.index = temp_df['card1'].values
    temp_df = temp_df[col].to_dict()

## We will find best match for nan values and fill with it
for col in i_cols:
    temp_df = full_df.groupby(['card1',col])[col].agg(['count']).reset_index()
    temp_df = temp_df.sort_values(by=['card1','count'], ascending=False).reset_index(drop=True)
    del temp_df['count']
    temp_df = temp_df.drop_duplicates(subset='card1', keep='first').reset_index(drop=True)
    temp_df.index = temp_df['card1'].values
    temp_df = temp_df[col].to_dict()
    full_df[col] = np.where(full_df[col].isna(), full_df['card1'].map(temp_df), full_df[col])
    
i_cols = ['card1','card2','card5']
for col in i_cols:
    train[col] = full_df[full_df['TransactionID'].isin(train['TransactionID'])][col].values
    test[col] = full_df[full_df['TransactionID'].isin(test['TransactionID'])][col].values

########################### Let's check how many nans left
i_cols = ['card1','card2','card3','card4','card5','card6']

for col in i_cols:
    print(col,':',train[col].isna().sum())
    
    
    
##################################################################### SAME THING WITH CARD 6 and CARD4

########################### Let's play "sudoku" and fill nans in cards columns
i_cols = ['TransactionID','card1','card2','card3','card4','card5','card6']

full_df = pd.concat([train[i_cols], test[i_cols]])

## I've used frequency encoding before so we have ints here
## we will drop very rare cards
full_df['card6'] = np.where(full_df['card6']==30, np.nan, full_df['card6'])
full_df['card6'] = np.where(full_df['card6']==16, np.nan, full_df['card6'])


i_cols = ['card6']
for col in i_cols:
    temp_df = full_df.groupby(['card4',col])[col].agg(['count']).reset_index()
    temp_df = temp_df.sort_values(by=['card4','count'], ascending=False).reset_index(drop=True)
    del temp_df['count']
    temp_df = temp_df.drop_duplicates(keep='first').reset_index(drop=True)
    temp_df.index = temp_df['card4'].values
    temp_df = temp_df[col].to_dict()

## We will find best match for nan values and fill with it
for col in i_cols:
    temp_df = full_df.groupby(['card4',col])[col].agg(['count']).reset_index()
    temp_df = temp_df.sort_values(by=['card4','count'], ascending=False).reset_index(drop=True)
    del temp_df['count']
    temp_df = temp_df.drop_duplicates(subset='card4', keep='first').reset_index(drop=True)
    temp_df.index = temp_df['card4'].values
    temp_df = temp_df[col].to_dict()
    full_df[col] = np.where(full_df[col].isna(), full_df['card4'].map(temp_df), full_df[col])
    
i_cols = ['card6']
for col in i_cols:
    train[col] = full_df[full_df['TransactionID'].isin(train['TransactionID'])][col].values
    test[col] = full_df[full_df['TransactionID'].isin(test['TransactionID'])][col].values

########################### Let's check how many nans left
i_cols = ['card1','card2','card3','card4','card5','card6']

for col in i_cols:
    print(col,':',train[col].isna().sum())

card1 : 0
card2 : 8933
card3 : 1565
card4 : 1577
card5 : 4259
card6 : 1571
card1 : 0
card2 : 4663
card3 : 1565
card4 : 1577
card5 : 698
card6 : 1571
card1 : 0
card2 : 4663
card3 : 1565
card4 : 1577
card5 : 698
card6 : 1565


In [7]:
for feature in ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'id_36']:
    train[feature + '_count_full'] = train[feature].map(pd.concat([train[feature], test[feature]], ignore_index=True).value_counts(dropna=False))
    test[feature + '_count_full'] = test[feature].map(pd.concat([train[feature], test[feature]], ignore_index=True).value_counts(dropna=False))

# Encoding - count encoding separately for train and test
for feature in ['id_01', 'id_31', 'id_33', 'id_36']:
    train[feature + '_count_dist'] = train[feature].map(train[feature].value_counts(dropna=False))
    test[feature + '_count_dist'] = test[feature].map(test[feature].value_counts(dropna=False))

In [None]:
i_cols = ['M1','M2','M3','M5','M6','M7','M8','M9']

# Replace T with 1 and F with 0 before taking the sum
for col in i_cols:
    train[col] = train[col].apply(lambda x: 1 if x == 'T' else (0 if x == 'F' else np.nan))
    test[col] = test[col].apply(lambda x: 1 if x == 'T' else (0 if x == 'F' else np.nan))

for df in [train, test]:
    df['M_sum'] = df[i_cols].sum(axis=1).astype(np.int8)
    df['M_na'] = df[i_cols].isna().sum(axis=1).astype(np.int8)

In [None]:
for col in ['ProductCD','M4']:
    temp_dict = train.groupby([col])[TARGET].agg(['mean']).reset_index().rename(
                                                        columns={'mean': col+'_target_mean'})
    temp_dict.index = temp_dict[col].values
    temp_dict = temp_dict[col+'_target_mean'].to_dict()

    train[col+'_target_mean'] = train[col].map(temp_dict)
    test[col+'_target_mean']  = test[col].map(temp_dict)

In [16]:
train['uid'] = train['card1'].astype(str)+'_'+train['card2'].astype(str)
test['uid'] = test['card1'].astype(str)+'_'+test['card2'].astype(str)

train['uid2'] = train['uid'].astype(str)+'_'+train['card3'].astype(str)+'_'+train['card4'].astype(str)
test['uid2'] = test['uid'].astype(str)+'_'+test['card3'].astype(str)+'_'+test['card4'].astype(str)

train['uid3'] = train['uid2'].astype(str)+'_'+train['addr1'].astype(str)+'_'+train['addr2'].astype(str)
test['uid3'] = test['uid2'].astype(str)+'_'+test['addr1'].astype(str)+'_'+test['addr2'].astype(str)

# Check if the Transaction Amount is common or not (we can use freq encoding here)
# In our dialog with a model we are telling to trust or not to these values   
train['TransactionAmt_check']  = np.where(train['TransactionAmt'].isin(test['TransactionAmt']), 1, 0)
train['ProductCD_check'] = np.where(train['ProductCD'].isin(test['ProductCD']), 1, 0)
test['TransactionAmt_check']  = np.where(test['TransactionAmt'].isin(train['TransactionAmt']), 1, 0)
test['ProductCD_check']  = np.where(test['ProductCD'].isin(train['ProductCD']), 1, 0)

In [17]:
i_cols = ['card1','card2','card3','card5','uid','uid2','uid3']

for col in i_cols:
    for agg_type in ['mean','std']:
        new_col_name = col+'_TransactionAmt_'+agg_type
        temp_df = pd.concat([train[[col, 'TransactionAmt']], test[[col,'TransactionAmt']]])
        #temp_df['TransactionAmt'] = temp_df['TransactionAmt'].astype(int)
        temp_df = temp_df.groupby([col])['TransactionAmt'].agg([agg_type]).reset_index().rename(
                                                columns={agg_type: new_col_name})
        
        temp_df.index = list(temp_df[col])
        temp_df = temp_df[new_col_name].to_dict()   
    
        train[new_col_name] = train[col].map(temp_df)
        test[new_col_name]  = test[col].map(temp_df)
           

train['TransactionAmt'] = np.log1p(train['TransactionAmt'])
test['TransactionAmt'] = np.log1p(test['TransactionAmt'])

for col in ['card1','card2','card3','card4','card5','card6','addr1','addr2','dist2','C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','C11','C13','C14','TransactionAmt','D2','D8','D9'
            ,'uid','uid2','uid3']:
    for agg_type in ['mean','std']:
        if (train[col].dtypes!='object'):
            new_col_name = col+'_ProductCD_'+agg_type
            temp_dict = train.groupby(['ProductCD'])[col].agg([agg_type]).reset_index().rename(
                                                                columns={agg_type: new_col_name})
            temp_dict.index = temp_dict[new_col_name].values
            temp_dict = temp_dict[new_col_name].to_dict()
        
            train[new_col_name] = train[col].map(temp_dict)
            test[new_col_name]  = test[col].map(temp_dict)

In [18]:
p = 'P_emaildomain'
r = 'R_emaildomain'
uknown = 'email_not_provided'

for df in [train, test]:
    df[p] = df[p].fillna(uknown)
    df[r] = df[r].fillna(uknown)
    
    # Check if P_emaildomain matches R_emaildomain
    df['email_check'] = np.where((df[p]==df[r])&(df[p]!=uknown),1,0)

    df[p+'_prefix'] = df[p].apply(lambda x: x.split('.')[0])
    df[r+'_prefix'] = df[r].apply(lambda x: x.split('.')[0])

In [19]:
for df in [train, test]:
    ########################### Device info
    df['DeviceInfo'] = df['DeviceInfo'].fillna('unknown_device').str.lower()
    df['DeviceInfo_device'] = df['DeviceInfo'].apply(lambda x: ''.join([i for i in x if i.isalpha()]))
    df['DeviceInfo_version'] = df['DeviceInfo'].apply(lambda x: ''.join([i for i in x if i.isnumeric()]))
    
    ########################### Device info 2
    df['id_30'] = df['id_30'].fillna('unknown_device').str.lower()
    df['id_30_device'] = df['id_30'].apply(lambda x: ''.join([i for i in x if i.isalpha()]))
    df['id_30_version'] = df['id_30'].apply(lambda x: ''.join([i for i in x if i.isnumeric()]))
    
    ########################### Browser
    df['id_31'] = df['id_31'].fillna('unknown_device').str.lower()
    df['id_31_device'] = df['id_31'].apply(lambda x: ''.join([i for i in x if i.isalpha()]))

In [20]:
def id_split(dataframe):
    dataframe['device_name'] = dataframe['DeviceInfo'].str.split('/', expand=True)[0]
    dataframe['device_version'] = dataframe['DeviceInfo'].str.split('/', expand=True)[1]

    dataframe['OS_id_30'] = dataframe['id_30'].str.split(' ', expand=True)[0]
    dataframe['version_id_30'] = dataframe['id_30'].str.split(' ', expand=True)[1]

    dataframe['browser_id_31'] = dataframe['id_31'].str.split(' ', expand=True)[0]
    dataframe['version_id_31'] = dataframe['id_31'].str.split(' ', expand=True)[1]

    dataframe['screen_width'] = dataframe['id_33'].str.split('x', expand=True)[0]
    dataframe['screen_height'] = dataframe['id_33'].str.split('x', expand=True)[1]

    dataframe.loc[dataframe['device_name'].str.contains('SM', na=False), 'device_name'] = 'Samsung'
    dataframe.loc[dataframe['device_name'].str.contains('SAMSUNG', na=False), 'device_name'] = 'Samsung'
    dataframe.loc[dataframe['device_name'].str.contains('GT-', na=False), 'device_name'] = 'Samsung'
    dataframe.loc[dataframe['device_name'].str.contains('Moto G', na=False), 'device_name'] = 'Motorola'
    dataframe.loc[dataframe['device_name'].str.contains('Moto', na=False), 'device_name'] = 'Motorola'
    dataframe.loc[dataframe['device_name'].str.contains('moto', na=False), 'device_name'] = 'Motorola'
    dataframe.loc[dataframe['device_name'].str.contains('LG-', na=False), 'device_name'] = 'LG'
    dataframe.loc[dataframe['device_name'].str.contains('rv:', na=False), 'device_name'] = 'RV'
    dataframe.loc[dataframe['device_name'].str.contains('HUAWEI', na=False), 'device_name'] = 'Huawei'
    dataframe.loc[dataframe['device_name'].str.contains('ALE-', na=False), 'device_name'] = 'Huawei'
    dataframe.loc[dataframe['device_name'].str.contains('-L', na=False), 'device_name'] = 'Huawei'
    dataframe.loc[dataframe['device_name'].str.contains('Blade', na=False), 'device_name'] = 'ZTE'
    dataframe.loc[dataframe['device_name'].str.contains('BLADE', na=False), 'device_name'] = 'ZTE'
    dataframe.loc[dataframe['device_name'].str.contains('Linux', na=False), 'device_name'] = 'Linux'
    dataframe.loc[dataframe['device_name'].str.contains('XT', na=False), 'device_name'] = 'Sony'
    dataframe.loc[dataframe['device_name'].str.contains('HTC', na=False), 'device_name'] = 'HTC'
    dataframe.loc[dataframe['device_name'].str.contains('ASUS', na=False), 'device_name'] = 'Asus'

    dataframe.loc[dataframe.device_name.isin(dataframe.device_name.value_counts()[dataframe.device_name.value_counts() < 200].index), 'device_name'] = "Others"
    dataframe['had_id'] = 1
    gc.collect()
    
    return dataframe
train=id_split(train)
test=id_split(test)

In [21]:
train['id_02_to_mean_card1'] = train['id_02'] / train.groupby(['card1'])['id_02'].transform('mean')
train['id_02_to_mean_card4'] = train['id_02'] / train.groupby(['card4'])['id_02'].transform('mean')
train['id_02_to_std_card1'] = train['id_02'] / train.groupby(['card1'])['id_02'].transform('std')
train['id_02_to_std_card4'] = train['id_02'] / train.groupby(['card4'])['id_02'].transform('std')

test['id_02_to_mean_card1'] = test['id_02'] / test.groupby(['card1'])['id_02'].transform('mean')
test['id_02_to_mean_card4'] = test['id_02'] / test.groupby(['card4'])['id_02'].transform('mean')
test['id_02_to_std_card1'] = test['id_02'] / test.groupby(['card1'])['id_02'].transform('std')
test['id_02_to_std_card4'] = test['id_02'] / test.groupby(['card4'])['id_02'].transform('std')

train['D15_to_mean_card1'] = train['D15'] / train.groupby(['card1'])['D15'].transform('mean')
train['D15_to_mean_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('mean')
train['D15_to_std_card1'] = train['D15'] / train.groupby(['card1'])['D15'].transform('std')
train['D15_to_std_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('std')

test['D15_to_mean_card1'] = test['D15'] / test.groupby(['card1'])['D15'].transform('mean')
test['D15_to_mean_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('mean')
test['D15_to_std_card1'] = test['D15'] / test.groupby(['card1'])['D15'].transform('std')
test['D15_to_std_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('std')

train['D15_to_mean_addr1'] = train['D15'] / train.groupby(['addr1'])['D15'].transform('mean')
train['D15_to_mean_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('mean')
train['D15_to_std_addr1'] = train['D15'] / train.groupby(['addr1'])['D15'].transform('std')
train['D15_to_std_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('std')

test['D15_to_mean_addr1'] = test['D15'] / test.groupby(['addr1'])['D15'].transform('mean')
test['D15_to_mean_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('mean')
test['D15_to_std_addr1'] = test['D15'] / test.groupby(['addr1'])['D15'].transform('std')
test['D15_to_std_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('std')

In [22]:
train['TransactionAmt_Log'] = np.log(train['TransactionAmt'])
test['TransactionAmt_Log'] = np.log(test['TransactionAmt'])

# New feature - decimal part of the transaction amount.
train['TransactionAmt_decimal'] = ((train['TransactionAmt'] - train['TransactionAmt'].astype(int)) * 1000).astype(int)
test['TransactionAmt_decimal'] = ((test['TransactionAmt'] - test['TransactionAmt'].astype(int)) * 1000).astype(int)

In [23]:
for feature in ['id_02__id_20', 'id_02__D8', 'D11__DeviceInfo', 'DeviceInfo__P_emaildomain', 'P_emaildomain__C2', 
                'card2__dist1', 'card1__card5', 'card2__id_20', 'card5__P_emaildomain', 'addr1__card1',
                'id_02__id_14','id_14__id_20','id_02__id_17','id_14__id_17','id_17__id_20','id_02__id_19','id_14__id_19','id_17__id_19',
                'id_19__id_20'
               ]:

    f1, f2 = feature.split('__')
    train[feature] = train[f1].astype(str) + '_' + train[f2].astype(str)
    test[feature] = test[f1].astype(str) + '_' + test[f2].astype(str)

    le = LabelEncoder()
    le.fit(list(train[feature].astype(str).values) + list(test[feature].astype(str).values))
    train[feature] = le.transform(list(train[feature].astype(str).values))
    test[feature] = le.transform(list(test[feature].astype(str).values))

In [24]:
# def get_diff_columns(train_df, test_df, show_plots=True, show_all=False, threshold=0.01):
#     """"Use KS to estimate columns where distributions differ a lot from each other"""
#     #Kolmogorov–Smirnov test
#     diff_data = []
#     for col in tqdm(test_df.columns):
#         if (test_df[col].dtypes!='object' and col!='TransactionID'and col!='DT'):
#             statistic, pvalue = ks_2samp(
#                 train_df[col].values, 
#                 test_df[col].values
#             )
#             if pvalue == 0 and statistic >0.15:
#                 diff_data.append({'feature': col, 'p': np.round(pvalue, 5), 'statistic': np.round(np.abs(statistic), 2)})

#     diff_df = pd.DataFrame(diff_data).sort_values(by='statistic', ascending=False)
#     return diff_df

# diff_df=get_diff_columns(train,test)
# diff_df.to_csv('diff_df.csv', index=False)

diff_df = pd.read_csv('diff_df.csv')

train=train.drop(diff_df['feature'],axis=1)
test=test.drop(diff_df['feature'],axis=1)


100%|████████████████████████████████████████████████████████████████████████████████| 563/563 [03:16<00:00,  2.33it/s]


In [25]:
to_drop=pd.read_csv('to_drop.csv')
train=train.drop(to_drop['Feature'],axis=1,errors='ignore')
test=test.drop(to_drop['Feature'],axis=1,errors='ignore')

In [26]:
to_drop_na=[]
for col in train.columns:
    na=train[col].isnull().sum()
    if (na>len(train)*0.9):
        to_drop_na.append(col)
        
train=train.drop(to_drop_na,axis=1)
test=test.drop(to_drop_na,axis=1)

In [30]:
list_cols = []
for col in tqdm(test.columns):
  
    if (col.startswith("V")==False) and col != 'TransactionID':
        
        #temp_df = pd.concat([train[[col]], test[[col]]])
        fq_encode = train[col].value_counts(dropna=False).to_dict()   
        train[col+'_fq_enc'] = train[col].map(fq_encode)
        test[col+'_fq_enc']  = test[col].map(fq_encode)
    else:
        list_cols.append(col)

100%|████████████████████████████████████████████████████████████████████████████████| 301/301 [00:29<00:00, 10.20it/s]


In [34]:
# idea: remove fq_enc if more than 100 categories
fq_uniques = list()
for col in train.columns:
    if '_fq_enc' in col:
        fq_uniques.append(train[col].nunique())
pd.Series(np.array(fq_uniques)).describe()

In [43]:
for col in train.columns:
    if train[col].dtype=='O':
        train[col] = train[col].fillna('unseen_before_label')
        test[col]  = test[col].fillna('unseen_before_label')
        
        train[col] = train[col].astype(str)
        test[col] = test[col].astype(str)
        
        le = LabelEncoder()
        le.fit(list(train[col])+list(test[col]))
        train[col] = le.transform(train[col])
        test[col]  = le.transform(test[col])
        
        train[col] = train[col].astype('category')
        test[col] = test[col].astype('category')

In [59]:
# C_cols = [col for col in train.columns if col.startswith('C') and '_' not in col]
# for col in C_cols: 
#     train[col+'_third_quartile'] = train.groupby('uid3')[col].transform(lambda x: x > x.quantile(0.75))
#     test[col+'_third_quartile'] = test.groupby('uid3')[col].transform(lambda x: x > x.quantile(0.75))
    
# quartile_cols = [col for col in train.columns if col.endswith('quartile')]
# train[['TransactionID']+quartile_cols].to_csv('trainquartiles.csv', index=False)
# test[['TransactionID']+quartile_cols].to_csv('testquartiles.csv', index=False)
train_quartiles = pd.read_csv('trainquartiles.csv')
train = pd.merge(train, train_quartiles, on='TransactionID')
test_quartiles = pd.read_csv('trainquartiles.csv')
test = pd.merge(test, test_quartiles, on='TransactionID')

In [96]:
id_cols = [col for col in train.columns if 'id' in col and len(col) == 5]
train['id_nan_count'] = train[id_cols].isnull().sum(axis=1)
test['id_nan_count'] = test[id_cols].isnull().sum(axis=1)

In [97]:
rm_cols = [
    'TransactionID','TransactionDT', # These columns are pure noise right now
    TARGET,                          # Not target in features))
    'uid','uid2','uid3',             # Our new client uID -> very noisy data
    'bank_type',                     # Victims bank could differ by time
     'DT','DT_M','DT_W','DT_D',       # Temporary Variables
    'DT_hour','DT_day_week','DT_day',
    'DT_D_total','DT_W_total','DT_M_total',
    'id_30','id_31','id_33',
]

In [98]:
def permut_imp(model, vl_x, vl_y):
    permutation_importances = {}
    base_roc = roc_auc_score(vl_y, model.predict())
    for col in features_columns:
        save = vl_x[col].copy()
        dtype = X_valid[col].dtype
        vl_x[col] = np.random.permutation(vl_x[col])
        X_valid[col] = vl_x[col].astype(dtype)

        predict_permutation = model.predict(vl_x)
        score_after_permut = roc_auc_score(vl_y, predict_permutation)
        
        perte = base_roc - score_after_permut
        permut_importances[col] = perte * 100 
        X_valid[col] = save
        
        return permut_importances

In [103]:
import lightgbm as lgb

def make_predictions_lgb(tr_df, tt_df, features_columns, target, lgb_params, NFOLDS=2):
    folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=42)
    
    X,y = tr_df[features_columns], tr_df[target]    
    P = tt_df[features_columns] 
    
        
    predictions = np.zeros(len(sub))
    permut_imp_allfolds = []
    
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        print('Fold:',fold_)
        tr_x, tr_y = X.iloc[trn_idx,:], y[trn_idx]
        vl_x, vl_y = X.iloc[val_idx,:], y[val_idx]
            
        print(len(tr_x),len(vl_x))
        tr_data = lgb.Dataset(tr_x, label=tr_y)
    
        vl_data = lgb.Dataset(vl_x, label=vl_y)  
    
        estimator = lgb.train(
            lgb_params,
            tr_data,
            valid_sets = [tr_data, vl_data],
            verbose_eval = 200,
        )   
        
        pp_p = estimator.predict(P)
        predictions += pp_p/NFOLDS
        
        permut_imp_fold = permut_imp(estimator, vl_x, vl_y)
        permut_imp_allfolds.append(permut_imp_fold)
        
        del tr_x, tr_y, vl_x, vl_y, tr_data, vl_data
        gc.collect()

    tt_df['prediction'] = predictions
            
    
    return estimator,tt_df

In [104]:
features_columns = [col for col in list(train) if col not in rm_cols]

In [None]:
lgb_params = {
                    'objective':'binary',
                    'boosting_type':'gbdt',
                    'metric':'auc',
                    'n_jobs':-1,
                    'learning_rate':0.01,
                    'num_leaves': 496,
                    'max_depth':-1,
                    'tree_learner':'serial',
                    'colsample_bytree': 0.7,
                    'subsample_freq':1,
                    'subsample':0.7,
                    'n_estimators':800,
                    'max_bin':255,
                    'verbose':-1,
                    'seed': 24,
                    'early_stopping_rounds':100, 
                } 

lgb_params['learning_rate'] = 0.005
lgb_params['n_estimators'] = 1800
lgb_params['early_stopping_rounds'] = 100    
clf,test_predictions = make_predictions_lgb(train, test, features_columns, TARGET, lgb_params, NFOLDS=7)
#fold 2: 10h26-> 42

Fold: 0
506177 84363
Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.960897	valid_1's auc: 0.94032
[400]	training's auc: 0.984326	valid_1's auc: 0.955628
[600]	training's auc: 0.994611	valid_1's auc: 0.965867


In [None]:
test_predictions['isFraud'] = test_predictions['prediction']
test_predictions['TransactionID']=test['TransactionID']
test_predictions[['TransactionID','isFraud']].to_csv('submission.csv', index=False)

# PLB: